In [None]:
# Install libraries for metrics and data handling
%pip install pandas scikit-learn jsonlines rouge-score bert-score transformers torch

import pandas as pd
import json
import re
import torch
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics import precision_recall_fscore_support



In [None]:
# --- Regular Expressions for Parsing Generated Text ---

# Pattern to capture the caption text.
# The caption runs from "Caption:" up to the next section header
# ("Concept descriptions:" or "Concepts:") or, failing that, to the end of the text.
# The header may be introduced either by a real newline character or by the literal
# two-character sequence backslash + "n" (double-escaped output); both are accepted.
# Matching only the literal form would let a real-newline generation run on to `$`
# and pull the whole concept list into the caption.
CAPTION_RE = re.compile(
    r"Caption:\s*(.*?)\s*(?:(?:\\n|\n)\s*(?:Concept descriptions|Concepts):|$)",
    flags=re.S
)

# Pattern to capture the raw concepts list: everything after "Concepts:" up to the
# end of the text (re.S lets "." span newlines).
CONCEPTS_RE = re.compile(
    r"Concepts:\s*(.*)\s*$",
    flags=re.S
)

def extract_caption_and_concepts(text: str) -> pd.Series:
    """
    Parses the model's single-string output to extract the structured caption and concepts.

    Args:
        text: Raw generation for one sample. ``None``, an empty string, or any
            non-string value (e.g. the float NaN pandas uses for a missing cell)
            is treated as empty output.

    Returns:
        A pd.Series with two entries:
          - "caption_extracted": the caption text, or "" when no "Caption:" section is found.
          - "concepts_extracted": list of comma-separated concept strings found after
            "Concepts:", or [] when that section is missing or empty.
    """
    # NaN is truthy and has no .strip(), so `(text or "")` is not enough to guard
    # against a missing cell; only real strings are parsed.
    text = text.strip() if isinstance(text, str) else ""

    # 1. Extract Caption (Primary target)
    m_cap = CAPTION_RE.search(text)
    # Clean up the generation, removing the end-of-turn token if present
    caption = m_cap.group(1).strip().replace('<end_of_turn>', '').strip() if m_cap else ""

    # 2. Extract Concepts (Secondary target)
    m_con = CONCEPTS_RE.search(text)
    concepts_str = m_con.group(1).strip().replace('<end_of_turn>', '').strip() if m_con else ""

    # 3. Split raw string into a list of cleaned CUIs (empty fragments are dropped)
    concepts = [c.strip() for c in concepts_str.split(",") if c.strip()] if concepts_str else []

    return pd.Series({"caption_extracted": caption, "concepts_extracted": concepts})

In [None]:
# Initialize ROUGE Scorer outside the loop for efficiency
ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

# Alias for bert_score's `score` function. This binds the function only; no model
# object is created or cached by this assignment.
# NOTE: A GPU is recommended for fast execution, and model weights are downloaded the
# first time they are needed. With lang="en" the recorded run of this notebook loads
# `roberta-large` (not bert-base-uncased).
BERT_SCORER = score

def calculate_rouge_1_f1(reference: str, candidate: str) -> float:
    """
    ROUGE-1 F-measure of `candidate` against `reference`.

    Returns 0.0 without invoking the scorer when either string is empty or None.
    """
    if reference and candidate:
        # Tokenization is left to the module-level ROUGE_SCORER
        rouge_result = ROUGE_SCORER.score(reference, candidate)
        return rouge_result['rouge1'].fmeasure

    return 0.0

def calculate_bertscore_f1(references: list[str], candidates: list[str]) -> float:
    """
    Corpus-level BERTScore F1: the mean of the per-pair F1 values.

    Returns 0.0 without invoking the scorer when either list is empty.
    """
    if references and candidates:
        # The scorer takes candidates first, then references, and returns
        # (precision, recall, F1) tensors; only F1 is used here.
        # lang="en" matches the English ROCO captions.
        _precision, _recall, f1_per_pair = BERT_SCORER(
            candidates, references, lang="en", verbose=False
        )
        return f1_per_pair.mean().item()

    return 0.0


def calculate_concept_metrics(df: pd.DataFrame) -> pd.Series:
    """
    Calculates Primary and Secondary F1 scores for concept extraction.

    Primary: Average of per-sample F1 scores (unfiltered).
    Secondary: Average of per-sample F1 scores (filtered by the set of all manually
    annotated concepts in the dataset, i.e. the union of every row's 'cui' list).

    Args:
        df: Frame with a 'cui' column (ground-truth concepts per row) and a
            'concepts_extracted' column (predicted concepts per row). A cell that
            is not a list/tuple/set (e.g. NaN or None) counts as "no concepts" for
            BOTH metrics.

    Returns:
        pd.Series with 'Concept_F1_Primary' and 'Concept_F1_Secondary'. Both are NaN
        when `df` has no rows.

    Note:
        `calculate_sample_f1` is resolved at call time. In this notebook it is defined
        in a later cell, so that cell must be executed before this function is called.
    """

    def as_concept_list(value) -> list:
        # Normalise one cell so the primary and secondary metrics see the same input;
        # without this the primary metric would raise on NaN/None cells.
        if isinstance(value, list):
            return value
        if isinstance(value, (tuple, set, frozenset)):
            return list(value)
        return []

    references = [as_concept_list(value) for value in df['cui']]
    candidates = [as_concept_list(value) for value in df['concepts_extracted']]

    # 1. Primary Score: per-sample F1 on the raw (unfiltered) lists
    primary_scores = [
        calculate_sample_f1(ref, cand) for ref, cand in zip(references, candidates)
    ]

    # 2. Secondary Score: keep only concepts that occur somewhere in the ground truth,
    #    then compute per-sample F1 again.
    all_annotated_concepts = set()
    for ref in references:
        all_annotated_concepts.update(ref)

    secondary_scores = [
        calculate_sample_f1(
            [c for c in ref if c in all_annotated_concepts],
            [c for c in cand if c in all_annotated_concepts],
        )
        for ref, cand in zip(references, candidates)
    ]

    # Averaging through a float Series yields a scalar (NaN for zero rows), whereas
    # DataFrame.apply(axis=1) on an empty frame does not return a per-row Series.
    return pd.Series({
        'Concept_F1_Primary': pd.Series(primary_scores, dtype=float).mean(),
        'Concept_F1_Secondary': pd.Series(secondary_scores, dtype=float).mean()
    })

In [None]:
def run_evaluation(file_path: str) -> None:
    """
    Evaluate one JSONL results file and print a captioning / concept-detection report.

    The file is read with ``pd.read_json(..., lines=True)``. Its rows must provide the
    columns 'generation' (raw model output), 'caption' (reference caption) and 'cui'
    (reference concepts). Load errors are not caught here; they propagate to the caller.

    Nothing is returned: ROUGE-1 F1, BERTScore F1 and the two concept F1 averages are
    printed, followed by a markdown preview of the first rows.
    """
    print(f"--- Starting Evaluation for: {file_path} ---")

    eval_df = pd.read_json(file_path, lines=True)
    print(f"Loaded {len(eval_df)} samples.")

    # Split each raw generation into its caption and its concept list
    print("\nParsing generated text to extract captions and concepts...")
    parsed_columns = ['caption_extracted', 'concepts_extracted']
    eval_df[parsed_columns] = eval_df['generation'].apply(extract_caption_and_concepts)

    # --- Captioning metric 1: ROUGE-1 F1, scored per row and then averaged ---
    print("Calculating ROUGE-1 F1...")

    def _row_rouge(row):
        return calculate_rouge_1_f1(row['caption'], row['caption_extracted'])

    eval_df['rouge1_f1'] = eval_df.apply(_row_rouge, axis=1)
    mean_rouge1 = eval_df['rouge1_f1'].mean()

    # --- Captioning metric 2: BERTScore F1, scored over the whole corpus at once ---
    print("Calculating BERTScore F1 (may take a minute to load model)...")
    gold_captions = eval_df['caption'].tolist()
    predicted_captions = eval_df['caption_extracted'].tolist()
    mean_bertscore = (
        calculate_bertscore_f1(gold_captions, predicted_captions)
        if len(gold_captions) > 0
        else 0.0
    )

    # --- Concept extraction metrics ---
    print("Calculating Concept F1 Metrics (Primary and Secondary)...")
    concept_scores = calculate_concept_metrics(eval_df)

    # --- Final report ---
    separator = "=" * 70
    print("\n" + separator)
    print("                MODEL PERFORMANCE EVALUATION REPORT")
    print(separator)

    print("\n[CAPTIONING METRICS]")
    print(f"  > ROUGE-1 F1:     {mean_rouge1:.4f}")
    print(f"  > BERTScore F1:   {mean_bertscore:.4f}")

    print("\n[CONCEPT DETECTION METRICS]")
    print(f"  > Primary F1 (Unfiltered Avg):    {concept_scores['Concept_F1_Primary']:.4f}")
    print(f"  > Secondary F1 (GT-Filtered Avg): {concept_scores['Concept_F1_Secondary']:.4f}")

    print("\n" + separator)
    print("Evaluation Complete.")
    print(separator)

    # Preview: first rows with the reference caption, the parsed caption and its score
    print("\nSample Predictions vs. Actuals:")
    preview = eval_df[['caption', 'caption_extracted', 'rouge1_f1']].head()
    print(preview.to_markdown(index=False, numalign="left"))


# --- USER INPUT SECTION ---

# Point RESULTS_FILE_PATH (below) at the JSONL results file generated by the scoring
# notebook (e.g., 'Score_Results/4Bit_Qunat_Gemma_...jsonl').
# Mount Google Drive into the Colab runtime (force_remount=True re-mounts if already mounted)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Set the working directory to dl_project_fall_2025
import os
os.chdir("/content/drive/MyDrive/DL_Project_2025/dl_project_fall_2025")

# Autoreload doesn't work in Google Colab, so reload() can be used to re-import edited modules.
# NOTE(review): `reload` is not used in the visible cells -- confirm it is still needed.
from importlib import reload
# Results file to evaluate; the relative path is resolved against the working directory set above.
RESULTS_FILE_PATH = "./matt_results/12-billion-8Bit_Quant_Gemma_MM_test_trained_model_check.jsonl"

# Run the evaluation on the file above and print the report.
run_evaluation(RESULTS_FILE_PATH)

Mounted at /content/drive
--- Starting Evaluation for: ./matt_results/12-billion-8Bit_Quant_Gemma_MM_test_trained_model_check.jsonl ---
Loaded 200 samples.

Parsing generated text to extract captions and concepts...
Calculating ROUGE-1 F1...
Calculating BERTScore F1 (may take a minute to load model)...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating Concept F1 Metrics (Primary and Secondary)...

                MODEL PERFORMANCE EVALUATION REPORT

[CAPTIONING METRICS]
  > ROUGE-1 F1:     0.2034
  > BERTScore F1:   0.8325

[CONCEPT DETECTION METRICS]
  > Primary F1 (Unfiltered Avg):    0.8850
  > Secondary F1 (GT-Filtered Avg): 0.8850

Evaluation Complete.

Sample Predictions vs. Actuals:
| caption                                                                                                                                                                                                                                                                        | caption_extracted                                                                                                                        | rouge1_f1   |
|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Task
Write a Python function `calculate_sample_f1(reference_list, candidate_list)` that calculates the F1 score for a single sample based on the intersection of reference and candidate sets. It should return the F1 score, handling empty lists gracefully.

## Define F1 Helper Function

### Subtask:
Write a helper function to calculate the F1 score for a single sample given reference and candidate lists.


**Reasoning**:
Define the helper function `calculate_sample_f1` to compute F1 score based on reference and candidate lists as per the instructions.



In [None]:
def calculate_sample_f1(reference_list: list, candidate_list: list) -> float:
    """
    Set-based F1 between the reference and candidate items of one sample.

    Duplicates are ignored because both inputs are compared as sets. The result is
    0.0 whenever precision + recall is zero, which includes the cases where either
    list (or both) is empty.
    """
    gold = set(reference_list)
    predicted = set(candidate_list)

    # True positives: items present on both sides
    overlap = len(gold & predicted)

    # Each ratio falls back to 0.0 when its denominator set is empty
    precision = overlap / len(predicted) if len(predicted) > 0 else 0.0
    recall = overlap / len(gold) if len(gold) > 0 else 0.0

    if (precision + recall) > 0:
        return 2 * (precision * recall) / (precision + recall)

    return 0.0

## Final Task

### Subtask:
Verify the implementation of the `calculate_sample_f1` function using various test cases.


## Summary:

### Data Analysis Key Findings
*   A function was successfully implemented to calculate the F1 score for individual samples by converting input lists into sets to find the intersection of elements (True Positives).
*   The logic incorporates safeguards against division-by-zero errors:
    *   **Precision** defaults to 0.0 if the candidate set is empty.
    *   **Recall** defaults to 0.0 if the reference set is empty.
    *   **F1 Score** defaults to 0.0 if the combined sum of precision and recall is zero.

### Insights or Next Steps
*   Verify the function's accuracy by running it against a variety of test cases, specifically targeting edge cases like empty lists or disjoint sets.
*   Once validated, apply this function iteratively across a full dataset to compute aggregate performance metrics.
