In [1]:
# Install libraries for metrics and data handling
%pip install pandas scikit-learn jsonlines rouge-score bert-score transformers torch

import pandas as pd
import json
import re
import torch
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics import precision_recall_fscore_support

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=835dff971b976f7e55ad6636159a9bf3ef3a0e9267a2a9a83d3e6b2668a74623
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: jsonlines, rouge-score, 

In [None]:
# --- Regular Expressions for Parsing Generated Text ---

# Pattern to capture the caption text.
# The caption runs from "Caption:" up to the next section header
# ("Concept descriptions:" or "Concepts:") or to the end of the text.
# The separator in front of a header may be either a real newline or the
# literal two-character sequence backslash + "n" (an escaped newline that was
# left in the generated string); both forms are accepted.
CAPTION_RE = re.compile(
    r"Caption:\s*(.*?)\s*(?:(?:\\n|\n)(?:Concept descriptions:|Concepts:)|$)",
    flags=re.S
)

# Pattern to capture the raw concepts list (everything after "Concepts:")
CONCEPTS_RE = re.compile(
    r"Concepts:\s*(.*)\s*$",
    flags=re.S
)

def extract_caption_and_concepts(text: str) -> pd.Series:
    """
    Parses the model's single-string output to extract the structured caption and concepts.

    Args:
        text: Raw generated string. Any non-string value (None, or the NaN
            float pandas uses for a missing cell) is treated as empty text.

    Returns:
        A Series with two entries:
          - 'caption_extracted': the caption string ("" when no "Caption:" label is found)
          - 'concepts_extracted': list of comma-separated concept strings
            ([] when no "Concepts:" label is found or the list is empty)
    """
    # A missing generation is not necessarily None: NaN is a truthy float, so
    # an `or ""` fallback would still try to call .strip() on it.
    if not isinstance(text, str):
        text = ""
    text = text.strip()

    # 1. Extract Caption (Primary target)
    m_cap = CAPTION_RE.search(text)
    # Clean up the generation, removing the end-of-turn token if present
    caption = m_cap.group(1).strip().replace('<end_of_turn>', '').strip() if m_cap else ""

    # 2. Extract Concepts (Secondary target)
    m_con = CONCEPTS_RE.search(text)
    concepts_str = m_con.group(1).strip().replace('<end_of_turn>', '').strip() if m_con else ""

    # 3. Split raw string into a list of cleaned CUIs, dropping empty fragments
    concepts = [c.strip() for c in concepts_str.split(",") if c.strip()] if concepts_str else []

    return pd.Series({"caption_extracted": caption, "concepts_extracted": concepts})

In [None]:
# Build the ROUGE-1 scorer (stemming enabled) once at module level so the same
# instance is reused for every sample instead of being rebuilt per call.
ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

# Alias for the `score` function imported from bert_score.
# Nothing is initialized or cached by this line: it only binds a second name to
# the function, so whatever setup the function performs happens inside each call.
# NOTE: Requires a GPU for fast execution and downloads the model upon first call.
# NOTE(review): no model name is passed anywhere in this file, so the model is
# whatever bert_score selects for the `lang` argument given at the call site —
# it is not pinned to 'bert-base-uncased'. Confirm the model actually used
# before reporting scores.
BERT_SCORER = score

def calculate_rouge_1_f1(reference: str, candidate: str) -> float:
    """Return the ROUGE-1 F-measure of `candidate` scored against `reference`.

    Yields 0.0 without calling the scorer when either argument is empty
    (or otherwise falsy).
    """
    if reference and candidate:
        # Tokenization is left to the scorer itself.
        rouge_result = ROUGE_SCORER.score(reference, candidate)
        return rouge_result['rouge1'].fmeasure
    return 0.0

def calculate_bertscore_f1(references: list[str], candidates: list[str]) -> float:
    """Return the mean BERTScore F1 over the whole corpus as a Python float.

    Yields 0.0 without calling the scorer when either list is empty.
    """
    if references and candidates:
        # The scorer hands back three tensors (precision, recall, F1); only the
        # F1 tensor is averaged here.
        # lang="en" is appropriate for the ROCO dataset captions.
        _precision, _recall, f1_scores = BERT_SCORER(candidates, references, lang="en", verbose=False)
        return f1_scores.mean().item()
    return 0.0


def _concept_set(value) -> set:
    """Return the string members of one row's concept collection as a set.

    Non-iterable cells (e.g. None, or the NaN float of a missing field) yield
    an empty set; non-string members are ignored.
    """
    try:
        members = iter(value)
    except TypeError:
        return set()
    return {c for c in members if isinstance(c, str)}


def calculate_concept_metrics(df: pd.DataFrame) -> pd.Series:
    """
    Calculates overall Micro F1 (Primary) and Macro F1 (Secondary) for concept extraction.

    Args:
        df: Frame with a 'cui' column (reference concepts) and a
            'concepts_extracted' column (predicted concepts). Each cell is
            iterated for its string members.

    Returns:
        A Series with the keys 'Concept_F1_Primary (Micro)' and
        'Concept_F1_Secondary (Macro)'. Both are 0.0 when no concept string
        occurs in either column.
    """

    # 1. Build each row's actual / predicted concept set once; they are needed
    #    both for the vocabulary and for the per-row indicator vectors.
    actual_sets = [_concept_set(cell) for cell in df['cui']]
    predicted_sets = [_concept_set(cell) for cell in df['concepts_extracted']]

    vocabulary = set()
    for concept_set in actual_sets:
        vocabulary |= concept_set
    for concept_set in predicted_sets:
        vocabulary |= concept_set

    # Sorted so the column order of the indicator vectors is deterministic
    all_unique_concepts = sorted(vocabulary)

    if not all_unique_concepts:
        # Same keys as the normal return below, so callers can index the
        # result the same way on both paths.
        return pd.Series({
            'Concept_F1_Primary (Micro)': 0.0,
            'Concept_F1_Secondary (Macro)': 0.0
        })

    # 2. Create binary presence vectors (y_true and y_pred), one row per sample
    #    and one column per concept in all_unique_concepts
    y_true = [
        [1 if c in actual_concepts else 0 for c in all_unique_concepts]
        for actual_concepts in actual_sets
    ]
    y_pred = [
        [1 if c in predicted_concepts else 0 for c in all_unique_concepts]
        for predicted_concepts in predicted_sets
    ]

    # 3. Calculate metrics using scikit-learn
    # F1 (Primary): Micro-averaged F1 (focuses on overall agreement, common for multi-label)
    _, _, f1_micro, _ = precision_recall_fscore_support(
        y_true, y_pred, average='micro', zero_division=0
    )

    # F1 Secondary: Macro-averaged F1 (focuses on per-concept accuracy, then averages)
    _, _, f1_macro, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0
    )

    return pd.Series({
        'Concept_F1_Primary (Micro)': f1_micro,
        'Concept_F1_Secondary (Macro)': f1_macro
    })

In [None]:
def run_evaluation(file_path: str) -> None:
    """
    Loads results from a JSONL file, computes all requested metrics, and prints the final report.

    The file must provide three columns: 'generation' (parsed with
    extract_caption_and_concepts), 'caption' (reference text for ROUGE-1 and
    BERTScore) and 'cui' (reference concepts for the concept F1 metrics).

    Args:
        file_path: Path to the JSONL results file (one JSON object per line).

    Returns:
        None. A file that cannot be loaded, or that lacks a required column,
        is reported on stdout and the function returns without raising.
    """
    print(f"--- Starting Evaluation for: {file_path} ---")

    # Load data from JSONL
    try:
        results_df = pd.read_json(file_path, lines=True)
    except Exception as e:
        print("\nERROR: Could not load JSONL file. Please check path and format.")
        print(f"Details: {e}")
        return

    print(f"Loaded {len(results_df)} samples.")

    # Fail early with a readable message instead of a KeyError further down
    required_columns = ('generation', 'caption', 'cui')
    missing_columns = [col for col in required_columns if col not in results_df.columns]
    if missing_columns:
        print(f"\nERROR: Results file is missing required column(s): {', '.join(missing_columns)}")
        return

    # Data Preprocessing: Parse the generated text
    print("\nParsing generated text to extract captions and concepts...")
    results_df[['caption_extracted', 'concepts_extracted']] = results_df['generation'].apply(extract_caption_and_concepts)

    # 1. Calculate Caption Generation Metrics

    # ROUGE-1 F1 (per sample, then averaged)
    print("Calculating ROUGE-1 F1...")
    results_df['rouge1_f1'] = results_df.apply(
        lambda row: calculate_rouge_1_f1(row['caption'], row['caption_extracted']),
        axis=1
    )
    average_rouge1_f1 = results_df['rouge1_f1'].mean()

    # BERTScore F1 (computed over the whole corpus in one call)
    print("Calculating BERTScore F1 (may take a minute to load model)...")
    references = results_df['caption'].tolist()
    candidates = results_df['caption_extracted'].tolist()

    average_bertscore_f1 = calculate_bertscore_f1(references, candidates)

    # 2. Calculate Concept Extraction Metrics
    print("Calculating Concept F1 Metrics (Micro and Macro)...")
    concept_metrics = calculate_concept_metrics(results_df)

    # --- Final Report ---
    print("\n" + "="*70)
    print("                MODEL PERFORMANCE EVALUATION REPORT")
    print("="*70)

    print("\n[CAPTIONING METRICS]")
    print(f"  > ROUGE-1 F1:     {average_rouge1_f1:.4f}")
    print(f"  > BERTScore F1:   {average_bertscore_f1:.4f}")

    print("\n[CONCEPT DETECTION METRICS]")
    print(f"  > F1 (Primary / Micro):  {concept_metrics['Concept_F1_Primary (Micro)']:.4f}")
    print(f"  > F1 (Secondary / Macro): {concept_metrics['Concept_F1_Secondary (Macro)']:.4f}")

    print("\n" + "="*70)
    print("Evaluation Complete.")
    print("="*70)

    # Optional: Display a few sample rows with scores
    print("\nSample Predictions vs. Actuals:")
    sample_rows = results_df[['caption', 'caption_extracted', 'rouge1_f1']].head()
    try:
        sample_table = sample_rows.to_markdown(index=False, numalign="left")
    except ImportError:
        # to_markdown relies on the optional 'tabulate' package, which the
        # install cell does not request; fall back to a plain-text table so the
        # finished report is not lost on its last line.
        sample_table = sample_rows.to_string(index=False)
    print(sample_table)


# --- USER INPUT SECTION ---

# TODO: Replace the placeholder path below with the actual path to your JSONL results file
# generated by the scoring notebook (e.g., 'Score_Results/4Bit_Qunat_Gemma_...jsonl').

# RESULTS_FILE_PATH = "path/to/your/results.jsonl"

# To run the evaluation, uncomment the line below and replace with your file path:
# run_evaluation(RESULTS_FILE_PATH)