In [None]:
%pip install pandas scikit-learn jsonlines rouge-score bert-score transformers torch

import pandas as pd
import json
import re
import torch
from rouge_score import rouge_scorer
from bert_score import score
from sklearn.metrics import precision_recall_fscore_support

Collecting jsonlines
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=89ce45c899f92893467250f6471c57ac1931dde6b6dbf15dd9a9fe1a2c500ff1
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge-score
Installing collected packages: jsonlines, rouge-score, 

In [None]:
# Captures the text after "Caption:" up to the next section header (or the end
# of the string when no header follows).
#
# The section break is accepted in two forms:
#   * the literal two-character sequence backslash + "n" (what `\\n` means in a
#     raw-string pattern), and
#   * a real newline character.
# Matching only the literal form lets a real-newline-separated generation fall
# through to `$`, which would fold the "Concept descriptions"/"Concepts"
# sections into the captured caption.
CAPTION_RE = re.compile(
    r"Caption:\s*(.*?)\s*(?:(?:\\n|\n)\s*(?:Concept descriptions:|Concepts:)|$)",
    flags=re.S
)

# Captures everything after the first "Concepts:" marker through the end of the
# string (re.S lets `.` span newlines).
CONCEPTS_RE = re.compile(
    r"Concepts:\s*(.*)\s*$",
    flags=re.S
)

def extract_caption_and_concepts(text: str) -> pd.Series:
    """
    Parse one model generation into a caption string and a list of concepts.

    Args:
        text: Raw generation. Anything that is not a non-blank string
            (None, NaN, numbers, whitespace-only text) is treated as an
            empty generation instead of raising.

    Returns:
        pd.Series with two entries:
            "caption_extracted"  -- text captured by CAPTION_RE, with any
                                    '<end_of_turn>' marker removed ("" if no match).
            "concepts_extracted" -- comma-separated items captured by
                                    CONCEPTS_RE, stripped, empties dropped
                                    ([] if no match).
    """
    # NaN is truthy, so `text or ""` does not protect against it; check the
    # type explicitly and short-circuit on empty input.
    if not isinstance(text, str) or not text.strip():
        return pd.Series({"caption_extracted": "", "concepts_extracted": []})

    text = text.strip()

    def _captured(match) -> str:
        # First capture group with surrounding whitespace and the
        # end-of-turn marker removed; "" when the pattern did not match.
        if not match:
            return ""
        return match.group(1).strip().replace('<end_of_turn>', '').strip()

    caption = _captured(CAPTION_RE.search(text))
    concepts_str = _captured(CONCEPTS_RE.search(text))

    # "".split(",") yields [""], which the emptiness filter removes, so no
    # separate empty-string branch is needed.
    concepts = [item.strip() for item in concepts_str.split(",") if item.strip()]

    return pd.Series({"caption_extracted": caption, "concepts_extracted": concepts})

In [None]:
# Shared ROUGE scorer, built once: only the 'rouge1' variant is requested and
# stemming is enabled.
ROUGE_SCORER = rouge_scorer.RougeScorer(['rouge1'], use_stemmer=True)

# Alias for the `score` function imported from bert_score; it is invoked as
# BERT_SCORER(candidates, references, ...) in calculate_bertscore_f1.
BERT_SCORER = score

def calculate_rouge_1_f1(reference: str, candidate: str) -> float:
    """Return the ROUGE-1 F-measure of `candidate` against `reference`.

    Args:
        reference: Ground-truth caption.
        candidate: Generated caption.

    Returns:
        The 'rouge1' fmeasure from ROUGE_SCORER, or 0.0 when either argument
        is empty or is not a string.
    """
    # A missing value such as NaN is truthy, so a plain `not reference` check
    # would let it through to the scorer; reject non-strings explicitly.
    if not isinstance(reference, str) or not isinstance(candidate, str):
        return 0.0
    if not reference or not candidate:
        return 0.0

    return ROUGE_SCORER.score(reference, candidate)['rouge1'].fmeasure

def calculate_bertscore_f1(references: list[str], candidates: list[str]) -> float:
    """Return the mean BERTScore F1 of `candidates` scored against `references`.

    The scorer is skipped and 0.0 is returned when either list is empty.
    """
    if references and candidates:
        # BERT_SCORER yields a (precision, recall, F1) triple; only F1 is used.
        _precision, _recall, f1_scores = BERT_SCORER(candidates, references, lang="en", verbose=False)
        return f1_scores.mean().item()

    return 0.0


def _as_concept_list(value) -> list:
    """Return `value` as a list of concepts; non-collection cells (None, NaN, ...) become []."""
    if isinstance(value, (list, tuple, set, frozenset)):
        return list(value)
    return []


def _concept_set_f1(reference_list: list, candidate_list: list) -> float:
    """Set-based F1 between two concept lists; 0.0 when they share no concept."""
    ref_set = set(reference_list)
    cand_set = set(candidate_list)
    tp = len(ref_set & cand_set)
    if tp == 0:
        # Covers empty reference, empty candidate, and disjoint sets alike.
        return 0.0
    precision = tp / len(cand_set)
    recall = tp / len(ref_set)
    return 2 * (precision * recall) / (precision + recall)


def calculate_concept_metrics(df: pd.DataFrame) -> pd.Series:
    """Compute the mean per-sample concept F1, unfiltered and ground-truth-filtered.

    The per-sample F1 is computed by a private helper in this block (same
    arithmetic as `calculate_sample_f1`) so that this function does not depend
    on a helper that is defined in a later notebook cell.

    Args:
        df: Frame with a 'cui' column (reference concepts per row) and a
            'concepts_extracted' column (predicted concepts per row). Cells
            that are not a list/tuple/set are treated as "no concepts".

    Returns:
        pd.Series with:
            'Concept_F1_Primary'   -- mean F1 over rows using the lists as given.
            'Concept_F1_Secondary' -- mean F1 after restricting both lists to
                                      concepts that appear in any row's 'cui'.
        Both values are NaN for an empty frame. A row whose reference and
        prediction are both empty scores 0.0.
    """
    # Normalise once so the primary and secondary paths treat missing cells
    # the same way.
    references = [_as_concept_list(cell) for cell in df['cui']]
    candidates = [_as_concept_list(cell) for cell in df['concepts_extracted']]

    # Vocabulary of every concept that occurs in the ground truth.
    all_annotated_concepts = set()
    for ref in references:
        all_annotated_concepts.update(ref)

    primary_scores = [
        _concept_set_f1(ref, cand) for ref, cand in zip(references, candidates)
    ]
    secondary_scores = [
        _concept_set_f1(
            [c for c in ref if c in all_annotated_concepts],
            [c for c in cand if c in all_annotated_concepts],
        )
        for ref, cand in zip(references, candidates)
    ]

    return pd.Series({
        'Concept_F1_Primary': pd.Series(primary_scores, dtype=float).mean(),
        'Concept_F1_Secondary': pd.Series(secondary_scores, dtype=float).mean()
    })

In [None]:
def run_evaluation(file_path: str) -> None:
    """Load a JSONL results file, score it, and print an evaluation report.

    Steps: read `file_path` as JSON Lines, parse each 'generation' into a
    caption and concept list, compute per-row ROUGE-1 F1 and corpus BERTScore F1
    against 'caption', compute the concept F1 metrics against 'cui', then print
    the report and the first few rows.

    Args:
        file_path: Path to the JSONL file. It must provide the columns
            'generation', 'caption' and 'cui'.

    Returns:
        None. Load failures and missing columns are reported with a printed
        message followed by an early return rather than an exception.
    """

    print(f"--- Starting Evaluation for: {file_path} ---")
    if not os.path.exists(file_path):
        print(f"\nERROR: The file '{file_path}' does not exist. Please check the path.")
        print(f"Current working directory: {os.getcwd()}")
        return

    try:
        results_df = pd.read_json(file_path, lines=True)
    except pd.errors.EmptyDataError:
        print(f"\nERROR: The file '{file_path}' is empty or contains no valid JSON lines.")
        return
    except ValueError as e:
        print(f"\nERROR: Failed to parse JSONL file '{file_path}'.")
        print(f"Details: {e}")
        print("Please ensure the file is a valid JSON Lines (JSONL) format, where each line is a complete JSON object.")
        return
    except Exception as e:
        print(f"\nERROR: An unexpected error occurred while loading the file '{file_path}'.")
        print(f"Details: {e}")
        return

    print(f"Loaded {len(results_df)} samples.")

    # Check the schema up front so a wrong file is reported in the same style
    # as the load errors above instead of surfacing as a KeyError mid-run.
    required_columns = ('generation', 'caption', 'cui')
    missing_columns = [col for col in required_columns if col not in results_df.columns]
    if missing_columns:
        print(f"\nERROR: The file '{file_path}' is missing required column(s): {missing_columns}.")
        print(f"Available columns: {list(results_df.columns)}")
        return

    print("\nParsing generated text to extract captions and concepts...")
    results_df[['caption_extracted', 'concepts_extracted']] = results_df['generation'].apply(extract_caption_and_concepts)

    print("Calculating ROUGE-1 F1...")
    results_df['rouge1_f1'] = results_df.apply(
        lambda row: calculate_rouge_1_f1(row['caption'], row['caption_extracted']),
        axis=1
    )
    average_rouge1_f1 = results_df['rouge1_f1'].mean()

    print("Calculating BERTScore F1 (may take a minute to load model)...")
    references = results_df['caption'].tolist()
    candidates = results_df['caption_extracted'].tolist()

    if len(references) > 0:
        average_bertscore_f1 = calculate_bertscore_f1(references, candidates)
    else:
        average_bertscore_f1 = 0.0

    print("Calculating Concept F1 Metrics (Primary and Secondary)...")
    concept_metrics = calculate_concept_metrics(results_df)

    print("\n" + "="*70)
    print("MODEL PERFORMANCE EVALUATION REPORT")
    print("="*70)

    print("\n[CAPTIONING METRICS]")
    print(f"  > ROUGE-1 F1:     {average_rouge1_f1:.4f}")
    print(f"  > BERTScore F1:   {average_bertscore_f1:.4f}")

    print("\n[CONCEPT DETECTION METRICS]")
    print(f"  > Primary F1 (Unfiltered Avg):    {concept_metrics['Concept_F1_Primary']:.4f}")
    print(f"  > Secondary F1 (GT-Filtered Avg): {concept_metrics['Concept_F1_Secondary']:.4f}")

    print("\n" + "="*70)
    print("Evaluation Complete.")
    print("="*70)
    print("\nSample Predictions vs. Actuals:")
    print(results_df[['caption', 'caption_extracted', 'rouge1_f1']].head().to_markdown(index=False, numalign="left"))


# --- USER INPUT SECTION ---

# Set RESULTS_FILE_PATH below to the JSONL results file generated by the
# scoring notebook (e.g., 'Score_Results/4Bit_Qunat_Gemma_...jsonl').
# Mount Google Drive in this Colab runtime (force_remount=True remounts even if already mounted).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Change the working directory to the Score_Results folder so the relative RESULTS_FILE_PATH below resolves against it.
import os
os.chdir("/content/drive/MyDrive/DL_Project_2025/Score_Results")

# Autoreload does not work in Google Colab, so importlib.reload can be used to re-import edited modules.
from importlib import reload
RESULTS_FILE_PATH = "4Bit_Qunat_Gemma_YL_validation_data_trained_model.jsonl"

# Run the evaluation on the results file configured above.
run_evaluation(RESULTS_FILE_PATH)

Mounted at /content/drive
--- Starting Evaluation for: 4Bit_Qunat_Gemma_YL_validation_data_trained_model.jsonl ---
Loaded 9920 samples.

Parsing generated text to extract captions and concepts...
Calculating ROUGE-1 F1...
Calculating BERTScore F1 (may take a minute to load model)...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating Concept F1 Metrics (Primary and Secondary)...

                MODEL PERFORMANCE EVALUATION REPORT

[CAPTIONING METRICS]
  > ROUGE-1 F1:     0.1917
  > BERTScore F1:   0.8300

[CONCEPT DETECTION METRICS]
  > Primary F1 (Unfiltered Avg):    0.7613
  > Secondary F1 (GT-Filtered Avg): 0.7627

Evaluation Complete.

Sample Predictions vs. Actuals:
| caption                                                                                                                                                                                                                                                                        | caption_extracted                                                                                                                                                                                                                                                                                                                                                               

In [None]:
# Echo the currently configured results path as the cell output.
RESULTS_FILE_PATH

'./Score_Results/4Bit_Qunat_Gemma_YL_validation_data_trained_model.jsonl'

In [None]:
def calculate_sample_f1(reference_list: list, candidate_list: list) -> float:
    """Return the set-based F1 between reference and candidate concepts.

    Duplicates within either argument are ignored (both are converted to sets).

    Args:
        reference_list: Ground-truth concepts for one sample.
        candidate_list: Predicted concepts for one sample.
        For either argument, None and non-iterable values (e.g. NaN) count as
        "no concepts", and a bare string counts as a single concept rather
        than being split into characters.

    Returns:
        2 * P * R / (P + R), or 0.0 when the two sets share no concept
        (including when one or both are empty).
    """

    def _to_set(concepts) -> set:
        # Normalise one argument to a set of concepts.
        if concepts is None:
            return set()
        if isinstance(concepts, str):
            # set("C12") would yield {'C', '1', '2'}, which is never intended.
            concepts = concepts.strip()
            return {concepts} if concepts else set()
        try:
            items = iter(concepts)
        except TypeError:
            # Non-iterable scalar, e.g. float('nan') from a missing cell.
            return set()
        return set(items)

    ref_set = _to_set(reference_list)
    cand_set = _to_set(candidate_list)

    tp = len(ref_set.intersection(cand_set))

    # With no true positives, precision and recall are both 0 (or undefined
    # for an empty set), so F1 is 0.0; this also guards the divisions below.
    if tp == 0:
        return 0.0

    precision = tp / len(cand_set)
    recall = tp / len(ref_set)

    return 2 * (precision * recall) / (precision + recall)