In [1]:
import pandas as pd
import evaluate
from bert_score import score as bertscore
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [2]:
# Load the generated summaries and the matched reference summaries.
llama_df = pd.read_csv("llama3_sample_summaries.csv")
ref_df = pd.read_csv("llama3_bhc_matched_reference_summaries.csv")

# Join the two tables on note_id (default inner join), so only notes present
# in both files are kept.
df = llama_df.merge(ref_df, on="note_id")

# Missing texts become empty strings so every metric below receives a str.
generated = df["summary"].fillna("").tolist()
reference = df["target"].fillna("").tolist()

In [3]:
### 1. ROUGE
# Load the "rouge" metric through the `evaluate` library and score every
# generated summary against its reference in a single call.
rouge = evaluate.load("rouge")
rouge_result = rouge.compute(
    predictions=generated,
    references=reference,
)

# Print one line per entry of the result dict, metric name upper-cased.
print("\n📊 ROUGE Scores:")
for metric_name, metric_value in rouge_result.items():
    print(f"{metric_name.upper()}: {metric_value:.4f}")


📊 ROUGE Scores:
ROUGE1: 0.2947
ROUGE2: 0.1072
ROUGEL: 0.1772
ROUGELSUM: 0.1828


In [4]:
### 2. BLEU (mean of per-row sentence-level scores)
# Each reference/prediction pair is split on whitespace and scored with
# sentence_bleu, using NLTK's SmoothingFunction.method4 for smoothing.
smoothie = SmoothingFunction().method4
bleu_scores = [
    sentence_bleu([ref.split()], pred.split(), smoothing_function=smoothie)
    for ref, pred in zip(reference, generated)
]

# Guard the mean: with zero rows the plain division would raise
# ZeroDivisionError, so report NaN instead.
bleu_avg = sum(bleu_scores) / len(bleu_scores) if bleu_scores else float("nan")

# Report the number of rows actually scored instead of a hard-coded count, so
# the message stays correct when the input files change.
print(f"\n📊 BLEU Score (average over {len(bleu_scores)} rows):")
print(f"BLEU: {bleu_avg:.4f}")


📊 BLEU Score (average over 1000 rows):
BLEU: 0.0312


In [5]:
### 3. BERTScore
# bertscore returns three values (bound here to P, R, F1); each is reduced to
# a single Python number with .mean().item().
P, R, F1 = bertscore(generated, reference, lang="en", verbose=True)
bert_avg = {
    metric_key: metric_values.mean().item()
    for metric_key, metric_values in zip(("precision", "recall", "f1"), (P, R, F1))
}

# Show the three averages, each formatted to four decimal places.
print("\n📊 BERTScore:")
print(f"Precision: {bert_avg['precision']:.4f}")
print(f"Recall:    {bert_avg['recall']:.4f}")
print(f"F1 Score:  {bert_avg['f1']:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 25.51 seconds, 39.21 sentences/sec

📊 BERTScore:
Precision: 0.8577
Recall:    0.8168
F1 Score:  0.8365
