In [2]:
%pip install bert-score rouge-score sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
%pip install hf_xet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
from typing import List
from bert_score import score as bert_score
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer, util


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Sentence-embedding checkpoint whose encodings feed the cosine-similarity metric.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(EMBEDDING_MODEL_NAME)


In [None]:
def evaluate_predictions(preds: List[str], refs: List[str]) -> dict:
    """Score predictions against references with three text-similarity metrics.

    Args:
        preds: Predicted texts.
        refs: Reference texts; ``refs[i]`` is the reference for ``preds[i]``.

    Returns:
        Dict with the keys ``"BERTScore F1"``, ``"ROUGE-L"`` and
        ``"Cosine Similarity"``; each value is the mean over all pairs,
        rounded to four decimals.

    Raises:
        ValueError: If ``preds`` and ``refs`` differ in length or are empty.

    Note:
        Sentence embeddings are produced by the notebook-level ``model``.
    """
    # Fail early with a clear message instead of a ZeroDivisionError or an
    # opaque error from inside one of the scoring libraries.
    if len(preds) != len(refs):
        raise ValueError(
            f"preds and refs must have the same length, got {len(preds)} and {len(refs)}"
        )
    if not preds:
        raise ValueError("preds and refs must not be empty")

    # bert_score returns three tensors; only the third (F1) is reported.
    _, _, F1 = bert_score(preds, refs, lang="en", verbose=False)
    bert_f1_avg = F1.mean().item()

    # RougeScorer.score takes (target, prediction), hence reference first.
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_scores = [rouge.score(r, p)['rougeL'].fmeasure for r, p in zip(refs, preds)]
    rouge_l_avg = sum(rouge_l_scores) / len(rouge_l_scores)

    emb_pred = model.encode(preds, convert_to_tensor=True)
    emb_ref = model.encode(refs, convert_to_tensor=True)
    # Only matched (pred[i], ref[i]) pairs are needed, so compute them
    # directly rather than building the full N x N similarity matrix.
    cosine_avg = util.pairwise_cos_sim(emb_pred, emb_ref).mean().item()

    return {
        "BERTScore F1": round(bert_f1_avg, 4),
        "ROUGE-L": round(rouge_l_avg, 4),
        "Cosine Similarity": round(cosine_avg, 4)
    }


In [None]:
# Prediction CSV to evaluate for each method, keyed by the label used as the
# row name in the metrics table. The paths are blank placeholders here; each
# CSV is read with pandas and must provide "predicted" and "ground_truth"
# columns.
file_paths = {
    "RAG Keywords": "",
    "RAG Embeddings": "",
    "LLM": ""
}

In [10]:
# Score each method's prediction file and keep one metrics dict per method.
results = {}
for name, path in file_paths.items():
    df = pd.read_csv(path)
    # Cast both text columns to str so every cell reaches the scorers as a string.
    preds, refs = (
        df[column].astype(str).tolist()
        for column in ("predicted", "ground_truth")
    )
    results[name] = evaluate_predictions(preds, refs)

# One row per method, one column per metric; the bare last expression
# renders the table as the cell output.
metrics_table = pd.DataFrame.from_dict(results, orient="index").rename_axis("Method")
metrics_table


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0_level_0,BERTScore F1,ROUGE-L,Cosine Similarity
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
RAG Keywords,0.8738,0.2269,0.7136
RAG Embeddings,0.8745,0.2336,0.7513
LLM,0.8695,0.2103,0.7078


In [None]:
# Per-subject accuracy: join predictions to question metadata on "id" and
# average exact-match correctness within each subject.
results_path = ''   # predictions CSV; columns used: "id", "predicted", "gold"
metadata_path = ''  # JSON Lines metadata; columns used: "id", "subject_name", "topic_name"

results_df = pd.read_csv(results_path)

meta_df = pd.read_json(metadata_path, lines=True)

# A left join keeps every prediction row. `validate` makes pandas raise a
# MergeError if an id appears more than once in the metadata, rather than
# silently duplicating prediction rows and skewing the per-subject means.
merged_df = results_df.merge(
    meta_df[['id', 'subject_name', 'topic_name']],
    on="id",
    how="left",
    validate="many_to_one",
)

# Vectorised exact-match flag, averaged per subject. This avoids running a
# Python lambda over every group frame. Rows whose subject is missing after
# the join are excluded by groupby's default handling of NaN keys.
is_correct = merged_df["predicted"] == merged_df["gold"]
subject_acc = (
    is_correct.groupby(merged_df["subject_name"])
    .mean()
    .sort_values(ascending=False)
)

subject_acc.to_csv("accuracy_subject_.csv")