In [None]:
import pandas as pd

In [None]:
def calculate_accuracy(true_labels, predicted_labels):
    """Return the percentage of predictions that exactly match their true label.

    A prediction that is ``None`` or an empty string is treated as missing and
    never counts as correct, even when the true label is also empty. Other
    falsy values such as ``0`` or ``False`` are valid labels and are compared
    normally.

    Args:
        true_labels: Sequence of ground-truth labels.
        predicted_labels: Sequence of predicted labels, aligned with
            ``true_labels`` by position.

    Returns:
        Accuracy as a float between 0 and 100. Returns 0.0 when both
        sequences are empty.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(true_labels) != len(predicted_labels):
        raise ValueError("Both lists must have the same length.")

    total = len(true_labels)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_count = 0
    for true, predicted in zip(true_labels, predicted_labels):
        # Only None / "" mean "no prediction". A bare truthiness test would
        # also discard legitimate labels such as 0 or False.
        is_missing = predicted is None or (isinstance(predicted, str) and not predicted)
        if not is_missing and true == predicted:
            correct_count += 1

    return (correct_count / total) * 100

# Evaluate the LitQA predictions: read the results file, pull out the gold
# and predicted label columns, and report the accuracy as a percentage.
df = pd.read_csv('litqa.csv')
true_labels = [*df['true_litqa'].values]
predicted_labels = [*df['prediction_litqa'].values]

accuracy = calculate_accuracy(
    true_labels=true_labels,
    predicted_labels=predicted_labels,
)
print(f"Accuracy: {accuracy:.2f}%")


Accuracy: 6.12%


In [None]:
!pip install nltk scikit-learn rouge-score




In [None]:
# Switch to the PubMedQA results: from here on `true_labels` and
# `predicted_labels` hold the columns of this file.
df = pd.read_csv('pubmedqa.csv')
true_labels = [*df['true_pubmedqa'].values]
predicted_labels = [*df['prediction_pubmedqa'].values]

In [None]:
from rouge_score import rouge_scorer

def calculate_total_rouge(true_labels, predicted_labels):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L over the whole corpus at once.

    All true labels are joined with single spaces into one text, all
    predicted labels likewise, and the two texts are scored as a single
    pair with stemming enabled. No per-example scores are computed.

    Args:
        true_labels: Iterable of reference strings.
        predicted_labels: Iterable of predicted strings.

    Returns:
        Whatever ``RougeScorer.score`` returns for the joined texts; in the
        recorded run this is a dict keyed by ROUGE type whose values carry
        ``precision``, ``recall`` and ``fmeasure``.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)

    # Joined true labels go first, joined predictions second.
    reference_text = " ".join(true_labels)
    candidate_text = " ".join(predicted_labels)
    return scorer.score(reference_text, candidate_text)




# Corpus-level ROUGE for the label lists currently loaded.
total_rouge_scores = calculate_total_rouge(
    true_labels=true_labels,
    predicted_labels=predicted_labels,
)
print("Total ROUGE Scores:", total_rouge_scores)


Total ROUGE Scores: {'rouge1': Score(precision=0.6036939313984169, recall=0.6224156692056583, fmeasure=0.6129118671309939), 'rouge2': Score(precision=0.19429778247096094, recall=0.20032661948829614, fmeasure=0.19726614848566068), 'rougeL': Score(precision=0.25488126649076515, recall=0.26278563656147985, fmeasure=0.2587731047414948)}


In [None]:
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def calculate_bleu(true_labels, predicted_labels):
    """Compute a smoothed corpus-level BLEU score with whitespace tokenization.

    Each true label is split on whitespace and used as the single reference
    for the prediction at the same position. A prediction that is empty or
    whitespace-only becomes an empty token list. Items are paired with
    ``zip``, so extra items in the longer input are ignored.

    Args:
        true_labels: Iterable of reference strings.
        predicted_labels: Iterable of predicted strings.

    Returns:
        The value returned by ``corpus_bleu`` using ``SmoothingFunction().method1``.
    """
    pairs = list(zip(true_labels, predicted_labels))

    # corpus_bleu expects a list of references per hypothesis; here there is
    # exactly one reference each.
    references = [[true.split()] for true, _ in pairs]
    hypotheses = [pred.split() if pred.strip() else [] for _, pred in pairs]

    smoother = SmoothingFunction().method1
    return corpus_bleu(references, hypotheses, smoothing_function=smoother)



# Corpus-level BLEU for the label lists currently loaded.
bleu_score = calculate_bleu(
    true_labels=true_labels,
    predicted_labels=predicted_labels,
)
print("BLEU Score:", bleu_score)

BLEU Score: 0.07261468973171632


In [None]:
from sentence_transformers import SentenceTransformer, util
def calculate_llm_similarity(true_labels, predicted_labels, model_name="all-MiniLM-L6-v2"):
    """Score each (true, predicted) pair by cosine similarity of sentence embeddings.

    A ``SentenceTransformer`` is instantiated from ``model_name`` on every
    call. Each pair is encoded together and the similarity of the two
    embeddings is taken. A falsy prediction (for example an empty string)
    is not encoded and scores 0.0. Items are paired with ``zip``, so extra
    items in the longer input are ignored.

    Args:
        true_labels: Iterable of reference strings.
        predicted_labels: Iterable of predicted strings.
        model_name: Model identifier passed to ``SentenceTransformer``.

    Returns:
        A list with one similarity value per pair, in input order.
    """
    model = SentenceTransformer(model_name)

    def pair_similarity(true, pred):
        # No prediction to embed: score it as zero.
        if not pred:
            return 0.0
        embeddings = model.encode([true, pred], convert_to_tensor=True)
        return util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()

    return [
        pair_similarity(true, pred)
        for true, pred in zip(true_labels, predicted_labels)
    ]


# Per-example embedding similarity for the label lists currently loaded.
similarities = calculate_llm_similarity(
    true_labels=true_labels,
    predicted_labels=predicted_labels,
)
print("LLM Similarities:", similarities)


LLM Similarities: [0.5643056035041809, 0.7388991713523865, 0.7989909052848816, 0.5674906373023987, 0.7210288047790527, 0.41824185848236084, 0.6289610266685486, 0.45475807785987854, 0.6026619076728821, 0.6113052368164062, 0.8387893438339233, 0.8231927156448364, 0.8016122579574585, 0.7046035528182983, 0.7683147192001343, 0.6723469495773315, 0.8930250406265259, 0.6395443677902222, 0.6476553678512573, 0.7873544096946716, 0.7277991771697998, 0.5868380069732666, 0.5488913059234619, 0.883202850818634, 0.844327449798584, 0.5609667301177979, 0.5519403219223022, 0.6548464298248291, 0.7802248001098633, 0.8870300054550171, 0.8742786645889282, 0.848362922668457, 0.7818562984466553, 0.7282975912094116, 0.5283779501914978, 0.8335415720939636, 0.8447953462600708, 0.813531756401062, 0.8565278053283691, 0.6714909076690674, 0.8899580240249634, 0.7419842481613159, 0.3599957823753357, 0.7064620852470398, 0.8789749145507812, 0.8487503528594971, 0.4657061994075775, 0.7298774719238281, 0.7202070355415344, 0.7

In [None]:
# Mean similarity across all scored pairs.
sum(similarities) / len(similarities)

0.7110191571712494