In [29]:
from pathlib import Path

import evaluate
import numpy as np
import pandas as pd
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import single_meteor_score
from rouge import Rouge
from tabulate import tabulate

In [30]:
# Model identifier. It is interpolated into both the predictions CSV path
# (../Output/predictions_<MODEL_NAME>.csv) and the metrics output path
# (../Results/metrics_<MODEL_NAME>) used by the evaluation cell.
MODEL_NAME = "gemma"

In [31]:
class EvaluationMetrics:
    """Average text-generation metrics over paired predictions and references.

    Every ``compute_*`` method scores each (prediction, reference) pair and
    returns a dict that maps a metric label to the mean over all pairs.

    Scale note: the ROUGE values are multiplied by 100, whereas the METEOR,
    BLEU-4 and BERTScore values are returned without any rescaling.
    """

    def __init__(self, predictions, references):
        """Store the paired texts.

        Args:
            predictions: Iterable of generated strings.
            references: Iterable of target strings, aligned by position with
                ``predictions``.

        Raises:
            ValueError: If the two inputs do not contain the same number of
                items.
        """
        # Materialise both inputs: each metric walks them again, so a one-shot
        # iterable would otherwise be exhausted after the first metric.
        self.predictions = list(predictions)
        self.references = list(references)

        # zip() would silently truncate to the shorter input and report
        # averages over a subset of the data, so reject mismatches up front.
        if len(self.predictions) != len(self.references):
            raise ValueError(
                "predictions and references must have the same length "
                f"(got {len(self.predictions)} and {len(self.references)})"
            )

    def compute_rouge_score(self):
        """Return mean ROUGE-1, ROUGE-2 and ROUGE-L F1, each multiplied by 100.

        A pair for which the ROUGE scorer raises ``ValueError`` (it does so
        when the hypothesis or the reference contains no sentence content) is
        counted as 0 for all three metrics instead of aborting the whole run.

        Returns:
            dict with the keys ``"ROUGE-1 F1"``, ``"ROUGE-2 F1"`` and
            ``"ROUGE-L F1"``.
        """
        rouge = Rouge()
        metric_keys = ("rouge-1", "rouge-2", "rouge-l")
        f1_by_metric = {key: [] for key in metric_keys}

        for prediction, reference in zip(self.predictions, self.references):
            try:
                scores = rouge.get_scores(prediction, reference)[0]
            except ValueError:
                # Nothing to overlap with: score the pair as zero.
                for key in metric_keys:
                    f1_by_metric[key].append(0.0)
                continue

            for key in metric_keys:
                f1_by_metric[key].append(scores[key]["f"])

        results = {
            "ROUGE-1 F1": np.mean(f1_by_metric["rouge-1"]) * 100,
            "ROUGE-2 F1": np.mean(f1_by_metric["rouge-2"]) * 100,
            "ROUGE-L F1": np.mean(f1_by_metric["rouge-l"]) * 100
        }
        return results

    def compute_meteor_score(self):
        """Return the mean per-pair METEOR score under the key ``"METEOR"``.

        The metric is loaded through ``evaluate.load('meteor')`` and invoked
        once per pair; the per-pair values are then averaged.
        """
        meteor = evaluate.load('meteor')
        scores = [
            meteor.compute(predictions=[pred], references=[ref])["meteor"]
            for pred, ref in zip(self.predictions, self.references)
        ]
        return {"METEOR": np.mean(scores)}

    def compute_bleu_scores(self):
        """Return the mean sentence-level BLEU-4 under the key ``"BLEU-4"``.

        Texts are tokenised by whitespace (``str.split``), the four n-gram
        orders are weighted equally, and NLTK's smoothing ``method4`` is
        applied to every pair.
        """
        smoothie = SmoothingFunction().method4
        bleu_4 = [
            sentence_bleu(
                [ref.split()],
                pred.split(),
                weights=(0.25, 0.25, 0.25, 0.25),
                smoothing_function=smoothie,
            )
            for pred, ref in zip(self.predictions, self.references)
        ]
        return {"BLEU-4": np.mean(bleu_4)}

    def compute_bertscore(self, lang="en"):
        """Return the mean BERTScore F1 under the key ``"BERTScore F1"``.

        Args:
            lang: Language code forwarded to ``bert_score``.
        """
        # Only F1 is reported; precision and recall are discarded.
        _, _, F1 = bert_score(self.predictions, self.references, lang=lang)
        return {"BERTScore F1": F1.mean().item()}

In [32]:
# Load the predictions CSV for the configured model
df = pd.read_csv(f"../Output/predictions_{MODEL_NAME}.csv")
# Drop rows with NaNs in output or reference
df = df.dropna(subset=["output", "reference"])

# Fail fast, before any metric model is loaded, if there is nothing to score.
if df.empty:
    raise ValueError(
        f"No rows with both 'output' and 'reference' in predictions_{MODEL_NAME}.csv"
    )

# Get predictions and references as plain Python strings
predictions = df["output"].astype(str).tolist()
references = df["reference"].astype(str).tolist()

# Run evaluations
evaluator = EvaluationMetrics(predictions, references)
rouge_scores = evaluator.compute_rouge_score()
meteor_score = evaluator.compute_meteor_score()
bleu_score = evaluator.compute_bleu_scores()
bertscore = evaluator.compute_bertscore(lang="en")

# One [label, value] row per metric, in the order the metrics were computed.
# Each result dict is already keyed by its display label.
summary_table = [
    [label, value]
    for metric_result in (rouge_scores, meteor_score, bleu_score, bertscore)
    for label, value in metric_result.items()
]

table_str = tabulate(summary_table, headers=["Metric", "Score"], floatfmt=".4f", tablefmt="fancy_grid")
print(table_str)

# Create the results directory if needed so the write cannot fail with
# FileNotFoundError after all the scoring work has been done.
results_path = Path(f'../Results/metrics_{MODEL_NAME}')
results_path.parent.mkdir(parents=True, exist_ok=True)
with open(results_path, 'w', encoding='utf-8') as f:
    f.write(table_str)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


╒══════════════╤═════════╕
│ Metric       │   Score │
╞══════════════╪═════════╡
│ ROUGE-1 F1   │ 27.1679 │
├──────────────┼─────────┤
│ ROUGE-2 F1   │  8.5591 │
├──────────────┼─────────┤
│ ROUGE-L F1   │ 24.4605 │
├──────────────┼─────────┤
│ METEOR       │  0.3449 │
├──────────────┼─────────┤
│ BLEU-4       │  0.0441 │
├──────────────┼─────────┤
│ BERTScore F1 │  0.8741 │
╘══════════════╧═════════╛
