In [5]:
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge
from bert_score import score as bert_score
import numpy as np
import pandas as pd
import evaluate

In [8]:
class EvaluationMetrics:
    def __init__(self, predictions, references):
        self.predictions = predictions
        self.references = references

    def compute_rouge_score(self):
        rouge = Rouge()
        rouge_l_f1, rouge_l_recall, rouge_l_precision = [], [], []
        rouge_1_f1, rouge_1_recall, rouge_1_precision = [], [], []
        rouge_2_f1, rouge_2_recall, rouge_2_precision = [], [], []
        
        for prediction, reference in zip(self.predictions, self.references):
            scores = rouge.get_scores(prediction, reference)[0]
            
            rouge_l_f1.append(scores["rouge-l"]["f"])
            rouge_l_recall.append(scores["rouge-l"]["r"])
            rouge_l_precision.append(scores["rouge-l"]["p"])
            
            rouge_1_f1.append(scores["rouge-1"]["f"])
            rouge_1_recall.append(scores["rouge-1"]["r"])
            rouge_1_precision.append(scores["rouge-1"]["p"])
            
            rouge_2_f1.append(scores["rouge-2"]["f"])
            rouge_2_recall.append(scores["rouge-2"]["r"])
            rouge_2_precision.append(scores["rouge-2"]["p"])

        results = {
            "rouge_l": {
                "f1": np.mean(rouge_l_f1) * 100 ,
                "recall": np.mean(rouge_l_recall) * 100,
                "precision": np.mean(rouge_l_precision) * 100
            },
            "rouge_1": {
                "f1": np.mean(rouge_1_f1) * 100,
                "recall": np.mean(rouge_1_recall) * 100,
                "precision": np.mean(rouge_1_precision) * 100
            },
            "rouge_2": {
                "f1": np.mean(rouge_2_f1) * 100,
                "recall": np.mean(rouge_2_recall) * 100,
                "precision": np.mean(rouge_2_precision) * 100
            }
        }
        return results

    def compute_meteor_score(self):
        # Using evaluate instead of load_metric
        meteor = evaluate.load('meteor')
        scores = []
        for prediction, reference in zip(self.predictions, self.references):
            score = meteor.compute(predictions=[prediction], references=[reference])
            scores.append(score["meteor"])

        average_meteor_score = np.mean(scores)
        return {"meteor": average_meteor_score}
    
    def compute_bleu_scores(self):
        bleu_1, bleu_2, bleu_3, bleu_4 = [], [], [], []
        smoothie = SmoothingFunction().method4
        
        for prediction, reference in zip(self.predictions, self.references):
            reference = [reference.split()]
            prediction = prediction.split()
            
            bleu_1.append(sentence_bleu(reference, prediction, weights=(1, 0, 0, 0), smoothing_function=smoothie))
            bleu_2.append(sentence_bleu(reference, prediction, weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie))
            bleu_3.append(sentence_bleu(reference, prediction, weights=(0.33, 0.33, 0.33, 0), smoothing_function=smoothie))
            bleu_4.append(sentence_bleu(reference, prediction, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie))

        results = {
            "bleu_1": np.mean(bleu_1),
            "bleu_2": np.mean(bleu_2),
            "bleu_3": np.mean(bleu_3),
            "bleu_4": np.mean(bleu_4)
        }
        return results
    
    def compute_bertscore(self, lang="en"):
        # Compute BERTScore
        P, R, F1 = bert_score(self.predictions, self.references, lang=lang)
        return {
            "precision": P.mean().item(),
            "recall": R.mean().item(),
            "f1": F1.mean().item()
        }


In [None]:
# Load test data and predictions
with open('../../Dataset/test.json', 'r') as f:
    test_data = json.load(f)
with open('./predictions_gemini.json', 'r') as f:
    output_data = json.load(f)

# Check for length mismatch
if len(test_data) != len(output_data):
    print("LENGTH ERROR")
    exit(1)

predictions, references = [], []
for i in range(len(test_data)):
    predicted_summaries = output_data[i]['summaries']
    reference_summaries = test_data[i]['labelled_summaries']
    for perspective in predicted_summaries.keys():
        if perspective in reference_summaries:
            predictions.append(predicted_summaries[perspective][0])
            references.append(reference_summaries[perspective])

# Evaluate using the EvaluationMetrics class
eval_metrics = EvaluationMetrics(predictions, references)

print("ROUGE scores:", eval_metrics.compute_rouge_score())
print("METEOR score:", eval_metrics.compute_meteor_score())
print("BLEU scores:", eval_metrics.compute_bleu_scores())
print("BERTScore:", eval_metrics.compute_bertscore(lang="en"))

ROUGE scores: {'rouge_l': {'f1': np.float64(28.6694499740343), 'recall': np.float64(41.56357281452255), 'precision': np.float64(25.13207231048138)}, 'rouge_1': {'f1': np.float64(31.574420749000815), 'recall': np.float64(45.5992779821014), 'precision': np.float64(27.66690313739584)}, 'rouge_2': {'f1': np.float64(11.551963701634985), 'recall': np.float64(17.149280106806266), 'precision': np.float64(10.347562875300435)}}


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Aditya\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


METEOR score: {'meteor': np.float64(0.37116963865550623)}
BLEU scores: {'bleu_1': np.float64(0.24042566131539708), 'bleu_2': np.float64(0.1424754647714093), 'bleu_3': np.float64(0.09679226641801822), 'bleu_4': np.float64(0.06592722552612827)}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
