In [2]:
import json
import os
import sys
import re
import numpy as np
from rouge_score import rouge_scorer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
# Fetch the NLTK data packages this notebook relies on (tokenizer data and the
# stopword list). Packages that are already present are reported as up-to-date.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/athul/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/athul/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /home/athul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def normalize_text(text, remove_stopwords=False):
    """Return a canonical, space-joined token string for ``text``.

    The input is lowercased, every character other than ``a-z``, ``0-9`` or
    whitespace is replaced by a space, and the result is tokenized with NLTK's
    ``word_tokenize``.

    Args:
        text: The string to normalize.
        remove_stopwords: When true, tokens present in NLTK's English stopword
            list are dropped before joining.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Lowercase first so the character class below only needs to list a-z.
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', text.lower())
    words = word_tokenize(cleaned)
    if not remove_stopwords:
        return ' '.join(words)
    english_stopwords = set(stopwords.words('english'))
    return ' '.join(w for w in words if w not in english_stopwords)

In [4]:
def compute_rouge(reference, hypothesis):
    """Compute mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over all samples.

    Args:
        reference: Mapping from sample key to reference text.
        hypothesis: Mapping from sample key to generated text. Only the keys of
            this mapping are scored; each must also be present in ``reference``.

    Returns:
        A dict with keys ``"Rouge1"``, ``"Rouge2"``, ``"RougeL"`` (the mean
        F-measure of each metric across samples) and ``"Rouge_geo"`` (the
        geometric mean of those three averages). All values are ``0.0`` when
        ``hypothesis`` is empty.

    Raises:
        KeyError: If a hypothesis key is missing from ``reference``.
    """
    # Guard against dividing by zero below; mirrors compute_exact_match, which
    # also reports 0.0 when there is nothing to score.
    if not hypothesis:
        return {"Rouge1": 0.0, "Rouge2": 0.0, "RougeL": 0.0, "Rouge_geo": 0.0}

    metric_names = ["rouge1", "rouge2", "rougeL"]
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)
    scores = {name: [] for name in metric_names}
    for key in hypothesis:
        result = scorer.score(reference[key], hypothesis[key])
        for name in metric_names:
            scores[name].append(result[name].fmeasure)

    Rouge1 = sum(scores["rouge1"]) / len(scores["rouge1"])
    Rouge2 = sum(scores["rouge2"]) / len(scores["rouge2"])
    RougeL = sum(scores["rougeL"]) / len(scores["rougeL"])

    # Geometric mean of the three averaged scores; it is 0.0 if any one is 0.0.
    Rouge_geo = (Rouge1 * Rouge2 * RougeL) ** (1/3)

    return {"Rouge1": Rouge1, "Rouge2": Rouge2, "RougeL": RougeL, "Rouge_geo": Rouge_geo}




In [5]:
def compute_f1(reference, hypothesis):
    """Compute the mean token-overlap F1 score over all samples.

    Each text is split on whitespace and reduced to a set of unique tokens, so
    repeated words count once. The texts are compared as given: no lowercasing,
    punctuation stripping or stopword removal is applied here.
    NOTE(review): if normalized inputs are intended, callers must normalize
    the texts before calling this function.

    Args:
        reference: Mapping from sample key to reference text.
        hypothesis: Mapping from sample key to generated text. Only the keys of
            this mapping are scored; each must also be present in ``reference``.

    Returns:
        ``{"f1_score": mean_f1}``; the mean is ``0.0`` when ``hypothesis`` is
        empty.

    Raises:
        KeyError: If a hypothesis key is missing from ``reference``.
    """
    # Guard against dividing by zero below; mirrors compute_exact_match, which
    # also reports 0.0 when there is nothing to score.
    if not hypothesis:
        return {"f1_score": 0.0}

    scores = []
    for key, hyp_text in hypothesis.items():
        ref_tokens = set(reference[key].split())
        hyp_tokens = set(hyp_text.split())
        num_same = len(ref_tokens & hyp_tokens)
        if num_same == 0:
            # No shared tokens (this also covers an empty side), so F1 is 0
            # and the precision/recall divisions are skipped.
            scores.append(0.0)
            continue
        precision = num_same / len(hyp_tokens)
        recall = num_same / len(ref_tokens)
        scores.append(2 * (precision * recall) / (precision + recall))
    return {"f1_score": sum(scores) / len(scores)}

In [6]:
def compute_exact_match(reference, hypothesis):
    """Compute the fraction of samples whose normalized texts match exactly.

    Both the reference and the hypothesis text of each sample are passed
    through ``normalize_text`` with stopword removal enabled before being
    compared for equality.

    Args:
        reference: Mapping from sample key to reference text.
        hypothesis: Mapping from sample key to generated text. Only the keys of
            this mapping are scored.

    Returns:
        ``{"EM_score": fraction}``; the fraction is ``0.0`` when ``hypothesis``
        is empty.
    """
    def _canonical(text):
        return normalize_text(text, remove_stopwords=True)

    matches = [
        1.0 if _canonical(reference[key]) == _canonical(hyp_text) else 0.0
        for key, hyp_text in hypothesis.items()
    ]
    if not matches:
        return {"EM_score" : 0.0}
    return {"EM_score" : sum(matches) / len(matches)}

In [7]:
def evaluate_scores(reference_file, hypothesis_file, metric):
    """Load reference and hypothesis JSON files and score them with ``metric``.

    Args:
        reference_file: Path to a JSON file holding the reference texts.
        hypothesis_file: Path to a JSON file holding the generated texts.
        metric: One of ``"rouge"``, ``"f1"`` or ``"exact_match"``.

    Returns:
        The result dict produced by the selected ``compute_*`` function.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # Reject unknown metrics before doing any I/O, instead of silently
    # returning a placeholder that would be written out as if it were a score.
    if metric not in ("rouge", "f1", "exact_match"):
        raise ValueError(
            f"Unsupported metric {metric!r}; expected 'rouge', 'f1' or 'exact_match'."
        )

    # Read JSON as UTF-8 explicitly rather than with the platform default.
    with open(reference_file, 'r', encoding='utf-8') as ref_f, \
            open(hypothesis_file, 'r', encoding='utf-8') as hyp_f:
        references = json.load(ref_f)
        hypotheses = json.load(hyp_f)

    if metric == "rouge":
        return compute_rouge(references, hypotheses)
    if metric == "f1":
        return compute_f1(references, hypotheses)
    return compute_exact_match(references, hypotheses)

In [8]:
# Datasets to evaluate and the metric used to score each of them.
datasets = ["gov_report", "summ_screen_fd", "qmsum", "qasper","narrative_qa", "quality"]
# Other models that have been evaluated with this cell:
# ["Qwen/Qwen2.5-1.5B-Instruct","MBZUAI/LaMini-GPT-1.5B","instruction-pretrain/InstructLM-1.3B","nvidia/AceInstruct-1.5B","/assets/models/meta-llama-3.2-instruct-3b"]
model_names = ["/assets/models/meta-llama-3.2-instruct-3b"]
# Directory holding both the reference outputs and the per-model generations.
ipynb = "generations/ipynb"
metrics = {"gov_report":"rouge", "summ_screen_fd":"rouge", "qmsum":"rouge", "qasper":"f1","narrative_qa":"f1", "quality":"exact_match"}

for model in model_names:
    # Model identifiers contain "/" and "-"; flatten them into one directory name.
    model_dir = model.replace("/", "_").replace("-", "_")
    output_file = os.path.join("Evaluation_results", model_dir, "results.json")
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    results = {}
    for dataset in datasets:
        reference_file = os.path.join(ipynb, "Input_output_json", f"output_{dataset}.json")
        hypothesis_file = os.path.join(ipynb, model_dir, f"{dataset}.json")
        results[dataset] = evaluate_scores(reference_file, hypothesis_file, metrics[dataset])

        # Rewrite the results file after every dataset so scores computed so
        # far are kept on disk even if a later dataset fails.
        with open(output_file, 'w') as out_f:
            json.dump(results, out_f, indent=4)
        print(model, dataset)
        

/assets/models/meta-llama-3.2-instruct-3b gov_report
/assets/models/meta-llama-3.2-instruct-3b summ_screen_fd
/assets/models/meta-llama-3.2-instruct-3b qmsum
/assets/models/meta-llama-3.2-instruct-3b qasper
/assets/models/meta-llama-3.2-instruct-3b narrative_qa
/assets/models/meta-llama-3.2-instruct-3b quality
