In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration
from tqdm import tqdm
import pandas as pd
import numpy as np
import evaluate
import textstat

from src.Case_Builder import (device,
                              bert_version,
                              bert_model_name,
                              genai_version,
                              genai_model_name,
                              prompt_strategy_used,
                              dataset_name,
                              massage_strategy
                              )

In [2]:
results = pd.read_csv(f'results/{genai_version}_{massage_strategy}_summaries_{bert_version}_{dataset_name}_{prompt_strategy_used}.csv')

In [3]:
results

Unnamed: 0,reference,prediction
0,Imagine a gymnastics competition in which part...,This study looked at how the brain controls c...
1,"To grow and multiply , a living cell must take...",Scientists have discovered that there are two...
2,Our sense of number is thought to have emerged...,"In our study, we found evidence suggesting th..."
3,"When an embryo is developing , stem cells must...",This study found that a type of molecule call...
4,Neurons communicate with one another at juncti...,This study focused on understanding how synap...
...,...,...
236,A stem cell is a special cell that divides to ...,Follicle Stem Cells (FSC) are important for m...
237,If all the DNA contained within a single human...,Scientists have found that there is a protein...
238,Hybrids arise when two populations of organism...,Transposable Elements (TE) are small pieces o...
239,"As an animal embryo develops , specific genes ...",This study examines how the Sonic Hedgehog ge...


In [4]:
len(results)

241

In [5]:
# ROUGE and BERTScore
rouge = evaluate.load('rouge')
bertscore = evaluate.load("bertscore")

# Load a pre-trained BART model for BARTScore
bart_model_name = "facebook/bart-large-cnn"
bart_tokenizer = BartTokenizer.from_pretrained(bart_model_name)
bart_model = BartForConditionalGeneration.from_pretrained(bart_model_name)

In [None]:
batch_size = 10

# Compute ROUGE metrics
print("ROUGE Metrics Calculater:")
rouge_results = rouge.compute(
    predictions=results['prediction'],
    references=results['reference'],
    use_aggregator=True,
    use_stemmer=True,
)
# Compute BERTScore
print("BERTScore Calculater:")
bertscore_results = {
    "precision": [],
    "recall": [],
    "f1": [],
}
for idx in tqdm(range(0, len(results), batch_size)):
    str_idx = idx
    end_idx = idx + batch_size
    tmp_bertscore_results = bertscore.compute(
        predictions=results['prediction'][str_idx:end_idx].to_list(),
        references=results['reference'][str_idx:end_idx].to_list(),
        model_type="microsoft/deberta-xlarge-mnli",
    )
    bertscore_results["precision"].extend(tmp_bertscore_results["precision"])
    bertscore_results["recall"].extend(tmp_bertscore_results["recall"])
    bertscore_results["f1"].extend(tmp_bertscore_results["f1"])

# Compute FKGL and DCRS for Readability
print("FKGL Metrics Calculater:")
fkgl_scores = [textstat.flesch_kincaid_grade(p) for p in results['prediction'].to_list()]
print("DCRS Metrics Calculater:")
dcrs_scores = [textstat.dale_chall_readability_score(p) for p in results['prediction'].to_list()]

# Compute BARTScore for Factuality
def compute_bart_score(predictions, references):
    bart_scores = []
    for pred, ref in zip(predictions, references):
        inputs = bart_tokenizer(ref, return_tensors="pt", truncation=True, max_length=1024)
        outputs = bart_tokenizer(pred, return_tensors="pt", truncation=True, max_length=1024)
        ref_to_pred_score = bart_model(**inputs, labels=outputs["input_ids"]).loss.item()
        pred_to_ref_score = bart_model(**outputs, labels=inputs["input_ids"]).loss.item()
        bart_scores.append((ref_to_pred_score + pred_to_ref_score) / 2)
    return bart_scores

print("BARTScore Calculater:")
bart_scores = {
    "bart_scores": [],
}
for idx in tqdm(range(0, len(results), batch_size)):
    str_idx = idx
    end_idx = idx + batch_size
    tmp_bart_scores = compute_bart_score(results['prediction'][str_idx:end_idx].to_list(), results['reference'][str_idx:end_idx].to_list())
    bart_scores["bart_scores"].extend(tmp_bart_scores)

ROUGE Metrics Calculater:


In [None]:
# Combine all results
final_results = {
    "ROUGE1": [rouge_results['rouge1']],
    "ROUGE2": [rouge_results['rouge2']],
    "ROUGEL": [rouge_results['rougeL']],
    "BERTScore_Precision": [np.average(bertscore_results["precision"])],
    "BERTScore_Recall": [np.average(bertscore_results["recall"])],
    "BERTScore_F1": [np.average(bertscore_results["f1"])],
    "FKGL": [np.average(fkgl_scores)],
    "DCRS": [np.average(dcrs_scores)],
    "BARTScore": [np.average(bart_scores["bart_scores"])],
}

In [None]:
for key, value in final_results.items():
    print(f"{key}: {value}")

In [None]:
result_df = pd.DataFrame(final_results)
result_df.to_csv(f'results/{genai_version}_{massage_strategy}_results_{bert_version}_{dataset_name}_{prompt_strategy_used}.csv', index=False)