# Data extraction evaluation: ROUGE and BERTScore

## ROUGE 
Reference: https://thepythoncode.com/article/calculate-rouge-score-in-python

In [1]:
import pandas as pd
import statistics
from rouge_score import rouge_scorer

# Shared ROUGE scorer, restricted to the unigram-overlap variant (ROUGE-1).
scorer = rouge_scorer.RougeScorer(rouge_types=['rouge1'])

In [2]:
# Identifiers of the reference studies; they are interpolated into the
# ground-truth and LLM-extraction file names when the data is loaded.
reference_studies = [
    'Liu_Shao_2024',
    'Safdar_Siddique_Khan_2024',
    'Shahzad_Khan_2023',
]

# Names of the models under evaluation; each one is also the name of the
# directory holding that model's extraction files.
models = [
    'mistral-small-2503',
    'gpt-4o-mini',
]

In [3]:
def mean_rouge1(ground_truth, llm_extraction, data_element):
    """Average ROUGE-1 precision, recall and F-measure of one data element.

    For every distinct value of ``ground_truth['study']`` the first matching
    row of each frame is taken, both cells are converted with
    ``str(...).lower()`` and scored with the notebook-level ``scorer``
    (reference first, candidate second).

    Parameters
    ----------
    ground_truth : pd.DataFrame
        Reference extraction; must contain 'study' and ``data_element``.
    llm_extraction : pd.DataFrame
        Model extraction; must contain 'study' and ``data_element``.
    data_element : str
        Name of the column to compare.

    Returns
    -------
    tuple of float
        ``(precision, recall, fmeasure)`` averaged over the distinct studies.

    Raises
    ------
    ValueError
        If the ground truth lists no study, or if a study of the ground truth
        has no row in ``llm_extraction``.
    """
    studies = ground_truth['study'].unique()
    if len(studies) == 0:
        raise ValueError(f"no study in the ground truth for data element {data_element!r}")

    precision, recall, fmeasure = 0.0, 0.0, 0.0
    for study in studies:
        reference_cells = ground_truth.loc[ground_truth['study'] == study, data_element].tolist()
        candidate_cells = llm_extraction.loc[llm_extraction['study'] == study, data_element].tolist()
        if not candidate_cells:
            # Fail with context instead of a bare "list index out of range".
            raise ValueError(
                f"study {study!r} has no row in the LLM extraction "
                f"(data element {data_element!r})"
            )

        # Only the first row of each study is compared.
        reference = str(reference_cells[0]).lower()
        candidate = str(candidate_cells[0]).lower()

        rouge1 = scorer.score(reference, candidate)['rouge1']
        precision += rouge1.precision
        recall += rouge1.recall
        fmeasure += rouge1.fmeasure

    # Average over the studies that were actually scored. Dividing by the
    # number of rows would under-estimate the scores whenever a study appears
    # on more than one row of the ground truth.
    n_studies = len(studies)
    return precision / n_studies, recall / n_studies, fmeasure / n_studies


# One record per (model, reference study, data element).
records = []
for model in models:
    for reference_study in reference_studies:
        ground_truth = pd.read_excel(f'../ground_truth/test_set/{reference_study.lower()}.xlsx')
        # The study identifier column is named 'Study' or 'Authors' depending on the file.
        ground_truth = ground_truth.rename(columns={'Study': 'study', 'Authors': 'study'})

        llm_extraction = pd.read_csv(f'../llm-based_extraction/{model}/specific_extraction_{reference_study}.csv')
        # Align the LLM column names with the ground-truth headers.
        llm_extraction = llm_extraction.rename(columns={'Method': 'Methodology'})
        llm_extraction = llm_extraction.rename(columns={"Relation between Motivation and Librarians' PD": "Relation between motivation and librarian's PD"})
        # Keep only the columns that also exist in the ground truth.
        llm_extraction = llm_extraction[[x for x in llm_extraction.columns if x in ground_truth.columns]]

        # NOTE(review): skips the first column, presumably the study identifier - confirm.
        for data_element in ground_truth.columns[1:]:
            precision, recall, fmeasure = mean_rouge1(ground_truth, llm_extraction, data_element)

            # Append to the results table.
            records.append({
                'model' : model,
                'reference_study' : reference_study,
                'data_element' : data_element,
                'ROUGE-precision' : precision,
                'ROUGE-recall' : recall,
                'ROUGE-Fmeasure' : fmeasure
            })

results = pd.DataFrame(records)

In [4]:
# Keep only the rows scored for the Mistral model, then display them.
is_mistral = results['model'] == 'mistral-small-2503'
mistral = results.loc[is_mistral]
mistral

Unnamed: 0,model,reference_study,data_element,ROUGE-precision,ROUGE-recall,ROUGE-Fmeasure
0,mistral-small-2503,Liu_Shao_2024,Analysis method,0.416667,0.416667,0.416667
1,mistral-small-2503,Liu_Shao_2024,Data sources,0.201691,0.311111,0.217923
2,mistral-small-2503,Liu_Shao_2024,Participants,0.165562,0.169222,0.148856
3,mistral-small-2503,Liu_Shao_2024,Participant size,0.333333,0.333333,0.333333
4,mistral-small-2503,Safdar_Siddique_Khan_2024,Country,0.684524,0.785714,0.707143
5,mistral-small-2503,Safdar_Siddique_Khan_2024,Population,0.163896,0.303571,0.18272
6,mistral-small-2503,Safdar_Siddique_Khan_2024,Sample size and tech,0.481037,0.573129,0.513872
7,mistral-small-2503,Safdar_Siddique_Khan_2024,Tool,0.267857,0.357143,0.290476
8,mistral-small-2503,Safdar_Siddique_Khan_2024,Methodology,0.571429,0.571429,0.571429
9,mistral-small-2503,Shahzad_Khan_2023,Country,0.8,0.8,0.8


In [5]:
# Mean ROUGE-1 precision over all (reference study, data element) rows for Mistral.
mistral.loc[:, 'ROUGE-precision'].mean()

np.float64(0.3674697463689777)

In [6]:
# Mean ROUGE-1 recall over all (reference study, data element) rows for Mistral.
mistral.loc[:, 'ROUGE-recall'].mean()

np.float64(0.425989248623261)

In [7]:
# Mean ROUGE-1 F-measure over all (reference study, data element) rows for Mistral.
mistral.loc[:, 'ROUGE-Fmeasure'].mean()

np.float64(0.3763849145417449)

In [8]:
# Keep only the rows scored for the GPT model, then display them.
is_gpt = results['model'] == 'gpt-4o-mini'
gpt = results.loc[is_gpt]
gpt

Unnamed: 0,model,reference_study,data_element,ROUGE-precision,ROUGE-recall,ROUGE-Fmeasure
13,gpt-4o-mini,Liu_Shao_2024,Analysis method,0.5,0.5,0.5
14,gpt-4o-mini,Liu_Shao_2024,Data sources,0.099839,0.227778,0.136905
15,gpt-4o-mini,Liu_Shao_2024,Participants,0.152844,0.203944,0.167659
16,gpt-4o-mini,Liu_Shao_2024,Participant size,0.5,0.5,0.5
17,gpt-4o-mini,Safdar_Siddique_Khan_2024,Country,0.738095,0.785714,0.75
18,gpt-4o-mini,Safdar_Siddique_Khan_2024,Population,0.068651,0.321429,0.111322
19,gpt-4o-mini,Safdar_Siddique_Khan_2024,Sample size and tech,0.377551,0.445578,0.401361
20,gpt-4o-mini,Safdar_Siddique_Khan_2024,Tool,0.077381,0.238095,0.112472
21,gpt-4o-mini,Safdar_Siddique_Khan_2024,Methodology,0.392857,0.428571,0.404762
22,gpt-4o-mini,Shahzad_Khan_2023,Country,0.8,0.8,0.8


In [9]:
# Mean ROUGE-1 precision over all (reference study, data element) rows for GPT.
gpt.loc[:, 'ROUGE-precision'].mean()

np.float64(0.31039034826899137)

In [10]:
# Mean ROUGE-1 recall over all (reference study, data element) rows for GPT.
gpt.loc[:, 'ROUGE-recall'].mean()

np.float64(0.3729457315419524)

In [11]:
# Mean ROUGE-1 F-measure over all (reference study, data element) rows for GPT.
gpt.loc[:, 'ROUGE-Fmeasure'].mean()

np.float64(0.3237623791320533)

## Cosine similarity

In [12]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model identified by "all-MiniLM-L6-v2";
# presumably intended for the cosine-similarity comparison announced by this
# section - no visible cell uses it yet.
# NOTE(review): `model` is also the loop variable of the ROUGE cell
# (`for model in models`), so re-running that cell after this one replaces
# this object with a plain string. A distinct name would avoid the clash.
model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm



## BERTScore

In [13]:
# from evaluate import load
# bertscore = load("bertscore")

In [14]:
# NOTE(review): this BERTScore cell is disabled (every line is commented out).
# Before re-enabling it: `predictions` and `references` are created once per
# (model, reference study) pair, but `bertscore.compute` is called inside the
# `data_element` loop, so each data element would be scored together with all
# the previous ones. The two lists need to be reset for every data element.
# results = []
# for model in models:
#     for reference_study in reference_studies:
#         ground_truth = pd.read_excel(f'../ground_truth/test_set/{reference_study.lower()}.xlsx')
#         ground_truth = ground_truth.rename(columns={'Study': 'study', 'Authors': 'study'})

#         llm_extraction = pd.read_csv(f'../llm-based_extraction/{model}/specific_extraction_{reference_study}.csv')
#         llm_extraction = llm_extraction.rename(columns={'Method': 'Methodology'})
#         llm_extraction = llm_extraction.rename(columns={"Relation between Motivation and Librarians' PD": "Relation between motivation and librarian's PD"})
#         llm_extraction = llm_extraction[[x for x in llm_extraction.columns if x in ground_truth.columns]]

#         predictions = []
#         references = []
#         for data_element in ground_truth.columns[1:]:
#             for study in ground_truth['study'].unique():
#                 references.append(str(ground_truth[ground_truth['study'] == study][data_element].tolist()[0]).lower())
#                 predictions.append(str(llm_extraction[llm_extraction['study'] == study][data_element].tolist()[0]).lower())
                
#             scores = bertscore.compute(predictions=predictions, references=references, model_type="roberta-large", lang="en-sci")

#             # Average the scores over the studies
#             precision = statistics.mean(scores['precision'])
#             recall = statistics.mean(scores['recall'])
#             fmeasure = statistics.mean(scores['f1'])

#             # Append to the results table
#             results.append({
#                 'model' : model,
#                 'reference_study' : reference_study,
#                 'data_element' : data_element,
#                 'BERTScore-precision' : precision,
#                 'BERTScore-recall' : recall,
#                 'BERTScore-F1' : fmeasure
#             })

# results = pd.DataFrame(results)

In [15]:
# mistral = results[results['model'] == 'mistral-small-2503']
# mistral['BERTScore-precision'].mean()