In [8]:
import pandas as pd
from rouge_score import rouge_scorer

In [9]:
# Load the prompts and summaries training tables from the data directory.
prompts_df = pd.read_csv('../data/prompts_train.csv')
summaries_df = pd.read_csv('../data/summaries_train.csv')

In [10]:
# Attach each summary's prompt columns via the shared `prompt_id` key.
# `validate="many_to_one"` makes pandas raise a MergeError if `prompt_id` is
# duplicated in `prompts_df`; without the check such duplicates would silently
# multiply summary rows and distort every statistic computed afterwards.
df_merged = summaries_df.merge(prompts_df, on="prompt_id", validate="many_to_one")

In [11]:
# One shared scorer for the rouge1, rouge2 and rougeL types, created with
# stemming enabled.
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [12]:
def calculate_rouge_scores(row):
    """Score one row's summary against its prompt text with the shared ``scorer``.

    Args:
        row: Mapping-like row providing ``'text'`` and ``'prompt_text'``.
            ``row['prompt_text']`` is passed to ``scorer.score`` as the first
            argument and ``row['text']`` as the second.

    Returns:
        dict: Nine entries keyed ``'<rouge>_<stat>'`` for ``rouge`` in
        ``rouge1``, ``rouge2``, ``rougeL`` and ``stat`` in ``recall``,
        ``precision``, ``f1`` (``f1`` holds the score's ``fmeasure``), in
        that order.
    """
    summary_text, source_text = row['text'], row['prompt_text']
    result = scorer.score(source_text, summary_text)

    # (key suffix, attribute on the score object), listed in output order.
    stats = (('recall', 'recall'), ('precision', 'precision'), ('f1', 'fmeasure'))
    return {
        f'{rouge_type}_{suffix}': getattr(result[rouge_type], attribute)
        for rouge_type in ('rouge1', 'rouge2', 'rougeL')
        for suffix, attribute in stats
    }

In [13]:
# Score every merged row; result_type="expand" spreads each returned dict
# into one column per key.
rouge_scores = df_merged.apply(
    calculate_rouge_scores,
    axis=1,
    result_type="expand",
)

In [14]:
# Append the ROUGE columns to the merged table. Any copies left over from a
# previous run of this cell are dropped first, so re-running the cell replaces
# the columns instead of adding duplicates with the same names.
df_merged = pd.concat(
    [df_merged.drop(columns=rouge_scores.columns, errors="ignore"), rouge_scores],
    axis=1,
)

In [17]:
# Write the merged table with its ROUGE columns to the results directory,
# leaving the row index out of the file.
df_merged.to_csv(
    '../results/rouge_scores.csv',
    index=False,
)

# Evaluate the results

In [25]:
# Reload the scored table from the saved CSV so the evaluation can start
# from the file on disk.
df_merged = pd.read_csv(
    '../results/rouge_scores.csv',
)

In [26]:
# Columns entering the correlation analysis: `content` and `wording`,
# followed by the nine ROUGE statistics (recall, precision, f1 per ROUGE type).
comparable_cols = [
    'content', 'wording',
    'rouge1_recall', 'rouge1_precision', 'rouge1_f1',
    'rouge2_recall', 'rouge2_precision', 'rouge2_f1',
    'rougeL_recall', 'rougeL_precision', 'rougeL_f1',
]

In [27]:
# Pairwise correlation between the selected columns (pandas' default method
# is Pearson).
corr_matrix = df_merged.loc[:, comparable_cols].corr()

In [28]:
# Keep only the `content` and `wording` rows against the nine ROUGE columns,
# then copy the row labels (`content`, `wording`) into a regular `metric`
# column.
score_corr = (
    corr_matrix
    .loc[
        ['content', 'wording'],
        [
            'rouge1_recall', 'rouge1_precision', 'rouge1_f1',
            'rouge2_recall', 'rouge2_precision', 'rouge2_f1',
            'rougeL_recall', 'rougeL_precision', 'rougeL_f1',
        ],
    ]
    .assign(metric=lambda frame: frame.index)
)

In [29]:
# Write the correlation table to the results directory. The index is left
# out of the file; the row labels are carried by the `metric` column.
score_corr.to_csv(
    '../results/rouge_scores_corr.csv',
    index=False,
)