# Install Packages

In [None]:
!pip install rouge_score

# Import Packages

In [1]:
import pickle
import pandas as pd
import numpy as np
from glimpse.evaluate.evaluate_common_metrics_samples import evaluate_rouge
import os


# Extract RSA Scores from PK Files

In [2]:
def _mean_score(scores):
  """Collapse one score container into a single number.

  A Series is averaged directly. A DataFrame is averaged column by column and
  the column means are then averaged, so every column carries equal weight.
  """
  if isinstance(scores, pd.DataFrame):
    # numeric_only mirrors DataFrame.describe(), which summarises only the
    # numeric columns by default.
    return scores.mean(numeric_only=True).mean()
  return scores.mean()


def extract_mean_scores(path):
  """Load an RSA result pickle and return one row of mean scores per result.

  Parameters
  ----------
  path : str or path-like
    Pickle file whose top-level object has a 'results' entry. 'results' is
    indexed by position (0 .. len - 1) and every entry must provide the six
    keys listed in ``score_keys`` below.

  Returns
  -------
  pandas.DataFrame
    One row per result and one float column per score key, in the order of
    ``score_keys``. An empty 'results' yields an empty frame with the same
    columns.

  Raises
  ------
  KeyError
    If 'results' or one of the score keys is missing.
  """
  score_keys = [
      'consensuality_scores',
      'initial_consensuality_scores',
      'listener_df',
      'speaker_df',
      'language_model_proba_df',
      'initial_listener',
  ]

  # SECURITY: unpickling can execute arbitrary code; only load trusted files.
  obj = pd.read_pickle(path)
  results = obj['results']

  # Only the mean is needed, so compute it directly instead of a full
  # describe() (which also computes std and quantiles for every column).
  rows = [
      [_mean_score(results[i][key]) for key in score_keys]
      for i in range(len(results))
  ]

  # dtype=float keeps the columns numeric even when there are no rows.
  return pd.DataFrame(rows, columns=score_keys, dtype=float)


## Abstractive

In [3]:
# RSA-reranked result pickles for the abstractive candidates, keyed by review year.
# Per the file names, these candidates come from facebook/bart-large-cnn.
# NOTE(review): the 2017 file name carries an extra ".csv" before "-_-r3", and the
# 2018 file is "padded" while the other years are "trimmed" — confirm both are intended.
pickle_original_paths = { 
    "2017" : "output/abstractive/facebook_bart-large-cnn-_-all_reviews_2017-_-top_p_sampling-_-trimmed-_-2024-12-11-15-31-37.csv-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2018" : "output/abstractive/facebook_bart-large-cnn-_-all_reviews_2018-_-top_p_sampling-_-padded-_-2024-12-09-13-08-21-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2019" : "output/abstractive/facebook_bart-large-cnn-_-all_reviews_2019-_-top_p_sampling-_-trimmed-_-2024-12-11-15-31-37-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2020" : "output/abstractive/facebook_bart-large-cnn-_-all_reviews_2020-_-top_p_sampling-_-trimmed-_-2024-12-11-15-31-37-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk"
                          }

# Same layout for the candidates generated with the fine-tuned GPT-2 summarizer
# (gavin124/gpt2-finetuned-cnn-summarization-v2, per the file names).
pickle_gpt2_paths = {
    "2017" : "output/abstractive/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2017-_-top_p_sampling-_-trimmed-_-2025-02-08-22-07-53-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2018" : "output/abstractive/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2018-_-top_p_sampling-_-trimmed-_-2025-02-08-23-18-26-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2019" : "output/abstractive/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2019-_-top_p_sampling-_-trimmed-_-2025-02-09-01-34-19-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2020" : "output/abstractive/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2020-_-top_p_sampling-_-trimmed-_-2025-02-09-10-18-19-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk"
}


### Original

In [None]:
# Column-wise mean of the per-result RSA scores for every file listed in
# pickle_original_paths, stored under that file's key.
mean_scores_original = dict()

for name, path in pickle_original_paths.items():
  print(name)
  mean_scores_original[name] = extract_mean_scores(path).mean(axis=0)

2017
2018
2019
2020


In [26]:
# concat puts one column per key; transpose so each key becomes a row and each score a column.
concatenated_original = pd.concat(mean_scores_original, axis=1).T
concatenated_original

Unnamed: 0,consensuality_scores,initial_consensuality_scores,listener_df,speaker_df,language_model_proba_df,initial_listener
2017,0.652511,0.182344,-9.367365,-10.816949,-16.605637,-1.837223
2018,1.051409,0.436363,-17.345043,-19.487237,-18.474056,-2.303026
2019,1.053721,0.425736,-16.270704,-18.416813,-18.250831,-2.196775
2020,0.642981,0.176821,-8.783283,-10.236029,-16.678669,-1.761115


In [27]:
# Average the per-key rows into a single summary row.
res_original = concatenated_original.mean().to_frame().T
res_original.index = ['original']  # row label; previously misspelled 'orignal'
res_original

Unnamed: 0,consensuality_scores,initial_consensuality_scores,listener_df,speaker_df,language_model_proba_df,initial_listener
orignal,0.850155,0.305316,-12.941599,-14.739257,-17.502298,-2.024535


### GPT2

In [4]:
# Column-wise mean of the per-result RSA scores for every file listed in
# pickle_gpt2_paths, stored under that file's key.
mean_scores_gpt2 = dict()

for name, path in pickle_gpt2_paths.items():
  print(name)
  mean_scores_gpt2[name] = extract_mean_scores(path).mean(axis=0)

2017
2018
2019
2020


In [5]:
# concat puts one column per key; transpose so each key becomes a row and each score a column.
concatenated_gpt2 = pd.concat(mean_scores_gpt2, axis=1).T
concatenated_gpt2

Unnamed: 0,consensuality_scores,initial_consensuality_scores,listener_df,speaker_df,language_model_proba_df,initial_listener
2017,1.092917,0.358979,-18.554302,-20.642376,-15.072538,-2.497087
2018,1.060741,0.313634,-16.531683,-18.618351,-13.958208,-2.318736
2019,1.06081,0.288102,-14.500072,-16.586401,-13.571649,-2.121807
2020,1.054152,0.272017,-13.837928,-15.923457,-13.039964,-2.060369


In [6]:
# Average the per-key rows into a single summary row labelled 'GPT2'.
res_gpt2 = concatenated_gpt2.mean().to_frame().T
res_gpt2.index = ['GPT2']
res_gpt2

Unnamed: 0,consensuality_scores,initial_consensuality_scores,listener_df,speaker_df,language_model_proba_df,initial_listener
GPT2,1.067155,0.308183,-15.855996,-17.942646,-13.91059,-2.2495


## Extractive

In [7]:
# RSA-reranked result pickles for the extractive candidates.
# NOTE(review): this rebinds pickle_original_paths, which the abstractive section
# also defines; re-running the abstractive cells after this one would read these
# extractive files instead. Consider a distinct name.

# Baseline extractive runs (selection method "none" in the file names), keyed by year.
pickle_original_paths = { 
    "2017" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-none-_-2024-12-12-18-26-05-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2018" : "output/extractive/extractive_sentences-_-all_reviews_2018-_-none-_-2025-02-10-08-15-21-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2019" : "output/extractive/extractive_sentences-_-all_reviews_2019-_-none-_-2025-02-08-18-51-24-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2020" : "output/extractive/extractive_sentences-_-all_reviews_2020-_-none-_-2025-02-09-00-02-38-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk"
                          }

# Clustering-based selection on the 2017 reviews; keys are "<year>-<setting>".
# NOTE(review): the "2017-5" and "2017-7" entries point at all_reviews_2019 and
# all_reviews_2020 "cluster" files rather than 2017 "clustering-5"/"clustering-7"
# files — this looks like a copy-paste slip; confirm the intended paths.
pickle_cluster_2017_paths = {
    "2017-1" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-clustering-1-_-2025-02-03-14-03-32-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-3" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-clustering-3-_-2025-01-27-16-57-48-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-5" : "output/extractive/extractive_sentences-_-all_reviews_2019-_-cluster-_-2025-02-08-18-51-24-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-7" : "output/extractive/extractive_sentences-_-all_reviews_2020-_-cluster-_-2025-02-09-00-02-38-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-10" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-clustering-10-_-2024-12-20-12-02-07-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk"
}

# "top_sentences-<k>" selection on the 2017 reviews for k in 1, 3, 5, 7, 10.
pickle_tfidf_2017_paths = {
    "2017-1" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-top_sentences-1-_-2025-02-03-15-18-16-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-3" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-top_sentences-3-_-2025-01-27-09-35-07-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-5" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-top_sentences-5-_-2025-01-27-08-38-44-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-7" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-top_sentences-7-_-2025-02-03-15-31-56-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-10" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-top_sentences-10-_-2025-01-26-17-10-07-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
}

# "top_sentences-<k>" selection on the 2018 reviews; only k = 3, 5, 7 are listed.
pickle_tfidf_2018_paths = {
    "2018-3" : "output/extractive/extractive_sentences-_-all_reviews_2018-_-top_sentences-3-_-2025-02-08-11-31-20-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2018-5" : "output/extractive/extractive_sentences-_-all_reviews_2018-_-top_sentences-5-_-2025-02-10-16-15-53-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2018-7" : "output/extractive/extractive_sentences-_-all_reviews_2018-_-top_sentences-7-_-2025-02-10-10-48-00-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    }

# "top_sentences-<k>" selection on the 2019 reviews; only k = 3, 5, 7 are listed.
pickle_tfidf_2019_paths = {
    "2019-3" : "output/extractive/extractive_sentences-_-all_reviews_2019-_-top_sentences-3-_-2025-02-08-22-43-24-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2019-5" : "output/extractive/extractive_sentences-_-all_reviews_2019-_-top_sentences-5-_-2025-02-10-15-37-51-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2019-7" : "output/extractive/extractive_sentences-_-all_reviews_2019-_-top_sentences-7-_-2025-02-08-23-07-10-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    }

# "top_sentences-<k>" selection on the 2020 reviews; only k = 3, 5, 7 are listed.
pickle_tfidf_2020_paths = {
    "2020-3" : "output/extractive/extractive_sentences-_-all_reviews_2020-_-top_sentences-3-_-2025-02-10-11-57-48-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2020-5" : "output/extractive/extractive_sentences-_-all_reviews_2020-_-top_sentences-5-_-2025-02-10-14-34-58-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2020-7" : "output/extractive/extractive_sentences-_-all_reviews_2020-_-top_sentences-7-_-2025-02-10-12-57-30-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    }

# "dynamic_tfidf_<value>" selection on the 2017 reviews; keys are "<year>-<value>".
pickle_dynamic_tfidf_2017_paths = {
    "2017-0.1" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-dynamic_tfidf_0.1-_-2025-02-06-09-19-46-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-0.3" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-dynamic_tfidf_0.3-_-2025-02-05-10-55-40-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-0.5" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-dynamic_tfidf_0.5-_-2025-02-05-10-02-50-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
    "2017-0.7" : "output/extractive/extractive_sentences-_-all_reviews_2017-_-dynamic_tfidf_0.7-_-2025-02-06-14-37-02-_-r3-_-rsa_reranked-google-pegasus-arxiv.pk",
}

# Group label -> {run key -> pickle path}; each group is summarised separately below.
dict_dict_of_paths = {
    "original" : pickle_original_paths,
    "cluster_2017" : pickle_cluster_2017_paths,
    "tfidf_2017" : pickle_tfidf_2017_paths,
    "tfidf_2018" : pickle_tfidf_2018_paths,
    "tfidf_2019" : pickle_tfidf_2019_paths,
    "tfidf_2020" : pickle_tfidf_2020_paths,
    "dynamic_tfidf_2017" : pickle_dynamic_tfidf_2017_paths
}

In [None]:
# For each group of result files, average the per-file mean scores into one
# summary row labelled with the group's name.
for group_name, dict_of_paths in dict_dict_of_paths.items():
    print(group_name)
    mean_scores_all = {}
    # Distinct loop names: the inner loop used to overwrite the outer `name`.
    for file_key, path in dict_of_paths.items():
        print(file_key)
        mean_scores_all[file_key] = extract_mean_scores(path).mean()

    if not mean_scores_all:
        # pd.concat raises on an empty mapping; skip instead of reusing stale data.
        print(f"No result files listed for {group_name}; skipping")
        continue

    # Concatenate once per group, after all files have been read.
    concatenated_all = pd.concat(mean_scores_all, axis=1).transpose()
    res_all = pd.DataFrame(concatenated_all.mean()).transpose()
    res_all.index = [group_name]  # was the literal string 'name' for every group
    display(res_all)

# Compute Rouge Scores

## Abstractive

### Initialize Dictionaries

In [2]:
# Dataset name -> DataFrame of ROUGE scores; filled by the evaluation loop below.
scores_dict = dict()

In [None]:
# Abstractive candidate CSVs to score with ROUGE, keyed by "<year>_<variant>_abstractive".
# Per the file names, "original" entries are facebook/bart-large-cnn candidates and
# "modified" entries are gavin124/gpt2-finetuned-cnn-summarization-v2 candidates.
# NOTE(review): the 2017 original file ends in ".csv.csv" — confirm this matches the file on disk.
datasets = {
    "2017_original_abstractive" : "data/candidates/facebook_bart-large-cnn-_-all_reviews_2017-_-top_p_sampling-_-trimmed-_-2024-12-11-15-31-37.csv.csv",
    "2018_original_abstractive" : "data/candidates/facebook_bart-large-cnn-_-all_reviews_2018-_-top_p_sampling-_-padded-_-2024-12-09-13-08-21.csv",
    "2019_original_abstractive" : "data/candidates/facebook_bart-large-cnn-_-all_reviews_2019-_-top_p_sampling-_-trimmed-_-2024-12-11-15-31-37.csv",
    "2020_original_abstractive" : "data/candidates/facebook_bart-large-cnn-_-all_reviews_2020-_-top_p_sampling-_-trimmed-_-2024-12-11-15-31-37.csv",
    "2017_modified_abstractive" : "data/candidates/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2017-_-top_p_sampling-_-trimmed-_-2025-02-08-22-07-53.csv",
    "2018_modified_abstractive" : "data/candidates/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2018-_-top_p_sampling-_-trimmed-_-2025-02-08-23-18-26.csv",
    "2019_modified_abstractive" : "data/candidates/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2019-_-top_p_sampling-_-trimmed-_-2025-02-09-01-34-19.csv",
    "2020_modified_abstractive" : "data/candidates/gavin124_gpt2-finetuned-cnn-summarization-v2-_-all_reviews_2020-_-top_p_sampling-_-trimmed-_-2025-02-09-10-18-19.csv",
    }

### Evaluate Rouge and Mean Rouge

In [None]:
# Score every dataset with ROUGE, keep the scores in scores_dict and save them as CSV.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("output/abstractive/rouge", exist_ok=True)

for name, path in datasets.items():
  print(name)
  # Load dataset (CSV file should have 'gold' and 'summary' columns)
  df = pd.read_csv(path)

  # Compute ROUGE scores
  rouge_scores = evaluate_rouge(df)

  # Collect the three ROUGE variants as columns of one frame
  final_scores = pd.DataFrame({
      "Rouge-1": rouge_scores["rouge1"],
      "Rouge-2": rouge_scores["rouge2"],
      "Rouge-L": rouge_scores["rougeL"]})
  # final_scores is rebuilt on every iteration, so no defensive copy is needed.
  scores_dict[name] = final_scores
  final_scores.to_csv(f"output/abstractive/rouge/{name}.csv")

In [None]:
# Save the column-wise mean of each dataset's ROUGE scores.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("output/abstractive/rouge/mean", exist_ok=True)

for name, score_df in scores_dict.items():
  print(name)
  score_df_mean = score_df.mean()
  score_df_mean.to_csv(f"output/abstractive/rouge/mean/{name}.csv")

## Extractive

### Initialize Dictionaries

In [None]:
# Start from an empty mapping so earlier scores are not mixed into this section.
scores_dict = dict()

In [None]:
# extractive
# extractive
# Collect every regular file in the extractive candidates directory (sorted so
# the processing order is reproducible) and register it in `datasets`.
extractive_dir = 'data/candidates/extractive'
extractive_files = [os.path.join(extractive_dir, file) for file in sorted(os.listdir(extractive_dir)) if os.path.isfile(os.path.join(extractive_dir, file))]

# The original loop computed `name` and `path` but never stored them, so the
# evaluation loop kept iterating over whatever `datasets` held before.
datasets = {}
for path in extractive_files:
    # Assumes names look like "<prefix>-_-all_reviews_<year>-_-<method>-_-<timestamp>..." — TODO confirm.
    parts = os.path.basename(path).split('-_-')
    if len(parts) < 3:
        print(f"Skipping file with unexpected name: {path}")
        continue

    label = parts[1] + "-" + parts[2]
    # Drop at most the first two underscore-separated tokens ("all", "reviews");
    # an unbounded split would cut a method such as "top_sentences-3" down to "top".
    name = label.split('_', 2)[-1]

    if name in datasets:
        print(f"Warning: duplicate dataset name {name!r}; keeping {datasets[name]} and skipping {path}")
        continue
    datasets[name] = path

### Evaluate Rouge and Mean Rouge

In [None]:
# Score every dataset with ROUGE, keep the scores in scores_dict and save them as CSV.
# NOTE(review): `datasets` must hold the extractive candidate files at this point;
# verify that the preceding cell populates it before running.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("output/extractive/rouge", exist_ok=True)

for name, path in datasets.items():
  print(name)
  # Load dataset (CSV file should have 'gold' and 'summary' columns)
  df = pd.read_csv(path)

  # Compute ROUGE scores
  rouge_scores = evaluate_rouge(df)

  # Collect the three ROUGE variants as columns of one frame
  final_scores = pd.DataFrame({
      "Rouge-1": rouge_scores["rouge1"],
      "Rouge-2": rouge_scores["rouge2"],
      "Rouge-L": rouge_scores["rougeL"]})
  # final_scores is rebuilt on every iteration, so no defensive copy is needed.
  scores_dict[name] = final_scores
  final_scores.to_csv(f"output/extractive/rouge/{name}.csv")

In [None]:
# Save the column-wise mean of each dataset's ROUGE scores.
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("output/extractive/rouge/mean", exist_ok=True)

for name, score_df in scores_dict.items():
  print(name)
  score_df_mean = score_df.mean()
  score_df_mean.to_csv(f"output/extractive/rouge/mean/{name}.csv")