In [37]:
import pandas as pd
import pickle
from tabulate import tabulate

In [34]:
with open("../data/all_scores.pickle", "rb") as file:
    scores = pickle.load(file)
scores

data = []

# Iterate through each model's results and flatten the data
for model, metrics in scores.items():
    for metric_type, values in metrics.items():
        if metric_type == 'rouge':
            # Flatten ROUGE metrics
            data.append((model, metric_type, 'rouge1', values['rouge1']))
            data.append((model, metric_type, 'rouge2', values['rouge2']))
            data.append((model, metric_type, 'rougeL', values['rougeL']))
            data.append((model, metric_type, 'rougeLsum', values['rougeLsum']))
        elif metric_type == 'bleu':
            # Flatten BLEU metrics and details
            data.append((model, metric_type, 'bleu', values['bleu']))
            data.append((model, metric_type, 'brevity_penalty', values['brevity_penalty']))
            data.append((model, metric_type, 'length_ratio', values['length_ratio']))
            data.append((model, metric_type, 'translation_length', values['translation_length']))
            data.append((model, metric_type, 'reference_length', values['reference_length']))
            
df = pd.DataFrame(data, columns=['Model', 'Metric Type', 'Metric', 'Value'])
df['Value'] = df['Value'].round(3)
df_pivot = df.pivot_table(index=['Model', 'Metric'], columns='Metric Type', values='Value', aggfunc='first')

In [36]:
df_pivot.fillna("-", inplace=True)
print(df_pivot.to_markdown())

|                                              | bleu   | rouge   |
|:---------------------------------------------|:-------|:--------|
| ('t5-base', 'bleu')                          | 0.02   | -       |
| ('t5-base', 'brevity_penalty')               | 1.0    | -       |
| ('t5-base', 'length_ratio')                  | 3.721  | -       |
| ('t5-base', 'reference_length')              | 985.0  | -       |
| ('t5-base', 'rouge1')                        | -      | 0.177   |
| ('t5-base', 'rouge2')                        | -      | 0.058   |
| ('t5-base', 'rougeL')                        | -      | 0.144   |
| ('t5-base', 'rougeLsum')                     | -      | 0.144   |
| ('t5-base', 'translation_length')            | 3665.0 | -       |
| ('t5-small', 'bleu')                         | 0.024  | -       |
| ('t5-small', 'brevity_penalty')              | 1.0    | -       |
| ('t5-small', 'length_ratio')                 | 2.97   | -       |
| ('t5-small', 'reference_length')             |

In [39]:
rouge_table = tabulate(
    df[df['Metric Type'] == 'rouge'].pivot_table(index=['Model'], columns='Metric Type', values='Value', aggfunc='first'),
    headers='keys',
    tablefmt='pipe',
    showindex=True
)
print(rouge_table)

| Model              |   rouge |
|:-------------------|--------:|
| t5-base            |   0.177 |
| t5-small           |   0.174 |
| t5-small-finetuned |   0.149 |


In [40]:
bleu_table = tabulate(
    df[df['Metric Type'] == 'bleu'].pivot_table(index=['Model'], columns='Metric Type', values='Value', aggfunc='first'), 
    headers='keys', 
    tablefmt='pipe', 
    showindex=True
)
print(bleu_table)

| Model              |   bleu |
|:-------------------|-------:|
| t5-base            |  0.02  |
| t5-small           |  0.024 |
| t5-small-finetuned |  0.022 |
