# Review Baseline Results


In [1]:
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/drive (Colab only) so the result CSVs can be read.
drive.mount('/content/drive')
# Do not truncate columns when displaying the wide result frames.
pd.set_option('display.max_columns', None)


Mounted at /content/drive


Import data for Test and Validation

In [2]:
## import the baseline results for the test and validation splits

# baseline_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/results/'
# NOTE(review): this is a relative path, so it only resolves when the working
# directory is the parent of `drive/` (presumably /content on Colab) — confirm.
# NOTE(review): the chained assignment also binds the name `path`, which is not
# read anywhere in the visible notebook — confirm before removing it.
baseline_path = path = 'drive/MyDrive/266_Danielle_Dylan_final_project/results'
# specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs/'
# NOTE(review): `specialized_path` is not read in this cell, and later cells
# assign it again before using it.
specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs/'
df_val = pd.read_csv(f'{baseline_path}/T5baseline_val.csv')
df_test = pd.read_csv(f'{baseline_path}/T5baseline_test.csv')


Combine the metric columns for the two reference answers (average and max)

In [3]:
def avg_a1_a2(df):
  '''
  Collapse the two per-reference scores of every metric into single columns.

  For each metric (BLEURT, BLEU, ROUGE-1, ROUGE-2, ROUGE-L) the `_a1` and `_a2`
  columns are combined row-wise with two strategies:
    * `<metric>_avg`: mean of the two reference scores
    * `<metric>_max`: larger of the two reference scores

  The new columns are written onto `df` itself (the frame is modified in
  place) and the same frame is returned so the call can be used in an
  assignment.
  '''
  # (prefix of the output column, stem of the two per-reference input columns)
  metric_stems = (
      ('bleurt', 'bleurt_score'),
      ('bleu', 'bleu'),
      ('rouge1', 'rouge1'),
      ('rouge2', 'rouge2'),
      ('rougeL', 'rougeL'),
  )
  # (suffix of the output column, name of the row-wise reduction to apply);
  # all averages are added first, then all maxima, to keep the column order.
  strategies = (('avg', 'mean'), ('max', 'max'))

  for suffix, reducer in strategies:
    for out_prefix, stem in metric_stems:
      per_reference = df[[f'{stem}_a1', f'{stem}_a2']]
      df[f'{out_prefix}_{suffix}'] = getattr(per_reference, reducer)(axis=1)

  return df

In [4]:
# Add the per-metric *_avg / *_max columns to the baseline validation and test frames.
df_val = avg_a1_a2(df_val)
df_test = avg_a1_a2(df_test)

In [5]:
def results_overview(df):
  '''
  Print a text summary of the combined metric columns of a results frame.

  Expects `df` to carry the `<metric>_avg` and `<metric>_max` columns for
  rouge1, rouge2, rougeL, bleu and bleurt, plus an `ex_or_im` column.
  Prints, in order: the row count, then for the `_avg` columns and again for
  the `_max` columns the overall mean of each metric (4 decimals) followed by
  the per-`ex_or_im` mean table. Returns nothing.
  '''
  metric_names = ['rouge1', 'rouge2', 'rougeL', 'bleu', 'bleurt']
  # (column suffix, heading for the overall means, heading for the per-type table)
  report_sections = (
      ('avg', "\nAverage of Average Scores:", "\nAverage Scores by Question type:"),
      ('max', "\nAverage of Max Scores:", "\nAverage of Max Scores by Question type:"),
  )

  print(f"{len(df)} total results")

  for suffix, overall_heading, by_type_heading in report_sections:
    columns = [f'{name}_{suffix}' for name in metric_names]

    # Overall mean of every metric column in this section
    overall_means = df[columns].mean()
    print(overall_heading)
    for column, value in overall_means.items():
      print(f"{column}: {value:.4f}")

    # Same means, broken down by question type
    means_by_type = df.groupby(['ex_or_im'])[columns].mean()
    print(by_type_heading)
    print(means_by_type)


  # # Print some example summaries
  # print("\nExample Summaries:")
  # i = randint(0,len(df))
  # for i in range(i, i+1):
  #   print(f"\nExample {i}:")
  #   print(f"Reference1: {df.iloc[i]['reference_answer1']}")
  #   print(f"Reference2: {df.iloc[i]['reference_answer2']}")
  #   print(f"Generated: {df.iloc[i]['generated_answer']}")


BLEU's output is always a number between 0 and 1. This value indicates how similar the candidate text is to the reference texts, with values closer to 1 representing more similar texts.

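BLEURT's output is a real-valued score that is not confined to the interval from 0 to 1: the scores in this notebook range from about -1.5 to slightly above 1 (see the outputs below). This value indicates how similar the generated text is to the reference texts, with higher values representing more similar texts.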

# Baseline Outputs

In [6]:
# Summarise the baseline scores loaded from T5baseline_test.csv.
results_overview(df_test)

1007 total results

Average of Average Scores:
rouge1_avg: 0.3317
rouge2_avg: 0.2032
rougeL_avg: 0.3267
bleu_avg: 0.1008
bleurt_avg: -0.7133

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.397718    0.258989    0.393336  0.131266   -0.596215
implicit    0.134935    0.037120    0.128018  0.010113   -1.062269

Average of Max Scores:
rouge1_max: 0.3806
rouge2_max: 0.2375
rougeL_max: 0.3744
bleu_max: 0.1236
bleurt_max: -0.5825

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.444076    0.296143    0.438596  0.158457   -0.471695
implicit    0.191502    0.062713    0.183135  0.019831   -0.912588


# Fine Tuned Together

In [9]:
## import the scored outputs for the "Fine Tuned Together" model (single results file;
## the variable name suggests it is the test split — confirm)

train_together_file = 't5_trained_overall_model_both.csv'
# NOTE(review): relative path; resolves only from the parent directory of `drive/`.
specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
# specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
df_train_together_test = pd.read_csv(f'{specialized_path}/{train_together_file}')

# Add the per-metric *_avg / *_max columns
df_train_together_test = avg_a1_a2(df_train_together_test)

# present results overview
results_overview(df_train_together_test)

1007 total results

Average of Average Scores:
rouge1_avg: 0.2754
rouge2_avg: 0.1569
rougeL_avg: 0.2711
bleu_avg: 0.2245
bleurt_avg: -0.8902

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.332096    0.201379    0.327607  0.240696   -0.786404
implicit    0.106251    0.024271    0.102561  0.176222   -1.199579

Average of Max Scores:
rouge1_max: 0.3134
rouge2_max: 0.1791
rougeL_max: 0.3083
bleu_max: 0.2551
bleurt_max: -0.7876

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.368750    0.225212     0.36384  0.271754   -0.686074
implicit    0.148471    0.041540     0.14260  0.205352   -1.090116


In [10]:
# Preview the first rows, including the added *_avg / *_max columns.
df_train_together_test.head()

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
0,0,why did the happy hunter not say anything ?,explicit,nothing would appease the anger of the skillfu...,being the younger he owed his elder brother ob...,he owed his elder brother obedience .,None of the above choices,0.142857,0.0,0.142857,0.142857,0.0,0.0,0.0,0.0,0.100436,"[0.3333333333333333, 0.2, 0.25, 0.333333333333...",0.367879,0.5,5,10,0.153889,"[0.16666666666666666, 0.2, 0.25, 0.33333333333...",0.67032,0.714286,5,7,-1.365232,-1.148966,-1.257099,0.127162,0.071429,0.0,0.071429,-1.148966,0.153889,0.142857,0.0,0.142857
1,1,who did rin jin call to find a wife ?,explicit,the palace of rin jin was at the bottom of the...,all his fish retainers .,all his fish retainers .,all his fish retainers,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.778801,"[1.0, 1.0, 1.0, 1.0]",0.778801,0.8,4,5,0.778801,"[1.0, 1.0, 1.0, 1.0]",0.778801,0.8,4,5,0.743693,0.743693,0.743693,0.778801,1.0,1.0,1.0,0.743693,0.778801,1.0,1.0,1.0
2,2,what happened because all the fishes-in-waitin...,explicit,there was much gladness between the sea king a...,wonderful array of sea creatures waited upon t...,a wonderful array of sea creatures it was that...,None of the above choices,0.266667,0.0,0.266667,0.266667,0.210526,0.0,0.210526,0.210526,0.091002,"[0.5, 0.2, 0.25, 0.3333333333333333]",0.301194,0.454545,5,11,0.04089,"[0.5, 0.2, 0.25, 0.3333333333333333]",0.135335,0.333333,5,15,-1.544642,-1.537794,-1.541218,0.065946,0.238596,0.0,0.238596,-1.537794,0.091002,0.266667,0.0,0.266667
3,3,what did the three young men ask for ?,explicit,maie sighed . she knew well that her husband w...,a junket .,a can of fresh milk .,something to eat,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.451801,"[0.25, 0.3333333333333333, 0.5, 1.0]",1.0,1.0,3,3,0.166208,"[0.25, 0.3333333333333333, 0.5, 1.0]",0.367879,0.5,3,6,-1.228875,-1.010803,-1.119839,0.309005,0.0,0.0,0.0,-1.010803,0.451801,0.0,0.0,0.0
4,4,how did the happy hunter feel when he saw the ...,explicit,the happy hunter would have liked to enter the...,surprised .,exceedingly surprised .,The hunter was surprised,0.4,0.0,0.4,0.4,0.333333,0.0,0.333333,0.333333,0.359304,"[0.4, 0.25, 0.3333333333333333, 0.5]",1.0,2.0,4,2,0.359304,"[0.4, 0.25, 0.3333333333333333, 0.5]",1.0,1.333333,4,3,-0.272074,-0.31036,-0.291217,0.359304,0.366667,0.0,0.366667,-0.272074,0.359304,0.4,0.0,0.4


In [11]:
# Proportion of rows per distinct generated answer (normalize=True returns shares, not counts).
df_train_together_test['generated_answer'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
generated_answer,Unnamed: 1_level_1
None of the above choices,0.304866
None of the above choices .,0.021847
the king,0.007944
harold,0.006951
matte,0.006951
...,...
andrew did not understand the whole affair,0.000993
She wanted to have a little peep at the company,0.000993
She was irritated,0.000993
He had a son,0.000993


In [13]:
## import the scored outputs for the jointly trained model, question-then-context variant
## (variant name taken from the file name below)

train_together_file = 't5_trained_overall_model_both_questionThenContext.csv'
# NOTE(review): relative path; resolves only from the parent directory of `drive/`.
specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
# specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
# NOTE: rebinds `df_train_together_test`, replacing the frame loaded from
# 't5_trained_overall_model_both.csv' earlier in this section.
df_train_together_test = pd.read_csv(f'{specialized_path}/{train_together_file}')

# Add the per-metric *_avg / *_max columns
df_train_together_test = avg_a1_a2(df_train_together_test)

# present results overview
results_overview(df_train_together_test)

1007 total results

Average of Average Scores:
rouge1_avg: 0.2873
rouge2_avg: 0.1502
rougeL_avg: 0.2842
bleu_avg: 0.2063
bleurt_avg: -0.8030

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.343979    0.188830    0.340985  0.215209   -0.728193
implicit    0.118367    0.035236    0.115084  0.179696   -1.026138

Average of Max Scores:
rouge1_max: 0.3284
rouge2_max: 0.1755
rougeL_max: 0.3247
bleu_max: 0.2403
bleurt_max: -0.6781

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.383876    0.214881    0.380619  0.248408   -0.606076
implicit    0.162921    0.058319    0.157962  0.216205   -0.892609


# Individual Specialized T5 Models

In [15]:
## import the scored outputs for the specialized T5 models (single results file)

train_separate_file = 't5_trained_specialized.csv'
# NOTE(review): relative path; resolves only from the parent directory of `drive/`.
specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
# specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
df_train_separate_test = pd.read_csv(f'{specialized_path}/{train_separate_file}')

# Add the per-metric *_avg / *_max columns
df_train_separate_test = avg_a1_a2(df_train_separate_test)

# present results overview
results_overview(df_train_separate_test)

1007 total results

Average of Average Scores:
rouge1_avg: 0.3021
rouge2_avg: 0.1884
rougeL_avg: 0.2974
bleu_avg: 0.2472
bleurt_avg: -0.8316

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.366860    0.243352    0.361693  0.271573   -0.711435
implicit    0.108993    0.024433    0.105666  0.174756   -1.189657

Average of Max Scores:
rouge1_max: 0.3447
rouge2_max: 0.2164
rougeL_max: 0.3396
bleu_max: 0.2807
bleurt_max: -0.7214

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.408083    0.274442    0.402815  0.306382   -0.603638
implicit    0.155786    0.043268    0.151042  0.204223   -1.072292


In [16]:
## import the scored outputs for the specialized T5 models, `_smaller_lr` results file

# NOTE: rebinds `train_separate_file` and `df_train_separate_test`; after this cell
# both names refer to the `_smaller_lr` file rather than 't5_trained_specialized.csv'.
train_separate_file = 't5_trained_specialized_smaller_lr.csv'
# NOTE(review): relative path; resolves only from the parent directory of `drive/`.
specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
# specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
df_train_separate_test = pd.read_csv(f'{specialized_path}/{train_separate_file}')

# Add the per-metric *_avg / *_max columns
df_train_separate_test = avg_a1_a2(df_train_separate_test)

# present results overview
results_overview(df_train_separate_test)

1007 total results

Average of Average Scores:
rouge1_avg: 0.3401
rouge2_avg: 0.2611
rougeL_avg: 0.3320
bleu_avg: 0.2796
bleurt_avg: -0.7113

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.412570    0.333265    0.405282  0.336769   -0.539196
implicit    0.124113    0.046047    0.113518  0.109151   -1.224051

Average of Max Scores:
rouge1_max: 0.3906
rouge2_max: 0.3097
rougeL_max: 0.3807
bleu_max: 0.3238
bleurt_max: -0.5650

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.463056    0.387680    0.454867  0.388727   -0.396875
implicit    0.174606    0.077209    0.159529  0.130392   -1.066225


In [17]:
# Proportion of rows per distinct generated answer in the most recently loaded specialized frame.
df_train_separate_test['generated_answer'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
generated_answer,Unnamed: 1_level_1
it is easier to catch a sunbeam than a woman 's roving fancy .,0.003972
she had suffered greatly from the advances of the dwarf who had been chosen for her husband .,0.002979
"the king commanded the most skilled maidens in his kingdom to weave the three dresses , one as golden as the sun , one as silver as the moon , and one as shining as the stars .",0.002979
the farmer fell into a rage and hastened after the bonze as fast as ever he could .,0.002979
"he had any excuse to plead for the high treason he had committed by striking the heir to the throne , and , if so , to be quick in setting it forth .",0.002979
...,...
"he had inherited from his mother , whom all men agreed had been mortal , the dangerous qualities of vanity and ambition .",0.000993
"the water of its own accord , and the distance was much shorter than he had expected , for in a few hours he caught sight of the gate and the roof of the sea king 's palace .",0.000993
"one like myself , who has given up the world , must not be miserly .",0.000993
"the happy hunter grew more and more homesick as the days passed , and he could not repress a great anxiety to know what had happened to his home and his country and his brother while he had been away .",0.000993


# CoT Prompting Implicit Only

In [None]:
## combine the chain-of-thought (CoT) answers for implicit questions with the
## specialized T5 answers for explicit questions, then score the combined set

cot_implicitOnly_file = 't5_trained_modelIm_cot_prompting_implicitOnly.csv'
# Pin the specialized-T5 results file here instead of reading whatever an earlier
# cell last assigned to `train_separate_file`. On a top-to-bottom run that would be
# the '_smaller_lr' file, but the explicit-row scores recorded for this cell match
# the 't5_trained_specialized.csv' run.
train_separate_file = 't5_trained_specialized.csv'
# specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
df_cot_implicitOnly_test = pd.read_csv(f'{specialized_path}/{cot_implicitOnly_file}')

df_train_separate_test = pd.read_csv(f'{specialized_path}/{train_separate_file}')

# take explicit from specialized T5, then join with implicit CoT
df_train_separate_explicit_test = df_train_separate_test.loc[df_train_separate_test['ex_or_im'] == 'explicit']

print('chain of thought shape',df_cot_implicitOnly_test.shape)
print('Specialized explicit t5',df_train_separate_explicit_test.shape)

# concatenate for evaluation
df__specialized_explicit__cot_implicit = pd.concat([df_cot_implicitOnly_test,df_train_separate_explicit_test])
print('Concatenated shape', df__specialized_explicit__cot_implicit.shape)


# Add the per-metric *_avg / *_max columns
df__specialized_explicit__cot_implicit = avg_a1_a2(df__specialized_explicit__cot_implicit)

# present results overview
results_overview(df__specialized_explicit__cot_implicit)


chain of thought shape (253, 29)
Specialized explicit t5 (754, 29)
Concatenated shape (1007, 29)
1007 total results

Average of Average Scores:
rouge1_avg: 0.29
rouge2_avg: 0.19
rougeL_avg: 0.28
bleu_avg: 0.21
bleurt_avg: -0.91

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.366860    0.243352    0.361693  0.271573   -0.711435
implicit    0.053247    0.016107    0.048084  0.039136   -1.496489

Average of Max Scores:
rouge1_max: 0.32
rouge2_max: 0.21
rougeL_max: 0.32
bleu_max: 0.24
bleurt_max: -0.80

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.408083    0.274442    0.402815  0.306382   -0.603638
implicit    0.073513    0.026816    0.067264  0.046225   -1.396537


# Qwen Fine-Tuning

In [None]:
# Debug check: display which specialized-T5 results file name is currently bound
# in the kernel, since the cells below read this variable.
train_separate_file

't5_trained_specialized.csv'

In [None]:
## combine the fine-tuned Qwen answers for implicit questions with the
## specialized T5 answers for explicit questions, then score the combined set

qwen_implicitOnly_file = 'qwen_finetuned_imOnly_chat.csv'
# Pin the specialized-T5 results file here instead of reading whatever an earlier
# cell last assigned to `train_separate_file`. On a top-to-bottom run that would be
# the '_smaller_lr' file, but the explicit-row scores recorded for this cell match
# the 't5_trained_specialized.csv' run.
train_separate_file = 't5_trained_specialized.csv'
# specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
df_qwen_implicitOnly_test = pd.read_csv(f'{specialized_path}/{qwen_implicitOnly_file}')

df_train_separate_test = pd.read_csv(f'{specialized_path}/{train_separate_file}')

# take explicit from specialized T5, then join with implicit Qwen
df_train_separate_explicit_test = df_train_separate_test.loc[df_train_separate_test['ex_or_im'] == 'explicit']

# Label fixed: this frame holds the Qwen (non-CoT) implicit answers, not chain-of-thought output
print('Qwen implicit-only shape',df_qwen_implicitOnly_test.shape)
print('Specialized explicit t5',df_train_separate_explicit_test.shape)

# concatenate for evaluation
df__specialized_explicit__qwen_implicit = pd.concat([df_qwen_implicitOnly_test,df_train_separate_explicit_test])
print('Concatenated shape', df__specialized_explicit__qwen_implicit.shape)


# Add the per-metric *_avg / *_max columns
df__specialized_explicit__qwen_implicit = avg_a1_a2(df__specialized_explicit__qwen_implicit)

# present results overview
results_overview(df__specialized_explicit__qwen_implicit)


chain of thought shape (253, 29)
Specialized explicit t5 (754, 29)
Concatenated shape (1007, 29)
1007 total results

Average of Average Scores:
rouge1_avg: 0.3138
rouge2_avg: 0.1930
rougeL_avg: 0.3066
bleu_avg: 0.2317
bleurt_avg: -0.8090

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.366860    0.243352    0.361693  0.271573   -0.711435
implicit    0.155609    0.042912    0.142497  0.113012   -1.099709

Average of Max Scores:
rouge1_max: 0.3574
rouge2_max: 0.2232
rougeL_max: 0.3488
bleu_max: 0.2615
bleurt_max: -0.6891

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.408083    0.274442    0.402815  0.306382   -0.603638
implicit    0.206299    0.070450    0.187909  0.127909   -0.943733


In [None]:
## combine the fine-tuned Qwen chain-of-thought answers for implicit questions with
## the specialized T5 answers for explicit questions, then score the combined set

qwen_implicitOnly_file = 'qwen_finetuned_imOnly_chat_CoT.csv'
# Pin the specialized-T5 results file here instead of reading whatever an earlier
# cell last assigned to `train_separate_file`. On a top-to-bottom run that would be
# the '_smaller_lr' file, but the explicit-row scores recorded for this cell match
# the 't5_trained_specialized.csv' run.
train_separate_file = 't5_trained_specialized.csv'
# specialized_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/answer_outputs'
specialized_path = '/content/drive/MyDrive/266/FinalProject/answer_outputs'
# NOTE: rebinds `df_qwen_implicitOnly_test` and, further down,
# `df__specialized_explicit__qwen_implicit`, which the non-CoT Qwen cell also assigns.
df_qwen_implicitOnly_test = pd.read_csv(f'{specialized_path}/{qwen_implicitOnly_file}')

df_train_separate_test = pd.read_csv(f'{specialized_path}/{train_separate_file}')

# take explicit from specialized T5, then join with implicit Qwen
df_train_separate_explicit_test = df_train_separate_test.loc[df_train_separate_test['ex_or_im'] == 'explicit']

print('chain of thought shape',df_qwen_implicitOnly_test.shape)
print('Specialized explicit t5',df_train_separate_explicit_test.shape)

# concatenate for evaluation
df__specialized_explicit__qwen_implicit = pd.concat([df_qwen_implicitOnly_test,df_train_separate_explicit_test])
print('Concatenated shape', df__specialized_explicit__qwen_implicit.shape)


# Add the per-metric *_avg / *_max columns
df__specialized_explicit__qwen_implicit = avg_a1_a2(df__specialized_explicit__qwen_implicit)

# present results overview
results_overview(df__specialized_explicit__qwen_implicit)


chain of thought shape (253, 29)
Specialized explicit t5 (754, 29)
Concatenated shape (1007, 29)
1007 total results

Average of Average Scores:
rouge1_avg: 0.2967
rouge2_avg: 0.1878
rougeL_avg: 0.2899
bleu_avg: 0.2130
bleurt_avg: -0.8728

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.366860    0.243352    0.361693  0.271573   -0.711435
implicit    0.087558    0.022148    0.075971  0.038401   -1.353782

Average of Max Scores:
rouge1_max: 0.3341
rouge2_max: 0.2141
rougeL_max: 0.3261
bleu_max: 0.2402
bleurt_max: -0.7624

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.408083    0.274442    0.402815  0.306382   -0.603638
implicit    0.113618    0.034068    0.097316  0.042896   -1.235474


In [None]:
# List the columns of the baseline test frame, including the added *_avg / *_max columns.
df_test.columns

Index(['id', 'question', 'ex_or_im', 'story_section', 'reference_answer1',
       'reference_answer2', 'generated_answer', 'rouge1_a1', 'rouge2_a1',
       'rougeL_a1', 'rougeLsum_a1', 'rouge1_a2', 'rouge2_a2', 'rougeL_a2',
       'rougeLsum_a2', 'bleu_a1', 'precisions_a1', 'brevity_penalty_a1',
       'length_ratio_a1', 'translation_length_a1', 'reference_length_a1',
       'bleu_a2', 'precisions_a2', 'brevity_penalty_a2', 'length_ratio_a2',
       'translation_length_a2', 'reference_length_a2', 'bleurt_score_a1',
       'bleurt_score_a2', 'bleurt_avg', 'bleu_avg', 'rouge1_avg', 'rouge2_avg',
       'rougeL_avg', 'bleurt_max', 'bleu_max', 'rouge1_max', 'rouge2_max',
       'rougeL_max'],
      dtype='object')

In [None]:
# Frequency of each `reference_length_a1` value among the implicit baseline test rows
# (presumably the token length of reference answer 1 as reported by the BLEU metric — confirm).
df_test[df_test['ex_or_im'] == 'implicit'].value_counts(['reference_length_a1'])

Unnamed: 0_level_0,count
reference_length_a1,Unnamed: 1_level_1
2,70
7,33
5,28
6,24
8,24
4,20
10,15
11,10
9,10
12,6


In [None]:
# which explicit questions did best? top 10 baseline test rows by bleu_avg, highest first
df_test[df_test['ex_or_im'] == 'explicit'].sort_values('bleu_avg', ascending=False).head(10)

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
601,601,why did the goodman go home ?,explicit,""" what 's that ? "" said he , "" for the bowl of...",he was feeling cold without his coat .,he was feeling cold without his coat .,he was feeling cold without his coat .,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,8,8,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,8,8,1.087961,1.087961,1.087961,1.0,1.0,1.0,1.0,1.087961,1.0,1.0,1.0,1.0
606,606,why did boat row away at top speed ?,explicit,""" god be praised for good company ! that was j...",they thought the meer - trolls were making sig...,they thought the meer - trolls were making sig...,they thought the meer - trolls were making sig...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,12,12,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,12,12,1.046282,1.046282,1.046282,1.0,1.0,1.0,1.0,1.046282,1.0,1.0,1.0,1.0
67,67,why was rin jin not happy ?,explicit,the palace of rin jin was at the bottom of the...,he reigned alone .,he reigned alone .,he reigned alone .,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,4,4,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,4,4,1.067542,1.067542,1.067542,1.0,1.0,1.0,1.0,1.067542,1.0,1.0,1.0,1.0
390,390,why did the young couple live happily together...,explicit,when the flower queen heard that her daughter ...,the flower queen 's daughter departed and went...,the flower queen 's daughter departed and went...,the flower queen 's daughter departed and went...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,13,13,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,13,13,0.993606,0.993606,0.993606,1.0,1.0,1.0,1.0,0.993606,1.0,1.0,1.0,1.0
135,135,why was death an everyday matter to the swords...,explicit,at the time when the tang dynasty reigned over...,they hired themselves out to those who wished ...,they hired themselves out to those who wished ...,they hired themselves out to those who wished ...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.931063,"[1.0, 1.0, 1.0, 1.0]",0.931063,0.933333,14,15,0.931063,"[1.0, 1.0, 1.0, 1.0]",0.931063,0.933333,14,15,0.955911,0.955911,0.955911,0.931063,1.0,1.0,1.0,0.955911,0.931063,1.0,1.0,1.0
324,324,why were the messengers sent far and wide ?,explicit,there was once upon a time a king who had a wi...,to seek for a bride equal to the late queen in...,to seek for a bride equal to the late queen in...,to seek for a bride equal to the late queen in...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.949171,0.949171,0.949171,0.920044,1.0,1.0,1.0,0.949171,0.920044,1.0,1.0,1.0
115,115,what did the old man spend his time doing ?,explicit,"there was once an old man and his wife , who l...","looking after the cows , and the hens , and th...","looking after the cows , and the hens , and th...","looking after the cows , and the hens , and th...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.970399,0.970399,0.970399,0.920044,1.0,1.0,1.0,0.970399,0.920044,1.0,1.0,1.0
573,573,why did the happy hunter save his brother ?,explicit,the happy hunter had a kind heart and could no...,he had a kind heart and could not bear the sig...,the happy hunter had a kind heart and could no...,he had a kind heart and could not bear the sig...,1.0,1.0,1.0,1.0,0.882353,0.875,0.882353,0.882353,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,17,17,0.831343,"[0.9411764705882353, 0.9375, 0.933333333333333...",0.88901,0.894737,17,19,1.012825,0.177293,0.595059,0.915671,0.941176,0.9375,0.941176,1.012825,1.0,1.0,1.0,1.0
96,96,what was the jelly fish able to do ?,explicit,"the chief steward thought for some time , and ...",walk on land with his four legs like a tortoise .,walk on land with his four legs like a tortoise .,walk on land with his four legs like a tortoise,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,1.064988,1.064988,1.064988,0.904837,1.0,1.0,1.0,1.064988,0.904837,1.0,1.0,1.0
619,619,why did the old woman beg the priest and the g...,explicit,"left to himself , the full horror of his comin...",she told the young man what he was to do .,she told the young man what he was to do .,she told the young man what he was to do,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,1.01271,1.01271,1.01271,0.904837,1.0,1.0,1.0,1.01271,0.904837,1.0,1.0,1.0


In [None]:
# which implicit questions did best? top 10 baseline test rows by bleu_avg, highest first
df_test[df_test['ex_or_im'] == 'implicit'].sort_values('bleu_avg', ascending=False).head(10)

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
88,88,why would the skillful fisher not accept the f...,implicit,the happy hunter felt that he was to blame for...,he wanted his original hook back .,they are of no use to him .,they are of no use to him,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,"[0.0, 0.0, 0.0, 0.0]",1.0,1.0,7,7,0.866878,"[1.0, 1.0, 1.0, 1.0]",0.866878,0.875,7,8,-1.018798,1.062443,0.021822,0.433439,0.5,0.5,0.5,1.062443,0.866878,1.0,1.0,1.0
294,294,why did the farmer forget about the business h...,implicit,"when the bonze had begun to make his magic , t...",he was interested in the bonze 's magic .,the farmer had mingled with the crowd .,the farmer had mingled with the crowd,0.133333,0.0,0.133333,0.133333,1.0,1.0,1.0,1.0,0.0,"[0.14285714285714285, 0.0, 0.0, 0.0]",0.751477,0.777778,7,9,0.866878,"[1.0, 1.0, 1.0, 1.0]",0.866878,0.875,7,8,-1.36697,1.014066,-0.176452,0.433439,0.566667,0.5,0.566667,1.014066,0.866878,1.0,1.0,1.0
237,237,why did tsui grow sad again ?,implicit,"said molo : "" when she stretched out three fin...",he thought that it would not be possible to go...,the prince 's palace is shut off as thought by...,the prince 's palace is shut off as though by ...,0.258065,0.206897,0.258065,0.258065,0.916667,0.818182,0.916667,0.916667,0.100145,"[0.3333333333333333, 0.2727272727272727, 0.2, ...",0.472367,0.571429,12,21,0.67613,"[0.9166666666666666, 0.8181818181818182, 0.7, ...",0.920044,0.923077,12,13,-1.183965,0.375965,-0.404,0.388138,0.587366,0.512539,0.587366,0.375965,0.67613,0.916667,0.818182,0.916667
639,639,what happened after the deer shook her head ov...,implicit,"' it is the man who is lying under the cask , ...",he was alive again .,ian jumped up as well as ever .,he jumped up as well as ever,0.181818,0.0,0.181818,0.181818,0.857143,0.833333,0.857143,0.857143,0.0,"[0.14285714285714285, 0.0, 0.0, 0.0]",1.0,1.4,7,5,0.701397,"[0.8571428571428571, 0.8333333333333334, 0.8, ...",0.866878,0.875,7,8,-0.874265,0.423447,-0.225409,0.350698,0.519481,0.416667,0.519481,0.423447,0.701397,0.857143,0.833333,0.857143
340,340,why did harold rush out of the room ?,implicit,""" for whom hast thou purchased that ? "" he ask...",he wanted to show lady morna .,to show lady morna how fine he was .,to show the lady morna how fine he was,0.666667,0.307692,0.533333,0.533333,0.941176,0.8,0.941176,0.941176,0.0,"[0.5555555555555556, 0.25, 0.0, 0.0]",1.0,1.285714,9,7,0.660633,"[0.8888888888888888, 0.75, 0.5714285714285714,...",1.0,1.0,9,9,0.028125,0.927688,0.477906,0.330316,0.803922,0.553846,0.737255,0.927688,0.660633,0.941176,0.8,0.941176
295,295,what happened after the widowed mistress revea...,implicit,now the young mistress of kittlerumpit knew th...,she jumped into the air .,"the old woman ran down the brae , shrieking wi...","she ran down the brae , shrieking with rage an...",0.133333,0.0,0.133333,0.133333,0.594595,0.457143,0.486486,0.486486,0.0,"[0.06896551724137931, 0.0, 0.0, 0.0]",1.0,4.833333,29,6,0.320944,"[0.41379310344827586, 0.32142857142857145, 0.2...",1.0,2.071429,29,14,-1.145075,-0.019223,-0.582149,0.160472,0.363964,0.228571,0.30991,-0.019223,0.320944,0.594595,0.457143,0.486486
146,146,why did the jelly fish feel discouraged ?,implicit,the jelly fish wondered at this speech and the...,the monkey tricked him .,the monkey replied laughlingly that he could n...,the jellyfish did n't want to lose his liver .,0.142857,0.0,0.142857,0.142857,0.583333,0.363636,0.583333,0.583333,0.0,"[0.2, 0.0, 0.0, 0.0]",1.0,2.0,10,5,0.286419,"[0.7, 0.4444444444444444, 0.375, 0.28571428571...",0.67032,0.714286,10,14,-1.267366,-0.840598,-1.053982,0.14321,0.363095,0.181818,0.363095,-0.840598,0.286419,0.583333,0.363636,0.583333
434,434,what did the gentleman and the old woman do af...,implicit,it took some time to discover the whereabouts ...,they got married .,go to the chapel .,The gentleman and the old woman went to the ch...,0.0,0.0,0.0,0.0,0.428571,0.333333,0.428571,0.428571,0.0,"[0.09090909090909091, 0.0, 0.0, 0.0]",1.0,2.75,11,4,0.234624,"[0.36363636363636365, 0.3, 0.2222222222222222,...",1.0,2.2,11,5,-1.412994,-0.182248,-0.797621,0.117312,0.214286,0.166667,0.214286,-0.182248,0.234624,0.428571,0.333333,0.428571
437,437,why did the poor widow give a piteous cry ?,implicit,a joyful woman was the mistress of kittlerumpi...,the old woman was a wicked fairy .,the green - clad lady was a fairy .,she knew now what she had not guessed before -...,0.263158,0.166667,0.263158,0.263158,0.368421,0.333333,0.368421,0.368421,0.0,"[0.16666666666666666, 0.08571428571428572, 0.0...",1.0,4.5,36,8,0.191216,"[0.25, 0.2, 0.17647058823529413, 0.15151515151...",1.0,4.0,36,9,-0.733674,-0.33285,-0.533262,0.095608,0.315789,0.25,0.315789,-0.33285,0.191216,0.368421,0.333333,0.368421
916,916,why did the old duke hide his daughter from th...,implicit,"' one day , shortly after the death of the que...",he waited for the day that the king would anno...,the duke vowed that he would hide her safely f...,he vowed that he would hide her from you,0.272727,0.0,0.272727,0.272727,0.5,0.333333,0.4375,0.4375,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.573753,0.642857,9,14,0.106681,"[0.8888888888888888, 0.625, 0.5714285714285714...",0.169013,0.36,9,25,-0.796656,-0.856786,-0.826721,0.05334,0.386364,0.166667,0.355114,-0.796656,0.106681,0.5,0.333333,0.4375


In [None]:
## percentage of baseline test rows whose ROUGE-L is exactly 1 against either reference,
## used here as a proxy for an exact-match generated answer (rounded to 2 decimals)
round(len(df_test[(df_test['rougeL_a1']==1)|(df_test['rougeL_a2']==1)])/len(df_test)*100,2)

15.69