# Review & Compare Results


In [1]:
import pandas as pd
from random import randint
from google.colab import drive
import pprint

drive.mount('/content/drive')
pd.set_option('display.max_columns', None)


Mounted at /content/drive


# FUNCTIONS

In [2]:
def avg_a1_a2(df):
  '''
  how to represent two reference answers into one per evaluation metric
  add columns for test, train, or validation results dataframes
  two strategies: average or max
  '''
  df['bleurt_avg'] = df[['bleurt_score_a1', 'bleurt_score_a2']].mean(axis=1)
  df['bleu_avg'] = df[['bleu_a1', 'bleu_a2']].mean(axis=1)
  df['rouge1_avg'] = df[['rouge1_a1', 'rouge1_a2']].mean(axis=1)
  df['rouge2_avg'] = df[['rouge2_a1', 'rouge2_a2']].mean(axis=1)
  df['rougeL_avg'] = df[['rougeL_a1', 'rougeL_a2']].mean(axis=1)

  df['bleurt_max'] = df[['bleurt_score_a1', 'bleurt_score_a2']].max(axis=1)
  df['bleu_max'] = df[['bleu_a1', 'bleu_a2']].max(axis=1)
  df['rouge1_max'] = df[['rouge1_a1', 'rouge1_a2']].max(axis=1)
  df['rouge2_max'] = df[['rouge2_a1', 'rouge2_a2']].max(axis=1)
  df['rougeL_max'] = df[['rougeL_a1', 'rougeL_a2']].max(axis=1)

  return df

BLEU's output is always a number between 0 and 1. This value indicates how similar the candidate text is to the reference texts, with values closer to 1 representing more similar texts

BLEURT's output is always a number between 0 and (approximately 1). This value indicates how similar the generated text is to the reference texts, with values closer to 1 representing more similar texts

In [3]:
def results_overview(df):
  print(f"{len(df)} total results")
  # Calculate and print average
  avg_rouge = df[['rouge1_avg','rouge2_avg','rougeL_avg','bleu_avg', 'bleurt_avg']].mean()
  print("\nAverage of Average Scores:")
  for metric, score in avg_rouge.items():
    print(f"{metric}: {score:.4f}")

  # Calculate average by question type
  avg_rouge_type = df.groupby(['ex_or_im'])[['rouge1_avg','rouge2_avg','rougeL_avg','bleu_avg','bleurt_avg']].mean()
  print("\nAverage Scores by Question type:")
  print(avg_rouge_type)

  # Calculate and print average of max
  avg_rouge = df[['rouge1_max','rouge2_max','rougeL_max','bleu_max','bleurt_max']].mean()
  print("\nAverage of Max Scores:")
  for metric, score in avg_rouge.items():
    print(f"{metric}: {score:.4f}")

  # Calculate average by question type for max
  max_breakdown = df.groupby(['ex_or_im'])[['rouge1_max','rouge2_max','rougeL_max','bleu_max','bleurt_max']].mean()
  print("\nAverage of Max Scores by Question type:")
  print(max_breakdown)


  # # Print some example summaries
  # print("\nExample Summaries:")
  # i = randint(0,len(df))
  # for i in range(i, i+1):
  #   print(f"\nExample {i}:")
  #   print(f"Reference1: {df.iloc[i]['reference_answer1']}")
  #   print(f"Reference2: {df.iloc[i]['reference_answer2']}")
  #   print(f"Generated: {df.iloc[i]['generated_answer']}")


In [4]:
## proportion with exact match generated answer
def prop_exact_match(df):
  x = round(len(df[(df['rougeL_a1']==1)|(df['rougeL_a2']==1)])/len(df)*100,2)
  return x

In [5]:
def random_outputs(df, num_ex=1):
  i = randint(0,len(df))
  for j in range(i, i+num_ex):
    print(f"\nExample {i}:")
    pprint.pprint(f"Story: {df.iloc[j]['story_section']}")
    print(f"Question: {df.iloc[j]['question']}")
    print(f"Reference1: {df.iloc[j]['reference_answer1']}")
    print(f"Reference2: {df.iloc[j]['reference_answer2']}")
    print(f"Generated: {df.iloc[j]['generated_answer']}")

# DATA

In [6]:
# Out of the Box T5 Baseline results
## results from  notebook

baseline_val = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/T5baseline_val.csv')
baseline_test = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/T5baseline_test.csv')

baseline_val = avg_a1_a2(baseline_val)
baseline_test = avg_a1_a2(baseline_test)

In [7]:
## t5 fine tuned once on implicit and explicit question together
test_overall_df = pd.read_csv('/content/drive/MyDrive/266/FinalProject/answer_outputs/t5_trained_overall_model_both.csv')
test_overall_df = avg_a1_a2(test_overall_df)

In [8]:
# T5 trained on implicit and explicit seperately as two models then results combined
## results from T5_SmallSpecializedIndividualModels notebook

test_both_df = pd.read_csv('/content/drive/MyDrive/266/FinalProject/answer_outputs/t5_trained_specialized.csv')
test_both_df = avg_a1_a2(test_both_df)

In [9]:
# T5 generate params adjusted on implicit and explicit seperately trained models as experiments
## results from 266_Danielle_Dylan_T5_Generate_Experiments.ipynb notebook

# Experiment1 - Explicit Validataion
# num_beams=4, do_sample=True, top_k=100, top_p=1, temperature=0.2, no_repeat_ngram_size=2

d1 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/explicit_val_1.csv')
d1 = avg_a1_a2(d1)

In [10]:
# T5 generate params adjusted on implicit and explicit seperately trained models as experiments
## results from 266_Danielle_Dylan_T5_Generate_Experiments.ipynb notebook

# Experiment2 - Implicit Validataion
# num_beams=4, do_sample=True, top_k=100, top_p=1, temperature=0.2, no_repeat_ngram_size=2

d2 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/implicit_val_2.csv')
d2 = avg_a1_a2(d2)

In [11]:
# T5 generate params adjusted on implicit and explicit seperately trained models as experiments
## results from 266_Danielle_Dylan_T5_Generate_Experiments.ipynb notebook

# Experiment3 - Implicit Validataion
# num_beams=1, do_sample=False, top_k=50, top_p=1, temperature=1, no_repeat_ngram_size=0

d3 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/implicit_val_3.csv')
d3 = avg_a1_a2(d3)

In [12]:
# T5 generate params adjusted on implicit and explicit seperately trained models as experiments
## results from 266_Danielle_Dylan_T5_Generate_Experiments.ipynb notebook

# Experiment4 - Explicit Validataion
# num_beams=1, do_sample=False, top_k=50, top_p=1, temperature=1, no_repeat_ngram_size=0

d4 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/explicit_val_4.csv')
d4 = avg_a1_a2(d4)

In [13]:
# model fintuning experiment 5 generation experiment 1

dm51 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/t5_model_im_experiments5_test_1.csv')
dm51 = avg_a1_a2(dm51)

In [14]:
# model fintuning experiment 3 generation experiment 1

dm31 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/t5_model_im_experiments3_test_1.csv')
dm31 = avg_a1_a2(dm31)

In [15]:
# model fintuning experiment 2 generation experiment 1

dm21 = pd.read_csv('/content/drive/MyDrive/266/FinalProject/results/t5_model_im_experiments2_test_1.csv')
dm21 = avg_a1_a2(dm21)


In [16]:
# qwen implicit

dqw = pd.read_csv('/content/drive/MyDrive/266/FinalProject/answer_outputs/qwen_finetuned_imOnly_chat.csv')
dqw = avg_a1_a2(dqw)


In [17]:
# cot_implicitOnly_file = 't5_trained_modelIm_cot_prompting_implicitOnly.csv'
cot = pd.read_csv('/content/drive/MyDrive/266/FinalProject/answer_outputs/t5_trained_modelIm_cot_prompting_implicitOnly.csv')
cot = avg_a1_a2(cot)

In [18]:
dm5ex =  pd.read_csv('/content/drive/MyDrive/266/FinalProject/answer_outputs/t5_trained_specialized_smaller_lr.csv')
dm5ex = dm5ex[dm5ex['ex_or_im']=='explicit']
dm5ex = avg_a1_a2(dm5ex)



In [19]:
frankensteinqwen = pd.concat([dm5ex, dqw])
frankensteinqwen.shape

(1007, 39)

In [26]:
frankenstein5 = pd.concat([dm5ex,dm51])
frankenstein5.shape

(1007, 39)

In [19]:
### Create one dataset for all Implicit TEST model answers to easily compare generated responses

df_IMPLICIT = baseline_test[baseline_test['ex_or_im'] == 'implicit'][['story_section','question','reference_answer1','reference_answer2','generated_answer']]
df_IMPLICIT.rename(columns={'generated_answer':'baseline_gen_ans'}, inplace=True)
# add dm51
df_IMPLICIT = pd.merge(df_IMPLICIT,dm51[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_IMPLICIT.rename(columns={'generated_answer':'dm51_gen_ans'}, inplace=True)
# qwen
df_IMPLICIT = pd.merge(df_IMPLICIT,dqw[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_IMPLICIT.rename(columns={'generated_answer':'dqw_gen_ans'}, inplace=True)
# cot
df_IMPLICIT = pd.merge(df_IMPLICIT,cot[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_IMPLICIT.rename(columns={'generated_answer':'cot_gen_ans'}, inplace=True)
# add test_both_df
df_IMPLICIT = pd.merge(df_IMPLICIT,test_both_df[test_both_df['ex_or_im']=='implicit'][['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_IMPLICIT.rename(columns={'generated_answer':'both_gen_ans'}, inplace=True)
# add dm31
df_IMPLICIT = pd.merge(df_IMPLICIT,dm31[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_IMPLICIT.rename(columns={'generated_answer':'dm31_gen_ans'}, inplace=True)
# add dm21
df_IMPLICIT = pd.merge(df_IMPLICIT,dm21[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_IMPLICIT.rename(columns={'generated_answer':'dm21_gen_ans'}, inplace=True)


df_IMPLICIT.head()

Unnamed: 0,story_section,question,reference_answer1,reference_answer2,baseline_gen_ans,dm51_gen_ans,dqw_gen_ans,cot_gen_ans,both_gen_ans,dm31_gen_ans,dm21_gen_ans
0,' bear me to the kingdom of the king under the...,how did ian feel after he reached the place wh...,nervous .,excited .,glad,glad .,"he felt relieved, happy, and contented.",1. the lady 2. food and drink 3. the young man...,glad,he wanted to be .,glad
1,"but the herring were now ready , and the stude...",how did prince feel after the young men left ?,sad .,woeful .,ahti,happy that he had good ears and had laid to he...,prince felt very sorry because he had been mis...,1. the students 2. some cold meat 3. the stude...,None of the above choices,he wanted to be .,None of the above choices
2,the king danced again with the beautiful maide...,why did the king put a gold ring on the maiden...,to figure out who she was .,so that he can find her later .,the king commanded that the dance should last ...,he wanted to keep her hands in his hand .,the king wanted to keep her hands in his .,1. the king commanded that the dance should la...,None of the above choices,he wanted to be .,None of the above choices
3,"on they went , on and on and one , till they r...",how did the robbers feel after they realized s...,angry .,angry .,ian,happy .,they felt guilty and ashamed.,1. the house belonged to some robbers 2. the h...,None of the above choices,he wanted to .,None of the above choices
4,""" yes , and i 'm sure it is because you have t...",what did the woman do because the boy threw th...,kicked him out .,"tolds him to "" get out and earn your own livin...",she was a dunderhead and will stay a dunderhea...,he found a chest full of silver coins under th...,she was angry because she believed that nothin...,1. a chest full of bright silver coins 2. a ch...,None of the above choices .,he wanted to be .,None of the above choices .


In [77]:
frankenstein5[frankenstein5['question'].str.contains("how did the king feel after the gentlema")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
157,157,how did the king feel after the gentleman gave...,implicit,"' by and bye i was born , and was brought up b...",surprised .,choked - up .,happy .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.759836,"[0.6666666666666666, 0.5, 1.0, 1.0]",1.0,1.0,2,2,0.279528,"[0.6666666666666666, 0.5, 1.0, 1.0]",0.367879,0.5,2,4,-0.547631,-1.951132,-1.249381,0.519682,0.0,0.0,0.0,-0.547631,0.759836,0.0,0.0,0.0
205,205,how did the king feel after the gentleman did ...,implicit,"at last the ceremony was over , and the king ,...",unhappy .,upset by his son not wanting to keep his promi...,he would rather receive a bride chosen by the ...,0.0,0.0,0.0,0.0,0.095238,0.0,0.095238,0.095238,0.103903,"[0.15384615384615385, 0.08333333333333333, 0.0...",1.0,6.0,12,2,0.114988,"[0.23076923076923078, 0.08333333333333333, 0.0...",1.0,1.090909,12,11,-1.866862,-1.211088,-1.538975,0.109445,0.047619,0.0,0.047619,-1.211088,0.114988,0.095238,0.0,0.095238


In [73]:
# frankenstein5[frankenstein5['id']==181]
frankenstein5[frankenstein5['reference_answer1'].str.contains("delighted")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
181,181,how did the king feel seeing the maiden again ?,explicit,"when the many - furred creature came , the kin...",delighted .,delighted .,"delighted to see her again , and as the dance ...",0.125,0.0,0.125,0.125,0.125,0.0,0.125,0.125,0.075358,"[0.15789473684210525, 0.05555555555555555, 0.0...",1.0,9.0,18,2,0.075358,"[0.15789473684210525, 0.05555555555555555, 0.0...",1.0,9.0,18,2,-1.294472,-1.294472,-1.294472,0.075358,0.125,0.0,0.125,-1.294472,0.075358,0.125,0.0,0.125
181,181,how did ian feel after he realized he missed m...,implicit,as soon as the lady had departed the fisher 's...,sad .,heart broken .,happy .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.759836,"[0.6666666666666666, 0.5, 1.0, 1.0]",1.0,1.0,2,2,0.460864,"[0.6666666666666666, 0.5, 1.0, 1.0]",0.606531,0.666667,2,3,-0.579265,-1.664623,-1.121944,0.61035,0.0,0.0,0.0,-0.579265,0.759836,0.0,0.0,0.0


In [78]:
baseline_test[baseline_test['question'].str.contains("how did the king feel after the gentlema")]


Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
151,151,how did the king feel after the gentleman gave...,implicit,"' by and bye i was born , and was brought up b...",surprised .,choked - up .,the king did not move from his seat .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.1111111111111111, 0.0, 0.0, 0.0]",1.0,4.5,9,2,0.0,"[0.1111111111111111, 0.0, 0.0, 0.0]",1.0,2.25,9,4,-1.580027,-1.548232,-1.564129,0.0,0.0,0.0,0.0,-1.548232,0.0,0.0,0.0,0.0
253,253,how did the king feel after the gentleman did ...,implicit,"at last the ceremony was over , and the king ,...",unhappy .,upset by his son not wanting to keep his promi...,frowned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0]",0.367879,0.5,1,2,0.0,"[0.0, 0.0, 0.0, 0.0]",4.5e-05,0.090909,1,11,-0.056498,-1.671472,-0.863985,0.0,0.0,0.0,0.0,-0.056498,0.0,0.0,0.0,0.0


In [76]:
# elighted .
# 253
baseline_test[baseline_test['reference_answer1'].str.contains("happy .")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
132,132,what happened because all the fishes-in-waitin...,explicit,there was much gladness between the sea king a...,wonderful array of sea creatures waited upon t...,a wonderful array of sea creatures it was that...,ryn jin commanded his daughters to play some m...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0]",0.800737,0.818182,9,11,0.0,"[0.0, 0.0, 0.0, 0.0]",0.513417,0.6,9,15,-1.237066,-1.275485,-1.256275,0.0,0.0,0.0,0.0,-1.237066,0.0,0.0,0.0,0.0
204,204,how did the princess feel in her new home ?,explicit,in a few days she heard that a great hunt was ...,happy .,too happy .,The princess was too happy in her new home to ...,0.095238,0.0,0.095238,0.095238,0.181818,0.1,0.181818,0.181818,0.0,"[0.09523809523809523, 0.0, 0.0, 0.0]",1.0,10.5,21,2,0.0,"[0.14285714285714285, 0.05, 0.0, 0.0]",1.0,7.0,21,3,-1.321007,-1.074576,-1.197791,0.0,0.138528,0.05,0.138528,-1.074576,0.0,0.181818,0.1,0.181818
253,253,how did the king feel after the gentleman did ...,implicit,"at last the ceremony was over , and the king ,...",unhappy .,upset by his son not wanting to keep his promi...,frowned,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0]",0.367879,0.5,1,2,0.0,"[0.0, 0.0, 0.0, 0.0]",4.5e-05,0.090909,1,11,-0.056498,-1.671472,-0.863985,0.0,0.0,0.0,0.0,-0.056498,0.0,0.0,0.0,0.0
389,389,how did the man feel about being outside ?,implicit,"one day the firewood gave out in his kitchen ,...",happy .,happy .,the old man enjoyed the fresh air and was in n...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0625, 0.0, 0.0, 0.0]",1.0,8.0,16,2,0.0,"[0.0625, 0.0, 0.0, 0.0]",1.0,8.0,16,2,-1.126065,-1.126065,-1.126065,0.0,0.0,0.0,0.0,-1.126065,0.0,0.0,0.0,0.0
393,393,how did lady morna feel after paul proposed to...,implicit,at these words harold 's vanity was flattered ...,happy .,excited .,she was saddened by paul 's decision .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.125, 0.0, 0.0, 0.0]",1.0,4.0,8,2,0.0,"[0.125, 0.0, 0.0, 0.0]",1.0,4.0,8,2,-1.440022,-1.623094,-1.531558,0.0,0.0,0.0,0.0,-1.440022,0.0,0.0,0.0,0.0
491,491,why did the widowed mistress make a rash promi...,implicit,""" madam , "" she said , with a sob in her voice...",she was too happy that her pig was going to be...,the old woman could ask for anything in return .,she was a pig - sty .,0.352941,0.133333,0.352941,0.352941,0.0,0.0,0.0,0.0,0.0,"[0.5714285714285714, 0.16666666666666666, 0.0,...",0.424373,0.538462,7,13,0.0,"[0.14285714285714285, 0.0, 0.0, 0.0]",0.651439,0.7,7,10,-1.340246,-1.241013,-1.29063,0.0,0.176471,0.066667,0.176471,-1.241013,0.0,0.352941,0.133333,0.352941
506,506,what did the skillful fisher plan to use the l...,explicit,nothing would appease the anger of the skillfu...,kill the happy hunter and to usurp his place a...,to kill and usupr his place as ruler of japan .,to kill him and to usurp his place as ruler of...,0.8,0.695652,0.8,0.8,0.818182,0.6,0.818182,0.818182,0.617303,"[0.8333333333333334, 0.7272727272727273, 0.7, ...",0.846482,0.857143,12,14,0.48327,"[0.75, 0.5454545454545454, 0.4, 0.333333333333...",1.0,1.090909,12,11,-0.079388,0.266727,0.09367,0.550286,0.809091,0.647826,0.809091,0.266727,0.617303,0.818182,0.695652,0.818182
638,638,how did the skillful fisher know something mus...,explicit,"he now began to feel extremely anxious , for h...",he saw the happy hunter searching about on the...,when he saw the happy hunter seraching about o...,"the happy hunter went forward timidly , for he...",0.242424,0.129032,0.181818,0.181818,0.235294,0.125,0.176471,0.176471,0.0,"[0.16666666666666666, 0.06896551724137931, 0.0...",1.0,2.727273,30,11,0.0,"[0.16666666666666666, 0.06896551724137931, 0.0...",1.0,2.5,30,12,-1.0785,-1.241834,-1.160167,0.0,0.238859,0.127016,0.179144,-1.0785,0.0,0.242424,0.129032,0.181818
683,683,how did the widowed mistress feel while she th...,implicit,"at any rate , his poor young wife was sorely t...",happy .,very happy .,she was a savage .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.2, 0.0, 0.0, 0.0]",1.0,2.5,5,2,0.0,"[0.2, 0.0, 0.0, 0.0]",1.0,1.666667,5,3,-1.763487,-1.612019,-1.687753,0.0,0.0,0.0,0.0,-1.612019,0.0,0.0,0.0,0.0
770,770,how did snorro the dwarf feel after harold cam...,implicit,"so he made haste to hire a boat , and soon he ...",happy .,happy because he will get more gold .,snorro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0]",0.367879,0.5,1,2,0.0,"[0.0, 0.0, 0.0, 0.0]",0.000912,0.125,1,8,-0.724556,-1.331763,-1.028159,0.0,0.0,0.0,0.0,-0.724556,0.0,0.0,0.0,0.0


In [79]:
df_IMPLICIT[df_IMPLICIT['question'].str.contains("how did the king feel after the gentlema")]


Unnamed: 0,story_section,question,reference_answer1,reference_answer2,baseline_gen_ans,dm51_gen_ans,dqw_gen_ans,cot_gen_ans,both_gen_ans,dm31_gen_ans,dm21_gen_ans
37,"' by and bye i was born , and was brought up b...",how did the king feel after the gentleman gave...,surprised .,choked - up .,the king did not move from his seat .,happy .,the king felt relieved that the gentleman had ...,1. the rings 2. if i am your son or not 3. if ...,The king felt he was a king .,he wanted to be .,The king felt that he was a prince .
60,"at last the ceremony was over , and the king ,...",how did the king feel after the gentleman did ...,unhappy .,upset by his son not wanting to keep his promi...,frowned,he would rather receive a bride chosen by the ...,"the king frowned, and answered sharply.",1. how did the king feel after the gentleman d...,frowned,he wanted to be .,the king frowned


In [38]:
df_IMPLICIT[df_IMPLICIT['baseline_gen_ans'].str.contains("None")]

Unnamed: 0,story_section,question,reference_answer1,reference_answer2,baseline_gen_ans,dm51_gen_ans,dqw_gen_ans,cot_gen_ans,both_gen_ans,dm31_gen_ans,dm21_gen_ans
8,between the sea realm and the earth there was ...,what will the happy hunter do because he is ho...,try to go home .,runaway back home .,None of the above choices .,repress a great anxiety to know what had happe...,he will go back to his homeland and family to ...,1. the wonders of that enchanted land seemed t...,None of the above choices .,he wanted to be .,None of the above choices .
181,"so the matter was settled by this talk , and t...",what will happen when the happy hunter tries t...,he wo n't catch any fish .,he wo nt catch any fish .,None of the above choices .,he will learn nothing about hunting .,"the happy hunter will be caught, while the ski...",1. what will happen to the happy hunter 2. wha...,None of the above choices .,he wanted to be .,None of the above choices .
182,""" oh , sweet madam mistress , "" she cried , "" ...",why didn't the old woman want to take the pig ?,she did not eat bacon .,she had no need of bacon .,None of the above choices .,he had no time to waste in this wise .,the old woman did not want to take the pig bec...,"1. oh , sweet madam mistress 2. if thou wilt n...",None of the above choices .,he wanted to be .,None of the above choices .
193,"but his brother complained of being weary , an...",how did andrew feel when he saw his brother an...,confused .,he thought him to be a fool .,None of the above choices .,happy .,andrew felt tired after waking up and seeing n...,1. his brother complained of being weary 2. he...,andrew did not understand the whole affair,he wanted to be .,andrew did not understand the whole affair
208,""" what is your name ? "" asked the girl from un...",why did the girl's father think the girl burne...,the woman told the girl her name was self .,"because she shouted "" self burned me "" .",None of the above choices .,he had a curious name to the girl .,the girl's father thought the girl had burned ...,1. What is your name ? 2. What is your name ? ...,None of the above choices .,he wanted to .,None of the above choices .
211,""" nothing easier , "" said molo . "" on the fift...",what did molo need to do to help tsui meet the...,kill the wild dog on guard at the girl 's court .,carry tsui there .,None of the above choices .,take two pieces of silk and wrap ourselves up ...,"molo needed to wrap himself up in dark silk, t...",1. two pieces of silk and wrap ourselves up in...,None of the above choices .,he wanted to be .,None of the above choices .
231,"he said not a word , however , but silently ga...",what happened because the happy hunter was qui...,the women did not know he was there .,they were unaware they were being watched .,None of the above choices .,he had posted himself .,the happy hunter was quite hidden among the br...,1. the happy hunter was quite hidden among the...,None of the above choices .,he wanted to be .,None of the above choices .


### NOTES


*   question: what will happen after harold finds out about paul's and lady

    Metrics are better on dm51>qwen but dm51 respone is nonsense while qwen response is a strong meaningful answer that could be seen as correct

*   #8 what will the happy hunter do because he is..
  good example



In [43]:
baseline_test[baseline_test['question'].str.contains("what will happen after harold finds out about paul's and lady")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
337,337,what will happen after harold finds out about ...,implicit,she agreed willingly . hiding her rosy face on...,harold will try to win lady morna 's love .,he will kill paul .,snorro,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0]",0.000123,0.1,1,10,0.0,"[0.0, 0.0, 0.0, 0.0]",0.018316,0.2,1,5,-1.258609,-1.148662,-1.203635,0.0,0.0,0.0,0.0,-1.148662,0.0,0.0,0.0,0.0


In [41]:

df_IMPLICIT[df_IMPLICIT['question'].str.contains("what will happen after harold finds out about paul's and lady")]

Unnamed: 0,story_section,question,reference_answer1,reference_answer2,baseline_gen_ans,dm51_gen_ans,dqw_gen_ans,cot_gen_ans,both_gen_ans,dm31_gen_ans,dm21_gen_ans
78,she agreed willingly . hiding her rosy face on...,what will happen after harold finds out about ...,harold will try to win lady morna 's love .,he will kill paul .,snorro,the lady morna .,he will be furious and will try to break off t...,harold,harold,he wanted to .,harold


In [32]:
dm51[dm51['question'].str.contains("what will happen after harold finds out about paul's and lady")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
75,75,what will happen after harold finds out about ...,implicit,she agreed willingly . hiding her rosy face on...,harold will try to win lady morna 's love .,he will kill paul .,the lady morna .,0.333333,0.2,0.333333,0.333333,0.0,0.0,0.0,0.0,0.11338,"[0.8, 0.5, 0.3333333333333333, 0.5]",0.22313,0.4,4,10,0.279826,"[0.4, 0.25, 0.3333333333333333, 0.5]",0.778801,0.8,4,5,-0.980654,-1.614753,-1.297703,0.196603,0.166667,0.1,0.166667,-0.980654,0.279826,0.333333,0.2,0.333333


In [30]:
dqw[dqw['generated_answer'].str.contains("he will be furious and will try to break off the engagement ")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
50,50,what will happen after harold finds out about ...,implicit,she agreed willingly . hiding her rosy face on...,harold will try to win lady morna 's love .,he will kill paul .,he will be furious and will try to break off t...,0.230769,0.166667,0.230769,0.230769,0.190476,0.105263,0.190476,0.190476,0.134008,"[0.2631578947368421, 0.16666666666666666, 0.11...",1.0,1.8,18,10,0.096299,"[0.21052631578947367, 0.1111111111111111, 0.05...",1.0,3.6,18,5,-1.115495,-1.244859,-1.180177,0.115154,0.210623,0.135965,0.210623,-1.115495,0.134008,0.230769,0.166667,0.230769


In [42]:
# get random Examples from aggregated TEST Implicit
i = randint(0,len(df_IMPLICIT))
i=78
for j in range(i, i+2):
  print(f"\nExample {j}:")
  for col in df_IMPLICIT.columns:
      pprint.pprint(f"{col.upper()}: {df_IMPLICIT.iloc[j][col]}")


Example 78:
('STORY_SECTION: she agreed willingly . hiding her rosy face on his shoulder , '
 'she confessed that she had loved him from the very first day that she had '
 'seen him ; and ever since that moment she had determined that , if she could '
 'not we d him , she would we d no other man . for a little time they sat '
 'together , rejoicing in their new - found happiness . then earl paul sprang '
 'to his feet . " let us go and tell the good news to my mother and my brother '
 ', " he said . " harold may be disappointed at first , for i kn')
("QUESTION: what will happen after harold finds out about paul's and lady "
 "morna's engagement ?")
"REFERENCE_ANSWER1: harold will try to win lady morna 's love ."
'REFERENCE_ANSWER2: he will kill paul .'
'BASELINE_GEN_ANS: snorro'
'DM51_GEN_ANS: the lady morna .'
('DQW_GEN_ANS: he will be furious and will try to break off the engagement '
 'before it can take place.')
'COT_GEN_ANS: harold'
'BOTH_GEN_ANS: harold'
'DM31_GEN_ANS: he wanted

In [69]:
### Create one dataset for all VALIDATION model answers to easily compare generated responses

df_VALID = baseline_val[['ex_or_im','story_section','question','reference_answer1','reference_answer2','generated_answer']]
df_VALID.rename(columns={'generated_answer':'baseline_gen_ans'}, inplace=True)

df_VALID = pd.merge(df_VALID,d2[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_VALID.rename(columns={'generated_answer':'d2_gen_ans'}, inplace=True)

df_VALID = pd.merge(df_VALID,d3[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_VALID.rename(columns={'generated_answer':'d3_gen_ans'}, inplace=True)

df_VALID = pd.merge(df_VALID,d1[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_VALID.rename(columns={'generated_answer':'d1_gen_ans'}, inplace=True)

df_VALID = pd.merge(df_VALID,d4[['story_section','question','generated_answer']],on=['story_section','question'], how='left')
df_VALID.rename(columns={'generated_answer':'d4_gen_ans'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_VALID.rename(columns={'generated_answer':'baseline_gen_ans'}, inplace=True)


In [70]:
# get random Examples from aggregated TEST Implicit
i = randint(0,len(df_VALID))
for j in range(i, i+1):
  print(f"\nExample {i}:")
  for col in df_VALID.columns:
      pprint.pprint(f"{col.upper()}: {df_VALID.iloc[j][col]}")


Example 960:
'EX_OR_IM: explicit'
('STORY_SECTION: and so the years passed happily away till the boy was eight '
 'years old , but then the widow fell sick , and their little store of money '
 'melted gradually away . " i do n\'t know what we shall do for bread , " she '
 'said , kissing her boy with tears in her eyes , " for i am not yet strong '
 'enough to work , and we have no money left . " " but i can work , " answered '
 'the boy ; " and i \'m sure if i go to the squire up at the hall he will give '
 'me something to do . " at first the widow was reluctant to cons')
'QUESTION: why did the mother not know what they would do for bread ?'
('REFERENCE_ANSWER1: she was not yet strong enough to work , and they had no '
 'money left .')
('REFERENCE_ANSWER2: she was not yet strong enough to work , and they had no '
 'money left .')
'BASELINE_GEN_ANS: she loved to keep her child at her side .'
'D2_GEN_ANS: nan'
'D3_GEN_ANS: nan'
'D1_GEN_ANS: None of the above choices .'
'D4_GEN_ANS: The

# COMPARE All Outputs

* Experiment 1&4 compare validation explicit data
* Experiment 2&3 compare validation implicit data
* Experiment 1&2 have the same generate kwargs
* Experiment 3&4 have the same generate kwargs

1. Does the difference in generate kwargs have the same effect on implicit data that it does on explicit data?




In [27]:
print("baseline_test")
results_overview(baseline_test)

baseline_test
1007 total results

Average of Average Scores:
rouge1_avg: 0.3317
rouge2_avg: 0.2032
rougeL_avg: 0.3267
bleu_avg: 0.1008
bleurt_avg: -0.7133

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.397718    0.258989    0.393336  0.131266   -0.596215
implicit    0.134935    0.037120    0.128018  0.010113   -1.062269

Average of Max Scores:
rouge1_max: 0.3806
rouge2_max: 0.2375
rougeL_max: 0.3744
bleu_max: 0.1236
bleurt_max: -0.5825

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.444076    0.296143    0.438596  0.158457   -0.471695
implicit    0.191502    0.062713    0.183135  0.019831   -0.912588


In [20]:
results_overview(frankensteinqwen)

1007 total results

Average of Average Scores:
rouge1_avg: 0.3480
rouge2_avg: 0.2603
rougeL_avg: 0.3393
bleu_avg: 0.2806
bleurt_avg: -0.6800

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.412570    0.333265    0.405282  0.336769   -0.539196
implicit    0.155609    0.042912    0.142497  0.113012   -1.099709

Average of Max Scores:
rouge1_max: 0.3985
rouge2_max: 0.3080
rougeL_max: 0.3878
bleu_max: 0.3232
bleurt_max: -0.5343

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.463056     0.38768    0.454867  0.388727   -0.396875
implicit    0.206299     0.07045    0.187909  0.127909   -0.943733


In [27]:
results_overview(frankenstein5)

1007 total results

Average of Average Scores:
rouge1_avg: 0.3488
rouge2_avg: 0.2611
rougeL_avg: 0.3421
bleu_avg: 0.3215
bleurt_avg: -0.6642

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.412570    0.333265    0.405282  0.336769   -0.539196
implicit    0.158918    0.045860    0.153941  0.275911   -1.036724

Average of Max Scores:
rouge1_max: 0.4032
rouge2_max: 0.3099
rougeL_max: 0.3952
bleu_max: 0.3722
bleurt_max: -0.5131

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.463056    0.387680    0.454867  0.388727   -0.396875
implicit    0.224774    0.078151    0.217556  0.322991   -0.859306


In [None]:
frankenstein5[frankenstein5['ex_or_im']=='explicit']

In [28]:
print("*"*20+"dm51"+"*"*20)
results_overview(dm51)

********************dm51********************
253 total results

Average of Average Scores:
rouge1_avg: 0.1589
rouge2_avg: 0.0459
rougeL_avg: 0.1539
bleu_avg: 0.2759
bleurt_avg: -1.0367

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit    0.158918     0.04586    0.153941  0.275911   -1.036724

Average of Max Scores:
rouge1_max: 0.2248
rouge2_max: 0.0782
rougeL_max: 0.2176
bleu_max: 0.3230
bleurt_max: -0.8593

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.224774    0.078151    0.217556  0.322991   -0.859306


In [29]:
results_overview(dqw)

253 total results

Average of Average Scores:
rouge1_avg: 0.1556
rouge2_avg: 0.0429
rougeL_avg: 0.1425
bleu_avg: 0.1130
bleurt_avg: -1.0997

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit    0.155609    0.042912    0.142497  0.113012   -1.099709

Average of Max Scores:
rouge1_max: 0.2063
rouge2_max: 0.0704
rougeL_max: 0.1879
bleu_max: 0.1279
bleurt_max: -0.9437

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.206299     0.07045    0.187909  0.127909   -0.943733


In [30]:
results_overview(cot)

253 total results

Average of Average Scores:
rouge1_avg: 0.0532
rouge2_avg: 0.0161
rougeL_avg: 0.0481
bleu_avg: 0.0391
bleurt_avg: -1.4965

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit    0.053247    0.016107    0.048084  0.039136   -1.496489

Average of Max Scores:
rouge1_max: 0.0735
rouge2_max: 0.0268
rougeL_max: 0.0673
bleu_max: 0.0462
bleurt_max: -1.3965

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.073513    0.026816    0.067264  0.046225   -1.396537


In [41]:
results_overview(test_overall_df)

1007 total results

Average of Average Scores:
rouge1_avg: 0.2754
rouge2_avg: 0.1569
rougeL_avg: 0.2711
bleu_avg: 0.2245
bleurt_avg: -0.8902

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.332096    0.201379    0.327607  0.240696   -0.786404
implicit    0.106251    0.024271    0.102561  0.176222   -1.199579

Average of Max Scores:
rouge1_max: 0.3134
rouge2_max: 0.1791
rougeL_max: 0.3083
bleu_max: 0.2551
bleurt_max: -0.7876

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.368750    0.225212     0.36384  0.271754   -0.686074
implicit    0.148471    0.041540     0.14260  0.205352   -1.090116


In [31]:
print("*"*20+"dm31"+"*"*20)
results_overview(dm31)

********************dm31********************
253 total results

Average of Average Scores:
rouge1_avg: 0.1023
rouge2_avg: 0.0168
rougeL_avg: 0.1008
bleu_avg: 0.2078
bleurt_avg: -1.3897

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit     0.10234    0.016813    0.100837  0.207794   -1.389747

Average of Max Scores:
rouge1_max: 0.1465
rouge2_max: 0.0301
rougeL_max: 0.1448
bleu_max: 0.2393
bleurt_max: -1.2857

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.146509    0.030112     0.14484  0.239256    -1.28566


In [32]:
print("*"*20+"dm21"+"*"*20)
results_overview(dm21)

********************dm21********************
253 total results

Average of Average Scores:
rouge1_avg: 0.1081
rouge2_avg: 0.0221
rougeL_avg: 0.1049
bleu_avg: 0.1738
bleurt_avg: -1.1798

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit    0.108149    0.022065    0.104949   0.17379   -1.179804

Average of Max Scores:
rouge1_max: 0.1536
rouge2_max: 0.0397
rougeL_max: 0.1488
bleu_max: 0.2016
bleurt_max: -1.0639

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.153577    0.039672    0.148768  0.201644   -1.063881


In [33]:
print("baseline_val")
results_overview(baseline_val)

baseline_val
1025 total results

Average of Average Scores:
rouge1_avg: 0.3268
rouge2_avg: 0.2124
rougeL_avg: 0.3212
bleu_avg: 0.2579
bleurt_avg: -0.7373

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.399089    0.276773    0.393677  0.294019   -0.609512
implicit    0.135469    0.042080    0.129348  0.162333   -1.075791

Average of Max Scores:
rouge1_max: 0.3757
rouge2_max: 0.2497
rougeL_max: 0.3693
bleu_max: 0.2976
bleurt_max: -0.6062

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit     0.44804    0.317740    0.441837  0.338185   -0.479134
implicit     0.18415    0.069513    0.177392  0.190082   -0.942504


In [34]:
print("test_both_df")
results_overview(test_both_df)

test_both_df
1007 total results

Average of Average Scores:
rouge1_avg: 0.3021
rouge2_avg: 0.1884
rougeL_avg: 0.2974
bleu_avg: 0.2472
bleurt_avg: -0.8316

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.366860    0.243352    0.361693  0.271573   -0.711435
implicit    0.108993    0.024433    0.105666  0.174756   -1.189657

Average of Max Scores:
rouge1_max: 0.3447
rouge2_max: 0.2164
rougeL_max: 0.3396
bleu_max: 0.2807
bleurt_max: -0.7214

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.408083    0.274442    0.402815  0.306382   -0.603638
implicit    0.155786    0.043268    0.151042  0.204223   -1.072292


In [35]:
print("d1 - Experiment 1 - Validation Explicit")
print("num_beams=4, do_sample=True, top_k=100, top_p=1, temperature=0.2, no_repeat_ngram_size=2")
results_overview(d1)

d1 - Experiment 1 - Validation Explicit
num_beams=4, do_sample=True, top_k=100, top_p=1, temperature=0.2, no_repeat_ngram_size=2
744 total results

Average of Average Scores:
rouge1_avg: 0.3414
rouge2_avg: 0.2260
rougeL_avg: 0.3347
bleu_avg: 0.2484
bleurt_avg: -0.8058

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit    0.341388    0.226039    0.334671  0.248406   -0.805753

Average of Max Scores:
rouge1_max: 0.3819
rouge2_max: 0.2588
rougeL_max: 0.3751
bleu_max: 0.2843
bleurt_max: -0.6993

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.381881    0.258797     0.37511  0.284284     -0.6993


In [36]:
print("d4 - Experiment 4 - Validation Explicit")
print("num_beams=1, do_sample=False, top_k=50, top_p=1, temperature=1, no_repeat_ngram_size=0")
results_overview(d4)

d4 - Experiment 4 - Validation Explicit
num_beams=1, do_sample=False, top_k=50, top_p=1, temperature=1, no_repeat_ngram_size=0
744 total results

Average of Average Scores:
rouge1_avg: 0.3364
rouge2_avg: 0.1955
rougeL_avg: 0.3300
bleu_avg: 0.2172
bleurt_avg: -0.7908

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
explicit      0.3364    0.195486    0.329972  0.217178   -0.790786

Average of Max Scores:
rouge1_max: 0.3763
rouge2_max: 0.2231
rougeL_max: 0.3701
bleu_max: 0.2485
bleurt_max: -0.6890

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
explicit    0.376342    0.223056    0.370086  0.248497   -0.689026


In [37]:
print("d2 - Experiment 2 - Validation Implicit\n")
print("num_beams=4, do_sample=True, top_k=100, top_p=1, temperature=0.2, no_repeat_ngram_size=2")
results_overview(d2)

d2 - Experiment 2 - Validation Implicit

num_beams=4, do_sample=True, top_k=100, top_p=1, temperature=0.2, no_repeat_ngram_size=2
281 total results

Average of Average Scores:
rouge1_avg: 0.1084
rouge2_avg: 0.0237
rougeL_avg: 0.1031
bleu_avg: 0.1618
bleurt_avg: -1.1810

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit    0.108363    0.023668     0.10309  0.161799    -1.18103

Average of Max Scores:
rouge1_max: 0.1501
rouge2_max: 0.0388
rougeL_max: 0.1436
bleu_max: 0.1847
bleurt_max: -1.0718

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.150107    0.038796    0.143608  0.184662   -1.071774


In [38]:
print("d3 - Experiment 3 - Validation Implicit")
print("num_beams=1, do_sample=False, top_k=50, top_p=1, temperature=1, no_repeat_ngram_size=0")
results_overview(d3)

d3 - Experiment 3 - Validation Implicit
num_beams=1, do_sample=False, top_k=50, top_p=1, temperature=1, no_repeat_ngram_size=0
281 total results

Average of Average Scores:
rouge1_avg: 0.1454
rouge2_avg: 0.0358
rougeL_avg: 0.1402
bleu_avg: 0.1796
bleurt_avg: -1.0862

Average Scores by Question type:
          rouge1_avg  rouge2_avg  rougeL_avg  bleu_avg  bleurt_avg
ex_or_im                                                          
implicit    0.145403    0.035832     0.14018  0.179634   -1.086193

Average of Max Scores:
rouge1_max: 0.1885
rouge2_max: 0.0529
rougeL_max: 0.1821
bleu_max: 0.2024
bleurt_max: -0.9676

Average of Max Scores by Question type:
          rouge1_max  rouge2_max  rougeL_max  bleu_max  bleurt_max
ex_or_im                                                          
implicit    0.188531    0.052891    0.182098  0.202423   -0.967605


### Output

In [28]:
random_outputs(baseline_test,2)


Example 546:
('Story: but after her mother had gone , she found so many berries that she '
 'forgot to say her verse , and so she was enchanted and taken into the hill . '
 'and there no harm had been done her , save that she had lost the top joint '
 'of the little finger of her left hand , and the goblins had been kind to her '
 '; yet it had always seemed to her as though something were not as it should '
 'be , she felt as though something weighed upon her , and she had suffered '
 'greatly from the advances of the dwarf who had been ch')
Question: how were the goblins towards the bride ?
Reference1: kind .
Reference2: kind to her .
Generated: andrew was enchanted and taken into the hill

Example 546:
('Story: his vengeance was baulked , however , for in the panic and confusion '
 "that followed harold 's death , the two countesses slipped out of the palace "
 'and fled to the coast , and took boat in haste to scotland , where they had '
 'great possessions , and where they were m

In [22]:
random_outputs(dm51,2)


Example 177:
("Story: ' by and bye i was born , and was brought up by my grandfather in one "
 'of his great houses . here are the rings you gave to my mother , and here is '
 "the cross , and these will prove if i am your son or not . ' as he spoke the "
 'young man laid the jewels at the feet of the king , and the nobles and the '
 'judges pressed round to examine them . the king alone did not move from his '
 'seat , for he had forgotten the hall of justice and all about him , and saw '
 'only the apple - orchard , as it was twenty years')
Question: why did everyone acknowledge the gentleman as the next king ?
Reference1: the prince was adopted .
Reference2: the king said " and let everyman present swear to acknowledge him as king , after my death " .
Generated: he had forgotten the hall of justice and all about him .

Example 177:
('Story: the sun rose and she still slept on and on , although it was nearly '
 'noon . now , it happened that the king to whom this wood belonged was '

In [26]:
random_outputs(dm31,5)


Example 224:
('Story: but his brother complained of being weary , and at length they '
 'decided to remain there for the night . when andrew awoke he found himself '
 'alone ; and he saw neither brother nor boat , until he came to the highest '
 'point of the island . then he discovered him far out , darting for land like '
 'a sea - gull . andrew did not understand the whole affair . there were still '
 'provisions there , as well as a dish of curd , his gun and various other '
 'things . so andrew wasted but little time in thought . " he ')
Question: how will john nicholas feel about andrew's wealth ?
Reference1: jealous .
Reference2: he will feel jealous .
Generated: he wanted to be .

Example 224:
('Story: when he had been home a few days , his mother wanted him to go up to '
 'the castle and show the king what a man he had grown to be . his father said '
 ': " no , he had better not do that , for we will have to do without him in '
 'the meantime . " but there was no help for it 

In [62]:
80/len(baseline_test)

0.07944389275074479

In [61]:
print(sum(baseline_test[baseline_test['reference_length_a1']<=2]['reference_answer1'].value_counts()[:20]))
baseline_test[baseline_test['reference_length_a1']<=2]['reference_answer1'].value_counts()[:20]

83


Unnamed: 0_level_0,count
reference_answer1,Unnamed: 1_level_1
surprised .,14
sad .,11
angry .,10
happy .,7
excited .,5
confused .,5
delighted .,3
dance .,3
astonished .,3
horrified .,3


+14+11+10+7+5+5+3+3+3+3+2

In [56]:
14+11+10+7+5+5+3+3+3+3+2

66

##Exploration

In [54]:
x = len(baseline_test[baseline_test['reference_length_a1']<=4]['reference_answer1'])/len(baseline_test)*100
print(f"{x:.2f}% of questions have a reference answer1 that is 3 words or less")

baseline_test[baseline_test['reference_length_a1']<=4]['reference_answer1'].value_counts()[:20]

31.88% of questions have a reference answer1 that is 3 words or less


Unnamed: 0_level_0,count
reference_answer1,Unnamed: 1_level_1
surprised .,14
sad .,11
angry .,10
happy .,7
confused .,5
excited .,5
astonished .,3
dance .,3
delighted .,3
horrified .,3


#### Most Common Generated Answers

In [87]:
print(len(test_both_df[test_both_df['ex_or_im']=='implicit']))
test_both_df[test_both_df['ex_or_im']=='implicit']['generated_answer'].value_counts()[:5]

253


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,76
None of the above choices,37
harold,4
matte,4
the princess,3


In [88]:
print(len(test_both_df[test_both_df['ex_or_im']=='explicit']))
test_both_df[test_both_df['ex_or_im']=='explicit']['generated_answer'].value_counts()[:5]

754


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,140
None of the above choices,23
the king,8
matte,3
harold,3


In [89]:
print(len(d1))
d1['generated_answer'].value_counts()[:5]

744


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,159
None of the above choices,32
Fairies,4
the fair maid,3
the king 's son,3


In [90]:
print(len(d2))
d2['generated_answer'].value_counts()[:5]

281


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,83
None of the above choices,11
the king 's daughter,3
tephany,3
the princess,2


In [91]:
print(len(d3))
d3['generated_answer'].value_counts()[:5]

281


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,11
None of the above choices,4
She was a brave and loving little maiden .,3
They were disappointed,2
wounds,2


In [92]:
print(len(d4))
d4['generated_answer'].value_counts()[:5]

744


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,11
None of the above choices,9
the giant,6
the king,5
She was a child,4


In [83]:
print(len(baseline_val))
baseline_val['generated_answer'].value_counts()[:5]

1025


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,27
tephany,16
the king 's daughter,13
the king 's son,11
denis,6


In [34]:
baseline_test[baseline_test['generated_answer'].str.contains("None")]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg,bleurt_max,bleu_max,rouge1_max,rouge2_max,rougeL_max
38,38,why did people pity the widowed mistress of ki...,explicit,i am going to tell you a story about a poor yo...,"she had lost her husband , and no one quite kn...",she had lost her husband .,None of the above choices .,0.1,0.0,0.1,0.1,0.0,0.0,0.0,0.0,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.15988,0.352941,6,17,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,1.0,6,6,-1.526155,-1.346074,-1.436114,0.0,0.05,0.0,0.05,-1.346074,0.0,0.1,0.0,0.1
44,44,why did snorro want to find the hidden carbunc...,explicit,but all the time they never guessed the real r...,it would bestow on its finder marvellous magic...,"when it was found , would bestow on its finder...",None of the above choices .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",0.15988,0.352941,6,17,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",0.082085,0.285714,6,21,-1.543396,-1.523764,-1.53358,0.0,0.0,0.0,0.0,-1.523764,0.0,0.0,0.0,0.0
46,46,what will the happy hunter do because he is ho...,implicit,between the sea realm and the earth there was ...,try to go home .,runaway back home .,None of the above choices .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,1.2,6,5,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,1.5,6,4,-0.953393,-1.54771,-1.250551,0.0,0.0,0.0,0.0,-0.953393,0.0,0.0,0.0,0.0
169,169,how did the poor neighbor feel about the agree...,explicit,now the rich neighbor got together as many mow...,despaired .,despaired .,None of the above choices .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,3.0,6,2,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,3.0,6,2,-1.055519,-1.055519,-1.055519,0.0,0.0,0.0,0.0,-1.055519,0.0,0.0,0.0,0.0
244,244,what was special about the palace of rin jin ?,explicit,the palace of rin jin was at the bottom of the...,was so beautiful that no one has ever seen any...,"the walls were of coral , the rood of jadeston...",None of the above choices .,0.0,0.0,0.0,0.0,0.148148,0.08,0.148148,0.148148,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",0.188876,0.375,6,16,0.0,"[0.5, 0.2, 0.0, 0.0]",0.030197,0.222222,6,27,-1.39829,-1.771914,-1.585102,0.0,0.074074,0.04,0.074074,-1.39829,0.0,0.148148,0.08,0.148148
392,392,what did others say about her husband ?,explicit,i am going to tell you a story about a poor yo...,had been taken away to serve as a sailor by th...,taken away to serve as a sailor .,None of the above choices .,0.083333,0.0,0.083333,0.083333,0.0,0.0,0.0,0.0,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.069483,0.272727,6,22,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",0.716531,0.75,6,8,-1.694271,-1.455223,-1.574747,0.0,0.041667,0.0,0.041667,-1.455223,0.0,0.083333,0.0,0.083333
525,525,why did the poor neighbor not even take the tr...,explicit,"then the poor man 's heart grew less heavy , a...",he saw how the others took hold and that he hi...,knew the large man would help .,None of the above choices .,0.083333,0.0,0.083333,0.083333,0.181818,0.0,0.181818,0.181818,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.096972,0.3,6,20,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.846482,0.857143,6,7,-1.107534,-1.237089,-1.172312,0.0,0.132576,0.0,0.132576,-1.107534,0.0,0.181818,0.0,0.181818
612,612,why didn't the widowed mistress of kittlerumpi...,explicit,"at any rate , his poor young wife was sorely t...",times were bad .,times were bad .,None of the above choices .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,1.5,6,4,0.0,"[0.16666666666666666, 0.0, 0.0, 0.0]",1.0,1.5,6,4,-0.853959,-0.853959,-0.853959,0.0,0.0,0.0,0.0,-0.853959,0.0,0.0,0.0,0.0
729,729,what happened after the woman upset the tar-ba...,explicit,""" what is your name ? "" asked the girl from un...",the girl screamed and ran away crying .,"the latter screamed and ran away , crying .",None of the above choices .,0.166667,0.0,0.166667,0.166667,0.166667,0.0,0.166667,0.166667,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.716531,0.75,6,8,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.606531,0.666667,6,9,-1.865901,-1.591534,-1.728718,0.0,0.166667,0.0,0.166667,-1.591534,0.0,0.166667,0.0,0.166667
750,750,what will happen if the many-furred creature d...,explicit,she had run to her little room and had quickly...,the many - furred creature will get nothing to...,you will get nothing to eat in the future ! .,None of the above choices .,0.125,0.0,0.125,0.125,0.142857,0.0,0.142857,0.142857,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.311403,0.461538,6,13,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.434598,0.545455,6,11,-1.442789,-1.425044,-1.433917,0.0,0.133929,0.0,0.133929,-1.425044,0.0,0.142857,0.0,0.142857


In [29]:
print(len(baseline_test))
baseline_test['generated_answer'].value_counts()

1007


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,20
the king,17
snorro,10
angered,6
ahti,6
...,...
"the two brothers started out to try the other 's occupation , little dreaming of all that would happen .",1
he did not know how he was there,1
his mother,1
he found he had forgotten his handkerchief and went down to the boat to get it,1


In [84]:
print(len(dm21))
dm21['generated_answer'].value_counts()[:5]

253


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
None of the above choices .,73
None of the above choices,31
matte,4
harold,4
exhausted,2


In [85]:
print(len(dm31))
dm31['generated_answer'].value_counts()[:5]

253


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
he wanted to be .,239
he wanted to .,14


In [86]:
print(len(dm51))
dm51['generated_answer'].value_counts()[:5]

253


Unnamed: 0_level_0,count
generated_answer,Unnamed: 1_level_1
happy .,37
angry .,3
sad .,3
astonishment .,2
he had a fire inside it .,2


In [31]:
prop_exact_match(baseline_test[baseline_test['ex_or_im'] == 'implicit'])

3.16

In [32]:
prop_exact_match(dm31)

0.0

In [33]:
prop_exact_match(dm51)

5.14

In [9]:
baseline_test[baseline_test['ex_or_im'] == 'implicit'].value_counts(['reference_length_a1'])

Unnamed: 0_level_0,count
reference_length_a1,Unnamed: 1_level_1
2,70
7,33
5,28
6,24
8,24
4,20
10,15
11,10
9,10
12,6


In [10]:
# what explicit questions did okay?
baseline_test[baseline_test['ex_or_im'] == 'explicit'].sort_values('bleu_avg', ascending=False).head(10)

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg
601,601,why did the goodman go home ?,explicit,""" what 's that ? "" said he , "" for the bowl of...",he was feeling cold without his coat .,he was feeling cold without his coat .,he was feeling cold without his coat .,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,8,8,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,8,8,1.087961,1.087961,1.087961,1.0,1.0,1.0,1.0
606,606,why did boat row away at top speed ?,explicit,""" god be praised for good company ! that was j...",they thought the meer - trolls were making sig...,they thought the meer - trolls were making sig...,they thought the meer - trolls were making sig...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,12,12,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,12,12,1.046282,1.046282,1.046282,1.0,1.0,1.0,1.0
67,67,why was rin jin not happy ?,explicit,the palace of rin jin was at the bottom of the...,he reigned alone .,he reigned alone .,he reigned alone .,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,4,4,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,4,4,1.067542,1.067542,1.067542,1.0,1.0,1.0,1.0
390,390,why did the young couple live happily together...,explicit,when the flower queen heard that her daughter ...,the flower queen 's daughter departed and went...,the flower queen 's daughter departed and went...,the flower queen 's daughter departed and went...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,13,13,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,13,13,0.993606,0.993606,0.993606,1.0,1.0,1.0,1.0
135,135,why was death an everyday matter to the swords...,explicit,at the time when the tang dynasty reigned over...,they hired themselves out to those who wished ...,they hired themselves out to those who wished ...,they hired themselves out to those who wished ...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.931063,"[1.0, 1.0, 1.0, 1.0]",0.931063,0.933333,14,15,0.931063,"[1.0, 1.0, 1.0, 1.0]",0.931063,0.933333,14,15,0.955911,0.955911,0.955911,0.931063,1.0,1.0,1.0
324,324,why were the messengers sent far and wide ?,explicit,there was once upon a time a king who had a wi...,to seek for a bride equal to the late queen in...,to seek for a bride equal to the late queen in...,to seek for a bride equal to the late queen in...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.949171,0.949171,0.949171,0.920044,1.0,1.0,1.0
115,115,what did the old man spend his time doing ?,explicit,"there was once an old man and his wife , who l...","looking after the cows , and the hens , and th...","looking after the cows , and the hens , and th...","looking after the cows , and the hens , and th...",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.920044,"[1.0, 1.0, 1.0, 1.0]",0.920044,0.923077,12,13,0.970399,0.970399,0.970399,0.920044,1.0,1.0,1.0
573,573,why did the happy hunter save his brother ?,explicit,the happy hunter had a kind heart and could no...,he had a kind heart and could not bear the sig...,the happy hunter had a kind heart and could no...,he had a kind heart and could not bear the sig...,1.0,1.0,1.0,1.0,0.882353,0.875,0.882353,0.882353,1.0,"[1.0, 1.0, 1.0, 1.0]",1.0,1.0,17,17,0.831343,"[0.9411764705882353, 0.9375, 0.933333333333333...",0.88901,0.894737,17,19,1.012825,0.177293,0.595059,0.915671,0.941176,0.9375,0.941176
96,96,what was the jelly fish able to do ?,explicit,"the chief steward thought for some time , and ...",walk on land with his four legs like a tortoise .,walk on land with his four legs like a tortoise .,walk on land with his four legs like a tortoise,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,1.064988,1.064988,1.064988,0.904837,1.0,1.0,1.0
619,619,why did the old woman beg the priest and the g...,explicit,"left to himself , the full horror of his comin...",she told the young man what he was to do .,she told the young man what he was to do .,she told the young man what he was to do,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,0.904837,"[1.0, 1.0, 1.0, 1.0]",0.904837,0.909091,10,11,1.01271,1.01271,1.01271,0.904837,1.0,1.0,1.0


In [None]:
# what implicit questions did okay?
baseline_test[baseline_test['ex_or_im'] == 'implicit'].sort_values('bleu_avg', ascending=False).head(10)

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg
88,88,why would the skillful fisher not accept the f...,implicit,the happy hunter felt that he was to blame for...,he wanted his original hook back .,they are of no use to him .,they are of no use to him,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,"[0.0, 0.0, 0.0, 0.0]",1.0,1.0,7,7,0.866878,"[1.0, 1.0, 1.0, 1.0]",0.866878,0.875,7,8,-1.018798,1.062443,0.021822,0.433439,0.5,0.5,0.5
294,294,why did the farmer forget about the business h...,implicit,"when the bonze had begun to make his magic , t...",he was interested in the bonze 's magic .,the farmer had mingled with the crowd .,the farmer had mingled with the crowd,0.133333,0.0,0.133333,0.133333,1.0,1.0,1.0,1.0,0.0,"[0.14285714285714285, 0.0, 0.0, 0.0]",0.751477,0.777778,7,9,0.866878,"[1.0, 1.0, 1.0, 1.0]",0.866878,0.875,7,8,-1.36697,1.014066,-0.176452,0.433439,0.566667,0.5,0.566667
237,237,why did tsui grow sad again ?,implicit,"said molo : "" when she stretched out three fin...",he thought that it would not be possible to go...,the prince 's palace is shut off as thought by...,the prince 's palace is shut off as though by ...,0.258065,0.206897,0.258065,0.258065,0.916667,0.818182,0.916667,0.916667,0.100145,"[0.3333333333333333, 0.2727272727272727, 0.2, ...",0.472367,0.571429,12,21,0.67613,"[0.9166666666666666, 0.8181818181818182, 0.7, ...",0.920044,0.923077,12,13,-1.183965,0.375965,-0.404,0.388138,0.587366,0.512539,0.587366
639,639,what happened after the deer shook her head ov...,implicit,"' it is the man who is lying under the cask , ...",he was alive again .,ian jumped up as well as ever .,he jumped up as well as ever,0.181818,0.0,0.181818,0.181818,0.857143,0.833333,0.857143,0.857143,0.0,"[0.14285714285714285, 0.0, 0.0, 0.0]",1.0,1.4,7,5,0.701397,"[0.8571428571428571, 0.8333333333333334, 0.8, ...",0.866878,0.875,7,8,-0.874265,0.423447,-0.225409,0.350698,0.519481,0.416667,0.519481
340,340,why did harold rush out of the room ?,implicit,""" for whom hast thou purchased that ? "" he ask...",he wanted to show lady morna .,to show lady morna how fine he was .,to show the lady morna how fine he was,0.666667,0.307692,0.533333,0.533333,0.941176,0.8,0.941176,0.941176,0.0,"[0.5555555555555556, 0.25, 0.0, 0.0]",1.0,1.285714,9,7,0.660633,"[0.8888888888888888, 0.75, 0.5714285714285714,...",1.0,1.0,9,9,0.028125,0.927688,0.477906,0.330316,0.803922,0.553846,0.737255
295,295,what happened after the widowed mistress revea...,implicit,now the young mistress of kittlerumpit knew th...,she jumped into the air .,"the old woman ran down the brae , shrieking wi...","she ran down the brae , shrieking with rage an...",0.133333,0.0,0.133333,0.133333,0.594595,0.457143,0.486486,0.486486,0.0,"[0.06896551724137931, 0.0, 0.0, 0.0]",1.0,4.833333,29,6,0.320944,"[0.41379310344827586, 0.32142857142857145, 0.2...",1.0,2.071429,29,14,-1.145075,-0.019223,-0.582149,0.160472,0.363964,0.228571,0.30991
146,146,why did the jelly fish feel discouraged ?,implicit,the jelly fish wondered at this speech and the...,the monkey tricked him .,the monkey replied laughlingly that he could n...,the jellyfish did n't want to lose his liver .,0.142857,0.0,0.142857,0.142857,0.583333,0.363636,0.583333,0.583333,0.0,"[0.2, 0.0, 0.0, 0.0]",1.0,2.0,10,5,0.286419,"[0.7, 0.4444444444444444, 0.375, 0.28571428571...",0.67032,0.714286,10,14,-1.267366,-0.840598,-1.053982,0.14321,0.363095,0.181818,0.363095
434,434,what did the gentleman and the old woman do af...,implicit,it took some time to discover the whereabouts ...,they got married .,go to the chapel .,The gentleman and the old woman went to the ch...,0.0,0.0,0.0,0.0,0.428571,0.333333,0.428571,0.428571,0.0,"[0.09090909090909091, 0.0, 0.0, 0.0]",1.0,2.75,11,4,0.234624,"[0.36363636363636365, 0.3, 0.2222222222222222,...",1.0,2.2,11,5,-1.412994,-0.182248,-0.797621,0.117312,0.214286,0.166667,0.214286
437,437,why did the poor widow give a piteous cry ?,implicit,a joyful woman was the mistress of kittlerumpi...,the old woman was a wicked fairy .,the green - clad lady was a fairy .,she knew now what she had not guessed before -...,0.263158,0.166667,0.263158,0.263158,0.368421,0.333333,0.368421,0.368421,0.0,"[0.16666666666666666, 0.08571428571428572, 0.0...",1.0,4.5,36,8,0.191216,"[0.25, 0.2, 0.17647058823529413, 0.15151515151...",1.0,4.0,36,9,-0.733674,-0.33285,-0.533262,0.095608,0.315789,0.25,0.315789
916,916,why did the old duke hide his daughter from th...,implicit,"' one day , shortly after the death of the que...",he waited for the day that the king would anno...,the duke vowed that he would hide her safely f...,he vowed that he would hide her from you,0.272727,0.0,0.272727,0.272727,0.5,0.333333,0.4375,0.4375,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",0.573753,0.642857,9,14,0.106681,"[0.8888888888888888, 0.625, 0.5714285714285714...",0.169013,0.36,9,25,-0.796656,-0.856786,-0.826721,0.05334,0.386364,0.166667,0.355114


In [15]:
x=len(test_both_df['question'].value_counts())
y=len(test_both_df)
print(f'{x} unique questions out of {y} total questions')

1005 unique questions out of 1007 total questions


In [16]:
test_both_df['question'].value_counts()

Unnamed: 0_level_0,count
question,Unnamed: 1_level_1
how will the little grey man help dullhead ?,2
where did the fisherman live ?,2
why were matte and maie alone for weeks at a time ?,1
what happened after all was ready ?,1
why did the lady cry more bitterly than before ?,1
...,...
what did the woman do after greeting the girl ?,1
why did the man sit inside a large hole in the hollow trunk of a tree ?,1
why did rose-red believe that tsui was intelligent ?,1
what were the tailors and his apprentices doing before the bannock came in ?,1


In [27]:
test_both_df[(test_both_df['question'] == 'how will the little grey man help dullhead ?')| (test_both_df['question'] == 'where did the fisherman live ?')]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2,bleurt_avg,bleu_avg,rouge1_avg,rouge2_avg,rougeL_avg
122,122,how will the little grey man help dullhead ?,explicit,after a time they all came to a town where a k...,drink up a whole cellarful of wine .,helping him drink a whole cellarful of wine .,The little grey man will help dullhead .,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.160366,"[0.2222222222222222, 0.125, 0.1428571428571428...",1.0,1.0,8,8,0.141522,"[0.2222222222222222, 0.125, 0.1428571428571428...",0.882497,0.888889,8,9,-1.569883,-1.512433,-1.541158,0.150944,0.0,0.0,0.0
129,129,where did the fisherman live ?,explicit,there was once a fisherman who was called salm...,by the shore of the big sea .,by the shore of the big sea .,The fisherman lived by the shore of the big sea,0.823529,0.8,0.823529,0.823529,0.823529,0.8,0.823529,0.823529,0.67865,"[0.7272727272727273, 0.7, 0.6666666666666666, ...",1.0,1.25,10,8,0.67865,"[0.7272727272727273, 0.7, 0.6666666666666666, ...",1.0,1.25,10,8,-0.3739,-0.3739,-0.3739,0.67865,0.823529,0.8,0.823529
332,332,how will the little grey man help dullhead ?,explicit,now the king had given orders to have all the ...,give him the ship .,by eating the bread .,The king will help dullhead,0.222222,0.0,0.222222,0.222222,0.222222,0.0,0.222222,0.222222,0.229575,"[0.16666666666666666, 0.2, 0.25, 0.33333333333...",1.0,1.0,5,5,0.229575,"[0.16666666666666666, 0.2, 0.25, 0.33333333333...",1.0,1.0,5,5,-1.168645,-1.472529,-1.320587,0.229575,0.222222,0.0,0.222222
607,607,where did the fisherman live ?,explicit,"once upon a time there was a fisherman , who l...",near the king 's castle .,near the king 's castle .,near the king 's castle,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.818731,"[1.0, 1.0, 1.0, 1.0]",0.818731,0.833333,5,6,0.818731,"[1.0, 1.0, 1.0, 1.0]",0.818731,0.833333,5,6,0.915884,0.915884,0.915884,0.818731,1.0,1.0,1.0
