# Baseline
# Out of the Box small Flan T5 model
Results for Test and Validation

In [1]:
!pip install -q -U transformers
!pip install -q -U datasets
!pip install -q -U evaluate
!pip install -q -U tokenizers
!pip install -q -U bitsandbytes
!pip install -q rouge_score
!pip install -q -U bert_score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.2/11.2 MB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m70.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import re
import random
import numpy as np
from scipy.special import softmax
import pprint

import bitsandbytes as bnb

import torch
import transformers
import evaluate
from datasets import Dataset, load_dataset, DatasetDict

# For from-scratch T5 model
from transformers import T5TokenizerFast, T5Config, T5ForConditionalGeneration

# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration  # this won't import twice, just noting here what's for each model

# For all T5 models
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM


# For style classifier model (also for evaluating the seq2seq model output)
from transformers import BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig,pipeline
from transformers import TrainingArguments, Trainer

import pandas as pd
from tqdm import tqdm

In [3]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path to data save in Drive
train = 'FairytaleQA_train.csv'
valid = 'FairytaleQA_valid.csv'
test = 'FairytaleQA_test.csv'
# path = 'drive/MyDrive/266_Danielle_Dylan_final_project/data/'
path = 'drive/MyDrive/DataSci_266/266_Danielle_Dylan_final_project/data/'

In [5]:
train = path+train
valid = path+valid
test = path+test

train = pd.read_csv(train)
valid = pd.read_csv(valid)
test = pd.read_csv(test)

In [6]:
train_ds = Dataset.from_pandas(train, split="train")
test_ds = Dataset.from_pandas(test, split="test")
valid_ds = Dataset.from_pandas(valid, split="test")

# Combine into a single DatasetDict
ds = DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "validation": valid_ds,
})

train = ds['train'].shuffle()
val = ds['validation'].shuffle()
test = ds['test'].shuffle()

In [7]:
test

Dataset({
    features: ['story_name', 'story_section', 'question', 'answer1', 'answer2', 'local_or_sum', 'attribute', 'ex_or_im', 'ex_or_im2'],
    num_rows: 1007
})

In [8]:
"""
Initialize the pipeline with bitsandbytes quantization
"""
# Configure bitsandbytes for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Initialize pipeline
model_id = "google/flan-t5-small"

pipe = pipeline(
   "text2text-generation",
   model=model_id,
   model_kwargs={"torch_dtype": torch.bfloat16, "quantization_config": quantization_config},
   device_map="auto",
   trust_remote_code=True,
   truncation=True
)



config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


In [9]:
# load rouge

rouge = evaluate.load('rouge')

# Load the BLEU metric and the trained BLEURT model for semantic similarity scoring

bleu = evaluate.load("bleu")

bleurt_tokenizer = AutoTokenizer.from_pretrained("Elron/bleurt-base-512")
bleurt_model = AutoModelForSequenceClassification.from_pretrained("Elron/bleurt-base-512")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [13]:
from evaluate import load
bertscore = load("bertscore")

Downloading builder script: 0.00B [00:00, ?B/s]

In [14]:
%%time

# Store results for aggregate scoring

def process_dataset(dataset):
  results = []
  bleurt_scores = []
  for idx, sample in enumerate(tqdm(dataset)):

      # if idx >= 10:  # Stop after processing 10 samples
      #   return pd.DataFrame(results)
      #   break

      prefix = "Please answer this question: "
      context = " Context: "

      question = context + sample['story_section'] + prefix + sample['question']

      # Generate summary via the pipeline
      outputs = pipe(
                          question,
                          max_new_tokens=500,
                          truncation=True
      )

      answer = outputs[0]["generated_text"]

      if answer.isspace():
        print("\nANSWER IS SPACE for question:",sample['question'],"\nReference:", sample['answer1'])
        print("\nanswer:",[answer])
        answer = 'blank'

      # print("\nbegin answer:", answer, ":end answer")
      # print("\n",[answer],"****")

      # Evaluation
      predictions = [answer]
      reference1 = [sample['answer1']]
      reference2 = [sample['answer2']]

      # Calculate ROUGE scores
      rouge_scores1 = rouge.compute(predictions=predictions, references=reference1)
      if reference1 != reference2:
        rouge_scores2 = rouge.compute(predictions=predictions, references=reference2)
      else:
        rouge_scores2 = rouge_scores1
      rouge_scores1 = {k+'_a1': v for k, v in rouge_scores1.items()}
      rouge_scores2 = {k+'_a2': v for k, v in rouge_scores2.items()}

      # Calculate BERTScore
      bertscore_result1 = bertscore.compute(predictions=predictions,
                                            references=reference1,
                                            lang="en")
      if reference1 != reference2:
        bertscore_result2 = bertscore.compute(predictions=predictions,
                                              references=reference2,
                                              lang="en")
      else:
        bertscore_result2 = bertscore_result1

      # Each BERTScore result is a list; extract first (only) value
      bertscore_f1_a1 = bertscore_result1["f1"][0]
      bertscore_f1_a2 = bertscore_result2["f1"][0]



      # Calculate BLEU scores
      bleu_scores1 = bleu.compute(predictions=predictions, references=reference1,smooth=True)
      if reference1 != reference2:
        bleu_scores2 = bleu.compute(predictions=predictions, references=reference2,smooth=True)
      else:
        bleu_scores2 = bleu_scores1
      bleu_scores1 = {k+'_a1': v for k, v in bleu_scores1.items()}
      bleu_scores2 = {k+'_a2': v for k, v in bleu_scores2.items()}

      # Calculate BLEURT scores
      with torch.no_grad():
        bleurt_score_a1 = bleurt_model(**bleurt_tokenizer(answer,
                                                sample['answer1'],
                                                truncation=True,
                                                max_length=512,
                                                padding='max_length',
                                                return_tensors='pt'))[0].squeeze().numpy()
        if reference1 != reference2:
          bleurt_score_a2 = bleurt_model(**bleurt_tokenizer(answer,
                                                sample['answer2'],
                                                truncation=True,
                                                max_length=512,
                                                padding='max_length',
                                                return_tensors='pt'))[0].squeeze().numpy()
        else:
          bleurt_score_a2 = bleurt_score_a1


      # Store results
      results.append({
          'id': idx,
          'question': sample['question'],
          'ex_or_im': sample['ex_or_im'],
          'story_section': sample['story_section'][:500],  # Store truncated text for readability
          'reference_answer1': sample['answer1'],
          'reference_answer2': sample['answer2'],
          'generated_answer': answer,
          **rouge_scores1,
          **rouge_scores2,
          **bleu_scores1,
          **bleu_scores2,
          'bertscore_f1_a1': bertscore_f1_a1,
          'bertscore_f1_a2': bertscore_f1_a2,
          'bleurt_score_a1': bleurt_score_a1,
          'bleurt_score_a2': bleurt_score_a2,
      })

      # Print progress update every 10 samples
      if (idx + 1) % 100 == 0:
          print(f"\nProcessed {idx + 1} samples; Results length: {len(results)}")
          print(f"Latest Answer1 ROUGE-1: {rouge_scores1['rouge1_a1']:.4f}")

  return pd.DataFrame(results)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.58 µs


In [15]:
%%time
test_results_df = process_dataset(test)

  0%|          | 0/1007 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  1%|          | 9/1007 [01:05<1:00:13,  3.62s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 10%|▉         | 100/1007 [05:00<33:35,  2.22s/it]


Processed 100 samples; Results length: 100
Latest Answer1 ROUGE-1: 0.6667


 20%|█▉        | 200/1007 [09:40<30:54,  2.30s/it]


Processed 200 samples; Results length: 200
Latest Answer1 ROUGE-1: 0.0000


 30%|██▉       | 300/1007 [14:23<31:12,  2.65s/it]


Processed 300 samples; Results length: 300
Latest Answer1 ROUGE-1: 0.1538


 40%|███▉      | 400/1007 [19:06<33:23,  3.30s/it]


Processed 400 samples; Results length: 400
Latest Answer1 ROUGE-1: 0.6429


 50%|████▉     | 500/1007 [23:46<21:17,  2.52s/it]


Processed 500 samples; Results length: 500
Latest Answer1 ROUGE-1: 0.9231


 60%|█████▉    | 600/1007 [28:32<18:29,  2.73s/it]


Processed 600 samples; Results length: 600
Latest Answer1 ROUGE-1: 0.0000


 70%|██████▉   | 700/1007 [33:17<14:03,  2.75s/it]


Processed 700 samples; Results length: 700
Latest Answer1 ROUGE-1: 0.0000


 79%|███████▉  | 800/1007 [38:01<09:06,  2.64s/it]


Processed 800 samples; Results length: 800
Latest Answer1 ROUGE-1: 0.2000


 89%|████████▉ | 900/1007 [42:24<05:38,  3.16s/it]


Processed 900 samples; Results length: 900
Latest Answer1 ROUGE-1: 0.0000


 99%|█████████▉| 1000/1007 [46:48<00:19,  2.79s/it]


Processed 1000 samples; Results length: 1000
Latest Answer1 ROUGE-1: 0.2857


100%|██████████| 1007/1007 [47:07<00:00,  2.81s/it]

CPU times: user 45min 52s, sys: 20.9 s, total: 46min 13s
Wall time: 47min 7s





In [16]:
print("length of results", len(test_results_df))
test_results_df[:2]

length of results 1007


Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,...,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bertscore_f1_a1,bertscore_f1_a2,bleurt_score_a1,bleurt_score_a2
0,0,who did the cook say cooked the soup ?,explicit,"when the dance was ended , the king had his so...",the many - furred creature .,the many - furred creature did .,the king,0.333333,0.0,0.333333,...,0.062371,"[0.6666666666666666, 0.5, 1.0, 1.0]",0.082085,0.285714,2,7,0.828311,0.828822,-1.3537391,-1.3911833
1,1,where did the young man chase the deer to ?,explicit,"on they went , on and on and one , till they r...",a cottage .,a cottage .,the house,0.0,0.0,0.0,...,0.387539,"[0.3333333333333333, 0.5, 1.0, 1.0]",0.606531,0.666667,2,3,0.905426,0.905426,-0.15933652,-0.15933652


In [20]:
# save results
results_path = 'drive/MyDrive/DataSci_266/266_Danielle_Dylan_final_project/answer_outputs/'

test_results_df.to_csv(results_path+'/t5_baseline_BertScore.csv', index=False)

In [None]:
vx= valid[645:650]
vy = Dataset.from_pandas(vx, split="test")

In [None]:
vz = process_dataset(vy)

  0%|          | 0/10 [00:00<?, ?it/s]


ANSWER IS SPACE for question: who was a mighty hero and a good archer ? 
Reference: hou i .

answer: ['                                                                                                                                                                                                                                                         ']


100%|██████████| 10/10 [00:42<00:00,  4.23s/it]


In [None]:
v[0]

{'story_name': 'the-lady-of-the-moon',
 'story_section': 'in the days of the emperor yau lived a prince by the name of hou i , who was a mighty hero and a good archer . once ten suns rose together in the sky , and shone so brightly and burned so fiercely that the people on earth could not endure them . so the emperor ordered hou i to shoot at them . and hou i shot nine of them down from the sky . besides his bow , hou i also had a horse which ran so swiftly that even the wind could not catch up with it .',
 'question': 'who was a mighty hero and a good archer ?',
 'answer1': 'hou i .',
 'answer2': 'hou i .',
 'local_or_sum': 'local',
 'attribute': 'character',
 'ex_or_im': 'explicit',
 'ex_or_im2': 'explicit'}

In [None]:
val[807]

{'story_name': 'the-lady-of-the-moon',
 'story_section': 'in the days of the emperor yau lived a prince by the name of hou i , who was a mighty hero and a good archer . once ten suns rose together in the sky , and shone so brightly and burned so fiercely that the people on earth could not endure them . so the emperor ordered hou i to shoot at them . and hou i shot nine of them down from the sky . besides his bow , hou i also had a horse which ran so swiftly that even the wind could not catch up with it .',
 'question': 'who was a mighty hero and a good archer ?',
 'answer1': 'hou i .',
 'answer2': 'hou i .',
 'local_or_sum': 'local',
 'attribute': 'character',
 'ex_or_im': 'explicit',
 'ex_or_im2': 'explicit'}

In [None]:
valid[valid['answer1']=='hou i .']

Unnamed: 0,story_name,story_section,question,answer1,answer2,local_or_sum,attribute,ex_or_im,ex_or_im2
645,the-lady-of-the-moon,in the days of the emperor yau lived a prince ...,who was a mighty hero and a good archer ?,hou i .,hou i .,local,character,explicit,explicit


In [None]:
%%time
val_results_df = process_dataset(val)

# save results
results_path = 'drive/MyDrive/266/FinalProject/results/'  #DANIELLE
val_results_df.to_csv(results_path+f'T5baseline_val.csv', index=False)

 10%|▉         | 100/1025 [05:33<51:54,  3.37s/it]


Processed 100 samples; Results length: 100
Latest Answer1 ROUGE-1: 0.1250


 20%|█▉        | 200/1025 [11:11<48:17,  3.51s/it]


Processed 200 samples; Results length: 200
Latest Answer1 ROUGE-1: 0.0000


 29%|██▉       | 300/1025 [17:07<43:08,  3.57s/it]


Processed 300 samples; Results length: 300
Latest Answer1 ROUGE-1: 0.1818


 39%|███▉      | 400/1025 [22:27<31:00,  2.98s/it]


Processed 400 samples; Results length: 400
Latest Answer1 ROUGE-1: 0.0000


 49%|████▉     | 500/1025 [27:49<25:29,  2.91s/it]


Processed 500 samples; Results length: 500
Latest Answer1 ROUGE-1: 0.3333


 59%|█████▊    | 600/1025 [33:09<19:31,  2.76s/it]


Processed 600 samples; Results length: 600
Latest Answer1 ROUGE-1: 0.6667


 63%|██████▎   | 648/1025 [35:54<20:50,  3.32s/it]

ANSWER IS SPACE for question: who was a mighty hero and a good archer ?


 68%|██████▊   | 700/1025 [38:57<15:00,  2.77s/it]


Processed 700 samples; Results length: 700
Latest Answer1 ROUGE-1: 0.0000


 78%|███████▊  | 800/1025 [44:39<09:08,  2.44s/it]


Processed 800 samples; Results length: 800
Latest Answer1 ROUGE-1: 0.8000


 88%|████████▊ | 900/1025 [50:37<06:50,  3.28s/it]


Processed 900 samples; Results length: 900
Latest Answer1 ROUGE-1: 1.0000


 98%|█████████▊| 1000/1025 [56:38<01:32,  3.71s/it]


Processed 1000 samples; Results length: 1000
Latest Answer1 ROUGE-1: 0.0000


100%|██████████| 1025/1025 [58:13<00:00,  3.41s/it]


CPU times: user 57min 49s, sys: 14.2 s, total: 58min 3s
Wall time: 58min 14s


# LOOK AT RESULTS

In [None]:
results_path = '/content/drive/MyDrive/266/FinalProject/results'
df_v = pd.read_csv(f'{results_path}/T5baseline_val.csv')
print(f"{len(df_v)} records in validation results")
df_v[:2]

1025 records in validation results


Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,...,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2
0,0,why was madge unable to resist the boy's reque...,explicit,his tone was so pleading that mistress madge w...,his tone was so pleading .,his tone was so pleading that mistress madge w...,his tone was so pleading that the mistress mad...,0.526316,0.470588,0.526316,...,15,6,0.815355,"[0.9375, 0.8666666666666667, 0.785714285714285...",1.0,1.071429,15,14,-0.185938,0.992595
1,1,what did the sparrows do after the old man cho...,explicit,the old man could not refuse this kind proposa...,helped him put it on his back .,the sparrows all helped him put it on his back .,the sparrows helped him put it on his back and...,0.56,0.521739,0.56,...,19,8,0.388172,"[0.55, 0.42105263157894735, 0.3333333333333333...",1.0,1.727273,19,11,-0.518739,0.149364


In [None]:
pd.set_option('display.max_columns', None)

results_path = '/content/drive/MyDrive/266/FinalProject/results'
df_r = pd.read_csv(f'{results_path}/T5baseline_test.csv')

In [None]:
print(len(df_r))

1007


In [None]:
df_r[:5]

Unnamed: 0,id,question,ex_or_im,story_section,reference_answer1,reference_answer2,generated_answer,rouge1_a1,rouge2_a1,rougeL_a1,rougeLsum_a1,rouge1_a2,rouge2_a2,rougeL_a2,rougeLsum_a2,bleu_a1,precisions_a1,brevity_penalty_a1,length_ratio_a1,translation_length_a1,reference_length_a1,bleu_a2,precisions_a2,brevity_penalty_a2,length_ratio_a2,translation_length_a2,reference_length_a2,bleurt_score_a1,bleurt_score_a2
0,0,how did the brother feel because they had been...,explicit,"but his brother complained of being weary , an...",weary .,weary .,they were weary,0.5,0.0,0.5,0.5,0.5,0.0,0.5,0.5,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",1.0,1.5,3,2,0.0,"[0.3333333333333333, 0.0, 0.0, 0.0]",1.0,1.5,3,2,-0.087235,-0.087235
1,1,how did ian feel after he reached the place wh...,implicit,' bear me to the kingdom of the king under the...,nervous .,excited .,glad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[0.0, 0.0, 0.0, 0.0]",0.367879,0.5,1,2,0.0,"[0.0, 0.0, 0.0, 0.0]",0.367879,0.5,1,2,-0.983741,0.13179
2,2,what did the mice do when the they saw the cat ?,explicit,""" eating is hard work in this country , "" said...",hurried into their holes .,the mice had to hurry into their holes .,They had to hurry into their holes .,0.545455,0.444444,0.545455,0.545455,0.8,0.769231,0.8,0.8,0.345721,"[0.5, 0.42857142857142855, 0.3333333333333333,...",1.0,1.6,8,5,0.742088,"[0.875, 0.8571428571428571, 0.8333333333333334...",0.882497,0.888889,8,9,-0.031122,-0.112637
3,3,why did the prince not tell his mother the tru...,explicit,"the queen spoke several times to her son , to ...",she was of the race of the ogres .,she was the race of the ogres .,she had all the difficulty in the world to avo...,0.285714,0.0,0.285714,0.285714,0.3,0.0,0.3,0.3,0.0,"[0.23076923076923078, 0.0, 0.0, 0.0]",1.0,1.444444,13,9,0.0,"[0.23076923076923078, 0.0, 0.0, 0.0]",1.0,1.625,13,8,-1.316074,-1.43915
4,4,why were the women frightened ?,explicit,as the two ladies leaned over the side of the ...,never before had they seen the face of mortal ...,never before have they seen the face of a mort...,they saw the face of mortal man .,0.705882,0.533333,0.705882,0.705882,0.666667,0.375,0.666667,0.666667,0.485987,"[0.875, 0.7142857142857143, 0.6666666666666666...",0.687289,0.727273,8,11,0.0,"[0.875, 0.5714285714285714, 0.3333333333333333...",0.606531,0.666667,8,12,0.335033,0.225543


In [None]:
len(df_r[df_r['reference_answer1']!=df_r['reference_answer2']])

722

In [None]:
len(df_r)

5

# Analysis of Results

In [None]:
# Calculate and print average ROUGE scores
avg_scores1 = df_r[['rouge1_a1', 'rouge2_a1', 'rougeL_a1']].mean()
print("\nAverage A1 ROUGE Scores:")
for metric, score in avg_scores1.items():
   print(f"{metric}: {score:.4f}")

avg_scores2 = df_r[['rouge1_a2', 'rouge2_a2', 'rougeL_a2']].mean()
print("\nAverage A2 ROUGE Scores:")
for metric, score in avg_scores2.items():
   print(f"{metric}: {score:.4f}")



# Calculate average by question type
avg_by_type1 = df_r.groupby(['ex_or_im'])[['rouge1_a1', 'rouge2_a1', 'rougeL_a1']].mean()
print("\nAverage A1 ROUGE Scores by Question type:")
print(avg_by_type1)

avg_by_type2 = df_r.groupby(['ex_or_im'])[['rouge1_a2', 'rouge2_a2', 'rougeL_a2']].mean()
print("\nAverage A2 ROUGE Scores by Question type:")
print(avg_by_type2)

# for i in t.itertuples():
#   print(f"\nAverage {i.Index} ROUGE scores:")
#   for metric, score in zip(metrics, i[1:]):
#     print(f"{metric}: {score:.4f}")

# Print some example summaries
print("\nExample Summaries:")
for i in range(min(10, len(df_r))):
   print(f"\nExample {i}:")
   print(f"Reference1: {df_r.iloc[i]['reference_answer1']}")
   print(f"Reference2: {df_r.iloc[i]['reference_answer2']}")
   print(f"Generated: {df_r.iloc[i]['generated_answer']}")


Average A1 ROUGE Scores:
rouge1_a1: 0.2017
rouge2_a1: 0.1500
rougeL_a1: 0.2017

Average A2 ROUGE Scores:
rouge1_a2: 0.2160
rouge2_a2: 0.1014
rougeL_a2: 0.2160

Average A1 ROUGE Scores by Question type:
          rouge1_a1  rouge2_a1  rougeL_a1
ex_or_im                                 
explicit   0.319865       0.25   0.319865
implicit   0.024390       0.00   0.024390

Average A2 ROUGE Scores by Question type:
          rouge1_a2  rouge2_a2  rougeL_a2
ex_or_im                                 
explicit   0.271132   0.117647   0.271132
implicit   0.133333   0.076923   0.133333

Example Summaries:

Example 0:
Reference1: sat together , rejoicing in their new - found happiness .
Reference2: for a little time they sat together .
Generated: shook her head

Example 1:
Reference1: alarmed .
Reference2: alarmed .
Generated: the doctor was alarmed at rin jin 's evident displeasure .

Example 2:
Reference1: the young men did not see ahti with their own eyes .
Reference2: it does n't sound real .


In [None]:

pd.set_option('display.max_columns', None)