In [None]:
!pip install -q -U transformers
!pip install -q -U datasets
!pip install -q -U evaluate
!pip install -q -U tokenizers
!pip install -q -U bitsandbytes
!pip install -q rouge_score

In [None]:
import re
import random
import numpy as np
from scipy.special import softmax
import pprint

import bitsandbytes as bnb

import torch
import transformers
import evaluate
from datasets import Dataset, load_dataset, DatasetDict

# For from-scratch T5 model
from transformers import T5TokenizerFast, T5Config, T5ForConditionalGeneration

# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration  # this won't import twice, just noting here what's for each model

# For all T5 models
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM


# For style classifier model (also for evaluating the seq2seq model output)
from transformers import BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig,pipeline
from transformers import TrainingArguments, Trainer

import pandas as pd
from tqdm import tqdm

# Local Data Loading

In [None]:
# Authenticate with Google and (re)mount Drive inside the Colab runtime.
from google.colab import drive

drive.mount(mountpoint='/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the FairytaleQA CSV files.
# path = 'drive/MyDrive/266_Danielle_Dylan_final_project/data/' #DYLAN
path = '/content/drive/MyDrive/266/FinalProject/data/' #DANIELLE personal

# File name of each split; these are joined with `path` before loading.
train, valid, test = (
    'FairytaleQA_train.csv',
    'FairytaleQA_valid.csv',
    'FairytaleQA_test.csv',
)


In [None]:
# Turn each split's file name into a full path under `path`, then read it into
# a pandas DataFrame. The names `train`/`valid`/`test` are rebound at each step
# (file name -> full path -> DataFrame).
# NOTE(review): because the names are rebound, this cell cannot be re-run
# without first re-running the cell that assigns the file names.
train, valid, test = (path + name for name in (train, valid, test))
train, valid, test = (pd.read_csv(csv_path) for csv_path in (train, valid, test))

In [None]:
# Wrap each pandas split in a Hugging Face Dataset, tagging it with the split it
# actually holds (the validation frame was previously tagged as "test").
train_ds = Dataset.from_pandas(train, split="train")
test_ds = Dataset.from_pandas(test, split="test")
valid_ds = Dataset.from_pandas(valid, split="validation")

# Combine into a single DatasetDict
ds = DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "validation": valid_ds,
})

# Remote Data Loading

In [None]:
# Pretty-print one training record (index 1) to see which fields are available.
pprint.pprint(ds['train'][1])

{'answer1': 'kind and just .',
 'answer2': None,
 'attribute': 'character',
 'ex_or_im': 'explicit',
 'ex_or_im2': None,
 'local_or_sum': 'local',
 'question': 'what type of ruler was the king ?',
 'story_name': 'three-dogs',
 'story_section': 'once upon a time there was a king who went forth into the '
                  'world and fetched back a beautiful queen . and after they '
                  'had been married a while god gave them a little daughter . '
                  'then there was great rejoicing in the city and throughout '
                  'the country , for the people wished their king all that was '
                  'good , since he was kind and just . while the child lay in '
                  'its cradle , a strange - looking old woman entered the room '
                  ', and no one knew who she was nor whence she came . the old '
                  'woman spoke a verse over the child , and said that she must '
                  'not be allowed out under the open 

In [None]:
# """
# Initialize the pipeline with bitsandbytes quantization
# """
# # Configure bitsandbytes for 4-bit quantization
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.bfloat16
# )

# # Initialize pipeline
# model_id = "google/flan-t5-small"

In [None]:
# quantized_model = AutoModelForSeq2SeqLM.from_pretrained(model_id, device_map="cuda:0", quantization_config=quantization_config)

In [None]:
"""
Initialize the pipeline with bitsandbytes quantization
"""
# Configure bitsandbytes for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Initialize pipeline
model_id = "google/flan-t5-small"

# `trust_remote_code` is intentionally left at its default (off): this checkpoint
# is loaded through the T5 classes that ship with transformers, so there is no
# reason to allow execution of code downloaded from the model repository.
pipe = pipeline(
   "text2text-generation",
   model=model_id,
   model_kwargs={"torch_dtype": torch.bfloat16, "quantization_config": quantization_config},
   device_map="auto",
)



Device set to use cuda:0


In [None]:
# quantized_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)


In [None]:
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

# model_8bit = AutoModelForCausalLM.from_pretrained(
#     "bigscience/bloom-1b7",
#     device_map="auto",
#     quantization_config=quantization_config
# )

In [None]:

# VOCAB_SIZE = 15000

# MODEL_NAME= "google/flan-t5-small"
# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# # model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, quantization_config=quantization_config)
# # qa_model = pipeline("question-answering",model=MODEL_NAME)

# # data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=qa_model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# Shuffle each split with a fixed seed so the row order (and therefore any
# index-based inspection or partially completed run) is the same after a
# kernel restart.
SHUFFLE_SEED = 42
train = ds['train'].shuffle(seed=SHUFFLE_SEED)
val = ds['validation'].shuffle(seed=SHUFFLE_SEED)
test = ds['test'].shuffle(seed=SHUFFLE_SEED)

In [None]:
# Display the shuffled training split's summary (feature names and row count).
train

Dataset({
    features: ['story_name', 'story_section', 'question', 'answer1', 'answer2', 'local_or_sum', 'attribute', 'ex_or_im', 'ex_or_im2'],
    num_rows: 8548
})

In [None]:
# # We prefix our tasks with "answer the question"
# prefix = "Please answer this question: "
# context = " Context: "

# # Define the preprocessing function

# def preprocess_function(data):
#    """Add prefix to the sentences, tokenize the text, and set the labels"""
#    # The "inputs" are the tokenized answer:
#    inputs = [prefix + question + context for question,context in zip(data["question"],data['story_section'])]
#    model_inputs = tokenizer(text_target=inputs,
#                       max_length=512,
#                       truncation=True,
#                       padding='max_length',
#                             return_tensors='pt')

#    # The "labels" are the tokenized outputs:
#    labels = tokenizer(text_target=data["answer1"],
#                       max_length=512,
#                       truncation=True,
#                       padding='max_length',
#                       return_tensors='pt')

#    model_inputs["labels"] = labels["input_ids"]
#    return model_inputs

In [None]:
# # Map the preprocessing function across our dataset
# train_tokenized = train.map(preprocess_function, batched=True)
# val_tokenized = val.map(preprocess_function, batched=True)
# test_tokenized = test.map(preprocess_function, batched=True)

# train_tokenzied = {'input_ids': train_tokenized['input_ids'], 'labels': train_tokenized['labels']}
# val_tokenzied = {'input_ids': val_tokenized['input_ids'], 'labels': val_tokenized['labels']}
# test_tokenzied = {'input_ids': test_tokenized['input_ids'], 'labels': test_tokenized['labels']}

In [None]:
# len(train_tokenzied['input_ids'][400])

In [None]:
# def create_seq2seq_training_args(batch_size, num_epochs):

#     training_args = Seq2SeqTrainingArguments(
#         "fairytale_QA_model",
#         eval_strategy='epoch',
#         per_device_train_batch_size=batch_size,
#         per_device_eval_batch_size=batch_size,
#         num_train_epochs=num_epochs,
#         report_to='none'

#     )

#     return training_args

In [None]:
# def create_seq2seq_trainer(model, training_args, train_ds, val_ds):

#     trainer = Seq2SeqTrainer(
#         model,
#         training_args,
#         train_dataset=train_ds,
#         eval_dataset=val_ds
#     )

#     return trainer

# Fine Tuning (DNU)

In [None]:
# batch_size = 32
# num_epochs = 4

In [None]:
# embed_dim = 300
# keyvalue_dim = 36
# num_heads = 6
# dense_dim = 850
# num_layers = 6

In [None]:
# training_args = create_seq2seq_training_args(batch_size, num_epochs)
# trainer = create_seq2seq_trainer(model,training_args,train_tokenized,val_tokenized)

# trainer.train()

# Inference

In [None]:
# Load the ROUGE metric through `evaluate`; later cells call rouge.compute(...)
# to score generated answers against the reference answers.
rouge = evaluate.load('rouge')

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
%%time

# Batched inference over the test split.
#
# Fixes relative to the earlier version of this cell:
#   * batches are sliced from `test` (the loop range is len(test)), not `train`;
#   * every generated answer in a batch is kept, not only the first one;
#   * the pipeline is told the batch size so prompts are actually run together;
#   * the progress message fires when a multiple of 500 rows is crossed
#     (the old `(idx + 1) % 500` check could never be true with a step of 16).
results = []
batch_size = 16
r = []  # generated answers, one per processed row, in dataset order

# Prompt pieces do not change between batches, so build them once.
prefix = "Please answer this question: "
context = " Context: "
report_every = 500  # emit a progress line each time this many rows complete

for idx in tqdm(range(0, len(test), batch_size)):
  # The slice is indexed by column name; each column holds this batch's values.
  batch = test[idx:idx+batch_size]

  # One prompt per row: instruction + question + story section as context.
  questions = [
      prefix + question + context + section
      for question, section in zip(batch['question'], batch['story_section'])
  ]

  # Generate answers via the pipeline
  outputs = pipe(
      questions,
      max_new_tokens=700,
      batch_size=batch_size,
  )

  # Keep the answer for every prompt in the batch.
  r.extend(output["generated_text"] for output in outputs)

  # Print a progress update whenever this batch crossed a multiple of
  # `report_every` processed rows.
  done = len(r)
  if done // report_every > (done - len(outputs)) // report_every:
      print(f"\nProcessed {done} samples")

  0%|          | 0/63 [00:03<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# %%time

# results = []

# for idx, sample in enumerate(tqdm(test)):
#   # if idx >= 50:  # Stop after processing 10 samples
#   #   break
#   prefix = "Please answer this question: "
#   context = " Context: "

#   question = prefix + sample['question'] + context + sample['story_section']
#   # Generate summary via the pipeline
#   outputs = pipe(
#                       question,
#                       max_new_tokens=700,
#   )


#   answer = outputs[0]["generated_text"]


#   # Calculate ROUGE scores
#   predictions = [answer]
#   references = [[sample['answer1']]]
#   rouge_scores = rouge.compute(predictions=predictions, references=references)


#   # Store results
#   results.append({
#       'id': idx,
#       'ex_or_im': sample['ex_or_im'],
#       'story_section': sample['story_section'][:500],  # Store truncated text for readability
#       'reference_answer': sample['answer1'],
#       'generated_answer': answer,
#       **rouge_scores
#   })

#   # Print progress update every 10 samples
#   if (idx + 1) % 500 == 0:
#       print(f"\nProcessed {idx + 1} samples")
#       print(f"Latest ROUGE-1: {rouge_scores['rouge1']:.4f}")

  0%|          | 10/8548 [00:09<1:53:45,  1.25it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  0%|          | 25/8548 [00:21<1:36:10,  1.48it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (724 > 512). Running this sequence through the model will result in indexing errors
  1%|          | 50/8548 [00:46<2:12:46,  1.07it/s]

CPU times: user 42.6 s, sys: 542 ms, total: 43.2 s
Wall time: 46.9 s





In [None]:
# def preprocess_function(data):
#    """Add prefix to the sentences, tokenize the text, and set the labels"""
#    # The "inputs" are the tokenized answer:
#    inputs = [prefix + question + context for question,context in zip(data["question"],data['story_section'])]
#    model_inputs = tokenizer(text_target=inputs,
#                       max_length=512,
#                       truncation=True,
#                       padding='max_length',
#                             return_tensors='pt')

#    # The "labels" are the tokenized outputs:
#    labels = tokenizer(text_target=data["answer1"],
#                       max_length=512,
#                       truncation=True,
#                       padding='max_length',
#                       return_tensors='pt')

#    model_inputs["labels"] = labels["input_ids"]
#    return model_inputs

SIMILAR TO ORIGINAL

In [None]:
# %%time

# # Store results for aggregate scoring
# results = []
# def process_dataset(dataset):
#   for idx, sample in enumerate(tqdm(dataset)):

#       prefix = "Please answer this question: "
#       context = " Context: "

#       question = prefix + sample['question'] + context + sample['story_section']

#       # Generate summary via the pipeline
#       outputs = pipe(
#                           question,
#                           max_new_tokens=700,
#       )


#       answer = outputs[0]["generated_text"]


#       # Calculate ROUGE scores
#       predictions = [answer]
#       references = [[sample['answer1']]]
#       rouge_scores = rouge.compute(predictions=predictions, references=references)


#       # Store results
#       results.append({
#           'id': idx,
#           'ex_or_im': sample['ex_or_im'],
#           'story_section': sample['story_section'][:500],  # Store truncated text for readability
#           'reference_answer': sample['answer1'],
#           'generated_answer': answer,
#           **rouge_scores
#       })

#       # Print progress update every 10 samples
#       if (idx + 1) % 500 == 0:
#           print(f"\nProcessed {idx + 1} samples")
#           print(f"Latest ROUGE-1: {rouge_scores['rouge1']:.4f}")


#       if dataset == train:
#         filesuffix = "train"
#       elif dataset == valid:
#         filesuffix = "val"
#       elif dataset == test:
#         filesuffix = "test"
#       else:
#         filesuffix = ""
#       # Convert results to DataFrame
#       results_df = pd.DataFrame(results)

#       # Save results in Drive, will overwrite existing file
#       # results_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/results/'  #DYLAN
#       results_path = 'drive/MyDrive/266/FinalProject/results/'  #DANIELLE
#       results_df.to_csv(results_path+f'T5baseline_resultsdf_{filesuffix}.csv', index=False)

#       return results_df


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


In [None]:
# %%time
# test_results_df = process_dataset(test)

  0%|          | 0/1007 [00:00<?, ?it/s]


ValueError: Unable to coerce to Series, length must be 9: given 1007

In [None]:
%%time
######ORIGINAL

# Store results for aggregate scoring
results = []
def process_dataset(dataset):
  """Answer every question in `dataset` with `pipe` and score it with ROUGE.

  For each sample, the question and its story section are joined into a single
  prompt, the generated answer is scored against the sample's `answer1`, and a
  record is stored. Each record is also appended to the module-level `results`
  list, which later cells use for aggregate scoring.

  The earlier version returned from inside the "every 500 samples" progress
  branch, so it stopped after 500 samples and returned None for datasets with
  fewer than 500 rows. The whole dataset is now processed before returning.

  Args:
    dataset: iterable of mapping-like samples providing the keys 'question',
      'story_section', 'answer1' and 'ex_or_im'.

  Returns:
    A pandas DataFrame with one row per sample of `dataset` (only this call's
    samples; empty if `dataset` is empty).
  """
  prefix = "Please answer this question: "
  context = " Context: "

  records = []  # rows produced by this call only
  for idx, sample in enumerate(tqdm(dataset)):
    question = prefix + sample['question'] + context + sample['story_section']

    # Generate the answer via the pipeline
    outputs = pipe(
        question,
        max_new_tokens=700,
    )
    answer = outputs[0]["generated_text"]

    # Calculate ROUGE scores for this single prediction/reference pair
    rouge_scores = rouge.compute(
        predictions=[answer],
        references=[[sample['answer1']]],
    )

    # Store results
    record = {
        'id': idx,
        'ex_or_im': sample['ex_or_im'],
        'story_section': sample['story_section'][:500],  # Store truncated text for readability
        'reference_answer': sample['answer1'],
        'generated_answer': answer,
        **rouge_scores
    }
    records.append(record)
    # Appended immediately so partial progress survives an interrupted run.
    results.append(record)

    # Print progress update every 500 samples
    if (idx + 1) % 500 == 0:
      print(f"\nProcessed {idx + 1} samples")
      print(f"Latest ROUGE-1: {rouge_scores['rouge1']:.4f}")

  # Convert this dataset's records to a DataFrame once the loop has finished.
  return pd.DataFrame(records)

      # except Exception as e:
      #   print(f"Error processing sample {idx}: {str(e)}")
      #   continue

# process_dataset(train)
# process_dataset(valid)
# process_dataset(test)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.39 µs


In [None]:
%%time
# Run the baseline over the test split and keep the returned results table.
test_results_df = process_dataset(test)


 50%|████▉     | 499/1007 [08:01<08:09,  1.04it/s]


Processed 500 samples
Latest ROUGE-1: 0.7143
CPU times: user 7min 54s, sys: 2.83 s, total: 7min 56s
Wall time: 8min 1s





In [None]:
# Write the test-split results table to the Drive results folder as CSV
# (row index omitted).
results_path = 'drive/MyDrive/266/FinalProject/results/'  #DANIELLE
test_results_df.to_csv(f'{results_path}T5baseline_resultsdf_test.csv', index=False)

In [None]:
%%time
# Use `val` (the shuffled validation Dataset), not `valid`: `valid` is still the
# pandas DataFrame read from CSV, and iterating a DataFrame yields column names
# rather than row records, which process_dataset cannot index by field.
val_results_df = process_dataset(val)

In [None]:
%%time
# Run the baseline over the training split; process_dataset also appends each
# record to the module-level `results` list.
process_dataset(train)

In [None]:
# Row counts of the three splits, printed in the order test, train, valid.
print(len(test), len(train), len(valid))

1007 8548 1025


In [None]:
# Build a single table from every record accumulated in `results`.
results_df = pd.DataFrame(data=results)

# Write the table to Drive as CSV; an existing file at this location is overwritten.
# results_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/results/'  #DYLAN
results_path = 'drive/MyDrive/266/FinalProject/results/'  #DANIELLE
results_df.to_csv(
    path_or_buf=results_path+'T5baseline_resultsdf.csv',
    index=False,
)

In [None]:
# Inspect the first stored result record.
results[0]

In [None]:
# Columns of `results_df` that hold the ROUGE metrics reported below.
rouge_columns = ['rouge1', 'rouge2', 'rougeL']

# Overall mean of each ROUGE metric across all result rows.
avg_scores = results_df[rouge_columns].mean()
print("\nAverage ROUGE Scores:")
for metric in avg_scores.index:
   score = avg_scores[metric]
   print(f"{metric}: {score:.4f}")

# Mean of each ROUGE metric per value of `ex_or_im` (the question type).
avg_by_type = results_df.groupby(['ex_or_im'])[rouge_columns].mean()
print("\nAverage ROUGE Scores by Question type:")
print(avg_by_type)

# Show reference/generated answer pairs for the first rows (at most 20).
print("\nExample Summaries:")
preview = results_df.head(20)
for i, (reference, generated) in enumerate(
    zip(preview['reference_answer'], preview['generated_answer'])
):
   print(f"\nExample {i}:")
   print(f"Reference: {reference}")
   print(f"Generated: {generated}")


Average ROUGE Scores:
rouge1: 0.3265
rouge2: 0.1798
rougeL: 0.3191

Average ROUGE Scores by Question type:
            rouge1    rouge2    rougeL
ex_or_im                              
explicit  0.401419  0.239823  0.394258
implicit  0.113302  0.009050  0.105205

Example Summaries:

Example 0:
Reference: angry .
Generated: dreadful

Example 1:
Reference: happy .
Generated: glad

Example 2:
Reference: seized the squirrel and ate him up .
Generated: ate him up

Example 3:
Reference: trick the giant .
Generated: crushed it into fine sand

Example 4:
Reference: he looked handsomer than ever for he was glided all over .
Generated: he was gilded all over .

Example 5:
Reference: the great spirit .
Generated: the raspberry king

Example 6:
Reference: an envious wizened .
Generated: an envious wizened basthard of a fellow

Example 7:
Reference: seized the skin in their beaks and they flew quickly away .
Generated: the gulls flew straight as an arrow

Example 8:
Reference: to enrish and season

In [None]:
# Inspect the record at index 48 of `train`.
train[48]