In [1]:
!pip install -q -U transformers
!pip install -q -U datasets
!pip install -q -U evaluate
!pip install -q -U tokenizers
!pip install -q -U bitsandbytes
!pip install -q rouge_score

In [2]:
import re
import random
import numpy as np
from scipy.special import softmax
import pprint

import bitsandbytes as bnb

import torch
import transformers
import evaluate
from datasets import Dataset, load_dataset, DatasetDict

# For from-scratch T5 model
from transformers import T5TokenizerFast, T5Config, T5ForConditionalGeneration

# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration  # this won't import twice, just noting here what's for each model

# For all T5 models
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# For style classifier model (also for evaluating the seq2seq model output)
from transformers import BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig,pipeline
from transformers import TrainingArguments, Trainer

import pandas as pd
from tqdm import tqdm

# Local Data Loading

In [3]:
# Authenticate and mount Google Drive inside the Colab runtime so the data
# and results folders can be reached through the local filesystem.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Drive folder that holds the FairytaleQA CSV files.
# path = 'drive/MyDrive/266_Danielle_Dylan_final_project/data/' #DYLAN
path = 'drive/MyDrive/266/FinalProject/data/' #DANIELLE

# CSV file name of each split, relative to `path`.
train, valid, test = (
    'FairytaleQA_train.csv',
    'FairytaleQA_valid.csv',
    'FairytaleQA_test.csv',
)


In [5]:
# Turn each split's file name into its full path and read it into a pandas
# DataFrame. Note that `train`, `valid` and `test` are rebound from file
# names to DataFrames here, so this cell is not safe to run twice in a row.
train, valid, test = [
    pd.read_csv(path + csv_name) for csv_name in (train, valid, test)
]

In [6]:
# Wrap each pandas split in a Hugging Face Dataset, tagging every one with
# the split name that matches its key in the DatasetDict below.
train_ds = Dataset.from_pandas(train, split="train")
test_ds = Dataset.from_pandas(test, split="test")
valid_ds = Dataset.from_pandas(valid, split="validation")

# Combine into a single DatasetDict
ds = DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "validation": valid_ds,
})

# Remote Data Loading

In [7]:
# Pretty-print one training record to see which fields are available.
sample_record = ds['train'][1]
pprint.pprint(sample_record)

{'answer1': 'kind and just .',
 'answer2': None,
 'attribute': 'character',
 'ex_or_im': 'explicit',
 'ex_or_im2': None,
 'local_or_sum': 'local',
 'question': 'what type of ruler was the king ?',
 'story_name': 'three-dogs',
 'story_section': 'once upon a time there was a king who went forth into the '
                  'world and fetched back a beautiful queen . and after they '
                  'had been married a while god gave them a little daughter . '
                  'then there was great rejoicing in the city and throughout '
                  'the country , for the people wished their king all that was '
                  'good , since he was kind and just . while the child lay in '
                  'its cradle , a strange - looking old woman entered the room '
                  ', and no one knew who she was nor whence she came . the old '
                  'woman spoke a verse over the child , and said that she must '
                  'not be allowed out under the open 

In [8]:
# Initialize the text2text-generation pipeline with bitsandbytes quantization.

# Checkpoint served by the pipeline.
model_id = "google/flan-t5-small"

# Configure bitsandbytes for 4-bit quantization: NF4 weights with double
# quantization, computing in bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# `trust_remote_code` is deliberately left at its default: T5 is implemented
# inside transformers, so there is no need to allow code from the model
# repository to be executed.
pipe = pipeline(
    "text2text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16, "quantization_config": quantization_config},
    device_map="auto",
)



Device set to use cuda:0


In [9]:

# VOCAB_SIZE = 15000

# MODEL_NAME= "google/flan-t5-small"
# tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

# # model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, quantization_config=quantization_config)
# # qa_model = pipeline("question-answering",model=MODEL_NAME)

# # data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=qa_model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Shuffle every split with a fixed seed so the row order - and therefore the
# positional `id` stored with each result during inference - is identical
# from one run to the next.
SHUFFLE_SEED = 42

train = ds['train'].shuffle(seed=SHUFFLE_SEED)
val = ds['validation'].shuffle(seed=SHUFFLE_SEED)
test = ds['test'].shuffle(seed=SHUFFLE_SEED)

In [11]:
# Display the shuffled training split (its column names and row count).
train

Dataset({
    features: ['story_name', 'story_section', 'question', 'answer1', 'answer2', 'local_or_sum', 'attribute', 'ex_or_im', 'ex_or_im2'],
    num_rows: 8548
})

In [11]:
# # We prefix our tasks with "answer the question"
# prefix = "Please answer this question: "
# context = " Context: "

# # Define the preprocessing function

# def preprocess_function(data):
#    """Add prefix to the sentences, tokenize the text, and set the labels"""
#    # The "inputs" are the tokenized answer:
#    inputs = [prefix + question + context for question,context in zip(data["question"],data['story_section'])]
#    model_inputs = tokenizer(text_target=inputs,
#                       max_length=512,
#                       truncation=True,
#                       padding='max_length',
#                             return_tensors='pt')

#    # The "labels" are the tokenized outputs:
#    labels = tokenizer(text_target=data["answer1"],
#                       max_length=512,
#                       truncation=True,
#                       padding='max_length',
#                       return_tensors='pt')

#    model_inputs["labels"] = labels["input_ids"]
#    return model_inputs

In [12]:
# # Map the preprocessing function across our dataset
# train_tokenized = train.map(preprocess_function, batched=True)
# val_tokenized = val.map(preprocess_function, batched=True)
# test_tokenized = test.map(preprocess_function, batched=True)

# train_tokenzied = {'input_ids': train_tokenized['input_ids'], 'labels': train_tokenized['labels']}
# val_tokenzied = {'input_ids': val_tokenized['input_ids'], 'labels': val_tokenized['labels']}
# test_tokenzied = {'input_ids': test_tokenized['input_ids'], 'labels': test_tokenized['labels']}

Map:   0%|          | 0/8548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

Map:   0%|          | 0/1007 [00:00<?, ? examples/s]

In [29]:
# len(train_tokenzied['input_ids'][400])

512

In [None]:
# def create_seq2seq_training_args(batch_size, num_epochs):

#     training_args = Seq2SeqTrainingArguments(
#         "fairytale_QA_model",
#         eval_strategy='epoch',
#         per_device_train_batch_size=batch_size,
#         per_device_eval_batch_size=batch_size,
#         num_train_epochs=num_epochs,
#         report_to='none'

#     )

#     return training_args

In [None]:
# def create_seq2seq_trainer(model, training_args, train_ds, val_ds):

#     trainer = Seq2SeqTrainer(
#         model,
#         training_args,
#         train_dataset=train_ds,
#         eval_dataset=val_ds
#     )

#     return trainer

# Fine Tuning (DNU)

In [1]:
# batch_size = 32
# num_epochs = 4

In [2]:
# embed_dim = 300
# keyvalue_dim = 36
# num_heads = 6
# dense_dim = 850
# num_layers = 6

In [3]:
# training_args = create_seq2seq_training_args(batch_size, num_epochs)
# trainer = create_seq2seq_trainer(model,training_args,train_tokenized,val_tokenized)

# trainer.train()

# Inference

In [16]:
rouge = evaluate.load('rouge')

In [21]:
# Show the first training example in full; the commented-out line below
# would show only its story text.
# train[0]['story_section']
train[0]

{'story_name': 'the-brown-bear-of-norway',
 'story_section': "at last she pitied him , and removed the charm , and the horns dropped down on the ground , and he would have killed her on the spot , only he was as weak as water , and his fellow - servants came in and carried him up to the big house . well , some way or other the story came to the ears of the prince , and he strolled down that way . she had only the dress of a countrywoman on her as she sat sewing at the window , but that did not hide her beauty , and he was greatly puzzled after he had a good look , just as a body is puzzled to know whether something happened to him when he was young or if he only dreamed it . well , the witch 's daughter heard about it too , and she came to see the strange girl ; and what did she find her doing but cutting out the pattern of a gown from brown paper ; and as she cut away , the paper became the richest silk she ever saw . the witch 's daughter looked on with greedy eyes , and , says she ,

In [None]:
# Zero-shot baseline: prompt the pipeline with every question plus its story
# section, then score the generated answer against `answer1` with ROUGE.

# Split to evaluate: one of the shuffled `train`, `val` or `test` Datasets.
# It drives both the loop below and the checkpoint file name.
dataset = train

# Resolve the checkpoint suffix once. Identity checks against the Dataset
# splits are intentional: `valid` is the raw pandas DataFrame, so comparing
# the evaluated Dataset to it is not a meaningful split test.
if dataset is train:
    filesuffix = "train"
elif dataset is val:
    filesuffix = "val"
elif dataset is test:
    filesuffix = "test"
else:
    filesuffix = ""

# Pieces of the prompt template (constant for the whole run).
prefix = "Please answer this question: "
context = " Context: "

# Save results in Drive, will overwrite existing file
# results_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/results/'  #DYLAN
results_path = 'drive/MyDrive/266/FinalProject/results/'  #DANIELLE
checkpoint_file = results_path + f'T5baseline_resultsdf_{filesuffix}.csv'
checkpoint_every = 500  # samples between progress prints / checkpoint saves

# Store results for aggregate scoring
results = []

for idx, sample in enumerate(tqdm(dataset)):
    # if idx >= 50:  # Uncomment to stop after the first 50 samples
    #     break
    try:
        question = prefix + sample['question'] + context + sample['story_section']

        # Generate the answer via the pipeline
        outputs = pipe(question, max_new_tokens=700)
        answer = outputs[0]["generated_text"]

        # Calculate ROUGE scores for this one prediction/reference pair
        rouge_scores = rouge.compute(
            predictions=[answer],
            references=[[sample['answer1']]],
        )

        # Store results
        results.append({
            'id': idx,
            'ex_or_im': sample['ex_or_im'],
            'story_section': sample['story_section'][:500],  # Store truncated text for readability
            'reference_answer': sample['answer1'],
            'generated_answer': answer,
            **rouge_scores
        })
    except Exception as e:
        # Best effort: report the failing sample and move on to the next one.
        print(f"Error processing sample {idx}: {str(e)}")
        continue

    # Progress update and checkpoint every `checkpoint_every` samples
    if (idx + 1) % checkpoint_every == 0:
        print(f"\nProcessed {idx + 1} samples")
        print(f"Latest ROUGE-1: {rouge_scores['rouge1']:.4f}")

        # Convert results to DataFrame and overwrite the checkpoint file.
        # A failed write is reported as such; it does not discard `results`.
        results_df = pd.DataFrame(results)
        try:
            results_df.to_csv(checkpoint_file, index=False)
        except OSError as e:
            print(f"Error saving checkpoint after sample {idx}: {str(e)}")

  6%|▌         | 500/8548 [07:00<2:07:33,  1.05it/s]


Processed 500 samples
Latest ROUGE-1: 0.0000


 12%|█▏        | 1000/8548 [13:52<1:19:49,  1.58it/s]


Processed 1000 samples
Latest ROUGE-1: 0.0000


 18%|█▊        | 1500/8548 [20:44<2:20:59,  1.20s/it]


Processed 1500 samples
Latest ROUGE-1: 0.0000


 23%|██▎       | 2000/8548 [27:17<10:36:22,  5.83s/it]


Processed 2000 samples
Latest ROUGE-1: 0.0000


 29%|██▉       | 2500/8548 [34:45<49:44,  2.03it/s]


Processed 2500 samples
Latest ROUGE-1: 0.1538


 35%|███▌      | 3000/8548 [41:15<1:03:57,  1.45it/s]


Processed 3000 samples
Latest ROUGE-1: 0.1538


 41%|████      | 3500/8548 [48:06<50:11,  1.68it/s]


Processed 3500 samples
Latest ROUGE-1: 0.0000


 47%|████▋     | 4000/8548 [55:11<36:14,  2.09it/s]


Processed 4000 samples
Latest ROUGE-1: 0.5714


 53%|█████▎    | 4500/8548 [1:01:20<35:15,  1.91it/s]


Processed 4500 samples
Latest ROUGE-1: 0.0000


 58%|█████▊    | 5000/8548 [1:07:45<43:10,  1.37it/s]


Processed 5000 samples
Latest ROUGE-1: 0.1250


 64%|██████▍   | 5500/8548 [1:15:11<53:59,  1.06s/it]  


Processed 5500 samples
Latest ROUGE-1: 0.0000


 70%|███████   | 6000/8548 [1:21:54<38:09,  1.11it/s]


Processed 6000 samples
Latest ROUGE-1: 0.1111


 76%|███████▌  | 6500/8548 [1:29:06<42:33,  1.25s/it]


Processed 6500 samples
Latest ROUGE-1: 0.3333


 82%|████████▏ | 7000/8548 [1:35:01<17:29,  1.47it/s]


Processed 7000 samples
Latest ROUGE-1: 0.8889


 88%|████████▊ | 7500/8548 [1:40:49<07:56,  2.20it/s]


Processed 7500 samples
Latest ROUGE-1: 0.4000


 89%|████████▊ | 7582/8548 [1:41:47<09:42,  1.66it/s]

In [None]:
# Destination folder in Drive for the results table.
# results_path = 'drive/MyDrive/266_Danielle_Dylan_final_project/results/'  #DYLAN
results_path = 'drive/MyDrive/266/FinalProject/results/'  #DANIELLE
output_file = results_path+'T5baseline_resultsdf.csv'

# Build the table from the collected per-sample records and write it out,
# replacing any file already stored under the same name.
results_df = pd.DataFrame(results)
results_df.to_csv(output_file, index=False)

In [None]:
# Inspect the first stored result record.
results[0]

In [115]:
# ROUGE columns reported throughout this cell.
rouge_cols = ['rouge1', 'rouge2', 'rougeL']

# Overall mean of each ROUGE metric across all scored samples.
avg_scores = results_df[rouge_cols].mean()
print("\nAverage ROUGE Scores:")
for metric, score in avg_scores.items():
   print(f"{metric}: {score:.4f}")

# Same means, broken down by the `ex_or_im` question type.
avg_by_type = results_df.groupby(['ex_or_im'])[rouge_cols].mean()
print("\nAverage ROUGE Scores by Question type:")
print(avg_by_type)

# Reference vs. generated text for the first 20 records (fewer if the table
# is shorter).
print("\nExample Summaries:")
first_rows = results_df.head(20)
for i, (reference, generated) in enumerate(
    zip(first_rows['reference_answer'], first_rows['generated_answer'])
):
   print(f"\nExample {i}:")
   print(f"Reference: {reference}")
   print(f"Generated: {generated}")


Average ROUGE Scores:
rouge1: 0.3265
rouge2: 0.1798
rougeL: 0.3191

Average ROUGE Scores by Question type:
            rouge1    rouge2    rougeL
ex_or_im                              
explicit  0.401419  0.239823  0.394258
implicit  0.113302  0.009050  0.105205

Example Summaries:

Example 0:
Reference: angry .
Generated: dreadful

Example 1:
Reference: happy .
Generated: glad

Example 2:
Reference: seized the squirrel and ate him up .
Generated: ate him up

Example 3:
Reference: trick the giant .
Generated: crushed it into fine sand

Example 4:
Reference: he looked handsomer than ever for he was glided all over .
Generated: he was gilded all over .

Example 5:
Reference: the great spirit .
Generated: the raspberry king

Example 6:
Reference: an envious wizened .
Generated: an envious wizened basthard of a fellow

Example 7:
Reference: seized the skin in their beaks and they flew quickly away .
Generated: the gulls flew straight as an arrow

Example 8:
Reference: to enrish and season

In [None]:
# Inspect a single training example (index 48).
train[48]