In [None]:
 !pip install -q -U transformers[torch]
!pip install -q -U datasets
!pip install -q -U evaluate
!pip install -q -U tokenizers
!pip install -q -U bitsandbytes
!pip install -q rouge_score
!pip install -q -U peft
!pip install -q -U accelerate

In [None]:
import re
import random
import numpy as np
from scipy.special import softmax
import pprint

import bitsandbytes as bnb

import torch
import transformers
import evaluate
from datasets import Dataset, load_dataset

# For from-scratch T5 model
from transformers import T5TokenizerFast, T5Config, T5ForConditionalGeneration

# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration  # this won't import twice, just noting here what's for each model

# For all T5 models
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM


# For style classifier model (also for evaluating the seq2seq model output)
from transformers import BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig,pipeline
from transformers import TrainingArguments, Trainer

import pandas as pd
from tqdm import tqdm

# Local Data Loading

In [None]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to data save in Drive
train = 'FairytaleQA_train.csv'
valid = 'FairytaleQA_valid.csv'
test = 'FairytaleQA_test.csv'
path = 'drive/MyDrive/266_Danielle_Dylan_final_project/data/'
# path = 'drive/MyDrive/DataSci_266/266_Danielle_Dylan_final_project/data/'

In [None]:
# Read each split's CSV from the Drive data folder.
# On entry `train` / `valid` / `test` hold file names; afterwards they hold
# the loaded pandas DataFrames.
train, valid, test = (
    pd.read_csv(path + csv_name) for csv_name in (train, valid, test)
)

In [None]:
from datasets import load_dataset, DatasetDict

# Wrap each pandas split in a `datasets.Dataset`, tagging it with its own
# split name (the validation frame was previously tagged as "test").
train_ds = Dataset.from_pandas(train, split="train")
test_ds = Dataset.from_pandas(test, split="test")
valid_ds = Dataset.from_pandas(valid, split="validation")

# Combine into a single DatasetDict
ds = DatasetDict({
    "train": train_ds,
    "test": test_ds,
    "validation": valid_ds,
})

In [None]:
# Shuffle every split with a fixed seed so that the subsets selected later
# (e.g. the first N rows) and the reported scores are reproducible across
# kernel restarts.
train = ds['train'].shuffle(seed=42)
val = ds['validation'].shuffle(seed=42)
test = ds['test'].shuffle(seed=42)

# Remote Data Loading

In [None]:
# from datasets import load_dataset

# ds = load_dataset("WorkInTheDark/FairytaleQA")

In [None]:
pprint.pprint(ds['train'][1])

{'answer1': 'kind and just .',
 'answer2': None,
 'attribute': 'character',
 'ex_or_im': 'explicit',
 'ex_or_im2': None,
 'local_or_sum': 'local',
 'question': 'what type of ruler was the king ?',
 'story_name': 'three-dogs',
 'story_section': 'once upon a time there was a king who went forth into the '
                  'world and fetched back a beautiful queen . and after they '
                  'had been married a while god gave them a little daughter . '
                  'then there was great rejoicing in the city and throughout '
                  'the country , for the people wished their king all that was '
                  'good , since he was kind and just . while the child lay in '
                  'its cradle , a strange - looking old woman entered the room '
                  ', and no one knew who she was nor whence she came . the old '
                  'woman spoke a verse over the child , and said that she must '
                  'not be allowed out under the open 

In [None]:
# Initialize pipeline
model_id = "google/flan-t5-small"

In [None]:
"""
Initialize the pipeline with bitsandbytes quantization
"""
# Configure bitsandbytes for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Text-to-text generation pipeline around the quantized checkpoint.
# `trust_remote_code` is intentionally not enabled: the checkpoint is a stock
# T5 architecture shipped with transformers, so there is no reason to allow
# execution of repository-provided code.
model = pipeline(
   "text2text-generation",
   model=model_id,
   model_kwargs={"torch_dtype": torch.bfloat16, "quantization_config": quantization_config},
   device_map="auto",
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


In [None]:
VOCAB_SIZE = 15000
MAX_SEQUENCE_LENGTH = 300

In [None]:
tokenizer = T5Tokenizer.from_pretrained(model_id)

# model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, quantization_config=quantization_config)
# qa_model = pipeline("question-answering",model=MODEL_NAME)

# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=qa_model)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# We prefix our tasks with "answer the question"
prefix = "Please answer this question: "
context = " Context: "

# Define the preprocessing function

def preprocess_function(data):
    """Build prompts, tokenize them, and attach tokenized answers as labels.

    Relies on the module-level `tokenizer`, `prefix` and `context` names.

    Args:
        data: A batch (as passed by `Dataset.map(..., batched=True)`) with
            "question", "story_section" and "answer1" columns.

    Returns:
        The tokenizer output for the prompts with an added "labels" entry
        holding the answer token ids, where padding positions are set to -100.
    """
    # The loop variable must not be called `context`: that would shadow the
    # module-level " Context: " separator and silently drop it from the
    # prompt, making training prompts differ from the ones used at inference.
    prompts = [
        prefix + question + context + story
        for question, story in zip(data["question"], data["story_section"])
    ]
    model_inputs = tokenizer(prompts,
                             max_length=512,
                             truncation=True,
                             padding='max_length')

    # The "labels" are the tokenized reference answers.
    targets = tokenizer(text_target=data["answer1"],
                        max_length=512,
                        truncation=True,
                        padding='max_length')

    # Mask padding with -100 so the loss ignores those positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in targets["input_ids"]
    ]
    return model_inputs

In [None]:
# Map the preprocessing function across our dataset
train_tokenized = train.map(preprocess_function, batched=True)
val_tokenized = val.map(preprocess_function, batched=True)
test_tokenized = test.map(preprocess_function, batched=True)

train_tokenized = {'input_ids': train_tokenized['input_ids'], 'labels': train_tokenized['labels']}
val_tokenzied = {'input_ids': val_tokenized['input_ids'], 'labels': val_tokenized['labels']}
test_tokenzied = {'input_ids': test_tokenized['input_ids'], 'labels': test_tokenized['labels']}

Map:   0%|          | 0/8548 [00:00<?, ? examples/s]

Map:   0%|          | 0/1025 [00:00<?, ? examples/s]

Map:   0%|          | 0/1007 [00:00<?, ? examples/s]

In [None]:
# Sanity check: every encoded prompt should be padded/truncated to 512 tokens.
len(train_tokenized['input_ids'][400])

512

In [None]:
def create_seq2seq_training_args(batch_size, num_epochs):
    """Return `Seq2SeqTrainingArguments` for the fairytale QA model.

    Args:
        batch_size: Per-device batch size, used for both training and evaluation.
        num_epochs: Number of training epochs.

    Returns:
        Arguments that write to "fairytale_QA_model", evaluate once per epoch
        and disable external reporting.
    """
    settings = {
        "output_dir": "fairytale_QA_model",
        "eval_strategy": 'epoch',
        "per_device_train_batch_size": batch_size,
        "per_device_eval_batch_size": batch_size,
        "num_train_epochs": num_epochs,
        "report_to": 'none',
    }
    return Seq2SeqTrainingArguments(**settings)

In [None]:
def create_seq2seq_trainer(model, training_args, train_ds, val_ds):
    """Bundle a model, its arguments and the train/validation data into a `Seq2SeqTrainer`."""
    return Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
    )

# Fine Tuning (DNU)

In [None]:
# batch_size = 32
# num_epochs = 4

In [None]:
# embed_dim = 300
# keyvalue_dim = 36
# num_heads = 6
# dense_dim = 850
# num_layers = 6

In [None]:
# training_args = create_seq2seq_training_args(batch_size, num_epochs)
# trainer = create_seq2seq_trainer(model,training_args,train_tokenized,val_tokenized)

# trainer.train()

# Inference

In [None]:
rouge = evaluate.load('rouge')

In [None]:
# Store results for aggregate scoring
results = []

In [None]:
# for idx, sample in enumerate((ds['train'])):
#   pprint.pprint(sample)
#   break

In [None]:
# Score the generation pipeline on a small sample of the training split.
N_INFERENCE_SAMPLES = 10

# Prompt pieces are loop-invariant, so they are set once up front.
prefix = "Please answer this question: "
context = " Context: "

# Select the subset first so the progress bar reflects the work actually done
# instead of the size of the whole split.
inference_subset = train.select(range(min(N_INFERENCE_SAMPLES, len(train))))

for idx, sample in enumerate(tqdm(inference_subset)):
    try:
        question = prefix + sample['question'] + context + sample['story_section']

        # Generate an answer via the pipeline
        outputs = model(
            question,
            max_new_tokens=700,
            num_beams=4,
            do_sample=True,
            top_k=100,
            top_p=1.0,
            temperature=0.2,
            no_repeat_ngram_size=2
        )
        answer = outputs[0]["generated_text"]

        # Calculate ROUGE scores for this single prediction/reference pair
        predictions = [answer]
        references = [[sample['answer1']]]
        rouge_scores = rouge.compute(predictions=predictions, references=references)

        # Store results
        results.append({
            'id': idx,
            'story_section': sample['story_section'][:500],  # Store truncated text for readability
            'reference_answer': sample['answer1'],
            'generated_answer': answer,
            **rouge_scores
        })

        # Print progress update every 10 samples
        if (idx + 1) % 10 == 0:
            print(f"\nProcessed {idx + 1} samples")
            print(f"Latest ROUGE-1: {rouge_scores['rouge1']:.4f}")

    except Exception as e:
        # Best effort: report the failing sample and keep scoring the rest.
        print(f"Error processing sample {idx}: {str(e)}")
        continue

  0%|          | 10/8548 [00:07<1:46:37,  1.33it/s]


Processed 10 samples
Latest ROUGE-1: 0.8000





In [None]:
# Gather the per-sample records into a DataFrame
results_df = pd.DataFrame(results)

# Mean of each ROUGE variant over the scored samples
avg_scores = results_df[['rouge1', 'rouge2', 'rougeL']].mean()
print("\nAverage ROUGE Scores:")
for metric in avg_scores.index:
   score = avg_scores[metric]
   print(f"{metric}: {score:.4f}")

# Show up to 50 reference/generated pairs for manual inspection
print("\nExample Summaries:")
for i, row in results_df.head(50).iterrows():
   print(f"\nExample {i}:")
   print(f"Reference: {row['reference_answer']}")
   print(f"Generated: {row['generated_answer']}")


Average ROUGE Scores:
rouge1: 0.1777
rouge2: 0.0921
rougeL: 0.1652

Example Summaries:

Example 0:
Reference: insato , king of all reptiles .
Generated: assegai

Example 1:
Reference: tell the prince .
Generated: clapped her little hands

Example 2:
Reference: the youth returned with two men , and asked him in their presence whether he refused the dead man christian burial .
Generated: to pay for the wake out of his own slender purse .

Example 3:
Reference: the waters ceased their roaring , and the river was quiet .
Generated: the emperor recovered his health

Example 4:
Reference: his parents take good care of him .
Generated: momotaro is grateful for the kindness of his parents .

Example 5:
Reference: a witch .
Generated: the sorcerer

Example 6:
Reference: he wanted the rice - dumpling .
Generated: the monkey was trying to persuade the crab .

Example 7:
Reference: outside of a smith 's gate .
Generated: the smith

Example 8:
Reference: his father 's mind had become impaired .
Ge

In [None]:
train[48]

{'story_name': 'which-was-the-foolishest',
 'story_section': 'in a little village that stood on a wide plain , where you could see the sun from the moment he rose to the moment he set , there lived two couples side by side . the men , who worked under the same master , were quite good friends , but the wives were always quarrelling , and the subject they quarrelled most about was -- which of the two had the stupidest husband . unlike most women -- who think that anything that belongs to them must be better than what belongs to anyone else -- each thought her husband the more foolish of the two .',
 'question': 'what did the two wives always quarrel about ?',
 'answer1': 'which of the two had the stupidest husband .',
 'answer2': None,
 'local_or_sum': 'local',
 'attribute': 'action',
 'ex_or_im': 'explicit',
 'ex_or_im2': None}

# Build Classification Model for Question Type

In [None]:
checkpoint = 'google-bert/bert-base-uncased'
bert_classification_model = BertForSequenceClassification.from_pretrained(checkpoint)
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [None]:
# Define the preprocessing function

def preprocess_classification_function(data, tokenizer, max_length=512):
   """Tokenize questions and attach explicit/implicit class ids as labels.

   Args:
       data: Batch with a "question" column (texts) and an "ex_or_im" column
           whose values are "explicit" or "implicit".
       tokenizer: Callable tokenizer applied to the list of questions.
       max_length: Length every question is padded/truncated to.

   Returns:
       The tokenizer output with an added "labels" list
       (0 for "explicit", 1 for "implicit").

   Raises:
       KeyError: If "ex_or_im" contains any other value.
   """
   # The questions are the model *inputs*, so they are passed as the main text
   # argument; `text_target` is reserved for tokenizing seq2seq labels.
   model_inputs = tokenizer(data["question"],
                            max_length=max_length,
                            truncation=True,
                            padding='max_length')

   # The "labels" are integer class ids, not tokenized text.
   label_map = {"explicit": 0, "implicit": 1}
   model_inputs["labels"] = [label_map[x] for x in data["ex_or_im"]]
   return model_inputs

In [None]:
# Map the preprocessing function across our dataset
train_clf_tokenized = train.map(preprocess_classification_function, batched=True,fn_kwargs={'tokenizer': bert_tokenizer})
val_clf_tokenized = val.map(preprocess_classification_function, batched=True,fn_kwargs={'tokenizer': bert_tokenizer})
test_clf_tokenized = test.map(preprocess_classification_function, batched=True,fn_kwargs={'tokenizer': bert_tokenizer})

train_clf_tokenized = {'input_ids': train_tokenized['input_ids'], 'labels': train_tokenized['labels']}
val_clf_tokenized = {'input_ids': val_tokenized['input_ids'], 'labels': val_tokenized['labels']}
test_clf_tokenized = {'input_ids': test_tokenized['input_ids'], 'labels': test_tokenized['labels']}

In [None]:
metric = evaluate.load('accuracy')

def compute_metrics(p):
    """Score a `(predictions, labels)` pair with the module-level `metric`.

    The predicted class is the argmax of the predictions along axis 1.
    """
    logits, gold_labels = p
    predicted_classes = np.argmax(logits, axis=1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
def fine_tune_classification_model(classification_model,
                                   tokenizer,
                                   train_data,
                                   dev_data,
                                   batch_size = 32,
                                   num_epochs = 2):
    """
    Fine-tune every parameter of a sequence-classification model.

    Both datasets are tokenized with `preprocess_classification_function`,
    then the model is trained with a `Trainer` that evaluates and checkpoints
    once per epoch and reports metrics through `compute_metrics`.

    Args:
        classification_model: Model to train; it is updated in place.
        tokenizer: Tokenizer handed to the preprocessing function.
        train_data: Training split (must support `.map`).
        dev_data: Evaluation split (must support `.map`).
        batch_size: Per-device batch size for training and evaluation.
        num_epochs: Number of training epochs.

    Returns:
        The `Trainer` after training, so callers can run `evaluate()` or
        `predict()` on further data.
    """

    preprocessed_train_data = train_data.map(preprocess_classification_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    preprocessed_dev_data = dev_data.map(preprocess_classification_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

    training_args = TrainingArguments(
        output_dir='bert_fine_tuned_clf',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy='epoch',
        save_strategy='epoch',
        report_to='none'
    )
    trainer = Trainer(
        model=classification_model,
        args=training_args,
        train_dataset=preprocessed_train_data,
        eval_dataset=preprocessed_dev_data,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer

In [None]:
def fine_tune_clf_model_freeze_layers(classification_model,
                                          tokenizer,
                                          train_data,
                                          dev_data,
                                          layers_to_train = ("classifier.",),
                                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                                          batch_size = 32,
                                          num_epochs = 2):
    """
    Train only the parameters whose name contains one of the strings in
    `layers_to_train`; every other parameter is frozen.

    Args:
        classification_model: Model to train; it is updated in place.
        tokenizer: Tokenizer handed to the preprocessing function.
        train_data: Training split (must support `.map`).
        dev_data: Evaluation split (must support `.map`).
        layers_to_train: Name fragments selecting the trainable parameters.
        max_sequence_length: Currently unused; kept so existing calls keep working.
        batch_size: Per-device batch size for training and evaluation.
        num_epochs: Number of training epochs.

    Returns:
        The `Trainer` after training.
    """

    preprocessed_train_data = train_data.map(preprocess_classification_function, batched=True, fn_kwargs={'tokenizer': tokenizer})
    preprocessed_dev_data = dev_data.map(preprocess_classification_function, batched=True, fn_kwargs={'tokenizer': tokenizer})

    # Set the flag in both directions: a parameter that an earlier call froze
    # becomes trainable again when it is selected now.
    for name, param in classification_model.named_parameters():
        param.requires_grad = any(fragment in name for fragment in layers_to_train)

    training_args = TrainingArguments(
        output_dir='bert_fine_tuned_clf',
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        eval_strategy='epoch',
        save_strategy='epoch',
        report_to='none'
    )

    trainer = Trainer(
        model=classification_model,
        args=training_args,
        train_dataset=preprocessed_train_data,
        eval_dataset=preprocessed_dev_data,
        compute_metrics=compute_metrics
    )

    trainer.train()
    return trainer

In [None]:
fine_tune_classification_model(bert_classification_model, bert_tokenizer, train, val)

In [None]:
from datasets import concatenate_datasets

def downsample(train_df,verbose=False,seed=None):
  '''Balance the "explicit" and "implicit" classes by downsampling.

  Rows are split on the `ex_or_im` column and each class is randomly reduced
  to the size of the smaller one, so the function works whichever class is
  the majority.

  Args:
      train_df: Dataset with an `ex_or_im` column.
      verbose: When True, print class counts before and after balancing.
      seed: Optional seed forwarded to the shuffles for reproducible sampling.

  Returns:
      A shuffled dataset holding equally many rows of each class.
  '''
  explicit = train_df.filter(lambda x: x['ex_or_im'] == 'explicit')
  implicit = train_df.filter(lambda x: x['ex_or_im'] == 'implicit')

  if verbose:
      print(f"Original implicit count: {len(implicit)}")
      print(f"Original explicit count: {len(explicit)}")

  # Selecting more rows than a class contains would raise, so cap both
  # classes at the minority size.
  target_size = min(len(explicit), len(implicit))
  explicit = explicit.shuffle(seed=seed).select(range(target_size))
  implicit = implicit.shuffle(seed=seed).select(range(target_size))

  train_downsampled = concatenate_datasets([implicit,explicit]).shuffle(seed=seed)

  if verbose:
      print(f"New explicit count: {len(explicit)}")
      print(f"New implicit count: {len(implicit)}")

  return train_downsampled

In [None]:
train_downsampled = downsample(train,verbose=True)

In [None]:
# Train only the classifier head, pooler, LayerNorm and embedding parameters
# on the class-balanced training set.
# NOTE(review): `bert_classification_model` is the same object that was passed
# to `fine_tune_classification_model` earlier in this notebook, so this run
# appears to start from already fine-tuned weights rather than the pretrained
# checkpoint — confirm this is intended.
layers_to_train = ['classifier.','pooler.','LayerNorm.','embeddings.']
fine_tune_clf_model_freeze_layers(bert_classification_model,bert_tokenizer,train_downsampled,val,layers_to_train)

# Try qLORA for Model Specialization

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

In [None]:
max([len(q) for q in train['question']])

In [None]:
model_checkpoint = "roberta-large"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

lr = 1e-5
batch_size = 32
num_epochs = 2

In [None]:
# set LORA specific hyperparameters
r = 8
lora_alpha = 32
lora_dropout = 0.1

In [None]:
# Class ids must agree with the `label_map` used when building the training
# labels ("explicit" -> 0, "implicit" -> 1); otherwise the label names stored
# in the model config are swapped.
id2label = {
    0: "explicit",
    1: "implicit"
}

label2id = {
    "explicit": 0,
    "implicit": 1
}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=2, id2label=id2label, label2id=label2id
)
# Freeze all base-model parameters before the LoRA adapters are attached.
for name, param in model.named_parameters():
    param.requires_grad = False

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout
)

In [None]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

In [None]:
_= model.to("cuda")

In [None]:
training_args = TrainingArguments(
    output_dir="roberta-large-lora-sequence-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to='none'
)

In [None]:
train_downsampled = downsample(train,verbose=True)

In [None]:
# Define the preprocessing function

def preprocess_classification_function_lora(data, tokenizer, max_length=150):
   """Tokenize questions for the LoRA classifier and attach class-id labels.

   Args:
       data: Batch with a "question" column (texts) and an "ex_or_im" column
           whose values are "explicit" or "implicit".
       tokenizer: Callable tokenizer applied to the list of questions.
       max_length: Length every question is padded/truncated to.

   Returns:
       The tokenizer output with an added "labels" list
       (0 for "explicit", 1 for "implicit").

   Raises:
       KeyError: If "ex_or_im" contains any other value.
   """
   # Questions are model inputs, so they go through the main text argument;
   # `text_target` is meant for tokenizing seq2seq labels.
   model_inputs = tokenizer(data["question"],
                            max_length=max_length,
                            truncation=True,
                            padding='max_length')

   # The "labels" are integer class ids, not tokenized text.
   label_map = {"explicit": 0, "implicit": 1}
   model_inputs["labels"] = [label_map[label] for label in data["ex_or_im"]]
   return model_inputs

In [None]:
# Map the preprocessing function across our dataset
train_clf_tokenized = train_downsampled.map(preprocess_classification_function_lora, batched=True,fn_kwargs={'tokenizer': tokenizer})
val_clf_tokenized = val.map(preprocess_classification_function_lora, batched=True,fn_kwargs={'tokenizer': tokenizer})
test_clf_tokenized = test.map(preprocess_classification_function_lora, batched=True,fn_kwargs={'tokenizer': tokenizer})

# train_clf_tokenized = {'input_ids': train_clf_tokenized['input_ids'], 'labels': train_clf_tokenized['labels']}
# val_clf_tokenized = {'input_ids': val_clf_tokenized['input_ids'], 'labels': val_clf_tokenized['labels']}
# test_clf_tokenized = {'input_ids': test_clf_tokenized['input_ids'], 'labels': test_clf_tokenized['labels']}

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_clf_tokenized,
    eval_dataset=val_clf_tokenized,
    compute_metrics=compute_metrics,
)

trainer.train()

# Train Individual T-5 Lora Models on Implicit vs Explicit

In [None]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import torch
from transformers import AutoTokenizer

## Explicit

In [None]:
# Configure bitsandbytes for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
metric = evaluate.load('accuracy')

def compute_metrics(p):
    """Return the module-level `metric` for a `(predictions, labels)` pair.

    Predictions are reduced to class ids with an argmax over axis 1.
    """
    scores, targets = p
    return metric.compute(predictions=np.argmax(scores, axis=1), references=targets)

In [None]:
rouge = evaluate.load('rouge')

def compute_metrics_rouge(p):
  """Compute ROUGE for a `(predictions, labels)` pair with the module-level `rouge`.

  NOTE(review): both items are forwarded unchanged, so this presumably expects
  already-decoded text rather than token ids or logits — confirm before wiring
  it into a trainer.
  """
  generated, expected = p
  return rouge.compute(predictions=generated, references=expected)


In [None]:
model_checkpoint = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

lr = 1e-5
batch_size = 16
num_epochs = 4

In [None]:
# set LORA specific hyperparameters
r = 8
lora_alpha = 32
lora_dropout = 0.1

In [None]:
model_ex = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,quantization_config=quantization_config)

# for name, param in model.named_parameters():
#     param.requires_grad = False

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout
)

In [None]:
model_ex = get_peft_model(model_ex, peft_config)
model_ex.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [None]:
_= model_ex.to("cuda")

In [None]:
# Map the preprocessing function across our dataset
train_ex = train.filter(lambda x: x['ex_or_im'] == 'explicit')
val_ex = val.filter(lambda x: x['ex_or_im'] == 'explicit')
test_ex = test.filter(lambda x: x['ex_or_im'] == 'explicit')

train_tokenized_ex = train_ex.map(preprocess_function, batched=True)
val_tokenized_ex = val_ex.map(preprocess_function, batched=True)
test_tokenized_ex = test_ex.map(preprocess_function, batched=True)

train_tokenized_ex = {'input_ids': train_tokenized_ex['input_ids'], 'labels': train_tokenized_ex['labels']}
val_tokenzied_ex = {'input_ids': val_tokenized_ex['input_ids'], 'labels': val_tokenized_ex['labels']}
test_tokenzied_ex = {'input_ids': test_tokenized_ex['input_ids'], 'labels': test_tokenized_ex['labels']}

Filter:   0%|          | 0/8548 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1025 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1007 [00:00<?, ? examples/s]

Map:   0%|          | 0/6382 [00:00<?, ? examples/s]

Map:   0%|          | 0/744 [00:00<?, ? examples/s]

Map:   0%|          | 0/754 [00:00<?, ? examples/s]

In [None]:
# Training configuration for the LoRA-adapted T5 model.
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-lora-text-generation",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    # The PEFT wrapper hides the base model's input arguments, so Trainer
    # cannot infer the label column on its own (it warns "No label_names
    # provided"); name it explicitly.
    label_names=["labels"],
    report_to='none'
)

In [None]:
train_small = train_tokenized_ex.select(range(5000))
val_small = val_tokenized_ex.select(range(700))

In [None]:
train_small

Dataset({
    features: ['story_name', 'story_section', 'question', 'answer1', 'answer2', 'local_or_sum', 'attribute', 'ex_or_im', 'ex_or_im2', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 5000
})

In [None]:
trainer = Seq2SeqTrainer(
    model=model_ex,
    args=training_args,
    train_dataset=train_small,
    eval_dataset=val_small,
    #compute_metrics=compute_metrics_rouge,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,30.2328,30.703125
2,20.1797,16.53125
3,14.2508,10.757812
4,12.4648,9.875


TrainOutput(global_step=1252, training_loss=22.594891922923324, metrics={'train_runtime': 1423.1079, 'train_samples_per_second': 14.054, 'train_steps_per_second': 0.88, 'total_flos': 3738949386240000.0, 'train_loss': 22.594891922923324, 'epoch': 4.0})

In [None]:
def generate_output(model, tokenizer, data, batch_size, **kwargs):
    """Generate an answer for every sample in `data`, batch by batch.

    Args:
        model: Seq2seq model exposing `generate`; inputs are moved to the
            device the model already lives on.
        tokenizer: Tokenizer used to encode the prompts and decode the outputs.
        data: Iterable of samples with "question" and "story_section" fields.
        batch_size: Number of prompts encoded and generated per step.
        **kwargs: Extra arguments forwarded to `model.generate`.

    Returns:
        List of decoded answers, one per sample, in input order.
    """
    prefix = "Please answer this question: "
    context = " Context: "
    input_sentences = [prefix + sample['question'] + context + sample['story_section'] for sample in data]

    # Resolve the device once instead of moving the model on every batch.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    all_outputs = []
    for start_i in range(0, len(input_sentences), batch_size):
        batch = input_sentences[start_i:start_i + batch_size]

        # Truncate to the length used during training; over-long prompts
        # otherwise exceed the tokenizer's maximum sequence length.
        inputs_encoded = tokenizer(batch,
                                   padding=True,
                                   truncation=True,
                                   max_length=512,
                                   return_tensors='pt')

        # Batches are padded, so the attention mask must be passed along or
        # the model attends to the padding tokens.
        output_ids = model.generate(input_ids=inputs_encoded['input_ids'].to(device),
                                    attention_mask=inputs_encoded['attention_mask'].to(device),
                                    **kwargs)
        generated_sentences = tokenizer.batch_decode(output_ids,
                                                     skip_special_tokens=True,
                                                     clean_up_tokenization_spaces=False)
        all_outputs.extend(generated_sentences)

    return all_outputs

In [None]:
def calculate_eval_metrics(data, model, tokenizer, batch_size, prefix="", **kwargs):
    """Generate answers for `data`, print their ROUGE scores, and return the answers.

    Args:
        data: Samples to answer; `data['answer1']` supplies the references.
        model: Model forwarded to `generate_output`.
        tokenizer: Tokenizer forwarded to `generate_output`.
        batch_size: Generation batch size.
        prefix: Accepted but not used by this function.
        **kwargs: Extra generation arguments forwarded to `generate_output`.

    Returns:
        The list of generated answers.
    """
    # Produce one answer per sample
    generated = generate_output(model, tokenizer, data, batch_size, **kwargs)

    # Score the answers against the first reference answer with ROUGE
    # (only ROUGE is computed here; no BLEURT scoring is performed).
    scores = rouge.compute(predictions=generated, references=data['answer1'])
    print('ROUGE: ', scores)

    return generated

In [None]:
"""
Playing with the decoder .generate() arguments like num_beams or top_p, etc.
"""

generate_kwargs = {

    'num_beams': 4,
    'do_sample':True,
    'top_k':100,
    'top_p':1,
    'temperature':0.2,
    'no_repeat_ngram_size':2
}

part1_val_translations = calculate_eval_metrics(
    val_small,
    model_ex,
    tokenizer,
    batch_size,
    **generate_kwargs
)

Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors


ROUGE:  {'rouge1': np.float64(0.1936835005403164), 'rouge2': np.float64(0.04584093490031679), 'rougeL': np.float64(0.19156213437210629), 'rougeLsum': np.float64(0.19135937900749395)}


In [None]:
# Print out a sample of outputs to manually review
# Print a random sample of up to 10 distinct outputs for manual review.
# The drawn index is what selects the row, so a different set is shown on
# each run instead of always the first ten.
n_outputs = len(part1_val_translations)
for sample_i in random.sample(range(n_outputs), k=min(10, n_outputs)):
    print('Referenced answer:  ', val_small[sample_i]['answer1'])
    print('Generated answer:   ', part1_val_translations[sample_i])
    print()

Referenced answer:   the fairies .
Generated answer:    Fairies

Referenced answer:   he grew weary of his lonely life .
Generated answer:    None of the above choices

Referenced answer:   it was at once covered with the finest dishes one might desire .
Generated answer:    The maid

Referenced answer:   he was so handsome .
Generated answer:    None of the above choices

Referenced answer:   assipattle .
Generated answer:    None of the above choices

Referenced answer:   a young man .
Generated answer:    Denis

Referenced answer:   the tree broke at once into blossom and the birds came and sang on it .
Generated answer:    The giant

Referenced answer:   her body gave forth so much soft bright light that she might have been a daughter of the moon god .
Generated answer:    she was a narcissistic

Referenced answer:   the king 's son understood that they had come to remind him of what he had forgotten , and his lost memory came back , and he knew his wife , and kissed her .
Generate

## Implicit

In [None]:
def clear_gpu_memory():
    """Run a garbage-collection pass, empty PyTorch's CUDA cache, and report it."""
    import gc
    import torch

    for release in (gc.collect, torch.cuda.empty_cache):
        release()
    print("Cleared GPU memory.")

# Usage
del model_ex, tokenizer, trainer
clear_gpu_memory()


Cleared GPU memory.


In [None]:
# Configure bitsandbytes for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [None]:
metric = evaluate.load('accuracy')

def compute_metrics(p):
    """Evaluate a `(predictions, labels)` pair with the module-level `metric`.

    Class ids are obtained by taking the argmax of the predictions over axis 1.
    """
    raw_predictions, references = p
    class_ids = np.argmax(raw_predictions, axis=1)
    return metric.compute(predictions=class_ids, references=references)

In [None]:
# ROUGE metric shared by compute_metrics_rouge and calculate_eval_metrics.
rouge = evaluate.load('rouge')

def compute_metrics_rouge(p):
    """Compute ROUGE for a ``(predictions, labels)`` pair.

    ROUGE compares strings, but a ``Trainer`` hands ``compute_metrics`` numeric
    arrays of token ids (with ``-100`` on ignored label positions). Numeric
    arrays are therefore decoded with the notebook-level ``tokenizer`` first;
    anything else (e.g. lists of strings) is passed to ROUGE unchanged, so
    calling this function directly on text keeps working as before.

    Args:
        p: a two-item ``(predictions, labels)`` pair.

    Returns:
        The dictionary produced by ``rouge.compute``.
    """
    def _as_text(batch):
        is_numeric = isinstance(batch, np.ndarray) and np.issubdtype(batch.dtype, np.number)
        if not is_numeric:
            return batch  # already text: hand over untouched
        ids = batch
        if ids.ndim == 3:
            # presumably (batch, seq, vocab) logits -> take the most likely token id
            ids = ids.argmax(axis=-1)
        # -100 is the "ignore" label value and is not a decodable token id.
        ids = np.where(ids != -100, ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(ids, skip_special_tokens=True)

    predictions, labels = p
    if isinstance(predictions, tuple):
        # Some models return extra outputs after the predictions; keep the first item.
        predictions = predictions[0]
    return rouge.compute(predictions=_as_text(predictions), references=_as_text(labels))


In [None]:
# Checkpoint used for both the tokenizer and the base model in this section.
model_checkpoint = "google/flan-t5-small"

# Tokenizer matching the checkpoint (the earlier `tokenizer` was removed above).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Training hyperparameters, consumed by Seq2SeqTrainingArguments below;
# batch_size is also passed to the generation/evaluation helper.
lr = 1e-5
batch_size = 16
num_epochs = 4

In [None]:
# set LORA specific hyperparameters (passed to LoraConfig below)
r = 8  # rank of the low-rank update matrices
lora_alpha = 32  # scaling factor applied to the LoRA update
lora_dropout = 0.1  # dropout probability applied inside the LoRA layers

In [None]:
# Load the base seq2seq model using the 4-bit quantization settings defined above.
model_im = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,quantization_config=quantization_config)

# Manual freezing of all parameters, left disabled.
# NOTE(review): the snippet refers to `model`, not `model_im` -- fix before re-enabling.
# for name, param in model.named_parameters():
#     param.requires_grad = False

In [None]:
# LoRA adapter configuration for a sequence-to-sequence language model,
# built from the hyperparameters set in the cell above.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,  # the adapters are going to be trained
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout
)

In [None]:
# Wrap the base model with the LoRA adapters and print the trainable-parameter summary.
# NOTE(review): model_im is overwritten with its wrapped version, so re-running this
# cell appears to wrap an already-wrapped model -- reload the base model first.
model_im = get_peft_model(model_im, peft_config)
model_im.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451




In [None]:
# Move the model to the GPU; assigning the result to `_` keeps the cell from printing the model repr.
_= model_im.to("cuda")

In [None]:
# Keep only the rows whose `ex_or_im` field is 'implicit' in each split.
train_im = train.filter(lambda x: x['ex_or_im'] == 'implicit')
val_im = val.filter(lambda x: x['ex_or_im'] == 'implicit')
test_im = test.filter(lambda x: x['ex_or_im'] == 'implicit')

# Map the preprocessing function across our dataset
train_tokenized_im = train_im.map(preprocess_function, batched=True)
val_tokenized_im = val_im.map(preprocess_function, batched=True)
test_tokenized_im = test_im.map(preprocess_function, batched=True)

# Reduce to the two model columns, but stay a `datasets.Dataset`: a plain dict
# has no `.select()`, which broke the subset cell further down
# ("'dict' object has no attribute 'select'"). Column access such as
# ds['input_ids'] works exactly as it did on the dict.
# The variable names (including the `tokenzied` spelling) are kept as they were
# so any other cell referring to them keeps working; `val_tokenized_im` and
# `test_tokenized_im` still hold every column, as before.
train_tokenized_im = train_tokenized_im.select_columns(['input_ids', 'labels'])
val_tokenzied_im = val_tokenized_im.select_columns(['input_ids', 'labels'])
test_tokenzied_im = test_tokenized_im.select_columns(['input_ids', 'labels'])

Filter:   0%|          | 0/8548 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1025 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1007 [00:00<?, ? examples/s]

Map:   0%|          | 0/2166 [00:00<?, ? examples/s]

Map:   0%|          | 0/281 [00:00<?, ? examples/s]

Map:   0%|          | 0/253 [00:00<?, ? examples/s]

In [None]:
# Training configuration for the LoRA fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-lora-text-generation",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # can be restored when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    # A PEFT-wrapped model hides the base model's input arguments, so Trainer
    # cannot infer the label column and falls back to an empty list (see the
    # "No label_names provided" warning). Without labels no evaluation loss is
    # reported, which load_best_model_at_end needs to rank checkpoints.
    label_names=["labels"],
    report_to='none'
)

In [None]:
def _first_rows(data, n):
    """Return the first ``n`` rows of ``data`` (all rows if it has fewer) as a ``Dataset``.

    ``data`` may be a ``datasets.Dataset`` or a plain column dict such as
    ``{'input_ids': [...], 'labels': [...]}``; a dict is converted first because
    it has no ``.select()`` method.
    """
    if not isinstance(data, Dataset):
        data = Dataset.from_dict(data)
    # Clamp so a split smaller than ``n`` does not raise an index error.
    return data.select(range(min(n, len(data))))


# Small subsets to keep the training / evaluation loop fast.
train_small = _first_rows(train_tokenized_im, 1000)
val_small = _first_rows(val_tokenized_im, 200)

AttributeError: 'dict' object has no attribute 'select'

In [None]:
# Display the validation subset to check what it contains.
val_small

In [None]:
# Fine-tune the LoRA-wrapped model on the small training subset and evaluate
# on the small validation subset.
trainer = Seq2SeqTrainer(
    model=model_im,
    args=training_args,
    train_dataset=train_small,
    eval_dataset=val_small,
    #compute_metrics=compute_metrics_rouge,
)

# Trainer forwards `num_items_in_batch` to the model when it believes the model
# accepts extra loss kwargs. The PEFT wrapper makes that check pass, but the
# underlying T5 forward() does not take the argument, which raised
# "forward() got an unexpected keyword argument 'num_items_in_batch'".
# Turning the flag off keeps the argument out of the forward call.
trainer.model_accepts_loss_kwargs = False

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


TypeError: T5ForConditionalGeneration.forward() got an unexpected keyword argument 'num_items_in_batch'

In [None]:
def generate_output(model, tokenizer, data, batch_size, **kwargs):
    """Generate one answer per example in ``data`` with ``model``.

    Each example is turned into the prompt
    ``"Please answer this question: <question> Context: <story_section>"``,
    the prompts are tokenized and generated in batches of ``batch_size``, and
    the decoded strings are returned in input order.

    Args:
        model: model exposing ``generate()``; moved to the GPU when one is available.
        tokenizer: tokenizer used to encode the prompts and decode the outputs.
        data: iterable of examples with ``'question'`` and ``'story_section'`` entries.
        batch_size: number of prompts per ``generate()`` call.
        **kwargs: forwarded to ``model.generate()``.

    Returns:
        list[str]: the generated answers (empty list for empty ``data``).
    """
    prefix = "Please answer this question: "
    context = " Context: "
    input_sentences = [prefix + sample['question'] + context + sample['story_section'] for sample in data]

    # Place the model once instead of on every batch; fall back to the model's
    # current device when no GPU is available.
    if torch.cuda.is_available():
        model = model.cuda()
    device = next(model.parameters()).device

    all_outputs = []
    for start_i in range(0, len(input_sentences), batch_size):
        batch = input_sentences[start_i:start_i + batch_size]
        inputs_encoded = tokenizer(batch, padding=True, return_tensors='pt')

        # The batch is padded, so the attention mask must be passed along;
        # otherwise the padding tokens are treated as real input.
        # Caller-supplied kwargs come last so they can still override it.
        generate_args = {'attention_mask': inputs_encoded['attention_mask'].to(device), **kwargs}
        output_ids = model.generate(input_ids=inputs_encoded['input_ids'].to(device), **generate_args)

        generated_sentences = tokenizer.batch_decode(output_ids,
                                                     skip_special_tokens=True,
                                                     clean_up_tokenization_spaces=False)
        all_outputs.extend(generated_sentences)

    return all_outputs

In [None]:
def calculate_eval_metrics(data, model, tokenizer, batch_size, prefix="", **kwargs):
    """Generate answers for ``data``, print their ROUGE scores, and return the answers.

    Args:
        data: examples to answer; must also provide an ``'answer1'`` column,
            which is used as the reference text.
        model, tokenizer, batch_size: forwarded to ``generate_output``.
        prefix: accepted for compatibility with existing calls; not used.
        **kwargs: generation arguments forwarded to ``generate_output``.

    Returns:
        The generated answers, as returned by ``generate_output``.
    """
    # Generate one answer per example.
    generated = generate_output(model, tokenizer, data, batch_size, **kwargs)

    # Score the generations against the `answer1` references with ROUGE.
    print('ROUGE: ', rouge.compute(predictions=generated, references=data['answer1']))

    # NOTE: only ROUGE is reported here. A BLEURT pass was sketched at this
    # point but is disabled; it would need a BLEURT model and tokenizer loaded.
    return generated

In [None]:
# Display the training subset to check what it contains.
train_small

In [None]:
"""
Playing with the decoder .generate() arguments like num_beams or top_p, etc.
"""

# Decoding settings forwarded unchanged to model.generate().
generate_kwargs = dict(
    num_beams=4,
    do_sample=True,
    top_k=100,
    top_p=1,
    temperature=0.2,
    no_repeat_ngram_size=2,
)

# Generate answers for the validation subset and print their ROUGE scores.
part1_val_translations = calculate_eval_metrics(
    data=val_small,
    model=model_im,
    tokenizer=tokenizer,
    batch_size=batch_size,
    **generate_kwargs,
)

In [None]:
# Display the validation subset that was just used for generation.
val_small

In [None]:
# Peek at the first five generated answers.
part1_val_translations[:5]

In [None]:
# Print out a random sample of outputs to manually review.
# The indices are drawn without replacement so every printed pair is a distinct
# example, and the SAME index is used for the reference and the generated text
# (previously the loop counter was used, which always showed the first 10 rows).
for sample_i in random.sample(range(len(part1_val_translations)),
                              k=min(10, len(part1_val_translations))):
    print('Referenced answer:  ', val_small['answer1'][sample_i])
    print('Generated answer:   ', part1_val_translations[sample_i])
    print()