In [1]:
!pip install -q -U transformers[torch]
!pip install -q -U datasets
!pip install -q -U evaluate
!pip install -q -U tokenizers
!pip install -q -U bitsandbytes
!pip install -q rouge_score
!pip install -q -U peft
!pip install -q -U accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m45.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import re
import random
import numpy as np
from scipy.special import softmax
import pprint

import bitsandbytes as bnb

import torch
import transformers
import evaluate
from datasets import Dataset, load_dataset

# For from-scratch T5 model
from transformers import T5TokenizerFast, T5Config, T5ForConditionalGeneration

# For pre-trained T5 model
from transformers import T5Tokenizer, T5ForConditionalGeneration  # this won't import twice, just noting here what's for each model

# For all T5 models
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, Trainer, TrainingArguments

# For BLEURT (to load a trained model for evaluation)
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM


# For style classifier model (also for evaluating the seq2seq model output)
from transformers import BertTokenizer, BertForSequenceClassification, BitsAndBytesConfig,pipeline
from transformers import TrainingArguments, Trainer

import pandas as pd
from tqdm import tqdm

### Local Data Loading Load Data

In [3]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path to data save in Drive
train = 'FairytaleQA_train.csv'
valid = 'FairytaleQA_valid.csv'

test = 'FairytaleQA_test.csv'
path = '/content/drive/MyDrive/266/FinalProject/data/'
# path = 'drive/MyDrive/DataSci_266/266_Danielle_Dylan_final_project/data/'

In [5]:
train_filepath = path+train
valid_filepath = path+valid
test_filepath = path+test


In [6]:
train = load_dataset("csv",data_files=train_filepath,streaming=True)['train']
val = load_dataset("csv",data_files=valid_filepath,streaming=True)['train']
test = load_dataset("csv",data_files=test_filepath,streaming=True)['train']

In [7]:
train

IterableDataset({
    features: Unknown,
    num_shards: 1
})

In [8]:
from datasets import load_dataset, DatasetDict
# # train_ds = Dataset.from_pandas(train, split="train")
# # test_ds = Dataset.from_pandas(test, split="test")
# # valid_ds = Dataset.from_pandas(valid, split="valid")

# Combine into a single DatasetDict
ds = DatasetDict({
    "train": train,
    "test": test,
    "validation": val,
})

train = ds['train'].shuffle()
val = ds['validation'].shuffle()
test = ds['test'].shuffle()

In [9]:
VOCAB_SIZE = 15000
MAX_SEQUENCE_LENGTH = 512

In [10]:
# We prefix our tasks with "answer the question"
prefix = "Please answer this question: "
context = " Context: "

# Define the preprocessing function

def preprocess_function(data):
   """Add prefix to the sentences, tokenize the text, and set the labels"""
   # The "inputs" are the tokenized answer:

   data['full_input'] = [context + story + prefix + question for question,story in zip(data["question"],data['story_section'])]

   input_encoded = tokenizer.batch_encode_plus(data['full_input'],
                      max_length=MAX_SEQUENCE_LENGTH,
                      truncation=True,
                      padding='max_length',
                            return_tensors='pt')

   # The "labels" are the tokenized outputs:
   answers_encoded = tokenizer.batch_encode_plus(data["answer1"],
                      max_length=MAX_SEQUENCE_LENGTH,
                      truncation=True,
                      padding='max_length',
                      return_tensors='pt')

   print("Preprocessing Finished")

   return {'input_ids': input_encoded['input_ids'],
           'labels': answers_encoded['input_ids']}

# Train Individual T-5 Lora Models on Implicit

In [11]:
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import torch
from transformers import AutoTokenizer

In [12]:
num_train_examples_im = 2_166

In [13]:
# Configure bitsandbytes for 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [14]:
# rouge = evaluate.load('rouge')

# def compute_metrics_rouge(p):
#   predictions, labels = p
#   return rouge.compute(predictions=predictions, references=labels)


In [15]:
model_checkpoint = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

In [17]:
# ## EXPERIMENT 1
# # set training args
# lr = 1e-5
# batch_size = 16
# num_epochs = 3
# # set LORA specific hyperparameters
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

In [18]:
# ## EXPERIMENT 2
# # set training args
# lr = 1e-5
# batch_size = 16
# num_epochs = 9
# # set LORA specific hyperparameters
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

In [46]:
# ## EXPERIMENT 3
# # set training args
# lr = 1e-2
# batch_size = 16
# num_epochs = 5
# # set LORA specific hyperparameters
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

In [47]:
# ## EXPERIMENT 4
# # set training args
# lr = 1e-2
# batch_size = 16
# num_epochs = 30
# # set LORA specific hyperparameters
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

In [16]:
## EXPERIMENT 5
# set training args
lr = 1e-3
batch_size = 16
num_epochs = 10
# set LORA specific hyperparameters
r = 8
lora_alpha = 32
lora_dropout = 0.1

In [17]:
model_im = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint,quantization_config=quantization_config)

# for name, param in model_im.named_parameters():
#     param.requires_grad = False

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [18]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout
)

In [19]:
model_im = get_peft_model(model_im, peft_config)
model_im.print_trainable_parameters()

trainable params: 344,064 || all params: 77,305,216 || trainable%: 0.4451


In [20]:
_= model_im.to("cuda")

In [21]:
# Map the preprocessing function across our dataset
train_im = train.filter(lambda x: x['ex_or_im'] == 'implicit')
val_im = val.filter(lambda x: x['ex_or_im'] == 'implicit')
test_im = test.filter(lambda x: x['ex_or_im'] == 'implicit')

train_tokenized_im = train_im.map(preprocess_function, batched=True)
val_tokenized_im = val_im.map(preprocess_function, batched=True)
test_tokenized_im = test_im.map(preprocess_function, batched=True)


In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir="t5-small-lora-text-generation",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    load_best_model_at_end=True,
    max_steps=int(num_epochs * num_train_examples_im / batch_size),
    report_to='none'
)

### Experiment 1
Losses start very high(50), and decreasing slowly (0.1)

In [None]:
%%time

# lr = 1e-5
# batch_size = 16
# num_epochs = 3
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

trainer = Seq2SeqTrainer(
    model=model_im,
    args=training_args,
    train_dataset=train_tokenized_im,
    eval_dataset=val_tokenized_im,
    #compute_metrics=compute_metrics_rouge,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Preprocessing Finished


Epoch,Training Loss,Validation Loss
0,50.4844,50.71875
1,45.7031,48.875
2,44.0438,48.0625


Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
CPU times: user 5min 23s, sys: 1min 36s, total: 7min
Wall time: 7min 55s


TrainOutput(global_step=406, training_loss=48.14354987684729, metrics={'train_runtime': 474.2534, 'train_samples_per_second': 13.697, 'train_steps_per_second': 0.856, 'total_flos': 1210671811264512.0, 'train_loss': 48.14354987684729, 'epoch': 2.3300492610837438})

In [None]:
model_im_checkpoint_filepath = '/content/drive/MyDrive/266/FinalProject/checkpoints/t5_model_im_Experiments1'
model_im.save_pretrained(model_im_checkpoint_filepath, from_pt=True)

### Experiment 2
Based on experiment1: try increasing the epochs to see how much it can be decreased; all else equal

Started high(50) ended(25)

In [24]:
%%time

# lr = 1e-5
# batch_size = 16
# num_epochs = 9
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

trainer = Seq2SeqTrainer(
    model=model_im,
    args=training_args,
    train_dataset=train_tokenized_im,
    eval_dataset=val_tokenized_im,
    #compute_metrics=compute_metrics_rouge,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Preprocessing Finished


Epoch,Training Loss,Validation Loss
0,49.4719,50.375
1,41.7281,45.875
2,37.5938,40.1875
3,33.7375,35.65625
4,30.4391,32.15625
5,28.5297,29.3125
6,26.375,27.109375
7,25.5125,25.671875
8,25.2094,25.203125


Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
CPU times: user 17min 9s, sys: 5min 59s, total: 23min 9s
Wall time: 23min 36s


TrainOutput(global_step=1218, training_loss=35.183228140394085, metrics={'train_runtime': 1415.5981, 'train_samples_per_second': 13.767, 'train_steps_per_second': 0.86, 'total_flos': 3628276484407296.0, 'train_loss': 35.183228140394085, 'epoch': 8.106732348111658})

In [25]:
model_im_checkpoint_filepath = '/content/drive/MyDrive/266/FinalProject/checkpoints/t5_model_im_Experiments2'
model_im.save_pretrained(model_im_checkpoint_filepath, from_pt=True)

### Experiment 3


In [54]:
%%time

# lr = 1e-2
# batch_size = 16
# num_epochs = 5
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

trainer = Seq2SeqTrainer(
    model=model_im,
    args=training_args,
    train_dataset=train_tokenized_im,
    eval_dataset=val_tokenized_im,
    #compute_metrics=compute_metrics_rouge,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Preprocessing Finished


Epoch,Training Loss,Validation Loss
0,4.7762,4.679688
1,4.6992,4.628906
2,4.6715,4.597656
3,4.6383,4.566406
4,4.6312,4.554688


Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
CPU times: user 9min 17s, sys: 3min 35s, total: 12min 52s
Wall time: 12min 54s


TrainOutput(global_step=676, training_loss=4.8682391826923075, metrics={'train_runtime': 774.1157, 'train_samples_per_second': 13.972, 'train_steps_per_second': 0.873, 'total_flos': 2014545929306112.0, 'train_loss': 4.8682391826923075, 'epoch': 4.195266272189349})

In [55]:
model_im_checkpoint_filepath = '/content/drive/MyDrive/266/FinalProject/checkpoints/t5_model_im_Experiments3'
model_im.save_pretrained(model_im_checkpoint_filepath, from_pt=True)

## Experiment 4
FAILED; experiment stopped for lack of promise.
Shows us that after 5 epochs the learning rate of 1e-2 is too big because the losses start to increase

In [45]:
# lr = 1e-2
# batch_size = 16
# num_epochs = 30
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1

trainer = Seq2SeqTrainer(
    model=model_im,
    args=training_args,
    train_dataset=train_tokenized_im,
    eval_dataset=val_tokenized_im,
    #compute_metrics=compute_metrics_rouge,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Preprocessing Finished


Epoch,Training Loss,Validation Loss
0,4.857,4.789062
1,4.8547,4.820312
2,4.8492,4.792969
3,4.9715,4.871094
4,4.8527,4.800781
5,4.893,4.816406
6,4.8645,4.878906
7,4.9539,4.875
8,4.8895,5.027344


Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished


KeyboardInterrupt: 

## Experiment 5

In [23]:
# ## EXPERIMENT 5
# # set training args
# lr = 1e-3
# batch_size = 16
# num_epochs = 10
# # set LORA specific hyperparameters
# r = 8
# lora_alpha = 32
# lora_dropout = 0.1


trainer = Seq2SeqTrainer(
    model=model_im,
    args=training_args,
    train_dataset=train_tokenized_im,
    eval_dataset=val_tokenized_im,
    #compute_metrics=compute_metrics_rouge,
)

trainer.train()

No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Preprocessing Finished


Epoch,Training Loss,Validation Loss
0,4.5113,4.433594
1,4.4816,4.410156
2,4.4918,4.453125
3,4.4738,4.429688
4,4.4684,4.40625
5,4.448,4.40625
6,4.4566,4.410156
7,4.4484,4.410156
8,4.4609,4.414062
9,4.4566,4.414062


Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished
Preprocessing Finished


TrainOutput(global_step=1353, training_loss=4.872990576496674, metrics={'train_runtime': 1556.0338, 'train_samples_per_second': 13.912, 'train_steps_per_second': 0.87, 'total_flos': 4030213543428096.0, 'train_loss': 4.872990576496674, 'epoch': 9.095343680709535})

In [24]:
model_im_checkpoint_filepath = '/content/drive/MyDrive/266/FinalProject/checkpoints/t5_model_im_Experiments5'
model_im.save_pretrained(model_im_checkpoint_filepath, from_pt=True)

In [25]:
def clear_gpu_memory():
    import gc
    import torch

    gc.collect()
    torch.cuda.empty_cache()
    print("Cleared GPU memory.")

# Usage
del model_im, tokenizer, trainer
clear_gpu_memory()


Cleared GPU memory.


In [None]:
# Run this line only if you need to reload the model you trained earlier
model_im = AutoModelForSeq2SeqLM.from_pretrained(model_im_checkpoint_filepath)

In [None]:
def generate_output(model, tokenizer, data, **kwargs):

    all_outputs = []
    prefix = "Please answer this question: "
    context = " Context: "
    input_sentences = [prefix + sample['question'] + context + sample['story_section'] for sample in data]

    for i in range(int(len(input_sentences)) + 1):
        if i >= len(input_sentences):
            break

        inputs_encoded = tokenizer(input_sentences[i],truncation=True, padding=True, return_tensors='pt')
        output_ids = model.cuda().generate(input_ids=inputs_encoded['input_ids'].cuda(), **kwargs)
        generated_sentences = tokenizer.batch_decode(output_ids,
                                                     skip_special_tokens=True,
                                                     clean_up_tokenization_spaces=False)
        all_outputs.extend(generated_sentences)

    return all_outputs

In [None]:
def calculate_eval_metrics(data, model, tokenizer, prefix="", **kwargs):

    # Translate original texts
    answers = generate_output(model, tokenizer, data, **kwargs)

    # Calculate ROUGE scores
    rouge_results = rouge.compute(predictions=answers, references=data['answer1'])
    print('ROUGE: ', rouge_results)

    # Calculate BLEURT scores
    bleurt_scores = []
    for i in range(int(len(answers)) + 1):
        if i >= len(answers):
            break

        with torch.no_grad():
            scores = bleurt_model(**bleurt_tokenizer(data['answer1'][i],
                                                     answers[i],
                                                     truncation=True,
                                                     max_length=MAX_SEQUENCE_LENGTH,
                                                     padding='max_length',
                                                     return_tensors='pt'))[0].squeeze().numpy()
            if scores.shape:
                bleurt_scores.extend(scores)
            else:  # Happens when there was only one example in the last batch
                bleurt_scores.append(float(scores))

    print('BLEURT: ', np.mean(bleurt_scores))

    return answers

In [None]:
"""
Playing with the decoder .generate() arguments like num_beams or top_p, etc.
"""

generate_kwargs = {

    'num_beams': 4,
    'do_sample':True,
    'top_k':100,
    'top_p':1,
    'temperature':0.2,
    'no_repeat_ngram_size':2
}

val_answers_im = calculate_eval_metrics(
    val_small.select(range(100)),
    model_im,
    tokenizer,
    batch_size,
    **generate_kwargs
)

In [None]:
# Print out a sample of outputs to manually review
for i in range(10):
    sample_i = random.choice(range(len(val_answers_im)))
    print('Referenced answer:  ', val_small['answer1'][i])
    print('Generated answer:   ', val_answers_im[i])
    print()

In [None]:
def clear_gpu_memory():
    import gc
    import torch

    gc.collect()
    torch.cuda.empty_cache()
    print("Cleared GPU memory.")

# Usage
del model_im, tokenizer, trainer
clear_gpu_memory()
