<a href="https://colab.research.google.com/github/Buchiexplores/abuchi_notebooks/blob/main/T5_LCD_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is used to fine tune T5 Model on LCDs

# New Section

Set up enivronment first  -- run these commands in the terminal

conda create -n py36_test -y python=3.6.9 jupyter
conda activate py36_test
python -m ipykernel install --user --name envtest --display-name "Python3_6"


In [None]:
#run these from the terminal
#!pip install azureml-core
#python -m pip install azureml-dataset-runtime --upgrade
!pip install torch
!pip install transformers
!pip install horovod==0.22.1
!pip install pytorch_lightning 
!pip install nltk  
!pip install sentencepiece
!pip install nlp


In [3]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [4]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [6]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

## Model

We'll be using the awesome [pytorch-lightning](https://github.com/PytorchLightning/pytorch-lightning) library for training. Most of the below code is adapted from here https://github.com/huggingface/transformers/blob/master/examples/lightning_base.py

The trainer is generic and can be used for any text-2-text task. You'll just need to change the dataset. Rest of the code will stay unchanged for all the tasks.

This is the most intresting and powrfull thing about the text-2-text format. You can fine-tune the model on variety of NLP tasks by just formulating the problem in text-2-text setting. No need to change hyperparameters, learning rate, optimizer or loss function. Just plug in your dataset and you are ready to go!

In [228]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    self.hyperparams = hparams
    
    self.model = T5ForConditionalGeneration.from_pretrained(self.hyperparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(self.hyperparams.tokenizer_name_or_path)
  
  def is_logger(self):
    return self.trainer.global_rank <= 0
  
  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    #return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hyperparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hyperparams.learning_rate, eps=self.hyperparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  # def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
  #   if self.trainer.use_tpu:
  #     xm.optimizer_step(optimizer)
  #   else:
  #     optimizer.step()
  #   optimizer.zero_grad()
  #   self.lr_scheduler.step()

  def optimizer_step(self, epoch=None, batch_idx=None, optimizer=None, optimizer_idx=None, optimizer_closure=None, on_tpu=None, using_native_amp=None, using_lbfgs=None):
    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict


  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hyperparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hyperparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hyperparams.train_batch_size * max(1, self.hyperparams.n_gpu)))
        // self.hyperparams.gradient_accumulation_steps
        * float(self.hyperparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hyperparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hyperparams)
    return DataLoader(val_dataset, batch_size=self.hyperparams.eval_batch_size, num_workers=4)

In [161]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

Let's define the hyperparameters and other arguments. You can overide this `dict` for specific task as needed. While in most of cases you'll only need to change the `data_dir`and `output_dir`.

Here the batch size is 8 and gradient_accumulation_steps are 16 so the effective batch size is 128

In [207]:
args_dict = dict(
    data_dir="/content/drive/MyDrive/wellington/aclllmdbsmall/", # path for data files
    output_dir="/content/drive/MyDrive/wellington/aclllmdbsmall/", # path to save the checkpoints
    #data_dir="C:\_JSE_Work\Machine Learning\T5\T5 LCD Training\aclImdb", # path for data files
    #output_dir="C:\_JSE_Work\Machine Learning\T5\T5 LCD Training\CheckPoints", # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=1,
    eval_batch_size=1,
    num_train_epochs=2,
    gradient_accumulation_steps=16,
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

## IMDB review classification

In [None]:
# -- manually downloaded info - had data and folder set up in the zip file
#!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
#!tar -xvf aclImdb_v1.tar.gz

In [None]:
from azureml.core import Workspace, Datastore
ws = Workspace.from_config()

# get the name of defult Datastore associated with the workspace.
default_dsname = ws.get_default_datastore().name
default_ds = ws.get_default_datastore()
print('default Datastore = ', default_dsname)

In [None]:
# azureml-core of version 1.0.72 or higher is required
from azureml.core import Workspace, Dataset

subscription_id = 'f1afabdd-76ea-42a1-96bb-c117e3fa902c'
resource_group = 'TWG_ML2'
workspace_name = 'ML_SalaryTest'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='imdb_data_T5')
dataset.download(target_path='.', overwrite=True)
#print(dataset)

In [124]:
train_pos_files = glob.glob("/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/*.txt")
train_neg_files = glob.glob("/content/drive/MyDrive/wellington/aclllmdbsmall/train/neg/*.txt")

#train_pos_files = glob.glob('C:/_JSE_Work/Machine Learning/T5/T5 LCD Training/aclImdb/train/pos/*.txt')
#!dir "C:\_JSE_Work\Machine Learning\T5\T5 LCD Training\aclImdb\train\pos\*.txt"
#train_neg_files = glob.glob('C:/_JSE_Work/Machine Learning/T5/T5 LCD Training/aclImdb/train/neg/*.txt')
#print(train_neg_files)

In [125]:
#len(dataset), len(dataset)
len(train_pos_files), len(train_neg_files)


(201, 201)

We will use 2000 samples from the train set for validation. Let's choose 1000 postive reviews and 1000 negative reviews for validation and save them in the val directory

In [None]:
# created these manually -- C:\_JSE_Work\Machine Learning\T5\T5 LCD Training\aclImdb\val
#!mkdir aclImdb/val aclImdb/val/pos aclImdb/val/neg

In [126]:
random.shuffle(train_pos_files)
random.shuffle(train_neg_files)

val_pos_files = train_pos_files[:1000]
val_neg_files = train_neg_files[:1000]

print(len(val_pos_files))
print(len(val_neg_files))

201
201


In [120]:
import shutil

In [127]:
for f in val_pos_files:
  shutil.copy(f,  '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos')
for f in val_neg_files:
  shutil.copy(f,  '/content/drive/MyDrive/wellington/aclllmdbsmall/val/neg')

### Prepare Dataset

In [164]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

In [163]:
# removed the </s> as the newer version already creates this
ids_neg = tokenizer.encode('negative')
ids_pos = tokenizer.encode('positive')
len(ids_neg), len(ids_pos)

(2, 2)

All the examples are converted in the text-2-text format as shown in the paper. However I didn't use any task prefix here. The examples are encoded as follows,
if the review is positive then the target is 'positive' else 'negative'

**input**:  I went to see this
movie with my husband, and we both
thought the acting was terrible!"

**target**: negative

**input**:  Despite what others say,
I thought this movie was funny.

**target**: positive

In [229]:
class ImdbDataset(Dataset):
  def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
    self.pos_file_path = os.path.join(data_dir, type_path, 'pos')
    self.neg_file_path = os.path.join(data_dir, type_path, 'neg')
    # print(self.pos_file_path)
    # print(self.neg_file_path)
    
    self.pos_files = glob.glob("%s/*.txt" % self.pos_file_path)
    self.neg_files = glob.glob("%s/*.txt" % self.neg_file_path)
    print(self.pos_files)
    print(self.neg_files)
    # print(len(self.pos_files))
    # print(len(self.neg_files))
    
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()
  
  def __len__(self):
    return len(self.inputs)
    
    
  
  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
    target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
  
  def _build(self):
    self._buil_examples_from_files(self.pos_files, 'positive')
    self._buil_examples_from_files(self.neg_files, 'negative')
  
  def _buil_examples_from_files(self, files, sentiment):
    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

    for path in files:
      with open(path, 'r',encoding='utf-8' ) as f:
        text = f.read()
      
      line = text.strip()
      line = REPLACE_NO_SPACE.sub("", line) 
      line = REPLACE_WITH_SPACE.sub("", line)
      #line = line + ' </s>'

      target = sentiment  # + " </s>" removed as not needed?

       # tokenize inputs
      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [line], max_length=self.max_len, padding='max_length', return_tensors="pt"
      )
       # tokenize targets
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=2, padding='max_length', return_tensors="pt"
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [209]:
dataset = ImdbDataset(tokenizer, '/content/drive/MyDrive/wellington/aclllmdbsmall/', 'val',  max_len=512)
len(dataset)




['/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/122_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/110_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/16_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/27_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/146_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/121_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/130_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/177_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/187_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/42_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/102_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/77_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/125_7.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/149_9.txt', '/content/drive/MyDrive/wellingto

402

In [231]:
data = dataset[28]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

This was a excellent movie I deal with a child who I am raising that has FAE and watching this movie was more than word can explain I also purchased the book and it was great I would like to have a copy for my own use and so I can have my sons teachers watch it alsoI would like to know if anyone could sell me a copy of this movie let me use it for a time or refer me to someone where I could purchase it Thank You Myra I would recommend this movie to anyone who deal with childrenadults with special abilitys This movie should be shown again on TV The team of doctors that have been tracking my son would also like to have a copy His special Ed teacher has also asked me to try to locate a copy that he can have or that he can borrowThank You</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>

In [244]:
args_dict.update({'data_dir': '/content/drive/MyDrive/wellington/aclllmdbsmall/', 'output_dir': '/content/drive/MyDrive/wellington/t5_imdb_sentiment', 'num_train_epochs':2})
args = argparse.Namespace(**args_dict)
print(args_dict)

checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=True,
    callbacks=[LoggingCallback()],
)

{'data_dir': '/content/drive/MyDrive/wellington/aclllmdbsmall/', 'output_dir': '/content/drive/MyDrive/wellington/t5_imdb_sentiment', 'model_name_or_path': 't5-base', 'tokenizer_name_or_path': 't5-base', 'max_seq_length': 512, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 1, 'eval_batch_size': 1, 'num_train_epochs': 2, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}


In [245]:
def get_dataset(tokenizer, type_path, args):
  return ImdbDataset(tokenizer=tokenizer, data_dir=args.data_dir, type_path=type_path,  max_len=args.max_seq_length)

In [246]:
model = T5FineTuner(args)

In [247]:
trainer = pl.Trainer(**train_params)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [252]:
import torch
torch.cuda.empty_cache()
import gc
gc.collect()

48999

In [253]:
torch.cuda.memory_summary(device=None, abbreviated=False)



In [254]:
trainer.fit(model)

  "When using `Trainer(accumulate_grad_batches != 1)` and overriding"
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
891.614   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

['/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/122_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/110_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/16_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/27_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/146_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/121_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/130_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/177_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/187_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/42_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/102_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/77_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/125_7.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/val/pos/149_9.txt', '/content/drive/MyDrive/wellingto

  cpuset_checked))


['/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/64_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/148_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/126_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/158_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/36_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/40_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/117_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/59_10.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/38_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/190_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/139_9.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/14_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/143_8.txt', '/content/drive/MyDrive/wellington/aclllmdbsmall/train/pos/98_9.txt', '/conten

Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]