In [None]:
!nvidia-smi

Wed Feb 16 19:11:39 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install transformers==2.9.0
!pip install pytorch_lightning==0.7.5

Collecting transformers==2.9.0
  Downloading transformers-2.9.0-py3-none-any.whl (635 kB)
[?25l[K     |▌                               | 10 kB 23.4 MB/s eta 0:00:01[K     |█                               | 20 kB 12.9 MB/s eta 0:00:01[K     |█▌                              | 30 kB 16.5 MB/s eta 0:00:01[K     |██                              | 40 kB 15.3 MB/s eta 0:00:01[K     |██▋                             | 51 kB 8.7 MB/s eta 0:00:01[K     |███                             | 61 kB 8.7 MB/s eta 0:00:01[K     |███▋                            | 71 kB 9.1 MB/s eta 0:00:01[K     |████▏                           | 81 kB 10.1 MB/s eta 0:00:01[K     |████▋                           | 92 kB 10.6 MB/s eta 0:00:01[K     |█████▏                          | 102 kB 8.6 MB/s eta 0:00:01[K     |█████▊                          | 112 kB 8.6 MB/s eta 0:00:01[K     |██████▏                         | 122 kB 8.6 MB/s eta 0:00:01[K     |██████▊                         | 133 kB 8.6 

In [None]:
!pip install sentencepiece



# T5 fine-tuning
This notebook shows how to fine-tune the T5 model with Hugging Face's Transformers to solve different NLP tasks using the text-2-text approach proposed in the T5 paper.

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

nltk.download('punkt')

INFO:transformers.file_utils:PyTorch version 1.10.0+cu111 available.
INFO:transformers.file_utils:TensorFlow version 2.8.0 available.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
def set_seed(seed):
  """Seed Python's `random`, NumPy and PyTorch (CPU, plus every CUDA device if present)."""
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)

  cuda_present = torch.cuda.is_available()
  if cuda_present:
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Model
We'll be using the awesome pytorch-lightning library for training.

The trainer is generic and can be used for any text-2-text task. You'll just need to change the dataset. The rest of the code stays unchanged for all tasks.

This is the most interesting and powerful thing about the text-2-text format. You can fine-tune the model on a variety of NLP tasks just by formulating the problem in a text-2-text setting.

In [None]:
class T5FineTuner(pl.LightningModule):
  """Lightning module that fine-tunes a pretrained T5 model on a text-to-text task.

  `hparams` is an attribute-style namespace. The fields read by this class are
  `model_name_or_path`, `tokenizer_name_or_path`, `weight_decay`,
  `learning_rate`, `adam_epsilon`, `warmup_steps`, `train_batch_size`,
  `eval_batch_size`, `n_gpu`, `gradient_accumulation_steps` and
  `num_train_epochs`. Datasets come from the notebook-level `get_dataset`.
  """

  def __init__(self, hparams):
    """Store the hyperparameters and load the pretrained model and tokenizer."""
    super(T5FineTuner, self).__init__()
    self.hparams = hparams

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)

  def is_logger(self):
    """Return True when the trainer's `proc_rank` is 0 or lower."""
    return self.trainer.proc_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    """Pass all arguments to the wrapped T5 model and return its output unchanged."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        lm_labels=lm_labels,
    )

  def _step(self, batch):
    """Run one forward pass on `batch` and return the first model output (the loss).

    `batch` must contain the keys `source_ids`, `source_mask`, `target_ids`
    and `target_mask`.
    """
    # Clone first: the padding positions are overwritten below, and doing that on
    # batch["target_ids"] directly would silently modify the caller's tensor.
    lm_labels = batch["target_ids"].clone()
    # -100 matches the default `ignore_index` of PyTorch's cross-entropy loss, so
    # padded positions are presumably excluded from the loss — confirm against the
    # installed transformers version.
    lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    """Compute the training loss and expose it under `train_loss` for logging."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}

  def training_epoch_end(self, outputs):
    """Average the per-step `loss` values of the epoch into `avg_train_loss`."""
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Compute the loss of one validation batch."""
    loss = self._step(batch)
    return {"val_loss": loss}

  def validation_epoch_end(self, outputs):
    """Average the per-batch `val_loss` values; logged under the key `val_loss`."""
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Prepare the AdamW optimizer.

    Parameters whose name contains "bias" or "LayerNorm.weight" get no weight
    decay; all others use `hparams.weight_decay`. The optimizer is also kept
    on `self.opt` because `train_dataloader` builds the LR schedule from it.
    """

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    """Apply one optimizer update, clear the gradients and advance the LR schedule."""
    if self.trainer.use_tpu:
      # NOTE(review): `xm` is not imported in any visible cell of this notebook; a TPU
      # run would need `import torch_xla.core.xla_model as xm` beforehand — confirm.
      xm.optimizer_step(optimizer)
    else:
      optimizer.step()
    optimizer.zero_grad()
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the progress-bar entries: formatted average loss and last learning rate."""
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training loader and, as a side effect, the LR scheduler.

    Reads `self.opt`, so `configure_optimizers` must have run before this
    method is called.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total optimizer updates: samples // (batch size * GPUs) // accumulation steps * epochs.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the validation loader from the "val" split returned by `get_dataset`."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Callback that reports the trainer's callback metrics through the module logger."""

  @staticmethod
  def _metric_lines(metrics):
    """Lazily yield one "key = value" line per metric in sorted key order.

    The "log" and "progress_bar" entries are skipped.
    """
    for name in sorted(metrics):
      if name in ["log", "progress_bar"]:
        continue
      yield "{} = {}\n".format(name, str(metrics[name]))

  def on_validation_end(self, trainer, pl_module):
    """Log every callback metric once validation has finished."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return
    for line in self._metric_lines(trainer.callback_metrics):
      logger.info(line)

  def on_test_end(self, trainer, pl_module):
    """Log every callback metric and write the same lines to test_results.txt."""
    logger.info("***** Test results *****")

    if not pl_module.is_logger():
      return

    metrics = trainer.callback_metrics
    # The results file lives in the module's configured output directory.
    results_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(results_path, "w") as writer:
      for line in self._metric_lines(metrics):
        logger.info(line)
        writer.write(line)

Let's define the hyperparameters and other arguments. You can override this dict for a specific task as needed. In most cases you'll only need to change `data_dir` and `output_dir`.

In [None]:
# Default hyperparameters and run settings; individual tasks override entries as needed.
args_dict = {
    "output_dir": "",  # directory where checkpoints are saved
    "model_name_or_path": 't5-base',
    "tokenizer_name_or_path": 't5-base',
    "max_seq_length": 512,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 2,
    "eval_batch_size": 2,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 4,
    "n_gpu": 1,
    "early_stop_callback": False,
    # 16-bit training: install apex first, then set this to True
    "fp_16": False,
    # apex optimisation level, see https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    "opt_level": 'O1',
    # with 16-bit training pick a sensible value here; 0.5 is a good default
    "max_grad_norm": 1.0,
    "seed": 42,
}

# iSarcasm Classification

In [None]:
# Original training tweets: keep the text and label columns, drop rows without text.
train = (
    pd.read_csv('/content/Train_Dataset.csv')[["tweet", "sarcastic"]]
    .dropna(subset=['tweet'])
)

# Additional training rows from data4.csv, with the same missing-text filter.
train_mutant = pd.read_csv('/content/data4.csv').dropna(subset=['tweet'])

# The test file names its text column 'text'; rename it to match the training frames.
test = pd.read_csv('/content/task_A_En_test.csv').rename(columns={'text': 'tweet'})

In [None]:
# Append the extra rows, shuffle every row (frac=1) and move the old index into a column.
train = (
    pd.concat([train, train_mutant])
    .sample(frac = 1)
    .reset_index()
)

In [None]:
train.head()

Unnamed: 0,index,tweet,sarcastic
0,499,It was the,1
1,2878,Their website doesn't seem to mention Axa nor ...,0
2,393,Imagine not liking football 🤣,1
3,903,play golf is just class watch,0
4,1135,I want one of those insects. You know,0


In [None]:
test.head()

Unnamed: 0,tweet,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0
1,Pinball!,0
2,So the Scottish Government want people to get ...,1
3,villainous pro tip : change the device name on...,0
4,I would date any of these men 🥺,0


In [None]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

INFO:transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model not found in cache or force_download set to True, downloading to /root/.cache/torch/transformers/tmp64qsvbdq


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

INFO:transformers.file_utils:storing https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model in cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f
INFO:transformers.file_utils:creating metadata file for /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f
INFO:transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model from cache at /root/.cache/torch/transformers/68f1b8dbca4350743bb54b8c4169fd38cbabaad564f85a9239337a8d0342af9f.9995af32582a1a7062cb3173c118cb7b4636fa03feb967340f20fc37406f021f


In [None]:
ids_nonsarcastic = tokenizer.encode('negative </s>')
ids_sarcastic = tokenizer.encode('positive </s>')
len(ids_sarcastic), len(ids_nonsarcastic)

(2, 2)

All the examples are converted to the text-2-text format as shown in the paper. However, I didn't use any task prefix here. The examples are encoded as follows: if the tweet is sarcastic the target is 'positive', otherwise it is 'negative'.

For T5 the max input length is 512, and we can choose the max length for the target sequence depending on our dataset. The T5Tokenizer encodes both 'positive' and 'negative' as a single id, so I chose a max target length of 2: one for the label token plus one for the </s> token.

In [None]:
from copy import copy

class SarcasmDataset(Dataset):
  """Pre-tokenised (tweet, label) pairs for text-to-text sarcasm classification.

  Every row of `df` is encoded once at construction time. The source text is
  the stripped tweet followed by " </s>"; the target text is "positive </s>"
  when `sarcastic == 1` and "negative </s>" for any other value.

  Args:
    tokenizer: object exposing `batch_encode_plus(texts, max_length=...,
      pad_to_max_length=..., return_tensors=...)` whose result provides
      "input_ids" and "attention_mask" tensors.
    df: DataFrame with a string column `tweet` and a column `sarcastic`.
    max_len: `max_length` passed to the tokenizer for the source text.
    target_max_len: `max_length` passed to the tokenizer for the target text.
      Defaults to 2, the value that used to be hard-coded.
  """

  def __init__(self, tokenizer, df, max_len=512, target_max_len=2):
    self.max_len = max_len
    self.target_max_len = target_max_len
    self.df = df
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self._build()

  def __len__(self):
    """Number of encoded examples."""
    return len(self.inputs)

  def __getitem__(self, index):
    """Return the ids and attention masks of example `index` as a dict of tensors."""
    source = self.inputs[index]
    target = self.targets[index]

    # squeeze(0) removes only the leading batch-of-one dimension. A bare squeeze()
    # would also collapse a length-1 sequence into a 0-d tensor.
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0),
    }

  def _build(self):
    """Tokenise every row of `self.df` into `self.inputs` / `self.targets`."""
    # Iterating the two columns directly avoids building a Series per row (iterrows).
    for tweet, label in zip(self.df['tweet'], self.df['sarcastic']):
      line = tweet.strip() + ' </s>'
      target = ("positive" if label == 1 else "negative") + " </s>"

      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [line], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
      )

      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=self.target_max_len, pad_to_max_length=True, return_tensors="pt"
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [None]:
train_dataset = SarcasmDataset(tokenizer, train, max_len=200)
test_dataset = SarcasmDataset(tokenizer, test, max_len=200)
len(train_dataset), len(test_dataset)

(6934, 1400)

In [None]:
sample = train_dataset[17]
print(tokenizer.decode(sample['source_ids']))
print(tokenizer.decode(sample['target_ids']))

acu candlelight send off got me all emo :’((
negative


In [None]:
def get_dataset(tokenizer, type_path, args):
  """Return the pre-built dataset for a split name.

  "test" and "val" both map to `test_dataset`; every other value maps to
  `train_dataset`. `tokenizer` and `args` are accepted but not used.

  NOTE(review): since "val" resolves to the test set, any metric monitored on
  validation data (e.g. `val_loss`) is computed on the test set.
  """
  return test_dataset if type_path in ("test", "val") else train_dataset

## Train

In [None]:
!mkdir -p t5_isarcasm

In [None]:
# Task-specific overrides: where checkpoints go and how many epochs to train.
args_dict.update({'output_dir': '/content/t5_isarcasm', 'num_train_epochs':5})
args = argparse.Namespace(**args_dict)

# Keep up to 5 checkpoints, ranked by the lowest logged `val_loss`.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=5
)

# Keyword arguments for pl.Trainer, all derived from the hyperparameters above.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    # Taken from the hyperparameters (default False) so that the `early_stop_callback`
    # entry in args_dict actually takes effect; it used to be hard-coded here.
    early_stop_callback=args.early_stop_callback,
    precision= 16 if args.fp_16 else 32,
    amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)



# Model

In [None]:
model = T5FineTuner(args)

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json from cache at /root/.cache/torch/transformers/40578967d1f029acb6162b36db9d8b4307063e885990ccd297c2c5be1cf1b3d7.2995d650f5eba18c8baa4146e210d32d56165e90d374281741fc78b872cd6c9b
INFO:transformers.configuration_utils:Model config T5Config {
  "architectures": [
    "T5WithLMHeadModel"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
   

In [None]:
trainer = pl.Trainer(**train_params)

INFO:lightning:GPU available: True, used: True
INFO:lightning:CUDA_VISIBLE_DEVICES: [0]


In [None]:
trainer.fit(model)

INFO:lightning:
    | Name                                                                  | Type                       | Params
-----------------------------------------------------------------------------------------------------------------
0   | model                                                                 | T5ForConditionalGeneration | 222 M 
1   | model.shared                                                          | Embedding                  | 24 M  
2   | model.encoder                                                         | T5Stack                    | 109 M 
3   | model.encoder.block                                                   | ModuleList                 | 84 M  
4   | model.encoder.block.0                                                 | T5Block                    | 7 M   
5   | model.encoder.block.0.layer                                           | ModuleList                 | 7 M   
6   | model.encoder.block.0.layer.0                                     

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_val_loss = tensor(0.1959, device='cuda:0')

INFO:__main__:loss = tensor(0.4729, device='cuda:0')

INFO:__main__:train_loss = tensor(0.4729, device='cuda:0')

INFO:__main__:val_loss = tensor(0.1959, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.3326, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.2861, device='cuda:0')

INFO:__main__:epoch = 0

INFO:__main__:loss = tensor(0.0020, device='cuda:0')

INFO:__main__:train_loss = tensor(0.0020, device='cuda:0')

INFO:__main__:val_loss = tensor(0.2861, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.1656, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.6764, device='cuda:0')

INFO:__main__:epoch = 1

INFO:__main__:loss = tensor(4.7054e-05, device='cuda:0')

INFO:__main__:train_loss = tensor(4.7054e-05, device='cuda:0')

INFO:__main__:val_loss = tensor(0.6764, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0476, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(1.0236, device='cuda:0')

INFO:__main__:epoch = 2

INFO:__main__:loss = tensor(1.0431e-06, device='cuda:0')

INFO:__main__:train_loss = tensor(1.0431e-06, device='cuda:0')

INFO:__main__:val_loss = tensor(1.0236, device='cuda:0')



Validating: 0it [00:00, ?it/s]

INFO:__main__:***** Validation results *****
INFO:__main__:avg_train_loss = tensor(0.0141, device='cuda:0')

INFO:__main__:avg_val_loss = tensor(0.9237, device='cuda:0')

INFO:__main__:epoch = 3

INFO:__main__:loss = tensor(2.2500e-05, device='cuda:0')

INFO:__main__:train_loss = tensor(2.2500e-05, device='cuda:0')

INFO:__main__:val_loss = tensor(0.9237, device='cuda:0')



1

In [None]:
# -p makes the cell safe to re-run: no error when the directory already exists.
!mkdir -p t5_base_isarcasm

mkdir: cannot create directory ‘t5_base_isarcasm’: File exists


In [None]:
model.model.save_pretrained('t5_base_isarcasm')

INFO:transformers.configuration_utils:Configuration saved in t5_base_isarcasm/config.json
INFO:transformers.modeling_utils:Model weights saved in t5_base_isarcasm/pytorch_model.bin


# Eval

In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [None]:
# Evaluate on the test set in its stored order, using the configured eval batch size.
loader = DataLoader(test_dataset, batch_size=args.eval_batch_size, shuffle=False)

model.model.eval()
# Send inputs to whichever device the model lives on, not a hard-coded GPU.
device = next(model.model.parameters()).device

outputs = []
targets = []
with torch.no_grad():  # inference only: no gradients needed
  for batch in tqdm(loader):
    # max_length=2 mirrors the 2-token target encoding used when the dataset was built.
    outs = model.model.generate(input_ids=batch['source_ids'].to(device),
                                attention_mask=batch['source_mask'].to(device),
                                max_length=2)

    outputs.extend(tokenizer.decode(ids) for ids in outs)
    targets.extend(tokenizer.decode(ids) for ids in batch["target_ids"])

  0%|          | 0/700 [00:00<?, ?it/s]

In [None]:
# Sanity check: report the position of every prediction that is not one of the two labels.
for i, out in enumerate(outputs):
  if out in ['positive', 'negative']:
    continue
  print(i, 'detected invalid prediction')

In [None]:
metrics.f1_score(targets, outputs, pos_label='positive')

0.36644591611479027