In [None]:
!pip install -q sentencepiece transformers datasets evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!pip install -q transformers
!pip install -q pytorch_lightning
# !pip install -q pytorch-lightning==1.2.10

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.9/776.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.1/806.1 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def set_seed(seed):
  """Seed every random number generator this notebook relies on.

  Seeds Python's `random`, NumPy's global generator and PyTorch; when CUDA
  is available the seed is also applied to all CUDA devices.

  Args:
    seed: integer seed shared by all generators.
  """
  for seeder in (random.seed, np.random.seed, torch.manual_seed):
    seeder(seed)
  if not torch.cuda.is_available():
    return
  torch.cuda.manual_seed_all(seed)

set_seed(42)

In [None]:
class T5FineTuner(pl.LightningModule):
  """LightningModule wrapping a T5 model for text-to-text fine-tuning.

  The module loads the pretrained model and tokenizer named in `hparams`,
  builds its own train/val dataloaders through the module-level
  `get_dataset` helper, and drives a linear warmup/decay LR schedule
  manually from `optimizer_step`.

  Args:
    hparams: namespace-like object with the attributes read below
      (model_name_or_path, tokenizer_name_or_path, learning_rate,
      weight_decay, adam_epsilon, warmup_steps, train_batch_size,
      eval_batch_size, num_train_epochs, gradient_accumulation_steps, n_gpu).
  """

  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    # Exposes the values as self.hparams.<name> for the rest of the class.
    self.save_hyperparameters(hparams)

    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
    # Per-epoch loss buffers; filled in the *_step hooks and emptied again
    # in the matching on_*_epoch_end hook.
    self.training_step_outputs = []
    self.validation_step_outputs = []

  def is_logger(self):
    """Return True when this process has global rank 0 (or lower)."""
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None
  ):
    """Run the wrapped T5 model; `lm_labels` is passed on as `labels`."""
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=lm_labels,
    )

  def _step(self, batch):
    """Compute the loss for one batch.

    Args:
      batch: dict with "source_ids", "source_mask", "target_ids" and
        "target_mask" tensors.

    Returns:
      The first element of the model output (the loss).
    """
    # Work on a copy: masking pad positions in place would overwrite the
    # caller's batch["target_ids"] with -100.
    lm_labels = batch["target_ids"].clone()
    lm_labels[lm_labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]
    return loss

  def training_step(self, batch, batch_idx):
    """Compute, log and return the training loss for one batch."""
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    # Store a detached copy: keeping the live loss would retain every
    # batch's autograd graph until the end of the epoch.
    self.training_step_outputs.append(loss.detach())
    self.log("train_loss", loss, prog_bar=True)
    return {"loss": loss, "log": tensorboard_logs}

  def on_train_epoch_end(self):
    """Log the mean training loss of the epoch that just finished."""
    outputs = self.training_step_outputs
    if not outputs:
      # torch.stack([]) raises; nothing to report for an empty epoch.
      return
    avg_train_loss = torch.stack(outputs).mean()
    # Reset the buffer so the next epoch's mean is not mixed with this one.
    outputs.clear()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    self.log("avg_train_loss", avg_train_loss, prog_bar=True)
    print("YES REACHED HERE - 2", avg_train_loss)
    return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def validation_step(self, batch, batch_idx):
    """Compute and log the validation loss for one batch."""
    loss = self._step(batch)
    self.validation_step_outputs.append(loss)
    self.log("val_loss", loss, prog_bar=True)
    return {"val_loss": loss}

  def on_validation_epoch_end(self):
    """Log the mean validation loss of the validation run that just finished."""
    outputs = self.validation_step_outputs
    if not outputs:
      # torch.stack([]) raises; nothing to report for an empty run.
      return
    avg_loss = torch.stack(outputs).mean()
    # Reset the buffer; otherwise later means also include earlier runs
    # (including the pre-training sanity check).
    outputs.clear()
    tensorboard_logs = {"val_loss": avg_loss}
    print("YES REACHED HERE - 1", avg_loss)
    self.log("avg_val_loss", avg_loss, prog_bar=True)
    return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}

  def configure_optimizers(self):
    """Build AdamW with weight decay disabled for bias/LayerNorm parameters.

    The optimizer is also stored on `self.opt` so that `train_dataloader`
    can attach the LR scheduler to it.
    """
    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]

  def optimizer_step(self, epoch, batch_idx, optimizer, optimizer_idx, second_order_closure=None):
    """Delegate the optimizer step to Lightning, then advance the LR schedule.

    The fourth positional argument is named `optimizer_idx` but is forwarded
    to the parent implementation as `optimizer_closure`.
    NOTE(review): presumably this relies on the installed Lightning version
    passing the closure in that position — confirm before changing versions.
    """
    super(T5FineTuner, self).optimizer_step(epoch, batch_idx, optimizer, optimizer_closure=optimizer_idx)
    # The scheduler is created in train_dataloader and stepped once per
    # optimizer step.
    self.lr_scheduler.step()

  def get_tqdm_dict(self):
    """Return the formatted running loss and current learning rate.

    NOTE(review): reads `self.trainer.avg_loss`; confirm that attribute
    exists in the installed Lightning version before relying on this.
    """
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    """Build the shuffled training loader and create the LR scheduler.

    The scheduler is attached to `self.opt`, so `configure_optimizers` must
    have run before this method is called.
    """
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    # Total optimizer steps: batches per epoch, divided by the accumulation
    # factor, times the number of epochs.
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    """Build the (unshuffled) validation loader."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [None]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  """Callback that reports `trainer.callback_metrics` via the module logger.

  Metric values are only reported where `pl_module.is_logger()` is true.
  The keys "log" and "progress_bar" are always skipped.
  """

  def on_validation_end(self, trainer, pl_module):
    """Log a header, then one "name = value" line per callback metric."""
    logger.info("***** Validation results *****")
    if not pl_module.is_logger():
      return
    results = trainer.callback_metrics
    for name in sorted(results):
      if name in ("log", "progress_bar"):
        continue
      logger.info("{} = {}\n".format(name, str(results[name])))

  def on_test_end(self, trainer, pl_module):
    """Log the callback metrics and write them to <output_dir>/test_results.txt."""
    if not pl_module.is_logger():
      return
    results = trainer.callback_metrics
    report_path = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
    with open(report_path, "w") as writer:
      for name in sorted(results):
        if name in ("log", "progress_bar"):
          continue
        # The same text goes to the log and to the results file.
        entry = "{} = {}\n".format(name, str(results[name]))
        logger.info(entry)
        writer.write(entry)

In [None]:
# Default hyperparameters; a later cell overrides some entries and converts
# the dict into an argparse.Namespace.
args_dict = dict(
    data_dir=".",  # forwarded to SwagDataset by get_dataset
    output_dir=".",  # checkpoint directory and location of test_results.txt
    model_name_or_path='t5-small',  # passed to T5ForConditionalGeneration.from_pretrained
    tokenizer_name_or_path='t5-small',  # passed to T5Tokenizer.from_pretrained
    max_seq_length=512,  # becomes SwagDataset's max_len for the encoded input
    learning_rate=3e-4,  # AdamW lr
    weight_decay=0.0,  # applied to all parameters except bias / LayerNorm.weight
    adam_epsilon=1e-8,  # AdamW eps
    warmup_steps=0,  # warmup steps of the linear LR schedule
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=2,  # enters the scheduler's total-step computation
    gradient_accumulation_steps=16,  # used by the scheduler length and by the Trainer
    n_gpu=1,  # only read when computing the scheduler's total steps
    early_stop_callback=False,  # not read by the visible code
    fp_16=False,  # selects Trainer precision 16 vs 32
    opt_level='O1',  # only referenced from a commented-out Trainer argument
    max_grad_norm=1.0,  # Trainer gradient_clip_val
    seed=42,  # not read by the visible code; set_seed is called with a literal
)

In [None]:
import csv
from dataclasses import dataclass

from enum import Enum
from typing import List, Optional
from transformers import PreTrainedTokenizer

In [None]:
# !pip install -q jsonlines
import jsonlines
@dataclass(frozen=True)
class InputExample:
    """One immutable multiple-choice cloze example."""
    # Identifier of the example; SwagProcessor uses the 1-based record position as a string.
    example_id: str
    # Passage the question refers to (the record's 'article' field in SwagProcessor).
    context: str
    # Question text, still containing the '@placeholder' marker.
    question: str
    # Candidate answers; SwagProcessor stores the question with the marker replaced by each option.
    endings: List[str]
    # NOTE(review): annotated as Optional[str], but SwagProcessor passes int(line['label']).
    label: Optional[str]

class Split(Enum):
    """Names of the dataset partitions (not referenced by the other visible cells)."""
    train = "train"
    dev = "dev"
    test = "test"

class DataProcessor:
    """Interface for dataset readers; every method must be overridden."""

    def get_train_examples(self, data_dir):
        """Return the training examples found for `data_dir`."""
        raise NotImplementedError

    def get_dev_examples(self, data_dir):
        """Return the development examples found for `data_dir`."""
        raise NotImplementedError

    def get_test_examples(self, data_dir):
        """Return the test examples found for `data_dir`."""
        raise NotImplementedError

    def get_labels(self):
        """Return the list of labels used by the dataset."""
        raise NotImplementedError

class SwagProcessor(DataProcessor):
    """Reads the Task 1 JSON-lines files into `InputExample` objects.

    Each record must provide 'article', 'question', 'label' and
    'option_0' ... 'option_4'. Every ending is the question with
    '@placeholder' replaced by one option.

    NOTE(review): the `data_dir` arguments are accepted but not used to
    locate the files; both files are opened by fixed names relative to the
    current working directory.
    """

    def _read_examples(self, path):
        """Parse one JSON-lines file into a list of InputExample.

        Example ids are the 1-based record positions, as strings.
        """
        examples = []
        # The context manager closes the handle even if a record is malformed.
        with open(path, mode='r', encoding="utf8") as f:
            for example_id, line in enumerate(jsonlines.Reader(f), start=1):
                question = line['question']
                endings = [
                    question.replace('@placeholder', line['option_%d' % i])
                    for i in range(5)
                ]
                examples.append(InputExample(
                    example_id=str(example_id),
                    context=line['article'],
                    question=question,
                    endings=endings,
                    label=int(line['label']),
                ))
        return examples

    def get_train_examples(self, data_dir):
        """Return the examples read from Task_1_train.jsonl."""
        return self._read_examples("Task_1_train.jsonl")

    def get_dev_examples(self, data_dir):
        """Return the examples read from Task_1_dev.jsonl."""
        logger.info("LOOKING AT {} dev".format(data_dir))
        return self._read_examples("Task_1_dev.jsonl")

    def get_test_examples(self, data_dir):
        """Always raises: unlabeled test data is not supported.

        Raises:
            ValueError: unconditionally.
        """
        logger.info("LOOKING AT {} test".format(data_dir))
        raise ValueError(
            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
            "setting!"
        )

    def get_labels(self):
        """Return the five label strings "0" through "4"."""
        return ["0", "1", "2", "3", "4"]

In [None]:
class SwagDataset(Dataset):
  """Text-to-text dataset built from `SwagProcessor` examples.

  Every example is encoded once at construction time. The source text
  contains the context, the question and the five numbered options; the
  target text is the 1-based number of the correct option.

  Args:
    tokenizer: tokenizer providing `batch_encode_plus`.
    data_dir: forwarded to the processor.
    type_path: 'train' loads the training examples; any other value loads
      the dev examples.
    max_len: length the encoded source is padded/truncated to.
  """

  def __init__(self, tokenizer, data_dir, type_path,  max_len=512):
    self.data_dir = data_dir
    self.type_path = type_path
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []

    self.proc = SwagProcessor()

    self._build()

  def __getitem__(self, index):
    """Return the ids and attention masks of example `index` as a dict."""
    source = self.inputs[index]
    target = self.targets[index]
    # Each encoding holds a single sequence; squeeze(0) drops exactly that
    # leading batch axis (a bare squeeze() would also drop a length-1
    # sequence axis).
    return {
        "source_ids": source["input_ids"].squeeze(0),
        "source_mask": source["attention_mask"].squeeze(0),
        "target_ids": target["input_ids"].squeeze(0),
        "target_mask": target["attention_mask"].squeeze(0),
    }

  def __len__(self):
    return len(self.inputs)

  def _build(self):
    """Load the examples for `type_path` and encode each of them."""
    if self.type_path == 'train':
      examples = self.proc.get_train_examples(self.data_dir)
    else:
      examples = self.proc.get_dev_examples(self.data_dir)

    for example in examples:
      self._create_features(example)

  def _create_features(self, example):
    """Encode one example and append it to `self.inputs` / `self.targets`."""
    input_ = example.context
    question = example.question
    options = ['%s: %s' % (i, option) for i, option in zip('12345', example.endings)]
    options = " ".join(options)
    input_ = "context: %s question: %s replace @placeholder with one of the options: %s </s>" % (input_, question, options)
    # Labels are 0-based; the options in the prompt are numbered from 1.
    target = "%s </s>" % str(int(example.label) + 1)

    # padding="max_length" replaces the deprecated pad_to_max_length=True,
    # and truncation=True makes the previously implicit truncation explicit.
    tokenized_inputs = self.tokenizer.batch_encode_plus(
        [input_], max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt"
    )

    tokenized_targets = self.tokenizer.batch_encode_plus(
        [target], max_length=2, padding="max_length", truncation=True, return_tensors="pt"
    )

    self.inputs.append(tokenized_inputs)
    self.targets.append(tokenized_targets)

In [None]:
# !pip install sentencepiece
!pip install -q sentencepiece

In [None]:
# Standalone tokenizer for the dataset checks and evaluation cells below;
# same checkpoint name as args_dict's tokenizer_name_or_path.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Sanity check: build the dev split (any type_path other than 'train'
# selects it) and show how many examples were loaded.
dataset = SwagDataset(tokenizer, data_dir='swag_data', type_path='val')
len(dataset)

837

In [None]:
# Decode the first example back to text to inspect the prompt and target
# that the model will be trained on.
data = dataset[0]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

context: 12 June 2017 Last updated at 12:52 BST Previously code-named Project Scorpio - take a look at the new Xbox One X console. Phil Spencer, head of Xbox said it was the: "most powerful console ever made". The console was revealed at this year's E3 conference - one of the world's biggest gaming and technology shows. It runs from 13th to 15th June in Los Angeles, America. question: Microsoft have revealed their brand - new top - @placeholder console at a big game show in America. replace @placeholder with one of the options: 1: Microsoft have revealed their brand - new top - end console at a big game show in America. 2: Microsoft have revealed their brand - new top - free console at a big game show in America. 3: Microsoft have revealed their brand - new top - secret console at a big game show in America. 4: Microsoft have revealed their brand - new top - normal console at a big game show in America. 5: Microsoft have revealed their brand - new top - concentration console at a big g

In [None]:
!mkdir -p t5_swag

In [None]:
# Override the defaults for this run, then expose the settings as attributes
# (args.<name>), which is how T5FineTuner and get_dataset read them.
args_dict.update({'data_dir': 'swag_data', 'output_dir': 't5_swag', 'num_train_epochs': 3})
args = argparse.Namespace(**args_dict)
print(args_dict)

{'data_dir': 'swag_data', 'output_dir': 't5_swag', 'model_name_or_path': 't5-small', 'tokenizer_name_or_path': 't5-small', 'max_seq_length': 512, 'learning_rate': 0.0003, 'weight_decay': 0.0, 'adam_epsilon': 1e-08, 'warmup_steps': 0, 'train_batch_size': 8, 'eval_batch_size': 8, 'num_train_epochs': 3, 'gradient_accumulation_steps': 16, 'n_gpu': 1, 'early_stop_callback': False, 'fp_16': False, 'opt_level': 'O1', 'max_grad_norm': 1.0, 'seed': 42}


In [None]:
# Keep the five checkpoints with the lowest logged "val_loss" in the
# output directory.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, monitor="val_loss", mode="min", save_top_k=5
)

# Keyword arguments for pl.Trainer.
train_params = dict(
    accumulate_grad_batches=args.gradient_accumulation_steps,
    accelerator="auto",
    # Read the epoch count from args: the LR schedule in T5FineTuner is sized
    # from args.num_train_epochs, so the two must not drift apart.
    max_epochs=args.num_train_epochs,
    precision=16 if args.fp_16 else 32,
    gradient_clip_val=args.max_grad_norm,
    callbacks=[LoggingCallback(), checkpoint_callback],
)

In [None]:
def get_dataset(tokenizer, type_path, args):
  """Create the SwagDataset for one split.

  Args:
    tokenizer: tokenizer handed to the dataset.
    type_path: split selector forwarded to SwagDataset.
    args: object providing `data_dir` and `max_seq_length`.

  Returns:
    A SwagDataset whose max_len is `args.max_seq_length`.
  """
  dataset_kwargs = dict(
      tokenizer=tokenizer,
      data_dir=args.data_dir,
      type_path=type_path,
      max_len=args.max_seq_length,
  )
  return SwagDataset(**dataset_kwargs)

In [None]:
# Instantiate the Lightning module; its constructor loads the pretrained
# model and tokenizer named in args.
model = T5FineTuner(args)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Build the Trainer from the keyword arguments collected in train_params.
trainer = pl.Trainer(**train_params)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
# No dataloaders are passed here: the module supplies them through its own
# train_dataloader() / val_dataloader() methods.
trainer.fit(model)

/usr/local/lib/python3.10/dist-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /content/t5_swag exists and is not empty.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


YES REACHED HERE - 1 tensor(6.0309, device='cuda:0')


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

YES REACHED HERE - 1 tensor(0.9134, device='cuda:0')
YES REACHED HERE - 2 tensor(1.2622, device='cuda:0', grad_fn=<MeanBackward0>)


Validation: |          | 0/? [00:00<?, ?it/s]

YES REACHED HERE - 1 tensor(0.8636, device='cuda:0')
YES REACHED HERE - 2 tensor(1.0448, device='cuda:0', grad_fn=<MeanBackward0>)


Validation: |          | 0/? [00:00<?, ?it/s]

YES REACHED HERE - 1 tensor(0.8467, device='cuda:0')
YES REACHED HERE - 2 tensor(0.9694, device='cuda:0', grad_fn=<MeanBackward0>)


INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [None]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [None]:
# Rebuild the dev split for evaluation (this rebinds `dataset`) and batch it
# in its original order.
dataset =  SwagDataset(tokenizer, data_dir='swag_data', type_path='val')
loader = DataLoader(dataset, batch_size=32, num_workers=4)

In [None]:
# Choose the device once, so the cell also runs on a CPU-only runtime, and
# move the model a single time instead of calling .to() for every batch.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.model.to(device)
model.model.eval()

outputs = []  # decoded predictions, one string per example
targets = []  # decoded gold targets, aligned with `outputs`
with torch.no_grad():
  for batch in tqdm(loader):
    # NOTE(review): max_length=2 presumably leaves room for one generated
    # token after the decoder start token — confirm against generate() docs.
    outs = model.model.generate(input_ids=batch['source_ids'].to(device),
                                attention_mask=batch['source_mask'].to(device),
                                max_length=2)

    dec = [tokenizer.decode(ids) for ids in outs]
    target = [tokenizer.decode(ids) for ids in batch["target_ids"]]

    outputs.extend(dec)
    targets.extend(target)

  0%|          | 0/27 [00:00<?, ?it/s]

In [None]:
# Flag predictions whose label character is not one of the five options.
# The accuracy cell uses the LAST character of each decoded output as the
# predicted label, so the same character is validated here. A set is used
# because `out not in "12345"` is a substring test: it would accept "" and
# "12", and reject any decoded string that carries extra tokens.
VALID_LABELS = set("12345")
for i, out in enumerate(outputs):
  # out[-1:] is "" for an empty string, which is correctly reported invalid.
  if out[-1:] not in VALID_LABELS:
    print(i, 'detected invalid prediction')

In [None]:
# Reduce every decoded string to one label character: the last character of
# a prediction and the first character of a gold target.
outputs_1 = list(map(lambda text: text[-1], outputs))
targets_1 = list(map(lambda text: text[0], targets))

In [None]:
# Fraction of examples whose predicted label character equals the gold one.
# NOTE(review): the recorded run scored about 0.207, roughly chance for five
# options — worth checking e.g. whether the options survive truncation of
# the prompt to max_seq_length.
metrics.accuracy_score(targets_1, outputs_1)

0.2066905615292712

In [None]:
# Show every (prediction, gold) pair for manual inspection; this prints one
# pair per evaluated example.
list(zip(outputs_1, targets_1))