In [None]:
'''
    The following script was written and run on Kaggle's P100 GPU (faster than Google Colab's T4 GPU).
'''

Loading the model and the tokenizer ....

In [1]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import warnings
warnings.simplefilter(action='ignore')

In [2]:
# Checkpoint identifier; it is also interpolated into the parameter-count printouts further down.
model_name = 't5-base'
# Fetch the tokenizer and the pretrained weights for that checkpoint.
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Bare last expression so the notebook displays the module tree.
model

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

Verifying Summarization ..

In [3]:
'''
  The following is an introductory paragraph from a Wikipedia page on Laplace Distribution
'''
# Sample text fed to the summarization prompt in the next cell.
para = "In probability theory and statistics, the Laplace distribution is a continuous probability distribution named after Pierre-Simon Laplace. It is also sometimes called the double exponential distribution, because it can be thought of as two exponential distributions (with an additional location parameter) spliced together along the abscissa, although the term is also sometimes used to refer to the Gumbel distribution. The difference between two independent identically distributed exponential random variables is governed by a Laplace distribution, as is a Brownian motion evaluated at an exponentially distributed random time[citation needed]. Increments of Laplace motion or a variance gamma process evaluated over the time scale also have a Laplace distribution."

In [4]:
# Prepend the "summarize: " task prefix and tokenize into a PyTorch tensor,
# truncating anything beyond 512 tokens.
inputs = tokenizer.encode(
    f"summarize: {para}",
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

In [5]:
# Generate the summary token ids, capped at 50 tokens.
output = model.generate(inputs, max_length=50)

In [6]:
# Decode the first (only) generated sequence back to text, dropping special tokens such as <pad> and </s>.
summary=tokenizer.decode(output[0], skip_special_tokens=True)
print(summary)

the Laplace distribution is a continuous probability distribution named after Pierre-Simon Laplace. it can be thought of as two exponential distributions spliced together along the abscissa.


Verifying QnA task ...

In [7]:
# Closed-book question: only the "question: " prefix is supplied, no context passage.
ques = "What is the capital of Spain?"
inputs = tokenizer.encode(
    f"question: {ques}",
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

In [8]:
# Beam search with 2 beams; the answer is limited to 5 tokens because a single word is expected.
out = model.generate(inputs, max_length=5, num_beams=2, early_stopping=True) # can use early_stopping for one-word answer
print(tokenizer.decode(out[0], skip_special_tokens=True))

Madrid


Verifying Translation ...

In [9]:
# English sentence to translate; the task prefix names the source and target languages.
sent = "Hi, My name is Aditya. Nice to meet me ;)"
inputs = tokenizer.encode(
    f"translate English to French: {sent}",
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

In [10]:
# Generate the translation (at most 50 tokens) and print it without special tokens.
out = model.generate(inputs, max_length=50)
print(tokenizer.decode(out[0], skip_special_tokens=True))

Bonjour, Je m'appelle Aditya, j'ai eu la chance de me rencontrer ;)


Deep diving into Model Params ...

In [11]:
# Print the name and shape of every state_dict entry, then report the total number of parameters.
# state_dict() can expose one underlying tensor under several names (tied weights such as the
# shared embedding and its encoder/decoder/LM-head aliases), so each tensor is counted only once,
# keyed by the address of its data.
tot_params=0
seen_tensors=set()
for n,p in model.state_dict().items():
  print("Layer Name: "+n, "; Layer Shape: "+str(p.shape))
  if p.data_ptr() in seen_tensors:
    continue # alias of a tensor that has already been counted
  seen_tensors.add(p.data_ptr())
  tot_params+=p.numel()
print("--"*70)
print(f"Total number of parameters in {model_name}: "+str(tot_params)) # With aliases de-duplicated, expect roughly 220M for t5-base (summing every state_dict entry would give ~300M)

Layer Name: shared.weight ; Layer Shape: torch.Size([32128, 768])

Layer Name: encoder.embed_tokens.weight ; Layer Shape: torch.Size([32128, 768])

Layer Name: encoder.block.0.layer.0.SelfAttention.q.weight ; Layer Shape: torch.Size([768, 768])

Layer Name: encoder.block.0.layer.0.SelfAttention.k.weight ; Layer Shape: torch.Size([768, 768])

Layer Name: encoder.block.0.layer.0.SelfAttention.v.weight ; Layer Shape: torch.Size([768, 768])

Layer Name: encoder.block.0.layer.0.SelfAttention.o.weight ; Layer Shape: torch.Size([768, 768])

Layer Name: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight ; Layer Shape: torch.Size([32, 12])

Layer Name: encoder.block.0.layer.0.layer_norm.weight ; Layer Shape: torch.Size([768])

Layer Name: encoder.block.0.layer.1.DenseReluDense.wi.weight ; Layer Shape: torch.Size([3072, 768])

Layer Name: encoder.block.0.layer.1.DenseReluDense.wo.weight ; Layer Shape: torch.Size([768, 3072])

Layer Name: encoder.block.0.layer.1.layer_norm.weigh

In [12]:
# Setting the decoder's final layer-norm scale to all zeros ...
# The replacement takes its shape and dtype from the existing weight instead of a hard-coded 768,
# so the cell keeps working if a checkpoint with a different hidden size is loaded above.
model.decoder.final_layer_norm.weight = torch.nn.Parameter(
    torch.zeros_like(model.decoder.final_layer_norm.weight)
)
model.decoder.final_layer_norm.weight

Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0.

In [13]:
# Re-run the closed-book question now that the decoder's final layer-norm scale is zero.
ques = "What is the capital of Spain?"
inputs = tokenizer.encode(
    f"question: {ques}",
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

# Same decoding settings as before the reset: 2 beams, at most 5 tokens.
out = model.generate(inputs, max_length=5, num_beams=2, early_stopping=True)
print(out)
# Special tokens are kept visible on purpose: with the scaling factor zeroed, the final
# layer's output is all zeros, so an essentially empty response is expected.
print(tokenizer.decode(out[0], skip_special_tokens=False))

tensor([[0, 3, 1]])

<pad> </s>


In [14]:
# Replacing last layer with a layer of smaller dimensions (768 --> 384)
#
# ** Because of the residual connections all over the architecture, shrinking the final layer
#    means changing d_model from 768 to 384 everywhere, so every layer has to be visited and resized.
# ** A much simpler alternative would have been a projection layer after the final layer's outputs
#    to take care of the residual connections, but that requires changing the model architecture
#    (modeling_t5.py in the transformers library), which is beyond the scope of this assignment.
old_dim = 768
new_dim = old_dim // 2 # =384
print(new_dim)


def _rand_param(*shape):
  """Return a fresh float32 Parameter of the given shape filled with standard-normal noise."""
  return torch.nn.Parameter(torch.randn(shape, dtype=torch.float32))


def _reset_attention(attn):
  """Resize one attention module: q/k/v read new_dim features, o writes new_dim features."""
  for proj in (attn.q, attn.k, attn.v):
    proj.weight = _rand_param(old_dim, new_dim) # keeping the projected dimension same as before
  attn.o.weight = _rand_param(new_dim, old_dim)


def _reset_feed_forward(ff):
  """Resize one feed-forward module; the 3072-wide inner dimension is left as it was."""
  ff.wi.weight = _rand_param(3072, new_dim)
  ff.wo.weight = _rand_param(new_dim, 3072)


# Token embeddings: 32128 rows, now new_dim columns.
for emb in (model.shared, model.encoder.embed_tokens, model.decoder.embed_tokens):
  emb.weight = _rand_param(32128, new_dim)

for i in range(12):
  # Encoder block: self-attention sub-layer, then feed-forward sub-layer.
  enc = model.encoder.block[i].layer
  _reset_attention(enc[0].SelfAttention)
  enc[0].layer_norm.weight = _rand_param(new_dim)
  _reset_feed_forward(enc[1].DenseReluDense)
  enc[1].layer_norm.weight = _rand_param(new_dim)

  # Decoder block: self-attention, encoder-decoder attention, then feed-forward.
  dec = model.decoder.block[i].layer
  _reset_attention(dec[0].SelfAttention)
  dec[0].layer_norm.weight = _rand_param(new_dim)
  _reset_attention(dec[1].EncDecAttention)
  dec[1].layer_norm.weight = _rand_param(new_dim)
  _reset_feed_forward(dec[2].DenseReluDense)
  dec[2].layer_norm.weight = _rand_param(new_dim)

# Final layer norms of both stacks and the output projection onto the vocabulary.
model.encoder.final_layer_norm.weight = _rand_param(new_dim)
model.decoder.final_layer_norm.weight = _rand_param(new_dim)
model.lm_head.weight = _rand_param(32128, new_dim)

384


In [15]:
# Checking all the layers and total params now after reducing final_layer_norm and other layers dims ...
# state_dict() can expose one underlying tensor under several names, so each tensor is counted
# only once (keyed by the address of its data) to keep the total from being inflated by aliases.
tot_params=0
seen_tensors=set()
for n,p in model.state_dict().items():
  print("Layer Name: "+n, "; Layer Shape: "+str(p.shape)) # Notice the shape of all the layers
  if p.data_ptr() in seen_tensors:
    continue # alias of a tensor that has already been counted
  seen_tensors.add(p.data_ptr())
  tot_params += p.numel()
print("--"*70)
print(f"Total # of params in {model_name}: "+str(tot_params)) # Notice how the number of params decreases compared with the earlier count

Layer Name: shared.weight ; Layer Shape: torch.Size([32128, 384])

Layer Name: encoder.embed_tokens.weight ; Layer Shape: torch.Size([32128, 384])

Layer Name: encoder.block.0.layer.0.SelfAttention.q.weight ; Layer Shape: torch.Size([768, 384])

Layer Name: encoder.block.0.layer.0.SelfAttention.k.weight ; Layer Shape: torch.Size([768, 384])

Layer Name: encoder.block.0.layer.0.SelfAttention.v.weight ; Layer Shape: torch.Size([768, 384])

Layer Name: encoder.block.0.layer.0.SelfAttention.o.weight ; Layer Shape: torch.Size([384, 768])

Layer Name: encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight ; Layer Shape: torch.Size([32, 12])

Layer Name: encoder.block.0.layer.0.layer_norm.weight ; Layer Shape: torch.Size([384])

Layer Name: encoder.block.0.layer.1.DenseReluDense.wi.weight ; Layer Shape: torch.Size([3072, 384])

Layer Name: encoder.block.0.layer.1.DenseReluDense.wo.weight ; Layer Shape: torch.Size([384, 3072])

Layer Name: encoder.block.0.layer.1.layer_norm.weigh

In [16]:
# Re-run the closed-book question after every layer was replaced by smaller, randomly initialised weights.
ques = "What is the capital of Spain?"
inputs = tokenizer.encode(
    f"question: {ques}",
    max_length=512,
    truncation=True,
    return_tensors='pt',
)

# Same decoding settings as the earlier QnA cells: 2 beams, at most 5 tokens.
out = model.generate(inputs, max_length=5, num_beams=2, early_stopping=True)
print(out)
# The weights are random now, so the decoded text is not expected to be sensible.
print(tokenizer.decode(out[0], skip_special_tokens=False))

tensor([[    0, 27262, 27061, 18797, 13763]])

<pad>RIGHT frumoasăprompted executed


Training T5 for the QnA + Context task (Advice: activate a GPU runtime and restart/reconnect the runtime before running this section, for faster and more efficient training)

In [2]:
# !pip install datasets
from datasets import load_dataset
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn
import os
from tqdm import tqdm
from collections import Counter
# re-importing a few libraries in case you've restarted the runtime
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
import warnings
warnings.simplefilter(action='ignore')

In [3]:
# Reloading the original google's t5-small model ...
# Drop the previous model (if any) to free up some space. pop() with a default is a no-op when
# `model` does not exist, so this cell also runs unchanged right after a runtime/session restart
# (a plain `del model` would raise NameError there).
globals().pop("model", None)
model_name = "t5-small"
t5model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
# Bare last expression so the notebook displays the module tree.
t5model

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [4]:
class cqaDataset(Dataset): # Custom Dataset class for fast loading
  """One split of McGill-NLP/TopiOCQA exposed as (context, question, answer) string triples.

  The "Context" field of every row is a sequence of strings; it is flattened
  into a single string whose pieces are separated by ";" (empty when the row
  has no context).
  """

  def __init__(self, split="train"):
    self.dataset = load_dataset("McGill-NLP/TopiOCQA")[split]
    self.__buildData()

  def __buildData(self):
    """Walk the split once and fill the three parallel lists served by __getitem__."""
    contexts, questions, answers = [], [], []
    for example in self.dataset:
      questions.append(example["Question"])
      answers.append(example["Answer"])
      # An empty context sequence joins to "", matching the no-context case.
      contexts.append(";".join(example["Context"]))
    self.context = contexts
    self.questions = questions
    self.answers = answers

  def __len__(self):
    return len(self.questions)

  def __getitem__(self, idx):
    """Return the (context, question, answer) triple stored at position idx."""
    return (self.context[idx], self.questions[idx], self.answers[idx])

In [5]:
# CONFIGS ....
# Notebook-level settings read by the Model wrapper, the data loaders and the training loop below.
max_inp_len=512  # tokenizer max_length (with truncation) for both the prompts and the targets
lr = 10**-4  # learning rate handed to the optimizer
epochs = 5  # number of passes over the training split
wd = 0.0  # weight decay applied to parameters whose name does not contain "bias"
trn_bs = 16  # training batch size
tst_bs = 64  # validation batch size
optim_type="adam"  # one of: adam, adam-amsgrad, adamw, rmsprop, sgd, sgd-nesterov
device = "cuda" if torch.cuda.is_available() else "cpu"
# Bare last expression so the notebook shows which device was picked.
device

'cuda'

In [6]:
# Build both splits first, then wrap them in loaders:
# the training loader shuffles, the validation loader keeps the dataset order.
train_set = cqaDataset(split="train")
dev_set = cqaDataset(split="validation")
train_loader = DataLoader(dataset=train_set, batch_size=trn_bs, shuffle=True)
dev_loader = DataLoader(dataset=dev_set, batch_size=tst_bs, shuffle=False)

Downloading data: 100%|██████████| 28.9M/28.9M [00:01<00:00, 19.8MB/s]
Downloading data: 100%|██████████| 2.64M/2.64M [00:00<00:00, 14.3MB/s]


Generating train split:   0%|          | 0/45450 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2514 [00:00<?, ? examples/s]

In [7]:
def evaluate(preds, gold):
  """Score predictions against references with token-level F1 and exact match.

  Args:
    preds: sequence of predicted token-id lists, one per example.
    gold: sequence of reference token-id lists, aligned with ``preds``.

  Returns:
    ``(f1, exact_match)`` as percentages averaged over ``len(preds)``;
    ``(0.0, 0.0)`` when ``preds`` is empty.

  Special-token ids are read from the notebook-level ``tokenizer`` and removed
  from both sides before scoring.
  """

  def f1_score(p, g):
    # Multiset overlap between predicted and reference token ids.
    num_same = sum((Counter(p) & Counter(g)).values())
    if num_same == 0:
      return 0
    precision = 1.0 * num_same / len(p)
    recall = 1.0 * num_same / len(g)
    return (2 * precision * recall) / (precision + recall)

  n_examples = len(preds)
  if n_examples == 0:
    # Nothing to score; avoids a ZeroDivisionError in the averages below.
    return 0.0, 0.0

  # Built once, not per example. Attributes the tokenizer does not define come back as None,
  # which never matches a token id. -100 is the label-masking value that Model.__makeData
  # writes over padding in training mode, so masked labels are ignored as well.
  ignored_ids = {
      tokenizer.pad_token_id,
      tokenizer.eos_token_id,
      tokenizer.bos_token_id,
      tokenizer.cls_token_id,
      tokenizer.sep_token_id,
      tokenizer.mask_token_id,
      -100,
  }

  f1 = exact_match = 0
  for gld, pred in tqdm(zip(gold, preds), total=min(n_examples, len(gold))):
    pred = [token for token in pred if token not in ignored_ids]
    gld = [token for token in gld if token not in ignored_ids]
    f1 += f1_score(p=pred, g=gld)
    # Exact match: same length and identical tokens position by position.
    exact_match += int(pred == gld)
  return 100*f1/n_examples, 100*exact_match/n_examples

In [8]:
class Model(nn.Module): # Wrapper class for putting everything needed to train the t5 model
  """Bundle the T5 network, its tokenizer and an optimizer behind one nn.Module.

  The constructor takes no arguments and reads notebook-level names instead:
  ``t5model``, ``tokenizer``, ``model_name``, ``device``, ``max_inp_len``,
  ``lr``, ``wd`` and ``optim_type`` must be defined before instantiation.
  """

  def __init__(self):
    super().__init__()
    self.t5=t5model
    self.tokenizer=tokenizer
    self.model_name = model_name
    self.checkpoint_dir = "/kaggle/working/"
    self.tot_params=-1 # sentinel: parameter count not computed yet
    # Move the weights to the target device first, then build the optimizer over them.
    self.to(device)
    self.__makeOptim()

  def forward(self, data):
    """Run a training forward pass on one batch.

    Args:
      data: batch of (contexts, questions, answers) string sequences.

    Returns:
      Whatever the wrapped T5 model returns when called with ``labels``.
    """
    input_ids, attention_mask, encoded_targets = self.__makeData(data=data)
    return self.t5(input_ids=input_ids, attention_mask=attention_mask, labels=encoded_targets)

  def gen(self, data, **generate_kwargs):
    """Generate answers for one batch.

    Args:
      data: batch of (contexts, questions, answers) string sequences.
      **generate_kwargs: optional extra arguments forwarded to ``generate``;
        omitting them reproduces the original call.

    Returns:
      ``(generated_ids, target_ids)``; the targets keep their padding ids
      (they are not masked with -100 in this mode).
    """
    input_ids, attention_mask, encoded_targets = self.__makeData(data=data,to_train=False)
    generated = self.t5.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
    return generated, encoded_targets

  def __makeData(self, data, to_train=True):
    """Tokenize a batch into device tensors.

    Returns:
      ``(input_ids, attention_mask, encoded_targets)``. With ``to_train=True``
      the padding positions of the targets are overwritten with -100.
    """
    contexts, questions, answers = data[0], data[1], data[2]
    # Prompt layout is part of what the model is trained on, so it is kept exactly as is.
    inputs = [f"question:{q}  context:{c}" for q, c in zip(questions, contexts)]
    encoded_inputs = self.tokenizer(
                            inputs,
                            padding="longest",
                            max_length=max_inp_len,
                            truncation=True,
                            return_tensors="pt",
                        )
    encoded_targets = self.tokenizer(
                            list(answers),
                            padding="longest",
                            max_length=max_inp_len,
                            truncation=True,
                            return_tensors="pt",
                        )

    input_ids, attention_mask = encoded_inputs.input_ids, encoded_inputs.attention_mask
    encoded_targets = encoded_targets.input_ids
    if to_train:
      encoded_targets[encoded_targets == self.tokenizer.pad_token_id] = -100

    input_ids = input_ids.to(device)
    encoded_targets = encoded_targets.to(device)
    attention_mask = attention_mask.to(device)

    return input_ids, attention_mask, encoded_targets

  def __makeOptim(self):
    """Create ``self.optimizer`` for the configured ``optim_type``.

    Parameters whose name contains "bias" form a group without weight decay;
    all other parameters use the configured ``wd``.

    Raises:
      NotImplementedError: if ``optim_type`` is not one of the supported names.
    """
    decay_params, no_decay_params = [], []
    for n,p in self.named_parameters():
      (no_decay_params if "bias" in n else decay_params).append(p)
    optim_grp_params = [
        {"params": decay_params, "weight_decay": wd},
        {"params": no_decay_params, "weight_decay": 0.0},
    ]

    if optim_type=="adam":
      self.optimizer = optim.Adam(optim_grp_params, lr=lr)
    elif optim_type=="adam-amsgrad":
      self.optimizer = optim.Adam(optim_grp_params, lr=lr, amsgrad=True)
    elif optim_type=="adamw":
      self.optimizer = optim.AdamW(optim_grp_params, lr=lr)
    elif optim_type=="rmsprop":
      self.optimizer = optim.RMSprop(optim_grp_params, lr=lr)
    elif optim_type=="sgd":
      self.optimizer = optim.SGD(optim_grp_params, lr=lr)
    elif optim_type=="sgd-nesterov":
      self.optimizer = optim.SGD(optim_grp_params, lr=lr, nesterov=True, momentum=0.9)
    else:
      raise NotImplementedError("optim_type not implemented yet :/")

  def _save_state(self, filename):
    """Write the state dict to ``checkpoint_dir/filename``, creating the directory if needed."""
    if self.checkpoint_dir:
      # The default directory only exists on Kaggle; create it so a save after a long
      # epoch does not fail on other machines.
      os.makedirs(self.checkpoint_dir, exist_ok=True)
    torch.save(self.state_dict(), os.path.join(self.checkpoint_dir, filename))

  def _load_state(self, filename):
    """Load the state dict stored at ``checkpoint_dir/filename`` onto the configured device."""
    self.load_state_dict(torch.load(os.path.join(self.checkpoint_dir, filename), map_location=device))

  def save_checkpoint(self,epoch):
    """Save the current weights as ``contextualT5_<epoch>``."""
    self._save_state("contextualT5_"+str(epoch))

  def save_best(self):
    """Save the current weights as ``contextualT5_best``."""
    self._save_state("contextualT5_best")

  def load_best(self):
    """Restore the weights written by ``save_best``."""
    self._load_state("contextualT5_best")

  def load_checkpoint(self, epoch):
    """Restore the weights written by ``save_checkpoint`` for the given epoch."""
    self._load_state("contextualT5_"+str(epoch))

  def calc_params(self, ret=False, verbose=True):
    """Count the elements of every tensor registered with the optimizer.

    The count is computed on the first call and cached in ``self.tot_params``.

    Args:
      ret: when true, return the count.
      verbose: when true, print the count.

    Returns:
      The count if ``ret`` is true, otherwise ``None``.
    """
    if self.tot_params==-1:
      self.tot_params = sum(
          p.numel()
          for group in self.optimizer.param_groups
          for p in group["params"]
      )
    if verbose:
      print("Total number of trainable parameters in "+self.model_name+" = "+str(self.tot_params))
    if ret:
      return self.tot_params

In [9]:
# Build the training wrapper around the freshly loaded t5-small and report how many
# parameters its optimizer manages.
model = Model()
model.calc_params()

Total number of trainable parameters in t5-small = 60506624


In [10]:
# Baseline: score the model on the validation split before any fine-tuning.
model.eval()
preds, labels_encoded = [], []
with torch.no_grad():
    for data in tqdm(dev_loader):
        outs, tars = model.gen(data)
        # Collect per-example token-id lists for predictions and references.
        preds.extend(outs.tolist())
        labels_encoded.extend(tars.tolist())
f1, exact_match = evaluate(preds=preds, gold=labels_encoded)
print(f"Benchmark Scores --> Validation F1 = {f1:.2f}, EM = {exact_match:.2f}")

100%|██████████| 40/40 [00:26<00:00,  1.48it/s]
2514it [00:00, 33791.32it/s]

Benchmark Scores --> Validation F1 = 6.02, EM = 0.36





In [11]:
# Takes around 35 mins per epoch on colab's T4 GPU & ~16 mins per epoch on Kaggle's P100 GPU ....
f1_old = f1 # start from the score of the not-yet-fine-tuned model (computed in the previous cell)
for epoch in range(epochs):

  model.train()
  avg_loss = 0.0
  for data in tqdm(train_loader):
      model.optimizer.zero_grad()
      outs = model(data)
      loss = outs.loss
      loss.backward()
      model.optimizer.step()
      # Weight each batch loss by the number of examples actually in the batch: the last
      # batch of an epoch can be smaller than trn_bs, and the sum is divided by len(train_set).
      avg_loss += (len(data[0])*loss.item())
  print("Epoch: "+str(epoch)+"| Avg Loss: "+str(avg_loss/len(train_set)))
  model.save_checkpoint(epoch)

  # Validation pass: generate answers for the dev split and score them.
  model.eval()
  preds = []
  labels_encoded = []
  with torch.no_grad():
      for data in tqdm(dev_loader):
          outs,tars = model.gen(data)
          preds += outs.tolist()
          labels_encoded += tars.tolist()
  f1, exact_match = evaluate(preds=preds, gold=labels_encoded)
  print(f"Epoch = {epoch}, Validation F1 = {f1:.2f}, EM = {exact_match:.2f}")
  # Keep a separate copy of the weights whenever the validation F1 improves.
  if f1 > f1_old :
      model.save_best()
      f1_old = f1

100%|██████████| 2841/2841 [14:21<00:00,  3.30it/s]


Epoch: 0| Avg Loss: 3.757974936272314


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]
2514it [00:00, 31332.77it/s]


Epoch = 0, Validation F1 = 16.15, EM = 0.95


100%|██████████| 2841/2841 [14:20<00:00,  3.30it/s]


Epoch: 1| Avg Loss: 3.5905152245952743


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]
2514it [00:00, 32848.75it/s]


Epoch = 1, Validation F1 = 16.86, EM = 1.47


100%|██████████| 2841/2841 [14:19<00:00,  3.31it/s]


Epoch: 2| Avg Loss: 3.5021842626691257


100%|██████████| 40/40 [00:23<00:00,  1.73it/s]
2514it [00:00, 33165.84it/s]


Epoch = 2, Validation F1 = 17.11, EM = 1.79


100%|██████████| 2841/2841 [14:19<00:00,  3.31it/s]


Epoch: 3| Avg Loss: 3.4355419569361723


100%|██████████| 40/40 [00:23<00:00,  1.72it/s]
2514it [00:00, 33643.18it/s]


Epoch = 3, Validation F1 = 16.77, EM = 1.55


100%|██████████| 2841/2841 [14:18<00:00,  3.31it/s]


Epoch: 4| Avg Loss: 3.3768131288836893


100%|██████████| 40/40 [00:22<00:00,  1.74it/s]
2514it [00:00, 32594.49it/s]


Epoch = 4, Validation F1 = 17.39, EM = 2.47


In [12]:
# Restore the checkpoint with the highest validation F1 and score it once more.
model.load_best()
model.eval()
preds, labels_encoded = [], []
with torch.no_grad():
    for data in tqdm(dev_loader):
        outs, tars = model.gen(data)
        # Collect per-example token-id lists for predictions and references.
        preds.extend(outs.tolist())
        labels_encoded.extend(tars.tolist())
f1, exact_match = evaluate(preds=preds, gold=labels_encoded)
print(f"Best Model Scores --> Validation F1 = {f1:.2f}, EM = {exact_match:.2f}")

100%|██████████| 40/40 [00:22<00:00,  1.74it/s]
2514it [00:00, 32435.57it/s]

Best Model Scores --> Validation F1 = 17.39, EM = 2.47





In [None]:
'''
  Remarks:
  ** For the purpose of this assignment, I've just used a few basic metrics like F1-score and Exact Match. However, more sophisticated eval metrics like ROUGE, BLEU, etc. can be used for better model quality evaluation.
  ** Future scope of work includes experimenting with better trigger words than just using "context" & "question", maybe using the rationale as well
  ** As compared to benchmark metrics (w/o finetuning model), we can see a significant improvement (~189% improvement in the F1 score and ~586% improvement in the Exact Match Score) in both the metrics with just 5 epochs of SFT!
'''