<a href="https://colab.research.google.com/github/ChaitaliV/IITDelhi-RA/blob/main/basic_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install evaluate
!pip install sentencepiece
!pip install nltk
!pip install rouge-score

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m55.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.8 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
import torch
import requests
import json
import pandas as pd
from transformers import T5Tokenizer, T5Model, T5ForConditionalGeneration, T5TokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
import evaluate
from datasets import load_metric
import numpy as np
import nltk

Get Data

In [3]:
!git clone https://github.com/ChaitaliV/IITDelhi-RA

Cloning into 'IITDelhi-RA'...
remote: Enumerating objects: 14, done.[K
remote: Counting objects: 100% (14/14), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 14 (delta 3), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (14/14), 29.80 KiB | 9.93 MiB/s, done.
Resolving deltas: 100% (3/3), done.


In [4]:
data = pd.read_csv('IITDelhi-RA/dataset.csv')
data = data.sample(frac=1.0, random_state=42)

Initialization

In [5]:
tokenizer = T5TokenizerFast.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
device = 'cuda:0'
pattern_len = 218
response_len = 512
batch_size = 4
epochs = 10
optimizer = AdamW(model.parameters(), lr=0.00001)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



In [6]:
class Prepare_Data():
  """
  this class will take the dataframe,
  tokenize the text, get input_ids, attention_masks, create labels, and
  prepare dataloader objects to train the model
  """
  def __init__(self, tokenizer, dframe, q_len, t_len):
    self.tokenizer = tokenizer
    self.p_len = pattern_len
    self.r_len = response_len
    self.data = dframe
    self.sentences = self.data.pattern.values
    self.responses = self.data.response.values
    self.tags = self.data.tag.values

  def prepare_dataloader(self):

    pattern_tokenized = self.tokenizer([str(sequence) for sequence in self.sentences],[str(tag) for tag in self.tags], max_length=self.p_len, padding="max_length",
                                                    truncation=True, pad_to_max_length=True, add_special_tokens=True,return_tensors="pt")
    response_tokenized = self.tokenizer([str(response) for response in self.responses], max_length=self.r_len, padding="max_length",
                                          truncation=True, pad_to_max_length=True, add_special_tokens=True, return_tensors = "pt")

    labels = response_tokenized.input_ids
    labels[labels == tokenizer.pad_token_id] = -100
    input_ids = pattern_tokenized.input_ids
    attention_mask = pattern_tokenized.attention_mask

    #create train, validation split
    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,random_state = 2018, test_size = 0.1 )
    train_masks, validation_masks, _, _ = train_test_split(attention_mask, input_ids,
                                             random_state=2018, test_size=0.1)

    #create dataloader
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    return train_dataloader, validation_dataloader

In [7]:
LoadData = Prepare_Data(tokenizer, data, pattern_len, response_len)
train_dataloader, validation_dataloader = LoadData.prepare_dataloader()

In [8]:
metric = load_metric("rouge")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    #replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                      for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    #ROUGE score calculation
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    #rogue F1 score
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    #add mean length to metric
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                      for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

  metric = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [9]:
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [10]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
# Separate the `weight` parameters from the `bias` parameters.
# - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01.
# - For the `bias` parameters, the 'weight_decay_rate' is 0.0.
optimizer_grouped_parameters = [
    # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},

    # Filter for parameters which *do* include those.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters,
                  lr = 3e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                  )
# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` contains batched data so `len(train_dataloader)` gives
# us the number of batches.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0.1, # Default value in run_glue.py
                                            num_training_steps = total_steps)



In [12]:
# The Training Loop
t = []

# Store our loss and accuracy for plotting
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode
  model.train()

  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs['loss']
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
      loss = outputs['loss']

      eval_loss += loss.item()
      nb_eval_steps += 1


  print("Validation loss: {}".format(eval_loss/nb_eval_steps))





Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Train loss: 4.614697470760985


Epoch:  10%|█         | 1/10 [01:47<16:09, 107.69s/it]

Validation loss: 5.065709590911865
Train loss: 4.54164566449671


Epoch:  20%|██        | 2/10 [03:37<14:29, 108.65s/it]

Validation loss: 5.065709590911865
Train loss: 4.531889893064563


Epoch:  30%|███       | 3/10 [05:27<12:44, 109.28s/it]

Validation loss: 5.065709590911865
Train loss: 4.54351913048917


Epoch:  40%|████      | 4/10 [07:17<10:58, 109.70s/it]

Validation loss: 5.065709590911865
Train loss: 4.526057110536819


Epoch:  50%|█████     | 5/10 [09:08<09:10, 110.06s/it]

Validation loss: 5.065709590911865
Train loss: 4.56154460554955


Epoch:  60%|██████    | 6/10 [10:58<07:20, 110.20s/it]

Validation loss: 5.065709590911865
Train loss: 4.5387558729056545


Epoch:  70%|███████   | 7/10 [12:49<05:30, 110.30s/it]

Validation loss: 5.065709590911865
Train loss: 4.6075746389043415


Epoch:  80%|████████  | 8/10 [14:39<03:40, 110.35s/it]

Validation loss: 5.065709590911865
Train loss: 4.540245781008829


Epoch:  90%|█████████ | 9/10 [16:29<01:50, 110.38s/it]

Validation loss: 5.065709590911865
Train loss: 4.556779010183859


Epoch: 100%|██████████| 10/10 [18:20<00:00, 110.05s/it]

Validation loss: 5.065709590911865





In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
torch.save(model,r'/content/drive/MyDrive/T5.pt')
