# Creating Dataset

In [5]:
# Mount Google Drive at /content/gdrive; later cells read and write files under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install datasets==1.0.2
!pip install tqdm==4.57.0
!pip install Cython

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==1.0.2
  Downloading datasets-1.0.2-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, datasets
Successfully installed datasets-1.0.2 dill-0.3.6 xxhash-3.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tqdm==4.57.0
  Downloading tqdm-4.57.0-py2

In [None]:
!mkdir '/content/gdrive/My Drive/t5_model'
!mkdir '/content/gdrive/My Drive/t5_model/dataset'
!mkdir '/content/gdrive/My Drive/t5_model/model'
!mkdir '/content/gdrive/My Drive/t5_model/tokenizer'
!ls

mkdir: cannot create directory ‘/content/gdrive/My Drive/t5_model’: File exists
mkdir: cannot create directory ‘/content/gdrive/My Drive/t5_model/dataset’: File exists
mkdir: cannot create directory ‘/content/gdrive/My Drive/t5_model/model’: File exists
mkdir: cannot create directory ‘/content/gdrive/My Drive/t5_model/tokenizer’: File exists
gdrive	sample_data


In [None]:
import pandas as pd
from datasets import load_dataset
from sklearn.utils import shuffle

# Do not truncate long cell contents when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Destination CSV files for the prepared training / validation splits.
path_to_training_csv = "/content/gdrive/My Drive/t5_model/dataset/training_dataset.csv"
path_to_validation_csv = "/content/gdrive/My Drive/t5_model/dataset/validation_dataset.csv"

class QuestionDataset:
    """Builds the question-generation CSV files from the SQuAD dataset.

    Every kept example becomes one (context, answer, question) row. Examples
    whose first reference answer has ``answer_length_limit`` words or more
    are dropped.
    """

    @staticmethod
    def __prepare_dataset(df, dataset, answer_length_limit=7):
        """Fill ``df`` in place with the short-answer examples of ``dataset``.

        Args:
            df: Empty DataFrame whose columns receive, in order, the context,
                the answer and the question of every kept example.
            dataset: Iterable of records exposing ``"context"``,
                ``"question"`` and ``"answers"["text"]`` (a list of strings).
            answer_length_limit: Examples whose first answer has this many
                whitespace-separated words or more are skipped.

        Returns:
            The same ``df`` object that was passed in.
        """
        rows = []
        for data in dataset:
            answer_texts = data["answers"]["text"]
            if not answer_texts:
                # No reference answer to condition the question on; skip
                # instead of raising IndexError on ``[0]``.
                continue

            answer = answer_texts[0]
            if len(answer.split()) >= answer_length_limit:
                continue

            rows.append((data["context"], answer, data["question"]))

        # Assign whole columns once instead of growing the frame one row at a
        # time with ``df.loc[position] = ...``, which gets slower as the frame
        # grows. With no kept rows the loop body never runs and ``df`` stays empty.
        for column, values in zip(df.columns, zip(*rows)):
            df[column] = list(values)
        return df

    @staticmethod
    def prepare(random_state=None):
        """Load SQuAD, filter and shuffle both splits, and save them as CSV.

        The output locations are the module-level ``path_to_training_csv`` and
        ``path_to_validation_csv``.

        Args:
            random_state: Optional seed forwarded to ``sklearn.utils.shuffle``
                so the row order can be reproduced. ``None`` (the default)
                keeps the unseeded shuffle.
        """
        column_names = ['context', 'answer', 'question']
        splits = (
            ("train", path_to_training_csv),
            ("validation", path_to_validation_csv),
        )

        for split_name, csv_path in splits:
            # Load the SQuAD split and keep only the short-answer examples.
            examples = load_dataset("squad", split=split_name)
            frame = pd.DataFrame(columns=column_names)
            QuestionDataset.__prepare_dataset(frame, examples)

            # Shuffle the rows, then save without the (shuffled) index column.
            frame = shuffle(frame, random_state=random_state)
            frame.to_csv(csv_path, index=False)

In [None]:
# Loads both SQuAD splits, filters and shuffles them, and writes the two CSV files.
QuestionDataset.prepare()

Downloading:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/955 [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.75 MiB, post-processed: Unknown size, total: 119.27 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41...


Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41. Subsequent calls will reuse this data.


Reusing dataset squad (/root/.cache/huggingface/datasets/squad/plain_text/1.0.0/1244d044b266a5e4dbd4174d23cb995eead372fbca31a03edc3f8a132787af41)


# Training Model

In [2]:
!pip install --quiet transformers==4.1.1
!pip install --quiet pytorch-lightning==1.1.3
!pip install --quiet tokenizers==0.9.4 
!pip install --quiet sentencepiece==0.1.94
!pip install --quiet tqdm==4.56.0
!pip install --quiet torchtext==0.8.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m680.7/680.7 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.5/72.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.8/776.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.15.1+cu118 requires torch==2.0.0, but you have torch 1.7.1 which is incompatible.
torchdata 0.6.0 requires torch==2.0.0, but you have torch 1.7.1 which is incompatible.
torchaudio 2.0.1+cu118 requires torch==2.0.0, but you have torch 1.7.1 which is incompatible.[0m[31m
[0m

In [6]:
import pandas as pd
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
import copy

class QuestionDataset(Dataset):
    """Torch dataset of tokenized "context + answer" -> "question" pairs.

    Rows are read from a CSV with ``context``, ``answer`` and ``question``
    columns. A row is dropped when either its input text or its target text
    tokenizes to more ids than the corresponding maximum length, so every
    stored encoding has exactly the padded length.
    """

    def __init__(self, tokenizer, dataset_path: str, max_input_length: int = 512, max_output_length: int = 96, max_rows: "int | None" = 1000) -> None:
        """Read the CSV and tokenize it eagerly.

        Args:
            tokenizer: Tokenizer exposing ``batch_encode_plus`` and ``pad_token_id``.
            dataset_path: Path of the CSV file to read.
            max_input_length: Padded length (in tokens) of the model input.
            max_output_length: Padded length (in tokens) of the target question.
            max_rows: Number of CSV rows to read; ``None`` reads the whole file.
        """
        self.dataset_path = dataset_path
        # keep_default_na=False: texts such as "NA", "null" or "nan" must stay
        # text instead of being parsed as missing values (NaN).
        self.dataset = pd.read_csv(dataset_path, nrows=max_rows, keep_default_na=False)
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self.__create()

    def __len__(self):
        """Number of examples that survived the length filter."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example as a dict.

        ``labels`` is a copy of the target ids in which padding positions are
        replaced by -100 (the value this notebook uses to mask padding in labels).
        """
        source = self.inputs[index]
        target = self.targets[index]

        # Encodings are stored with a leading batch axis of size 1; remove only
        # that axis so a length-1 sequence is not collapsed to a scalar.
        source_input_ids = source["input_ids"].squeeze(0)
        target_input_ids = target["input_ids"].squeeze(0)

        source_attention_mask = source["attention_mask"].squeeze(0)
        target_attention_mask = target["attention_mask"].squeeze(0)

        # Work on a copy so ``target_input_ids`` keeps its padding ids.
        labels = target_input_ids.clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "source_input_ids": source_input_ids,
            "target_input_ids": target_input_ids,
            "source_attention_mask": source_attention_mask,
            "target_attention_mask": target_attention_mask,
            "labels": labels,
        }

    def __encode(self, text, max_length):
        """Tokenize ``text`` padded to ``max_length``; return ``None`` if it does not fit."""
        encoding = self.tokenizer.batch_encode_plus([text], max_length=max_length, padding="max_length", return_tensors="pt")
        # No truncation is requested, so an over-long text comes back with more
        # than ``max_length`` ids and is rejected here.
        if encoding["input_ids"].shape[1] > max_length:
            return None
        return encoding

    def __create(self):
        """Tokenize every CSV row and store the pairs that fit both length limits."""
        rows = zip(self.dataset["context"], self.dataset["answer"], self.dataset["question"])
        for context, answer, question in tqdm(rows, total=len(self.dataset)):
            source_text = f"context: {context}  answer: {answer} </s>"
            target_text = f"question: {str(question)} </s>"

            input_tokenized = self.__encode(source_text, self.max_input_length)
            if input_tokenized is None:
                continue

            # Over-long targets are skipped too: a longer-than-padded tensor
            # could not be stacked with the others into a batch.
            targets_tokenized = self.__encode(target_text, self.max_output_length)
            if targets_tokenized is None:
                continue

            self.inputs.append(input_tokenized)
            self.targets.append(targets_tokenized)


In [7]:
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from transformers import (
    AdamW
)

class ModelTuner(pl.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq model on the question dataset.

    ``hyper_parameters`` must expose ``batch_size``. An optional
    ``learning_rate`` attribute overrides the default of 3e-4.
    """

    def __init__(self, model, tokenizer, hyper_parameters, training_dataset, validation_dataset):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.hyper_parameters = hyper_parameters
        self.training_dataset = training_dataset
        self.validation_dataset = validation_dataset

    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None):
        """Run the wrapped model with every argument passed through unchanged."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=labels,
        )

    def _step_loss(self, batch):
        """Compute the model loss (first element of the model output) for one batch."""
        # decoder_input_ids is deliberately not passed: with only ``labels``
        # given, the T5 model used in this notebook builds its decoder inputs
        # by shifting the labels right. Feeding the unshifted target ids would
        # show the decoder the very tokens it is asked to predict.
        output = self(
            input_ids=batch["source_input_ids"],
            attention_mask=batch["source_attention_mask"],
            decoder_attention_mask=batch["target_attention_mask"],
            labels=batch["labels"],
        )
        return output[0]

    def training_step(self, batch, batch_index):
        """Return (and log as ``train_loss``) the loss of one training batch."""
        training_loss = self._step_loss(batch)
        self.log("train_loss", training_loss)
        return training_loss

    def validation_step(self, batch, batch_index):
        """Return (and log as ``val_loss``) the loss of one validation batch."""
        validation_loss = self._step_loss(batch)
        # Without logging, the returned validation loss is never reported.
        self.log("val_loss", validation_loss, prog_bar=True)
        return validation_loss

    def train_dataloader(self):
        """Training batches, reshuffled every epoch."""
        return DataLoader(self.training_dataset, batch_size=self.hyper_parameters.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        """Validation batches in dataset order."""
        return DataLoader(self.validation_dataset, batch_size=self.hyper_parameters.batch_size, num_workers=4)

    def configure_optimizers(self):
        """AdamW over all parameters; learning rate defaults to 3e-4."""
        learning_rate = getattr(self.hyper_parameters, "learning_rate", 3e-4)
        return AdamW(self.parameters(), lr=learning_rate, eps=1e-8)

In [4]:
# Mount Google Drive at /content/gdrive; the training cells read the CSV files and save the model under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
import os
import argparse
import torch
import pytorch_lightning as pl
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
)
# Fix the random seed before anything stochastic runs.
pl.seed_everything(42)

# Input CSV files and output folders on the mounted Drive.
path_to_training_csv = "/content/gdrive/My Drive/t5_model/dataset/training_dataset.csv"
path_to_validation_csv = "/content/gdrive/My Drive/t5_model/dataset/validation_dataset.csv"
path_to_tokenizer = "/content/gdrive/My Drive/t5_model/tokenizer/"
path_to_model = "/content/gdrive/My Drive/t5_model/model/"

# Pretrained t5-base checkpoint that will be fine-tuned.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

# Tokenize the training split first, then the validation split.
training_dataset, validation_dataset = (
    QuestionDataset(tokenizer, csv_path)
    for csv_path in (path_to_training_csv, path_to_validation_csv)
)

class QuestionGenerationModel:
    """Fine-tunes the module-level T5 model and saves it with its tokenizer."""

    @staticmethod
    def train():
        """Train for one epoch with batch size 4, then save model and tokenizer.

        Uses the module-level ``model``, ``tokenizer``, ``training_dataset``
        and ``validation_dataset``; writes to ``path_to_model`` and
        ``path_to_tokenizer``.
        """
        hyper_parameters = argparse.Namespace(batch_size=4)
        tuned_model = ModelTuner(model, tokenizer, hyper_parameters, training_dataset, validation_dataset)

        # Fall back to CPU when no CUDA device is visible instead of failing
        # while the Trainer is being set up.
        gpus = 1 if torch.cuda.is_available() else 0
        model_trainer = pl.Trainer(max_epochs=1, gpus=gpus, progress_bar_refresh_rate=30)
        model_trainer.fit(tuned_model)

        tuned_model.model.save_pretrained(path_to_model)
        tokenizer.save_pretrained(path_to_tokenizer)

Some weights of the model checkpoint at t5-base were not used when initializing T5ForConditionalGeneration: ['decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight']
- This IS expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing T5ForConditionalGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1000 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (596 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/1000 [00:00<?, ?it/s]



In [9]:
# Fine-tunes the model, then saves the model and tokenizer to the Drive folders.
QuestionGenerationModel.train()

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: None, using: 0 TPU cores
INFO:lightning:TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params
INFO:lightning:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 222 M 
-----------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# Testing Model

In [13]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer
from google.colab import drive
# Mount Drive so the saved artefacts are reachable.
drive.mount('/content/gdrive')

# Folders written by the training step.
path_to_tokenizer = "/content/gdrive/My Drive/t5_model/tokenizer/"
path_to_model = "/content/gdrive/My Drive/t5_model/model/"

# Reload the saved tokenizer and model.
tokenizer = T5Tokenizer.from_pretrained(path_to_tokenizer)
model = T5ForConditionalGeneration.from_pretrained(path_to_model)

# Use the GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device: ", device)
model = model.to(device)

class TestModel:
    """Manual smoke test for the fine-tuned question-generation model."""

    @staticmethod
    def test_question_generation(context, answer, max_length=72, num_beams=5, num_return_sequences=3):
        """Generate, print and return questions for ``answer`` given ``context``.

        Uses the module-level ``tokenizer``, ``model`` and ``device``.

        Args:
            context: Passage the question should be about.
            answer: Answer span the generated question should lead to.
            max_length: Maximum length passed to ``model.generate``.
            num_beams: Beam width passed to ``model.generate``.
            num_return_sequences: Number of questions to return. NOTE(review):
                beam search is expected to need this to be <= ``num_beams``;
                confirm against the installed transformers version.

        Returns:
            The decoded questions as a list of strings, in the order returned
            by ``model.generate``.
        """
        text = f"context: {context} answer: {answer} </s>"

        encoding = tokenizer.encode_plus(text, max_length=512, padding=True, return_tensors="pt")
        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        model.eval()
        beam_search_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            early_stopping=True,
            num_beams=num_beams,
            num_return_sequences=num_return_sequences,
        )

        questions = [
            tokenizer.decode(beam_output, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for beam_output in beam_search_output
        ]
        # Keep printing for interactive use; the list is also returned so
        # callers can inspect the questions programmatically.
        for question in questions:
            print(question)
        return questions

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Device:  cuda


In [14]:
# Example run: ask the model for questions whose answer is "Donald Trump" in this one-sentence context.
context = "President Donald Trump said and predicted that some states would reopen this month."
answer = "Donald Trump"
TestModel.test_question_generation(context, answer)



question: Who predicted that some states would reopen this month?
question: Who said and predicted that some states would reopen this month?
question: Who predicted that states would reopen this month?
