In [None]:
%%capture
!pip install datasets
!pip install simpletransformers

In [None]:
from simpletransformers.t5 import T5Model

In [None]:
# Fetch the SQuAD dataset; the result is indexed by split name further down
# ('train' and 'validation').
from datasets import load_dataset

dataset_name = 'squad'
raw_datasets = load_dataset(dataset_name)

Downloading:   0%|          | 0.00/1.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad/plain_text (download: 33.51 MiB, generated: 85.63 MiB, post-processed: Unknown size, total: 119.14 MiB) to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/6b6c4172d0119c74515f44ea0b8262efe4897f2ddb6613e5e915840fdc309c16...


Downloading:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad downloaded and prepared to /root/.cache/huggingface/datasets/squad/plain_text/1.0.0/6b6c4172d0119c74515f44ea0b8262efe4897f2ddb6613e5e915840fdc309c16. Subsequent calls will reuse this data.


In [None]:
import pandas as pd

In [None]:
# Converting the training split into a list of records (turned into a DataFrame later).
# Each record holds:
#   input_text  - the context followed by " answer: " and the first reference answer
#   target_text - the question the model should learn to generate
#   prefix      - the task name, identical for every record
# The split is iterated once, so every example is read a single time instead of
# being looked up by index three times per record.
train_list = [
    {
        'input_text': '%s answer: %s' % (example['context'], example['answers']['text'][0]),
        'target_text': example['question'],
        'prefix': 'generate_question',
    }
    for example in raw_datasets['train']
]

In [None]:
# Converting the validation split into a list of records (turned into a DataFrame later).
# Each record holds:
#   input_text  - the context followed by " answer: " and the first reference answer
#   target_text - the reference question
#   prefix      - the task name, identical for every record
# The split is iterated once, so every example is read a single time instead of
# being looked up by index three times per record.
validation_list = [
    {
        'input_text': '%s answer: %s' % (example['context'], example['answers']['text'][0]),
        'target_text': example['question'],
        'prefix': 'generate_question',
    }
    for example in raw_datasets['validation']
]

In [None]:
# Build one DataFrame per split from the record lists; each dict key
# (input_text, target_text, prefix) becomes a column.
train_df = pd.DataFrame(train_list)
validation_df = pd.DataFrame(validation_list)

In [None]:
# Preview the first rows of the training frame (input_text / target_text / prefix).
train_df.head()

Unnamed: 0,input_text,target_text,prefix
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,generate_question
1,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,generate_question
2,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,generate_question
3,"Architecturally, the school has a Catholic cha...",What is the Grotto at Notre Dame?,generate_question
4,"Architecturally, the school has a Catholic cha...",What sits on top of the Main Building at Notre...,generate_question


In [None]:
# Preview the first rows of the validation frame (input_text / target_text / prefix).
validation_df.head()

Unnamed: 0,input_text,target_text,prefix
0,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,generate_question
1,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,generate_question
2,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,generate_question
3,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,generate_question
4,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,generate_question


In [None]:
# Model training arguments
# NOTE(review): every input_text is built as "<context> answer: <answer>", so the
# answer sits at the very end of the sequence. With max_seq_length = 128, longer
# contexts are presumably truncated before the answer is reached - confirm against
# the library's tokenisation, and consider a larger limit or placing the answer
# first (that change must be applied to the training, validation and prediction
# inputs together).
# NOTE(review): in the run recorded below one epoch is 10950 steps, so the
# 15000-step evaluation interval is never reached within the single epoch.
model_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 128,
    "train_batch_size": 8,
    "num_train_epochs": 1,
    "save_eval_checkpoints": True,
    "save_steps": -1,
    "use_multiprocessing": False,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 15000,
    "evaluate_during_training_verbose": True,
    "fp16": False,
    # Fixed seed so repeated runs of the notebook are comparable.
    "manual_seed": 42,
    "special_tokens_list": ["answer"],
    "wandb_project": "MCQ Generation Using T5 and Squad",
}

# Start from the pretrained t5-base checkpoint.
model = T5Model("t5", "t5-base", args=model_args)

# Fine-tune on the training frame and evaluate on the validation frame.
# Kept as the last expression so the returned (global_step, metrics) tuple is displayed.
model.train_model(train_df, eval_data=validation_df)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

  0%|          | 0/87599 [00:00<?, ?it/s]



Using Adafactor for T5


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 1:   0%|          | 0/10950 [00:00<?, ?it/s]

  0%|          | 0/10570 [00:00<?, ?it/s]



(10950,
 {'eval_loss': [2.23038373686903],
  'global_step': [10950],
  'train_loss': [1.927558422088623]})

In [None]:
# Re-create the model from the checkpoint stored under outputs/best_model,
# reusing the same argument dictionary as for training.
best_model_dir = "outputs/best_model"
model = T5Model("t5", best_model_dir, args=model_args)

In [None]:
# Sample testing question generation
# The prompt is assembled with the same '%s answer: %s' layout used for the
# training records, preceded by the "generate_question: " task prefix, so the
# model sees the shape of input it was fine-tuned on (single spaces, no embedded
# newline or indentation). The answer is spelled exactly as it appears in the context.
sample_context = (
    "In 1971, George Lucas wanted to film an adaptation of the Flash Gordon serial, "
    "but could not obtain the rights, so he began developing his own space opera."
)
sample_answer = "George Lucas"

to_predict = [
    "generate_question: " + '%s answer: %s' % (sample_context, sample_answer),
]

predictions = model.predict(to_predict)

In [None]:
# Show the generated question(s), one list entry per prompt in to_predict.
predictions

['Who wanted to film a film of a film in 1971?']

In [None]:
# Compressing the best model
!tar -zcvf model_with_key.tar.gz outputs/best_model/

outputs/best_model/
outputs/best_model/training_args.bin
outputs/best_model/spiece.model
outputs/best_model/optimizer.pt
outputs/best_model/eval_results.txt
outputs/best_model/config.json
outputs/best_model/tokenizer_config.json
outputs/best_model/pytorch_model.bin
outputs/best_model/scheduler.pt
outputs/best_model/special_tokens_map.json
outputs/best_model/model_args.json
outputs/best_model/added_tokens.json


In [None]:
# Mount Google Drive inside the Colab filesystem so files can be copied to it.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Saving the compressed model to drive
!cp model_with_key.tar.gz /content/drive/MyDrive/Models