# NLP Final Project - Question Generation Module

In [24]:
# Report the GPU attached to this runtime (model, driver/CUDA version, memory use).
!nvidia-smi

Tue Mar 29 17:41:52 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [25]:
!pip install --quiet transformers==4.3.0
!pip install --quiet pytorch-lightning==1.2.10
!pip install --quiet tokenizers==0.10.3
!pip install --quiet sentencepiece==0.1.94

In [26]:
from typing import List, Dict
import tqdm.notebook as tq
from tqdm.notebook import tqdm
import json
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer
    )

In [27]:
# Fix the global random seed through Lightning so repeated runs are comparable.
pl.seed_everything(20)

Global seed set to 20


20

In [28]:
# Load data
# Read the raw train/dev JSON files into DataFrames; the nested records under
# their 'data' column are flattened afterwards by JsonToDF.
# NOTE(review): the file names suggest SQuAD v1.1 -- confirm both files are in the working directory.
train = pd.read_json('train-v1.1.json') 
dev = pd.read_json('dev-v1.1.json') 

In [29]:
# Handle the JSON files:
def JsonToDF(dataset):
    """Flatten a SQuAD-style nested structure into one row per question.

    Args:
        dataset: Mapping or DataFrame whose ``'data'`` entry iterates over
            articles. Each article provides a ``'title'`` and a list of
            ``'paragraphs'``; each paragraph provides a ``'context'`` and a
            list of ``'qas'``; each qa provides a ``'question'`` and a list of
            ``'answers'`` whose items hold ``'text'`` and ``'answer_start'``.

    Returns:
        pd.DataFrame with the columns ``title``, ``context``, ``question``,
        ``answer`` and ``answerStart``. Only the first listed answer of each
        question is kept. A question whose ``'answers'`` list is empty gets
        ``None`` for ``answer`` and ``answerStart`` (instead of raising
        ``IndexError``), so a later ``dropna()`` can discard it.
    """
    columns = ['title', 'context', 'question', 'answer', 'answerStart']
    # Placeholder used when a question carries no answers at all.
    no_answer = {'text': None, 'answer_start': None}

    rows = []
    for article in dataset['data']:
        title = article['title']
        for paragraph in article['paragraphs']:
            para = paragraph['context']
            for qas in paragraph['qas']:
                answers = qas['answers']
                first = answers[0] if answers else no_answer
                rows.append([title, para, qas['question'], first['text'], first['answer_start']])

    return pd.DataFrame(data=rows, columns=columns)

In [30]:
# Creating the data frames from the JSON files
# Each raw split is flattened with JsonToDF and rows with missing values are
# discarded in the same chain. `dev` is rebound here from the raw JSON frame to
# the flattened frame, so this cell expects the raw frames from the load cell.
train_df = JsonToDF(train).dropna()
dev = JsonToDF(dev).dropna()

In [31]:
# Leaving relevant data columns
# DataFrame.drop returns a new frame, so the result has to be assigned back for
# the columns to actually disappear from train_df / dev. errors='ignore' keeps
# the cell safe to re-run once the columns are already gone.
train_df = train_df.drop(columns=['title', 'answerStart'], errors='ignore')
dev = dev.drop(columns=['title', 'answerStart'], errors='ignore')
dev

Unnamed: 0,context,question,answer
0,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos
1,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,Carolina Panthers
2,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"Santa Clara, California"
3,Super Bowl 50 was an American football game to...,Which NFL team won Super Bowl 50?,Denver Broncos
4,Super Bowl 50 was an American football game to...,What color was used to emphasize the 50th anni...,gold
...,...,...,...
10565,"The pound-force has a metric counterpart, less...",What is the metric term less used than the New...,kilogram-force
10566,"The pound-force has a metric counterpart, less...",What is the kilogram-force sometimes reffered ...,kilopond
10567,"The pound-force has a metric counterpart, less...",What is a very seldom used unit of mass in the...,slug
10568,"The pound-force has a metric counterpart, less...",What seldom used term of a unit of force equal...,kip


In [32]:
# Show how many unique values in each column in dev and train dataset
print('total_length', len(train_df))
for column in ('question', 'context', 'answer'):
    print(f'train_{column}', len(train_df[column].unique()))
print()
for column in ('question', 'context', 'answer'):
    print(f'dev_{column}', len(dev[column].unique()))

total_length 87599
train_question 87355
train_context 18891
train_answer 65134

dev_question 10539
dev_context 2067
dev_answer 8827


In [34]:
# Creating test set and print size of datasets
# Rows before the cut-off form the training split; everything after it is held
# out as the test split. `train` is rebound from the raw JSON frame here.
split_at = 77000
train = train_df.iloc[:split_at]
test = train_df.iloc[split_at:]

print(len(dev), len(train), sep='\n')


10570
77000


In [35]:
# Defining tokenizer
# Load the pretrained 't5-small' tokenizer (T5Tokenizer is the import alias of T5TokenizerFast).
tokenizer = T5Tokenizer.from_pretrained('t5-small')

In [36]:
# Class for creating the tokenized data sets
class QuesGenDataset(Dataset):
    """Map-style dataset that tokenizes (answer, context, question) rows.

    The encoder input is the pair ``(answer, context)``; the target is the
    ``question``. Tokenization happens lazily, one row per ``__getitem__``.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: T5Tokenizer,
        source_max_token_len: int = 500,
        target_max_token_len: int = 30
        ):
        """Store the frame, the tokenizer and the padding/truncation lengths.

        Args:
            data: Frame with ``'answer'``, ``'context'`` and ``'question'`` columns.
            tokenizer: Tokenizer used for both the source and the target text.
            source_max_token_len: Fixed length of the encoded (answer, context) pair.
            target_max_token_len: Fixed length of the encoded question.
        """
        self.tokenizer = tokenizer
        self.data = data
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize the row at position ``index``.

        Returns:
            dict with the raw ``answer``/``context``/``question`` values plus
            1-D ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        """
        data_row = self.data.iloc[index]

        # Use the tokenizer handed to the constructor rather than a notebook
        # global, so the dataset works with whichever tokenizer it was given.
        source_encoding = self.tokenizer(
            data_row['answer'],
            data_row['context'],
            max_length=self.source_max_token_len,
            padding='max_length',
            # Only the context (second sequence) may be shortened; the answer is kept whole.
            truncation='only_second',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
            )

        target_encoding = self.tokenizer(
            data_row['question'],
            max_length=self.target_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
            )

        labels = target_encoding['input_ids']
        # Padding positions are set to -100 so the loss ignores them. The pad id
        # is read from the tokenizer instead of being hard-coded as 0.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return dict(
            answer=data_row['answer'],
            context=data_row['context'],
            question=data_row['question'],
            input_ids=source_encoding['input_ids'].flatten(),
            attention_mask=source_encoding['attention_mask'].flatten(),
            labels=labels.flatten()
            )

In [37]:
# Module class for later training
class QuesGenDataModule(pl.LightningDataModule):
    """Lightning data module bundling the train / validation / test splits."""

    def __init__(
        self,
        train_df: pd.DataFrame,
        val_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: T5Tokenizer,
        batch_size: int = 16,
        source_max_token_len: int = 500,
        target_max_token_len: int = 30
        ):
        """Keep the three frames and the tokenization settings.

        Args:
            train_df: Rows used for training.
            val_df: Rows used for validation.
            test_df: Rows used for testing.
            tokenizer: Tokenizer forwarded to every QuesGenDataset.
            batch_size: Batch size of the training loader only.
            source_max_token_len: Forwarded to QuesGenDataset.
            target_max_token_len: Forwarded to QuesGenDataset.
        """
        super().__init__()
        self.batch_size = batch_size
        self.train_df = train_df
        self.val_df = val_df
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def setup(self, stage=None):
        """Create the three datasets.

        Args:
            stage: Stage name supplied by the Lightning trainer when it invokes
                this hook itself. It is accepted so that such a call does not
                fail with a TypeError; all three datasets are built regardless
                of its value, and calling ``setup()`` with no argument still
                works as before.
        """
        self.train_dataset = self._make_dataset(self.train_df)
        self.val_dataset = self._make_dataset(self.val_df)
        self.test_dataset = self._make_dataset(self.test_df)

    def _make_dataset(self, frame):
        """Wrap one frame in a QuesGenDataset using the shared settings."""
        return QuesGenDataset(frame, self.tokenizer, self.source_max_token_len, self.target_max_token_len)

    def train_dataloader(self):
        """Shuffled loader over the training dataset."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Loader over the validation dataset, one example per batch."""
        return DataLoader(self.val_dataset, batch_size=1, num_workers=2)

    def test_dataloader(self):
        """Loader over the test dataset, one example per batch."""
        return DataLoader(self.test_dataset, batch_size=1, num_workers=2)

In [38]:
# Hyperparameters
# Pretrained checkpoint loaded by QuesGenModel.
MODEL_NAME = 't5-small'
# len(tokenizer); QuesGenModel passes this to resize_token_embeddings.
TOKENIZER_LEN = len(tokenizer)
# Fixed token lengths for the encoded (answer, context) input and the target question.
SOURCE_MAX_TOKEN_LEN = 500
TARGET_MAX_TOKEN_LEN = 30

N_EPOCHS = 10
# Applies to the training loader; the validation/test loaders use batch_size=1.
BATCH_SIZE = 16
LEARNING_RATE = 0.0001

# Build the data module and create its datasets up front.
data_module = QuesGenDataModule(train, dev, test, tokenizer, BATCH_SIZE, SOURCE_MAX_TOKEN_LEN, TARGET_MAX_TOKEN_LEN)
data_module.setup()

In [39]:
# The model class
class QuesGenModel(pl.LightningModule):
    """Lightning wrapper around T5ForConditionalGeneration for question generation."""

    def __init__(self):
        """Load the pretrained MODEL_NAME checkpoint and resize its embeddings to TOKENIZER_LEN."""
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, return_dict=True)
        self.model.resize_token_embeddings(TOKENIZER_LEN)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        Returns:
            Tuple ``(loss, logits)`` taken from the model output.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output.loss, output.logits

    def _loss_step(self, batch, metric_name):
        """Compute the loss for one batch and log it under ``metric_name``.

        Shared by the training, validation and test steps, which differ only in
        the name of the logged metric.
        """
        loss, _ = self(batch['input_ids'], batch['attention_mask'], batch['labels'])
        self.log(metric_name, loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss; logged as 'train_loss'."""
        return self._loss_step(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss; logged as 'val_loss'."""
        return self._loss_step(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        """Return the test loss; logged as 'test_loss'."""
        return self._loss_step(batch, 'test_loss')

    def configure_optimizers(self):
        """Use AdamW over all parameters with the notebook-level LEARNING_RATE."""
        return AdamW(self.parameters(), lr=LEARNING_RATE)
In [17]:
# Mount google drive
# Colab-only: makes Drive available under /content/drive, where the checkpoint
# paths used below are located. force_remount=True remounts if it is already mounted.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [32]:
# Saving the best model
# Keep a single checkpoint: the one with the lowest 'val_loss', which is the
# metric name logged in QuesGenModel.validation_step.
checkpoint_callback = ModelCheckpoint(
    dirpath='/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th',
    filename='best-checkpoint',
    save_top_k=1,
    verbose=True,
    monitor='val_loss',
    mode='min'
)



In [33]:
# Define trainer
# The ModelCheckpoint instance is registered through `callbacks`; handing an
# instance to `checkpoint_callback=` is deprecated in the pinned
# pytorch-lightning release (that argument is meant to be a boolean switch).
trainer = pl.Trainer(
        callbacks=[checkpoint_callback],
        max_epochs=N_EPOCHS,
        gpus=1,
        progress_bar_refresh_rate=30
    )

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [None]:
# Training the model
# Instantiate a fresh model and fit it on the loaders supplied by data_module.
model = QuesGenModel()
trainer.fit(model, data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
241.969   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Epoch 0, global step 4812: val_loss reached 1.74085 (best 1.74085), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 1, global step 9625: val_loss reached 1.69160 (best 1.69160), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 2, global step 14438: val_loss reached 1.66543 (best 1.66543), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 3, global step 19251: val_loss reached 1.66013 (best 1.66013), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 4, global step 24064: val_loss reached 1.65751 (best 1.65751), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 5, global step 28877: val_loss reached 1.65342 (best 1.65342), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 6, global step 33690: val_loss reached 1.65117 (best 1.65117), saving model to "/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v4.ckpt" as top 1


Validating: 0it [00:00, ?it/s]

Epoch 7, step 38503: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

Epoch 8, step 43316: val_loss was not in top 1


Validating: 0it [00:00, ?it/s]

In [None]:
# Test
# Called without arguments, so this relies on the trainer state left by the
# preceding fit cell. NOTE(review): confirm which weights/data it evaluates.
trainer.test()

In [18]:
# Loading the best model from google drive
# NOTE(review): the training log in this notebook reports saving
# 'best-checkpoint-v4.ckpt', while this cell loads 'best-checkpoint-v2.ckpt'
# -- confirm this is the intended file.
checkpoint_path = '/content/drive/MyDrive/NLP Final Project/Model Checkpoints/4th/best-checkpoint-v2.ckpt'

best_model = QuesGenModel.load_from_checkpoint(checkpoint_path)
# The restored model is only used for generation from here on.
best_model.freeze()
best_model.eval()

# Presumably here so the cell ends on an empty line of output rather than a large repr.
print()

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/242M [00:00<?, ?B/s]




In [40]:
# Generating question by answer and context
def generate_question(answer, context, best_model):
    """Generate a question whose answer is ``answer`` within ``context``.

    Args:
        answer: Answer text; encoded as the first sequence and never truncated.
        context: Passage containing the answer; truncated if the pair exceeds
            SOURCE_MAX_TOKEN_LEN tokens.
        best_model: Object exposing the generation model as ``best_model.model``.

    Returns:
        str: The decoded sequences joined in generation order (a single
        question for the single input encoded here).
    """
    source_encoding = tokenizer(
        answer,
        context,
        max_length=SOURCE_MAX_TOKEN_LEN,
        padding='max_length',
        truncation='only_second',
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )

    # The tokenizer returns CPU tensors; move them to wherever the model lives
    # so generation also works when the model has been placed on a GPU.
    device = best_model.model.device

    generated_ids = best_model.model.generate(
        input_ids=source_encoding['input_ids'].to(device),
        attention_mask=source_encoding['attention_mask'].to(device),
        num_beams=1,
        max_length=TARGET_MAX_TOKEN_LEN,
        repetition_penalty=3.5,
        length_penalty=0.95,
        early_stopping=True,
        use_cache=True
    )

    # A list (not a set) keeps the output order deterministic and does not
    # silently drop identical sequences.
    preds = [
        tokenizer.decode(generated_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for generated_id in generated_ids
    ]

    return ''.join(preds)

In [41]:
# Manual check of the model on a hand-written context (a script cell, not a function).
context = "Prepared by experienced English teachers, the texts, articles and conversations are brief and appropriate to your level of proficiency. Take the multiple-choice quiz following each text, and you'll get the results immediately. You will feel both challenged and accomplished! You can even download (as PDF) and print the texts and exercises. It's enjoyable, fun and free. Good luck!."
# Candidate answers to generate one question each for.
answers = ['pdf']

for ans in answers:

    generated = generate_question(ans, context, best_model)
    print('generated question: ', generated)
    print('from answer: ', ans)
    print()

generated question:  What format can you download and print?
from answer:  pdf



In [42]:
# Print result
def result(generated, context, answer, question: str = ''):
    """Print a generated question alongside its reference, answer and context.

    Args:
        generated: Question produced by the model.
        context: Passage the question was generated from.
        answer: Answer the question was generated for.
        question: Reference question to compare against; empty by default.

    Returns:
        None. The report is written to standard output, followed by two blank
        lines to separate consecutive reports.
    """
    print('Generated question: ', generated)
    print('SQuAD question: ', question)
    print()
    print('Answer: ', answer)
    # Label spelled correctly (it used to be printed as 'Conext').
    print('Context: ', context)
    print()
    print()

In [22]:
# Script for showing result of test
for i in range(len(test.iloc[:10])):  # Showing 10 examples
    # Fetch the row once and read the fields needed for generation from it.
    row = test.iloc[i]
    context, answer = row['context'], row['answer']

    generated = generate_question(answer, context, best_model)

    result(generated, context, answer, row['question'])

Generated question:  What percentage of Armenia's population would be in favor for joining the EU?
SQuAD question:  How many of Armenias inhabitants approve of becoming part of the EU?

Answer:  64%
Conext:  Armenia is also a member of the Council of Europe, maintaining friendly relations with the European Union, especially with its member states such as France and Greece. A 2005 survey reported that 64% of Armenia's population would be in favor of joining the EU. Several Armenian officials have also expressed the desire for their country to eventually become an EU member state, some[who?] predicting that it will make an official bid for membership in a few years.[citation needed] In 2004 its forces joined KFOR, a NATO-led international force in Kosovo. It is also an observer member of the Eurasian Economic Community and the Non-Aligned Movement.


Generated question:  What is KFOR?
SQuAD question:  What is KFOR?

Answer:  a NATO-led international force in Kosovo
Conext:  Armenia is al

In [22]:
# Input from answer and text module, creating output for Distractor module
def ans_text(input_df):
    """Generate one question per (context, answer) row.

    Args:
        input_df: Frame with ``'context'`` and ``'answer'`` columns.

    Returns:
        pd.DataFrame with the columns ``context``, ``answer`` and ``question``,
        one row per input row and in the same order. Questions come from
        ``generate_question`` using the notebook-level ``best_model``.
    """
    contexts, answers, questions = [], [], []

    for context, answer in zip(input_df['context'], input_df['answer']):
        questions.append(generate_question(answer, context, best_model))
        contexts.append(context)
        answers.append(answer)

    return pd.DataFrame({'context': contexts, 'answer': answers, 'question': questions})

In [None]:
# Loading an input for the model from first module
# ans_text (defined above) reads the 'context' and 'answer' columns of this frame.
context_ans = pd.read_csv('contextAnswer_df2.csv')
context_ans

In [None]:
# Creating a csv for the distractors module
# Generate a question for every row, then write context/answer/question to disk.
df = ans_text(context_ans)
df.to_csv('Avitay_df2.csv')

# Metric

In [None]:
# Create data frame [question][generated 0/1] 0 if its original question, 1 if generated for distilled-bert-cased model
def metric_df(dataset):
    """Pair every reference question with a generated one and label the source.

    Args:
        dataset: Frame with ``'answer'``, ``'context'`` and ``'question'`` columns.

    Returns:
        pd.DataFrame with the columns ``question`` and ``generated``. Each input
        row contributes two consecutive output rows: the original question with
        ``generated == 0`` followed by the model's question with
        ``generated == 1``.
    """
    questions, flags = [], []

    for row_idx in range(len(dataset)):
        row = dataset.iloc[row_idx]
        answer, context, original = row['answer'], row['context'], row['question']
        produced = generate_question(answer, context, best_model)

        # Original first, generated second, to keep the pair adjacent.
        questions.extend([original, produced])
        flags.extend([0, 1])

    return pd.DataFrame({'question': questions, 'generated': flags})

In [None]:
# Creating dataset to check generated questions with distilled-bert-cased model
# Generates a question for every dev row; the printed length is twice the
# number of dev rows (one original plus one generated question per row).
sample_df = metric_df(dev)
print(len(sample_df))

In [None]:
# csv for distilled-bert-cased model
# Write the labelled question pairs to disk for the downstream model.
sample_df.to_csv('metric_df.csv')