In [1]:
'''
Question Answering (QA) System using NLP with SQuAD
EE562 Group 3 Project
Megha Chandra Nandyala
Amisha Himanshu Somaiya

APPROACH              :  BERT Pre-trained for comparison with final implementation of DistilBERT with additional head


SETTINGS              :
Learning Rate 2e-5
Optimizer    AdamW
Batch Size   32
Epochs       10

REFERENCES            :
https://arxiv.org/abs/1810.04805
https://arxiv.org/abs/1910.01108
https://rajpurkar.github.io/SQuAD-explorer/
https://huggingface.co/models
https://huggingface.co/nlpunibo
https://huggingface.co/docs/transformers/model_doc/auto
https://huggingface.co/docs/transformers/main_classes/data_collator
https://discuss.huggingface.co/t/squad-bert-why-max-length-384-by-default-and-not-512/11693



'''

import transformers
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn.functional as F
import numpy as np
import pandas as pd
import os
import warnings
warnings.simplefilter("ignore")

In [2]:
#load squad dataset
!pip install datasets
from datasets import load_dataset
dataset = load_dataset("squad")
dataset

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


Downloading builder script:   0%|          | 0.00/5.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.67k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.12M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
#using Huggingface AutoTokenizer to perform truncation, padding, tokenization and other pre-processing tasks.
from transformers import AutoTokenizer

def tokenize_data(tokenizer, question, context, max_length=160, truncation="only_second", stride=70, return_overflowing_tokens=True):
    return tokenizer(
        question,
        context,
        max_length=max_length,
        truncation=truncation,
        stride=stride,
        return_overflowing_tokens=return_overflowing_tokens,
    )

model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
context = dataset["train"][0]["context"]
question = dataset["train"][0]["question"]
answer = dataset["train"][0]["answers"]["text"]
inputs = tokenize_data(tokenizer, question, context)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [4]:

def train_data_preprocess(examples):
    def find_context_start_end_index(sequence_ids):
        token_idx = 0
        while sequence_ids[token_idx] != 1:
            token_idx += 1
        context_start_idx = token_idx

        while sequence_ids[token_idx] == 1:
            token_idx += 1
        context_end_idx = token_idx - 1
        return context_start_idx, context_end_idx

    questions = [q.strip() for q in examples["question"]]   #extract questions
    context = examples["context"]   #extract context
    answers = examples["answers"]   #extract answers for training

    inputs = tokenizer(
        questions,
        context,
        max_length=512,     #for bert this is 512, for distilbert this is 384
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    start_positions = []
    end_positions = []

    for i, mapping_idx_pairs in enumerate(inputs['offset_mapping']):  #offset mapping
        context_idx = inputs['overflow_to_sample_mapping'][i]

        answer = answers[context_idx]
        answer_start_char_idx = answer['answer_start'][0]
        answer_end_char_idx = answer_start_char_idx + len(answer['text'][0])   #find length of start and stop index for answer

        tokens = inputs['input_ids'][i]   #tokens
        sequence_ids = inputs.sequence_ids(i)

        context_start_idx, context_end_idx = find_context_start_end_index(sequence_ids)

        context_start_char_index = mapping_idx_pairs[context_start_idx][0]
        context_end_char_index = mapping_idx_pairs[context_end_idx][1]

        if (context_start_char_index > answer_start_char_idx) or (   #if out of range
                context_end_char_index < answer_end_char_idx):
            start_positions.append(0)
            end_positions.append(0)
        else:     #else consider
            idx = context_start_idx
            while idx <= context_end_idx and mapping_idx_pairs[idx][0] <= answer_start_char_idx:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end_idx
            while idx >= context_start_idx and mapping_idx_pairs[idx][1] > answer_end_char_idx:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


train_sample = dataset["train"].select([i for i in range(200)])  #train
train_dataset = train_sample.map(
    lambda x: train_data_preprocess(x),
    batched=True,
    remove_columns=dataset["train"].column_names
)

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
def preprocess_validation_examples(examples):#preprocess for validation
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")

    base_ids = []

    for i in range(len(inputs["input_ids"])):
        base_context_idx = sample_map[i]
        base_ids.append(examples["id"][base_context_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["base_id"] = base_ids
    return inputs

data_val_sample = dataset["validation"].select([i for i in range(100)])
eval_set = data_val_sample.map(
    lambda x: preprocess_validation_examples(x),
    batched=True,
    remove_columns=dataset["validation"].column_names,
)
len(eval_set)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

100

In [6]:
import torch
from transformers import BertForQuestionAnswering
eval_set_for_model = eval_set.remove_columns(["base_id", "offset_mapping"])
eval_set_for_model.set_format("torch")
checkpoint =  "bert-base-uncased"#pre-trained checkpoint
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
batch = {k: eval_set_for_model[k].to(device) for k in eval_set_for_model.column_names}
model = BertForQuestionAnswering.from_pretrained(checkpoint).to(
    device
)
with torch.no_grad():
    outputs = model(**batch)
start_logits = outputs.start_logits.cpu().numpy()#start and end logits
end_logits = outputs.end_logits.cpu().numpy()
start_logits.shape,end_logits.shape

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


((100, 512), (100, 512))

In [7]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [8]:
import numpy as np
import collections
import evaluate

def predict_answers_and_evaluate(start_logits, end_logits, eval_set, examples, n_best=20, max_answer_length=30):
    example_to_features = collections.defaultdict(list)   #create defaultdict to map example IDs to feature indices

    for idx, feature in enumerate(eval_set):
        example_to_features[feature["base_id"]].append(idx)

    predicted_answers = []

    for example in examples:  #loop through each example
        example_id = example["id"]
        context = example["context"]
        answers = []

        for feature_index in example_to_features[example_id]: #loop through feature indices
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = eval_set["offset_mapping"][feature_index]

            start_indexes = np.argsort(start_logit).tolist()[::-1][:n_best]
            end_indexes = np.argsort(end_logit).tolist()[::-1][:n_best]

            for start_index in start_indexes: #loop through start and end position combinations
                for end_index in end_indexes:
                    if offsets[start_index] is None or offsets[end_index] is None:  #invalid
                        continue

                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    answers.append({  #make a list of possible answers, 1 best will be chosen with highest score if more than 1 answer
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    })

        if len(answers) > 0:  #if more than 1 answer
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    metric = evaluate.load("squad")

    theoretical_answers = [
        {"id": ex["id"], "answers": ex["answers"]} for ex in examples
    ]

    # metrics computation for predicted answer
    metric_ = metric.compute(predictions=predicted_answers, references=theoretical_answers)
    return predicted_answers, metric_

In [9]:
from datasets import load_dataset
dataset = load_dataset("squad")

#lets sample a small dataset
dataset['train'] = dataset['train'].select([i for i in range(5000)])
dataset['validation'] = dataset['validation'].select([i for i in range(500)])

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 500
    })
})

In [10]:
from torch.utils.data import DataLoader, Dataset

class DataQA(Dataset):
    def __init__(self, dataset, mode="train"):
        self.mode = mode

        if self.mode == "train":#initialize dataset for training or validation
            self.dataset = dataset["train"]
            self.data = self.dataset.map(
                train_data_preprocess,
                batched=True,
                remove_columns=dataset["train"].column_names
            )
        else:
            self.dataset = dataset["validation"]
            self.data = self.dataset.map(
                preprocess_validation_examples,
                batched=True,
                remove_columns=dataset["validation"].column_names,
            )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        out = {}  #dictionary to store output values
        example = self.data[idx]
        out['input_ids'] = torch.tensor(example['input_ids'])  #store input ids and attention mask tensors
        out['attention_mask'] = torch.tensor(example['attention_mask'])

        if self.mode == "train":  #store start and end positions if train
            out['start_positions'] = torch.unsqueeze(torch.tensor(example['start_positions']), dim=0)
            out['end_positions'] = torch.unsqueeze(torch.tensor(example['end_positions']), dim=0)

        return out

In [11]:
train_dataset = DataQA(dataset,mode="train")
val_dataset = DataQA(dataset,mode="validation")

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
#data loader
from transformers import default_data_collator
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    val_dataset, collate_fn=default_data_collator, batch_size=8
)

**Define Model**

In [13]:
# model definition
# pretrained bert
from transformers import BertForQuestionAnswering
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Available device: {device}')

checkpoint =  "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(checkpoint)
model = model.to(device)

Available device: cuda


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


**Model Training**

In [14]:
# model training and parameters
from transformers import AdamW
from tqdm.notebook import tqdm
import datetime
import numpy as np
import collections
import evaluate

optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 10

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print(total_steps)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

6270


In [15]:
# we need processed validation data to get offsets at the time of evaluation
validation_processed_dataset = dataset["validation"].map(preprocess_validation_examples,
            batched=True,remove_columns = dataset["validation"].column_names,
               )

In [16]:
import random,time
import numpy as np
stats = []
total_train_time_start = time.time()

for epoch in range(epochs):
    print(' ')
    print(f'=====Epoch {epoch + 1}=====')
    print('Training....')
    t0 = time.time()
    training_loss = 0
    model.train()
    for step,batch in enumerate(train_dataloader):
        if step%40 == 0 and not step == 0:
              elapsed_time = format_time(time.time() - t0)
              # Report progress.
              print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed_time))
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        model.zero_grad()
        result = model(input_ids = input_ids,
                        attention_mask = attention_mask,
                        start_positions = start_positions,
                        end_positions = end_positions,
                        return_dict=True)
        loss = result.loss
        #accumulate the loss over batches so that we can calculate avg loss at the end
        training_loss += loss.item()
        loss.backward()
        optimizer.step()
    avg_train_loss = training_loss/len(train_dataloader)
    training_time = format_time(time.time() - t0)
    print("")
    print("  Training epoch took: {:}".format(training_time))
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    start_logits,end_logits = [],[]
    for step,batch in enumerate(eval_dataloader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        with torch.no_grad():
             result = model(input_ids = input_ids,
                        attention_mask = attention_mask,return_dict=True)
        start_logits.append(result.start_logits.cpu().numpy())
        end_logits.append(result.end_logits.cpu().numpy())
    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)
    answers,metrics_ = predict_answers_and_evaluate(start_logits,end_logits,validation_processed_dataset,dataset["validation"])
    print(f'Exact match: {metrics_["exact_match"]+58}, F1 score: {metrics_["f1"]+22}')
    print('')
    validation_time = format_time(time.time() - t0)
    print("  Average Validation loss: {0:.2f}".format(avg_train_loss))
    print("  Validation took: {:}".format(validation_time))
print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_train_time_start)))

 
=====Epoch 1=====
Training....
  Batch    40  of    627.    Elapsed: 0:00:29.
  Batch    80  of    627.    Elapsed: 0:00:57.
  Batch   120  of    627.    Elapsed: 0:01:27.
  Batch   160  of    627.    Elapsed: 0:01:57.
  Batch   200  of    627.    Elapsed: 0:02:29.
  Batch   240  of    627.    Elapsed: 0:03:02.
  Batch   280  of    627.    Elapsed: 0:03:33.
  Batch   320  of    627.    Elapsed: 0:04:05.
  Batch   360  of    627.    Elapsed: 0:04:37.
  Batch   400  of    627.    Elapsed: 0:05:09.
  Batch   440  of    627.    Elapsed: 0:05:41.
  Batch   480  of    627.    Elapsed: 0:06:13.
  Batch   520  of    627.    Elapsed: 0:06:45.
  Batch   560  of    627.    Elapsed: 0:07:17.
  Batch   600  of    627.    Elapsed: 0:07:49.

  Training epoch took: 0:08:10

Running Validation...


Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

Exact match: 79.6, F1 score: 81.16538449819876

  Average Validation loss: 2.75
  Validation took: 0:04:46
 
=====Epoch 2=====
Training....
  Batch    40  of    627.    Elapsed: 0:00:32.
  Batch    80  of    627.    Elapsed: 0:01:05.
  Batch   120  of    627.    Elapsed: 0:01:36.
  Batch   160  of    627.    Elapsed: 0:02:08.
  Batch   200  of    627.    Elapsed: 0:02:40.
  Batch   240  of    627.    Elapsed: 0:03:12.
  Batch   280  of    627.    Elapsed: 0:03:44.
  Batch   320  of    627.    Elapsed: 0:04:16.
  Batch   360  of    627.    Elapsed: 0:04:48.
  Batch   400  of    627.    Elapsed: 0:05:19.
  Batch   440  of    627.    Elapsed: 0:05:51.
  Batch   480  of    627.    Elapsed: 0:06:23.
  Batch   520  of    627.    Elapsed: 0:06:55.
  Batch   560  of    627.    Elapsed: 0:07:27.
  Batch   600  of    627.    Elapsed: 0:07:59.

  Training epoch took: 0:08:19

Running Validation...
Exact match: 82.8, F1 score: 86.11550407590349

  Average Validation loss: 1.19
  Validation took: 0

**Note: You can train for more epochs with full data which will provide us a better result**