# Question-Answering with BERT
## Goal

Train a distilBERT model for Question and Answering on a subset of the [SQuAD v2.0](https://rajpurkar.github.io/SQuAD-explorer/) dataset.

In [2]:
!pip install pulp
!pip install transformers
from google.colab import drive
drive.mount('/content/gdrive')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Mounted at /content/gdrive


## Environment

In [None]:
#provided code
import numpy as np
import torch
import pulp
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score

In [None]:
#provided code
squad_path = '/content/gdrive/MyDrive/QA/data/Squad'

## Initial Data Processing

In [None]:

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


def convert_to_BERT_tensors(questions, contexts):
    '''takes a parallel list of question strings and answer strings
    return: tensors of input_ids and attention masks
    '''
    stuff = tokenizer(questions, contexts, return_tensors='pt', truncation=True, padding=True)#max_length by default = 512
    # print(stuff)
    return stuff['input_ids'], stuff['attention_mask']


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
test_questions = ["Why?", "How?"]
test_contexts = ["I think it is because we can bluminate", "It was done"" ".join(["very"]*1000) + " well"]

ids, mask = convert_to_BERT_tensors(test_questions,test_contexts)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Success!


In [None]:
print(tokenizer.vocab['[CLS]'], tokenizer.vocab['why'], 
      tokenizer.vocab['?'], tokenizer.vocab['[SEP]'], tokenizer.vocab['i'], tokenizer.vocab['think'])

101 2339 1029 102 1045 2228


tensors of indices which correspond to the beginning and end of the answer span.

question + context, including the required special BERT tokens \[CLS\] and \[SEP\]).

Note, if the answer does not appear in the input, should set start and end indices both to zero. if one of the indices ends outside of 512, you should treat it as a failed match

EX: 

question is "Who is Bill"

context is "Bill is short for William Shakespeare, the writer of many plays, including 'Hamlet' and 'MacBeth' ."

answer is "William Shakespeare".  

Then you need to loop through the [CLS] + question + [SEP] + answer string, looking for the first bigram that matches "William Shakespeare", and return the span (in this case, [8,9]) 

In [None]:
def get_answer_span_tensor(question,context,answer):
    # your code here
    input_tokens = tokenizer.tokenize('[CLS] ' + question + ' [SEP] ' + context)
    answer_tokens = tokenizer.tokenize(answer)
    # print("input: ", input_tokens)
    # print("answer: ", answer_tokens)
    span_len = len(answer_tokens)
    # print("span length: ", span_len)
    for i in range(min(len(input_tokens) - span_len+1, 512 - span_len - 1)):
        if input_tokens[i:i+span_len] == answer_tokens:
            span = torch.tensor([i,i+span_len - 1])
            break
    else:
        span = torch.tensor([0,0])
        
    return span

In [None]:
test_question = "Why?"
test_context = "I think it is because we can bluminate"
test_answer = "because we can bluminate"
bad_answer  = "because we can fumiage"
span = get_answer_span_tensor(test_question,test_context,test_answer)
assert span.shape == (2,)
assert list(span) == [8,12]
span = get_answer_span_tensor(test_question,test_context,bad_answer)
assert list(span) == [0,0]

Success!


In [None]:
#provided code
batch_size = 16

class QAdataset(Dataset):
    '''A dataset for housing QA data, including input_data, output_data, and padding mask'''
    def __init__(self, input_data, output_data,mask):
        self.input_data = input_data
        self.output_data = output_data
        self.mask = mask
        
    def __len__(self):
        return len(self.input_data)
    
    def __getitem__(self, index):
        target = self.output_data[index]
        data_val = self.input_data[index]
        mask = self.mask[index]
        return data_val,target,mask 

In [None]:
def prepare_QA_dataset(split):
    
    # '''for split in "train", "dev", "test", perpares Pytorch dataset by reading the files and 
    # converting the data to tensors. For test, provides dummy answers'''    
    with open(squad_path + split + ".question", encoding="utf-8") as f:
        questions = f.readlines()
    with open(squad_path + split + ".context", encoding="utf-8") as f:
        contexts = f.readlines()    
    QA_input, masks = convert_to_BERT_tensors(questions, contexts)

    # only for train and dev; 
    if "train" == split or "dev" == split: 
        with open(squad_path + split + ".answer", encoding="utf-8") as f:
            answers = f.readlines()
            spans = []
            # based on the lenght of questions, `get_answer_span_tensor`
            for i in range(len(questions)):
                spans.append(get_answer_span_tensor(questions[i], contexts[i], answers[i]))
    else:
        spans = [torch.tensor([0,0])]*len(questions)
    return QAdataset(QA_input, spans, masks)


train_dataset = prepare_QA_dataset("train")
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
dev_dataset = prepare_QA_dataset('dev')
dev_dataloader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False)
test_dataset = prepare_QA_dataset('test')
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

## BERT Training 


Choose BERT QA model from Huggingface, uncased version (since there is a pre-tuned version on SQaD)


In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased').to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
print(device)

cuda:0


In [None]:
MODEL_PATH = "QA_BERT.bin" 
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [None]:
for dev_text_batch, dev_span_batch, masks in train_dataloader:
  print(mask)
  break

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])


In [None]:
predicted_starts = []
gold_starts = []
predicted_ends = []
gold_ends = []
model.eval()
with torch.no_grad():
    for dev_text_batch, dev_span_batch, masks in dev_dataloader:
        dev_text_batch, masks = dev_text_batch.to(device), masks.to(device)
        output = model(dev_text_batch,attention_mask=masks)

        # `start_scores` and `end_scores` from `start_logitic` and `end_logtis` of `output`
        start_scores = output.start_logits # batch x 512
        # print(start_scores)
        # print(start_scores.size())
        end_scores = output.end_logits # batch x 512

        # `target``from `dev_span_batch`
        targets = dev_span_batch
        # print("targets:",targets)

        # `extend` for 
        # `predicted_starts` and `predicted_ends`; 
        # argmax of start_scores and end_scores, and list
        predicted_starts.extend([i.item() for i in torch.argmax(start_scores, dim = 1)]) # each one: tensor(67, device='cuda:0'), use .item()
        # print("predicted:", predicted_starts)
        predicted_ends.extend([i.item() for i in torch.argmax(end_scores, dim = 1)])

        # # and `gold_starts` and `gold_ends`
        # # list of targets; 
        gold_starts.extend([i[0].item() for i in targets])
        gold_ends.extend([i[1].item() for i in targets])
        # print("gold_ends:", gold_ends)


print("Starts accuracy")
print(accuracy_score(gold_starts,predicted_starts))
print("Ends accuracy")
print(accuracy_score(gold_ends,predicted_ends))


Starts accuracy
0.6228220020498805
Ends accuracy
0.6592073795695251


In [None]:
def select_best_answer_span_v2(start_probs, end_probs, distance):
    '''given 2 matrices of probabilities associated with 
    indicies of a text being the start or end of an answer spans, respectively,
    finds the highest probability spans under the restriction that the end index must be no more 
    than distance after the start. Returns a list (start index, end index) 2-plues
    corresponding to the best solution for each row of start/end_probs'''
    best_starts = np.argsort(start_probs*-1, axis=1) # sort from largest to the smallest
    best_ends = np.argsort(end_probs*-1, axis=1)
    output_spans = []
    for i in range(len(start_probs)):
        step = 0
        found = False
        sorted_spans = []
        bound = 0
        while not found:
    
          # print("iter ", step)
          # for j in range(step + 1):
          #     print(j, step)
          #     print(step, j)
          
          sorted_spans.extend([(start_probs[i, best_starts[i,j]] + end_probs[i, best_ends[i,step]],         # start_probs[i, best_starts[i,j]] + end_probs[i,best_ends[i,step]]
                                                                                                      # where i in `range(len(start_probs)` (iterate # of batch)
                                          best_starts[i,j], best_ends[i,step]) for j in range(step + 1)])
          sorted_spans.extend([(start_probs[i, best_starts[i,step]] + end_probs[i, best_ends[i,j]], 
                                      best_starts[i,step], best_ends[i,j]) for j in range(step + 1)])
          # print("second:", sorted_spans)
          sorted_spans.sort()
          # print("sort:", sorted_spans)

          if len(sorted_spans) > 0:
              curr = sorted_spans.pop()
              # print("curr", curr)
              if curr[1] <= curr[2] <= curr[1] + distance:
                  found = (curr[1], curr[2])

          step += 1

        output_spans.append(found)

    return output_spans

In [None]:
test_starts = np.array([[0.1,0.5,0.2,0.1,0.1], [0.3,0.2,0.2,0.1,0.1]])
test_ends = np.array([[0.4,0.1,0.3,0.1,0.1], [0.1,0.1,0.1,0.1,0.6]])
assert select_best_answer_span_v2(test_starts,test_ends,2) == [(1,2),(2,4)]

## Predicted answers

In [None]:
answers = []
with torch.no_grad():
    for test_text_batch, test_span_batch, masks in test_dataloader:
        test_text_batch, masks = test_text_batch.to(device), masks.to(device)
        output = model(test_text_batch,attention_mask=masks)
        start_scores = output.start_logits.to('cpu').detach()
        end_scores = output.end_logits.to('cpu').detach()
        start_probs = F.log_softmax(start_scores,dim=1).numpy()
        end_probs = F.log_softmax(end_scores,dim=1).numpy()
        spans = select_best_answer_span_v2(start_probs, end_probs, distance=2)
        for i in range(len(spans)):
            answers.append(tokenizer.decode(test_text_batch[i,spans[i][0]: spans[i][1] + 1]))

with open("test_answers.txt","w", encoding="utf=8") as fout:
    fout.write('Id,Predicted\n')
    for idx, pred in enumerate(answers):
        fout.write(str(idx) + ',' + str(pred) + '\n')