#Imports

In [None]:
!git clone https://github.com/deepmind/xquad.git
!pip install --quiet transformers

In [None]:
import json
import numpy as np

from transformers import (AutoTokenizer, 
  AutoModelForQuestionAnswering, 
  AdamW,
 get_linear_schedule_with_warmup,
 DistilBertTokenizerFast)
 

import torch 
import torch.nn as nn
from torch.utils.data import DataLoader

from tqdm import tqdm

In [None]:
# Run the model on the GPU when one is available (for better performance);
# otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Running on gpu' if device.type == 'cuda' else 'Running on cpu')

Running on gpu


#Data

Splitting the raw data into training, validation, and test sets

In [None]:
# Location of the Romanian XQuAD file inside the cloned repository.
path = 'xquad/'
file = 'xquad.ro.json'

# Read explicitly as UTF-8 so that loading the Romanian text does not depend
# on the platform's default text encoding.
with open(path + file, encoding="utf-8") as f:
    squad_dict = json.load(f)

# Split the top-level "data" entries: the first 40 for training, the next 4
# for validation and everything from index 44 onwards for testing.
# Each split keeps the original "version" field.
train_data = {"data": squad_dict["data"][:40], "version": squad_dict["version"]}
valid_data = {"data": squad_dict["data"][40:44], "version": squad_dict["version"]}
test_data = {"data": squad_dict["data"][44:], "version": squad_dict["version"]}

# Persist each split as its own JSON file in the working directory.
with open("train_ro_quad.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f)

with open("valid_ro_quad.json", "w", encoding="utf-8") as f:
    json.dump(valid_data, f)

with open("test_ro_quad.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f)

dict_keys(['paragraphs', 'title'])


Reading the XQuAD dataset

In [None]:
def read_squad(path):
    """Flatten a SQuAD-style JSON file into three parallel lists.

    Args:
        path: Path of the JSON file to read.

    Returns:
        A tuple ``(contexts, questions, answers)``. There is one entry per
        answer: the paragraph context and the question are repeated for every
        answer attached to that question. The answer entries are the objects
        loaded from the file, unchanged.
    """
    with open(path, 'rb') as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []

    for article in payload['data']:
        for paragraph in article['paragraphs']:
            passage_text = paragraph['context']
            for qa_entry in paragraph['qas']:
                query = qa_entry['question']
                # Entries carrying 'plausible_answers' are read from that key,
                # all others from 'answers'.
                answer_key = 'plausible_answers' if 'plausible_answers' in qa_entry else 'answers'
                found = qa_entry[answer_key]
                # One (context, question, answer) triple per answer.
                contexts.extend([passage_text] * len(found))
                questions.extend([query] * len(found))
                answers.extend(found)

    return contexts, questions, answers


def add_end_idx(answers, contexts):
    """Add an ``'answer_end'`` character index to every answer, in place.

    The annotated ``'answer_start'`` is sometimes one or two characters too
    far to the right. For each answer the annotated start is tried first, then
    the start shifted left by one and by two characters; the first position
    where the context slice equals the answer text wins, and
    ``'answer_start'`` is corrected accordingly.

    If no candidate matches, the annotated start is kept and ``'answer_end'``
    is still set to ``answer_start + len(text)``, so every answer always has
    an ``'answer_end'`` key.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys.
        contexts: List of context strings, parallel to ``answers``.

    Returns:
        None. The answer dicts are modified in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Try the annotated start, then one and two characters to the left.
        # Negative starts are skipped so that slicing never wraps around to
        # the end of the context.
        aligned_start = next(
            (
                start_idx - shift
                for shift in (0, 1, 2)
                if start_idx - shift >= 0
                and context[start_idx - shift:end_idx - shift] == gold_text
            ),
            None,
        )

        if aligned_start is None:
            # No alignment found: keep the annotation, but still provide the
            # end index so the key is always present.
            answer['answer_end'] = end_idx
        else:
            answer['answer_start'] = aligned_start
            answer['answer_end'] = aligned_start + len(gold_text)


def add_token_positions(encodings, answers, truncated_position=511):
    """Convert character-level answer spans to token positions, in place.

    For answer ``i``, ``encodings.char_to_token(i, ...)`` is asked for the
    token holding the first answer character (``'answer_start'``) and the last
    one (``'answer_end' - 1``). The results are stored on ``encodings`` under
    ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: Tokenizer output providing ``char_to_token`` and ``update``.
        answers: List of dicts with ``'answer_start'`` and ``'answer_end'``.
        truncated_position: Position used when ``char_to_token`` returns
            ``None``. Defaults to 511, the previously hard-coded value.
            NOTE(review): 511 presumably corresponds to ``max_length - 1``
            for the 512-token limit used when tokenizing; confirm if that
            limit changes.

    Returns:
        None. ``encodings`` is updated in place.
    """
    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start_token = encodings.char_to_token(i, answer['answer_start'])
        # 'answer_end' is exclusive, so the last answer character is at end - 1.
        end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

        # A None result means no token was found for that character; the
        # original code attributes this to the answer having been truncated.
        start_positions.append(truncated_position if start_token is None else start_token)
        end_positions.append(truncated_position if end_token is None else end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

In [None]:
class SquadDataset(torch.utils.data.Dataset):
    """Dataset view over tokenizer encodings.

    Item ``idx`` is a dict holding the ``idx``-th entry of every field in
    ``encodings``; the length is the number of ``input_ids`` rows.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        return sample


def load_dataset(file, tokenizer, batch_size, shuffle=True):
    """Build a DataLoader from a SQuAD-style JSON file.

    The file is read with ``read_squad``, answer end indices are added with
    ``add_end_idx``, and the (context, question) pairs are tokenized with
    truncation and padding up to 512 tokens. Token-level start/end positions
    are then attached with ``add_token_positions``.

    Args:
        file: Path of the SQuAD-style JSON file.
        tokenizer: Tokenizer called on the context/question lists.
        batch_size: Batch size of the returned loader.
        shuffle: Whether the loader shuffles the examples. Defaults to True,
            which was the previous fixed behavior.

    Returns:
        A ``DataLoader`` over a ``SquadDataset``.
    """
    contexts, questions, answers = read_squad(file)
    add_end_idx(answers, contexts)
    encodings = tokenizer(contexts, questions, truncation=True, padding=True, max_length=512, return_tensors="pt")
    add_token_positions(encodings, answers)

    dataset = SquadDataset(encodings)

    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


def load_datasets(train_file, valid_file, test_file, tokenizer, batch_size):
    """Build the training, validation and test loaders.

    Args:
        train_file: Path of the training JSON file.
        valid_file: Path of the validation JSON file.
        test_file: Path of the test JSON file.
        tokenizer: Tokenizer passed on to ``load_dataset``.
        batch_size: Batch size of the training loader only; the validation
            and test loaders always use a batch size of 1.

    Returns:
        A tuple ``(train_loader, valid_loader, test_loader)``.
    """
    train_loader = load_dataset(train_file, tokenizer, batch_size, shuffle=True)
    # Evaluation loaders are not shuffled, so their iteration order is the
    # same on every run.
    valid_loader = load_dataset(valid_file, tokenizer, 1, shuffle=False)
    test_loader = load_dataset(test_file, tokenizer, 1, shuffle=False)

    return train_loader, valid_loader, test_loader

#Global

In [None]:
# The tokenizer and the question-answering model are loaded from the same
# pretrained checkpoint.
MODEL_CHECKPOINT = 'dumitrescustefan/bert-base-romanian-cased-v1'

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)


Some weights of the model checkpoint at dumitrescustefan/bert-base-romanian-cased-v1 were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not i

#Utils for testing/evaluation

In [None]:
import re
import string
import torch


def normalize_text(s):
    """Normalize an answer string before comparison.

    Steps, in order: lowercase the text, delete every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, then collapse runs of whitespace into single spaces.

    Args:
        s: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = s.lower()
    # Drop punctuation characters in one pass.
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punct, flags=re.UNICODE)
    # split() with no argument also strips leading and trailing whitespace.
    return " ".join(without_articles.split())


def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after normalization, else 0."""
    is_match = normalize_text(prediction) == normalize_text(truth)
    return 1 if is_match else 0


def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a reference answer.

    Both strings are normalized with ``normalize_text`` and split on
    whitespace. The overlap is counted as a multiset, so repeated tokens count
    as often as they appear in both strings. An answer containing a repeated
    word therefore scores 1.0 against itself.

    Args:
        prediction: Predicted answer text.
        truth: Reference answer text.

    Returns:
        1 or 0 when either side has no tokens (1 only if both are empty),
        0 when there is no common token, otherwise the F1 score as a float.
    """
    # Local import keeps this function self-contained.
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # If either the prediction or the truth is no-answer, F1 is 1 when they
    # agree and 0 otherwise.
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)

    # Multiset intersection: each token counts min(pred count, truth count).
    overlap = Counter(pred_tokens) & Counter(truth_tokens)
    num_same = sum(overlap.values())

    # No common tokens means F1 = 0 (this also avoids dividing by zero below).
    if num_same == 0:
        return 0

    prec = num_same / len(pred_tokens)
    rec = num_same / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


def get_gold_answers(example):
    """Collect the non-empty answer texts of a squad2.0 example.

    Reads ``example.answers`` and keeps each ``answer["text"]`` that is
    truthy. If nothing remains, the example is treated as a negative one whose
    only correct answer is the empty string.

    Returns:
        A list of answer strings, or ``[""]`` when there is none.
    """
    texts = []
    for candidate in example.answers:
        if candidate["text"]:
            texts.append(candidate["text"])

    # Negative example: the empty string is the only gold answer.
    return texts or [""]


def compute_scores(inputs, start_logits, end_logits, gold_start, gold_end, tokenizer):
    """Score one predicted answer span against the gold span.

    The predicted span runs from the argmax of ``start_logits`` to the argmax
    of ``end_logits`` (inclusive); the gold span runs from ``gold_start`` to
    ``gold_end`` (inclusive). Both spans are cut out of ``inputs[0]`` and
    turned back into text with ``tokenizer`` before being compared.

    Only the first row of ``inputs`` is used and the argmax is taken over the
    whole logits tensor, so this presumably expects a batch of size 1
    (TODO confirm against callers).

    Returns:
        A tuple ``(em_score, f1_score)``.
    """
    def decode(token_ids):
        # ids -> tokens -> plain string
        return tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(token_ids))

    first_sequence = inputs[0]

    pred_start = torch.argmax(start_logits)
    # +1 because the slice end is exclusive.
    pred_stop = torch.argmax(end_logits) + 1

    prediction = decode(first_sequence[pred_start:pred_stop])
    gold_answer = decode(first_sequence[gold_start:gold_end + 1])

    em_score = compute_exact_match(prediction, gold_answer)
    f1_score = compute_f1(prediction, gold_answer)
    return em_score, f1_score

#Training

In [None]:
# Training hyperparameters.
epochs = 5
batch_size = 3
learning_rate = 5e-5


train_loader, valid_loader, _ = load_datasets("train_ro_quad.json",
                                              "valid_ro_quad.json",
                                              "test_ro_quad.json",
                                              tokenizer,
                                              batch_size)

# move model over to detected device
model.to(device)
# activate training mode of model
model.train()
# AdamW optimizer; no weight_decay argument is passed, so the optimizer's
# own default applies
optim = AdamW(model.parameters(), lr=learning_rate)

# Best validation F1 seen so far; -1 guarantees the first epoch is saved.
best_f1 = -1
# Linear schedule: warm up during the first epoch's worth of steps, then
# decay over the remaining training steps.
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps=len(train_loader),
                                            num_training_steps=len(train_loader) * epochs)

for epoch in range(epochs):
    # ---- training phase ----
    # set model to train mode (the validation phase below switches to eval)
    model.train()
    total_loss = 0
    # setup loop (we use tqdm for the progress bar)
    loop = tqdm(train_loader, leave=True)
    for i, batch in enumerate(loop):
        # reset gradients accumulated in the previous step
        optim.zero_grad()
        # pull all the tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        # forward pass; passing the gold positions makes the model return a loss
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)

        # extract loss
        loss = outputs[0]
        total_loss += loss.item()
        # backpropagate to compute gradients for every trainable parameter
        loss.backward()
        # update parameters, then advance the learning-rate schedule
        optim.step()
        scheduler.step()
        # show the running mean loss in the progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=total_loss / (i + 1))

    # ---- validation phase ----
    total_em = 0
    total_f1 = 0
    model.eval()
    loop = tqdm(valid_loader, leave=True)
    # No gradients are needed for evaluation; disabling autograd avoids
    # building the graph and saves memory and time.
    with torch.no_grad():
        for i, batch in enumerate(loop):
            # pull the tensors required for inference
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # forward pass only (no gold positions, so no loss is computed)
            outputs = model(input_ids, attention_mask=attention_mask)
            # score the predicted span against the gold span
            em_score, f1_score = compute_scores(input_ids,
                                                outputs.start_logits, outputs.end_logits,
                                                batch["start_positions"], batch["end_positions"],
                                                tokenizer)

            total_em += em_score
            total_f1 += f1_score

            # show the running mean EM / F1 in the progress bar
            loop.set_description(f'Epoch {epoch}, EM {total_em / (i + 1)}, F1 {total_f1 / (i + 1)}')

    # Keep the checkpoint with the best mean validation F1.
    epoch_f1 = total_f1 / len(loop)
    if epoch_f1 > best_f1:
        print(f"Best checkpoint found: {best_f1} -> {epoch_f1}")
        torch.save(model, "model.pt")
        best_f1 = epoch_f1



#Testing

In [None]:
# Load the saved checkpoint (the full model object written by torch.save)
# onto the detected device.
model = torch.load("model.pt", map_location=device)
model.to(device)

# Only the test loader is needed here; batch size 1 for per-example scoring.
_, _, test_loader = load_datasets("train_ro_quad.json",
                                  "valid_ro_quad.json",
                                  "test_ro_quad.json",
                                  tokenizer,
                                  1)

total_em = 0
total_f1 = 0
model.eval()
loop = tqdm(test_loader, leave=True)
# No gradients are needed for evaluation; disabling autograd avoids building
# the graph and saves memory and time.
with torch.no_grad():
    for i, batch in enumerate(loop):
        # pull the tensors required for inference
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # forward pass only (no gold positions, so no loss is computed)
        outputs = model(input_ids, attention_mask=attention_mask)
        # score the predicted span against the gold span
        em_score, f1_score = compute_scores(input_ids,
                                            outputs.start_logits, outputs.end_logits,
                                            batch["start_positions"], batch["end_positions"],
                                            tokenizer)

        total_em += em_score
        total_f1 += f1_score

        # show the running mean EM / F1 in the progress bar
        loop.set_description(f'EM {total_em / (i + 1)}, F1 {total_f1 / (i + 1)}')

# Final mean scores; `i` is the index of the last batch processed above.
print(f'EM {total_em / (i + 1)}, F1 {total_f1 / (i + 1)}')