In [None]:
! mkdir squad
! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
! wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

--2021-03-22 10:10:35--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘squad/train-v2.0.json’


2021-03-22 10:10:36 (245 MB/s) - ‘squad/train-v2.0.json’ saved [42123633/42123633]

--2021-03-22 10:10:36--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘squad/dev-v2.0.json’


2021-03-22 10:10:36 (74.5 MB/s) - ‘squad/dev-v2.0.json’ saved [4370528/4370528]



In [None]:
import json
from pathlib import Path
 
def read_squad(path):
    """Flatten a SQuAD-format JSON file into three parallel lists.

    Args:
        path: Location of the JSON file (``str`` or ``Path``).

    Returns:
        A ``(contexts, questions, answers)`` tuple of equal-length lists with
        one entry per annotated answer. The passage and question text are
        repeated for every answer attached to a question, so a question whose
        ``"answers"`` list is empty contributes no entries.
    """
    with Path(path).open("rb") as handle:
        payload = json.load(handle)

    contexts, questions, answers = [], [], []

    # Walk article -> paragraph -> question, emitting one row per gold answer.
    for article in payload["data"]:
        for paragraph in article["paragraphs"]:
            passage_text = paragraph["context"]
            for qa in paragraph["qas"]:
                question_text = qa["question"]
                gold_answers = qa["answers"]
                contexts.extend([passage_text] * len(gold_answers))
                questions.extend([question_text] * len(gold_answers))
                answers.extend(gold_answers)

    return contexts, questions, answers

In [None]:
# Load both SQuAD v2.0 splits as parallel lists of contexts, questions and answer dicts.
train_contexts, train_questions, train_answers = read_squad("squad/train-v2.0.json")
val_contexts, val_questions, val_answers = read_squad("squad/dev-v2.0.json")

Contexts and questions are strings. The answers are dicts containing the answer text (a subsequence of the passage) together with an integer giving the character offset at which the answer begins. To train the model on this data we need:

(1) the tokenized context/question pairs, and 

(2) Integers indicating at which token positions the answer begins and ends.


First, let's compute the character position at which each answer ends in its passage.

In [None]:
def add_end_idx(answers, contexts):
    """Add the exclusive end character offset to every answer dict, in place.

    SQuAD annotations are sometimes off by one or two characters, i.e. the gold
    text really begins one or two characters *before* ``answer_start``. When
    that is detected, both ``answer_start`` and ``answer_end`` are moved back
    by the same amount so that ``context[answer_start:answer_end]`` equals the
    gold text.

    Args:
        answers: List of dicts with ``"text"`` and ``"answer_start"`` keys.
            Each dict gains an ``"answer_end"`` key and may have its
            ``"answer_start"`` corrected.
        contexts: Passages aligned index-by-index with ``answers``.

    Returns:
        None. ``answers`` is mutated in place.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer["text"]
        start_idx = answer["answer_start"]
        end_idx = start_idx + len(gold_text)

        # Fall back to the annotated span so "answer_end" always exists, even
        # when the gold text cannot be found at any of the candidate offsets.
        answer["answer_end"] = end_idx

        for shift in (0, 1, 2):
            if shift > start_idx:
                # A negative slice start would wrap around to the end of the string.
                break
            if context[start_idx - shift:end_idx - shift] == gold_text:
                # The text was found `shift` characters earlier, so move the
                # span back (not forward) by that amount.
                answer["answer_start"] = start_idx - shift
                answer["answer_end"] = end_idx - shift
                break

# Annotate the answer dicts of both splits in place with their "answer_end" offsets.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

In [None]:
! pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 18.3MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 55.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 56.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=f9ed

In [None]:
from transformers import DistilBertTokenizerFast

# Fast tokenizer matching the DistilBERT checkpoint fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Encode every example as a (context, question) pair -- context first -- with
# truncation and padding enabled.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding = True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




In [None]:
# Show (at most) the first five field names held by the training encodings.
for field_name in list(train_encodings)[:5]:
    print(field_name)

input_ids
attention_mask


In [None]:
def add_token_positions(encodings, answers, truncated_position=None):
    """Convert character-level answer spans into token positions, in place.

    Args:
        encodings: Tokenizer output for the batch. Must provide
            ``char_to_token(batch_index, char_index)`` and ``update(dict)``.
        answers: List of dicts with ``"answer_start"`` and ``"answer_end"``
            character offsets, aligned index-by-index with ``encodings``.
        truncated_position: Token index to record when ``char_to_token``
            returns ``None`` for an offset. Defaults to the module-level
            ``tokenizer.model_max_length``; passing it explicitly removes the
            dependency on that global.

    Returns:
        None. ``encodings`` gains ``"start_positions"`` and ``"end_positions"``.
    """
    def _fallback():
        # Resolved lazily so the global tokenizer is only touched when needed.
        if truncated_position is None:
            return tokenizer.model_max_length
        return truncated_position

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        start = encodings.char_to_token(i, answer["answer_start"])
        # "answer_end" is exclusive, so the last answer character is at end - 1.
        end = encodings.char_to_token(i, answer["answer_end"] - 1)

        # A None position means the offset has no token, e.g. because the
        # answer passage has been truncated away.
        start_positions.append(_fallback() if start is None else start)
        end_positions.append(_fallback() if end is None else end)

    encodings.update({"start_positions": start_positions, "end_positions": end_positions})


In [None]:
# Attach "start_positions"/"end_positions" token labels to both sets of encodings (in place).
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

In [None]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset that serves tokenized SQuAD examples as tensors.

    Args:
        encodings: Mapping from field name to a per-example sequence. It must
            contain ``"input_ids"``, ``"start_positions"`` and
            ``"end_positions"``. Any mapping with ``items()`` and key lookup
            works, including a plain ``dict``.
    """

    def __init__(self, encodings):
        self.encodings = encodings
        # Kept as attributes for backward compatibility with existing callers.
        self.start_positions = encodings["start_positions"]
        self.end_positions = encodings["end_positions"]

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field."""
        # The label fields are ordinary keys of `encodings`, so this single
        # pass covers them; no second tensor conversion is required.
        return {key: torch.tensor(values[idx]) for key, values in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        # Key lookup (not attribute access) so plain dict encodings work too.
        return len(self.encodings["input_ids"])

In [None]:
# Show (at most) the first five field names held by the training encodings.
for field_name in list(train_encodings)[:5]:
    print(field_name)

input_ids
attention_mask
start_positions
end_positions


In [None]:
# Wrap the labelled encodings so they can be handed to the Trainer below.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)

In [None]:
from transformers import DistilBertForQuestionAnswering
# Load the question-answering model from the same checkpoint as the tokenizer.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Move the model onto the device selected above.
model = model.to(device)

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Hyper-parameters and output/logging locations for the fine-tuning run.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

# Fine-tune the question-answering `model` loaded earlier in the notebook.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Step,Training Loss
10,6.0417
20,6.0033
30,5.9909
40,5.9121
50,5.8085
60,5.6546
70,5.4373
80,5.1634
90,4.9652
100,4.7277


TrainOutput(global_step=16281, training_loss=1.0046048394351323, metrics={'train_runtime': 13048.1198, 'train_samples_per_second': 1.248, 'total_flos': 5.310098044580045e+16, 'epoch': 3.0, 'init_mem_cpu_alloc_delta': 3174128, 'init_mem_gpu_alloc_delta': 0, 'init_mem_cpu_peaked_delta': 706932, 'init_mem_gpu_peaked_delta': 0, 'train_mem_cpu_alloc_delta': 1217183, 'train_mem_gpu_alloc_delta': 530958848, 'train_mem_cpu_peaked_delta': 98012047, 'train_mem_gpu_peaked_delta': 6540899328})

In [None]:
# Evaluate on the eval_dataset given to the Trainer (val_dataset) and display the metrics.
trainer.evaluate()

{'epoch': 3.0,
 'eval_loss': 1.3620326519012451,
 'eval_mem_cpu_alloc_delta': 71945,
 'eval_mem_cpu_peaked_delta': 187107,
 'eval_mem_gpu_alloc_delta': 0,
 'eval_mem_gpu_peaked_delta': 2315895808,
 'eval_runtime': 380.0819,
 'eval_samples_per_second': 53.415}

In [None]:
# Hand-written passage used to sanity-check the fine-tuned model.
test_context = """
Machine learning (ML) is the study of computer algorithms that improve automatically through experience. It is seen as a part of artificial intelligence. Machine learning algorithms build a model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to do so. Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.
"""

In [None]:
# Question to ask about `test_context`.
question = "What are machine learning models based on?"

In [None]:
# Encode the single (context, question) pair in the same argument order used for training.
test_encoding = tokenizer(test_context, question, truncation=True, padding=True)

In [None]:
# Tensor copies of the encoded example, placed on the same device as the model.
input_ids = torch.tensor(test_encoding["input_ids"]).to(device)
attention_mask = torch.tensor(test_encoding["attention_mask"]).to(device)

In [None]:
import numpy as np

def to_check_result(test_encoding):
    """Run the global ``model`` on one encoded example and pick the answer span.

    Args:
        test_encoding: Mapping with ``"input_ids"`` and ``"attention_mask"``
            entries for a single, unbatched example.

    Returns:
        ``(start, end)``: the argmax indices of the first and second model
        outputs (presumably the start and end logits), computed with gradients
        disabled.
    """
    ids, mask = (
        torch.tensor(test_encoding[field]).to(device)
        for field in ("input_ids", "attention_mask")
    )

    # Add a leading batch dimension of size one before calling the model.
    with torch.no_grad():
        outputs = model(ids.unsqueeze(0), mask.unsqueeze(0))

    first_output, second_output = outputs[0], outputs[1]
    start_label_indices = np.argmax(first_output.to('cpu').numpy())
    end_label_indices = np.argmax(second_output.to('cpu').numpy())
    return start_label_indices, end_label_indices

In [None]:
# Predict the answer span for the test example. These names were previously
# never assigned at notebook scope (they only existed as locals inside
# to_check_result), so this cell failed after Restart & Run All.
start_label_indices, end_label_indices = to_check_result(test_encoding)
print(start_label_indices)
print(end_label_indices)

36
37


In [None]:
# Map the ids back to word pieces, then let the tokenizer merge the predicted
# span (end index inclusive) into plain text so "##" continuation markers do
# not leak into the answer. Ids are read from `test_encoding`, as the later
# cells do.
tokens = tokenizer.convert_ids_to_tokens(test_encoding["input_ids"])
result = tokenizer.convert_tokens_to_string(tokens[start_label_indices:end_label_indices + 1])

In [None]:
# Show the question next to the predicted answer text.
print(question)
print(f"Answer : {result}")

What are machine learning models based on?
Answer : sample data


In [None]:
# Second hand-written passage, shared by the two football questions below.
context = """
 Modern football originated in Britain in the 19th century. Though “folk football” had been played since medieval times with varying rules, the game began to be standardized when it was taken up as a winter game at public schools. Leo Messi is god of football. Football, also called association football or soccer, game in which two teams of 11 players, using any part of their bodies except their hands and arms, try to maneuver the ball into the opposing team’s goal. Only the goalkeeper is permitted to handle the ball and may do so only within the penalty area surrounding the goal. The team that scores more goals wins.
"""

In [None]:
# Two questions to ask about the football `context`.
question_1 = "Which country started football"
question_2 = "Who is the god of football"

In [None]:
# Encode each question together with the shared context (context first).
test_encoding_1 = tokenizer(context, question_1, truncation=True, padding=True)
test_encoding_2 = tokenizer(context, question_2, truncation=True, padding=True)

In [None]:
# Predicted (start, end) token indices for each question.
start_1, end_1 = to_check_result(test_encoding_1)
start_2, end_2 = to_check_result(test_encoding_2)

In [None]:
# Map ids back to word pieces, then let the tokenizer merge each predicted span
# (end index inclusive) into plain text. Joining the pieces with spaces left
# continuation markers in the answer (e.g. "leo mess ##i").
tokens_1 = tokenizer.convert_ids_to_tokens(test_encoding_1["input_ids"])
tokens_2 = tokenizer.convert_ids_to_tokens(test_encoding_2["input_ids"])
result_1 = tokenizer.convert_tokens_to_string(tokens_1[start_1:end_1 + 1])
result_2 = tokenizer.convert_tokens_to_string(tokens_2[start_2:end_2 + 1])

In [None]:
# Show the first question next to its predicted answer.
print(question_1)
print(f"Answer : {result_1}")

Which country started football
Answer : britain
Who is the god of football
leo mess ##i


In [None]:
# Show the second question next to its predicted answer.
print(question_2)
print(f"Answer : {result_2}")

Who is the god of football
Answer : leo mess ##i
