In [8]:
from transformers.data.processors.squad import SquadFeatures, SquadV1Processor, squad_convert_examples_to_features


# Build the transformers SQuAD v1 processor and ask it for the dev examples
# stored under the given data directory.
processor = SquadV1Processor()
examples = processor.get_dev_examples("/workspace/data/SQuAD")

# Disabled reference snippet: the bare string literal below is never executed.
# It sketches the feature-conversion call and relies on `tokenizer`, `args` and
# `evaluate`, none of which are defined in this cell.
"""
features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=args.max_seq_length,
    doc_stride=args.doc_stride,
    max_query_length=args.max_query_length,
    is_training=not evaluate,
)
"""
# Show the second parsed example; the cell output is its SquadExample repr.
examples[1]

100%|██████████| 48/48 [00:04<00:00, 10.10it/s]


<transformers.data.processors.squad.SquadExample at 0x7f1abc2dceb8>

In [20]:
# Display the second example again (needs `examples` from the processor cell).
examples[1]

<transformers.data.processors.squad.SquadExample at 0x7f1abc2dceb8>

## Parse SQuAD from json

In [1]:
import json
from pathlib import Path
import torch

# Directory holding the SQuAD 1.1 JSON files (train-v1.1.json / dev-v1.1.json).
SQuAD_DIR = "/workspace/data/SQuAD"

# The SQuAD JSON files follow this nesting:
# 'data', 'version'
#  |--> 'paragraphs', 'title'
#        |--> 'qas', 'context'
#              |--> 'answers', 'question', 'id'
#                    |--> 'answer_start', 'text'
    

def _iter_squad_rows(squad_dict):
    """Yield one (context, question, answer) triple per answer in a SQuAD dict."""
    for article in squad_dict['data']:
        for paragraph in article['paragraphs']:
            passage = paragraph['context']
            for qa in paragraph['qas']:
                query = qa['question']
                for answer in qa['answers']:
                    yield passage, query, answer


def parse_squad(path):
    """
    Flatten a SQuAD 1.1 JSON file into three parallel lists.

    Args:
        path: location of the JSON file (str or Path).

    Returns:
        (contexts, questions, answers): lists of equal length holding one entry
        per answer. The context and question are repeated for every answer of
        the same question; each answer is the raw answer dict from the file.
    """
    with Path(path).open('rb') as handle:
        squad_dict = json.load(handle)

    contexts, questions, answers = [], [], []
    for passage, query, answer in _iter_squad_rows(squad_dict):
        contexts.append(passage)
        questions.append(query)
        answers.append(answer)
    return contexts, questions, answers


# Parse the train and dev splits; each call returns three parallel lists.
train_data = parse_squad(f"{SQuAD_DIR}/train-v1.1.json")
val_data   = parse_squad(f"{SQuAD_DIR}/dev-v1.1.json")  

# Unpack each split into its contexts, questions and answers lists.
train_contexts, train_questions, train_answers = train_data
val_contexts, val_questions, val_answers = val_data

## Get correct answer start and end index from text

In [2]:
# Given the gold answer text, its start index and the context,
# work out the matching end index and store it on the answer dict.

# SQuAD answer offsets are sometimes off by 1-2 characters, so we adjust for that.
# Update: 'answer_start'
# Append: 'answer_end'
def add_end_idx(answers, contexts):
    """Add a character-level, exclusive ``'answer_end'`` to every answer dict.

    The answer dicts are modified in place; nothing is returned.

    SQuAD start offsets are occasionally one or two characters too large, so
    the gold text is looked up at the recorded start and then at the start
    shifted left by one and by two characters. On a match, ``'answer_start'``
    is corrected and ``'answer_end'`` is written.

    If no alignment matches, ``'answer_end'`` is still set to
    ``answer_start + len(text)`` and ``'answer_start'`` is left untouched, so
    later steps that read ``'answer_end'`` do not fail with ``KeyError``.

    Args:
        answers: list of dicts with ``'text'`` and ``'answer_start'`` keys.
        contexts: list of context strings, parallel to ``answers``.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Try the recorded offset first, then one and two characters to the left.
        for shift in (0, 1, 2):
            candidate_start = start_idx - shift
            if candidate_start < 0:
                # A negative start would silently slice from the end of the context.
                continue
            if context[candidate_start:end_idx - shift] == gold_text:
                answer['answer_start'] = candidate_start
                answer['answer_end'] = end_idx - shift
                break
        else:
            # No alignment found: keep the recorded start and derive the end from it.
            answer['answer_end'] = end_idx

# Both calls modify the answer dicts in place: 'answer_start' may be corrected
# and 'answer_end' is added.
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)

## Tokenize context and questions together

In [3]:
# train_answers and val_answers now carry character-level end positions.
# Let's tokenize the context/question pairs.

from transformers import DistilBertTokenizerFast
# Fast tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Each pair is encoded with the context first and the question second;
# truncation and padding are both switched on.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=231508.0), HTML(value='')))




### Convert character start/end --> token start/end

Since we are using __fast__ tokenizers, we can use `char_to_token()`.

In [4]:
def add_token_positions(encodings, answers, max_length=None):
    """Convert character answer spans to token spans and store them on ``encodings``.

    For example ``i`` the start token is looked up from ``'answer_start'`` and
    the end token from ``'answer_end' - 1`` (the last character of the answer,
    since ``'answer_end'`` is exclusive). The results are written to
    ``encodings`` under ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: tokenizer output offering ``char_to_token(batch_index,
            char_index)`` and ``update(dict)``; modified in place.
        answers: list of dicts with ``'answer_start'`` and ``'answer_end'``.
        max_length: position to store when ``char_to_token`` returns ``None``
            (the answer passage has been truncated). Defaults to the
            notebook-level ``tokenizer.model_max_length``, which is only read
            when a truncated answer is actually encountered.
    """
    def truncated_position():
        # Resolved lazily so the global tokenizer is only needed as a fallback.
        return tokenizer.model_max_length if max_length is None else max_length

    start_positions = []
    end_positions = []
    for i, answer in enumerate(answers):
        # Take start/end char index and convert to token start/end
        start_token = encodings.char_to_token(i, answer['answer_start'])
        end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

        # if None, the answer passage has been truncated
        if start_token is None:
            start_token = truncated_position()
        if end_token is None:
            end_token = truncated_position()

        start_positions.append(start_token)
        end_positions.append(end_token)
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Adds 'start_positions' / 'end_positions' to both encodings in place.
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

## Custom Dataset

In [5]:
# PyTorch
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenized SQuAD encodings.

    ``encodings`` is any mapping from field name (e.g. ``'input_ids'``,
    ``'start_positions'``) to a per-example sequence. Both the tokenizer output
    used in this notebook and a plain ``dict`` work, because only the mapping
    interface (``items()`` and key lookup) is used.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one entry per field."""
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        """Number of examples, taken from the ``'input_ids'`` field."""
        # Key lookup instead of attribute access so plain dicts are supported too.
        return len(self.encodings['input_ids'])

# Wrap the tokenized train and dev encodings as PyTorch datasets.
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)


# Tensorflow
# Disabled alternative: the string literal below is never executed. It sketches
# how the same encodings could be fed to tf.data as (inputs, labels) pairs.
"""
import tensorflow as tf

train_dataset = tf.data.Dataset.from_tensor_slices((
    {key: train_encodings[key] for key in ['input_ids', 'attention_mask']},
    {key: train_encodings[key] for key in ['start_positions', 'end_positions']}
))
val_dataset = tf.data.Dataset.from_tensor_slices((
    {key: val_encodings[key] for key in ['input_ids', 'attention_mask']},
    {key: val_encodings[key] for key in ['start_positions', 'end_positions']}
))
"""
# Presumably here so the cell does not echo the string literal above as its output.
print("")




### Now Dataset can be used to train a Transformer model

In [6]:
from transformers import DistilBertForQuestionAnswering
# Load the pretrained 'distilbert-base-uncased' checkpoint into a question-answering model.
# Per the load log below, the qa_outputs weights are newly initialized, so the model needs training.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=442.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=267967963.0), HTML(value='')))




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

## Train with Huggingface Trainer

In [10]:
from transformers import Trainer, TrainingArguments

# Short smoke-test configuration for the Hugging Face Trainer.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,   # batch size for evaluation
    #warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    max_steps=5,                     # caps the run at a few optimizer steps; takes precedence over num_train_epochs
    overwrite_output_dir=True,
)


# `compute_metrics` is deliberately not passed here: at this point in the
# notebook no such function has been defined yet (it only appears in a later
# cell), so referencing it breaks a fresh top-to-bottom run. This cell only
# calls train(), which does not use the metrics hook.
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Iteration'), FloatProgress(value=0.0, max=10950.0), HTML(value='')))





TrainOutput(global_step=6, training_loss=5.647638003031413)

## Train with own Training
[https://huggingface.co/transformers/training.html](https://huggingface.co/transformers/training.html)

In [51]:
from torch.utils.data import DataLoader
from transformers import AdamW

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

steps = 0        # optimizer steps taken so far
max_steps = 5    # stop after this many optimizer steps (keeps the demo short)
nr_epochs = 3    # upper bound on passes over the training data

# TODO:
# Add mini-batches and more fancy things to make a real example

# EX:  https://github.com/huggingface/transformers/blob/b592728eff9996e2cff1c5107438c4989aaa8149/examples/question-answering/run_squad.py#L82
# train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
# train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)


optim = AdamW(model.parameters(), lr=5e-5)
for epoch in range(nr_epochs):
    if steps >= max_steps:
        break
    for batch in train_loader:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # The loss is the first element of the model output when positions are supplied.
        loss = outputs[0]
        loss.backward()
        optim.step()
        steps += 1
        # Check the budget inside the epoch too, otherwise a whole epoch would run first.
        if steps >= max_steps:
            break

model.eval()

## Use your own eval metric
https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing#scrollTo=N8J-TLhBuaOf

In [None]:
import datasets
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 from a Trainer prediction object.

    Two layouts of ``pred`` are handled:

    * Single-output models: ``pred.predictions`` is one logits array and
      ``pred.label_ids`` one label array (the original classification case).
    * Multi-output models such as question answering: ``pred.predictions`` is a
      tuple/list with one logits array per target and ``pred.label_ids`` is
      assumed to be the matching tuple of label arrays (e.g. start and end
      positions) -- TODO confirm against the Trainer version in use. The
      per-target predictions and labels are flattened and concatenated, so the
      scores are position-level metrics, not the official SQuAD EM/F1.

    Precision/recall/F1 use ``average='binary'`` when only the classes 0 and 1
    occur and ``average='macro'`` otherwise, because scikit-learn rejects
    ``'binary'`` for multi-class targets.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'``, ``'recall'``.
    """
    labels = pred.label_ids
    predictions = pred.predictions
    if isinstance(predictions, (tuple, list)):
        # One logits array per target: take the argmax of each and stack them end to end.
        preds = np.concatenate([np.asarray(logits).argmax(-1).ravel() for logits in predictions])
        labels = np.concatenate([np.asarray(target).ravel() for target in labels])
    else:
        preds = predictions.argmax(-1)

    observed = np.union1d(np.unique(labels), np.unique(preds))
    average = 'binary' if np.isin(observed, (0, 1)).all() else 'macro'
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=average)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Training configuration for the run that reports the custom metrics.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=2, #16,
    per_device_eval_batch_size=2, #64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    # Evaluate on the dev split built earlier; the notebook never defines `test_dataset`.
    eval_dataset=val_dataset
)

# Train
trainer.train()

# Eval
trainer.evaluate()

### Define custom loss function for Trainer

In [None]:
from transformers import Trainer
class MyTrainer(Trainer):
    """Trainer variant that scores the model with ``my_custom_loss``.

    ``my_custom_loss(logits, labels)`` is not defined in this notebook and must
    be supplied before this class is used.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the custom loss.

        Args:
            model: the model to call with the remaining ``inputs``.
            inputs: batch dict; its ``"labels"`` entry is removed and passed to
                the loss instead of the model.
            return_outputs: when true, return ``(loss, outputs)`` instead of the
                loss alone.
            **kwargs: ignored; tolerates extra keyword arguments that other
                Trainer versions may pass -- TODO confirm for the version in use.
        """
        labels = inputs.pop("labels")
        # Call the `model` argument (the original referenced an undefined name `models`).
        outputs = model(**inputs)
        logits = outputs[0]
        loss = my_custom_loss(logits, labels)
        return (loss, outputs) if return_outputs else loss

#### Load Tensorboard

In [53]:
# Load the TensorBoard notebook extension, then open it on the `logs` directory
# (the TrainingArguments in this notebook use logging_dir='./logs').
%load_ext tensorboard
%tensorboard --logdir logs