In [97]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer, BertTokenizerFast, AutoTokenizer

import torch
import numpy as np
import pandas as pd 

In [98]:
# model_checkpoint = "bert-base-uncased" # using smaller model
model_checkpoint = "distilbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [99]:
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS backend.")
else:
    device = torch.device("cpu")
    print("Using CPU.")

Using MPS backend.


In [100]:
from datasets import load_dataset

squad_dataset = load_dataset('squad')
squad_dataset.column_names

{'train': ['id', 'title', 'context', 'question', 'answers'],
 'validation': ['id', 'title', 'context', 'question', 'answers']}

In [101]:
# Use smaller dataset for testing
squad_dataset['train'] = squad_dataset['train'].shuffle(seed=42).select(range(1000))
squad_dataset['validation'] = squad_dataset['validation'].shuffle(seed=42).select(range(500))

In [None]:
squad_dataset['train'][0]

In [103]:
dataset_with_length = squad_dataset.map(lambda x: {"length": len(x["context"])})


In [104]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [105]:
tokenized_dataset = dataset_with_length.map(lambda x: tokenizer(x['context']), batched=True)


#### We added a few fields to the dataset. 
- length
- input_ids
- attention_mask
- token_type_ids

In [106]:
print(tokenized_dataset['train'][0].keys())

dict_keys(['id', 'title', 'context', 'question', 'answers', 'length', 'input_ids', 'token_type_ids', 'attention_mask'])


In [107]:
def prepare_features(examples):
    # Prepare features, while handling larger context size
    tokenized_examples = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop('overflow_to_sample_mapping')
    offset_mapping = tokenized_examples.pop('offset_mapping')

    tokenized_examples['start_positions'] = []
    tokenized_examples['end_positions'] = []

    for i, offsets in enumerate(offset_mapping):
        # Get the index of the example that this feature corresponds to
        sample_index = sample_mapping[i]
        answers = examples['answers'][sample_index]
        input_ids = tokenized_examples['input_ids'][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        if len(answers['answer_start']) == 0:
            tokenized_examples['start_positions'].append(cls_index)
            tokenized_examples['end_positions'].append(cls_index)
        else:
            start_char = answers['answer_start'][0]
            end_char = start_char + len(answers['text'][0])

            sequence_ids = tokenized_examples.sequence_ids(i)

            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1

            if offsets[context_start][0] > end_char or offsets[context_end][1] < start_char:
                tokenized_examples['start_positions'].append(cls_index)
                tokenized_examples['end_positions'].append(cls_index)
            else:
                token_start_index = context_start
                token_end_index = context_end

                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples['start_positions'].append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples['end_positions'].append(token_end_index + 1)

    return tokenized_examples

In [108]:
tokenized_datasets = squad_dataset.map(
    prepare_features,
    batched=True,
    remove_columns=squad_dataset["train"].column_names,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

In [110]:
train_dataset = tokenized_datasets['train']
eval_dataset = tokenized_datasets['validation']

In [111]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'token_type_ids', 'start_positions', 'end_positions'])


In [112]:
model = AutoModelForQuestionAnswering.from_pretrained('bert-base-cased')
model.to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [113]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    fp16=False,  
)

In [114]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [115]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,3.2394,2.529618
2,1.9175,2.394147


TrainOutput(global_step=250, training_loss=2.8641237182617187, metrics={'train_runtime': 249.9962, 'train_samples_per_second': 8.0, 'train_steps_per_second': 1.0, 'total_flos': 522593513472000.0, 'train_loss': 2.8641237182617187, 'epoch': 2.0})

In [116]:
trainer.save_model('./models/test_qa_model')
tokenizer.save_pretrained('./models/test_qa_model')

('./models/test_qa_model/tokenizer_config.json',
 './models/test_qa_model/special_tokens_map.json',
 './models/test_qa_model/vocab.txt',
 './models/test_qa_model/added_tokens.json',
 './models/test_qa_model/tokenizer.json')