## Fine-tune BERT on a QA task (SQuAD 2.0)

## Install and import necessary libraries 

In [1]:
! pip install datasets transformers

Collecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 5.4 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 44.4 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 45.9 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 35.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.5 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.2.0-py3-none-any.whl (134 kB)
[K     |████████████████████████████████| 134 kB 50.8 MB/s 
Collect

In [2]:
import pandas as pd
import numpy as np
import re
import os
import random
import time

import torch

from datasets import load_dataset
from transformers import AutoTokenizer, BertForQuestionAnswering, get_linear_schedule_with_warmup

In [3]:
def set_seed(seed = 1234):
    """
    Seed every random number generator used in the notebook for reproducibility of results.

    Seeds NumPy, Python's `random` and PyTorch (CPU and all CUDA devices), and
    configures cuDNN for deterministic behaviour.

    input:
    seed: (int) value used to seed every generator

    return:
    None
    """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (silently ignored without CUDA)
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set:
    # force deterministic kernels and disable the auto-tuner, which may
    # otherwise pick different algorithms from run to run
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed. NOTE: this only affects Python
    # processes started afterwards; the hash seed of the running interpreter
    # is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()

## Check for GPU

In [4]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device available for running: ", device, sep="\n")

Device available for running: 
cuda


## 1. DATA

## Load and Preprocess Data

In [5]:
# Fetch the SQuAD v2 dataset (downloaded on first use, then reused from the local cache)
datasets = load_dataset("squad_v2")

Downloading:   0%|          | 0.00/1.87k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading and preparing dataset squad_v2/squad_v2 (download: 44.34 MiB, generated: 122.41 MiB, post-processed: Unknown size, total: 166.75 MiB) to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/9.55M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/801k [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset squad_v2 downloaded and prepared to /root/.cache/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [7]:
datasets["train"][0]

{'answers': {'answer_start': [269], 'text': ['in the late 1990s']},
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'id': '56be85543aeaaa14008c9063',
 'question': 'When did Beyonce start becoming popular?',
 'title': 'Beyoncé'}

In [8]:
# Train dataset
train = datasets["train"]

In [9]:
# Validation dataset
validation = datasets["validation"]

## BERT Tokenization

In [10]:
# Instantiate a tokenizer to tokenize the inputs (including converting the tokens to their corresponding 
# IDs in the pretrained vocabulary) and put it in a format the model expects, 
# as well as generate the other inputs that model requires

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [12]:
max_length = 384 # The maximum length of a feature (question and context), passed to the tokenizer
doc_stride = 128 # The authorized overlap between two parts of the context when splitting is needed.

In [13]:
def preprocess(examples):
    """
    Tokenize a batch of question/context pairs and label each resulting feature with
    the token positions of its answer.

    Relies on the notebook-level `tokenizer`, `max_length` and `doc_stride`.

    input:
    examples: batch with "question", "context" and "answers" columns; each entry of
              "answers" provides "answer_start" and "text" lists. The "question"
              column is overwritten in place with left-stripped questions.

    return:
    tokenized_examples: tokenizer output (without "overflow_to_sample_mapping" and
                        "offset_mapping") plus "start_positions" and "end_positions",
                        one entry per feature. Features with no answer, or whose span
                        does not contain the whole answer, are labeled with the index
                        of the CLS token.
    """
    # Remove whitespace on the left of the questions
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that slightly overlaps the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        # truncate only the context, not the question
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        # our tokenizer can automatically return us a list of features capped by 
        # a certain maximum length, with the overlap
        return_overflowing_tokens=True,
        # we will also need to map parts of the original context to some tokens
        return_offsets_mapping=True,
        padding="max_length")

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Label examples
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # Label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            # Only the first listed answer is used for labeling.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text:
            # the first token whose sequence id is 1.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text:
            # the last token whose sequence id is 1.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # The bounds check keeps the forward scan from indexing past the last offset.
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                # The scan stops one token past the answer start, hence the -1.
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                # The scan stops one token before the answer end, hence the +1.
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [14]:
# Apply the function to all examples in our training dataset.
# Since preprocessing changes the number of samples, we need to remove the old columns when applying it.
# batched=True: encode the texts by batches together

tokenized_train = train.map(preprocess, batched=True, remove_columns=train.column_names)

  0%|          | 0/131 [00:00<?, ?ba/s]

In [15]:
len(tokenized_train)

131754

In [16]:
# Preprocess the validation split with the same function (the old columns are dropped as well)
tokenized_val = validation.map(preprocess, batched=True, remove_columns=validation.column_names)

  0%|          | 0/12 [00:00<?, ?ba/s]

In [17]:
len(tokenized_val)

12134

In [18]:
# Set the format of datasets to return PyTorch tensors instead of lists

tokenized_train.set_format("torch")

In [25]:
tokenized_train[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0

In [19]:
train_dataset = tokenized_train.shuffle(seed=42)

In [20]:
tokenized_val.set_format("torch")

In [21]:
eval_dataset = tokenized_val

## Dataloaders

In [22]:
dataloader_val = torch.utils.data.DataLoader(eval_dataset, batch_size=16)

In [23]:
# No shuffle argument is passed here: the data was shuffled once with tokenized_train.shuffle(seed=42),
# so batches are visited in the same order in every epoch
dataloader_train = torch.utils.data.DataLoader(train_dataset, batch_size=16)

## 2. MODEL: bert-base-uncased

In [None]:
# Instantiate the pretrained BERT model with a span classification head (BertForQuestionAnswering) on top for an extractive
# question-answering task (a linear layer on top of the hidden-states output to compute span start logits and span end logits).

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [None]:
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

## 3. TRAINING (Fine-tuning)

In [None]:
# Number of passes over the training data
num_epochs = 2

# AdamW optimizer over all model parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-8)

# The scheduler is stepped once per batch, so the schedule spans
# [number of batches] x [number of epochs] steps in total
total_steps = len(dataloader_train) * num_epochs

# Learning rate scheduler (adjusts the learning rate between steps as training progresses).
# "get_linear_schedule_with_warmup": the learning rate increases linearly from 0 to the
# optimizer's initial lr during warmup (no warmup steps are used here), then decreases linearly
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def evaluate(dataloader):
    """
    Compute the mean loss of the model over a dataset, without updating any weights.

    Intended to be called after each training epoch. Switches the notebook-level
    `model` to evaluation mode and leaves it in that mode.

    input:
    dataloader: iterable over batches of a given dataset

    return:
    loss_val_avg: average validation loss over all batches
    """
    # Evaluation mode
    model.eval()

    # Loss of every batch, in iteration order
    batch_losses = []

    # Gradients are never needed here, so tracking is disabled for the whole pass
    with torch.no_grad():
        for batch in dataloader:
            # Move every tensor of the batch to the target device
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}

            # Forward pass; keep only the scalar loss
            batch_losses.append(model(**on_device).loss.item())

    # Average over the number of batches
    loss_val_avg = sum(batch_losses) / len(dataloader)

    return loss_val_avg

In [None]:
def get_time_elapsed(start_time):
    '''
    Format the time elapsed since a given starting point.

    input:
    start_time: (float) starting point in time, as returned by time.time()

    return:
    elapsed time as a string in HH:MM:SS format
    '''
    remaining = time.time() - start_time

    # Peel off the whole hours, then the whole minutes; what is left is the seconds
    hours = int(remaining / 3600)
    remaining -= hours * 3600
    minutes = int(remaining / 60)
    remaining -= minutes * 60

    # %d drops the fractional part of the remaining seconds
    return '%02d:%02d:%02d' % (hours, minutes, remaining)

In [None]:
# Training Loop
# Runs `num_epochs` passes over `dataloader_train`, logs the running training loss,
# evaluates on `dataloader_val` after every epoch and saves the weights whenever
# the validation loss improves.

print("Start training...\n")

# Lowest validation loss seen so far (any first result will beat infinity)
best_valid_loss = float('inf')

for epoch_i in range(1, num_epochs+1):
    # Print the header of the result table
    print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Time Elapsed':^9}")
    print("-"*70)

    # Measure the total training time per epoch and per set of batches
    t0_epoch, t0_batch = time.time(), time.time()

    # Reset tracking variables at the beginning of each epoch:
    # total_loss covers the whole epoch, batch_loss/batch_counts cover the current logging window
    total_loss, batch_loss, batch_counts = 0, 0, 0

    # Set model to training mode (evaluate() leaves it in evaluation mode after each epoch)
    model.train()


    # Loop over each batch of training data
    for step, batch in enumerate(dataloader_train):
        batch_counts +=1

        # Batch to GPU (attention_mask, input_ids, token_type_ids, start_positions, end_positions)
        batch = {k: v.to(device) for k, v in batch.items()}

        # Zero out any previously calculated gradients
        model.zero_grad()

        # Perform a forward pass
        outputs = model(**batch)
        loss = outputs.loss

        # Accumulate loss
        batch_loss += loss.item()
        total_loss += loss.item()

        # Perform the backward pass to calculate gradients
        loss.backward()

        # Update parameters and the learning rate (the scheduler is stepped once per batch)
        optimizer.step()
        lr_scheduler.step()

        # Print the loss values and time elapsed every 20 steps and at the last step of the epoch
        if (step % 20 == 0 and step != 0) or (step == len(dataloader_train) - 1):

            # Time in seconds since the previous report (or since the start of the epoch)
            time_elapsed = time.time() - t0_batch

            # Print training results (mean loss over the batches since the previous report)
            print(f"{epoch_i:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {time_elapsed:^9.2f}")

            # Reset batch tracking variables
            batch_loss, batch_counts = 0, 0

            t0_batch = time.time()

    # Calculate the average loss over the entire training data
    avg_train_loss= total_loss/len(dataloader_train)
    print("-"*70)

    # Measure model performance on validation dataset
    val_loss = evaluate(dataloader_val)

    # Save the model with the lowest validation loss
    # (only the state dict is written, to a hard-coded absolute path)
    if val_loss < best_valid_loss:
      best_valid_loss = val_loss
      torch.save(model.state_dict(),'/content/QA_SQUAD_BERT_fine-tuned')  

    # Print performance over the entire training data; here the elapsed time is
    # the HH:MM:SS string for the whole epoch, including the validation pass
    time_elapsed = get_time_elapsed(t0_epoch)

    print(f"{epoch_i:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {time_elapsed:}")
    print("-"*70)
print("\n")
print("Training completed!")   

Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | Time Elapsed
----------------------------------------------------------------------
   1    |   20    |   4.804653   |     -      |   46.80  
   1    |   40    |   3.181537   |     -      |   43.97  
   1    |   60    |   2.795400   |     -      |   43.88  
   1    |   80    |   2.631019   |     -      |   43.80  
   1    |   100   |   2.340513   |     -      |   43.73  
   1    |   120   |   2.315262   |     -      |   43.72  
   1    |   140   |   2.122746   |     -      |   43.75  
   1    |   160   |   2.310538   |     -      |   43.71  
   1    |   180   |   2.017429   |     -      |   43.77  
   1    |   200   |   2.015838   |     -      |   43.76  
   1    |   220   |   2.038251   |     -      |   43.76  
   1    |   240   |   2.024140   |     -      |   43.75  
   1    |   260   |   1.829211   |     -      |   43.76  
   1    |   280   |   1.844646   |     -      |   43.73  
   1    |   300   |   1.856720   |   