# Finetune BERT for Question-Answering:

## 1. Import Libraries:

In [1]:
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import collections
from functools import partial

import evaluate
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import pipeline
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

## 2. Load the Data:

In [3]:
DATASET_ID = "rajpurkar/squad"
MODEL_ID = "google-bert/bert-base-uncased"

In [4]:
# load the SQuAD dataset
data = load_dataset(DATASET_ID)
data

README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

The dataset has two splits: a train split with 87,599 rows, and a validation split with 10,570 rows.

## 3. Data Processing:

In this section, we will download the dataset, cache it locally, and preprocess it into the format described in the introduction. Our goal is to produce examples that contain:
- The tokenized question and context.
- The start position of the answer within the context.
- The end position of the answer within the context.

First, let's download and examine the dataset:

### 3.1 load the BERT tokenizer:

In [5]:
# set `clean_up_tokenization_spaces` to False to keep the tokenization spaces
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, clean_up_tokenization_spaces=False)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### 3.2 Define the Preprocessing function:
- tokenize the questions and the context.
- identify the start and end positions of the answer within the context.

In [6]:
def preprocess_train_examples(examples, tokenizer, max_length, stride):
    # Tokenize the questions and context sequences
    # Remove spaces at the beginning and at the end of the string
    questions = [q.strip() for q in examples["question"]]
    
    inputs = tokenizer(
      questions,
      examples["context"],
      # if the combined length exceeds max_length,
      # only the context (examples["context"]) is truncated
      truncation="only_second",
      padding="max_length",
      # When a context is too long and gets truncated,
      # the tokenizer splits it into overlapping chunks of size max_length with a stride overlap.
      stride=stride,
      max_length=max_length,
      # Returns a mapping of each token to its original character positions in the raw text.
      # Useful for locating the start and end positions of answers in the context.
      return_offsets_mapping=True,
      return_overflowing_tokens=True,
    )

    answers = examples["answers"]
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    # find the start and end positions of the answer within the context
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [7]:
def preprocess_valid_examples(examples, tokenizer, max_length, stride):
    # Tokenize the questions and context sequences
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
      questions,
      examples["context"],
      truncation="only_second",
      padding="max_length",
      stride=stride,
      max_length=max_length,
      return_offsets_mapping=True,
      return_overflowing_tokens=True,
    )
    
    example_ids = []
    answers = examples["answers"]
    offset_mapping = inputs["offset_mapping"]
    sample_map = inputs.pop("overflow_to_sample_mapping")

    start_positions = []
    end_positions = []

    # find the start and end positions of the answer within the context
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["example_id"] = example_ids
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

the preprocess_valid_examples is the same as preprocess_train_examples function, but the only difference is the preprocess_valid_examples return example_id

### 3.3 Preprocess the train and valid datasets:

In [8]:
preprocess_train_data = partial(
    preprocess_train_examples, tokenizer=tokenizer, max_length=384, stride=128)
processed_train_data = data["train"].map(preprocess_train_data, batched=True, remove_columns=data["train"].column_names)
processed_train_data

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 88524
})

In [9]:
preprocess_valid_data = partial(
    preprocess_valid_examples, tokenizer=tokenizer, max_length=384, stride=128)
processed_valid_data = data["validation"].map(preprocess_valid_data, batched=True, remove_columns=data["validation"].column_names)
processed_valid_data

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions'],
    num_rows: 10784
})

## 4. Model Fine-Tuning:

### 4.1 Load the pre-trained model:

In [10]:
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_ID)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 4.2 Set the training arguments:

In [11]:
training_args = TrainingArguments(
    output_dir='./checkpoints',
    logging_dir='./logs',
    eval_strategy="steps",
    logging_steps=500,
    logging_strategy="steps",
    save_steps=500,
    save_strategy="steps",
    save_total_limit=2,
    learning_rate=3e-5,
    num_train_epochs=2,
    weight_decay=0.01,
    bf16=True,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    report_to='none'
)

In [12]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_train_data,
    eval_dataset=processed_valid_data,
    tokenizer=tokenizer,
)

  trainer = Trainer(


### 4.3 Train the model:

In [13]:
trainer.train()



Step,Training Loss,Validation Loss
500,2.0747,1.315381
1000,1.3333,1.163866
1500,1.1919,1.077957
2000,1.142,1.067446
2500,1.0732,1.024293
3000,0.9308,1.0514
3500,0.7939,1.038457
4000,0.8052,1.014067
4500,0.7908,1.024794
5000,0.794,1.011436




TrainOutput(global_step=5534, training_loss=1.062432464210833, metrics={'train_runtime': 9451.5637, 'train_samples_per_second': 18.732, 'train_steps_per_second': 0.586, 'total_flos': 3.4696551139946496e+16, 'train_loss': 1.062432464210833, 'epoch': 2.0})

### 4.4 Save the model and the tokeinizer:

In [14]:
trainer.save_model("/kaggle/working/qa_model_final")

In [15]:
tokenizer.save_pretrained("/kaggle/working/qa_model_final")

('/kaggle/working/qa_model_final/tokenizer_config.json',
 '/kaggle/working/qa_model_final/special_tokens_map.json',
 '/kaggle/working/qa_model_final/vocab.txt',
 '/kaggle/working/qa_model_final/added_tokens.json',
 '/kaggle/working/qa_model_final/tokenizer.json')

## 5. Model Evaluation:

In [16]:
# Compute the Exact Match (EM) and F1 score for the model's predictions
def compute_metrics(start_logits, end_logits, features, examples, n_best=20, max_answer_length=50):
    metric = evaluate.load("squad")

    # keep a dictionary that maps examples to predictions through unique IDs
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []

        # loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # keep a list of the top-k best predictions for the start and end position indexes
            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    # reconstruct the answer considering each prediction for the start and end positions 
                    answer = {
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                    }
                    answers.append(answer)

        # select the answer with the best score based on the logit scores
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})
    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [17]:
predictions, _, _ = trainer.predict(processed_valid_data)
start_logits, end_logits = predictions
compute_metrics(start_logits, end_logits, processed_valid_data, data["validation"])



Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

100%|██████████| 10570/10570 [00:18<00:00, 557.51it/s]


{'exact_match': 80.27436140018922, 'f1': 87.91057046462916}

Let's also provide an answer for the same random samples:

In [18]:
random_indexes = np.random.randint(0, len(data["validation"]), 3)
subdataset = data["validation"].select(random_indexes)
qa_pipe = pipeline("question-answering", model=model, tokenizer=tokenizer, device='cuda')

for row in subdataset:
    context = row["context"]
    question = row["question"]
    answer = qa_pipe(question=question, context=context)

    print(f"Context: \n\n {context} \n")
    print(f"Question: \n\n {question} \n")
    print(f"Answer: \n\n {answer['answer']} \n")
    print("--- \n")

Device set to use cuda


Context: 

 During Reconstruction and the Gilded Age, Jacksonville and nearby St. Augustine became popular winter resorts for the rich and famous. Visitors arrived by steamboat and later by railroad. President Grover Cleveland attended the Sub-Tropical Exposition in the city on February 22, 1888 during his trip to Florida. This highlighted the visibility of the state as a worthy place for tourism. The city's tourism, however, was dealt major blows in the late 19th century by yellow fever outbreaks. In addition, extension of the Florida East Coast Railway further south drew visitors to other areas. From 1893 to 1938 Jacksonville was the site of the Florida Old Confederate Soldiers and Sailors Home with a nearby cemetery. 

Question: 

 Which US President visited Jacksonville in 1888? 

Answer: 

 Grover Cleveland 

--- 

Context: 

 Several commemorative events take place every year. Gatherings of thousands of people on the banks of the Vistula on Midsummer’s Night for a festival called