
# 6.3.3 - Fine-Tuning DistilBERT for Extractive QA

In this notebook, we fine-tune DistilBERT for extractive Question Answering using the SQuAD dataset.

We'll cover:
- Model and tokenizer setup
- Dataset loading and preprocessing
- Creating start/end token labels
- Training using Hugging Face `Trainer`
- Evaluation and inference


In [None]:

!pip install transformers datasets accelerate


In [None]:

from transformers import DistilBertTokenizerFast, DistilBertForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset
import torch


In [None]:

# The tokenizer and the QA model are both loaded from the same checkpoint,
# so the name is kept in one place.
MODEL_CHECKPOINT = "distilbert-base-uncased"

dataset = load_dataset("squad")
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)
model = DistilBertForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)


In [None]:

def preprocess_function(example):
    """Tokenize one SQuAD example and attach start/end token labels.

    The question and context are encoded as a pair, padded to 384 tokens.
    Only the context is truncated when the pair is too long, so the question
    is always kept intact.

    Args:
        example: A single example with ``"question"``, ``"context"`` and
            ``"answers"`` (a dict holding the parallel lists ``"text"`` and
            ``"answer_start"``). Only the first answer is used.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``. Both are token indices
        into the encoded pair; both are 0 when the example has no answer or
        the answer is not fully contained in the (possibly truncated)
        context.
    """
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True,
    )
    offset_mapping = inputs.pop("offset_mapping")
    # 0 = question token, 1 = context token, None = special/padding token.
    sequence_ids = inputs.sequence_ids()

    start_position = end_position = 0
    answers = example["answers"]
    if answers["text"]:
        answer_start = answers["answer_start"][0]
        answer_end = answer_start + len(answers["text"][0])

        start_token = end_token = None
        for i, (start, end) in enumerate(offset_mapping):
            # Offsets of question tokens are relative to the question string,
            # so they must not be compared with positions in the context.
            if sequence_ids[i] != 1:
                continue
            if start <= answer_start < end:
                start_token = i
            if start < answer_end <= end:
                end_token = i

        # Label the span only when both ends survived truncation; otherwise
        # keep the (0, 0) fallback instead of a half-matched span.
        if start_token is not None and end_token is not None and start_token <= end_token:
            start_position, end_position = start_token, end_token

    inputs["start_positions"] = start_position
    inputs["end_positions"] = end_position
    return inputs


In [None]:

# Tokenise and label the examples one at a time (preprocess_function takes a
# single example), then pull out the two splits used for training and evaluation.
tokenized_dataset = dataset.map(preprocess_function, batched=False)
train_dataset, val_dataset = (tokenized_dataset[split] for split in ("train", "validation"))


In [None]:

# Evaluate and checkpoint once per epoch.
# `eval_strategy` is the current name of the argument that used to be called
# `evaluation_strategy`; recent transformers releases reject the old name.
training_args = TrainingArguments(
    output_dir="./distilbert-qa",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    logging_steps=10,
    logging_dir="./logs",
)


In [None]:

# Bundle the model, the training configuration and both tokenised splits.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)


In [None]:

# Run fine-tuning; as the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()


In [None]:

# Model and tokenizer are written to the same directory.
SAVE_DIR = "./distilbert-finetuned-qa"

model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)


In [None]:

context = "Albert Einstein developed the theory of relativity in the early 20th century."
question = "Who developed the theory of relativity?"

# Inference mode: disable dropout so the prediction is deterministic.
model.eval()

# A single example needs no padding; if the pair is too long only the context is truncated.
inputs = tokenizer(question, context, return_tensors="pt", max_length=384, truncation="only_second")
# sequence_ids: 0 = question token, 1 = context token, None = special token.
is_context = torch.tensor([sid == 1 for sid in inputs.sequence_ids(0)], device=model.device)
# Training may have moved the model to an accelerator; the inputs must live on the same device.
inputs = inputs.to(model.device)

with torch.no_grad():
    outputs = model(**inputs)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

# Score every (start, end) pair and keep only spans that lie inside the
# context with start <= end, so the extracted answer can never be empty,
# reversed, or taken from the question.
span_scores = start_logits[0][:, None] + end_logits[0][None, :]
positions = torch.arange(span_scores.size(0), device=span_scores.device)
valid_spans = is_context[:, None] & is_context[None, :] & (positions[:, None] <= positions[None, :])
span_scores = span_scores.masked_fill(~valid_spans, float("-inf"))

start_index, last_index = divmod(int(torch.argmax(span_scores)), span_scores.size(1))
end_index = last_index + 1  # exclusive slice bound
tokens = inputs["input_ids"][0][start_index:end_index]
answer = tokenizer.decode(tokens)

print("Extracted Answer:", answer)
