# Fine-tune QA example
This notebook demonstrates how to fine-tune a question answering model using the `datacreek` toolkit. The workflow covers dataset ingestion with the semantic safety filter, training with Hugging Face `Trainer`, and running inference on the fine-tuned model.


In [None]:
# Install dependencies
# In a Colab environment this cell installs required packages.
import sys

if 'google.colab' in sys.modules:
    !pip install -q datasets transformers accelerate datacreek


In [None]:
"""Load the SQuAD dataset and apply the semantic safety filter.

The safety filter uses a tiny toxicity model and NSFW regex heuristics to drop
unsafe samples before training.

Variables
    dataset: Raw SQuAD dataset split dictionary
    safety_filter: SafetyFilter instance, called once per sample
    safe_dataset: Dataset after filtering
"""
from datasets import load_dataset
from ingest.safety_filter import SafetyFilter

dataset = load_dataset('squad')
# Named `safety_filter` rather than `filter` so the builtin `filter()` stays
# usable in every later cell of the notebook.
safety_filter = SafetyFilter()


def is_safe(example):
    """Return the safety filter's verdict for one SQuAD sample.

    The question and context are joined with a space and checked together.
    NOTE(review): assumes calling the filter yields a truthy value for samples
    that should be kept -- confirm against SafetyFilter.
    """
    text = example['question'] + ' ' + example['context']
    return safety_filter(text)


# `filter` runs on every split; samples for which `is_safe` is falsy are dropped.
safe_dataset = dataset.filter(is_safe)
safe_dataset


In [None]:
"""Tokenize the dataset for the QA task.

This cell leverages the pretrained tokenizer associated with the chosen model.
Besides the token ids, each sample gets `start_positions` / `end_positions`
labels (token indices of the answer span); an extractive QA model needs them
to compute a training loss.

Variables
    model_name: checkpoint used for both tokenizer and model
    MAX_LENGTH: fixed sequence length every sample is padded/truncated to
    tokenizer: Hugging Face tokenizer
    tokenized: tokenized dataset ready for training
"""
from transformers import AutoTokenizer

model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Fixed length so that every feature has the same shape and can be batched
# without a padding-aware collator.
MAX_LENGTH = 384


def _answer_token_span(offsets, sequence_ids, start_char, end_char):
    """Translate a character span of the context into token indices.

    Args:
        offsets: per-token (char_start, char_end) pairs from the tokenizer.
        sequence_ids: per-token sequence id (None for special/padding tokens,
            0 for the question, 1 for the context).
        start_char: index of the first answer character in the context.
        end_char: index one past the last answer character in the context.

    Returns:
        (start_token, end_token), or (0, 0) when the answer is not fully
        contained in the (possibly truncated) context.
    """
    # Context tokens form one contiguous run of sequence id 1.
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    # Truncation may have cut the answer off; point at the first token then.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Walk forward to the last token starting at or before the answer start.
    start_token = context_start
    while start_token <= context_end and offsets[start_token][0] <= start_char:
        start_token += 1

    # Walk backward to the first token ending at or after the answer end.
    end_token = context_end
    while end_token >= context_start and offsets[end_token][1] >= end_char:
        end_token -= 1

    return start_token - 1, end_token + 1


def preprocess(example):
    """Tokenize a batch of samples and attach answer span labels.

    Args:
        example: batch dict with 'question', 'context' and 'answers' columns
            (called with `batched=True`, so each value is a list).

    Returns:
        The tokenizer output extended with 'start_positions' and
        'end_positions', one entry per sample.
    """
    # Stripped so stray whitespace around a question does not use up tokens.
    questions = [q.strip() for q in example['question']]
    inputs = tokenizer(
        questions,
        example['context'],
        max_length=MAX_LENGTH,
        truncation='only_second',  # never truncate the question, only the context
        padding='max_length',
        return_offsets_mapping=True,
    )

    # Offsets are only needed to compute the labels; the model does not take them.
    offset_mapping = inputs.pop('offset_mapping')

    start_positions = []
    end_positions = []
    for i, (offsets, answers) in enumerate(zip(offset_mapping, example['answers'])):
        # The first listed answer is used as the training target.
        start_char = answers['answer_start'][0]
        end_char = start_char + len(answers['text'][0])
        start_token, end_token = _answer_token_span(
            offsets, inputs.sequence_ids(i), start_char, end_char
        )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs['start_positions'] = start_positions
    inputs['end_positions'] = end_positions
    return inputs


tokenized = safe_dataset.map(preprocess, batched=True)
tokenized


In [None]:
"""Fine-tune the model using Hugging Face Trainer.

TrainingArguments set a small number of epochs for demonstration. On Colab with
a GPU this cell completes within minutes.

The model weights and the tokenizer are both written to `output_dir`, because
the inference cell loads everything from that directory.

Variables
    model: pretrained transformer model for QA
    args: TrainingArguments for the run
    trainer: Hugging Face Trainer instance
"""
from transformers import (
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

model = AutoModelForQuestionAnswering.from_pretrained(model_name)
args = TrainingArguments(
    output_dir='qa-model',
    per_device_train_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=50,
)

# NOTE: the QA head only returns a loss when the batches carry
# `start_positions` / `end_positions`; the training split must provide them.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    # Pads each batch to a common length so features of different lengths can
    # be stacked into tensors.
    data_collator=DataCollatorWithPadding(tokenizer),
)
trainer.train()
trainer.save_model()
# The Trainer was not handed the tokenizer, so `save_model` stores the model
# only. Save the tokenizer next to it so 'qa-model' can be loaded on its own.
tokenizer.save_pretrained(args.output_dir)


In [None]:
"""Query the fine-tuned model on one validation example.

The first sample of the filtered validation split supplies the question and
its context passage; a question-answering pipeline loaded from the 'qa-model'
directory predicts the answer span.

Variables
    sample: first example of the filtered validation split
    question: sample question from dataset
    context: corresponding context passage
    qa_pipeline: question-answering pipeline backed by the saved model
    answer: pipeline prediction for the sample (the predicted span together
        with its score and character positions)
"""
from transformers import pipeline

sample = safe_dataset['validation'][0]
question, context = sample['question'], sample['context']

qa_pipeline = pipeline('question-answering', model='qa-model')
answer = qa_pipeline(question=question, context=context)
answer
