In [3]:
# Step 1: Install required libraries
!pip install --upgrade datasets transformers evaluate accelerate

# Step 2: Import required libraries
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer
)

# Step 3: Load only 10% of the dataset
dataset = load_dataset("squad", split="train[:1%]")  # Load only first 1% of the training data

# Step 4: Split the subset into train and test (90% train / 10% test)
dataset = dataset.train_test_split(test_size=0.1)  # Of the 1%, 90-10 train-test split

# Step 5: Load tokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 6: Preprocessing function
max_length = 384
stride = 128

def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]
    answers = examples["answers"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized_examples.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offsets[context_start][0] > end_char or offsets[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offsets[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offsets[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

# Step 7: Load model
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Step 8: Define training arguments (no wandb integration)
training_args = TrainingArguments(
    output_dir="distilbert-finetuned-squad",
    eval_strategy="epoch",  # <-- uses updated argument name
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="logs",  # local logging only
    logging_steps=10,
    report_to="none",  # <-- disables wandb and other external reporting
    push_to_hub=False,
)

# Step 9: Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

# Step 10: Start training
trainer.train()

# Step 11: Run inference
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

def predict(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt").to(device)  # <-- dynamic device
    outputs = model(**inputs)

    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end])
    )
    return answer

# Example usage
question = "What is the capital of France?"
context = "France is a country in Western Europe. The capital of France is Paris."

print(predict(question, context))



Map:   0%|          | 0/88 [00:00<?, ? examples/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,4.3686,4.245692
2,3.5848,3.786978
3,3.3979,3.745161


Using device: cuda
paris
