In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset registered under the name "squad"; the returned object is
# indexed by split name in the next cell.
dataset = load_dataset("squad")

In [None]:
# Pull the training and validation splits out of the loaded dataset.
train_data, val_data = dataset["train"], dataset["validation"]

In [None]:
# Show the first training example so its fields can be inspected before preprocessing.
train_data[0]

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier shared by the tokenizer here and the model loaded in a
# later cell, so both come from the same pre-trained checkpoint.
model_name = "bert-base-uncased"
# Instantiate the tokenizer that belongs to `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(data):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Intended for ``Dataset.map(..., batched=True)``. Uses the module-level
    ``tokenizer``, which must be a "fast" tokenizer because character offsets
    and ``sequence_ids`` are required to locate the answer tokens.

    Args:
        data: A batch (mapping of column name -> list) with "question" and
            "context" columns. When an "answers" column is present, each entry
            is expected to hold parallel "text" and "answer_start" lists.

    Returns:
        The tokenizer encoding for the batch. When "answers" is present it
        also contains "start_positions" and "end_positions": the token indices
        of the first listed answer, or 0 for both when the example has no
        answer or the answer was lost to truncation. Without these labels a
        question-answering model cannot compute a training loss.
    """
    encoded = tokenizer(
        data["question"],
        data["context"],
        # Truncate only the context so an over-long pair never loses question tokens.
        truncation="only_second",
        padding="max_length",
        max_length=512,
        # Character offsets let us map the answer's character span onto tokens.
        # No `return_tensors`: Dataset.map stores plain lists, and tensors are
        # built later when batches are collated.
        return_offsets_mapping=True,
    )
    # Offsets are only needed to build labels; they are not a model input.
    offset_mapping = encoded.pop("offset_mapping")

    # Unlabelled batches (e.g. inference-only data) are returned as-is.
    if "answers" not in data:
        return encoded

    start_positions = []
    end_positions = []
    for i, (offsets, answer) in enumerate(zip(offset_mapping, data["answers"])):
        # Default label (0, 0) points at the first token and marks "no usable answer".
        start_token, end_token = 0, 0
        sequence_ids = encoded.sequence_ids(i)

        # sequence id 1 marks context tokens; skip labelling if none survived.
        if answer["answer_start"] and 1 in sequence_ids:
            # Only the first listed answer is used as the training target.
            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            # Label only when the whole answer lies inside the kept context.
            if (
                offsets[context_start][0] <= start_char
                and offsets[context_end][1] >= end_char
            ):
                # Walk forward to the last token starting at or before the answer start.
                idx = context_start
                while idx <= context_end and offsets[idx][0] <= start_char:
                    idx += 1
                start_token = idx - 1

                # Walk backward to the first token ending at or after the answer end.
                idx = context_end
                while idx >= context_start and offsets[idx][1] >= end_char:
                    idx -= 1
                end_token = idx + 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    encoded["start_positions"] = start_positions
    encoded["end_positions"] = end_positions
    return encoded

# Run `preprocess` over both splits in batches and rebind each name to its
# processed dataset.
train_data, val_data = (
    split.map(preprocess, batched=True) for split in (train_data, val_data)
)

In [None]:
from transformers import AutoModelForQuestionAnswering

# Load the checkpoint named by `model_name` through the question-answering auto class.
# NOTE(review): for a base (non-QA) checkpoint the QA head is presumably newly
# initialized and only becomes useful after fine-tuning — confirm from the load warnings.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


In [None]:
from transformers import TrainingArguments, Trainer

# Hyperparameters and bookkeeping options for the fine-tuning run.
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch.
    # NOTE(review): newer transformers releases name this `eval_strategy` —
    # confirm against the installed version before renaming.
    evaluation_strategy="epoch",
    # Checkpointing: presumably one save every 10,000 steps under the default
    # step-based save strategy, keeping at most two checkpoints on disk.
    save_steps=10_000,
    save_total_limit=2,
)

# Wire the model, training arguments, both dataset splits and the tokenizer
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=val_data,
)


In [None]:
# Run fine-tuning with the configuration held by `trainer`; as the cell's last
# expression, the return value is shown as the cell output.
trainer.train()

In [None]:
# Evaluate through the Trainer and keep the returned metrics for inspection.
# NOTE(review): presumably this runs on the `eval_dataset` given to the Trainer — confirm.
metrics = trainer.evaluate()
print(metrics)

In [None]:
# Save the model first, then the tokenizer, into the same directory so they can
# be reloaded together.
for component in (model, tokenizer):
    component.save_pretrained("./fine_tuned_model")

In [None]:
from transformers import pipeline

# Build a question-answering pipeline around the in-memory model and tokenizer.
qa_pipeline = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

# Smoke test: run one hand-written question/context pair through the pipeline.
sample = {
    "question": "What is the capital of France?",
    "context": "France's capital is Paris, known for its cultural heritage.",
}
result = qa_pipeline(sample)

print(result)
