# Fine-Tuning RoBERTa on SQuAD Using SageMaker
This notebook demonstrates how to fine-tune a Hugging Face RoBERTa model on the SQuAD dataset using Amazon SageMaker.

In [None]:
!pip install transformers datasets sagemaker --quiet


In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace
import boto3

# Session handle; used below to look up the default bucket and to upload files.
sess = sagemaker.Session()
# Execution role, later handed to the HuggingFace estimator via `role=`.
role = sagemaker.get_execution_role()
# Bucket that the exported dataset files are uploaded to.
bucket = sess.default_bucket()
print(f"Using bucket: {bucket}")


## Load the SQuAD Dataset and Export It to JSON

In [None]:
from datasets import load_dataset

# Fetch SQuAD through the `datasets` library.
dataset = load_dataset("squad")

# Local file names the two splits are written to before being uploaded.
train_file = "train.json"
validation_file = "validation.json"

# Write each split to its own JSON file.
for _split, _path in (("train", train_file), ("validation", validation_file)):
    dataset[_split].to_json(_path)


## Upload Dataset to S3

In [None]:
# Both files are stored under one shared key prefix in the bucket.
s3_prefix = "qa-squad-data"

# The values returned by upload_data are passed to the training job as its
# "train" and "validation" inputs.
s3_train_path = sess.upload_data(
    train_file,
    key_prefix=f"{s3_prefix}/train",
    bucket=bucket,
)
s3_val_path = sess.upload_data(
    validation_file,
    key_prefix=f"{s3_prefix}/validation",
    bucket=bucket,
)

print(f"Train path: {s3_train_path}\nVal path: {s3_val_path}")


## Define Hugging Face Training Job

In [None]:
# Settings handed to the estimator as `hyperparameters`.
# NOTE(review): `dataset_name` is set here while the exported JSON files are
# also supplied as the "train"/"validation" inputs to fit(); which source
# scripts/train.py actually reads is not visible from this notebook — confirm.
hyperparameters = dict(
    model_name_or_path="roberta-base",
    dataset_name="squad",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=2,
    max_seq_length=384,
    doc_stride=128,
    output_dir="/opt/ml/model",
)

# Estimator configuration: what to run, with which settings, on what
# hardware, and in which framework versions.
huggingface_estimator = HuggingFace(
    # What to run
    role=role,
    entry_point="train.py",
    source_dir="./scripts",
    hyperparameters=hyperparameters,
    # Where to run it
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    # Framework versions
    py_version="py38",
    pytorch_version="1.10",
    transformers_version="4.17",
)

# Start training with the two uploaded files as named inputs.
huggingface_estimator.fit(
    {"train": s3_train_path, "validation": s3_val_path}
)


## Deploy the Fine-Tuned Model

In [None]:
# Deploy the trained estimator; the returned predictor is used for the test
# request further down.
# NOTE(review): no teardown of the deployed endpoint is visible in this
# notebook — confirm cleanup (e.g. predictor.delete_endpoint()) happens
# somewhere once testing is done.
predictor = huggingface_estimator.deploy(
    instance_type="ml.m5.large",
    initial_instance_count=1,
)


## Test Inference

In [None]:
# A short passage and a question about it.
context = "Amazon SageMaker is a fully managed service that provides every developer and data scientist with the ability to build, train, and deploy machine learning models quickly."
question = "What is Amazon SageMaker?"

# The request body nests the question/context pair under a single "inputs" key.
response = predictor.predict(
    dict(inputs=dict(question=question, context=context))
)

print("Answer:", response)
