In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# SageMaker session and the execution role used by the rest of this notebook
session = sagemaker.Session()
role = get_execution_role()

# The session's default S3 bucket, plus the key prefix this notebook files its objects under
bucket, prefix = session.default_bucket(), "distilbert-fine-tuning"

In [None]:
# Assumes you have a CSV dataset prepared for use with the Hugging Face transformers library,
# with a `text` column (raw input strings) and a `label` column (classification targets).
# The task is sequence classification (e.g., sentiment analysis).

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer

# Load your dataset
df = pd.read_csv('your_dataset.csv')  # Replace with your actual data loading

# Split the dataset: 10% is held out for evaluation.
# A fixed random_state keeps the train/eval partition identical across
# "Restart Kernel -> Run All", so the files uploaded to S3 are reproducible.
train_df, eval_df = train_test_split(df, test_size=0.1, random_state=42)

# Initialize tokenizer (shared by tokenize_data below)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Function to tokenize and format data
def tokenize_data(texts, labels):
    """Tokenize a column of texts and bundle the result with its labels.

    Uses the notebook-level ``tokenizer`` with truncation enabled,
    ``padding=True`` and ``max_length=128``.

    Args:
        texts: Object exposing ``tolist()`` that yields the raw strings
            (called with DataFrame columns in this notebook).
        labels: Object exposing ``tolist()`` that yields one label per text.

    Returns:
        dict with keys ``'input_ids'`` and ``'attention_mask'`` (taken from
        the tokenizer output) and ``'labels'`` (``labels.tolist()``).
    """
    encoded = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Keep only the two encoder fields that are written out later.
    features = {field: encoded[field] for field in ('input_ids', 'attention_mask')}
    features['labels'] = labels.tolist()
    return features

# Tokenize each split; both go through the same tokenize_data helper
train_dataset = tokenize_data(texts=train_df['text'], labels=train_df['label'])
eval_dataset = tokenize_data(texts=eval_df['text'], labels=eval_df['label'])

# Convert to format expected by SageMaker
import json
import os

def write_to_jsonl(dataset, filename):
    """Write a column-oriented dataset to ``filename`` as JSON Lines.

    Args:
        dataset: Mapping with ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'`` sequences, indexed in parallel.
        filename: Path of the file to create (overwritten if it exists).

    One JSON object is written per example, with the keys in the order
    ``input_ids``, ``attention_mask``, ``labels``. The number of rows is
    driven by ``dataset['input_ids']``.
    """
    with open(filename, 'w') as out:
        for row, token_ids in enumerate(dataset['input_ids']):
            record = {
                'input_ids': token_ids,
                'attention_mask': dataset['attention_mask'][row],
                'labels': dataset['labels'][row],
            }
            out.write(f"{json.dumps(record)}\n")

# Serialize both splits to JSON Lines files in the working directory
write_to_jsonl(train_dataset, 'train.jsonl')
write_to_jsonl(eval_dataset, 'eval.jsonl')

# Push the files to S3 under "<prefix>/data"; the return values are passed to fit() later
train_s3 = session.upload_data(
    path='train.jsonl',
    bucket=bucket,
    key_prefix=f"{prefix}/data",
)
eval_s3 = session.upload_data(
    path='eval.jsonl',
    bucket=bucket,
    key_prefix=f"{prefix}/data",
)

In [None]:
from sagemaker.huggingface import HuggingFace

# Fine-tuning settings handed to the training script as hyperparameters
hyperparameters = dict(
    epochs=3,
    train_batch_size=16,
    eval_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=500,
    model_name='distilbert-base-uncased',
    output_dir='/opt/ml/model',
)

# Estimator describing the training job
huggingface_estimator = HuggingFace(
    # Training code: scripts/train.py
    entry_point='train.py',
    source_dir='./scripts',
    # Permissions and hardware (pick the instance type to fit your budget and requirements)
    role=role,
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    # Framework versions for the training container
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    hyperparameters=hyperparameters,
)

In [None]:
# Launch training with two input channels: 'training' and 'eval'
huggingface_estimator.fit(
    inputs={'training': train_s3, 'eval': eval_s3},
)

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel

# Wrap the trained artifact (S3 path from the finished estimator) as a deployable model
# NOTE(review): entry_point is given without a source_dir, unlike the estimator's
# './scripts' — confirm inference.py is where the SDK will look for it.
huggingface_model = HuggingFaceModel(
    role=role,
    model_data=huggingface_estimator.model_data,
    entry_point="inference.py",
    transformers_version="4.26.0",
    pytorch_version="1.13.1",
    py_version="py39",
)

# Stand up a single-instance endpoint; choose an instance type appropriate for your workload
predictor = huggingface_model.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

In [None]:
# Two example inputs to send to the endpoint
sample_data = {
    "texts": [
        "I absolutely loved the movie, the acting was superb!",
        "The service at the restaurant was terrible and the food was cold.",
    ],
}

# Invoke the endpoint and show what it returned
response = predictor.predict(data=sample_data)
print(response)

In [None]:
# Clean up the resources created by deploy() so nothing is left behind.
# The model is deleted first, while the endpoint still exists to resolve it;
# then the endpoint itself is removed.
predictor.delete_model()
predictor.delete_endpoint()
