In [1]:
# Dataset on the Hugging Face Hub: https://huggingface.co/datasets/billsum
dataset_id = "billsum"

# Checkpoint to fine-tune.
# flan-t5 family sizes (parameters): small 80M, base 250M, large 780M, xl 3B, xxl 11B
model_id = "google/flan-t5-large"

#### Setup

In [2]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


In [3]:
import sagemaker

# Show which SageMaker Python SDK version this notebook was run with.
print(sagemaker.__version__)

# SageMaker session and the default S3 bucket associated with it;
# `bucket` is used later to build the dataset S3 paths.
sess = sagemaker.Session()
bucket = sess.default_bucket()

sagemaker.config INFO - Not applying SDK defaults from location: C:\ProgramData\sagemaker\sagemaker\config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: C:\Users\aayus\AppData\Local\sagemaker\sagemaker\config.yaml
2.241.0


In [4]:
import transformers
import datasets

# Print the installed version of each Hugging Face library, one per line.
for library in (transformers, datasets):
    print(library.__version__)

4.48.3
3.3.0


### Preprocessing

#### Load Dataset

In [5]:
from datasets import load_dataset, load_from_disk

# Load every split of the dataset named by `dataset_id`.
dataset = load_dataset(path=dataset_id)

# Last expression of the cell: display the split names, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})

#### Preprocess data

In [6]:
from transformers import AutoTokenizer

# Tokenizer belonging to the checkpoint selected in `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Task prefix prepended to every input document.
prefix = "summarize: "

# Truncation limits passed to the tokenizer as `max_length` for the
# encoder input and for the target text, respectively.
input_max_length, output_max_length = 1024, 128


def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq training.

    Args:
        examples: Batch mapping with at least a "text" column (the input
            documents) and a "title" column (the target strings).

    Returns:
        The tokenizer output for the prefixed documents, with the target
        token ids stored under the "labels" key.
    """
    # Every document gets the task prefix before tokenization.
    prompts = [prefix + document for document in examples["text"]]
    encoded = tokenizer(prompts, max_length=input_max_length, truncation=True)

    # NOTE(review): the targets are the bill *titles*, not the "summary"
    # column -- confirm this is the intended training target.
    target_ids = tokenizer(
        text_target=examples["title"], max_length=output_max_length, truncation=True
    )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [7]:
# Show the raw (untokenized) splits again before mapping the preprocessing over them.
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 18949
    })
    test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 3269
    })
    ca_test: Dataset({
        features: ['text', 'summary', 'title'],
        num_rows: 1237
    })
})


In [8]:
# Tokenize every split in batches and drop the raw string columns, leaving
# only the columns produced by `preprocess_function`.
tokenized_dataset = dataset.map(
    function=preprocess_function,
    batched=True,
    remove_columns=["title", "text", "summary"],
)

Map:   0%|          | 0/1237 [00:00<?, ? examples/s]

In [9]:
# Optional: keep a local copy of the tokenized dataset.
# tokenized_dataset.save_to_disk("billsum-t5-tokenized")

#### Upload processed dataset to S3


In [10]:
pip install pyarrow





In [11]:
import pyarrow.parquet as pq
import pyarrow as pa
import fsspec
import sagemaker

# SageMaker session and its default bucket (same calls as in the setup cell).
sess = sagemaker.Session()
bucket = sess.default_bucket()

# S3 locations of the tokenized training and validation splits.
s3_prefix = "huggingface/billsum-t5-summarization"
dataset_input_path = f"s3://{bucket}/{s3_prefix}"
train_input_path = f"{dataset_input_path}/train.parquet"
valid_input_path = f"{dataset_input_path}/test.parquet"  # `test` is used for validation

print("Train Input Path:", train_input_path)
print("Valid Input Path:", valid_input_path)

s3 = fsspec.filesystem("s3")

# Upload each tokenized split as a Parquet file unless it is already in S3.
# This keeps the notebook reproducible from a fresh kernel/bucket while
# skipping the (slow) conversion and upload on later runs.
for split_name, s3_path in (("train", train_input_path), ("test", valid_input_path)):
    if s3.exists(s3_path):
        continue
    split_table = pa.Table.from_pandas(tokenized_dataset[split_name].to_pandas())
    with s3.open(s3_path, "wb") as f:
        pq.write_table(split_table, f)
    print(f"Uploaded {split_name} dataset to {s3_path} 🚀")

# Make sure they exist in S3 before a training job is pointed at them.
assert s3.exists(train_input_path), f"Train dataset not found at {train_input_path}"
assert s3.exists(valid_input_path), f"Validation dataset not found at {valid_input_path}"


Train Input Path: s3://sagemaker-us-east-1-767398054325/huggingface/billsum-t5-summarization/train.parquet
Valid Input Path: s3://sagemaker-us-east-1-767398054325/huggingface/billsum-t5-summarization/test.parquet


In [12]:
!aws s3 ls s3://sagemaker-us-east-1-767398054325/huggingface/billsum-t5-summarization/


2025-03-09 18:42:58    5603021 test.parquet
2025-03-09 18:42:49   33747084 train.parquet


### Fine-tune on SageMaker with a Hugging Face Deep Learning Container

In [13]:
# Print the training script (train.py, the estimator's entry point) with syntax highlighting.
!pygmentize train.py 

import argparse
import logging
import os

import evaluate
import numpy as np
from datasets import load_from_disk
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    # hyperparameters are passed 

In [14]:
# Hyperparameters handed to the HuggingFace estimator. How each key is
# interpreted is defined by the argument parser in train.py, which is only
# partially shown in this notebook.
hyperparameters = {
    "epochs": 1,
    "learning-rate": 1e-6,
    "train-batch-size": 1,
    "eval-batch-size": 8,
    # Same checkpoint that was used to build the tokenizer for preprocessing.
    "model-name": model_id,
}

In [None]:
from sagemaker.huggingface import HuggingFace
import os
from dotenv import load_dotenv

# Read SAGEMAKER_ROLE / S3_BUCKET_NAME from a local .env file when one exists.
load_dotenv()
role = os.getenv("SAGEMAKER_ROLE")
# Only override the bucket when S3_BUCKET_NAME is actually set; otherwise keep
# the bucket resolved earlier instead of silently replacing it with None.
bucket = os.getenv("S3_BUCKET_NAME", bucket)

# Training job definition: runs train.py in the Hugging Face Deep Learning
# Container on a single ml.p3.2xlarge instance.
huggingface_estimator = HuggingFace(
    role=role,  # Use the IAM role
    entry_point="train.py",
    dependencies=["requirements.txt"],
    hyperparameters=hyperparameters,
    transformers_version="4.26.0",
    pytorch_version="1.13.1",
    py_version="py39",
    instance_type="ml.p3.2xlarge",
    instance_count=1,
    # No `distribution` argument: the SageMaker distributed data parallel
    # library is not supported on ml.p3.2xlarge (a single-GPU instance), so
    # requesting it here makes the estimator fail validation.
)


In [16]:
# Channel name -> S3 location of the Parquet file for that channel.
data_channels = {"train": train_input_path, "valid": valid_input_path}

# Launch the training job with both channels attached.
huggingface_estimator.fit(data_channels)


In [None]:
# Display the estimator's `model_data` attribute (the location of the trained model artifact).
huggingface_estimator.model_data 

### Deploy on SageMaker with a Hugging Face Deep Learning Container

In [17]:
# Endpoint sizing: a single ml.p3.2xlarge instance.
endpoint_config = {"initial_instance_count": 1, "instance_type": "ml.p3.2xlarge"}

# Deploy the fine-tuned model behind a real-time endpoint.
huggingface_predictor = huggingface_estimator.deploy(**endpoint_config)

In [18]:
# Build the request payload from the raw bill text of one test example.
# `prefix` already ends with ": ", and `dataset['test'][10]` is a whole row
# (text, summary, title), so only its "text" field is sent to the model.
test_data = {"inputs": f"{prefix}{dataset['test'][10]['text']}"}

In [19]:
# Send the payload to the deployed endpoint and show the response.
prediction = huggingface_predictor.predict(data=test_data)
print(prediction)

In [20]:
 # Clean up: delete the endpoint once the predictions are done.
 huggingface_predictor.delete_endpoint() 