In [None]:
!pip install transformers "datasets[s3]" --upgrade
!pip install sagemaker --upgrade
!pip install sagemaker-experiments
!pip install evaluate
!pip install rouge-score
!pip -q install transformers datasets sagemaker --upgrade
!pip -q install widgetsnbextension ipywidgets

In [None]:
import sagemaker.huggingface

In [None]:
import sagemaker

# SageMaker session bucket -> used for uploading data, models and logs.
# SageMaker will automatically create this bucket if it does not exist.
# Set this to None to fall back to the account's default SageMaker bucket.
sagemaker_session_bucket = "quotes-hf-aws"

sess = sagemaker.Session()
bucket = sess.default_bucket()
if sagemaker_session_bucket is None:
    # No explicit bucket configured: use the session's default bucket.
    sagemaker_session_bucket = bucket

role = sagemaker.get_execution_role()
# Re-create the session so that everything uploaded through it lands in the
# bucket selected above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Hub identifier of the checkpoint whose tokenizer is loaded further down.
tokenizer_name = "google/flan-t5-large"
# Hub identifier of the dataset that is loaded and split further down.
dataset_name = "Rozi05/quotes_dataset"

In [None]:
# Hold out 20% of the "train" split for evaluation. The fixed seed keeps the
# shuffled split identical across kernel restarts, so results are comparable.
dataset = load_dataset(dataset_name, split='train').train_test_split(test_size=0.2, shuffle=True, seed=42)

In [None]:
# Convenience handles for the two splits held by `dataset`.
train_dataset = dataset["train"]
test_dataset = dataset["test"]

In [None]:
# Maximum token counts used when encoding the prompts (source) and the
# quotes (target); longer sequences are truncated by the tokenizer calls.
max_source_length = 275
max_target_length = 512

# Tokenizer for the checkpoint named by `tokenizer_name`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of (tags, quote) pairs for seq2seq fine-tuning.

    Args:
        sample: Batch mapping with a "tags" sequence of strings and a
            parallel "quote" sequence holding the target texts.
        padding: Padding strategy forwarded to both tokenizer calls
            (for example "max_length" or False).

    Returns:
        The tokenizer output for the prompts, extended with a "labels"
        entry holding the token ids of the quotes.
    """
    prompts = [
        "Write a motivational quote that: " + (tag + ", describes it best.")
        for tag in sample["tags"]
    ]
    model_inputs = tokenizer(
        prompts, max_length=max_source_length, padding=padding, truncation=True
    )

    labels = tokenizer(
        text_target=sample["quote"],
        max_length=max_target_length,
        padding=padding,
        truncation=True,
    )

    if padding == "max_length":
        # Padded label positions must not contribute to the loss; -100 is the
        # index the loss function ignores.
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token_id if token_id != pad_id else -100) for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Run the preprocessing over every split in batches and drop the listed raw
# columns from the result.
columns_to_drop = ["index", "quote", "tags"]
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=columns_to_drop,
)

In [None]:
import botocore
from datasets.filesystems import S3FileSystem

s3 = S3FileSystem()

s3_prefix = 'samples/datasets/quotes_dataset'

# S3 locations of the tokenized splits; these are later handed to the
# training job as its input channels.
dataset_input_path = "s3://{}/{}".format(sagemaker_session_bucket, s3_prefix)
train_input_path = "{}/train".format(dataset_input_path)
valid_input_path = "{}/validation".format(dataset_input_path)

# Save datasets to S3. The `fs=` argument of `save_to_disk` is deprecated in
# favour of `storage_options`, so the filesystem's options are passed instead.
tokenized_dataset["train"].save_to_disk(train_input_path, storage_options=s3.storage_options)
tokenized_dataset["test"].save_to_disk(valid_input_path, storage_options=s3.storage_options)

print(dataset_input_path)
print(train_input_path)
print(valid_input_path)

In [None]:
import sagemaker
import boto3
from sagemaker.huggingface import HuggingFace

# Resolve the IAM role ARN used for the training job.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # get_execution_role() raised ValueError: look the role up by name
    # through IAM instead.
    iam = boto3.client("iam")
    role_description = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_description["Role"]["Arn"]

# Hyperparameters handed to the estimator and from there to the entry-point script.
hyperparameters = {
    # Model, output and data selection.
    "model_name_or_path": "google/flan-t5-large",
    "output_dir": "/opt/ml/model",
    "do_train": True,
    "dataset_name": "Rozi05/quotes_dataset",
    # Optimisation settings.
    "epochs": 1,
    "learning-rate": 1e-6,
    "train-batch-size": 1,
    "eval-batch-size": 8,
}

# Git source (repository + branch) from which a fine-tuning script could be downloaded.
# NOTE(review): this dict is not passed to the HuggingFace estimator in this
# notebook section -- confirm whether it is still needed.
git_config = {
    "repo": "https://github.com/huggingface/transformers.git",
    "branch": "v4.26.0",
}

# Creates the Hugging Face estimator that describes the training job.
huggingface_estimator = HuggingFace(
    # Use the role resolved above (including its IAM fallback) instead of
    # calling get_execution_role() a second time without any fallback.
    role=role,

    # Training script and its extra requirements, plus the arguments it receives.
    entry_point="train.py",
    dependencies=["requirements.txt"],
    hyperparameters=hyperparameters,

    # Framework versions of the training container.
    transformers_version="4.26.0",
    pytorch_version="1.13.1",
    py_version="py39",

    # A single ml.p3.16xlarge instance with SageMaker distributed data parallel enabled.
    instance_type="ml.p3.16xlarge",
    instance_count=1,
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)

# Start the training job; each key names an input channel and maps it to the
# S3 location of the corresponding tokenized split.
input_channels = {
    "train": train_input_path,
    "valid": valid_input_path,
}
huggingface_estimator.fit(input_channels)