Upgrade to the latest `sagemaker` version.

In [None]:
# !pip install "sagemaker" -qU

In [1]:
import boto3
import numpy as np
import pandas as pd

import sagemaker
from sagemaker import TrainingJobAnalytics
from sagemaker.huggingface import HuggingFace, HuggingFaceModel, HuggingFacePredictor

In [2]:
# AWS handles used throughout the notebook: execution role and S3 client
role = sagemaker.get_execution_role()
s3_client = boto3.client('s3')

# Look up the default bucket, then rebuild the session pinned to that bucket
sess = sagemaker.Session()
bucket = sess.default_bucket()
sess = sagemaker.Session(default_bucket=bucket)

# Key prefix under which this notebook stores its data in the bucket
prefix = "huggingface_classifier"

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::367158743199:role/service-role/AmazonSageMaker-ExecutionRole-20210413T121296
sagemaker bucket: sagemaker-us-east-1-367158743199
sagemaker session region: us-east-1


# Downloading the Dataset

In [3]:
# Load the raw reviews and keep only the review text and its rating.
# rename() returns a new frame, so the label assignment below operates on an
# independent DataFrame instead of a column-subset of the raw data.
raw_reviews = pd.read_csv('./data/Womens Clothing E-Commerce Reviews.csv')
df = raw_reviews[['Review Text', 'Rating']].rename(
    columns={'Review Text': 'text', 'Rating': 'label'}
)

# Shift the ratings down by one so the class ids start at 0.
df['label'] = df['label'] - 1

# Drop rows with a missing review text or rating.
df = df.dropna()

# Shuffle reproducibly, then cut into 60% train / 20% validate / 20% test.
# Positional slices are copied so each split can be modified on its own later
# (a 'prediction' column is added to `test` further down).
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
validate_end = int(.8 * len(shuffled))

train = shuffled.iloc[:train_end].copy()
validate = shuffled.iloc[train_end:validate_end].copy()
test = shuffled.iloc[validate_end:].copy()

train.shape, validate.shape, test.shape

((13584, 2), (4528, 2), (4529, 2))

In [4]:
# Write every split to ./data/<split>.csv without the index column
for split_name, split_frame in (('train', train), ('validate', validate), ('test', test)):
    split_frame.to_csv(f'./data/{split_name}.csv', index=False)

In [5]:
# Upload each split to s3://<bucket>/<prefix>/data/<file>
for csv_name in ('train.csv', 'validate.csv', 'test.csv'):
    s3_client.upload_file(f'./data/{csv_name}', bucket, f'{prefix}/data/{csv_name}')

## Prepare a HuggingFace Transformers fine-tuning script.

In [6]:
!mkdir ./src

mkdir: cannot create directory ‘./src’: File exists


In [7]:
%%writefile src/train.py

import os
import sys
import logging
import argparse
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer


# Configure the root logger to write INFO-and-above records to stdout,
# then create this module's logger.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)

logger = logging.getLogger(__name__)

def parse_args(argv=None):
    """Parse the hyperparameters and SageMaker directories for this run.

    Hyperparameters arrive as command-line flags. The data, model, and output
    locations default to the ``SM_*`` environment variables.

    Args:
        argv: Argument list to parse. ``None`` (the default) parses
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the recognised options. Unrecognised flags
        are ignored rather than treated as errors.

    Raises:
        KeyError: If one of the ``SM_*`` environment variables is not set.
    """
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # Hyphenated flags also accept the underscored spelling: the notebook's
    # hyperparameter key is `train_batch_size`, which a hyphen-only flag does
    # not match, so parse_known_args() would drop it and use the default.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train-batch-size", "--train_batch_size", type=int, default=32)
    parser.add_argument("--eval-batch-size", "--eval_batch_size", type=int, default=64)
    parser.add_argument("--warmup_steps", type=int, default=500)
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--output_dir", type=str)

    # Data, model, and output directories
    parser.add_argument("--output-data-dir", "--output_data_dir", type=str,
                        default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model-dir", "--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args(argv)
    return args


if __name__ == "__main__":

    logger.info(sys.argv)

    args = parse_args()

    # download tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Load dataset: train.csv from the train channel, validate.csv from the test channel
    train_file = f"{args.training_dir}/train.csv"
    validate_file = f"{args.test_dir}/validate.csv"
    dataset = load_dataset('csv', data_files={'train': train_file,
                                             'test': validate_file})

    train_dataset = dataset['train']
    test_dataset = dataset['test']
    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
    logger.info(f" loaded test_dataset length is: {len(test_dataset)}")

    # tokenizer helper function
    def tokenize(batch):
        """Tokenize the 'text' column of a batch, padding/truncating to max length."""
        return tokenizer(batch['text'], padding='max_length', truncation=True)

    # tokenize dataset
    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # set format for pytorch
    train_dataset = train_dataset.rename_column("label", "labels")
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    test_dataset = test_dataset.rename_column("label", "labels")
    test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
    logger.info(f" loaded test_dataset length is: {len(test_dataset)}")

    # compute metrics function for multi-class classification (weighted averages)
    def compute_metrics(pred):
        """Return accuracy and weighted precision/recall/F1 for a prediction batch."""
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

    # download model from model hub
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=5)

    # define training args
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.train_batch_size,
        per_device_eval_batch_size=args.eval_batch_size,
        warmup_steps=args.warmup_steps,
        evaluation_strategy="epoch",
        logging_dir=f"{args.output_data_dir}/logs",
        learning_rate=args.learning_rate,
    )

    # create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    # train model, resuming from the newest checkpoint in output_dir if there is one.
    # The checkpoint path itself is passed on, not the directory that contains it.
    last_checkpoint = None
    if args.output_dir and os.path.isdir(args.output_dir):
        last_checkpoint = get_last_checkpoint(args.output_dir)

    if last_checkpoint is not None:
        logger.info("***** continue training *****")
        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:
        trainer.train()

    # evaluate model
    eval_result = trainer.evaluate(eval_dataset=test_dataset)

    # writes eval result to a file in the output data directory
    with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
        print("***** Eval results *****")
        for key, value in sorted(eval_result.items()):
            writer.write(f"{key} = {value}\n")

    # Saves the model and tokenizer to the model directory
    trainer.save_model(args.model_dir)
    tokenizer.save_pretrained(args.model_dir)

Overwriting src/train.py


## Create a HuggingFace Estimator


In [8]:
# hyperparameters, which are passed into the training job
hyperparameters = {
    'epochs': 1,
    'train_batch_size': 32,
    'model_name': 'distilbert-base-uncased',
    'output_dir': '/opt/ml/checkpoints',
}

# Pattern for one logged number: digits, an optional fractional part and an
# optional exponent. Values such as 0.8266, 1.0 and 4.9e-05 are each captured
# whole in group 1 (a mantissa followed by an exponent used to be cut short
# after the fraction). Raw string, so the escaped dot reaches the regex engine.
number_pattern = r"([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"

# Metrics to capture; each is matched as "'<name>': <number>" in the training log
metric_names = [
    'loss',
    'learning_rate',
    'eval_loss',
    'eval_accuracy',
    'eval_f1',
    'eval_precision',
    'eval_recall',
    'eval_runtime',
    'eval_samples_per_second',
    'epoch',
]

metric_definitions = [
    {'Name': name, 'Regex': f"'{name}': {number_pattern},?"}
    for name in metric_names
]

In [9]:
# S3 location that receives the checkpoints written during training
job_name = "using-spot"
checkpoint_s3_uri = f's3://{bucket}/{job_name}/checkpoints'

# Gather the estimator settings first, then build the Estimator from them.
# To train on spot instances, additionally pass:
#   use_spot_instances=True,
#   max_wait=3600,  # seconds; should be equal to or greater than max_run
#   max_run=1000,   # expected max run in seconds
estimator_settings = dict(
    entry_point='train.py',
    source_dir='./src',
    instance_type='ml.p3.2xlarge',
    # NOTE(review): `size` does not look like a documented estimator argument;
    # presumably `volume_size` was intended. Confirm before relying on it.
    size=5,
    instance_count=1,
    base_job_name=job_name,
    checkpoint_s3_uri=checkpoint_s3_uri,
    role=role,
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
)
huggingface_estimator = HuggingFace(**estimator_settings)

# Execute the fine-tuning Job

In [10]:
# Map each input channel name to the S3 object the training job should read
s3_data_prefix = f"s3://{bucket}/{prefix}/data"
data = {
    'train': f"{s3_data_prefix}/train.csv",
    'test': f"{s3_data_prefix}/validate.csv",
}

# Launch the training job with those channels
huggingface_estimator.fit(data)

2021-07-20 21:07:00 Starting - Starting the training job...
2021-07-20 21:07:02 Starting - Launching requested ML instancesProfilerReport-1626815220: InProgress
......
2021-07-20 21:08:29 Starting - Preparing the instances for training.........
2021-07-20 21:09:49 Downloading - Downloading input data...
2021-07-20 21:10:29 Training - Downloading the training image...............
2021-07-20 21:12:58 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-07-20 21:12:58,588 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-07-20 21:12:58,612 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-07-20 21:13:00,052 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-07-20 21:13:00,507 sagemaker-train

## Accessing Training Metrics

In [11]:
# Pull the metrics captured for the latest training job into a pandas DataFrame
latest_job_name = huggingface_estimator.latest_training_job.name
analytics = TrainingJobAnalytics(training_job_name=latest_job_name)
df = analytics.dataframe()
df.head(10)



Unnamed: 0,timestamp,metric_name,value
0,0.0,eval_loss,0.826591
1,0.0,eval_accuracy,0.655256
2,0.0,eval_f1,0.622453
3,0.0,eval_precision,0.611006
4,0.0,eval_recall,0.655256
5,0.0,eval_runtime,31.27
6,0.0,eval_samples_per_second,144.803
7,0.0,epoch,1.0


# Deploying the endpoint

In [12]:
# Host the fine-tuned model on a single ml.g4dn.xlarge instance
predictor = huggingface_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

---------------!

## Evaluate predictions on the test set

In [22]:
# Request body holding every test review as a list of strings
payload = dict(inputs=test['text'].tolist())

In [41]:
# Score the test reviews with the endpoint, one request per review
pred_list = []
for review_text in test['text']:
    payload = {"inputs": review_text}
    pred = predictor.predict(payload)[0]  # first entry of the response

    # swap the 'label' key (e.g. 'LABEL_3') for an integer 'prediction'
    label_name = pred.pop('label')
    pred['prediction'] = int(label_name.replace('LABEL_', ''))
    pred_list.append(pred)

In [42]:
# Attach each row's prediction dict as a new 'prediction' column
# (pred_list was built by iterating over `test`, so it is in row order)
test['prediction'] = pred_list

In [45]:
# Expand the per-row prediction dicts into their own columns and place them
# next to the original test columns
prediction_columns = test['prediction'].apply(pd.Series)
test_without_prediction = test.drop(columns=['prediction'])
df_test = pd.concat([test_without_prediction, prediction_columns], axis=1)

In [46]:
# Preview the first five scored rows
df_test.head(n=5)

Unnamed: 0,text,label,score,prediction
20494,"First, it's huge. i'm not big busted but it's ...",1,0.584123,2.0
12682,"I thought this top was so pretty and airy, but...",2,0.55281,2.0
1096,I love the style and quality of this blouse. i...,4,0.887288,4.0
15059,I was really disappointed when i tried this to...,0,0.55544,2.0
11826,Super soft and comfy. so far not stretching ou...,3,0.485245,3.0


# Deploying the endpoint - HF model

In [None]:
model_name = 'huggingface-finetune'

# Create the Hugging Face Model class. The framework and Python versions mirror
# the estimator that trained the model (PyTorch 1.7 / py36): the artifact was
# produced by a PyTorch training job, so it needs the PyTorch inference
# container rather than the TensorFlow one.
huggingface_model = HuggingFaceModel(
    model_data=huggingface_estimator.model_data,  # S3 path to your trained sagemaker model
    role=role,  # IAM role with permissions to create an Endpoint
    transformers_version='4.6',
    pytorch_version='1.7',
    py_version='py36',
    name=model_name,
)

In [None]:
# Create a SageMaker Inference endpoint for the model on one ml.m5.xlarge instance
predictor = huggingface_model.deploy(instance_type="ml.m5.xlarge", initial_instance_count=1)

### Invoke with the Python SDK

In [None]:
# Show the first test review as a one-element list
test['text'].iloc[:1].tolist()

In [None]:
# data = {"inputs": test['text'][:2].tolist()} 
# print(predictor.predict(data))

### Alternative: invoke with boto3

In [None]:
# SageMaker API client; the next cell uses it to list endpoints
client = boto3.client('sagemaker')

In [None]:
# Pick the most recently created endpoint that is in service. The sort order
# and status filter are spelled out so the choice does not depend on the
# service's default ordering, and an empty result gives a clear error instead
# of an IndexError.
endpoint_listing = client.list_endpoints(
    SortBy='CreationTime',
    SortOrder='Descending',
    StatusEquals='InService',
    MaxResults=1,
)
if not endpoint_listing['Endpoints']:
    raise RuntimeError("No InService SageMaker endpoint found; deploy a model first.")

endpoint = endpoint_listing['Endpoints'][0]['EndpointName']
endpoint

In [None]:
# SageMaker runtime client.
# NOTE(review): not referenced by the remaining cells shown here, which invoke
# the endpoint through HuggingFacePredictor instead. Confirm whether it is needed.
runtime = boto3.client("sagemaker-runtime")

In [None]:
# Attach a predictor to the existing endpoint and send it a single sentence
predictor = HuggingFacePredictor(endpoint_name=endpoint, sagemaker_session=sess)

payload = {"inputs": ["I love using the new Inference DLC."]}
predictor.predict(data=payload)

## DATA CITATION 