Upgrade to the latest `sagemaker` version.

In [None]:
# !pip install datasets

In [None]:
# !pip install "sagemaker" "transformers" "datasets" --upgrade
# !pip install ipywidgets IProgress

In [None]:
# %%capture
# import IPython
# !conda install -c conda-forge ipywidgets -y
# !pip install ipywidgets IProgress
# IPython.Application.instance().kernel.do_shutdown(True) # has to restart kernel so changes are used

In [21]:
import boto3
import numpy as np
import pandas as pd

import sagemaker
from sagemaker.huggingface import HuggingFace, HuggingFaceModel

In [2]:
# permissions: the SageMaker session, the IAM role this notebook runs under,
# and a plain S3 client used later to upload the CSV splits
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
s3_client = boto3.client('s3')

# All notebook artifacts are stored under `prefix` in the session's default
# bucket. The session already targets that bucket, so it is not re-created.
bucket = sess.default_bucket()
prefix = "huggingface_classifier"

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::367158743199:role/service-role/AmazonSageMaker-ExecutionRole-20210413T121296
sagemaker bucket: sagemaker-us-east-1-367158743199
sagemaker session region: us-east-1


# Downloading the Dataset

In [22]:
# Load the raw reviews, keep only the review text and its rating, rename the
# two columns to the `text` / `label` names read by src/train.py, and drop
# rows where either value is missing.
df = (
    pd.read_csv('./data/Womens Clothing E-Commerce Reviews.csv')
    .loc[:, ['Review Text', 'Rating']]
    .rename(columns={'Review Text': 'text', 'Rating': 'label'})
    .dropna()
)

# Shuffle once with a fixed seed, then cut 60% / 20% / 20% by position.
# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes path on recent pandas versions.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6 * len(df))
validate_end = int(.8 * len(df))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

train.shape, validate.shape, test.shape

((13584, 2), (4528, 2), (4529, 2))

In [23]:
# Persist each split next to the raw data; the index is dropped so the files
# contain only the `text` and `label` columns.
for _frame, _csv_path in (
    (train, './data/train.csv'),
    (validate, './data/validate.csv'),
    (test, './data/test.csv'),
):
    _frame.to_csv(_csv_path, index=False)

In [None]:
# Local file -> S3 object key, one pair per split.
_uploads = [
    ('./data/train.csv', f'{prefix}/data/train.csv'),
    ('./data/validate.csv', f'{prefix}/data/validate.csv'),
    ('./data/test.csv', f'{prefix}/data/test.csv'),
]

# Copy every split into the session bucket under `prefix`.
for _local_path, _s3_key in _uploads:
    s3_client.upload_file(_local_path, bucket, _s3_key)

## Prepare a HuggingFace Transformers fine-tuning script.

In [None]:
!mkdir ./src

In [12]:
%%writefile src/train.py

import argparse
import logging
import os
import sys

import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification


if __name__ == "__main__":

    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train-batch-size", type=int, default=16)
    parser.add_argument("--eval-batch-size", type=int, default=8)
    parser.add_argument("--model_name", type=str)
    parser.add_argument("--learning_rate", type=str, default=5e-5)
    parser.add_argument("--do_train", type=bool, default=True)
    parser.add_argument("--do_eval", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])
    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )

    # Load model and tokenizer
    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Load dataset
    train_file = f"{args.training_dir}/train.csv"
    validate_file = f"{args.test_dir}/validate.csv"
    dataset = load_dataset('csv', data_files={'train': train_file,
                                             'test': validate_file})
    
    train_dataset = dataset['train']
    test_dataset = dataset['test']
    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
    logger.info(f" loaded test_dataset length is: {len(test_dataset)}")

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    train_features = {
        x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_dataset["label"])).batch(
        args.train_batch_size
    )

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True
    )
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    test_features = {
        x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_dataset["label"])).batch(
        args.eval_batch_size
    )

    # define optimizer and loss function
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    # Training
    if args.do_train:

        train_results = model.fit(tf_train_dataset, epochs=args.epochs, batch_size=args.train_batch_size)
        logger.info("*** Train ***")

        output_eval_file = os.path.join(args.output_data_dir, "train_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Train results *****")
            logger.info(train_results)
            for key, value in train_results.history.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

    # Evaluation
    if args.do_eval:

        result = model.evaluate(tf_test_dataset, batch_size=args.eval_batch_size, return_dict=True)
        logger.info("*** Evaluate ***")

        output_eval_file = os.path.join(args.output_data_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info(result)
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

    # Save result
    model.save_pretrained(args.model_dir)
    tokenizer.save_pretrained(args.model_dir)

Overwriting src/train.py


## Create a HuggingFace Estimator


In [13]:
# hyperparameters, which are passed into the training job as command-line
# arguments; every key must match a flag defined by argparse in src/train.py
# (the script parses with parse_known_args, so an unknown key is silently
# ignored and the script default is used instead)
hyperparameters={'epochs': 1,
                 'train-batch-size': 32,
                 'model_name': 'distilbert-base-uncased'
                 }

In [14]:
# create the Estimator
huggingface_estimator = HuggingFace(
    # training code: ./src/train.py, written by the %%writefile cell
    source_dir='./src',
    entry_point='train.py',
    hyperparameters=hyperparameters,
    # framework versions of the training container
    transformers_version='4.6',
    tensorflow_version='2.4',
    py_version='py37',
    # compute and permissions
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
)

# Execute the fine-tuning Job

In [15]:
# S3 locations of the uploaded splits
train_uri = f"s3://{bucket}/{prefix}/data/train.csv"
validate_uri = f"s3://{bucket}/{prefix}/data/validate.csv"

# One entry per input channel; the channel names correspond to the
# SM_CHANNEL_TRAIN / SM_CHANNEL_TEST variables read by src/train.py.
data = dict(train=train_uri, test=validate_uri)

# Launch the training job with these channels.
huggingface_estimator.fit(data)

2021-07-14 18:42:51 Starting - Starting the training job...
2021-07-14 18:43:17 Starting - Launching requested ML instancesProfilerReport-1626288171: InProgress
.........
2021-07-14 18:44:37 Starting - Preparing the instances for training.........
2021-07-14 18:46:20 Downloading - Downloading input data
2021-07-14 18:46:20 Training - Downloading the training image.....................
2021-07-14 18:49:42 Training - Training image download completed. Training in progress.[34m2021-07-14 18:49:43.868480: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.[0m
[34m2021-07-14 18:49:43.876125: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.[0m
[34m2021-07-14 18:49:44.045792: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0[0m
[34m202

In [16]:
# Keep the estimator's model artifact location; it is passed to
# HuggingFaceModel as `model_data` when the endpoint is created below.
model_data = huggingface_estimator.model_data

# Deploying the endpoint

In [17]:
# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    # framework versions, same as the ones used for the training estimator
    transformers_version='4.6',
    tensorflow_version='2.4',
    py_version='py37',
    # S3 path to your trained sagemaker model
    model_data=model_data,
    # IAM role with permissions to create an Endpoint
    role=role,
)

In [18]:
# deploy model to SageMaker Inference: a single ml.m5.xlarge instance
predictor = huggingface_model.deploy(
    instance_type="ml.m5.xlarge",
    initial_instance_count=1,
)

-------------!

### Invoke with the Python SDK

In [32]:
# Show the first held-out review as a one-element list of strings.
test['text'].iloc[:1].tolist()

["First, it's huge. i'm not big busted but it's so roomy that i looked like i was pregnant. not exactly what i was going for. nice color and soft fabric but would only work if you plan to tuck it in--and i'm short waisted so no tucking for me. back it went."]

In [35]:
# data = {"inputs": test['text'][:2].tolist()} 
# print(predictor.predict(data))

### Alternative: invoke with boto3

In [36]:
# runtime = boto3.client("sagemaker-runtime")

## DATA CITATION 