In [1]:
import sagemaker
from pathlib import Path
from sagemaker.predictor import json_serializer
import json
import numpy as np
import boto3

In [2]:
# SageMaker session used below for the S3 uploads and passed to the Estimator.
session = sagemaker.Session()
# Execution role that is handed to the Estimator when the training job is set up.
role = sagemaker.get_execution_role()

## Set Up Paths

In [3]:
# Local directory that holds train.csv, val.csv and labels.csv.
DATA_PATH = Path("../sm-data/")

# Sub-directory of DATA_PATH where training_config.json is stored.
# It is created if missing and left untouched if it already exists.
CONFIG_PATH = DATA_PATH.joinpath('config')
CONFIG_PATH.mkdir(exist_ok=True)

# Random five-digit suffix so the bucket name is unlikely to clash with an
# existing bucket. Drawing an integer and zero-padding it guarantees exactly
# five digits; slicing str(<random float>) could yield fewer characters
# (short float reprs) or non-digit characters (scientific notation for very
# small values), and an empty suffix would leave the name ending in '-'.
suffix = "{:05d}".format(int(np.random.randint(0, 100000)))

# S3 bucket name
bucket = 'toxic-pytorch-sagemaker-' + suffix

# Key prefixes inside the bucket for the training input and the job output
prefix = 'toxic_comments/input'
prefix_output = 'toxic_comments/output'

In [4]:
# Create the S3 bucket via the AWS CLI (`aws s3 mb` = make bucket); {bucket}
# is interpolated from the Python variable of the same name.
!aws s3 mb s3://{bucket}

make_bucket: toxic-pytorch-sagemaker-25884


## Hyperparameters & Training Config

In [5]:
# Training hyperparameters; passed to the SageMaker Estimator further down.
hyperparameters = dict(
    epochs=10,
    lr=8e-5,
    max_seq_length=512,
    train_batch_size=16,
    lr_schedule="warmup_cosine",
    warmup_steps=1000,
    optimizer_type="adamw",
)

In [6]:
# Settings written to training_config.json for the training container.
# Apart from finetuned_model (None), every value is deliberately a string,
# including the booleans, the step counts and the JSON-encoded label list.
training_config = dict(
    run_text="toxic comments",
    finetuned_model=None,
    do_lower_case="True",
    train_file="train.csv",
    val_file="val.csv",
    label_file="labels.csv",
    text_col="comment_text",
    label_col='["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]',
    multi_label="True",
    grad_accumulation_steps="1",
    fp16_opt_level="O1",
    fp16="True",
    model_type="roberta",
    model_name="roberta-base",
    logging_steps="300",
)

# Serialise the training configuration to CONFIG_PATH/training_config.json.
config_file = CONFIG_PATH/'training_config.json'
with config_file.open('w') as config_fh:
    config_fh.write(json.dumps(training_config))

## Upload Data

In [7]:
# Upload the local data directory to s3://<bucket>/<prefix>/ and keep the
# returned S3 URI; it is passed to estimator.fit() later as the training input.
# DATA_PATH is converted to str so this call matches the other upload_data
# calls in this notebook, which all pass a plain string path.
s3_input = session.upload_data(str(DATA_PATH), bucket=bucket, key_prefix=prefix)

# NOTE(review): val.csv sits directly inside DATA_PATH, so the directory
# upload above presumably already copied it to the same key -- confirm
# before removing this call.
session.upload_data(str(DATA_PATH/'val.csv'), bucket=bucket, key_prefix=prefix)

's3://toxic-pytorch-sagemaker-25884/toxic_comments/input/val.csv'

In [8]:
# Push labels.csv from the local data directory to the S3 input prefix.
# NOTE(review): the earlier upload of the whole DATA_PATH directory presumably
# already covers this file -- confirm.
labels_csv = str(DATA_PATH/'labels.csv')
session.upload_data(labels_csv, bucket=bucket, key_prefix=prefix)

's3://toxic-pytorch-sagemaker-25884/toxic_comments/input/labels.csv'

In [9]:
# Push train.csv from the local data directory to the S3 input prefix.
# NOTE(review): the earlier upload of the whole DATA_PATH directory presumably
# already covers this file -- confirm.
train_csv = str(DATA_PATH/'train.csv')
session.upload_data(train_csv, bucket=bucket, key_prefix=prefix)

's3://toxic-pytorch-sagemaker-25884/toxic_comments/input/train.csv'

## Create an Estimator and start training

In [10]:
# Training container image (ECR URI) that is passed to the Estimator below.
# The commented-out lines show how the URI could instead be built from the
# account and region of the current session:
#account = session.boto_session.client('sts').get_caller_identity()['Account']
#region = session.boto_session.region_name

#image = "{}.dkr.ecr.{}.amazonaws.com/sagemaker-bert:1.0-gpu-py36".format(account, region)
# NOTE(review): the URI below pins a specific account ID and the us-west-2
# region -- confirm it is reachable from wherever this notebook runs.
image = "111652037296.dkr.ecr.us-west-2.amazonaws.com/chazarey-sagemaker-fast-bert:1.0-gpu-py36"
# TODO: convert this to use SageMaker (SM) PyTorch.

In [11]:
# S3 URI (bucket + output prefix) given to the Estimator as its output_path.
s3_uri_template = "s3://{}/{}"
output_path = s3_uri_template.format(bucket, prefix_output)

In [12]:
# Generic Estimator wrapping the custom training image: a single
# ml.p3.8xlarge instance, with results written under output_path.
estimator_options = dict(
    train_instance_count=1,
    train_instance_type='ml.p3.8xlarge',
    output_path=output_path,
    base_job_name='toxic-comments',
    hyperparameters=hyperparameters,
    sagemaker_session=session,
)
estimator = sagemaker.estimator.Estimator(image, role, **estimator_options)

In [13]:
# Start the training job on the data uploaded to S3; the job's log is
# streamed into the cell output while it runs.
estimator.fit(inputs=s3_input)

2019-11-16 07:23:52 Starting - Starting the training job...
2019-11-16 07:23:54 Starting - Launching requested ML instances......
2019-11-16 07:24:56 Starting - Preparing the instances for training......
2019-11-16 07:26:02 Downloading - Downloading input data
2019-11-16 07:26:02 Training - Downloading the training image............
2019-11-16 07:28:09 Training - Training image download completed. Training in progress.
[31mStarting the training.[0m
[31m/opt/ml/input/data/training/config/training_config.json[0m
[31m{'run_text': 'toxic comments', 'finetuned_model': None, 'do_lower_case': 'True', 'train_file': 'train.csv', 'val_file': 'val.csv', 'label_file': 'labels.csv', 'text_col': 'comment_text', 'label_col': '["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]', 'multi_label': 'True', 'grad_accumulation_steps': '1', 'fp16_opt_level': 'O1', 'fp16': 'True', 'model_type': 'roberta', 'model_name': 'roberta-base', 'logging_steps': '300'}[0m
[31m{'train_batch_s

## Deploy the model to hosting service

In [14]:
# Deploy the trained model to an endpoint named 'bert-toxic-comments' on a
# single ml.m5.large instance, using the JSON serializer for requests.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='bert-toxic-comments',
    serializer=json_serializer,
)

--------------------------------------------------------------------------------------------------!

In [15]:
### Invoke the Endpoint
# Runtime client used to call the deployed endpoint directly by name.
client = boto3.client('sagemaker-runtime')

# Build the request body with json.dumps so quoting and escaping are always
# valid JSON, instead of hand-writing the JSON string.
sample_payload = json.dumps({"text": "this is really really good thanks for recommending!!"})

response = client.invoke_endpoint(
    EndpointName='bert-toxic-comments',
    Body=sample_payload,
    ContentType='application/json'
)

# response['Body'] yields bytes. Decode as UTF-8 (a superset of ASCII) so a
# reply containing non-ASCII characters does not raise UnicodeDecodeError.
result_text = response['Body'].read().decode('utf-8')
print('Our result for this payload is: {}'.format(result_text))

NameError: name 'boto3' is not defined