# FAST-BERT SENTIMENT ANALYSIS

In [1]:
import os
import boto3
import sagemaker
from pathlib import Path
from sagemaker.predictor import json_serializer, csv_serializer
import json

# Create the SageMaker session object and look up the execution role;
# both are handed to the estimator further down.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

In [2]:
# Copy the training and validation CSVs from the S3 input prefix into the
# notebook's current working directory (shell commands via the `!` escape).
!aws s3 cp s3://sagemaker-fast-bert/sentiment/input/data/train.csv .
!aws s3 cp s3://sagemaker-fast-bert/sentiment/input/data/val.csv .

download: s3://sagemaker-fast-bert/sentiment/input/data/train.csv to ./train.csv
download: s3://sagemaker-fast-bert/sentiment/input/data/val.csv to ./val.csv


In [3]:
import pandas as pd

# Read both splits, using the first CSV column as the DataFrame index.
train, val = (pd.read_csv(csv_name, index_col=0) for csv_name in ('train.csv', 'val.csv'))

# Report the size of each split, then preview the first training rows.
print(
    'Train set comprises {} samples.'.format(len(train)),
    'Validation set comprises {} samples.'.format(len(val)),
    sep='\n',
)
train.head()

Train set comprises 134603 samples.
Validation set comprises 8973 samples.


Unnamed: 0,text,label
15001,Friendly staff and quick service,pos
16134,Great food an service.,pos
137459,Great service and staff,pos
11891,No service,neg
8761,Servi was gud,pos


In [6]:
# The three sentiment classes, written to labels.csv one per line.
labels = ['neg', 'pos', 'mixed']

with open('labels.csv', 'w') as csvfile:
    csvfile.writelines(label + '\n' for label in labels)
        
# Prefix for S3 bucket for configurations, input and output
prefix_input = 'sentiment/input/data'
prefix_output = 'sentiment/output'
prefix_config = 'sentiment/input/data/config'

# S3 bucket name
bucket = 'sagemaker-fast-bert'

# Upload labels.csv under the input prefix (the same prefix train.csv and
# val.csv were downloaded from).
# S3 object keys always use '/' as the separator, so the key is built
# explicitly instead of with the OS-dependent os.path.join.
key = '{}/{}'.format(prefix_input, 'labels.csv')

# The context manager closes the file handle once the upload has finished
# (the handle was previously left open).
with open("labels.csv", 'rb') as fObj:
    boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(fObj)

# Output location in S3 passed to the estimator as `output_path`.
output_path = "s3://{}/{}".format(bucket, prefix_output)

# S3 prefix holding train.csv, val.csv, and labels.csv; the whole prefix is
# declared as one fully replicated CSV input.
s3_input = "s3://{}/{}/".format(bucket, prefix_input)
data_input = sagemaker.session.s3_input(
    s3_input,
    distribution='FullyReplicated',
    content_type='text/csv',
    s3_data_type='S3Prefix',
)

# Display the resulting input configuration as the cell output.
data_input.config

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix',
   'S3Uri': 's3://sagemaker-fast-bert/sentiment/input/data/',
   'S3DataDistributionType': 'FullyReplicated'}},
 'ContentType': 'text/csv'}

In [8]:
# Training hyperparameters; this dict is passed to the Estimator via its
# `hyperparameters` argument.
hyperparameters = dict(
    epochs=1,
    lr=4e-5,
    max_seq_length=256,
    train_batch_size=8,
    lr_schedule="warmup_cosine",
    warmup_steps=500,
    optimizer_type="adamw",
)

# Settings serialised to training_config.json and uploaded under the config
# prefix. Note that grad_accumulation_steps and logging_steps are given as
# strings, not integers.
training_config = dict(
    run_text="sentiment classification",
    finetuned_model=None,
    do_lower_case=True,
    # file names and column names of the input data
    train_file="train.csv",
    val_file="val.csv",
    label_file="labels.csv",
    text_col="text",
    label_col="label",
    multi_label=False,
    grad_accumulation_steps="8",
    fp16_opt_level="O2",
    fp16=False,
    model_type="bert",
    model_name="bert-base-uncased",
    logging_steps="300",
)

# save training config
with open('training_config.json', 'w', encoding='utf-8') as f:
    json.dump(training_config, f)

# Upload the config file under the config prefix.
# S3 object keys always use '/' as the separator, so the key is built
# explicitly instead of with the OS-dependent os.path.join.
key = '{}/{}'.format(prefix_config, 'training_config.json')

# The context manager closes the file handle once the upload has finished
# (the handle was previously left open).
with open("training_config.json", 'rb') as fObj:
    boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(fObj)

# Build the ECR image URI from the session's region and the caller's AWS
# account id (looked up through STS).
region = session.boto_session.region_name
account = session.boto_session.client('sts').get_caller_identity()['Account']
image = "{}.dkr.ecr.{}.amazonaws.com/sagemaker-bert-session:1.0-gpu-py36".format(account, region)

# Display the URI as the cell output.
image

'944828514909.dkr.ecr.us-east-1.amazonaws.com/sagemaker-bert-session:1.0-gpu-py36'

In [36]:
# Create the estimator for the custom training image.
estimator = sagemaker.estimator.Estimator(
    image,  # ECR image URI
    role,   # execution role
    # one ml.p3.2xlarge instance for training
    train_instance_count=1,
    train_instance_type='ml.p3.2xlarge',
    # S3 output location
    output_path=output_path,
    # prefix for the generated training job name
    base_job_name='bert-sentiment',
    # hyperparameters dict
    hyperparameters=hyperparameters,
    # SageMaker session
    sagemaker_session=session,
)

# Start the training job on the prepared input data.
estimator.fit(data_input)

2019-10-24 15:23:27 Starting - Starting the training job...
2019-10-24 15:23:28 Starting - Launching requested ML instances......
2019-10-24 15:24:35 Starting - Preparing the instances for training......
2019-10-24 15:25:53 Downloading - Downloading input data
2019-10-24 15:25:53 Training - Downloading the training image.................[31mStarting the training.[0m
[31m/opt/ml/input/data/training/config/training_config.json[0m
[31m{'run_text': 'sentiment', 'finetuned_model': None, 'do_lower_case': True, 'train_file': 'train.csv', 'val_file': 'val.csv', 'label_file': 'labels.csv', 'text_col': 'text', 'label_col': 'label', 'multi_label': False, 'grad_accumulation_steps': '8', 'fp16_opt_level': 'O2', 'fp16': False, 'model_type': 'bert', 'model_name': 'bert-base-uncased', 'logging_steps': '300'}[0m
[31m{'train_batch_size': '8', 'warmup_steps': '500', 'lr': '4e-05', 'max_seq_length': '256', 'optimizer_type': 'adamw', 'lr_schedule': 'warmup_cosine', 'epochs': '1'}[0m
[31m10/24/2019

[31m10/24/2019 15:50:36 - INFO - root -   eval_loss after step 900: 0.10492015442746208: [0m
[31m10/24/2019 15:50:36 - INFO - root -   eval_accuracy after step 900: 0.964783238604703: [0m
[31m10/24/2019 15:50:36 - INFO - root -   lr after step 900: 3.416290737724918e-05[0m
[31m10/24/2019 15:50:36 - INFO - root -   train_loss after step 900: 0.11460865293319027[0m
[31m10/24/2019 15:56:22 - INFO - root -   Running evaluation[0m
[31m10/24/2019 15:56:22 - INFO - root -     Num examples = 8973[0m
[31m10/24/2019 15:56:22 - INFO - root -     Batch size = 16[0m
[31m10/24/2019 15:57:07 - INFO - root -   eval_loss after step 1200: 0.08611540489044844: [0m
[31m10/24/2019 15:57:07 - INFO - root -   eval_accuracy after step 1200: 0.9696868382926558: [0m
[31m10/24/2019 15:57:07 - INFO - root -   lr after step 1200: 2.395225026408395e-05[0m
[31m10/24/2019 15:57:07 - INFO - root -   train_loss after step 1200: 0.10511528893373906[0m
[31m10/24/2019 16:02:53 - INFO - root -   Runn

In [77]:
%%time
from time import gmtime, strftime

# Endpoint name: fixed prefix followed by the current UTC timestamp.
endpoint_name = ''.join(['bert-sentiment-', strftime("%Y-%m-%d-%H-%M-%S", gmtime())])

# Deploy the trained model to one ml.t2.large instance; the returned
# predictor serialises requests with json_serializer.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.large',
    endpoint_name=endpoint_name,
    serializer=json_serializer,
)

Using already existing model: bert-sentiment-2019-10-24-15-23-27-102


---------------------------------------------------------------------------------------------------------------------------------------!CPU times: user 777 ms, sys: 44.3 ms, total: 821 ms
Wall time: 11min 23s


In [149]:
import json

# Single review wrapped as {"text": ...} and encoded as a JSON string.
# Adjacent string literals are concatenated by the parser, giving the same
# text as one long literal.
payload = json.dumps({
    "text": (
        "The food was nice, element of improvement on the preparation of the ribs should be grilled. "
        "Please invest in a flat top griller. Use a different bbq sauce for the chicken wings and  ribs. "
        "Overall service from Andries was great. "
    )
})

# NOTE(review): payload is already a JSON string and the predictor was
# deployed with json_serializer — confirm the request body is not
# JSON-encoded twice.
print(predictor.predict(payload))

b'[["mixed", 0.9535662531852722], ["neg", 0.02639363519847393], ["pos", 0.020040083676576614]]'


## Single prediction with invoke endpoint

In [253]:
from sagemaker.content_types import CONTENT_TYPE_JSON

# Runtime client used to call the endpoint directly, without the predictor.
client = boto3.client('sagemaker-runtime')

# Request and response are both JSON.
accept = 'application/json'
content_type = CONTENT_TYPE_JSON

response = client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType=content_type,
    Accept=accept,
    Body=payload,
)

# Read the response body, decode it to text and parse the JSON.
probas = json.loads(response['Body'].read().decode())
print(probas)

[['mixed', 0.9535662531852722], ['neg', 0.02639363519847393], ['pos', 0.020040083676576614]]


In [310]:
def get_prediction(probas):
    """Return the highest-scoring entry of a single endpoint response.

    `probas` is an iterable of entries whose second element (index 1) is the
    score, e.g. [['mixed', 0.95], ['neg', 0.03], ['pos', 0.02]]. The whole
    winning entry is returned; on ties the first one encountered wins.
    Raises ValueError when `probas` is empty.
    """
    def _score(entry):
        return entry[1]

    return max(probas, key=_score)

# Reduce the single parsed response in `probas` to its top-scoring
# [label, probability] entry and print it.
pred = get_prediction(probas)
print('Prediction is: {}'.format(pred))

Prediction is: ['mixed', 0.9535662531852722]


## Multiple predictions with invoke endpoint

In [259]:
# Two sample reviews to classify.
# Each review is built from adjacent string literals, which the parser joins.
# A backslash continuation *inside* a literal would pull the next line's
# leading indentation into the text (runs of 12 spaces mid-sentence) and
# glue "sick." directly onto that padding.
reviews = [
    ("The food was nice, element of improvement on the preparation of the ribs should be grilled. "
     "Please invest in a flat top griller. Use a different bbq sauce for the chicken wings and  ribs. "
     "Overall service from Andries was great. "),

    ("They delivered lamb chops that were off. The chops were smelly that I threw up. I took them back "
     "and they told me that it's the smell of lamb; I mean, really, I've eaten lamb before it didn't smelled "
     "off and never tasted like that. This is unacceptable, I couldn't have gotten sick. "
     "Food inspectors need to check it out "),
]

# One {"text": ...} request body per review.
payloads = [{"text": review} for review in reviews]

content_type = CONTENT_TYPE_JSON
accept = 'application/json'

# Parsed responses, one per review, in the same order as `reviews`.
probas = []

# I am using a loop as a work around for batch predictions, as invoke
# endpoint only manages one request at a time.
# The loop variable is deliberately NOT called `payload`: that name holds the
# single-review JSON string used by the earlier single-prediction cells, and
# rebinding it to a dict here would break those cells when they are re-run.
for review_payload in payloads:
    response = client.invoke_endpoint(EndpointName=endpoint_name,
                                      ContentType=content_type,
                                      Accept=accept,
                                      Body=json.dumps(review_payload))

    # Read the response body, decode it to text and parse the JSON.
    result = json.loads(response['Body'].read().decode())
    probas.append(result)

print(probas)

[[['mixed', 0.9535662531852722], ['neg', 0.02639363519847393], ['pos', 0.020040083676576614]], [['neg', 0.995301365852356], ['mixed', 0.004171039909124374], ['pos', 0.0005276606534607708]]]


In [334]:
def get_predictions(probas):
    """Return the highest-scoring entry of each endpoint response.

    `probas` is an iterable of responses, each an iterable of entries whose
    second element (index 1) is the score. The result is a list with one
    winning entry per response, in input order; on ties the first entry of
    a response wins. Raises ValueError if any response is empty.
    """
    def _score(entry):
        return entry[1]

    winners = []
    for response_scores in probas:
        winners.append(max(response_scores, key=_score))
    return winners

# Reduce every parsed response in `probas` to its top-scoring
# [label, probability] entry and print the resulting list.
preds = get_predictions(probas)
print('Predictions are: {}'.format(preds))

Predictions are: [['mixed', 0.9535662531852722], ['neg', 0.995301365852356]]


In [261]:
# Delete the endpoint named `endpoint_name`, using a SageMaker client
# created from the session's boto session. The API response is displayed
# as the cell output.
sm_client = session.boto_session.client('sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)

{'ResponseMetadata': {'RequestId': '1892ba70-9475-46a6-85f1-4df60f1cf2ea',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1892ba70-9475-46a6-85f1-4df60f1cf2ea',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0',
   'date': 'Thu, 24 Oct 2019 22:58:02 GMT'},
  'RetryAttempts': 0}}