In [1]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [2]:
# S3 bucket plus the object keys of the training and test files
bucket_name = 'chandra-ml-sagemaker'
training_file_key = 'movie/user_movie_train.recordio'
test_file_key = 'movie/user_movie_test.recordio'

# Full S3 URIs: where model artifacts are written, and where each data file lives
s3_model_output_location = r's3://{0}/movie/model'.format(bucket_name)
s3_training_file_location, s3_test_file_location = (
    r's3://{0}/{1}'.format(bucket_name, key)
    for key in (training_file_key, test_file_key)
)

In [3]:
# Show the resolved S3 locations, one per line
for s3_location in (s3_model_output_location,
                    s3_training_file_location,
                    s3_test_file_location):
    print(s3_location)

s3://chandra-ml-sagemaker/movie/model
s3://chandra-ml-sagemaker/movie/user_movie_train.recordio
s3://chandra-ml-sagemaker/movie/user_movie_test.recordio


In [7]:
# Writing to and reading from S3 is just as easy.
# Files are referred to as objects in S3.
# A file name is referred to as a key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 bucket.

    Args:
        filename: Path of the local file to upload.
        bucket: Name of the destination S3 bucket.
        key: Object key (the S3 "file name") to store the data under.

    Returns:
        Whatever ``upload_fileobj`` returns for the upload call.
    """
    # Binary mode so the bytes are sent exactly as they are on disk.
    with open(filename, 'rb') as source_file:
        s3 = boto3.Session().resource('s3')
        target_object = s3.Bucket(bucket).Object(key)
        return target_object.upload_fileobj(source_file)

In [8]:
# Upload the local training file to the bucket under training_file_key
write_to_s3(r'ml-latest-small/user_movie_train.recordio',bucket_name,training_file_key)

In [9]:
# Upload the local test file to the bucket under test_file_key
write_to_s3(r'ml-latest-small/user_movie_test.recordio',bucket_name,test_file_key)

## Training Algorithm Docker Image
### AWS Maintains a separate image for every region and algorithm

In [10]:
# Registry Path for algorithms provided by SageMaker
#  https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
# Maps an AWS region name to the factorization-machines image URI for that region.
# Only the four regions below are listed; looking up any other region raises KeyError.
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}

In [11]:
# Obtain the execution role via the SageMaker SDK helper; it is passed to the Estimator
role = get_execution_role()

In [12]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
# NOTE(review): the printed ARN includes the AWS account id -- consider clearing
# this cell's output before sharing the notebook.
print(role)

arn:aws:iam::144943967277:role/service-role/AmazonSageMaker-ExecutionRole-20180311T102769


## Build Model

In [13]:
# SageMaker session object; handed to the Estimator as sagemaker_session
sess = sagemaker.Session()

In [14]:
# Build the training estimator:
#  - algorithm container image registered for the current region
#  - how many training instances to use and of which machine type
#  - S3 location where the trained model artifacts are stored
#  - base_job_name (optional) becomes the prefix of the training job name
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
training_image = containers[boto3.Session().region_name]

estimator = sagemaker.estimator.Estimator(
    training_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name='fm-movie-v2',
)

In [15]:
# Hyperparameters appropriate for the training algorithm.
# Sparse Matrix dimension: 100004, 9737 -- feature_dim is the second dimension.
fm_hyperparameters = {
    'feature_dim': 9737,
    'num_factors': 8,
    'predictor_type': 'regressor',
    'mini_batch_size': 1000,
    'epochs': 100,
}
estimator.set_hyperparameters(**fm_hyperparameters)

In [16]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'epochs': 100,
 'feature_dim': 9737,
 'mini_batch_size': 1000,
 'num_factors': 8,
 'predictor_type': 'regressor'}

### Train the model

In [17]:
# Channels passed to fit(): "train" and "test", each pointing at a file in S3.
# NOTE(review): channel names are algorithm specific -- this notebook trains
# Factorization Machines, so confirm the names against the supported-channels reference:
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit({'train':s3_training_file_location, 'test': s3_test_file_location})

INFO:sagemaker:Creating training-job with name: fm-movie-v2-2018-05-31-00-22-13-174


.......................
[31mDocker entrypoint called with argument(s): train[0m
[31m[05/31/2018 00:25:52 INFO 140523443300160] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-conf.json: {u'factors_lr': u'0.0001', u'linear_init_sigma': u'0.01', u'epochs': 1, u'_wd': u'1.0', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'factors_init_sigma': u'0.001', u'_log_level': u'info', u'bias_init_method': u'normal', u'linear_init_method': u'normal', u'linear_lr': u'0.001', u'factors_init_method': u'normal', u'_tuning_objective_metric': u'', u'bias_wd': u'0.01', u'use_linear': u'true', u'bias_lr': u'0.1', u'mini_batch_size': u'1000', u'_use_full_symbolic': u'true', u'batch_metrics_publish_interval': u'500', u'bias_init_sigma': u'0.01', u'_num_gpus': u'auto', u'_data_format': u'record', u'factors_wd': u'0.00001', u'linear_wd': u'0.001', u'_kvstore': u'auto', u'_learning_rate': u'1.0', u'_optimizer': u'adam'}[0m
[31m[05/31/2018 00:25:5

[31m[05/31/2018 00:25:59 INFO 140523443300160] #quality_metric: host=algo-1, epoch=15, train rmse <loss>=1.00289045575[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 363.73114585876465, "sum": 363.73114585876465, "min": 363.73114585876465}}, "EndTime": 1527726359.130419, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1527726358.766199}
[0m
[31m[05/31/2018 00:25:59 INFO 140523443300160] #progress_metric: host=algo-1, completed 16 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Records Since Last Reset": {"count": 1, "max": 70002, "sum": 70002.0, "min": 70002}, "Total Batches Seen": {"count": 1, "max": 1137, "sum": 1137.0, "min": 1137}, "Total Records Seen": {"count": 1, "max": 1121032, "sum": 1121032.0, "min": 1121032}, "Max Rec

[31m[05/31/2018 00:26:13 INFO 140523443300160] #quality_metric: host=algo-1, epoch=53, train rmse <loss>=0.928422744294[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 405.6811332702637, "sum": 405.6811332702637, "min": 405.6811332702637}}, "EndTime": 1527726373.881547, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1527726373.475315}
[0m
[31m[05/31/2018 00:26:13 INFO 140523443300160] #progress_metric: host=algo-1, completed 54 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Records Since Last Reset": {"count": 1, "max": 70002, "sum": 70002.0, "min": 70002}, "Total Batches Seen": {"count": 1, "max": 3835, "sum": 3835.0, "min": 3835}, "Total Records Seen": {"count": 1, "max": 3781108, "sum": 3781108.0, "min": 3781108}, "Max Recor

[31m[05/31/2018 00:26:23 INFO 140523443300160] #quality_metric: host=algo-1, epoch=79, train rmse <loss>=0.906015347516[0m
[31m#metrics {"Metrics": {"update.time": {"count": 1, "max": 385.93006134033203, "sum": 385.93006134033203, "min": 385.93006134033203}}, "EndTime": 1527726383.973657, "Dimensions": {"Host": "algo-1", "Operation": "training", "Algorithm": "factorization-machines"}, "StartTime": 1527726383.587188}
[0m
[31m[05/31/2018 00:26:23 INFO 140523443300160] #progress_metric: host=algo-1, completed 80 % of epochs[0m
[31m#metrics {"Metrics": {"Max Batches Seen Between Resets": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Batches Since Last Reset": {"count": 1, "max": 71, "sum": 71.0, "min": 71}, "Number of Records Since Last Reset": {"count": 1, "max": 70002, "sum": 70002.0, "min": 70002}, "Total Batches Seen": {"count": 1, "max": 5681, "sum": 5681.0, "min": 5681}, "Total Records Seen": {"count": 1, "max": 5601160, "sum": 5601160.0, "min": 5601160}, "Max Re

===== Job Complete =====
Billable seconds: 156


## Deploy Model

In [18]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# Deploy the trained model to an endpoint named 'fm-movie-v2' backed by a single
# ml.m4.xlarge instance; the returned predictor is used for the predictions below.
# NOTE(review): no endpoint cleanup cell is visible in this notebook -- confirm the
# endpoint is deleted once it is no longer needed.
predictor = estimator.deploy(initial_instance_count=1,
                             instance_type='ml.m4.xlarge',
                             endpoint_name = 'fm-movie-v2')

INFO:sagemaker:Creating model with name: factorization-machines-2018-05-31-00-28-15-636
INFO:sagemaker:Creating endpoint with name fm-movie-v2


--------------------------------------------------------------------------!

## Run Predictions
### Dense and Sparse Formats
https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html

In [19]:

import json
from sagemaker.predictor import json_deserializer

# Width of the sparse feature vector reported in each request's 'shape';
# same value as the feature_dim hyperparameter used for training.
dim_movie = 9737


def fm_sparse_serializer(data):
    """Serialize rows of active feature indices into a sparse JSON request body.

    Each row becomes one entry under ``'instances'`` of the form
    ``{'data': {'features': {'keys': [...], 'shape': [dim_movie], 'values': [...]}}}``
    where ``keys`` are the row's indices and every value is ``1``.

    Args:
        data: Iterable of rows. Each row is a 1-D NumPy array or a plain
            sequence (list/tuple) of feature indices.

    Returns:
        str: The JSON document as a string.
    """
    instances = []
    for row in data:
        # np.asarray accepts plain lists/tuples as well as NumPy arrays, and
        # tolist() converts NumPy scalars to JSON-serializable Python ints.
        column_list = np.asarray(row).tolist()
        # Every listed column is active, so each key is paired with the value 1.
        value_list = [1] * len(column_list)

        instances.append({'data': {'features': {'keys': column_list,
                                                'shape': [dim_movie],
                                                'values': value_list}}})
    return json.dumps({'instances': instances})

In [20]:
# Send requests as JSON built by fm_sparse_serializer and parse responses with json_deserializer
predictor.content_type = 'application/json'
predictor.serializer = fm_sparse_serializer
predictor.deserializer = json_deserializer

In [21]:
import numpy as np

In [22]:
# Preview the JSON request body produced for one row with feature indices 341 and 1416
fm_sparse_serializer([np.array([341,1416])])

'{"instances": [{"data": {"features": {"keys": [341, 1416], "shape": [9737], "values": [1, 1]}}}]}'

In [23]:
# Rating Entry: ['5 341:1 1416:1', '2.5 209:1 2640:1','2.5 164:1 1346:1']
# Presumably each entry is "<rating> <feature index>:1 ..." -- TODO confirm against the data files.
# The request below uses the feature indices of the first entry (341 and 1416).
predictor.predict([np.array([341,1416])])

{'predictions': [{'score': 4.107213020324707}]}

## Summary

1. Ensure the training and test data (and validation data, if the algorithm uses it) are in the S3 bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions