In [1]:
import numpy as np
import boto3
import sagemaker
import io
import sagemaker.amazon.common as smac
import os
import pandas as pd

# Read csv from s3.
# Download from your S3 bucket the bike share data CSV file based on the publically available bike share data from the ML repository curated by the University of California, Irvine
from io import StringIO
s3 = boto3.resource('s3')
bucket = 'machine-learning-exam' # place the day.csv file in a bucket in your account
object_key = 'day.csv'

# Load the data into a pandas dataframe 
csv_obj = s3.Object(bucket, object_key)
csv_string = csv_obj.get()['Body'].read().decode('utf-8')

dataset = pd.read_csv(StringIO(csv_string))
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [2]:
# Convert categorical date field
dataset['dteday'] = dataset['dteday'].str.replace("-","")
dataset.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,20110101,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,20110102,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,20110103,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,20110104,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108,1454,1562
4,5,20110105,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82,1518,1600


In [3]:
# Randomize the data and split it between train and test datasets on a 70% 30% split respectively
train_data, test_data = np.split(dataset.sample(frac=1, random_state=1729), [int(0.7 * len(dataset))])
print(train_data.shape, test_data.shape)

(511, 16) (220, 16)


In [4]:
# Get the features and labels.
feature_dataset = train_data[['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
                           'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered' ]]
features = np.array(feature_dataset.values).astype('float32')

label_dataset= train_data[['cnt']]
labels = np.array(label_dataset.values).astype('float32')
labels_vec = np.squeeze(np.asarray(labels))

In [5]:
# Setup protoBuf
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, features, labels_vec)
buffer.seek(0)

prefix = 'realestate'
key = 'linearregression'
boto3.resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(buffer)
s3_training_data_location = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('training dataset will be uploaded to: {}'.format(s3_training_data_location))

training dataset will be uploaded to: s3://machine-learning-exam/realestate/train/linearregression


In [6]:
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('model artifacts will be uploaded to: {}'.format(output_location))

model artifacts will be uploaded to: s3://machine-learning-exam/realestate/output


In [7]:
# Get the Linear Learner container instance
from sagemaker.amazon.amazon_estimator import get_image_uri
linear_container = get_image_uri(boto3.Session().region_name, 'linear-learner')

In [8]:
# Train the model
from sagemaker import get_execution_role

role = get_execution_role()

sagemaker_session = sagemaker.Session()

# Provide the container, role, instance type and model output location
linear = sagemaker.estimator.Estimator(linear_container,
                                       role=role, 
                                       train_instance_count=1, 
                                       train_instance_type='ml.c4.xlarge',
                                       output_path=output_location,
                                       sagemaker_session=sagemaker_session)

# Provide the number of features identified during data preparation
# Provide the predictor_type 

linear.set_hyperparameters(feature_dim=15,
                           mini_batch_size=4,
                           predictor_type='regressor')

# Train the model using the previously prepared test data and validate the 
# data by providing the validation data.

linear.fit({'train': s3_training_data_location})

2020-02-14 19:06:55 Starting - Starting the training job...
2020-02-14 19:06:56 Starting - Launching requested ML instances......
2020-02-14 19:08:02 Starting - Preparing the instances for training......
2020-02-14 19:09:24 Downloading - Downloading input data...
2020-02-14 19:09:47 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34m[02/14/2020 19:10:09 INFO 140163623442240] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto', u'lr_scheduler_step': u'auto', u'init_method': u'uniform', u'init_sigma': u'0.01', u'lr_scheduler_


2020-02-14 19:10:06 Training - Training image download completed. Training in progress.[34m[2020-02-14 19:10:15.572] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 6, "duration": 2785, "num_examples": 128, "num_bytes": 53144}[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.043587390780742245, "sum": 0.043587390780742245, "min": 0.043587390780742245}}, "EndTime": 1581707415.572849, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1581707415.572751}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.04950973400722375, "sum": 0.04950973400722375, "min": 0.04950973400722375}}, "EndTime": 1581707415.572937, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 1}, "StartTime": 1581707415.572923}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max

[34m[2020-02-14 19:10:25.396] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 14, "duration": 2273, "num_examples": 128, "num_bytes": 53144}[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.002343199140319828, "sum": 0.002343199140319828, "min": 0.002343199140319828}}, "EndTime": 1581707425.396479, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 5}, "StartTime": 1581707425.396384}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.0019308556938350502, "sum": 0.0019308556938350502, "min": 0.0019308556938350502}}, "EndTime": 1581707425.396573, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 5}, "StartTime": 1581707425.396553}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.005624238548087191, "sum": 0.005624238548087191, "min": 0.005624238548087191

[34m[2020-02-14 19:10:35.228] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 22, "duration": 2319, "num_examples": 128, "num_bytes": 53144}[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.00026026721050899555, "sum": 0.00026026721050899555, "min": 0.00026026721050899555}}, "EndTime": 1581707435.228513, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 9}, "StartTime": 1581707435.228419}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 6.459823257805188e-05, "sum": 6.459823257805188e-05, "min": 6.459823257805188e-05}}, "EndTime": 1581707435.228719, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 9}, "StartTime": 1581707435.228698}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 0.0009920670591087601, "sum": 0.0009920670591087601, "min": 0.0009920670


2020-02-14 19:10:54 Uploading - Uploading generated training model
2020-02-14 19:10:54 Completed - Training job completed
[34m[2020-02-14 19:10:47.046] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 32, "duration": 2140, "num_examples": 128, "num_bytes": 53144}[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 2.952944395578173e-05, "sum": 2.952944395578173e-05, "min": 2.952944395578173e-05}}, "EndTime": 1581707447.046762, "Dimensions": {"model": 0, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 14}, "StartTime": 1581707447.04668}
[0m
[34m#metrics {"Metrics": {"train_mse_objective": {"count": 1, "max": 2.023877109476466e-05, "sum": 2.023877109476466e-05, "min": 2.023877109476466e-05}}, "EndTime": 1581707447.046859, "Dimensions": {"model": 1, "Host": "algo-1", "Operation": "training", "Algorithm": "Linear Learner", "epoch": 14}, "StartTime": 1581707447.046841}
[0m
[34m#metrics {"Metri

Training seconds: 90
Billable seconds: 90


In [None]:
# Deploy the model
linear_predictor = linear.deploy(initial_instance_count=1,
                                 instance_type='ml.c4.xlarge',
                                 endpoint_name='bikeshare-sagemaker-regression-v1')

------

In [10]:
# Get prediction using the test data
from sagemaker.predictor import csv_serializer, json_deserializer

linear_predictor.content_type = 'text/csv'
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

test_feature_dataset = test_data[['instant', 'dteday', 'season', 'yr', 'mnth', 'holiday', 'weekday',
                           'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed', 'casual', 'registered']]

test_actuals = np.array(test_data['cnt'].astype('float32'))
test_features = np.array(test_feature_dataset.values).astype('float32')

predictions = []
actuals = []
for tf, actual in zip(test_features, test_actuals):
    prediction = linear_predictor.predict(tf)
    predictions.append(prediction['predictions'][0]['score'])
    actuals.append(actual)
    print('prediction: ', prediction['predictions'][0]['score'], '\t\tactual: ', str(actual))

prediction:  799.390625 		actual:  801.0
prediction:  5216.703125 		actual:  5217.0
prediction:  7765.5625 		actual:  7767.0
prediction:  6850.9375 		actual:  6852.0
prediction:  2209.953125 		actual:  2209.0
prediction:  6289.296875 		actual:  6290.0
prediction:  4790.71875 		actual:  4792.0
prediction:  1866.8125 		actual:  1865.0
prediction:  5670.359375 		actual:  5668.0
prediction:  4492.265625 		actual:  4492.0
prediction:  4368.203125 		actual:  4367.0
prediction:  2402.453125 		actual:  2402.0
prediction:  3847.546875 		actual:  3846.0
prediction:  4787.96875 		actual:  4788.0
prediction:  3189.03125 		actual:  3190.0
prediction:  3006.515625 		actual:  3005.0
prediction:  7284.65625 		actual:  7286.0
prediction:  2131.25 		actual:  2132.0
prediction:  5462.21875 		actual:  5464.0
prediction:  4984.09375 		actual:  4985.0
prediction:  6304.640625 		actual:  6304.0
prediction:  5532.03125 		actual:  5532.0
prediction:  8008.15625 		actual:  8009.0
prediction:  3000.46875 		actua

prediction:  3749.1875 		actual:  3747.0
prediction:  3572.734375 		actual:  3574.0
prediction:  2594.296875 		actual:  2594.0
prediction:  7040.453125 		actual:  7040.0
prediction:  2235.21875 		actual:  2236.0
prediction:  3785.796875 		actual:  3784.0
prediction:  2733.140625 		actual:  2732.0
prediction:  5255.671875 		actual:  5255.0
prediction:  4341.09375 		actual:  4342.0
prediction:  5528.203125 		actual:  5531.0
prediction:  4066.546875 		actual:  4067.0
prediction:  1890.6875 		actual:  1891.0
prediction:  3376.5625 		actual:  3376.0
prediction:  6977.0625 		actual:  6978.0
prediction:  6657.6875 		actual:  6660.0
prediction:  5112.59375 		actual:  5115.0
prediction:  2162.65625 		actual:  2162.0
prediction:  5875.5 		actual:  5875.0


In [11]:
# Get accuracy using Cosine Similarity method 
from numpy import dot
from numpy.linalg import norm
tolerance = 1e-10
accuracy = (dot(actuals, predictions)/(norm(actuals)*norm(predictions))) * 100
print('accuracy: ', accuracy)

accuracy:  100.00000313584616


In [12]:
# delete the endpoint
sagemaker.Session().delete_endpoint(linear_predictor.endpoint)