In [230]:
import os
import boto3
import re
from sagemaker import get_execution_role
# Retrieve the IAM role that was created together with this notebook instance.
role = get_execution_role()

# S3 location used by the rest of the notebook.
bucket = 'ccprojectmlpart'
prefix = 'sagemaker/cc-ml-part'  # key prefix inside the bucket where training files are uploaded

In [231]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

In [235]:
# Remote CSV with the raw samples; it is read with header=None, so the column
# names are attached explicitly afterwards.
DATA_URL = 'https://raw.githubusercontent.com/CCLocationSharing/Real-time-Location-Sharing-System/data/mldata/ccMlData.csv'
COLUMN_NAMES = ["libId", "q", "m", "carpenter", "olin", "uris", "gates"]

data = pd.read_csv(DATA_URL, header=None)
data.columns = COLUMN_NAMES

# Keep a local copy of the frame with the named columns.
data.to_csv("data.csv", sep=',', index=False)

# Dimensions of the loaded frame.
print(data.shape)

# First rows, followed by per-column summary statistics.
display(data.head())
display(data.describe())

# How many rows carry each libId value (libId is column 0, used later as the label).
display(data["libId"].value_counts())


(10000, 7)


Unnamed: 0,libId,q,m,carpenter,olin,uris,gates
0,4,0,1,26,30,10,81
1,1,0,1,26,14,0,41
2,1,0,1,22,3,9,8
3,2,1,0,14,13,55,3
4,1,0,1,85,15,20,49


Unnamed: 0,libId,q,m,carpenter,olin,uris,gates
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.493,0.6128,0.5801,29.2473,28.2652,27.6031,27.9973
std,1.112868,0.487134,0.493567,22.187541,24.992141,25.799142,23.251266
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.75,0.0,0.0,13.0,10.0,8.0,10.0
50%,2.0,1.0,1.0,25.0,19.0,17.0,21.0
75%,3.0,1.0,1.0,38.0,43.0,44.0,40.0
max,4.0,1.0,1.0,100.0,100.0,100.0,100.0


3    2546
2    2512
1    2500
4    2442
Name: libId, dtype: int64

In [249]:
# Randomly partition the rows into roughly 80% train / 10% validation / 10% test.
# A dedicated, seeded generator makes the partition reproducible across kernel
# restarts without touching NumPy's global random state.
SPLIT_SEED = 42
rand_split = np.random.RandomState(SPLIT_SEED).rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]

# The three masks must cover every row exactly once.
assert len(data_train) + len(data_val) + len(data_test) == len(data)


def _labels_and_features(frame):
    """Split a frame into (labels, features) NumPy arrays.

    Column 0 is returned as a 1-D label array and all remaining columns as a
    2-D feature matrix.
    """
    # `.values` replaces `.as_matrix()`, which is deprecated and has been
    # removed from current pandas releases.
    return frame.iloc[:, 0].values, frame.iloc[:, 1:].values


train_y, train_X = _labels_and_features(data_train)
val_y, val_X = _labels_and_features(data_val)
test_y, test_X = _labels_and_features(data_test)



In [250]:
# Quick look at the training labels followed by the training feature matrix.
print(train_y, train_X, sep='\n')

[1 2 1 ... 2 1 4]
[[ 0  1 26 14  0 41]
 [ 1  0 14 13 55  3]
 [ 0  1 85 15 20 49]
 ...
 [ 1  1 11 90 29 34]
 [ 0  0 75  3  4 20]
 [ 1  1 40  4  6 56]]


In [251]:
# Convert the training set to the recordIO-wrapped protobuf format used by the
# Amazon SageMaker algorithms, and then upload it to S3.
train_file = 'linear_train.data'

# S3 object keys use '/' as the separator, so the key is built explicitly
# instead of with os.path.join, whose separator depends on the local OS.
train_key = '{}/train/{}'.format(prefix, train_file)

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
f.seek(0)  # rewind so the upload reads the buffer from its first byte

boto3.Session().resource('s3').Bucket(bucket).Object(train_key).upload_fileobj(f)

In [252]:
# Convert the validation set to the same recordIO-wrapped protobuf format and
# upload it to S3.
validation_file = 'linear_validation.data'

# S3 object keys use '/' as the separator, so the key is built explicitly
# instead of with os.path.join, whose separator depends on the local OS.
validation_key = '{}/validation/{}'.format(prefix, validation_file)

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))
f.seek(0)  # rewind so the upload reads the buffer from its first byte

boto3.Session().resource('s3').Bucket(bucket).Object(validation_key).upload_fileobj(f)

In [253]:
# Linear-learner image URI for each supported region.
# See 'Algorithms Provided by Amazon SageMaker: Common Parameters' in the SageMaker documentation for an explanation of these values.
_linear_learner_registries = (
    ('us-west-2', '174872318107'),
    ('us-east-1', '382416733822'),
    ('us-east-2', '404615174143'),
    ('eu-west-1', '438346466558'),
)
containers = {
    region_name: '{}.dkr.ecr.{}.amazonaws.com/linear-learner:latest'.format(registry_id, region_name)
    for region_name, registry_id in _linear_learner_registries
}

In [254]:
# Train

# The UTC timestamp (to the second) is part of the training-job name.
linear_job = 'cc-ml-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())

print("Job name is:", linear_job)

# Request body passed to create_training_job in the next cell.
linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {
        # Image is selected by the region of the current boto3 session; a region
        # missing from `containers` raises KeyError here.
        "TrainingImage": containers[boto3.Session().region_name],
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "HyperParameters": {
        # Derived from the training matrix instead of being hard-coded, so it
        # cannot drift from the number of feature columns actually uploaded.
        "feature_dim": str(train_X.shape[1]),
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "auto",
        "loss": "auto"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60  # one hour
    }
}

Job name is: cc-ml-2018-05-01-22-48-12


In [255]:
%%time

region = boto3.Session().region_name
sm = boto3.client('sagemaker')

# Submit the job described by linear_training_params.
sm.create_training_job(**linear_training_params)

# Status immediately after submission.
status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']
print(status)

try:
    # Block until the job reaches a terminal state.
    sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)
finally:
    # Re-query the job: the status read before waiting is stale, so checking it
    # could never detect a failure. This runs in `finally` so the failure reason
    # is still reported if the waiter itself raises.
    job_description = sm.describe_training_job(TrainingJobName=linear_job)
    status = job_description['TrainingJobStatus']
    print("Training job ended with status: " + status)
    if status == 'Failed':
        message = job_description['FailureReason']
        print('Training failed with the following error: {}'.format(message))
        raise Exception('Training job failed')

InProgress
CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 4min


In [256]:
# Hosting container definition: the region's linear-learner image plus the
# model artifacts reported by the finished training job.
hosting_image = containers[boto3.Session().region_name]
training_job_description = sm.describe_training_job(TrainingJobName=linear_job)
model_artifacts_url = training_job_description['ModelArtifacts']['S3ModelArtifacts']

linear_hosting_container = {
    'Image': hosting_image,
    'ModelDataUrl': model_artifacts_url,
}

# Register the model under the same name as the training job.
create_model_response = sm.create_model(
    ModelName=linear_job,
    ExecutionRoleArn=role,
    PrimaryContainer=linear_hosting_container,
)

print(create_model_response['ModelArn'])

arn:aws:sagemaker:us-west-2:226826713164:model/cc-ml-2018-05-01-22-48-12


In [258]:
# Endpoint configuration name carries a UTC timestamp (to the second).
config_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())
linear_endpoint_config = 'ccml-linear-endpoint-config-' + config_timestamp
print(linear_endpoint_config)

# A single production variant serving the model created from the training job.
all_traffic_variant = {
    'InstanceType': 'ml.m4.xlarge',
    'InitialInstanceCount': 1,
    'ModelName': linear_job,
    'VariantName': 'AllTraffic',
}

create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[all_traffic_variant],
)

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

ccml-linear-endpoint-config-2018-05-02-03-54-51
Endpoint Config Arn: arn:aws:sagemaker:us-west-2:226826713164:endpoint-config/ccml-linear-endpoint-config-2018-05-02-03-54-51


In [260]:
%%time

# Endpoint name carries a UTC timestamp (to the minute).
endpoint_timestamp = time.strftime("%Y%m%d%H%M", time.gmtime())
linear_endpoint = 'ccml-linear-endpoint-' + endpoint_timestamp
print(linear_endpoint)

# Request the endpoint using the configuration created earlier.
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint,
    EndpointConfigName=linear_endpoint_config,
)
print(create_endpoint_response['EndpointArn'])

# Status right after the request was accepted.
resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Status: " + status)

# Block until the 'endpoint_in_service' waiter returns.
in_service_waiter = sm.get_waiter('endpoint_in_service')
in_service_waiter.wait(EndpointName=linear_endpoint)

# Re-read the endpoint so the reported status reflects the state after waiting.
resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

if not status == 'InService':
    raise Exception('Endpoint creation did not succeed')

ccml-linear-endpoint-201805020402
arn:aws:sagemaker:us-west-2:226826713164:endpoint/ccml-linear-endpoint-201805020402
Status: Creating
Arn: arn:aws:sagemaker:us-west-2:226826713164:endpoint/ccml-linear-endpoint-201805020402
Status: InService
CPU times: user 60 ms, sys: 4 ms, total: 64 ms
Wall time: 6min 32s
