In [151]:
import os
import boto3
import re
from sagemaker import get_execution_role
#retrives the IAM role created at the time of creating the notebook instance
role = get_execution_role()
bucket='ccprojectmlpart'
prefix = 'sagemaker/cc-ml-part' # place to upload training files within the bucket

In [152]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

In [153]:
#data = pd.read_csv('https://raw.githubusercontent.com/CCLocationSharing/Real-time-Location-Sharing-System/data/mldata/data.csv', header = None)
data = pd.read_csv('https://raw.githubusercontent.com/CCLocationSharing/Real-time-Location-Sharing-System/data/mldata/ccMlData.csv', header = None)
# specify columns extracted from wbdc.names

#data.columns = ["libID","Quiet","Monitor", "carpenter", "olin", "uris", "gates"] 
#data.columns = ["id","default","student", "balance", "income"] 
data.columns = ["result","varone", "vartwo", "varthree", "varfour"] 

# save the data
data.to_csv("data.csv", sep=',', index=False)

# print the shape of the data file
print(data.shape)

# show the top few rows
display(data.head())

# describe the data object
display(data.describe())

# we will also summarize the categorical field diganosis 
#display(data.libID.value_counts())
#display(data.result.value_counts())


(5, 5)


Unnamed: 0,result,varone,vartwo,varthree,varfour
0,NO,0.843686,0.501659,0.878892,0.078554
1,YES,0.47186,0.554475,0.313986,0.63536
2,YES,0.589976,0.031104,0.966718,0.690538
3,NO,0.703476,0.611699,0.324005,0.236264
4,YES,0.863,0.058626,0.634915,0.620406


Unnamed: 0,varone,vartwo,varthree,varfour
count,5.0,5.0,5.0,5.0
mean,0.6944,0.351513,0.623703,0.452224
std,0.166751,0.282789,0.303583,0.276082
min,0.47186,0.031104,0.313986,0.078554
25%,0.589976,0.058626,0.324005,0.236264
50%,0.703476,0.501659,0.634915,0.620406
75%,0.843686,0.554475,0.878892,0.63536
max,0.863,0.611699,0.966718,0.690538


In [154]:
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]

'''train_y = data_train.iloc[:,0].as_matrix();
train_X = data_train.iloc[:,1:].as_matrix();

val_y = data_val.iloc[:,0].as_matrix();
val_X = data_val.iloc[:,1:].as_matrix();

test_y = data_test.iloc[:,0].as_matrix();
test_X = data_test.iloc[:,1:].as_matrix();
'''
train_y = ((data_train.iloc[:,0] == 'YES') +0).as_matrix();
train_X = data_train.iloc[:,1:].as_matrix();

val_y = ((data_val.iloc[:,0] == 'YES') +0).as_matrix();
val_X = data_val.iloc[:,1:].as_matrix();

test_y = ((data_test.iloc[:,0] == 'YES') +0).as_matrix();
test_X = data_test.iloc[:,1:].as_matrix();


In [161]:
print(train_y)
print(train_X)

[0 1 1 0 1]
[[0.84368593 0.50165867 0.87889159 0.07855391]
 [0.47185961 0.55447456 0.31398609 0.63535956]
 [0.58997628 0.03110429 0.96671832 0.690538  ]
 [0.70347576 0.61169899 0.32400541 0.2362635 ]
 [0.86300033 0.05862632 0.63491452 0.62040644]]


In [156]:
#convert the datasets to the recordIO-wrapped protobuf format used by the Amazon SageMaker algorithms, and then upload this data to S3.
train_file = 'linear_train.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_file)).upload_fileobj(f)

In [157]:
#convert and upload the validation dataset.
validation_file = 'linear_validation.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', validation_file)).upload_fileobj(f)

In [158]:
# See 'Algorithms Provided by Amazon SageMaker: Common Parameters' in the SageMaker documentation for an explanation of these values.
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest'}

In [159]:
#Train

linear_job = 'cc-ml-' + time.strftime("%Y-%m-%d-%H-%M-%S", time.gmtime())



print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {
        "TrainingImage": containers[boto3.Session().region_name],
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "HyperParameters": {
        "feature_dim": "1",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "auto",
        "loss": "auto"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    }
}

Job name is: cc-ml-2018-05-01-17-58-27


In [160]:
%%time

region = boto3.Session().region_name
sm = boto3.client('sagemaker')

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=linear_job)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

InProgress


WaiterError: Waiter TrainingJobCompletedOrStopped failed: Waiter encountered a terminal failure state