In [1]:
import os
import boto3
import re
from sagemaker import get_execution_role
#retrives the IAM role created at the time of creating the notebook instance
role = get_execution_role()
bucket='ccprojectmlpart3'
prefix = 'sagemaker/cc-ml-part' # place to upload training files within the bucket

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac

In [3]:
#Let's download the data and save it in the local folder with the name data.csv and take a look at it.
data = pd.read_csv('https://raw.githubusercontent.com/CCLocationSharing/Real-time-Location-Sharing-System/data_new/mldata/law/mlDataLaw.csv', header = None)
# specify columns extracted from wbdc.names

data.columns = ["LawLabel", "major", "distance", "history_precent", "curr_ava"] 

# save the data
data.to_csv("data.csv", sep=',', index=False)

# print the shape of the data file
print(data.shape)

# show the top few rows
display(data.head())

# describe the data object
display(data.describe())

# we will also summarize the categorical field diganosis 
display(data.CarLabel.value_counts())
#display(data.result.value_counts())


(5000, 5)


Unnamed: 0,LawLabel,major,distance,history_precent,curr_ava
0,N,Chemical Engineering,1.256598,0.147288,0.876576
1,Y,Development Sociology,0.221862,0.656703,0.154757
2,Y,Development Sociology,0.216811,0.509221,0.300954
3,Y,Human Development,0.726466,0.687809,0.287159
4,Y,Human Development,0.238141,0.924249,0.293344


Unnamed: 0,distance,history_precent,curr_ava
count,5000.0,5000.0,5000.0
mean,0.498807,0.496979,0.417647
std,0.445355,0.289736,0.276974
min,0.0,0.000463,6.3e-05
25%,0.104088,0.244414,0.165642
50%,0.437516,0.496272,0.395542
75%,0.773588,0.745475,0.643633
max,1.999561,0.999898,0.999911


AttributeError: 'DataFrame' object has no attribute 'CarLabel'

In [4]:
#information about majors
arts_majors = {'Africana Studies', 'American Studies',
'Anthropology', 'Applied Economics and Management', 'Classics', 'Communication',
'Development Sociology', 'Economics', 'Feminist', 'Fine Arts', 'French', 'History', 'Human Development', 'Linguistics', 'Music',
'Philosophy', 'Religious Studies', 'Sociology', 'Urban and Regional Studies'}
science_majors = {'Animal Science', 'Biological Sciences', 'Environmental and Sustainability Sciences',
'Food Science', 'Mathematics', 'Nutritional Sciences', 'Statistical Science', 'Science and Technology Studies'}
business_majors = {'Accounting', 'Policy Analysis and Management', 'Hotel Administration'}
engineering_majors = {'Biological Engineering', 'Biomedical Engineering', 'Chemical Engineering',
'Computer Science', 'Environmental Engineering', 'Independent Major—Engineering', 'Operations Research and Engineering',
'Mechanical Engineering', 'Electrical and Computer Engineering'}

In [5]:
#change majors into labels
def changeMajors(input_data):
    length = len(input_data)
    for i in range(0, length):
        temp = input_data[i, 0]
        if(temp in arts_majors):
            input_data[i, 0] = 1
        elif(temp in science_majors):
            input_data[i, 0] = 2
        elif(temp in business_majors):
            input_data[i, 0] = 3
        elif(temp in engineering_majors):
            input_data[i, 0] = 4
        else:
            input_data[i, 0] = 5
    return input_data

In [6]:
#Split the data into 80% training, 10% validation and 10% testing.
rand_split = np.random.rand(len(data))
train_list = rand_split < 0.8
val_list = (rand_split >= 0.8) & (rand_split < 0.9)
test_list = rand_split >= 0.9

data_train = data[train_list]
data_val = data[val_list]
data_test = data[test_list]

train_y = ((data_train.iloc[:,0]=='Y')+0).as_matrix();
train_X = (data_train.iloc[:,1:]).as_matrix();
train_X = changeMajors(train_X)

val_y = ((data_val.iloc[:,0]=="Y")+0).as_matrix();
val_X = data_val.iloc[:,1:].as_matrix();
val_X = changeMajors(val_X)

test_y = ((data_test.iloc[:,0]=="Y")+0).as_matrix();
test_X = data_test.iloc[:,1:].as_matrix();
test_X = changeMajors(test_X)

In [7]:
#convert the datasets to the recordIO-wrapped protobuf format used by the Amazon SageMaker algorithms, and then upload this data to S3.
train_file = 'linear_train.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', train_file)).upload_fileobj(f)

In [8]:
#convert and upload the validation dataset.
validation_file = 'linear_validation.data'

f = io.BytesIO()
smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))
f.seek(0)

boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', validation_file)).upload_fileobj(f)

In [9]:
# See 'Algorithms Provided by Amazon SageMaker: Common Parameters' in the SageMaker documentation for an explanation of these values.
containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/linear-learner:latest',
              'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:latest',
              'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/linear-learner:latest',
              'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/linear-learner:latest'}

In [10]:
#Train

linear_job = 'cc-ml-Law1'



print("Job name is:", linear_job)

linear_training_params = {
    "RoleArn": role,
    "TrainingJobName": linear_job,
    "AlgorithmSpecification": {
        "TrainingImage": containers[boto3.Session().region_name],
        "TrainingInputMode": "File"
    },
    "ResourceConfig": {
        "InstanceCount": 1,
        "InstanceType": "ml.c4.2xlarge",
        "VolumeSizeInGB": 10
    },
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/train/".format(bucket, prefix),
                    "S3DataDistributionType": "ShardedByS3Key"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": "s3://{}/{}/validation/".format(bucket, prefix),
                    "S3DataDistributionType": "FullyReplicated"
                }
            },
            "CompressionType": "None",
            "RecordWrapperType": "None"
        }

    ],
    "OutputDataConfig": {
        "S3OutputPath": "s3://{}/{}/".format(bucket, prefix)
    },
    "HyperParameters": {
        "feature_dim": "4",
        "mini_batch_size": "100",
        "predictor_type": "regressor",
        "epochs": "10",
        "num_models": "auto",
        "loss": "auto"
    },
    "StoppingCondition": {
        "MaxRuntimeInSeconds": 60 * 60
    }
}

Job name is: cc-ml-Law1


In [11]:
%%time
#Now let's kick off our training job in SageMaker's distributed, managed training, using the parameters we just created.
region = boto3.Session().region_name
sm = boto3.client('sagemaker')

sm.create_training_job(**linear_training_params)

status = sm.describe_training_job(TrainingJobName=linear_job)['TrainingJobStatus']
print(status)
sm.get_waiter('training_job_completed_or_stopped').wait(TrainingJobName=linear_job)
if status == 'Failed':
    message = sm.describe_training_job(TrainingJobName=linear_job)['FailureReason']
    print('Training failed with the following error: {}'.format(message))
    raise Exception('Training job failed')

InProgress
CPU times: user 168 ms, sys: 0 ns, total: 168 ms
Wall time: 4min


In [12]:
#Now that we've trained the linear algorithm on our data, let's setup a model which can later be hosted.
linear_hosting_container = {
    'Image': containers[boto3.Session().region_name],
    'ModelDataUrl': sm.describe_training_job(TrainingJobName=linear_job)['ModelArtifacts']['S3ModelArtifacts']
}

create_model_response = sm.create_model(
    ModelName=linear_job,
    ExecutionRoleArn=role,
    PrimaryContainer=linear_hosting_container)

print(create_model_response['ModelArn'])

arn:aws:sagemaker:us-west-2:908265993272:model/cc-ml-law1


In [13]:
#Once we've setup a model, we can configure what our hosting endpoints should be.
linear_endpoint_config = 'ccml-Law-linear-endpoint-config'
print(linear_endpoint_config)
create_endpoint_config_response = sm.create_endpoint_config(
    EndpointConfigName=linear_endpoint_config,
    ProductionVariants=[{
        'InstanceType': 'ml.m4.xlarge',
        'InitialInstanceCount': 1,
        'ModelName': linear_job,
        'VariantName': 'AllTraffic'}])

print("Endpoint Config Arn: " + create_endpoint_config_response['EndpointConfigArn'])

ccml-Law-linear-endpoint-config
Endpoint Config Arn: arn:aws:sagemaker:us-west-2:908265993272:endpoint-config/ccml-law-linear-endpoint-config


In [14]:
%%time
#Now that we've specified how our endpoint should be configured, we can create them. 
linear_endpoint = 'ccml-Law-linear-endpoint'
print(linear_endpoint)
create_endpoint_response = sm.create_endpoint(
    EndpointName=linear_endpoint,
    EndpointConfigName=linear_endpoint_config)
print(create_endpoint_response['EndpointArn'])

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Status: " + status)

sm.get_waiter('endpoint_in_service').wait(EndpointName=linear_endpoint)

resp = sm.describe_endpoint(EndpointName=linear_endpoint)
status = resp['EndpointStatus']
print("Arn: " + resp['EndpointArn'])
print("Status: " + status)

if status != 'InService':
    raise Exception('Endpoint creation did not succeed')

ccml-Law-linear-endpoint
arn:aws:sagemaker:us-west-2:908265993272:endpoint/ccml-law-linear-endpoint
Status: Creating
Arn: arn:aws:sagemaker:us-west-2:908265993272:endpoint/ccml-law-linear-endpoint
Status: InService
CPU times: user 56 ms, sys: 8 ms, total: 64 ms
Wall time: 5min 31s


In [15]:
#Now that we have our hosted endpoint, we can generate statistical predictions from it. Let's predict on our test dataset to understand how accurate our model is.
def np2csv(arr):
    csv = io.BytesIO()
    np.savetxt(csv, arr, delimiter=',', fmt='%g')
    return csv.getvalue().decode().rstrip()

In [16]:
#Next, we'll invoke the endpoint to get predictions.
runtime= boto3.client('runtime.sagemaker')

payload = np2csv(test_X)
response = runtime.invoke_endpoint(EndpointName=linear_endpoint,
                                   ContentType='text/csv',
                                   Body=payload)
result = json.loads(response['Body'].read().decode())
test_pred = np.array([r['score'] for r in result['predictions']])

In [17]:
#Let's compare linear learner based mean absolute prediction errors from a baseline prediction which uses majority class to predict every instance.
test_mae_linear = np.mean(np.abs(test_y - test_pred))
test_mae_baseline = np.mean(np.abs(test_y - np.median(train_y))) ## training median as baseline predictor

print("Test MAE Baseline :", round(test_mae_baseline, 3))
print("Test MAE Linear:", round(test_mae_linear,3))



Test MAE Baseline : 0.515
Test MAE Linear: 0.205


In [18]:
#Let's compare predictive accuracy using a classification threshold of 0.5 for the predicted and compare against the majority class prediction from training data set
test_pred_class = (test_pred > 0.5)+0;
test_pred_baseline = np.repeat(np.median(train_y), len(test_y))

prediction_accuracy = np.mean((test_y == test_pred_class))*100
baseline_accuracy = np.mean((test_y == test_pred_baseline))*100

print("Prediction Accuracy:", round(prediction_accuracy,1), "%")
print("Baseline Accuracy:", round(baseline_accuracy,1), "%")

Prediction Accuracy: 99.0 %
Baseline Accuracy: 48.5 %
