##  Steps:

1. Importing necessary Libraries
2. Creating S3 bucket
3. Mapping train And Test Data in S3
4. Split data into train, val, test; Use sagemaker inbuilt Xgboost algo to fit the data and validate on val set
5. Map path of the models in S3
6. Predict on test set and create confusion matrix 

In [1]:
import sagemaker
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri 
from sagemaker.session import s3_input, Session

In [2]:
my_region = boto3.session.Session().region_name # set the region of the instance
print(my_region)

us-east-1


In [8]:
# create s3 bucket

bucket_name = 'eramosbankapplication' 
s3 = boto3.resource('s3')

try:
    if  my_region == 'us-east-1':
        s3.create_bucket(Bucket=bucket_name)
    print('S3 bucket created successfully')
except Exception as e:
    print('S3 error: ',e)

S3 bucket created successfully


In [9]:
# set an output path where the trained model will be saved

prefix = 'xgboost-as-a-built-in-algo'
output_path ='s3://{}/{}/output'.format(bucket_name, prefix)
print(output_path)

s3://eramosbankapplication/xgboost-as-a-built-in-algo/output


In [10]:
# download dataset

import pandas as pd
import urllib
try:
    urllib.request.urlretrieve ("https://d1.awsstatic.com/tmt/build-train-deploy-machine-learning-model-sagemaker/bank_clean.27f01fbbdf43271788427f3682996ae29ceca05d.csv", "bank_clean.csv")
    print('Success: downloaded bank_clean.csv.')
    
except Exception as e:
    print('Data load error: ',e)

try:
    model_data = pd.read_csv('./bank_clean.csv',index_col=0)
    print('Success: Data loaded into dataframe.')
    
except Exception as e:
    print('Data load error: ',e)

Success: downloaded bank_clean.csv.
Success: Data loaded into dataframe.


In [12]:
model_data

# last two columns show the outcome - whether the consumer bought the application
# since it is in a one-hot encoded format, we can drop one and keep the other --- i.e. keep y_yes (consumer bought app)

Unnamed: 0,age,campaign,pdays,previous,no_previous_contact,not_working,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,day_of_week_fri,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_failure,poutcome_nonexistent,poutcome_success,y_no,y_yes
0,56,1,999,0,1,0,0,0,0,1,...,0,1,0,0,0,0,1,0,1,0
1,57,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,37,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
3,40,1,999,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
4,56,1,999,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,1,999,0,1,1,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
41184,46,1,999,0,1,0,0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
41185,56,2,999,0,1,1,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
41186,44,1,999,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1


In [25]:
# split downloaded data into train, val, test split
# not splitting it in typical x_train, y_train, x_test, y_test because we instead want to save the train and test data back to ..
# .. s3 first then use each of those files separately 


import numpy as np
trainval_data, test_data = np.split(model_data.sample(frac=1, random_state=1729), [int(0.8 * len(model_data))])
train_data, val_data = np.split(trainval_data.sample(frac=1, random_state=1729), [int(0.8 * len(trainval_data))])

print(train_data.shape, val_data.shape, test_data.shape)

(26360, 61) (6590, 61) (8238, 61)


In [26]:
### Saving Train Into Indicated Bucket

# keep only one of the last two cols which show the outcome of interest

import os

pd.concat([train_data['y_yes'], train_data.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('train.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'train/train.csv')).upload_file('train.csv')
s3_input_train = sagemaker.TrainingInput(s3_data='s3://{}/{}/train'.format(bucket_name, prefix), content_type='csv')

In [27]:
### Saving Train Into Indicated Bucket

pd.concat([val_data['y_yes'], val_data.drop(['y_no', 'y_yes'], 
                                                axis=1)], 
                                                axis=1).to_csv('val.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'val/val.csv')).upload_file('val.csv')
s3_input_val = sagemaker.TrainingInput(s3_data='s3://{}/{}/val'.format(bucket_name, prefix), content_type='csv')

In [28]:
# Test Data Into Indicated Bucket

pd.concat([test_data['y_yes'], test_data.drop(['y_no', 'y_yes'], axis=1)], axis=1).to_csv('test.csv', index=False, header=False)
boto3.Session().resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test/test.csv')).upload_file('test.csv')
s3_input_test = sagemaker.TrainingInput(s3_data='s3://{}/{}/test'.format(bucket_name, prefix), content_type='csv')

# Let's use the inbuilt Xgboost algo provided by Sagemaker

### these inbuilt algos are containers which need to be pulled in order to be used
### Xgboost can also be run as a framework

In [18]:
# this line automatically looks for the XGBoost image URI and builds an XGBoost container.
# specify the repo_version depending on your preference.
container = get_image_uri(boto3.Session().region_name,
                          'xgboost', 
                          repo_version='1.0-1')

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [20]:
# set hyperparams
# these were detemrined locally
# we do not want to do the tuning on sagemaker due to increased costs

hyperparameters = {
        "max_depth":"5",
        "eta":"0.2",
        "gamma":"4",
        "min_child_weight":"6",
        "subsample":"0.7",
        "objective":"binary:logistic",
        "num_round":50
        }

In [22]:
# construct a SageMaker estimator that calls the xgboost-container
estimator = sagemaker.estimator.Estimator(image_uri=container, 
                                          hyperparameters=hyperparameters,
                                          role=sagemaker.get_execution_role(),
                                          train_instance_count=1, 
                                          train_instance_type='ml.m5.2xlarge', 
                                          train_volume_size=5, # 5 GB 
                                          output_path=output_path,
                                          train_use_spot_instances=True,
                                          train_max_run=300,
                                          train_max_wait=600)

# Billing is based on training duration
# last 3 params help to limit costs with training the model

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_use_spot_instances has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_wait has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [None]:
estimator.fit({})

In [29]:
# fit the train data on the model and validate on val set 
estimator.fit({'train': s3_input_train,'validation': s3_input_val})

# input is taken from S3 bucket

# note that train error and val error is decreasing with every iteration

2021-03-24 04:18:19 Starting - Starting the training job...
2021-03-24 04:18:42 Starting - Launching requested ML instancesProfilerReport-1616559498: InProgress
......
2021-03-24 04:19:42 Starting - Preparing the instances for training...
2021-03-24 04:20:11 Downloading - Downloading input data...
2021-03-24 04:20:42 Training - Downloading the training image...
2021-03-24 04:21:20 Uploading - Uploading generated training model.[34mINFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training[0m
[34mINFO:sagemaker-containers:Failed to parse hyperparameter objective value binary:logistic to Json.[0m
[34mReturning the value itself[0m
[34mINFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)[0m
[34mINFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of CSV input is ','[0m
[34mINFO:root:Determined delimiter of

## Let's deploy the model

In [31]:
xgb_predictor = estimator.deploy(initial_instance_count=1,instance_type='ml.m4.xlarge')

---------------!

## Predict on the test data and output confusion matrix

In [36]:
# serializer is essential for making predictions

from sagemaker.predictor import csv_serializer


test_data_array = test_data.drop(['y_no', 'y_yes'], axis=1).values #drop outcomes and load the data into an array
# xgb_predictor.content_type = 'text/csv' # set the data type for an inference
xgb_predictor.serializer = csv_serializer # set the serializer type since we are working with a csv file


predictions = xgb_predictor.predict(test_data_array).decode('utf-8') # predict! Decoding is required to interpret serialzed findings
predictions_array = np.fromstring(predictions[1:], sep=',') # and turn the prediction into an array


print(predictions_array.shape)

The csv_serializer has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


(8238,)


In [37]:
predictions_array

array([0.20057243, 0.04776593, 0.01932938, ..., 0.0384167 , 0.04332333,
       0.04149   ])

In [39]:
# from AWS documentation 

cm = pd.crosstab(index=test_data['y_yes'], columns=np.round(predictions_array), rownames=['Observed'], colnames=['Predicted'])
tn = cm.iloc[0,0]; fn = cm.iloc[1,0]; tp = cm.iloc[1,1]; fp = cm.iloc[0,1]; p = (tp+tn)/(tp+tn+fp+fn)*100
print("\n{0:<20}{1:<4.1f}%\n".format("Overall Classification Rate: ", p))
print("{0:<15}{1:<15}{2:>8}".format("Predicted", "No Purchase", "Purchase"))
print("Observed")
print("{0:<15}{1:<2.0f}% ({2:<}){3:>6.0f}% ({4:<})".format("No Purchase", tn/(tn+fn)*100,tn, fp/(tp+fp)*100, fp))
print("{0:<16}{1:<1.0f}% ({2:<}){3:>7.0f}% ({4:<}) \n".format("Purchase", fn/(tn+fn)*100,fn, tp/(tp+fp)*100, tp))


Overall Classification Rate: 89.4%

Predicted      No Purchase    Purchase
Observed
No Purchase    90% (7182)    33% (88)
Purchase        10% (787)     67% (181) 



### TP = Predicted purchase /  purchase   ; TN = Predicted no purchase /  no purchase
### FP = Predicted purchae / no purchase ; FN = Predicted no purchase / purchase 



### Precision is: TP / TP + FP ----> 181 / (181 + 88) =  67%
### Recall is: TP / TP + FN ---->  181 / (181  + 787) = 19%

## Deleting the endpoints (model and s3 bucket) to prevent additional charges

In [40]:
sagemaker.Session().delete_endpoint(xgb_predictor.endpoint)
bucket_to_delete = boto3.resource('s3').Bucket(bucket_name)
bucket_to_delete.objects.all().delete()

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


[{'ResponseMetadata': {'RequestId': 'PT6C2N95J33A037S',
   'HostId': 'qs4LYpuNJ5m/QOSohjb2zYXqUSwuTPZFu/W7wGsnMZmd3yv88kSr0PyI5sX5lsRD8gu22SIvSxs=',
   'HTTPStatusCode': 200,
   'HTTPHeaders': {'x-amz-id-2': 'qs4LYpuNJ5m/QOSohjb2zYXqUSwuTPZFu/W7wGsnMZmd3yv88kSr0PyI5sX5lsRD8gu22SIvSxs=',
    'x-amz-request-id': 'PT6C2N95J33A037S',
    'date': 'Wed, 24 Mar 2021 04:47:23 GMT',
    'content-type': 'application/xml',
    'transfer-encoding': 'chunked',
    'server': 'AmazonS3',
    'connection': 'close'},
   'RetryAttempts': 0},
  'Deleted': [{'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2021-03-24-04-18-18-842/profiler-output/system/incremental/2021032404/1616559600.algo-1.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2021-03-24-04-18-18-842/rule-output/ProfilerReport-1616559498/profiler-output/profiler-reports/StepOutlier.json'},
   {'Key': 'xgboost-as-a-built-in-algo/output/sagemaker-xgboost-2021-03-24-04-18-18-842/rule-output/ProfilerReport-16165