# ML Model with xgboost

In [1]:
import os

import time
from time import gmtime, strftime

import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


## Import the preprocessed data

In [2]:
features = pd.read_csv('data/features_viewed.csv', index_col=0)
labels = pd.read_csv('data/labels_viewed.csv', index_col=0)

In [3]:
features.shape, labels.shape

((63288, 10), (63288, 1))

In [4]:
features

Unnamed: 0,age,income,email,mobile,social,web,F,M,O,U
0,0.180723,0.466667,1,1,1,1,0,1,0,0
1,0.438476,0.393389,1,1,1,1,0,0,0,1
2,0.072289,0.333333,1,1,1,1,1,0,0,0
3,0.445783,0.488889,1,1,1,1,1,0,0,0
4,0.433735,0.766667,1,1,1,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
63283,0.506024,0.455556,1,1,1,0,1,0,0,0
63284,0.438476,0.393389,1,1,1,0,0,0,0,1
63285,0.506024,0.588889,1,1,1,0,1,0,0,0
63286,0.421687,0.822222,1,1,1,0,0,1,0,0


## Create Training, Validation and Testdata

The loaded data is already preprocessed, there are no further data cleaning steps necessary. However, we do need to split the rows in the dataset up into train, test and validation sets.
To avoid overfitting I split the train data additional in validation data.

In [5]:
from sklearn.model_selection import train_test_split 

In [6]:
# We split the dataset into 2/3 training and 1/3 testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.33)

# Then we split the training set further into 2/3 training and 1/3 validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33)

In [8]:
# Define the data directory and make sure that the directory exists
data_dir = 'data/viewed'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

## Create csv files for test, validation and train data

In [9]:
# We use pandas to save our test, train and validation data to csv files. Note that we make sure not to include header
# information or an index as this is required by the built in algorithms provided by Amazon. Also, for the train and
# validation data, it is assumed that the first entry in each row is the target variable.

X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

## Import the sagemaker specific classes and functions
In addition to the modules above, we need to import the various bits of SageMaker that we will be using. 

In [10]:
import sagemaker
from sagemaker import get_execution_role

# This is an object that represents the SageMaker session that we are currently operating in. This
# object contains some useful information that we will need to access later such as our region.
session = sagemaker.Session()

# This is an object that represents the IAM role that we are currently assigned. When we construct
# and launch the training job later we will need to tell it what IAM role it should have. Since our
# use case is relatively simple we will simply assign the training job the role we currently have.
role = get_execution_role()

## Define a prefix for s3 data upload and upload the created files

In [11]:
prefix = 'capstone_viewed'

test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

## Create Sagemaker Estimator and Hyperparamaters

In [12]:
# TODO: Create a SageMaker estimator using the container location determined in the previous cell.
#       It is recommended that you use a single training instance of type ml.m4.xlarge. It is also
#       recommended that you use 's3://{}/{}/output'.format(session.default_bucket(), prefix) as the
#       output path.

container = sagemaker.image_uris.retrieve('xgboost', session.boto_region_name, 'latest')

xgb = sagemaker.estimator.Estimator(container,
                                    role=role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)


# TODO: Set the XGBoost hyperparameters in the xgb object. Don't forget that in this case we have a binary
#       label so we should be using the 'binary:logistic' objective.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='binary:logistic',
                        early_stopping_rounds=10, 
                        num_round=200)



In [13]:
#s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
#s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [14]:
%%time
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2021-10-05 06:17:06 Starting - Starting the training job...
2021-10-05 06:17:30 Starting - Launching requested ML instancesProfilerReport-1633414626: InProgress
...
2021-10-05 06:17:55 Starting - Preparing the instances for training.........
2021-10-05 06:19:31 Downloading - Downloading input data...
2021-10-05 06:19:50 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-10-05:06:20:09:INFO] Running standalone xgboost training.[0m
[34m[2021-10-05:06:20:09:INFO] File size need to be processed in the node: 2.25mb. Available memory size in the node: 8389.34mb[0m
[34m[2021-10-05:06:20:09:INFO] Determined delimiter of CSV input is ','[0m
[34m[06:20:09] S3DistributionType set as FullyReplicated[0m
[34m[06:20:10] 28409x10 matrix with 284090 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-10-05:06:20:10:INFO] Determined delimiter of CSV input is ','[0m
[34m[06:20:10] S3DistributionType set as FullyReplicate

In [15]:
# TODO: Create a transformer object from the trained model. Using an instance count of 1 and an instance type of ml.m4.xlarge
#       should be more than enough.
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [16]:
%%time
# TODO: Start the transform job. Make sure to specify the content type and the split type of the test data.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

..............................
[34mArguments: serve[0m
[34m[2021-10-05 06:26:00 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-10-05 06:26:00 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-10-05 06:26:00 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-10-05 06:26:00 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-10-05 06:26:00 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-10-05 06:26:00 +0000] [23] [INFO] Booting worker with pid: 23[0m
[35mArguments: serve[0m
[35m[2021-10-05 06:26:00 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2021-10-05 06:26:00 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2021-10-05 06:26:00 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2021-10-05 06:26:00 +0000] [21] [INFO] Booting worker with pid: 21[0m
[35m[2021-10-05 06:26:00 +0000] [22] [INFO] Booting worker with pid: 22[0m
[35m[2021-10-05 06:26:00 +0000] [23] [INFO] Booting worker with pid: 23[0m

In [17]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-eu-central-1-647915836300/xgboost-2021-10-05-06-21-11-050/test.csv.out to data/viewed/test.csv.out


In [18]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)


In [19]:
predictions = [round(num) for num in Y_pred.squeeze().values]
#predictions

In [20]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.818921765776118

## Train the model with a hyperparameter tuning

In [21]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:rmse', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 20, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

## Fit the Hyperparamereter Tuner

In [22]:
%%time
# This is a wrapper around the location of our train and validation data, to make sure that SageMaker
# knows our data is in csv format.
#s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
#s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

...........................................................................................................................................................................................................................................................................................................................................!
CPU times: user 1.67 s, sys: 153 ms, total: 1.82 s
Wall time: 27min 52s


In [23]:
best_training_job = xgb_hyperparameter_tuner.best_training_job()
best_training_job, type(best_training_job)

('xgboost-211005-0627-011-361d9088', str)

In [24]:
%%time
xgb_attached = sagemaker.estimator.Estimator.attach(best_training_job)


2021-10-05 06:43:05 Starting - Preparing the instances for training
2021-10-05 06:43:05 Downloading - Downloading input data
2021-10-05 06:43:05 Training - Training image download completed. Training in progress.
2021-10-05 06:43:05 Uploading - Uploading generated training model
2021-10-05 06:43:05 Completed - Training job completed
CPU times: user 113 ms, sys: 5.12 ms, total: 118 ms
Wall time: 525 ms


In [26]:
%%time
xgb_tuned_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

CPU times: user 18.3 ms, sys: 0 ns, total: 18.3 ms
Wall time: 431 ms


In [27]:
%%time
xgb_tuned_transformer.transform(test_location, content_type='text/csv', split_type='Line')

............................[34mArguments: serve[0m
[35mArguments: serve[0m
[34m[2021-10-05 06:59:54 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-10-05 06:59:54 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-10-05 06:59:54 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-10-05 06:59:54 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2021-10-05 06:59:54 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-10-05 06:59:54 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-10-05 06:59:54 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2021-10-05:06:59:54:INFO] Model loaded successfully for worker : 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-10-05:06:59:54:INFO] Model loaded successfully for worker : 21[0m
[35m[2021-10-05 06:59:54 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2021-10-05 06:59:54 +0000] [1] [INFO] Listening at: http://0.0.0.0:80

In [28]:
!aws s3 cp --recursive $xgb_tuned_transformer.output_path $data_dir

download: s3://sagemaker-eu-central-1-647915836300/xgboost-2021-10-05-06-55-24-630/test.csv.out to data/viewed/test.csv.out


In [29]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

In [30]:
predictions = [round(num) for num in Y_pred.squeeze().values]
#predictions

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.8171981231446902

In [33]:
test_labels = Y_test.values.flatten()
test_preds = np.array(predictions)
# calculate true positives, false positives, true negatives, false negatives
tp = np.logical_and(test_labels, test_preds).sum()
fp = np.logical_and(1-test_labels, test_preds).sum()
tn = np.logical_and(1-test_labels, 1-test_preds).sum()
fn = np.logical_and(test_labels, 1-test_preds).sum()

# calculate binary classification metrics
recall = tp / (tp + fn)
precision = tp / (tp + fp)
accuracy = (tp + tn) / (tp + fp + tn + fn)

In [34]:
print(pd.crosstab(test_labels, test_preds, rownames=['actual (row)'], colnames=['prediction (col)']))
print("\n{:<11} {:.3f}".format('Recall:', recall))
print("{:<11} {:.3f}".format('Precision:', precision))
print("{:<11} {:.3f}".format('Accuracy:', accuracy))
print()



prediction (col)   0.0    1.0
actual (row)                 
0                 2692   2321
1                 1497  14376

Recall:     0.906
Precision:  0.861
Accuracy:   0.817

