# ML Model with xgboost

In [1]:
import os

import time
from time import gmtime, strftime

import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


## Import the preprocessed data

In [5]:
features = pd.read_csv('data/single_features.csv', index_col=0)
targets = pd.read_csv('data/single_targets.csv', index_col=0)

In [6]:
features.shape, targets.shape

((6330, 15), (6330, 1))

In [7]:
features

Unnamed: 0,age,income,member_since_days,M,F,O,U,reward,difficulty,duration,bogo,email,mobile,social,web
0,0.500000,0.666667,0.479956,0,1,0,0,10,10,120,1,1,1,1,1
1,0.487805,0.988889,0.126853,0,1,0,0,10,10,120,1,1,1,1,1
2,0.646341,0.122222,0.151565,1,0,0,0,10,10,120,1,1,1,1,1
3,0.073171,0.133333,0.175178,1,0,0,0,10,10,120,1,1,1,1,1
4,0.536585,0.566667,0.422295,0,1,0,0,10,10,120,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6325,0.443823,0.393389,0.012081,0,0,0,1,10,10,120,1,1,1,1,1
6326,0.597561,0.555556,0.222954,1,0,0,0,10,10,120,1,1,1,1,1
6327,0.500000,0.700000,0.180670,0,1,0,0,10,10,120,1,1,1,1,1
6328,0.646341,0.477778,0.147172,1,0,0,0,10,10,120,1,1,1,1,1


## Create Training, Validation and Testdata
To avoid overfitting I split the train data additional in validation data.

In [8]:
from sklearn.model_selection import train_test_split 

In [9]:
# We split the dataset into 2/3 training and 1/3 testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(features, targets, test_size=0.33)

# Then we split the training set further into 2/3 training and 1/3 validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33)

In [10]:
# Define the data directory and make sure that the directory exists
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

## Create csv files for test, validation and train data

In [14]:
# We use pandas to save our test, train and validation data to csv files. Note that we make sure not to include header
# information or an index as this is required by the built in algorithms provided by Amazon. Also, for the train and
# validation data, it is assumed that the first entry in each row is the target variable.

X_test.to_csv(os.path.join(data_dir, 'single_test.csv'), header=False, index=False)

pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'single_validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'single_train.csv'), header=False, index=False)

In [15]:
X_test.shape

(2089, 15)

## Import the sagemaker specific classes and functions

In [16]:
import sagemaker
from sagemaker import get_execution_role
#from sagemaker.amazon.amazon_estimator import get_image_uri

# This is an object that represents the SageMaker session that we are currently operating in. This
# object contains some useful information that we will need to access later such as our region.
session = sagemaker.Session()

# This is an object that represents the IAM role that we are currently assigned. When we construct
# and launch the training job later we will need to tell it what IAM role it should have. Since our
# use case is relatively simple we will simply assign the training job the role we currently have.
role = get_execution_role()

## Define a prefix for s3 data upload and upload the createrd files

In [17]:
prefix = 'capstone_binary'

test_location = session.upload_data(os.path.join(data_dir, 'single_test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'single_validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'single_train.csv'), key_prefix=prefix)

## Create Sagemaker Estimator and Hyperparamaters

In [18]:
# TODO: Create a SageMaker estimator using the container location determined in the previous cell.
#       It is recommended that you use a single training instance of type ml.m4.xlarge. It is also
#       recommended that you use 's3://{}/{}/output'.format(session.default_bucket(), prefix) as the
#       output path.

container = sagemaker.image_uris.retrieve('xgboost', session.boto_region_name, 'latest')

xgb = sagemaker.estimator.Estimator(container,
                                    role=role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)


# TODO: Set the XGBoost hyperparameters in the xgb object. Don't forget that in this case we have a binary
#       label so we should be using the 'binary:logistic' objective.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='binary:logistic',
                        early_stopping_rounds=10, 
                        num_round=200)



In [19]:
#s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
#s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [20]:
%%time
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2021-10-01 14:51:39 Starting - Starting the training job...
2021-10-01 14:52:03 Starting - Launching requested ML instancesProfilerReport-1633099899: InProgress
......
2021-10-01 14:53:03 Starting - Preparing the instances for training.........
2021-10-01 14:54:29 Downloading - Downloading input data
2021-10-01 14:54:29 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-10-01:14:54:51:INFO] Running standalone xgboost training.[0m
[34m[2021-10-01:14:54:51:INFO] File size need to be processed in the node: 0.36mb. Available memory size in the node: 8397.95mb[0m
[34m[2021-10-01:14:54:51:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:54:51] S3DistributionType set as FullyReplicated[0m
[34m[14:54:51] 2841x15 matrix with 42615 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-10-01:14:54:51:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:54:51] S3DistributionType set as FullyReplicated

In [21]:
# TODO: Create a transformer object from the trained model. Using an instance count of 1 and an instance type of ml.m4.xlarge
#       should be more than enough.
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [22]:
%%time
# TODO: Start the transform job. Make sure to specify the content type and the split type of the test data.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

.............................[34mArguments: serve[0m
[34m[2021-10-01 15:00:02 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-10-01 15:00:02 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-10-01 15:00:02 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-10-01 15:00:02 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-10-01 15:00:02 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-10-01 15:00:02 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-10-01:15:00:02:INFO] Model loaded successfully for worker : 22[0m
[34m[2021-10-01:15:00:02:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-10-01:15:00:02:INFO] Model loaded successfully for worker : 23[0m
[34m[2021-10-01 15:00:02 +0000] [24] [INFO] Booting worker with pid: 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-10-0

In [23]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-eu-central-1-647915836300/xgboost-2021-10-01-14-55-22-039/single_test.csv.out to data/single_test.csv.out


In [24]:
Y_test

Unnamed: 0,binary_target
185,1.0
4515,0.0
4706,1.0
5612,1.0
1181,1.0
...,...
2343,1.0
483,1.0
3315,0.0
3889,1.0


In [25]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'single_test.csv.out'), header=None)


In [26]:
predictions = [round(num) for num in Y_pred.squeeze().values]


In [27]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.7630445189085687

## Train the model with a hyperparameter tuning

In [120]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

xgb_hyperparameter_tuner = HyperparameterTuner(estimator = xgb, # The estimator object to use as the basis for the training jobs.
                                               objective_metric_name = 'validation:rmse', # The metric used to compare trained models.
                                               objective_type = 'Minimize', # Whether we wish to minimize or maximize the metric.
                                               max_jobs = 20, # The total number of models to train
                                               max_parallel_jobs = 3, # The number of models to train in parallel
                                               hyperparameter_ranges = {
                                                    'max_depth': IntegerParameter(3, 12),
                                                    'eta'      : ContinuousParameter(0.05, 0.5),
                                                    'min_child_weight': IntegerParameter(2, 8),
                                                    'subsample': ContinuousParameter(0.5, 0.9),
                                                    'gamma': ContinuousParameter(0, 10),
                                               })

## Fit the Hyperparamereter Tuner

In [121]:
%%time
# This is a wrapper around the location of our train and validation data, to make sure that SageMaker
# knows our data is in csv format.
#s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
#s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

xgb_hyperparameter_tuner.fit({'train': s3_input_train, 'validation': s3_input_validation})

....................................................................................................................................................................................................................................................................................................................................................!
CPU times: user 1.7 s, sys: 73.5 ms, total: 1.77 s
Wall time: 28min 38s


In [122]:
best_training_job = xgb_hyperparameter_tuner.best_training_job()
best_training_job, type(best_training_job)

('xgboost-211001-1013-020-56d7fd91', str)

In [123]:
%%time
xgb_attached = sagemaker.estimator.Estimator.attach(best_training_job)


2021-10-01 10:41:18 Starting - Preparing the instances for training
2021-10-01 10:41:18 Downloading - Downloading input data
2021-10-01 10:41:18 Training - Training image download completed. Training in progress.
2021-10-01 10:41:18 Uploading - Uploading generated training model
2021-10-01 10:41:18 Completed - Training job completed
CPU times: user 83.6 ms, sys: 16.1 ms, total: 99.7 ms
Wall time: 207 ms


In [124]:
%%time
xgb_tuned_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

CPU times: user 8.4 ms, sys: 3 µs, total: 8.4 ms
Wall time: 399 ms


In [125]:
%%time
xgb_tuned_transformer.transform(test_location, content_type='text/csv', split_type='Line')

...............................[34mArguments: serve[0m
[34m[2021-10-01 10:48:10 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-10-01 10:48:10 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-10-01 10:48:10 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-10-01 10:48:10 +0000] [20] [INFO] Booting worker with pid: 20[0m
  monkey.patch_all(subprocess=True)[0m
[35mArguments: serve[0m
[35m[2021-10-01 10:48:10 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[35m[2021-10-01 10:48:10 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[35m[2021-10-01 10:48:10 +0000] [1] [INFO] Using worker: gevent[0m
[35m[2021-10-01 10:48:10 +0000] [20] [INFO] Booting worker with pid: 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-10-01 10:48:10 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-10-01:10:48:10:INFO] Model loaded successfully for worker : 20[0m
[34m[2021-10-01 10:48:10 +0000] [22] [INFO] Booting worker with pid: 22

In [126]:
!aws s3 cp --recursive $xgb_tuned_transformer.output_path $data_dir

download: s3://sagemaker-eu-central-1-647915836300/xgboost-2021-10-01-10-43-12-747/regression_test.csv.out to data/regression_test.csv.out


In [127]:
Y_pred = pd.read_csv(os.path.join(data_dir, 'regression_test.csv.out'), header=None)

In [128]:
predictions = [round(num) for num in Y_pred.squeeze().values]
#predictions

In [129]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.7300584123336206

In [130]:
# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(predictor, test_features, test_labels, verbose=True):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint
    :param test_features: Test features
    :param test_labels: Class labels for test data
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics.
    """
    
    # We have a lot of test data, so we'll split it into batches of 100
    # split the test data set into batches and evaluate using prediction endpoint    
    prediction_batches = [predictor.predict(batch) for batch in np.array_split(test_features, 100)]
    
    # LinearLearner produces a `predicted_label` for each data point in a batch
    # get the 'predicted_label' for every point in a batch
    test_preds = np.concatenate([np.array([x.label['predicted_label'].float32_tensor.values[0] for x in batch]) 
                                 for batch in prediction_batches])
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(test_labels, test_preds).sum()
    fp = np.logical_and(1-test_labels, test_preds).sum()
    tn = np.logical_and(1-test_labels, 1-test_preds).sum()
    fn = np.logical_and(test_labels, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    # printing a table of metrics
    if verbose:
        print(pd.crosstab(test_labels, test_preds, rownames=['actual (row)'], colnames=['prediction (col)']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy}



In [131]:
test_labels

NameError: name 'test_labels' is not defined

In [132]:
np.array(test_preds)

NameError: name 'test_preds' is not defined

In [133]:
test_labels = Y_test.values.flatten()
test_preds = np.array(predictions)
# calculate true positives, false positives, true negatives, false negatives
tp = np.logical_and(test_labels, test_preds).sum()
fp = np.logical_and(1-test_labels, test_preds).sum()
tn = np.logical_and(1-test_labels, 1-test_preds).sum()
fn = np.logical_and(test_labels, 1-test_preds).sum()

# calculate binary classification metrics
recall = tp / (tp + fn)
precision = tp / (tp + fp)
accuracy = (tp + tn) / (tp + fp + tn + fn)

In [134]:
print(pd.crosstab(test_labels, test_preds, rownames=['actual (row)'], colnames=['prediction (col)']))
print("\n{:<11} {:.3f}".format('Recall:', recall))
print("{:<11} {:.3f}".format('Precision:', precision))
print("{:<11} {:.3f}".format('Accuracy:', accuracy))
print()



prediction (col)   0.0    1.0
actual (row)                 
0                 5109   3547
1                 2091  10139

Recall:     0.829
Precision:  0.741
Accuracy:   0.730



In [None]:
print('Metrics for simple, LinearLearner.\n')

# get metrics for linear predictor
metrics = evaluate(linear_predictor, 
                   test_features.astype('float32'), 
                   test_labels, 
                   verbose=True) # verbose means we'll print out the metrics



## Same process for viewed data

In [36]:
features = pd.read_csv('data/preprocessed_features_viewed.csv', index_col=0)
targets = pd.read_csv('data/preprocessed_targets_viewed.csv', index_col=0)

In [39]:
targets.viewed.value_counts()

1    56895
0    19382
Name: viewed, dtype: int64

In [40]:
# We split the dataset into 2/3 training and 1/3 testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(features, targets, test_size=0.33)

# Then we split the training set further into 2/3 training and 1/3 validation sets.
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33)

In [56]:
# set values -1 to 0
Y_test

Unnamed: 0,viewed
251518,1
186892,1
210361,1
230635,1
170103,1
...,...
239052,1
201645,1
305218,1
231518,1


In [42]:
# Define the data directory and make sure that the directory exists
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

## Create csv files for test, validation and train data

In [43]:
# We use pandas to save our test, train and validation data to csv files. Note that we make sure not to include header
# information or an index as this is required by the built in algorithms provided by Amazon. Also, for the train and
# validation data, it is assumed that the first entry in each row is the target variable.

X_test.to_csv(os.path.join(data_dir, 'test_viewed.csv'), header=False, index=False)

pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation_viewed.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train_viewed.csv'), header=False, index=False)

## Import the sagemaker specific classes and functions

In [44]:
import sagemaker
from sagemaker import get_execution_role
#from sagemaker.amazon.amazon_estimator import get_image_uri

# This is an object that represents the SageMaker session that we are currently operating in. This
# object contains some useful information that we will need to access later such as our region.
session = sagemaker.Session()

# This is an object that represents the IAM role that we are currently assigned. When we construct
# and launch the training job later we will need to tell it what IAM role it should have. Since our
# use case is relatively simple we will simply assign the training job the role we currently have.
role = get_execution_role()

## Define a prefix for s3 data upload and upload the createrd files

In [45]:
prefix = 'capstone_viewed'

test_location = session.upload_data(os.path.join(data_dir, 'test_viewed.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation_viewed.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train_viewed.csv'), key_prefix=prefix)

## Create Sagemaker Estimator and Hyperparamaters

In [46]:
# TODO: Create a SageMaker estimator using the container location determined in the previous cell.
#       It is recommended that you use a single training instance of type ml.m4.xlarge. It is also
#       recommended that you use 's3://{}/{}/output'.format(session.default_bucket(), prefix) as the
#       output path.

container = sagemaker.image_uris.retrieve('xgboost', session.boto_region_name, 'latest')

xgb = sagemaker.estimator.Estimator(container,
                                    role=role,
                                    instance_count=1,
                                    instance_type='ml.m4.xlarge',
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)


# TODO: Set the XGBoost hyperparameters in the xgb object. Don't forget that in this case we have a binary
#       label so we should be using the 'binary:logistic' objective.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        objective='binary:logistic',
                        early_stopping_rounds=10, 
                        num_round=200)



In [47]:
#s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
#s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

s3_input_train = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv')

In [48]:
%%time
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2021-09-23 06:40:06 Starting - Starting the training job...
2021-09-23 06:40:08 Starting - Launching requested ML instancesProfilerReport-1632379206: InProgress
......
2021-09-23 06:41:25 Starting - Preparing the instances for training......
2021-09-23 06:42:37 Downloading - Downloading input data
2021-09-23 06:42:37 Training - Downloading the training image...
2021-09-23 06:43:05 Uploading - Uploading generated training model[34mArguments: train[0m
[34m[2021-09-23:06:42:58:INFO] Running standalone xgboost training.[0m
[34m[2021-09-23:06:42:58:INFO] File size need to be processed in the node: 3.1mb. Available memory size in the node: 8389.71mb[0m
[34m[2021-09-23:06:42:58:INFO] Determined delimiter of CSV input is ','[0m
[34m[06:42:58] S3DistributionType set as FullyReplicated[0m
[34m[06:42:58] 34240x10 matrix with 342400 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-09-23:06:42:58:INFO] Determined delimiter of CSV input is 

In [49]:
# TODO: Create a transformer object from the trained model. Using an instance count of 1 and an instance type of ml.m4.xlarge
#       should be more than enough.
xgb_transformer = xgb.transformer(instance_count=1, instance_type='ml.m4.xlarge')

In [50]:
%%time
# TODO: Start the transform job. Make sure to specify the content type and the split type of the test data.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

............................[34mArguments: serve[0m
[34m[2021-09-23 06:48:55 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-09-23 06:48:55 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-09-23 06:48:55 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-09-23 06:48:55 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2021-09-23 06:48:55 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-09-23 06:48:55 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-09-23 06:48:55 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-09-23:06:48:55:INFO] Model loaded successfully for worker : 21[0m
[34m[2021-09-23:06:48:55:INFO] Model loaded successfully for worker : 20[0m
[34m[2021-09-23:06:48:55:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-09-23

In [51]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-eu-central-1-647915836300/xgboost-2021-09-23-06-44-25-632/test_viewed.csv.out to data/test_viewed.csv.out


In [57]:
predictions = pd.read_csv(os.path.join(data_dir, 'test_viewed.csv.out'), header=None)


In [58]:
predictions = [round(num) for num in predictions.squeeze().values]
#predictions

In [59]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.8098283807405053