In [28]:
# Import the SageMaker SDK (installed in an earlier step) and the execution-role helper.

import sagemaker
from sagemaker import get_execution_role

# Capture the current session and IAM role once; the cells below reuse both objects.
session = sagemaker.Session()
role = get_execution_role()

# Key prefix that groups this experiment's files in S3.
prefix = 'crossSell-xgboost_2'

# Upload the processed splits (test, validation, train - in that order) under the
# prefix. The returned locations are what the training inputs and the batch
# transform are pointed at later in the notebook.
test_location = session.upload_data(
    path="processed_data_2/test.csv",
    key_prefix=prefix,
)
val_location = session.upload_data(
    path="processed_data_2/validation.csv",
    key_prefix=prefix,
)
train_location = session.upload_data(
    path="processed_data_2/train.csv",
    key_prefix=prefix,
)

In [53]:
# Look up the SageMaker-provided XGBoost image for the session's region; the
# algorithm runs inside a container created from this image.

from sagemaker.amazon.amazon_estimator import get_image_uri

aws_region = session.boto_region_name
container = get_image_uri(aws_region, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [54]:
# Configure the estimator: which container image to run, under which IAM role,
# on how many instances of which type, and where in S3 the output is written.
xgb = sagemaker.estimator.Estimator(
    image_name=container,
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
    sagemaker_session=session,
)

# Baseline hyperparameters. The tuner defined further down searches ranges for
# max_depth, eta, gamma, min_child_weight and subsample; the rest stay fixed.
baseline_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=500,
)
xgb.set_hyperparameters(**baseline_hyperparameters)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [32]:
# With the estimator created and its baseline hyperparameters set, a
# hyperparameter tuner is built around it.

from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Ranges the tuning job is allowed to explore for each tuned hyperparameter.
search_space = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
}

xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,                               # estimator used as the basis for every training job
    objective_metric_name='validation:logloss',  # metric used to compare the trained models
    objective_type='Minimize',                   # lower log loss is better
    max_jobs=6,                                  # total number of models to train
    max_parallel_jobs=3,                         # number of models trained at the same time
    hyperparameter_ranges=search_space,
)

In [55]:
# Wrap the uploaded train and validation datasets as CSV inputs for the algorithm.

s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [56]:
# Start the tuning job: SageMaker launches the training jobs and evaluates each
# model, reading the train and validation channels from the S3 inputs.
data_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb_hyperparameter_tuner.fit(data_channels)

In [57]:
# The training jobs run remotely on SageMaker and are not otherwise visible from
# the notebook, so wait() is called to show when training is complete.
xgb_hyperparameter_tuner.wait()

.................................................................................!


In [58]:
# When the training jobs are completed, look up the tuner's best training job
# and attach a new estimator object to it.

best_training_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_attached = sagemaker.estimator.Estimator.attach(best_training_job_name)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2021-07-01 01:00:55 Starting - Preparing the instances for training
2021-07-01 01:00:55 Downloading - Downloading input data
2021-07-01 01:00:55 Training - Training image download completed. Training in progress.
2021-07-01 01:00:55 Uploading - Uploading generated training model
2021-07-01 01:00:55 Completed - Training job completed[34mArguments: train[0m
[34m[2021-07-01:01:00:44:INFO] Running standalone xgboost training.[0m
[34m[2021-07-01:01:00:44:INFO] Setting up HPO optimized metric to be : logloss[0m
[34m[2021-07-01:01:00:44:INFO] File size need to be processed in the node: 15.59mb. Available memory size in the node: 8419.11mb[0m
[34m[2021-07-01:01:00:44:INFO] Determined delimiter of CSV input is ','[0m
[34m[01:00:44] S3DistributionType set as FullyReplicated[0m
[34m[01:00:44] 66453x10 matrix with 664530 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-07-01:01:00:44:INFO] Determined delimiter of CSV input is ','[0m
[

In [73]:
# Print the job name so all the details of the job can be found from the
# SageMaker console. The name is read from the estimator's public
# `latest_training_job` handle instead of the private `_current_job_name`
# attribute, which is an SDK implementation detail.
print(xgb_attached.latest_training_job.name)

xgboost-210701-0057-001-7f0840d5


In [59]:
# A transformer object is created by calling the transformer method on the newly
# attached estimator; it is configured for a single ml.m4.xlarge instance.

xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [60]:
# Using the transformer object, predictions are computed for the test dataset
# that was uploaded to S3 earlier. The input is declared as CSV and split by line.

xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [61]:
# The transform job runs on SageMaker and is not visible from the notebook, so
# wait() is called again to show when it has finished.

xgb_transformer.wait()

.............................[34mArguments: serve[0m
[34m[2021-07-01 01:18:29 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-07-01 01:18:29 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-07-01 01:18:29 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-07-01 01:18:29 +0000] [20] [INFO] Booting worker with pid: 20[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-07-01:01:18:29:INFO] Model loaded successfully for worker : 20[0m
[34m[2021-07-01 01:18:29 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-07-01 01:18:29 +0000] [22] [INFO] Booting worker with pid: 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-07-01:01:18:29:INFO] Model loaded successfully for worker : 21[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-07-01:01:18:29:INFO] Model loaded successfully for worker : 22[0m
[34m[2021-07-01 01:18:29 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-07-0

In [63]:
# Local directory in which the predictions made by the transformer are saved.
# It uses the same working-directory-relative path as the other cells that read
# and write processed_data_2; the earlier '../project/processed_data_2' form only
# resolved when the notebook's folder happened to be named `project`.

data_dir = 'processed_data_2'

In [64]:
#Using the string variable referencing the directory to save the predictions, these are copied from the 
#S3 bucket to the local directory

!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-2-730413480526/xgboost-210701-0057-001-7f0840d5-2021-07-01-01-13-44-933/test.csv.out to processed_data_2/test.csv.out


In [65]:
#some additional imports are needed to read both the actual labels and the predictions saved

import pandas as pd
import os

In [66]:
# The predictions are loaded using the pandas library. Since the predictions are
# floats (values from 0 to 1), each one is rounded to obtain a class label.

raw_scores = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
# Flatten with to_numpy().ravel() rather than squeeze(): squeeze() collapses a
# single-value file to a bare scalar, which has no `.values` and cannot be
# iterated, while ravel() always yields a 1-D array of scores.
predictions = [round(score) for score in raw_scores.to_numpy().ravel()]

In [68]:
# Read the labels for the test dataset with pandas and convert them straight to
# a NumPy array so they can be compared against the predictions.

test_y = pd.read_csv("processed_data_2/test_y.csv", header=None).to_numpy()

In [76]:
# The confusion matrix and classification report helpers are imported from
# scikit-learn. The model's results on the test set are then summarised with a
# classification report; the analysis of these results is in the report
# document that accompanies this notebook.

from sklearn.metrics import confusion_matrix, classification_report

test_report = classification_report(test_y, predictions)
print(test_report)

              precision    recall  f1-score   support

           0       0.90      0.67      0.76      4615
           1       0.73      0.92      0.82      4615

    accuracy                           0.79      9230
   macro avg       0.81      0.79      0.79      9230
weighted avg       0.81      0.79      0.79      9230



In [75]:
# Confusion matrix for the same test labels and predictions (scikit-learn puts
# the true classes on the rows and the predicted classes on the columns).
confusion_matrix(test_y,predictions)

array([[3069, 1546],
       [ 356, 4259]])