First, the sagemaker package is installed in the environment.

In [1]:
!pip install sagemaker==1.72.0

Collecting sagemaker==1.72.0
  Downloading sagemaker-1.72.0.tar.gz (297 kB)
[K     |████████████████████████████████| 297 kB 5.8 MB/s eta 0:00:01
Collecting smdebug-rulesconfig==0.1.4
  Downloading smdebug_rulesconfig-0.1.4-py2.py3-none-any.whl (10 kB)
Building wheels for collected packages: sagemaker
  Building wheel for sagemaker (setup.py) ... [?25ldone
[?25h  Created wheel for sagemaker: filename=sagemaker-1.72.0-py2.py3-none-any.whl size=386358 sha256=6b2a26abd15047aeb63c9d8754941183a532bb8d7b86a2a4a0848fd662a79585
  Stored in directory: /home/ec2-user/.cache/pip/wheels/c3/58/70/85faf4437568bfaa4c419937569ba1fe54d44c5db42406bbd7
Successfully built sagemaker
Installing collected packages: smdebug-rulesconfig, sagemaker
  Attempting uninstall: smdebug-rulesconfig
    Found existing installation: smdebug-rulesconfig 1.0.1
    Uninstalling smdebug-rulesconfig-1.0.1:
      Successfully uninstalled smdebug-rulesconfig-1.0.1
  Attempting uninstall: sagemaker
    Found existing install

In [2]:
#The sagemaker package that was previously installed is imported.

import sagemaker
from sagemaker import get_execution_role


# Keep handles to the active SageMaker session and to the execution role of this
# notebook; later cells reuse both.
session = sagemaker.Session()
role = get_execution_role()

# Key prefix under which the files of this project are stored in S3.
prefix = 'crossSell-xgboost'

# Upload the processed CSV files (test, validation, train -- in that order) and keep
# the location returned for each upload.
test_location, val_location, train_location = [
    session.upload_data(csv_path, key_prefix=prefix)
    for csv_path in (
        "processed_data/test.csv",
        "processed_data/validation.csv",
        "processed_data/train.csv",
    )
]

In [4]:
# Look up the SageMaker XGBoost image for the region of the current session.
# The result identifies the container in which the algorithm will run.

from sagemaker.amazon.amazon_estimator import get_image_uri

region_name = session.boto_region_name
container = get_image_uri(region_name, 'xgboost')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').


In [5]:
# Build the estimator from the pieces prepared above: container image, IAM role,
# training instance settings, S3 output location and the SageMaker session.
output_location = 's3://{}/{}/output'.format(session.default_bucket(), prefix)

xgb = sagemaker.estimator.Estimator(
    container,                            # container image to use
    role,                                 # current IAM role
    train_instance_count=1,               # number of compute instances
    train_instance_type='ml.m4.xlarge',   # kind of compute instance
    output_path=output_location,
    sagemaker_session=session,
)

# Baseline hyperparameters of the estimator. These are only the starting values.
baseline_hyperparameters = dict(
    max_depth=5,
    eta=0.2,
    gamma=4,
    min_child_weight=6,
    subsample=0.8,
    silent=0,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=500,
)
xgb.set_hyperparameters(**baseline_hyperparameters)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [15]:
# With the estimator and its baseline hyperparameters in place, a hyperparameter
# tuner is created on top of it.

from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Ranges the tuner is allowed to explore for each tuned hyperparameter.
search_space = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
}

xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,                               # estimator used as the basis for the training jobs
    objective_metric_name='validation:logloss',  # metric used to compare trained models
    objective_type='Minimize',                   # the metric is to be minimized
    max_jobs=6,                                  # total number of models to train
    max_parallel_jobs=3,                         # number of models trained in parallel
    hyperparameter_ranges=search_space,
)

In [11]:
# Wrap the uploaded train and validation datasets as CSV inputs for the algorithm.

s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=data_location, content_type='csv')
    for data_location in (train_location, val_location)
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [16]:
# Start the tuning: fit() is called on the tuner so SageMaker launches the training
# jobs. The train and validation inputs in S3 are passed as named channels.

data_channels = {'train': s3_input_train, 'validation': s3_input_validation}
xgb_hyperparameter_tuner.fit(data_channels)

In [17]:
# The training jobs run on SageMaker and their progress is not otherwise shown in the
# notebook, so wait() is called on the tuner to make it visible here when the training
# is complete.
xgb_hyperparameter_tuner.wait()

.............................................................................................!


In [18]:
# Once the training jobs have completed, ask the tuner for its best training job and
# attach that job to a new estimator object.

best_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_attached = sagemaker.estimator.Estimator.attach(best_job_name)

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2021-06-26 16:19:29 Starting - Preparing the instances for training
2021-06-26 16:19:29 Downloading - Downloading input data
2021-06-26 16:19:29 Training - Training image download completed. Training in progress.
2021-06-26 16:19:29 Uploading - Uploading generated training model
2021-06-26 16:19:29 Completed - Training job completed[34mArguments: train[0m
[34m[2021-06-26:16:19:17:INFO] Running standalone xgboost training.[0m
[34m[2021-06-26:16:19:17:INFO] Setting up HPO optimized metric to be : logloss[0m
[34m[2021-06-26:16:19:17:INFO] File size need to be processed in the node: 23.44mb. Available memory size in the node: 8416.43mb[0m
[34m[2021-06-26:16:19:17:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:19:17] S3DistributionType set as FullyReplicated[0m
[34m[16:19:17] 99679x10 matrix with 996790 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-06-26:16:19:17:INFO] Determined delimiter of CSV input is ','[0m
[

In [19]:
# A transformer object is created by calling the transformer method on the newly
# attached estimator; it is configured with one ml.m4.xlarge instance.

xgb_transformer = xgb_attached.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [20]:
# Using the transformer object, predictions are computed for the test dataset that was
# uploaded to S3 earlier. The input is declared as CSV text and is split line by line.

xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [21]:
# The transform job runs on SageMaker and is not otherwise visible from the notebook,
# so wait() is called on the transformer as well.

xgb_transformer.wait()

.............................[34mArguments: serve[0m
[34m[2021-06-26 16:26:22 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-06-26 16:26:22 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-06-26 16:26:22 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-06-26 16:26:22 +0000] [20] [INFO] Booting worker with pid: 20[0m
[34m[2021-06-26 16:26:22 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2021-06-26 16:26:22 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-06-26 16:26:22 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-06-26:16:26:22:INFO] Model loaded successfully for worker : 21[0m
[34m[2021-06-26:16:26:22:INFO] Model loaded successfully for worker : 20[0m
[34m[2021-06-26:16:26:22:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-06-2

In [5]:
# Local directory in which the predictions made by the transformer are saved.
# It is written relative to the notebook's working directory, like the other
# "processed_data/..." paths used in this notebook, so that it does not depend on the
# name of the parent folder.

data_dir = 'processed_data'

In [23]:
#Using the string variable referencing the directory to save the predictions, these are copied from the 
#S3 bucket to the local directory

!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-2-730413480526/xgboost-210626-1612-004-a27e4f29-2021-06-26-16-21-38-571/test.csv.out to processed_data/test.csv.out


In [2]:
#some additional imports are needed to read both the actual labels and the predictions saved

import pandas as pd
import os

In [6]:
# Load the predictions written by the transform job. They are floats (values from 0 to
# 1), so they are rounded to the nearest integer to obtain class labels.
predictions_raw = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)

# The first column is selected explicitly instead of using squeeze(): squeeze() turns a
# one-row file into a scalar, which has no `.values`. Rounding is done on the whole
# column at once, and the result is kept as a plain list of ints.
predictions = predictions_raw.iloc[:, 0].round().astype(int).tolist()

In [7]:
# Read the labels of the test dataset with pandas and convert them to a NumPy array so
# they can be compared against the predictions.

labels_frame = pd.read_csv("processed_data/test_y.csv", header=None)
test_y = labels_frame.to_numpy()

In [9]:
# The confusion matrix and classification report helpers are imported from sklearn.
# The model results are then summarized with a classification report; the analysis of
# these results is in the report document that accompanies this notebook.

from sklearn.metrics import confusion_matrix,classification_report

print("Classification Report \n")
report_text = classification_report(test_y, predictions)
print(report_text)

Classification Report 

              precision    recall  f1-score   support

           0       0.88      0.76      0.82      9230
           1       0.62      0.78      0.70      4615

    accuracy                           0.77     13845
   macro avg       0.75      0.77      0.76     13845
weighted avg       0.79      0.77      0.78     13845

