In [None]:
!pip install sagemaker==1.72.0

In [1]:
## Import relevant libraries
import os
import boto3
import re
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac
import scipy
from sklearn.model_selection import train_test_split
from sagemaker.amazon.amazon_estimator import get_image_uri

## Define session, default S3 bucket, execution role, and region ##
session = sagemaker.Session()
bucket = session.default_bucket()      # default bucket of this SageMaker session
role = sagemaker.get_execution_role()  # role passed to the estimator below
region = boto3.Session().region_name

In [2]:
## Load the processed feature matrix and label vector (stored as sparse .npz files) ##
X_sparse = scipy.sparse.load_npz('./readmissions_X.npz')
y_sparse = scipy.sparse.load_npz('./readmissions_y.npz')

# Densify both matrices into DataFrames; the labels are reshaped into a single column.
X = pd.DataFrame(X_sparse.toarray())
y = pd.DataFrame(y_sparse.toarray().reshape(-1, 1))

In [3]:
## Create train/validation/test sets ##
# Fixed seed so the splits (and every metric computed from them) are
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Hold out 33% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SPLIT_SEED)
# ... then carve 33% of the remaining rows out as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.33, random_state=SPLIT_SEED)

In [4]:
# Sanity check: (rows, columns) of the held-out test features.
X_test.shape

(30899, 105)

In [5]:
# Sanity check: the test labels should have the same number of rows as X_test.
y_test.shape

(30899, 1)

#### Build XGBoost Classifier

In [6]:
## S3 key prefix and local staging directory for the XGBoost experiment ##
prefix = 'readmission-xgboost'
data_dir = './data/xgb/'
# exist_ok=True removes the check-then-create race and keeps the cell re-runnable.
os.makedirs(data_dir, exist_ok=True)

## Save train/test/validation sets to local data directory ##
# Every file is written without header or index. The test features and test
# labels go to separate files; only the features are uploaded for inference.
X_test.to_csv(os.path.join(data_dir, 'X_test.csv'), header=False, index=False)
y_test.to_csv(os.path.join(data_dir, 'y_test.csv'), header=False, index=False)
# Train/validation files carry the label as the first column, followed by the features.
pd.concat([y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

## Upload train/test/validation sets from local directory to s3 bucket ##
test_location = session.upload_data(os.path.join(data_dir, 'X_test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [7]:
## Build the XGBoost estimator that is handed to the hyperparameter tuner ##
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Container image for the built-in XGBoost algorithm in the session's region.
container = get_image_uri(session.boto_region_name, 'xgboost')
# Model artifacts are written under the experiment prefix in the default bucket.
xgb_output_path = 's3://{}/{}/output'.format(bucket, prefix)

xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=xgb_output_path,
    sagemaker_session=session,
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [8]:
## Fixed hyperparameters set on the base estimator ##
xgb.set_hyperparameters(
    silent=0,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=500,
)

## Search space explored by the tuner ##
xgb_hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
}

## Tuner: at most 20 jobs, 3 in parallel, maximizing the validation AUC ##
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:auc',
    objective_type='Maximize',
    hyperparameter_ranges=xgb_hyperparameter_ranges,
    max_jobs=20,
    max_parallel_jobs=3,
)

In [9]:
## Wrap the uploaded train/validation files in S3 as CSV input channels ##
s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [10]:
## Launch the tuning job on the train/validation channels, then wait for it ##
tuning_channels = {
    'train': s3_input_train,
    'validation': s3_input_validation,
}
xgb_hyperparameter_tuner.fit(tuning_channels)
xgb_hyperparameter_tuner.wait()

..........................................................................................................................................................................................................................................................................................................................................................................................................................!


In [11]:
## Show the hyperparameters and objective value of the best training job ##
best_job = xgb_hyperparameter_tuner.best_training_job()

# One row per training job launched by the tuner.
analytics = xgb_hyperparameter_tuner.analytics()
analytics_df = analytics.dataframe()

# Keep only the row that belongs to the best job; the last expression is displayed.
is_best_job = analytics_df['TrainingJobName'] == best_job
analytics_df.loc[is_best_job]

Unnamed: 0,eta,gamma,max_depth,min_child_weight,subsample,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,0.154636,0.757286,12.0,2.0,0.888536,xgboost-220107-1704-019-09793d10,Completed,0.735732,2022-01-07 17:36:26+00:00,2022-01-07 17:38:35+00:00,129.0


In [14]:
## Re-create an estimator object from the tuner's best training job ##
best_training_job_name = xgb_hyperparameter_tuner.best_training_job()
xgb_best = sagemaker.estimator.Estimator.attach(best_training_job_name)

## Run a batch transform job over the test features uploaded to S3 ##
xgb_transformer = xgb_best.transformer(instance_count=1, instance_type='ml.m4.xlarge')
# The CSV input is split on line boundaries.
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')
xgb_transformer.wait()

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2022-01-07 17:38:35 Starting - Preparing the instances for training
2022-01-07 17:38:35 Downloading - Downloading input data
2022-01-07 17:38:35 Training - Training image download completed. Training in progress.
2022-01-07 17:38:35 Uploading - Uploading generated training model
2022-01-07 17:38:35 Completed - Training job completed[34mArguments: train[0m
[34m[2022-01-07:17:37:26:INFO] Running standalone xgboost training.[0m
[34m[2022-01-07:17:37:26:INFO] Setting up HPO optimized metric to be : auc[0m
[34m[2022-01-07:17:37:26:INFO] File size need to be processed in the node: 12.79mb. Available memory size in the node: 8381.38mb[0m
[34m[2022-01-07:17:37:26:INFO] Determined delimiter of CSV input is ','[0m
[34m[17:37:26] S3DistributionType set as FullyReplicated[0m
[34m[17:37:26] 42031x105 matrix with 4413255 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2022-01-07:17:37:26:INFO] Determined delimiter of CSV input is ','[0m
[34

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


Training seconds: 129
Billable seconds: 129
................................[34mArguments: serve[0m
[34m[2022-01-07 17:48:38 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2022-01-07 17:48:38 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2022-01-07 17:48:38 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2022-01-07 17:48:38 +0000] [21] [INFO] Booting worker with pid: 21[0m
[34m[2022-01-07 17:48:38 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2022-01-07 17:48:38 +0000] [23] [INFO] Booting worker with pid: 23[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-01-07 17:48:38 +0000] [24] [INFO] Booting worker with pid: 24[0m
[34m[2022-01-07:17:48:38:INFO] Model loaded successfully for worker : 21[0m
[34m[2022-01-07:17:48:38:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2022-01-07:17:48:38:INFO] Model loaded s

In [15]:
## Download tranform output from s3 location ##
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir/xgb_results

download: s3://sagemaker-us-east-1-345989147144/xgboost-220107-1704-019-09793d10-2022-01-07-17-43-25-064/X_test.csv.out to data/xgb/xgb_results/X_test.csv.out


In [16]:
## Calculate test accuracy and AUC performance ##
from sklearn.metrics import accuracy_score, roc_auc_score

# Transform output: one predicted probability per line.
# NOTE(review): assumed to be in the same row order as X_test.csv -- confirm.
output = pd.read_csv(os.path.join(data_dir, 'xgb_results/X_test.csv.out'), header=None)

# Select the single output column explicitly; squeeze() would collapse a
# one-row result to a scalar and break the code below.
probs = output.iloc[:, 0].values

# Explicit 0.5 decision threshold. For probabilities in [0, 1] this matches the
# previous per-element round(), which sends exactly 0.5 to class 0.
predictions = (probs > 0.5).astype(int)

# Flatten the (n, 1) label frame into a 1-D array for the metric functions.
y_true = y_test.values.ravel()

print('XGB test accuracy: {}'.format(accuracy_score(y_true, predictions)))
print('XGB test auc: {}'.format(roc_auc_score(y_true, probs)))

XBG test accuracy: 0.6754911162173534
XBG test auc: 0.737488341310636
