In [None]:
!pip install sagemaker==1.72.0

In [1]:
import os
import boto3
import re
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac
import scipy
from sklearn.model_selection import train_test_split
from sagemaker.amazon.amazon_estimator import get_image_uri

# --- SageMaker context shared by the cells below ---

# Session object and the default S3 bucket it resolves to.
session = sagemaker.Session()
bucket = session.default_bucket()

# Execution role passed to the estimators, and the region of the default boto3 session.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Initial S3 key prefix; later cells assign their own prefix before using it.
prefix = "sagemaker/readmissions"

In [2]:
# Fixed seed so the train/validation/test partition (and therefore every metric
# reported further down) is the same on each Restart & Run All.
SEED = 42

# Load features and labels stored as SciPy sparse .npz files and densify them.
X_sparse = scipy.sparse.load_npz('./readmissions_X.npz')
y_sparse = scipy.sparse.load_npz('./readmissions_y.npz')
X = pd.DataFrame(X_sparse.toarray())
y = pd.DataFrame(y_sparse.toarray().reshape(-1, 1))

# Hold out 33% of the rows for testing, then 33% of the remainder for validation.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.33, random_state=SEED
)

# Local staging directory for the CSV files uploaded in the next cell.
data_dir = './data/'
os.makedirs(data_dir, exist_ok=True)

# test.csv holds features only; validation.csv and train.csv put the label in
# the first column. All files are written without header or index.
X_test.to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)
pd.concat([y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [3]:
# S3 key prefix for everything belonging to the XGBoost experiment.
prefix = 'readmission-xgboost'

# Upload the three CSV files (test, validation, train — in that order) and keep
# the S3 location returned for each.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

In [None]:
from sagemaker.tuner import IntegerParameter, ContinuousParameter, HyperparameterTuner

# Built-in XGBoost image for the session's region.
container = get_image_uri(session.boto_region_name, 'xgboost')

# Estimator: a single ml.m4.xlarge training instance, artifacts written under
# the current prefix in the default bucket.
xgb = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.xlarge',
    output_path=f's3://{bucket}/{prefix}/output',
    sagemaker_session=session,
)

# Hyperparameters held fixed for every tuning job.
xgb.set_hyperparameters(
    silent=0,
    objective='binary:logistic',
    early_stopping_rounds=10,
    num_round=500,
)

# Search space explored by the tuner.
hyperparameter_ranges = {
    'max_depth': IntegerParameter(3, 12),
    'eta': ContinuousParameter(0.05, 0.5),
    'min_child_weight': IntegerParameter(2, 8),
    'subsample': ContinuousParameter(0.5, 0.9),
    'gamma': ContinuousParameter(0, 10),
}

# Maximize validation AUC over 20 jobs in total, at most 3 running concurrently.
xgb_hyperparameter_tuner = HyperparameterTuner(
    estimator=xgb,
    objective_metric_name='validation:auc',
    objective_type='Maximize',
    max_jobs=20,
    max_parallel_jobs=3,
    hyperparameter_ranges=hyperparameter_ranges,
)

In [None]:
# Wrap the uploaded CSV locations as training channels.
s3_input_train, s3_input_validation = (
    sagemaker.s3_input(s3_data=location, content_type='csv')
    for location in (train_location, val_location)
)

# Launch the tuning job and block until it has finished.
xgb_hyperparameter_tuner.fit(
    {
        'train': s3_input_train,
        'validation': s3_input_validation,
    }
)
xgb_hyperparameter_tuner.wait()

In [None]:
# Tabulate every training job launched by the tuner.
my_tuner_analytics = xgb_hyperparameter_tuner.analytics()
my_dataframe = my_tuner_analytics.dataframe()

# Display the row that belongs to the best training job.
best_job = xgb_hyperparameter_tuner.best_training_job()
my_dataframe.loc[my_dataframe['TrainingJobName'].eq(best_job)]

In [None]:
# Re-attach an estimator to the tuner's best training job.
xgb_best = sagemaker.estimator.Estimator.attach(
    xgb_hyperparameter_tuner.best_training_job()
)

# Batch-transform the test CSV line by line on one ml.m4.xlarge instance and
# block until the job finishes.
xgb_transformer = xgb_best.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)
xgb_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)
xgb_transformer.wait()

In [None]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

output = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(num) for num in output.squeeze().values]
probs = [p for p in output.squeeze().values]

from sklearn.metrics import accuracy_score
print (accuracy_score(y_test, predictions))

from sklearn.metrics import roc_auc_score
print (roc_auc_score(y_test, probs))

In [4]:
import io
import numpy as np
import sagemaker.amazon.common as smac

def _upload_recordio(features, labels, channel, key):
    """Serialize one data split to RecordIO-protobuf and upload it to S3.

    Args:
        features: DataFrame of feature columns; cast to float32 before writing.
        labels: single-column DataFrame of labels; column 0 is cast to float32.
        channel: sub-folder under the current ``prefix`` (e.g. "train", "val").
        key: object name inside that sub-folder.

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(
        buf,
        features.values.astype("float32"),
        labels.values[:, 0].astype("float32"),
    )
    buf.seek(0)  # rewind so the upload reads from the start of the buffer

    # Build the key with "/" explicitly: S3 keys are not filesystem paths, and
    # os.path.join would insert the platform's separator, which could make the
    # uploaded key disagree with the URI returned below.
    object_key = f"{prefix}/{channel}/{key}"
    boto3.resource("s3").Bucket(bucket).Object(object_key).upload_fileobj(buf)
    return f"s3://{bucket}/{object_key}"


key = "recordio-pb-data"

s3_train_data = _upload_recordio(X_train, y_train, "train", key)
print(f"uploaded training data location: {s3_train_data}")

s3_val_data = _upload_recordio(X_val, y_val, "val", key)
print(f"uploaded validation data location: {s3_val_data}")

uploaded training data location: s3://sagemaker-us-east-1-345989147144/readmission-xgboost/train/recordio-pb-data
uploaded validation data location: s3://sagemaker-us-east-1-345989147144/readmission-xgboost/val/recordio-pb-data


In [5]:
# Built-in linear-learner image for the session's region (replaces the XGBoost
# image previously bound to `container`).
container = get_image_uri(
    region_name=session.boto_region_name,
    repo_name='linear-learner',
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [7]:
# S3 key prefix for the linear-learner experiment.
# NOTE(review): the RecordIO upload cell earlier in the notebook ran while
# `prefix` still held its previous value, so among the visible cells only the
# estimator's output_path picks up this new prefix — confirm that is intended.
prefix = 'readmission-linreg'

In [8]:
# Linear-learner estimator: one ml.c4.xlarge training instance, artifacts
# written under the current prefix in the default bucket.
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path='s3://{}/{}/output'.format(bucket, prefix),
    sagemaker_session=session)

# Take feature_dim from the training matrix instead of hard-coding it, so the
# cell stays correct if the number of feature columns changes.
linear.set_hyperparameters(
    feature_dim=X_train.shape[1],
    predictor_type="binary_classifier",
    mini_batch_size=200,
)

# Train on the RecordIO-protobuf data uploaded earlier.
linear.fit({'train': s3_train_data, 'validation': s3_val_data})

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-12-29 16:47:20 Starting - Starting the training job...
2021-12-29 16:47:22 Starting - Launching requested ML instances......
2021-12-29 16:48:24 Starting - Preparing the instances for training......
2021-12-29 16:49:44 Downloading - Downloading input data
2021-12-29 16:49:44 Training - Downloading the training image......
2021-12-29 16:50:38 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/29/2021 16:50:42 INFO 139751046911808] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init

In [9]:
# Batch-transform the test CSV line by line with the trained linear model on
# one ml.m4.xlarge instance, and block until the job finishes.
lr_transformer = linear.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)
lr_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)
lr_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


..................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loading entry points[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded request iterator application/json[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded request iterator application/jsonlines[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded request iterator application/x-recordio-protobuf[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded request iterator text/csv[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded response encoder application/json[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded response encoder application/jsonlines[0m
[34m[12/29/2021 16:59:14 INFO 139765769918272] loaded response encoder application/

In [22]:
!aws s3 cp --recursive $lr_transformer.output_path $data_dir

results = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [int(p.split(':')[1]) for p in results[0]]
probs = [float(p.split(':')[1].split('}')[0]) for p in results[1]]

download: s3://sagemaker-us-east-1-345989147144/linear-learner-2021-12-29-16-53-39-986/test.csv.out to data/test.csv.out


In [23]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=predictions)

0.6272047639082171

In [25]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the linear learner's scores on the held-out test set.
print(roc_auc_score(y_true=y_test, y_score=probs))

0.6765213634907756
