In [1]:
import os
import boto3
import re
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac
import scipy
from sklearn.model_selection import train_test_split
from sagemaker.amazon.amazon_estimator import get_image_uri

role = sagemaker.get_execution_role()
region = boto3.Session().region_name
session = sagemaker.Session()
bucket = session.default_bucket()

prefix = (
    "sagemaker/readmissions" 
)

In [2]:
!pip install sagemaker==1.72.0



In [3]:
X_sparse = scipy.sparse.load_npz('./readmissions_X.npz')
y_sparse = scipy.sparse.load_npz('./readmissions_y.npz')

In [4]:
X = pd.DataFrame(np.array(X_sparse.todense()))
y = pd.DataFrame(np.array(y_sparse.todense()).reshape(-1,1))

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33)

X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.33)

In [6]:
data_dir = './data/'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

In [7]:
pd.DataFrame(X_test).to_csv(os.path.join(data_dir, 'test.csv'), header=False, index=False)

pd.concat([Y_val, X_val], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), header=False, index=False)
pd.concat([Y_train, X_train], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), header=False, index=False)

In [8]:
prefix = 'readmission-xgboost'

test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

In [10]:
container = get_image_uri(session.boto_region_name, 'xgboost')

# First we create a SageMaker estimator object for our model.
xgb = sagemaker.estimator.Estimator(container, # The location of the container we wish to use
                                    role,                                    # What is our current IAM Role
                                    train_instance_count=1,                  # How many compute instances
                                    train_instance_type='ml.m4.xlarge',      # What kind of compute instances
                                    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
                                    sagemaker_session=session)

# And then set the algorithm specific parameters.
xgb.set_hyperparameters(max_depth=5,
                        eta=0.2,
                        gamma=4,
                        min_child_weight=6,
                        subsample=0.8,
                        silent=0,
                        objective='binary:logistic',
                        early_stopping_rounds=10,
                        num_round=500)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
There is a more up to date SageMaker XGBoost image. To use the newer image, please set 'repo_version'='1.0-1'. For example:
	get_image_uri(region, 'xgboost', '1.0-1').
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [11]:
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


In [12]:
xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

2021-12-28 16:44:13 Starting - Starting the training job...
2021-12-28 16:44:15 Starting - Launching requested ML instances......
2021-12-28 16:45:23 Starting - Preparing the instances for training.........
2021-12-28 16:46:51 Downloading - Downloading input data...
2021-12-28 16:47:34 Training - Downloading the training image..[34mArguments: train[0m
[34m[2021-12-28:16:47:57:INFO] Running standalone xgboost training.[0m
[34m[2021-12-28:16:47:57:INFO] File size need to be processed in the node: 12.79mb. Available memory size in the node: 8384.88mb[0m
[34m[2021-12-28:16:47:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:47:57] S3DistributionType set as FullyReplicated[0m
[34m[16:47:57] 42031x105 matrix with 4413255 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2021-12-28:16:47:57:INFO] Determined delimiter of CSV input is ','[0m
[34m[16:47:57] S3DistributionType set as FullyReplicated[0m
[34m[16:47:57] 20702x105 

In [13]:
xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


In [14]:
xgb_transformer.wait()

...................................[34mArguments: serve[0m
[34m[2021-12-28 16:54:11 +0000] [1] [INFO] Starting gunicorn 19.9.0[0m
[34m[2021-12-28 16:54:11 +0000] [1] [INFO] Listening at: http://0.0.0.0:8080 (1)[0m
[34m[2021-12-28 16:54:11 +0000] [1] [INFO] Using worker: gevent[0m
[34m[2021-12-28 16:54:11 +0000] [22] [INFO] Booting worker with pid: 22[0m
[34m[2021-12-28 16:54:11 +0000] [23] [INFO] Booting worker with pid: 23[0m
[34m[2021-12-28 16:54:11 +0000] [24] [INFO] Booting worker with pid: 24[0m
[34m[2021-12-28 16:54:11 +0000] [25] [INFO] Booting worker with pid: 25[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-28:16:54:11:INFO] Model loaded successfully for worker : 22[0m
  monkey.patch_all(subprocess=True)[0m
  monkey.patch_all(subprocess=True)[0m
[34m[2021-12-28:16:54:11:INFO] Model loaded successfully for worker : 23[0m
[34m[2021-12-28:16:54:11:INFO] Model loaded successfully for worker : 24[0m
  monkey.patch_all(subprocess=True)[0m
[34m[202

In [15]:
!aws s3 cp --recursive $xgb_transformer.output_path $data_dir

download: s3://sagemaker-us-east-1-345989147144/xgboost-2021-12-28-16-48-26-686/test.csv.out to data/test.csv.out


In [16]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [round(num) for num in predictions.squeeze().values]

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.6366872714327324

In [37]:
train_y.values

NameError: name 'train_y' is not defined

In [33]:
X_test.values

array([[38,  9, 79, ...,  1,  0,  1],
       [18,  1, 20, ...,  1,  0,  1],
       [ 1,  3, 33, ...,  1,  0,  1],
       ...,
       [12,  6, 62, ...,  1,  1,  0],
       [23,  4, 52, ...,  1,  0,  1],
       [ 9,  3,  1, ...,  0,  0,  1]])

In [44]:
Y_train.values[:,0]

array([1, 1, 1, ..., 0, 0, 1])

In [51]:
import io
import numpy as np
import sagemaker.amazon.common as smac

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_train.values.astype("float32"), Y_train.values[:,0].astype("float32"))
buf.seek(0)

key = "recordio-pb-data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "train", key)).upload_fileobj(buf)
s3_train_data = f"s3://{bucket}/{prefix}/train/{key}"
print(f"uploaded training data location: {s3_train_data}")

buf = io.BytesIO()
smac.write_numpy_to_dense_tensor(buf, X_val.values.astype("float32"), Y_val.values[:,0].astype("float32"))
buf.seek(0)

key = "recordio-pb-data"
boto3.resource("s3").Bucket(bucket).Object(os.path.join(prefix, "val", key)).upload_fileobj(buf)
s3_val_data = f"s3://{bucket}/{prefix}/val/{key}"
print(f"uploaded validation data location: {s3_val_data}")

uploaded training data location: s3://sagemaker-us-east-1-345989147144/readmission-xgboost/train/recordio-pb-data
uploaded validation data location: s3://sagemaker-us-east-1-345989147144/readmission-xgboost/val/recordio-pb-data


In [52]:
container = get_image_uri(session.boto_region_name, 'linear-learner')

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


In [53]:
linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path='s3://{}/{}/output'.format(session.default_bucket(), prefix),
    sagemaker_session=session)
linear.set_hyperparameters(feature_dim=105, predictor_type="binary_classifier", mini_batch_size=200)

linear.fit({'train': s3_train_data, 'validation': s3_val_data})

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


2021-12-28 17:56:58 Starting - Starting the training job...
2021-12-28 17:57:00 Starting - Launching requested ML instances......
2021-12-28 17:58:00 Starting - Preparing the instances for training.........
2021-12-28 17:59:58 Downloading - Downloading input data...
2021-12-28 18:00:27 Training - Downloading the training image...
2021-12-28 18:00:49 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[12/28/2021 18:00:54 INFO 140167606798144] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', '

In [54]:
lr_transformer = linear.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
lr_transformer.transform(test_location, content_type='text/csv', split_type='Line')
lr_transformer.wait()

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.


..................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loading entry points[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded request iterator application/json[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded request iterator application/jsonlines[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded request iterator application/x-recordio-protobuf[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded request iterator text/csv[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded response encoder application/json[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded response encoder application/jsonlines[0m
[34m[12/28/2021 18:13:56 INFO 139818928498496] loaded response encoder application/

In [55]:
!aws s3 cp --recursive $lr_transformer.output_path $data_dir

download: s3://sagemaker-us-east-1-345989147144/linear-learner-2021-12-28-18-08-18-047/test.csv.out to data/test.csv.out


In [77]:
predictions = pd.read_csv(os.path.join(data_dir, 'test.csv.out'), header=None)
predictions = [int(p.split(':')[1]) for p in predictions[0]]

In [78]:
from sklearn.metrics import accuracy_score
accuracy_score(Y_test, predictions)

0.6278843975533189