In [None]:
!pip install sagemaker==1.72.0

In [1]:
## Import relevant libraries
import os
import boto3
import re
import sagemaker
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import io
import time
import json
import sagemaker.amazon.common as smac
import scipy
from sklearn.model_selection import train_test_split
from sagemaker.amazon.amazon_estimator import get_image_uri

## Define session, default S3 bucket, execution role, and region ##
session = sagemaker.Session()
bucket = session.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [2]:
## Read in processed datasets ##
# Both inputs are scipy sparse matrices stored in .npz format.
X_sparse = scipy.sparse.load_npz('./readmissions_X.npz')
y_sparse = scipy.sparse.load_npz('./readmissions_y.npz')

# Densify into plain ndarrays and wrap them in DataFrames;
# the labels are reshaped into a single column.
X = pd.DataFrame(X_sparse.toarray())
y = pd.DataFrame(y_sparse.toarray().reshape(-1, 1))

In [3]:
## Create train/test/validation sets ##
# Fixed seed so that Restart & Run All reproduces the same partitions.
SPLIT_SEED = 42

# First hold out 33% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=SPLIT_SEED
)
# ... then take 33% of the remaining rows as the validation set
# (roughly 45% train / 22% validation / 33% test overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.33, random_state=SPLIT_SEED
)

#### Build LinearLearner Classifier 

In [4]:
# S3 key prefix under which this notebook stores its artifacts.
prefix = "readmissions/linreg"

# Local directory used for files that are uploaded to / downloaded from S3.
data_dir = './data/'
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(data_dir, exist_ok=True)

In [17]:
## Write the test features to the local data directory ##
# No header and no index: each line is one comma-separated feature row.
# X_test is already a DataFrame (it comes from splitting the DataFrame X).
test_csv_path = os.path.join(data_dir, 'test.csv')
X_test.to_csv(test_csv_path, header=False, index=False)

## Upload the file to the S3 bucket; the returned location is used by the batch transform ##
test_location = session.upload_data(test_csv_path, key_prefix=prefix)

In [6]:
def upload_recordio(features, labels, channel, key="recordio-pb-data"):
    """Serialize a dataset with write_numpy_to_dense_tensor and upload it to S3.

    Args:
        features: DataFrame of feature columns (cast to float32).
        labels: DataFrame whose first column holds the labels (cast to float32).
        channel: Folder name under ``prefix`` to upload into, e.g. "train".
        key: Object name inside the channel folder.

    Returns:
        The ``s3://`` URI of the uploaded object.
    """
    # S3 keys always use "/" as separator, so build the key explicitly
    # instead of relying on the platform-dependent os.path.join.
    object_key = f"{prefix}/{channel}/{key}"

    buf = io.BytesIO()
    smac.write_numpy_to_dense_tensor(
        buf,
        features.values.astype("float32"),
        labels.values[:, 0].astype("float32"),
    )
    # Rewind so the upload starts reading from the beginning of the buffer.
    buf.seek(0)

    boto3.resource("s3").Bucket(bucket).Object(object_key).upload_fileobj(buf)
    return f"s3://{bucket}/{object_key}"


key = "recordio-pb-data"

s3_train_data = upload_recordio(X_train, y_train, "train", key)
print(f"uploaded training data location: {s3_train_data}")

s3_val_data = upload_recordio(X_val, y_val, "val", key)
print(f"uploaded validation data location: {s3_val_data}")

uploaded training data location: s3://sagemaker-us-east-1-345989147144/readmissions/linreg/train/recordio-pb-data
uploaded validation data location: s3://sagemaker-us-east-1-345989147144/readmissions/linreg/val/recordio-pb-data


In [7]:
## Resolve the Linear Learner container image and build the estimator ##
container = get_image_uri(session.boto_region_name, 'linear-learner')

# S3 location passed to the estimator as its output path.
output_location = f's3://{session.default_bucket()}/{prefix}/output'

linear = sagemaker.estimator.Estimator(
    container,
    role,
    train_instance_count=1,
    train_instance_type="ml.c4.xlarge",
    output_path=output_location,
    sagemaker_session=session,
)

'get_image_uri' method will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


In [8]:
## Set default hyperparameters ##
# Derive feature_dim from the training data instead of hard-coding 105,
# so the cell stays correct if the number of feature columns changes.
linear.set_hyperparameters(
    feature_dim=X_train.shape[1],
    predictor_type="binary_classifier",
)

In [9]:
## Define the tunable hyperparameter ranges and build the tuner ##
from sagemaker.tuner import IntegerParameter, ContinuousParameter, CategoricalParameter, HyperparameterTuner

# Search space handed to the tuner.
tunable_ranges = {
    'wd': ContinuousParameter(0.1, 1.0),
    'l1': ContinuousParameter(0.1, 1.0),
    'learning_rate': ContinuousParameter(0.1, 1.0),
    'mini_batch_size': IntegerParameter(100, 5000),
    'use_bias': CategoricalParameter([True, False]),
}

# Up to 20 training jobs, at most 3 in parallel, maximizing the
# validation accuracy metric.
lin_hyperparameter_tuner = HyperparameterTuner(
    estimator=linear,
    objective_metric_name='validation:binary_classification_accuracy',
    objective_type='Maximize',
    max_jobs=20,
    max_parallel_jobs=3,
    hyperparameter_ranges=tunable_ranges,
)

In [10]:
## Start tuning on the uploaded train/validation channels and block until it finishes ##
tuning_channels = {'train': s3_train_data, 'validation': s3_val_data}
lin_hyperparameter_tuner.fit(tuning_channels)
lin_hyperparameter_tuner.wait()

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.


......................................................................................................................................................................................................................................................................................................................................................................................!


In [13]:
## Show hyperparameters and performance of the best training job ##
best_job = lin_hyperparameter_tuner.best_training_job()

# One row per training job launched by the tuner.
analytics = lin_hyperparameter_tuner.analytics()
analytics_df = analytics.dataframe()

# Last expression of the cell: display only the row of the best job.
analytics_df.loc[analytics_df['TrainingJobName'] == best_job]

Unnamed: 0,l1,learning_rate,mini_batch_size,use_bias,wd,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
8,0.200825,0.205698,4331.0,True,0.10565,linear-learner-220104-2108-012-2611030e,Completed,0.612453,2022-01-04 21:25:03+00:00,2022-01-04 21:26:25+00:00,82.0


In [18]:
## Re-create an estimator from the tuner's best training job ##
best_job_name = lin_hyperparameter_tuner.best_training_job()
lin_best = sagemaker.estimator.Estimator.attach(best_job_name)

## Run a batch transform job over the test set that was uploaded to S3 ##
lin_transformer = lin_best.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
)
# The input is CSV and is split on line boundaries.
lin_transformer.transform(
    test_location,
    content_type='text/csv',
    split_type='Line',
)
lin_transformer.wait()

Parameter image_name will be renamed to image_uri in SageMaker Python SDK v2.


2022-01-04 21:26:25 Starting - Preparing the instances for training
2022-01-04 21:26:25 Downloading - Downloading input data
2022-01-04 21:26:25 Training - Training image download completed. Training in progress.
2022-01-04 21:26:25 Uploading - Uploading generated training model
2022-01-04 21:26:25 Completed - Training job completed[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/04/2022 21:26:13 INFO 140250267572032] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'l

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Using already existing model: linear-learner-220104-2108-012-2611030e


...................................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loading entry points[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded request iterator application/json[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded request iterator application/jsonlines[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded request iterator application/x-recordio-protobuf[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded request iterator text/csv[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded response encoder application/json[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded response encoder application/jsonlines[0m
[34m[01/04/2022 21:58:27 INFO 140196092274496] loaded response encoder application

In [23]:
## Download tranform output from s3 location ##
!aws s3 cp --recursive $lin_transformer.output_path $data_dir/linreg_results

download: s3://sagemaker-us-east-1-345989147144/linear-learner-220104-2108-012-2611030e-2022-01-04-21-52-44-276/test.csv.out to data/linreg_results/test.csv.out


In [28]:
## Calculate test accuracy and auc performance ##
# roc_auc_score is imported here explicitly: no visible cell imports it, so
# on a fresh kernel the AUC line raised NameError.
from sklearn.metrics import accuracy_score, roc_auc_score

results = pd.read_csv(os.path.join(data_dir, 'linreg_results/test.csv.out'), header=None)

# Column 0: the text after the first ':' is parsed as the integer label.
# Column 1: the text between ':' and '}' is parsed as the float score.
# NOTE(review): presumably each line looks like
# {"predicted_label":1,"score":0.6} -- confirm against the transform output.
predictions = [int(p.split(':')[1]) for p in results[0]]
probs = [float(p.split(':')[1].split('}')[0]) for p in results[1]]

# Row order of the output is assumed to match y_test, since test.csv was
# written from X_test without shuffling -- TODO confirm.
print('LinLearner test accuracy: {}'.format(accuracy_score(y_test, predictions)))
print('LinLearner test auc: {}'.format(roc_auc_score(y_test, probs)))

LinLearner test accuracy: 0.6038706754263892
LinLearner test auc: 0.6467501458732036
