# ML Model with LinearLearner

In [1]:
import os

import time
from time import gmtime, strftime

import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'


## Import the preprocessed data

In [2]:
features = pd.read_csv('data/features_completed.csv', index_col=0)
labels = pd.read_csv('data/labels_completed.csv', index_col=0)

In [3]:
features.shape, labels.shape

((63288, 11), (63288, 1))

## Create Sagemaker items

In [4]:
import sagemaker

In [5]:
# sagemaker session, role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 bucket name
bucket = sagemaker_session.default_bucket()



## Create Training, Validation and Testdata
To avoid overfitting I split the train data additional in validation data.

In [6]:
from sklearn.model_selection import train_test_split 

In [7]:
# We split the dataset into 2/3 training and 1/3 testing sets.
X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.10)



In [8]:
# import LinearLearner
from sagemaker import LinearLearner

# specify an output path
prefix = 'creditcard'
output_path = 's3://{}/{}'.format(bucket, prefix)

# instantiate LinearLearner
linear = LinearLearner(role=role,
                       instance_count=1, 
                       instance_type='ml.c4.xlarge',
                       predictor_type='binary_classifier',
                       output_path=output_path,
                       sagemaker_session=sagemaker_session,
                       epochs=15)



In [9]:
# convert features/labels to numpy
train_x_np = features.values.astype('float32')
train_y_np = labels.values.astype('float32').flatten()

train_x_np
train_y_np

array([1., 1., 1., ..., 0., 0., 0.], dtype=float32)

In [10]:
# convert features/labels to numpy
train_x_np = features.values.astype('float32')
train_y_np = labels.values.astype('float32').flatten()


#type(train_x_np)
# create RecordSet
formatted_train_data = linear.record_set(train_x_np, labels=train_y_np)

In [11]:
%%time 
# train the estimator on formatted training data
linear.fit(formatted_train_data)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-10-08 16:29:52 Starting - Starting the training job...
2021-10-08 16:29:56 Starting - Launching requested ML instancesProfilerReport-1633710592: InProgress
...
2021-10-08 16:30:45 Starting - Preparing the instances for training............
2021-10-08 16:32:46 Downloading - Downloading input data
2021-10-08 16:32:46 Training - Downloading the training image...
2021-10-08 16:33:14 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/08/2021 16:33:12 INFO 140404880299840] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '1000

In [12]:
%%time 
# deploy and create a predictor
linear_predictor = linear.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-------!CPU times: user 166 ms, sys: 1.51 ms, total: 167 ms
Wall time: 3min 32s


In [13]:
features.values.astype('float32')

array([[0.18072289, 0.46666667, 0.        , ..., 1.        , 1.        ,
        1.        ],
       [0.07228915, 0.33333334, 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.44578314, 0.4888889 , 1.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [0.4384762 , 0.3933888 , 0.        , ..., 1.        , 1.        ,
        0.        ],
       [0.7590361 , 0.3888889 , 1.        , ..., 1.        , 1.        ,
        0.        ],
       [0.02409638, 0.15555556, 0.        , ..., 1.        , 1.        ,
        0.        ]], dtype=float32)

### Helper function for evaluation


The provided function below, takes in a deployed predictor, some test features and labels, and returns a dictionary of metrics; calculating false negatives and positives as well as recall, precision, and accuracy.

In [29]:
Y_train, Y_test

(       binary_target
 8177               1
 24547              1
 53131              1
 29763              1
 47481              0
 ...              ...
 3078               0
 22529              0
 21746              1
 43860              0
 49209              1
 
 [56959 rows x 1 columns],
        binary_target
 3601               0
 32207              1
 28192              1
 4486               1
 5757               0
 ...              ...
 35862              0
 32240              1
 35337              1
 46978              0
 35469              0
 
 [6329 rows x 1 columns])

In [24]:
# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(predictor, test_features, test_labels, verbose=True):
    """
    Evaluate a model on a test set given the prediction endpoint.  
    Return binary classification metrics.
    :param predictor: A prediction endpoint
    :param test_features: Test features
    :param test_labels: Class labels for test data
    :param verbose: If True, prints a table of all performance metrics
    :return: A dictionary of performance metrics.
    """
    
    # We have a lot of test data, so we'll split it into batches of 100
    # split the test data set into batches and evaluate using prediction endpoint    
    prediction_batches = [predictor.predict(batch) for batch in np.array_split(test_features, 100)]
    
    # LinearLearner produces a `predicted_label` for each data point in a batch
    # get the 'predicted_label' for every point in a batch
    test_preds = np.concatenate([np.array([x.label['predicted_label'].float32_tensor.values[0] for x in batch]) 
                                 for batch in prediction_batches])
    
    # calculate true positives, false positives, true negatives, false negatives
    tp = np.logical_and(test_labels, test_preds).sum()
    fp = np.logical_and(1-test_labels, test_preds).sum()
    tn = np.logical_and(1-test_labels, 1-test_preds).sum()
    fn = np.logical_and(test_labels, 1-test_preds).sum()
    
    # calculate binary classification metrics
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    accuracy = (tp + tn) / (tp + fp + tn + fn)
    
    confusion = pd.crosstab(test_labels, test_preds, rownames=['actual (row)'], colnames=['prediction (col)'])
    # printing a table of metrics
    if verbose:
        print(pd.crosstab(test_labels, test_preds, rownames=['actual (row)'], colnames=['prediction (col)']))
        print("\n{:<11} {:.3f}".format('Recall:', recall))
        print("{:<11} {:.3f}".format('Precision:', precision))
        print("{:<11} {:.3f}".format('Accuracy:', accuracy))
        print()
        
    return {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn, 
            'Precision': precision, 'Recall': recall, 'Accuracy': accuracy,
           'Confustion': confusion}


In [25]:
labels.values.flatten()

array([1, 1, 1, ..., 0, 0, 0])

### Test Results

The cell below runs the `evaluate` function. 

The code assumes that you have a defined `predictor` and `test_features` and `test_labels` from previously-run cells.

In [30]:
print('Metrics for simple, LinearLearner.\n')

# get metrics for linear predictor
metrics = evaluate(linear_predictor, 
                   X_test.values.astype('float32'), 
                   Y_test.values.flatten(), 
                   verbose=True) # verbose means we'll print out the metrics


Metrics for simple, LinearLearner.

prediction (col)   0.0   1.0
actual (row)                
0                 1396  1193
1                  760  2980

Recall:     0.797
Precision:  0.714
Accuracy:   0.691



In [31]:
metrics['Confustion']

prediction (col),0.0,1.0
actual (row),Unnamed: 1_level_1,Unnamed: 2_level_1
0,1396,1193
1,760,2980


In [17]:
# Deletes a precictor.endpoint
def delete_endpoint(predictor):
        try:
            boto3.client('sagemaker').delete_endpoint(EndpointName=predictor.endpoint_name)
            print('Deleted {}'.format(predictor.endpoint))
        except:
            print('Already deleted: {}'.format(predictor.endpoint_name))

In [18]:
# delete the predictor endpoint 
delete_endpoint(linear_predictor)

Already deleted: linear-learner-2021-10-08-16-34-05-747


In [19]:
# instantiate a LinearLearner
# tune the model for a higher recall
linear_recall = LinearLearner(role=role,
                              instance_count=1, 
                              instance_type='ml.m4.xlarge',
                              predictor_type='binary_classifier',
                              output_path=output_path,
                              sagemaker_session=sagemaker_session,
                              epochs=15,
                              binary_classifier_model_selection_criteria='precision_at_target_recall', # target recall
                              target_recall=0.9) # 90% recall



In [20]:
%%time 
# train the estimator on formatted training data
linear_recall.fit(formatted_train_data)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-10-08 16:37:46 Starting - Starting the training job...
2021-10-08 16:38:09 Starting - Launching requested ML instancesProfilerReport-1633711066: InProgress
...
2021-10-08 16:38:36 Starting - Preparing the instances for training............
2021-10-08 16:40:30 Downloading - Downloading input data...
2021-10-08 16:41:12 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[10/08/2021 16:41:17 INFO 140047028090688] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_

KeyboardInterrupt: 

In [21]:
%%time 
# deploy and create a predictor
recall_predictor = linear_recall.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


-

KeyboardInterrupt: 

In [22]:
print('Metrics for tuned (recall), LinearLearner.\n')

# get metrics for tuned predictor
metrics = evaluate(recall_predictor, 
                   features.values.astype('float32'), 
                   labels.values.flatten(), 
                   verbose=True)

Metrics for tuned (recall), LinearLearner.



NameError: name 'recall_predictor' is not defined

## Delete the endpoint 

As always, when you're done evaluating a model, you should delete the endpoint. Below, I'm using the `delete_endpoint` helper function I defined earlier.

In [None]:
# delete the predictor endpoint 
delete_endpoint(recall_predictor)

In [49]:
sagemaker_session.delete_endpoint(endpoint_name='linear-learner-2021-09-21-11-07-47-455')

In [50]:
# instantiate a LinearLearner

# include params for tuning for higher recall
# *and* account for class imbalance in training data
linear_balanced = LinearLearner(role=role,
                                instance_count=1, 
                                instance_type='ml.c4.xlarge',
                                predictor_type='binary_classifier',
                                output_path=output_path,
                                sagemaker_session=sagemaker_session,
                                epochs=15,
                                binary_classifier_model_selection_criteria='precision_at_target_recall', # target recall
                                target_recall=0.9,
                                positive_example_weight_mult='balanced')


### EXERCISE: Train the balanced estimator

Fit the new, balanced estimator on the formatted training data.

In [51]:
%%time 
# train the estimator on formatted training data
linear_balanced.fit(formatted_train_data)

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-09-21 11:25:46 Starting - Starting the training job...
2021-09-21 11:25:48 Starting - Launching requested ML instancesProfilerReport-1632223546: InProgress
......
2021-09-21 11:27:16 Starting - Preparing the instances for training.........
2021-09-21 11:28:39 Downloading - Downloading input data...
2021-09-21 11:29:17 Training - Downloading the training image.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[09/21/2021 11:29:23 INFO 139736075499328] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias

### EXERCISE: Deploy and evaluate the balanced estimator

Deploy the balanced predictor and evaluate it. Do the results match with your expectations?

In [55]:
%%time 
# deploy and create a predictor
balanced_predictor = linear_balanced.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


------------!CPU times: user 281 ms, sys: 12.6 ms, total: 293 ms
Wall time: 6min 3s


In [57]:
print('Metrics for balanced, LinearLearner.\n')

# get metrics for balanced predictor
metrics = evaluate(balanced_predictor, 
                   features.values.astype('float32'), 
                   targets.values.flatten(), 
                   verbose=True)

Metrics for balanced, LinearLearner.

prediction (col)    0.0    1.0
actual (row)                  
0                 12959  20204
1                  4233  38881

Recall:     0.902
Precision:  0.658
Accuracy:   0.680



## Delete the endpoint 

When you're done evaluating a model, you should delete the endpoint.

In [60]:
sagemaker_session.delete_endpoint(endpoint_name=balanced_predictor.endpoint_name)

In [65]:
sagemaker_session.delete_endpoint(endpoint_name='linear-learner-2021-09-21-11-32-07-490')

In [61]:
# delete the predictor endpoint 
delete_endpoint(balanced_predictor)

Already deleted: linear-learner-2021-09-21-11-45-05-238
