# Create SageMaker session and role

In [None]:
import sagemaker
from sagemaker import get_execution_role

# S3 bucket and key prefix used for the input data and output paths in this notebook
s3_bucket = 'srt-sm'
prefix = 'Scikit-LinearLearner-pipeline-srt'

# Session object passed to the estimators and the predictor below
sagemaker_session = sagemaker.Session()

# SageMaker-compatible role used by this Notebook Instance
role = get_execution_role()

# Create a SageMaker Scikit estimator 
To run our Scikit-learn training script on SageMaker, we construct a `sagemaker.sklearn.estimator.SKLearn` estimator, which accepts several constructor arguments:

 - **entry_point**: The path to the Python script SageMaker runs for training and prediction.
 - **source_dir**: Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). The structure within this directory is preserved when training on Amazon SageMaker.
 - **role**: Role ARN
 - **train_instance_type (optional)**: The type of SageMaker instances for training. Note: Because Scikit-learn does not natively support GPU training, SageMaker Scikit-learn does not currently support training on GPU instance types.
 - **sagemaker_session (optional)**: The session used to train on SageMaker.
 - **output_path (optional)**: s3 location where you want the training result (model artifacts and optional output files) saved. If not specified, results are stored to a default bucket. If the bucket with the specific name does not exist, the estimator creates the bucket during the fit() method execution.
 

In [None]:
from time import gmtime, strftime
from sagemaker.sklearn.estimator import SKLearn

# Training/inference script and the directory that contains it
entry_point = 'sklearn_featureizer.py'
source_dir = 'pipeline'

# All artifacts of this run are written below <bucket>/<prefix>/ll_training_output/
s3_ll_output_key_prefix = "ll_training_output"
preprocessor_output_path = f's3://{s3_bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_preprocessor'

# Scikit-learn estimator that fits the preprocessing (featurizer) step
sklearn_preprocessor = SKLearn(
    entry_point=entry_point,
    source_dir=source_dir,
    role=role,
    train_instance_type="ml.c4.xlarge",
    sagemaker_session=sagemaker_session,
    output_path=preprocessor_output_path,
)

# Raw training CSV in S3
train_input = 's3://{}/{}/srt_train.csv'.format(s3_bucket, prefix)

In [None]:
# Launch the training job for the preprocessor, feeding the raw CSV on the 'train' channel
sklearn_preprocessor.fit(inputs={'train': train_input})

## Batch transform the training data 
With the preprocessor fitted, we can now preprocess our training data. We'll use **batch transform**, storing the output right back into s3.

In [None]:
# Define a SKLearn Transformer from the trained SKLearn Estimator.
# Its output is written under the 'll_transformer' key of this run's output prefix.
transformer_output_path = 's3://{}/{}/{}/{}'.format(s3_bucket, prefix, s3_ll_output_key_prefix, 'll_transformer')

transformer = sklearn_preprocessor.transformer(instance_count=1,
                                               instance_type='ml.m4.xlarge',
                                               assemble_with='Line',
                                               # MIME type must be exact: the original ' text/csv'
                                               # carried a stray leading space
                                               accept='text/csv',
                                               output_path=transformer_output_path)

In [None]:
# Run the fitted preprocessor over the raw training CSV as a batch transform job
transformer.transform(data=train_input, content_type='text/csv')
print(f'Waiting for transform job: {transformer.latest_transform_job.job_name}')

# Block until the transform job has finished
transformer.wait()

# The transformed data is stored under the transformer's output path
preprocessed_train = transformer.output_path
print(f"Transform job done. Outpath:  {preprocessed_train}")

# Fit a LinearLearner
Here we'll use our preprocessed training data to fit a LinearLearner Model.

In [None]:
import boto3
from sagemaker.amazon.amazon_estimator import get_image_uri

# Look up the linear-learner container image for the region of the default boto3 session
ll_image = get_image_uri(region_name=boto3.Session().region_name,
                         repo_name='linear-learner')

In [None]:
# We are working with a subset of a larger dataset, so the preprocessed output won't have the
# 100 dimensions from the chi-2 selection. We therefore need to read the actual number of
# columns in order to specify the feature_dim hyperparameter of the linear learner.

import pandas as pd

train_out = f"{preprocessed_train}/srt_train.csv.out"
# header=None: the transform output is presumably headerless feature rows (TODO confirm), so
# the first record must be kept as data instead of being consumed as column names.
# The column count, and therefore feature_dim, is the same either way.
train_df = pd.read_csv(train_out, header=None)
# minus 1 to exclude the labels
feature_dim = train_df.shape[1] - 1

In [None]:
# Model artifacts of the linear learner go under the 'll_estimator' key of this run's output prefix
estimator_output_path = f's3://{s3_bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_estimator'

# Generic estimator wrapping the linear-learner container image
ll_estimator = sagemaker.estimator.Estimator(
    ll_image,
    role,
    train_instance_count=1,
    train_instance_type='ml.m4.2xlarge',
    train_volume_size=20,
    train_max_run=3600,
    input_mode='File',
    output_path=estimator_output_path,
    sagemaker_session=sagemaker_session,
)

# feature_dim comes from the column count of the preprocessed training data
ll_estimator.set_hyperparameters(
    feature_dim=feature_dim,
    predictor_type='binary_classifier',
    mini_batch_size=5,
)

# Train on everything under the batch transform output prefix, read as CSV
ll_train_data = sagemaker.session.s3_input(
    preprocessed_train,
    distribution='FullyReplicated',
    content_type='text/csv',
    s3_data_type='S3Prefix',
)

data_channels = {'train': ll_train_data}

ll_estimator.fit(inputs=data_channels, logs=True)

# Set up the inference pipeline

We can use the Pipeline Model here. This sets up a sequence of models in a single endpoint; in this example, we configure our pipeline model with the fitted Scikit-learn inference model and the fitted Linear Learner model.

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from time import gmtime, strftime

# UTC timestamp used to build the model and endpoint names
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

model_name = f'inference-pipeline-{timestamp_prefix}'
endpoint_name = f'inference-pipeline-ep-{timestamp_prefix}'

# Inference models created from the two fitted estimators
scikit_learn_inference_model = sklearn_preprocessor.create_model()
linear_learner_model = ll_estimator.create_model()

# Order matters: the preprocessor comes first, the linear learner second
sm_model = PipelineModel(
    name=model_name,
    role=role,
    models=[scikit_learn_inference_model, linear_learner_model],
)

# Deploy the pipeline model to a single endpoint
sm_model.deploy(
    initial_instance_count=1,
    instance_type='ml.c4.xlarge',
    endpoint_name=endpoint_name,
)

# Make a request to the pipeline endpoint 
Here we'll use the deployed model to get predictions for our test data. 

Below, the `content_type` field configures the first container, while the `accept` field configures the last container.

We make our request with the payload in `text/csv` format, since that is what our script currently supports. If other formats need to be supported, this would have to be added to the `output_fn()` method in our entry point. Note that we set the `accept` to `application/json`, since LinearLearner does not support `text/csv`.

In [None]:
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

# Test CSV stored next to the training data in S3
test_input = 's3://{}/{}/srt_test.csv'.format(s3_bucket, prefix)

# Load the test set into a DataFrame (columns '0' and '1' are used in later cells)
test_df = pd.read_csv(test_input)

def format_as_csv(text):
    """Return `text` with every comma and newline character removed.

    The request payload is sent as text/csv, so these two characters are
    stripped from the text before it is placed into the payload.
    """
    # Delete both characters in a single pass over the string
    return text.translate(str.maketrans('', '', ',\n'))

# Sanitize every text entry in column '1', then join the entries into a single payload
# string, separating them with a comma followed by a newline
test_df['1'] = test_df['1'].apply(format_as_csv)
x_test = ",\n".join(list(test_df['1']))

In [None]:
import json

# Predictor bound to the deployed pipeline endpoint: sends CSV, asks for JSON back
predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON,
)

# No deserializer is configured on the predictor, so the response is parsed as JSON here
predictions_str = predictor.predict(x_test)
predictions_dict = json.loads(predictions_str)

In [None]:
from sklearn.metrics import accuracy_score

# Column '0' of the test set is used as the true labels
y_true = test_df['0']
# Pull the predicted label out of each entry of the endpoint response
y_pred = [prediction['predicted_label'] for prediction in predictions_dict['predictions']]

accuracy_score(y_true, y_pred)

# Delete Endpoint 
Once we are finished with the endpoint, we clean up the resources since the endpoint incurs costs for as long as it is alive.

In [None]:
# Low-level SageMaker client taken from the session's underlying boto session
sm_client = sagemaker_session.boto_session.client(service_name='sagemaker')

# Delete the endpoint that was deployed for the inference pipeline
sm_client.delete_endpoint(EndpointName=endpoint_name)