# Create SageMaker session and role

In [None]:
import sagemaker
from sagemaker import get_execution_role

# S3 bucket and key prefix used for this notebook's data and outputs.
s3_bucket = 'srt-sm'
prefix = 'Scikit-LinearLearner-pipeline-srt'

# Session object handed to the SageMaker SDK calls in the cells below.
sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# Create a SageMaker Scikit estimator 
To run our Scikit-learn training script on SageMaker, we construct a `sagemaker.sklearn.estimator.SKLearn` estimator, which accepts several constructor arguments:

 - **entry_point**: The path to the Python script SageMaker runs for training and prediction.
 - **source_dir**: Path (absolute or relative) to a directory with any other training source code dependencies aside from the entry point file (default: None). The structure within this directory is preserved when training on Amazon SageMaker.
 - **role**: Role ARN
 - **train_instance_type (optional)**: The type of SageMaker instances for training. Note: Because Scikit-learn does not natively support GPU training, SageMaker Scikit-learn does not currently support training on GPU instance types.
 - **sagemaker_session (optional)**: The session used to train on SageMaker.
 - **output_path (optional)**: S3 location where you want the training result (model artifacts and optional output files) saved. If not specified, results are stored to a default bucket. If the bucket with the specific name does not exist, the estimator creates the bucket during the fit() method execution.
 

In [None]:
from time import gmtime, strftime
from sagemaker.sklearn.estimator import SKLearn

# Training script and the directory that contains it.
source_dir = 'pipeline'
entry_point = 'sklearn_featureizer.py'

# S3 location handed to the estimator as its output path.
s3_ll_output_key_prefix = "ll_training_output"
model_output_path = f's3://{s3_bucket}/{prefix}/{s3_ll_output_key_prefix}/ll_preprocessor'

# Scikit-learn estimator that runs the entry-point script as a training job.
grid_search = SKLearn(
    entry_point=entry_point,
    source_dir=source_dir,
    role=role,
    sagemaker_session=sagemaker_session,
    train_instance_type="ml.c4.xlarge",
    output_path=model_output_path,
)

# S3 location of the training data.
train_input = f's3://{s3_bucket}/{prefix}/srt_train.csv'

In [None]:
# Start training with the data bound to the 'train' channel, with log output enabled.
grid_search.fit(inputs={'train': train_input}, logs=True)

# Deploy the Model

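We can use the Pipeline Model here. This sets up a sequence of models in a single endpoint; in this example, we configure our pipeline model with the fitted Scikit-learn inference model and the fitted Linear Learner model. Note that the cell below deploys only the model created from the fitted Scikit-learn estimator (`grid_search.create_model`); `PipelineModel` is imported but not used in that cell.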

In [None]:
from sagemaker.model import Model
from sagemaker.pipeline import PipelineModel
from time import gmtime, strftime

# UTC timestamp appended to the model and endpoint names below.
timestamp_prefix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())

model_name = f'inference-pipeline-{timestamp_prefix}'
endpoint_name = f'inference-pipeline-ep-{timestamp_prefix}'

# Create a model from the fitted estimator and deploy it to a single instance.
model = grid_search.create_model(role=role)
model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.c4.xlarge',
    initial_instance_count=1,
)

# Make a request to the endpoint 
Here we'll use the deployed model to get predictions for our test data. 

We need to make our request with the payload in `text/csv` format, since that is what our script currently supports. If other request formats need to be supported, this would have to be added to the `input_fn()` function in our entry point. Note that we set `accept` to `application/json` so that the response is returned as JSON.

In [None]:
import pandas as pd
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

# S3 location of the test data, loaded into a DataFrame.
test_input = f's3://{s3_bucket}/{prefix}/srt_test.csv'
test_df = pd.read_csv(filepath_or_buffer=test_input)

def format_as_csv(text):
    """Return ``text`` with every comma and newline character removed.

    The request payload is sent as text/csv, so the text is sanitized
    first.
    """
    sanitized = text
    for unwanted in (",", "\n"):
        sanitized = sanitized.replace(unwanted, "")
    return sanitized

# Strip commas and newlines from every value in column '1' before it is sent as CSV.
test_df['1'] = test_df['1'].apply(format_as_csv)

In [None]:
import json

# Client for the deployed endpoint: CSV request content type, JSON accept type.
predictor = RealTimePredictor(
    endpoint=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=csv_serializer,
    content_type=CONTENT_TYPE_CSV,
    accept=CONTENT_TYPE_JSON,
)

In [None]:
# Request an inference for a single sample: the first value of column '1'.
sample = test_df['1'].iloc[0]

prediction_str = predictor.predict(sample)
# Parse this cell's own response (`prediction_str`). `predictions_str` belongs
# to the multi-sample cell and is not defined yet when the notebook is run
# top to bottom, so referencing it here raises NameError.
prediction_dict = json.loads(prediction_str)

In [None]:
# Request inferences for every value of column '1' in a single call.
samples = test_df['1'].tolist()

predictions_str = predictor.predict(data=samples)
# Decode the JSON response into a Python object.
predictions_dict = json.loads(predictions_str)

In [None]:
# Evaluate predictions
from sklearn.metrics import accuracy_score

# Labels from column '0' versus the 'predicted_label' of each returned prediction.
y_true = test_df['0']
y_pred = [entry['predicted_label'] for entry in predictions_dict['predictions']]

# Left as the final expression so the notebook displays the score.
accuracy_score(y_true, y_pred)

# Delete Endpoint 
Once we are finished with the endpoint, we clean up the resources since the endpoint incurs costs for as long as it is alive.

In [None]:
# Build a SageMaker API client from the session's boto session and delete the endpoint.
sm_client = sagemaker_session.boto_session.client(service_name='sagemaker')
sm_client.delete_endpoint(EndpointName=endpoint_name)