# Build a machine learning workflow to predict on new data with Amazon SageMaker and AWS Step Functions

This notebook creates an AWS Step Functions state machine that preprocesses the inference data and then generates predictions, using the processing and inference images stored in Amazon ECR.

## Import modules

In [1]:
import uuid
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.s3 import S3Uploader
from sagemaker import get_execution_role
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput
import stepfunctions
from stepfunctions.steps import (
    Chain,
    ProcessingStep,
    TransformStep
)
from stepfunctions.inputs import ExecutionInput
from stepfunctions.workflow import Workflow

## Setup

Modify the values below to match your own configuration.

In [2]:
# Name of the S3 bucket that holds the workflow's inputs and outputs;
# every entry of `paths` below is built from it.
bucket = "hermione-sagemaker"

In [3]:
# AWS region: used for the default boto3 session and, further down, for
# the ECR image URIs.
region_name = "us-east-1"
boto3.setup_default_session(region_name=region_name)

In [4]:
# IAM role returned by get_execution_role(); it is passed as `role` to the
# Processor and to the Model created below.
role = get_execution_role()

In [5]:
# IAM role used to create and execute the Step Functions workflow (passed
# as `role` to Workflow below).
# Paste the ARN of the AmazonSageMaker-StepFunctionsWorkflowExecutionRole
# here before running the remaining cells; the value ships empty.
workflow_execution_role = ""

In [6]:
# SageMaker requires a unique name for every job; reusing one makes the
# execution fail. The job names are therefore not fixed in the state
# machine: they are declared here as string placeholders and supplied as
# inputs each time the workflow is executed.
execution_input = ExecutionInput(
    schema={"PreprocessingJobName": str, "TransformJobName": str},
)

In [7]:
# AWS account ID of the caller, obtained from STS; used below to build the
# ECR image URIs.
account_number = boto3.client("sts").get_caller_identity()["Account"]

In [8]:
# Name of the processor image previously uploaded to ECR (name only; the
# full image URI is assembled in the Preprocessing Step section).
image_name_processor = "hermione-processor"

In [9]:
# Name of the inference image previously uploaded to ECR (name only; the
# full image URI is assembled in the Inference Step section).
image_name_inference = "hermione-inference"

In [10]:
# S3 locations read and written by the workflow, all inside `bucket`.
# Each entry maps a logical name to "s3://<bucket>/<suffix>".
# NOTE(review): 'model' points at the artifact of one specific training
# job (see the job name in its suffix) — presumably it has to be updated
# whenever the model is retrained.
paths = {
    key: f"s3://{bucket}/{suffix}"
    for key, suffix in {
        "expectations": "PREPROCESSING/EXPECTATIONS",
        "preprocessing": "PREPROCESSING/PREPROCESSING",
        "test_raw": "TEST_RAW",
        "inference_processed": "PREPROCESSING/INFERENCE_PROCESSED",
        "validations": "PREPROCESSING/VALIDATIONS",
        "model": "PREPROCESSING/MODEL/Hermione-train-2021-05-26-12-41-29-505/output/model.tar.gz",
        "output_path": "PREPROCESSING/OUTPUT",
    }.items()
}

In [11]:
# Instance types: one for the processing job, one for the batch transform
# (inference) job.
instance_type_preprocessing = "ml.t3.medium"
instance_type_inference = "ml.m5.large"

## Preprocessing Step

In [12]:
# Full ECR URI of the processor image: <account>.dkr.ecr.<region>.amazonaws.com/<name>.
# No tag is appended to the URI.
image_uri_processor = (
    f"{account_number}.dkr.ecr.{region_name}.amazonaws.com"
    f"/{image_name_processor}"
)

In [13]:
# Processor that runs the ECR processor image on a single instance of the
# configured preprocessing type, under the notebook's execution role.
processor = Processor(
    image_uri=image_uri_processor,
    role=role,
    instance_type=instance_type_preprocessing,
    instance_count=1,
)

In [14]:
# Inputs of the ProcessingStep: each S3 location in `paths` is mapped to
# /opt/ml/processing/input/<channel> inside the container, and the channel
# name doubles as the input name.
inputs = [
    ProcessingInput(
        source=paths[path_key],
        destination=f"/opt/ml/processing/input/{channel}",
        input_name=channel,
    )
    for path_key, channel in (
        ("test_raw", "raw_data"),
        ("preprocessing", "preprocessing"),
        ("expectations", "expectations"),
    )
]

# Outputs of the ProcessingStep: the processed inference data and the
# validation results, each copied from its container directory to S3.
outputs = [
    ProcessingOutput(
        output_name="inference_data",
        source="/opt/ml/processing/output/processed/inference",
        destination=paths["inference_processed"],
    ),
    ProcessingOutput(
        output_name="validations",
        source="/opt/ml/processing/output/validations",
        destination=paths["validations"],
    ),
]

In [15]:
# First state of the workflow: runs the processing job. The job name is a
# placeholder filled from the execution input. "--step test" is passed to
# the container as arguments; its meaning is defined by the image itself.
processing_step = ProcessingStep(
    "SageMaker Preprocessing step",
    job_name=execution_input["PreprocessingJobName"],
    processor=processor,
    inputs=inputs,
    outputs=outputs,
    container_arguments=["--step", "test"],
)

## Inference Step

In [16]:
# Full ECR URI of the inference image: <account>.dkr.ecr.<region>.amazonaws.com/<name>.
# No tag is appended to the URI.
image_uri_inference = (
    f"{account_number}.dkr.ecr.{region_name}.amazonaws.com"
    f"/{image_name_inference}"
)

In [17]:
# S3 locations used by the inference step: the preprocessed data to score,
# the trained model artifact, and the destination of the predictions.
input_path, model_path, output_path = (
    paths["inference_processed"],
    paths["model"],
    paths["output_path"],
)

In [18]:
# SageMaker model pairing the ECR inference image with the trained model
# artifact stored in S3.
model = sagemaker.model.Model(
    image_uri=image_uri_inference,
    model_data=model_path,
    role=role,
)

In [19]:
# Batch transformer derived from the model: one instance of the inference
# type, writing its results to `output_path` and requesting CSV responses.
transformer = model.transformer(
    instance_type=instance_type_inference,
    instance_count=1,
    output_path=output_path,
    accept="text/csv",
)

In [20]:
# Second state of the workflow: runs the batch transform job over the
# preprocessed CSV data. The job name is a placeholder filled from the
# execution input, and the state waits for the job to complete.
transform_step = TransformStep(
    "Inference Step",
    transformer=transformer,
    model_name=model.name,
    job_name=execution_input["TransformJobName"],
    data=input_path,
    content_type="text/csv",
    wait_for_completion=True,
)

## Create Workflow and Execute

In [21]:
# Fail state that marks the whole workflow as failed; it is the target of
# the error handler attached to the steps below.
failed_state_sagemaker_processing_failure = stepfunctions.steps.states.Fail(
    "ML Workflow failed",
    cause="SageMakerProcessingJobFailed",
)

In [22]:
# Error handling: a task failure in either step transitions to the shared
# Fail state, so a failed SageMaker job fails the whole execution.
# NOTE(review): that Fail state's cause is "SageMakerProcessingJobFailed",
# which is therefore also reported when the transform step fails.
catch_state_processing = stepfunctions.steps.states.Catch(
    next_step=failed_state_sagemaker_processing_failure,
    error_equals=["States.TaskFailed"],
)

# The same Catch object is attached to both steps.
processing_step.add_catch(catch_state_processing)
transform_step.add_catch(catch_state_processing)

In [None]:
# Chain the two steps (preprocessing, then batch transform) into a single
# definition and create the state machine with the workflow execution role.
workflow_graph = Chain([processing_step, transform_step])
branching_workflow = Workflow(
    role=workflow_execution_role,
    definition=workflow_graph,
    name="SFN_Hermione_Inference",
)
branching_workflow.create()

In [24]:
# Build a unique name for the preprocessing job and for the batch
# transform (inference) job: a fixed prefix followed by the hex form of a
# freshly generated UUID1. Each job requires a name of its own.
preprocessing_job_name = f"Hermione-Preprocessing-{uuid.uuid1().hex}"
inference_job_name = f"Hermione-Inference-{uuid.uuid1().hex}"

In [25]:
# Start an execution, supplying the two job names declared as placeholders
# in `execution_input`.
execution = branching_workflow.execute(
    inputs={
        "PreprocessingJobName": preprocessing_job_name,
        "TransformJobName": inference_job_name,
    },
)

# wait=False: presumably returns right away instead of blocking until the
# execution finishes. The progress is then rendered in the notebook.
execution_output = execution.get_output(wait=False)
execution.render_progress()

## Results

In [26]:
import pandas as pd

# Load the predictions written by the batch transform job.
# The location is derived from paths['output_path'] (the transformer's
# output location) rather than a literal URL, so changing `bucket` in the
# Setup section is enough to point this cell at the right S3 object.
# With the default configuration this is the same URL as before.
# NOTE(review): "inference.csv.out" is the object name the notebook has
# always read — presumably the processed input file name plus ".out";
# adjust it if the preprocessing output file is renamed.
pd.read_csv(f"{paths['output_path']}/inference.csv.out")

Unnamed: 0,Survived,Age,Pclass_1,Pclass_2,Pclass_3,Sex_1,Sex_2,predict
0,1.0,0.007288,0.0,0.0,1.0,1.0,0.0,1.0
1,0.0,0.371701,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.761247,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.334004,0.0,0.0,1.0,0.0,1.0,0.0
4,0.0,0.572757,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
217,0.0,0.208344,0.0,0.0,1.0,0.0,1.0,0.0
218,0.0,0.233476,0.0,0.0,1.0,0.0,1.0,0.0
219,0.0,0.019854,0.0,0.0,1.0,1.0,0.0,1.0
220,1.0,0.220910,1.0,0.0,0.0,1.0,0.0,1.0
