# Sagemaker Processor

This notebook generates the train, validation and inference files using the processor image previously uploaded to ECR.

## Import modules

In [1]:
import boto3
import time
from datetime import datetime
from sagemaker import get_execution_role
from sagemaker.processing import Processor, ProcessingInput, ProcessingOutput

## Setup

Modify according to your configurations.

In [2]:
# Name of the S3 bucket used for the raw data and the processing outputs
bucket = "hermione-sagemaker"

In [3]:
# Make boto3's default session use this AWS region; the same value is also
# used to build the ECR image URI
region_name = "us-east-1"
boto3.setup_default_session(region_name=region_name)

In [4]:
# Execution role returned by the SageMaker SDK; it is passed to the Processor
role = get_execution_role()

In [5]:
# AWS account ID of the current caller, looked up through STS
account_number = (
    boto3.client("sts")
    .get_caller_identity()["Account"]
)

In [6]:
# Repository name and full URI of the processor image previously uploaded to ECR
image_name = "hermione-processor"
image_uri = (
    f"{account_number}.dkr.ecr.{region_name}.amazonaws.com/{image_name}"
)

In [7]:
# S3 locations read and written by the train and inference processing jobs.
# Each entry maps a short key to a prefix inside the bucket.
paths = {
    key: f"s3://{bucket}/{prefix}"
    for key, prefix in (
        ("train_raw", "TRAIN_RAW"),
        ("expectations", "PREPROCESSING/EXPECTATIONS"),
        ("preprocessing", "PREPROCESSING/PREPROCESSING"),
        ("train_processed", "PREPROCESSING/TRAIN_PROCESSED"),
        ("val_processed", "PREPROCESSING/VAL_PROCESSED"),
        ("test_raw", "TEST_RAW"),
        ("inference_processed", "PREPROCESSING/INFERENCE_PROCESSED"),
        ("validations", "PREPROCESSING/VALIDATIONS"),
    )
}

In [8]:
# Upload the local raw train and test CSV files to their S3 prefixes
s3 = boto3.resource("s3")
s3.Bucket(bucket).upload_file(
    Filename="../../../data/raw/raw_train.csv",
    Key="TRAIN_RAW/raw_train.csv",
)
s3.Bucket(bucket).upload_file(
    Filename="../../../data/raw/raw_test.csv",
    Key="TEST_RAW/raw_test.csv",
)

In [9]:
# Instance types used by the train and inference processors
instance_type_train = "ml.t3.medium"
instance_type_inference = "ml.t3.medium"

## Processor - Train

In [10]:
# Single input: the raw training data in S3, made available to the container
# under /opt/ml/processing/input/raw_data
inputs = [
    ProcessingInput(
        input_name="raw_data",
        source=paths["train_raw"],
        destination="/opt/ml/processing/input/raw_data",
    ),
]

In [11]:
# Outputs saved to S3: the great expectation object, the preprocessing object,
# the processed training data and the processed validation data.
# Each tuple is (output name, container directory, key into `paths`).
outputs = [
    ProcessingOutput(
        output_name=name,
        source=container_dir,
        destination=paths[paths_key],
    )
    for name, container_dir, paths_key in (
        ("expectations", "/opt/ml/processing/output/expectations", "expectations"),
        ("preprocessing", "/opt/ml/processing/output/preprocessing", "preprocessing"),
        ("train_data", "/opt/ml/processing/output/processed/train", "train_processed"),
        ("val_data", "/opt/ml/processing/output/processed/val", "val_processed"),
    )
]

In [12]:
# Processor that runs the ECR image on one instance of the training instance type
processor = Processor(
    role=role,
    image_uri=image_uri,
    instance_type=instance_type_train,
    instance_count=1,
)

In [13]:
%%time
# Run the processor on the training inputs/outputs, passing "--step train"
# as the container arguments
processor.run(
    arguments=["--step", "train"],
    inputs=inputs,
    outputs=outputs,
)


Job Name:  hermione-processor-2021-07-22-19-53-22-425
Inputs:  [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TRAIN_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'expectations', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/output/expectations', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'preprocessing', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/output/preprocessing', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'train_data', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/TRAIN_PROCESSED', 'LocalPath': '/opt/ml/processing/output/processed/train', 'S3UploadMode': 'EndOfJob'}}

## Processor - Inference

In [10]:
# Inputs: the raw test data in S3 plus the preprocessing and great expectation
# objects created during training. Each name is both the input name and the
# last component of the container directory.
inputs = [
    ProcessingInput(
        input_name=name,
        source=paths[paths_key],
        destination=container_dir,
    )
    for name, paths_key, container_dir in (
        ("raw_data", "test_raw", "/opt/ml/processing/input/raw_data"),
        ("preprocessing", "preprocessing", "/opt/ml/processing/input/preprocessing"),
        ("expectations", "expectations", "/opt/ml/processing/input/expectations"),
    )
]

In [11]:
# Outputs saved to S3: the processed inference data and the validations.
# Each tuple is (output name, container directory, key into `paths`).
outputs = [
    ProcessingOutput(
        output_name=name,
        source=container_dir,
        destination=paths[paths_key],
    )
    for name, container_dir, paths_key in (
        ("inference_data", "/opt/ml/processing/output/processed/inference", "inference_processed"),
        ("validations", "/opt/ml/processing/output/validations", "validations"),
    )
]

In [12]:
# Processor that runs the ECR image on one instance of the inference instance type
processor = Processor(
    role=role,
    image_uri=image_uri,
    instance_type=instance_type_inference,
    instance_count=1,
)

In [13]:
%%time
# Run the processor on the inference inputs/outputs, passing "--step test"
# as the container arguments
processor.run(
    arguments=["--step", "test"],
    inputs=inputs,
    outputs=outputs,
)


Job Name:  hermione-processor-2021-07-22-19-40-48-848
Inputs:  [{'InputName': 'raw_data', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/TEST_RAW', 'LocalPath': '/opt/ml/processing/input/raw_data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'preprocessing', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/PREPROCESSING', 'LocalPath': '/opt/ml/processing/input/preprocessing', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'expectations', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://hermione-sagemaker/PREPROCESSING/EXPECTATIONS', 'LocalPath': '/opt/ml/processing/input/expectations', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'inference_data',