# Machine Learning Pipelines

## Machine Learning Orchestration with Amazon SageMaker Pipelines

In [2]:
import os
import sagemaker
import logging
import boto3
import time
import pandas as pd
import json
import botocore
from botocore.exceptions import ClientError


# ---------------------------------------------------------------------------
# boto3 clients and the SageMaker session built on top of them
# ---------------------------------------------------------------------------
# A single botocore Config (carrying an extra user-agent suffix) is shared by
# both low-level clients created below.
config = botocore.config.Config(user_agent_extra='bedissj-1699438736259')

# Low-level clients for the 'sagemaker' and 'sagemaker-runtime' services.
sm = boto3.client('sagemaker', config=config)
sm_runtime = boto3.client('sagemaker-runtime', config=config)

# SDK session that wraps the two clients above.
sess = sagemaker.Session(
    sagemaker_client=sm,
    sagemaker_runtime_client=sm_runtime,
)

# Values reused throughout the rest of the notebook.
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = sess.boto_region_name


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## 1. Configure Data Processing Step

In [None]:
# S3 prefix of the month-1 raw data, located in the session's default bucket.
raw_data_s3_uri = 's3://{}/data/transformed_querying/month1'.format(bucket)


In [3]:
from sagemaker.workflow.parameters import ParameterFloat, ParameterInteger, ParameterString


# ---------------- Pipeline parameters used by the processing step ----------------
# Its default value is used further down as the destination path of the raw-data
# processing input.
input_data = ParameterString(default_value='/opt/ml/processing/input/data', name='input-data')


# Root path under which the train/validation/test output directories are built.
# The parameter carries its own name, 'output-data': it previously reused
# 'input-data', which duplicated the name of the `input_data` parameter.
output_data = ParameterString(
    name='output-data',
    default_value='/opt/ml/processing/output'
)

# Hold-out sizes for the validation and test splits.
# NOTE(review): presumably fractions (0.1 == 10 %) — confirm against ./src/processing.py.
validation_split_percentage = ParameterFloat(default_value=0.1, name='validation-split-percentage')
test_split_percentage = ParameterFloat(default_value=0.2, name='test-split-percentage')

# Feature group targeted by the processing script.
feature_group_name = ParameterString(
    default_value='bank-churn-mon1-feature-group',
    name='feature-group-name',
)

# No default offline-store prefix is configured.
feature_store_offline_prefix = ParameterString(
    default_value=None,
    name='feature-store-offline-prefix',
)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput


# ---------------- Inputs of the processing job ----------------
# The raw-data S3 prefix is delivered to the path stored as the default of the
# `input_data` parameter, using the 'ShardedByS3Key' distribution type.
processing_inputs = [
    ProcessingInput(
        source=raw_data_s3_uri,
        destination=input_data.default_value,
        input_name='bank-churn-mon1-raw-data',
        s3_data_distribution_type='ShardedByS3Key',
    )
]



# ---------------- Outputs of the processing job ----------------
# One sub-directory per data split, all beneath the default output root.
output_data_train = f'{output_data.default_value}/train'
output_data_validation = f'{output_data.default_value}/validation'
output_data_test = f'{output_data.default_value}/test'

# Each split directory becomes a named output uploaded at the end of the job.
# The output names are referenced later to wire the splits into downstream steps.
processing_outputs = [
    ProcessingOutput(output_name=split_output_name, source=split_dir, s3_upload_mode='EndOfJob')
    for split_output_name, split_dir in (
        ('bank-churn-mon1-train', output_data_train),
        ('bank-churn-mon1-validation', output_data_validation),
        ('bank-churn-mon1-test', output_data_test),
    )
]

In [4]:
from sagemaker.sklearn.processing import SKLearnProcessor


# ---------------- Processing job resources ----------------
# FRAMEWORK_VERSION is shared with the estimator and the evaluation processor below.
FRAMEWORK_VERSION = '1.0-1'
processing_instance_count = 1
processing_instance_type = 'ml.t3.medium'

# ---------------- Instantiate the SKLearn processor ----------------
# The region is exported into the job's environment as AWS_DEFAULT_REGION.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    env={'AWS_DEFAULT_REGION': region},
)

In [5]:
from sagemaker.workflow.steps import ProcessingStep


# Pipeline step that runs ./src/processing.py on the SKLearn processor.
# The script is given the *default values* of the pipeline parameters as CLI
# arguments (flag followed by the stringified value, in the order listed).
# NOTE(review): feature_store_offline_prefix defaults to None, so the script is
# handed the literal string 'None' — confirm ./src/processing.py handles that.
processing_step = ProcessingStep(
    name='data-processing',
    processor=sklearn_processor,
    code='./src/processing.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    job_arguments=[
        token
        for flag, parameter in (
            ('--input-data', input_data),
            ('--output-data', output_data),
            ('--validation-split-percentage', validation_split_percentage),
            ('--test-split-percentage', test_split_percentage),
            ('--feature-store-offline-prefix', feature_store_offline_prefix),
            ('--feature-group-name', feature_group_name),
        )
        for token in (flag, str(parameter.default_value))
    ],
)

In [6]:
from pprint import pprint

# List the top-level sections of the request generated for the processing step.
pprint(list(processing_step.arguments))

Popping out 'ProcessingJobName' from the pipeline definition by default since it will be overridden at pipeline execution time. Please utilize the PipelineDefinitionConfig to persist this field in the pipeline definition if desired.


['ProcessingResources',
 'AppSpecification',
 'RoleArn',
 'ProcessingInputs',
 'ProcessingOutputConfig',
 'Environment']


In [7]:
# Inspect the inputs section of the generated processing request.
processing_inputs_request = processing_step.arguments['ProcessingInputs']
pprint(processing_inputs_request)

Popping out 'ProcessingJobName' from the pipeline definition by default since it will be overridden at pipeline execution time. Please utilize the PipelineDefinitionConfig to persist this field in the pipeline definition if desired.


[{'AppManaged': False,
  'InputName': 'bank-churn-mon1-raw-data',
  'S3Input': {'LocalPath': '/opt/ml/processing/input/data',
              'S3CompressionType': 'None',
              'S3DataDistributionType': 'ShardedByS3Key',
              'S3DataType': 'S3Prefix',
              'S3InputMode': 'File',
              'S3Uri': 's3://sagemaker-eu-west-3-668303144976/data/transformed_querying/month1'}},
 {'AppManaged': False,
  'InputName': 'code',
  'S3Input': {'LocalPath': '/opt/ml/processing/input/code',
              'S3CompressionType': 'None',
              'S3DataDistributionType': 'FullyReplicated',
              'S3DataType': 'S3Prefix',
              'S3InputMode': 'File',
              'S3Uri': 's3://sagemaker-eu-west-3-668303144976/data-processing-fadeac1e1c3f09d53b332e0d3a5ccde0/input/code/processing.py'}}]


In [8]:
# Inspect where each named output of the processing job will be uploaded.
processing_output_request = processing_step.arguments['ProcessingOutputConfig']
pprint(processing_output_request)

Popping out 'ProcessingJobName' from the pipeline definition by default since it will be overridden at pipeline execution time. Please utilize the PipelineDefinitionConfig to persist this field in the pipeline definition if desired.


{'Outputs': [{'AppManaged': False,
              'OutputName': 'bank-churn-mon1-train',
              'S3Output': {'LocalPath': '/opt/ml/processing/output/train',
                           'S3UploadMode': 'EndOfJob',
                           'S3Uri': 's3://sagemaker-eu-west-3-668303144976/data-processing-fadeac1e1c3f09d53b332e0d3a5ccde0/output/bank-churn-mon1-train'}},
             {'AppManaged': False,
              'OutputName': 'bank-churn-mon1-validation',
              'S3Output': {'LocalPath': '/opt/ml/processing/output/validation',
                           'S3UploadMode': 'EndOfJob',
                           'S3Uri': 's3://sagemaker-eu-west-3-668303144976/data-processing-fadeac1e1c3f09d53b332e0d3a5ccde0/output/bank-churn-mon1-validation'}},
             {'AppManaged': False,
              'OutputName': 'bank-churn-mon1-test',
              'S3Output': {'LocalPath': '/opt/ml/processing/output/test',
                           'S3UploadMode': 'EndOfJob',
                   

In [9]:
# Inspect the cluster configuration of the generated processing request.
processing_resources_request = processing_step.arguments['ProcessingResources']
pprint(processing_resources_request)

Popping out 'ProcessingJobName' from the pipeline definition by default since it will be overridden at pipeline execution time. Please utilize the PipelineDefinitionConfig to persist this field in the pipeline definition if desired.


{'ClusterConfig': {'InstanceCount': 1,
                   'InstanceType': 'ml.t3.medium',
                   'VolumeSizeInGB': 30}}


## 2. Configure Training Step

In [10]:
# ---------------- Compute used for training ----------------
training_instance_count = 1
training_instance_type = 'ml.m5.large'

# ---------------- Metrics and tuning objective ----------------
# Name/regex pairs handed to both the estimator and the tuner as
# `metric_definitions`; each regex captures one numeric value.
metric_definitions = [
    dict(Name='validation:precision', Regex='val_precision: ([0-9.]+)'),
    dict(Name='validation:recall', Regex='val_recall: ([0-9.]+)'),
    dict(Name='validation:f1Score', Regex='val_f1score: ([0-9.]+)'),
    dict(Name='validation:ROCAUC', Regex='val_roc_auc: ([0-9.]+)'),
    dict(Name='validation:accuracy', Regex='val_accuracy: ([0-9.]+)'),
]

# Metric used as the tuning objective; it is one of the names defined above.
objective = 'validation:accuracy'

In [11]:
from sagemaker.parameter import IntegerParameter, CategoricalParameter, ContinuousParameter


# ---------------- Hyperparameters that are not tuned ----------------
# The random seed is exposed as a pipeline parameter (default 2024) and passed
# to the estimator under the 'random_state' key.
random_state = ParameterInteger(default_value=2024, name='random-state')

static_hyperparameters = dict(random_state=random_state)


# ---------------- Search space explored by the tuner ----------------
# Two integer ranges (one log-scaled, one linear) and one categorical choice.
hyperparameter_ranges = dict(
    n_estimators=IntegerParameter(min_value=10, max_value=200, scaling_type='Logarithmic'),
    max_depth=IntegerParameter(min_value=3, max_value=10, scaling_type='Linear'),
    criterion=CategoricalParameter(values=['gini', 'entropy']),
)

In [12]:
from sagemaker.sklearn.estimator import SKLearn


# ---------------- Instantiate the estimator ----------------
# Runs ./src/training.py with the fixed hyperparameters; it shares the framework
# version with the processors and uses the training resources defined above.
sklearn_estimator = SKLearn(
    entry_point='./src/training.py',
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_count=training_instance_count,
    instance_type=training_instance_type,
    metric_definitions=metric_definitions,
    hyperparameters=static_hyperparameters,
)

In [13]:
from sagemaker.tuner import HyperparameterTuner


# ---------------- Instantiate the hyperparameter tuner ----------------
# Bayesian search maximising the objective metric: at most 10 training jobs,
# 2 of them running in parallel. The `autotune` option is not passed.
tuner = HyperparameterTuner(
    estimator=sklearn_estimator,
    objective_metric_name=objective,
    objective_type='Maximize',
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    strategy='Bayesian',
    max_jobs=10,
    max_parallel_jobs=2,
)

In [14]:
from sagemaker.inputs import TrainingInput


# ====================== Configure training/tuning inputs ======================
# Mapping of channel name -> TrainingInput. A mapping is required to feed several
# channels: a plain list of TrainingInput objects is rejected when the tuning job
# request is built (lists are only accepted for RecordSet inputs).
# Each channel reads the S3 location of the matching processing-step output, which
# also makes the tuning step depend on the processing step.
# NOTE(review): the channel names 'train' and 'validation' are assumed — confirm
# they match the channels ./src/training.py reads.
tuning_inputs = {
    'train': TrainingInput(
        s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
            'bank-churn-mon1-train'
        ].S3Output.S3Uri,
        content_type='text/csv'
    ),

    'validation': TrainingInput(
        s3_data=processing_step.properties.ProcessingOutputConfig.Outputs[
            'bank-churn-mon1-validation'
        ].S3Output.S3Uri,
        content_type='text/csv'
    ),
}

In [15]:
from sagemaker.workflow.steps import CacheConfig

# Step caching is enabled; "PT1H" is the duration notation for one hour.
cache_config = CacheConfig(
    expire_after="PT1H",
    enable_caching=True,
)

In [16]:
from sagemaker.workflow.steps import TuningStep


# Pipeline step that launches the hyperparameter tuning job with caching enabled.
# NOTE(review): the step name is spelled 'tuninig'; it is left untouched here
# because the name identifies the step in the pipeline definition.
tuning_step = TuningStep(
    name='hyperparameter-tuninig',
    inputs=tuning_inputs,
    tuner=tuner,
    cache_config=cache_config,
)


#### Completed so far:
- `static_hyperparameters`
- `hyperparameter_ranges`

#### 2.1 Describe Tuning Step

In [69]:
# Dump every attribute stored on the tuning step's properties object.
pprint(vars(tuning_step.properties))

{'Autotune': {'_step': <sagemaker.workflow.steps.TuningStep object at 0x7f019e5a2140>, 'step_name': 'hyperparameter-tuninig', 'path': 'Autotune', '_shape_names': ['Autotune'], 'Mode': {'_step': <sagemaker.workflow.steps.TuningStep object at 0x7f019e5a2140>, 'step_name': 'hyperparameter-tuninig', 'path': 'Autotune.Mode', '_shape_names': ['AutotuneMode'], '__str__': 'AutotuneMode'}},
 'BestTrainingJob': {'_step': <sagemaker.workflow.steps.TuningStep object at 0x7f019e5a2140>, 'step_name': 'hyperparameter-tuninig', 'path': 'BestTrainingJob', '_shape_names': ['HyperParameterTrainingJobSummary'], 'TrainingJobDefinitionName': {'_step': <sagemaker.workflow.steps.TuningStep object at 0x7f019e5a2140>, 'step_name': 'hyperparameter-tuninig', 'path': 'BestTrainingJob.TrainingJobDefinitionName', '_shape_names': ['HyperParameterTrainingJobDefinitionName'], '__str__': 'HyperParameterTrainingJobDefinitionName'}, 'TrainingJobName': {'_step': <sagemaker.workflow.steps.TuningStep object at 0x7f019e5a2140

In [70]:
# Names of the attributes available on the tuning step's properties object.
vars(tuning_step.properties).keys()

dict_keys(['_step', 'step_name', 'path', '_shape_names', 'HyperParameterTuningJobName', 'HyperParameterTuningJobArn', 'HyperParameterTuningJobConfig', 'TrainingJobDefinition', 'TrainingJobDefinitions', 'HyperParameterTuningJobStatus', 'CreationTime', 'HyperParameterTuningEndTime', 'LastModifiedTime', 'TrainingJobStatusCounters', 'ObjectiveStatusCounters', 'BestTrainingJob', 'OverallBestTrainingJob', 'WarmStartConfig', 'Autotune', 'FailureReason', 'TuningJobCompletionDetails', 'ConsumedResources', 'TrainingJobSummaries', 'NextToken'])

In [77]:
# Names of the attributes available under the TrainingJobDefinition property.
vars(tuning_step.properties.TrainingJobDefinition).keys()

dict_keys(['_step', 'step_name', 'path', '_shape_names', 'DefinitionName', 'TuningObjective', 'HyperParameterRanges', 'StaticHyperParameters', 'AlgorithmSpecification', 'RoleArn', 'InputDataConfig', 'VpcConfig', 'OutputDataConfig', 'ResourceConfig', 'HyperParameterTuningResourceConfig', 'StoppingCondition', 'EnableNetworkIsolation', 'EnableInterContainerTrafficEncryption', 'EnableManagedSpotTraining', 'CheckpointConfig', 'RetryStrategy', 'Environment'])

In [37]:
# Disabled exploration: the key listing of `tuning_step.properties` shown above
# contains neither a nested `properties` attribute nor `ModelArtifacts`, so this
# lookup is not available on a tuning step. The evaluation step below obtains the
# model location through `tuning_step.get_top_model_s3_uri(...)` instead.
# tuning_step.properties.properties.ModelArtifacts.S3ModelArtifacts

## 3. Configure Model Evaluation Step

In [86]:
# Separate SKLearn processor for the evaluation step; it is configured with the
# same framework version, instance settings and environment as the
# data-processing processor.
evaluation_processor = SKLearnProcessor(
    role=role,
    framework_version=FRAMEWORK_VERSION,
    instance_count=processing_instance_count,
    instance_type=processing_instance_type,
    env={'AWS_DEFAULT_REGION': region},
)

In [87]:
from sagemaker.workflow.properties import PropertyFile


# Property file named 'EvaluationReport': it points at 'evaluation.json' inside
# the processing output called 'metrics'.
evaluation_metrics = PropertyFile(
    path='evaluation.json',
    output_name='metrics',
    name='EvaluationReport',
)

In [89]:
# Pipeline step that runs ./src/evaluate_metrics.py on the evaluation processor.
# Inputs: the model artifact of the tuning job ranked at index 0, and the test
# split produced by the processing step. Output: the 'metrics' directory, which
# is exposed to the pipeline through the `evaluation_metrics` property file.
evaluation_step = ProcessingStep(
    name='evaluate-metrics',
    processor=evaluation_processor,
    code='./src/evaluate_metrics.py',
    inputs=[
        # Model artifact selected from the tuning step.
        ProcessingInput(
            destination='/opt/ml/processing/input/model',
            source=tuning_step.get_top_model_s3_uri(top_k=0, s3_bucket=bucket),
        ),
        # Test split written by the data-processing step.
        ProcessingInput(
            destination='/opt/ml/processing/input/data',
            source=processing_step.properties.ProcessingOutputConfig.Outputs[
                'bank-churn-mon1-test'
            ].S3Output.S3Uri,
        ),
    ],
    outputs=[
        ProcessingOutput(
            source='/opt/ml/processing/output/metrics/',
            output_name='metrics',
            s3_upload_mode='EndOfJob',
        ),
    ],
    property_files=[evaluation_metrics],
)