# Dairy Generative Formulation
### Model training and deployment using SageMaker Pipelines

In [1]:
import os
import sagemaker
import logging
import boto3
import time
import pandas as pd
import json
import botocore
from botocore.exceptions import ClientError


# ========================== AWS clients and SageMaker session ==========================
# S3 bucket handed to the SageMaker session as its default bucket.
bucket = "dairy-generative-formulation"

# botocore configuration shared by both low-level clients below.
config = botocore.config.Config(user_agent_extra='bedissj-1699438736259')

# Low-level boto3 clients for the SageMaker service and the SageMaker runtime.
sm = boto3.client(service_name='sagemaker', config=config)
sm_runtime = boto3.client(service_name='sagemaker-runtime', config=config)

# High-level SDK session wired to the two clients above.
sess = sagemaker.Session(
    default_bucket=bucket,
    sagemaker_client=sm,
    sagemaker_runtime_client=sm_runtime,
)

# Region of the session and the execution role used by the jobs launched below.
region = sess.boto_region_name
role = sagemaker.get_execution_role()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [4]:
# S3 prefix under the project bucket that is used as the source of the processing input.
raw_data_s3_uri = f"s3://{bucket}/data/raw/"
print(raw_data_s3_uri)

s3://dairy-generative-formulation/data/raw/


### 1. Processing step

In [5]:
from sagemaker.workflow.parameters import ParameterString, ParameterFloat, ParameterInteger


# Directory used as the destination of the raw-data processing input.
input_data = ParameterString(name='input-data', default_value='/opt/ml/processing/input/data')

# Root directory under which the train / validation / test / encoder output folders are defined.
output_data = ParameterString(name='output-data', default_value='/opt/ml/processing/output')

# Split fractions forwarded to the processing script as command-line arguments.
validation_split_percentage = ParameterFloat(name='validation-split-percentage', default_value=0.1)
test_split_percentage = ParameterFloat(name='test-split-percentage', default_value=0.2)

# Feature group name forwarded to the processing script.
feature_group_name = ParameterString(
    name='feature-group-name',
    default_value='dairy-generative-formulation-feature-group',
)

# NOTE(review): the default is None, and the processing cell passes str(default_value),
# so the script receives the literal text 'None' — confirm the script expects that.
feature_store_offline_prefix = ParameterString(name='feature-store-offline-prefix', default_value=None)

In [6]:
from sagemaker.processing import ProcessingInput, ProcessingOutput


# ========================== Processing Inputs ==========================
# One input: everything under `raw_data_s3_uri`, delivered to the directory named by
# the default value of the `input_data` parameter.
raw_data_input = ProcessingInput(
    source=raw_data_s3_uri,
    destination=input_data.default_value,
    input_name='dairy-generative-formulation-raw-data',
    s3_data_distribution_type='ShardedByS3Key',
)
processing_inputs = [raw_data_input]


# ========================== Processing Outputs ==========================
# Every output directory is a sub-folder of the `output_data` default value.
output_root = output_data.default_value
output_data_train = output_root + '/train'
output_data_validation = output_root + '/validation'
output_data_test = output_root + '/test'
output_encoder = output_root + '/encoder'

# One ProcessingOutput per (source directory, output name) pair, in the order
# train, validation, test, encoder. Later cells index this list by position.
processing_outputs = [
    ProcessingOutput(source=source_dir, output_name=channel_name, s3_upload_mode='EndOfJob')
    for source_dir, channel_name in (
        (output_data_train, 'dairy-generative-formulation-train'),
        (output_data_validation, 'dairy-generative-formulation-validation'),
        (output_data_test, 'dairy-generative-formulation-test'),
        (output_encoder, 'dairy-generative-formulation-encoder'),
    )
]

In [7]:
from sagemaker.sklearn import SKLearnProcessor


# ========================== Processing Parameters ==========================
# FRAMEWORK_VERSION is reused by the training estimator further down.
FRAMEWORK_VERSION = '1.0-1'
processing_instance_count = 1
processing_instance_type = 'ml.t3.medium'


# ========================== Instantiate SKLearn Processor ==========================
# Settings are collected first so the constructor call stays short.
processor_settings = {
    'framework_version': FRAMEWORK_VERSION,
    'role': role,
    'instance_type': processing_instance_type,
    'instance_count': processing_instance_count,
    'sagemaker_session': sess,
    # Passed to the job as an environment variable.
    'env': {'AWS_DEFAULT_REGION': region},
}
sklearn_processor = SKLearnProcessor(**processor_settings)

In [110]:
# Command-line arguments for ./src/processing.py: each flag is followed by the
# stringified default value of the matching parameter object, in this order.
# NOTE(review): `feature_store_offline_prefix` defaults to None, so its value is the text 'None'.
script_arguments = [
    token
    for flag, parameter in (
        ('--input-data', input_data),
        ('--output-data', output_data),
        ('--validation-split-percentage', validation_split_percentage),
        ('--test-split-percentage', test_split_percentage),
        ('--feature-store-offline-prefix', feature_store_offline_prefix),
        ('--feature-group-name', feature_group_name),
    )
    for token in (flag, str(parameter.default_value))
]

# Launch the processing job with the inputs/outputs declared above.
sklearn_processor.run(
    code='./src/processing.py',
    inputs=processing_inputs,
    outputs=processing_outputs,
    arguments=script_arguments,
    wait=True,
    logs=True,
)

INFO:sagemaker:Creating processing-job with name sagemaker-scikit-learn-2024-12-02-14-43-07-811


In [134]:
from pprint import pprint

# Show the S3 URI of the first declared output of the most recent processing job.
latest_job_description = sklearn_processor.latest_job.describe()
first_output = latest_job_description['ProcessingOutputConfig']['Outputs'][0]
pprint(first_output['S3Output']['S3Uri'])

's3://dairy-generative-formulation/sagemaker-scikit-learn-2024-12-02-14-43-07-811/output/dairy-generative-formulation-train'


In [76]:
# from sagemaker.workflow.steps import ProcessingStep


# # ========================== Instantiate Processing Step ==========================
# processing_step = ProcessingStep(
#     name='DataProcessing',
#     code='./src/processing.py',
#     processor=sklearn_processor,
#     inputs=processing_inputs,
#     outputs=processing_outputs,
#     job_arguments=[
#         '--input-data', str(input_data.default_value),
#         '--output-data', str(output_data.default_value),
#         '--validation-split-percentage', str(validation_split_percentage.default_value),
#         '--test-split-percentage', str(test_split_percentage.default_value),
#         '--feature-store-offline-prefix', str(feature_store_offline_prefix.default_value),
#         '--feature-group-name', str(feature_group_name.default_value)
#     ]
# )

In [None]:
# from pprint import pprint

# pprint(processing_step.arguments['ProcessingOutputConfig'])

### 2. Hyperparameter tuning step

In [8]:
# ========================= Training resources =========================
training_instance_type = 'ml.m5.large'
training_instance_count = 1


# ========================== training inputs ==========================
# Name of the metric the hyperparameter tuner optimises; it must match one of the
# `Name` entries in `metric_definitions` below.
objective = 'validation:rmse'

# Capture group for one numeric value: optional sign, digits / decimal point, and an
# optional exponent. The previous pattern `([0-9.]+)` could not match a negative value
# (R² is negative for a model worse than the mean) and cut `2.5e-05` down to `2.5`.
_METRIC_VALUE = r'([-+]?[0-9.]+(?:[eE][-+]?[0-9]+)?)'

# Each regex looks for a log line of the form `val_<metric>: <number>` and captures the number.
metric_definitions = [
    {'Name': 'validation:rmse', 'Regex': 'val_rmse: ' + _METRIC_VALUE},
    {'Name': 'validation:mse', 'Regex': 'val_mse: ' + _METRIC_VALUE},
    {'Name': 'validation:mae', 'Regex': 'val_mae: ' + _METRIC_VALUE},
    {'Name': 'validation:r2', 'Regex': 'val_r2: ' + _METRIC_VALUE},
]

In [21]:
# =========================  Sensory attributes to loop on  =========================

# Prefix combined with `bucket` in the tuning cell to form the training output location.
ModelTrainingPrefix = 'ModelTraining'

# Target attributes: the tuning cell iterates over this list and passes each entry to the
# training script as the `sensory_output` hyperparameter.
sensory_attributes = [
    'Flavor_intensity ',  # NOTE(review): trailing space — confirm it matches the dataset column name exactly
    'sweetness',
    'Fruit_intensity',
    'Chalkiness',
    'Color_intensity',
    'thickness',
    'Coating',
    'Global Appreciation'
]

In [None]:
from sagemaker.parameter import IntegerParameter, CategoricalParameter, ContinuousParameter
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import HyperparameterTuner
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import CacheConfig
from sagemaker.workflow.steps import TuningStep, TrainingStep


# Run one Bayesian hyperparameter tuning job per sensory attribute, sequentially.

# S3 location for training artifacts. Built with plain string formatting because
# os.path.join() would produce 'bucket/prefix' without the required 's3://' scheme
# (and would use the local OS path separator).
training_output_path = 's3://{}/{}'.format(bucket, ModelTrainingPrefix)


# ==========================  Hyperparameter ranges ==========================
# Identical for every attribute, so defined once outside the loop.
hyperparameter_ranges = {
    'n_estimators': IntegerParameter(min_value=10,
                                     max_value=200,
                                     scaling_type='Logarithmic'),

    'max_depth': IntegerParameter(min_value=3,
                                  max_value=10,
                                  scaling_type='Linear'),

    'criterion': CategoricalParameter(values=['squared_error', 'friedman_mse'])
}


# ====================== Configure training/tuning inputs ======================
# The channels point at the outputs of one specific, already-finished processing job.
# To follow the latest run instead, read the URIs from
# sklearn_processor.latest_job.describe()['ProcessingOutputConfig']['Outputs'][i]['S3Output']['S3Uri']
# (i = 0 for train, i = 1 for validation).
tuning_inputs = {
    'train': TrainingInput(
        s3_data='s3://dairy-generative-formulation/sagemaker-scikit-learn-2024-12-02-14-43-07-811/output/dairy-generative-formulation-train',
        content_type='text/csv',
        input_mode='File'
    ),
    'validation': TrainingInput(
        s3_data='s3://dairy-generative-formulation/sagemaker-scikit-learn-2024-12-02-14-43-07-811/output/dairy-generative-formulation-validation',
        content_type='text/csv',
        input_mode='File'
    )
}


for attribute in sensory_attributes:
    # =========================  Static hyperparameters =========================
    # Fixed for every training job of this attribute; `sensory_output` carries the
    # attribute name to the training script.
    static_hyperparameters = {
        'random_state': 2024,
        'sensory_output': attribute
    }

    # =========================  Instantiate estimator  =========================
    # Metric definitions are supplied on the tuner below, not on the estimator.
    sklearn_estimator = SKLearn(
        entry_point='./src/training.py',
        framework_version=FRAMEWORK_VERSION,
        instance_type=training_instance_type,
        instance_count=training_instance_count,
        role=role,
        hyperparameters=static_hyperparameters,
        # The output location belongs on the estimator; HyperparameterTuner has no
        # `OutputDataConfig` argument.
        output_path=training_output_path,
        # Use the session configured at the top of the notebook (custom clients and
        # default bucket), consistent with the SKLearnProcessor.
        sagemaker_session=sess,
    )

    # =========================  Instantiate hyperparameter tuner  =========================
    tuner = HyperparameterTuner(
        estimator=sklearn_estimator,
        hyperparameter_ranges=hyperparameter_ranges,
        objective_metric_name=objective,
        metric_definitions=metric_definitions,
        strategy='Bayesian',
        objective_type='Minimize',
        max_jobs=15,
        max_parallel_jobs=5,
        autotune=True,
        # With autotune enabled, hyperparameters not listed here may be given a tunable
        # range; the seed and the target attribute must stay exactly as set above.
        hyperparameters_to_keep_static=list(static_hyperparameters),
    )

    print("Training model for output: {}".format(attribute))
    # wait=True blocks until this tuning job finishes before the next attribute starts.
    tuner.fit(
        inputs=tuning_inputs,
        wait=True,
        logs=True
    )
    


    # # ====================== Cache configuration  ======================
    # cache_config = CacheConfig(enable_caching=True, expire_after="PT1H") # PT1H represents `one hour`


    # # ====================== Configure hyperparameter tuning step ======================
    # tuning_step =TuningStep(
    #     name='ModelTraining-{}'.format(attribute),
    #     tuner=tuner,
    #     inputs=tuning_inputs,
    #     cache_config=cache_config
    # )

In [23]:
# tuner.stop_tuning_job()