# Train a scikit-learn model in SageMaker and track it with MLflow

## Set up the environment

In [1]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker==2.117.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sparkmagic 0.21.0 requires pandas<2.0.0,>=0.17.1, but you have pandas 2.2.0 which is incompatible.
yapf 0.40.1 requires importlib-metadata>=6.6.0, but you have importlib-metadata 4.13.0 which is incompatible.[0m[31m
[0m

In [2]:
!pip install scikit-learn==1.1.3

Collecting scikit-learn==1.1.3
  Downloading scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Downloading scikit_learn-1.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.5/30.5 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.4.1.post1
    Uninstalling scikit-learn-1.4.1.post1:
      Successfully uninstalled scikit-learn-1.4.1.post1
Successfully installed scikit-learn-1.1.3


In [3]:
import os

import pandas as pd
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

# SageMaker session, the execution role of this notebook, and the session's default bucket.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# URI of your remote MLflow tracking server.
# Set the MLFLOW_TRACKING_URI environment variable to point at a different deployment;
# when it is unset or empty, the endpoint below is used.
tracking_uri = (
    os.environ.get('MLFLOW_TRACKING_URI')
    or 'http://MLflow-MLFLO-w9tpeQwzgqZz-45551a4ba3a1ef8d.elb.us-east-1.amazonaws.com/'
)

## Prepare data
We load a dataset from scikit-learn, split it into training and test sets, and upload both sets to S3.

In [4]:
# Load the Boston housing data and hold out 25% of the rows for testing (fixed seed).
# NOTE(review): scikit-learn strongly discourages this dataset (see the warning that
# load_boston prints); switching datasets would also mean changing the 'features'
# hyperparameter used when the estimator is configured.
data = load_boston()

X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# Each frame holds the named feature columns plus the label in a trailing 'target' column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

# Write both splits to the working directory (the row index is written as well).
trainX.to_csv('boston_train.csv')
testX.to_csv('boston_test.csv')


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [5]:
# Upload the train CSV, then the test CSV, to the session bucket under the same key
# prefix; SageMaker will take the training data from S3.
train_path, test_path = (
    sess.upload_data(path=csv_file, bucket=bucket, key_prefix='sagemaker/sklearncontainer')
    for csv_file in ('boston_train.csv', 'boston_test.csv')
)

## Train

In [6]:
# Metric definition handed to the estimator: a metric name plus the regex whose capture
# group holds its value.
metric_definitions = [
    {
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$",
    },
]

# Hyperparameters handed to the estimator. 'features' is a single space-separated string
# of column names; 'target' names the label column written into the CSV files.
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing-mlops',
    'n-estimators': 100,
    'min-samples-leaf': 3,
    'features': ' '.join([
        'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ]),
    'target': 'target',
}

# Scikit-learn estimator: entry point train.py from source_dir, one ml.m5.xlarge instance.
estimator = SKLearn(
    entry_point='train.py',
    source_dir='source_dir',
    framework_version='1.0-1',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
    base_job_name='mlflow',
)

In [7]:
# Start the training job, passing the uploaded S3 locations under the 'train' and 'test' keys.
estimator.fit(
    inputs={
        'train': train_path,
        'test': test_path,
    }
)

2024-05-17 10:26:03 Starting - Starting the training job...
2024-05-17 10:26:29 Starting - Preparing the instances for trainingProfilerReport-1715941563: InProgress
...
2024-05-17 10:27:03 Downloading - Downloading the training image......
2024-05-17 10:27:49 Training - Training image download completed. Training in progress.[34m2024-05-17 10:27:49,911 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-05-17 10:27:49,914 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-05-17 10:27:49,916 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-05-17 10:27:49,930 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-05-17 10:27:50,134 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[34mProcessing /opt/m

[34mSuccessfully installed Mako-1.3.5 alembic-1.13.1 cloudpickle-2.2.1 contourpy-1.1.1 cycler-0.12.1 databricks-cli-0.18.0 docker-6.1.3 entrypoints-0.4 fonttools-4.51.0 gitdb-4.0.11 gitpython-3.1.43 importlib-metadata-5.2.0 importlib-resources-6.4.0 kiwisolver-1.4.5 llvmlite-0.41.1 markdown-3.6 matplotlib-3.7.5 mlflow-2.0.1 numba-0.58.1 oauthlib-3.2.2 packaging-21.3 pyarrow-10.0.1 pyjwt-2.8.0 pyparsing-3.1.2 pytz-2022.7.1 pyyaml-6.0.1 querystring-parser-1.2.4 sagemaker-example-1.0 shap-0.44.1 slicer-0.0.7 smmap-5.0.1 sqlalchemy-1.4.52 sqlparse-0.5.0 tabulate-0.9.0 tqdm-4.66.4 typing-extensions-4.11.0 websocket-client-1.8.0 zipp-3.18.2[0m
[34m2024-05-17 10:28:06,128 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-05-17 10:28:06,131 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-05-17 10:28:06,148 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0

[34m2024-05-17 10:28:11,359 sagemaker-containers INFO     Reporting training SUCCESS[0m

2024-05-17 10:28:29 Uploading - Uploading generated training model
2024-05-17 10:28:29 Completed - Training job completed
Training seconds: 99
Billable seconds: 99
