# Train a Scikit-Learn model in SageMaker and track with MLFlow

## Setup Environment

In [1]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker

In [2]:
!pip install --upgrade numpy scikit-learn

Collecting numpy
  Downloading numpy-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Downloading numpy-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m122.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.5/13.5 MB[0m [31m160.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scikit-learn
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.2
    Uninstal

In [5]:
!pip install --upgrade numpy pandas


Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m180.8 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: tzdata, pandas
  Attempting uninstall: pandas
    Found existing installation: pandas 1.5.3
    Uninstalling pandas-1.5.3:
      Successfully uninstalled pandas-1.5.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
autovizwidget 0.21.0 requires pandas<2.0.0,>=0.20.1, but you have pandas 2.2.3 which is incompatible.
hdijupyterutil

In [1]:
import sagemaker
import pandas as pd
# from sklearn.datasets import load_boston
from sagemaker.sklearn.estimator import SKLearn
# from sklearn.model_selection import train_test_split

sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# uri of your remote mlflow server
tracking_uri = 'http://MLflow-MLFLO-A1wpR8HEhkUG-685780275bfbe98f.elb.us-east-1.amazonaws.com/' 



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Prepare data
We load a dataset from sklearn, split it and send it to S3

In [5]:
# we use the Boston housing dataset 
# data = load_boston()

# X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)

# trainX = pd.DataFrame(X_train, columns=data.feature_names)
# trainX['target'] = y_train

# testX = pd.DataFrame(X_test, columns=data.feature_names)
# testX['target'] = y_test

# trainX.to_csv('boston_train.csv')
# testX.to_csv('boston_test.csv')


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [2]:
# send data to S3. SageMaker will take training data from s3
train_path = sess.upload_data(path='boston_train.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')
test_path = sess.upload_data(path='boston_test.csv', bucket=bucket, key_prefix='sagemaker/sklearncontainer')

## Train

In [3]:
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing-mlops',
    'n-estimators': 150,
    'min-samples-leaf': 4,
    'features': 'CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT',
    'target': 'target'
}

metric_definitions = [{'Name': 'median-AE', 'Regex': "AE-at-50th-percentile: ([0-9.]+).*$"}]

estimator = SKLearn(
    entry_point='train.py',
    source_dir='source_dir',
    role=role,
    metric_definitions=metric_definitions,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    framework_version='1.0-1',
    base_job_name='mlflow',
)

In [4]:
estimator.fit({'train':train_path, 'test': test_path})

2024-12-14 06:47:41 Starting - Starting the training job...
2024-12-14 06:47:56 Starting - Preparing the instances for training...
2024-12-14 06:48:20 Downloading - Downloading input data...
2024-12-14 06:48:50 Downloading - Downloading the training image...
2024-12-14 06:49:31 Training - Training image download completed. Training in progress..[34m2024-12-14 06:49:37,620 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2024-12-14 06:49:37,623 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2024-12-14 06:49:37,625 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2024-12-14 06:49:37,640 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2024-12-14 06:49:37,867 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[