# Train a scikit-learn model in SageMaker and track it with MLflow

## Setup Environment

In [1]:
!pip install -q --upgrade pip
!pip install -q --upgrade sagemaker

In [2]:
!pip install --upgrade numpy scikit-learn

Collecting numpy
  Using cached numpy-2.2.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Using cached numpy-2.2.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mkl-fft 1.3.11 requires mkl, which is not installed.
numba 0.61.0 requires numpy<2.2,>=1.24, but you have numpy 2.2.4 which is incompatible.
sagemaker 2.243.0 requires numpy<2.0,>=1.9.0, but you have numpy 2.2.4 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.2.4


In [3]:
!pip install numpy==1.26.4

!pip install --upgrade pandas


Collecting numpy==1.26.4
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.2.4
    Uninstalling numpy-2.2.4:
      Successfully uninstalled numpy-2.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
mkl-fft 1.3.11 requires mkl, which is not installed.[0m[31m
[0mSuccessfully installed numpy-1.26.4


In [4]:
import os

import sagemaker
import pandas as pd
# from sklearn.datasets import load_boston
from sagemaker.sklearn.estimator import SKLearn
# from sklearn.model_selection import train_test_split

# SageMaker session, the execution role handed to the estimator below,
# and the session's default S3 bucket used for the training data upload.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# URI of your remote MLflow tracking server.
# Set the MLFLOW_TRACKING_URI environment variable to target a different server
# without editing the notebook; when it is unset or empty, the endpoint below is used.
DEFAULT_TRACKING_URI = 'http://MLflow-MLFLO-VR3wJBi9L1nl-09870ac901267fb3.elb.us-east-1.amazonaws.com/'
tracking_uri = os.environ.get('MLFLOW_TRACKING_URI') or DEFAULT_TRACKING_URI



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Prepare data
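We load a dataset from scikit-learn, split it into train and test sets, and upload both CSV files to S3. The load-and-split cell below is currently commented out, so `boston_train.csv` and `boston_test.csv` must already exist in the working directory before the upload cell is run.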

In [9]:
# Data preparation (currently disabled: every statement in this cell is commented out).
# When enabled, it loads the Boston housing dataset, makes a 75/25 train/test split
# with a fixed random_state, appends the label as a 'target' column and writes
# boston_train.csv / boston_test.csv to the working directory.
# The upload cell that follows reads those two CSV files, so they must already
# exist locally while this cell stays disabled.
# NOTE(review): load_boston was removed from scikit-learn in release 1.2, which is
# presumably why this cell and the matching imports are commented out - confirm,
# and consider replacing the data source so the notebook survives Restart & Run All.

# we use the Boston housing dataset
# data = load_boston()

# X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.25, random_state=42)

# trainX = pd.DataFrame(X_train, columns=data.feature_names)
# trainX['target'] = y_train

# testX = pd.DataFrame(X_test, columns=data.feature_names)
# testX['target'] = y_test

# trainX.to_csv('boston_train.csv')
# testX.to_csv('boston_test.csv')

In [5]:
# Stage the two local CSV splits in S3; SageMaker reads its training data from there.
# Both files go under the same key prefix in the session's default bucket,
# train first and test second.
s3_prefix = 'sagemaker/sklearncontainer'
train_path, test_path = (
    sess.upload_data(path=csv_file, bucket=bucket, key_prefix=s3_prefix)
    for csv_file in ('boston_train.csv', 'boston_test.csv')
)

## Train

In [6]:
# Columns used as model inputs; passed to the training script as one
# space-separated string.
feature_columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

# Hyperparameters handed to the training job: MLflow settings first,
# then the model settings, then the feature/label column names.
hyperparameters = {
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing',
    'n-estimators': 200,
    'min-samples-leaf': 2,
    'features': ' '.join(feature_columns),
    'target': 'target',
}

# Regex applied to the training job's log output to extract the 'median-AE' metric.
median_ae_regex = "AE-at-50th-percentile: ([0-9.]+).*$"
metric_definitions = [{'Name': 'median-AE', 'Regex': median_ae_regex}]

# Estimator configuration, grouped by concern: the code to run,
# the compute to run it on, and what to pass in / scrape out.
estimator_settings = {
    # training code
    'entry_point': 'train.py',
    'source_dir': 'source_dir',
    'framework_version': '1.0-1',
    # infrastructure
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.m5.xlarge',
    'base_job_name': 'mlflow',
    # job inputs and tracked metrics
    'hyperparameters': hyperparameters,
    'metric_definitions': metric_definitions,
}
estimator = SKLearn(**estimator_settings)

In [7]:
# Start the training job, giving it one S3 input per named channel.
input_channels = {
    'train': train_path,
    'test': test_path,
}
estimator.fit(input_channels)

2025-03-29 09:17:34 Starting - Starting the training job...
2025-03-29 09:17:48 Starting - Preparing the instances for training...
2025-03-29 09:18:29 Downloading - Downloading the training image......
2025-03-29 09:19:25 Training - Training image download completed. Training in progress..[34m2025-03-29 09:19:35,351 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2025-03-29 09:19:35,355 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-03-29 09:19:35,358 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-03-29 09:19:35,373 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2025-03-29 09:19:35,645 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[34mProcessing /opt/ml/code
  Preparing metadata (setup.py

[34mSuccessfully installed Mako-1.3.9 alembic-1.14.1 cloudpickle-3.1.1 contourpy-1.1.1 cycler-0.12.1 databricks-cli-0.18.0 docker-7.1.0 entrypoints-0.4 fonttools-4.56.0 gitdb-4.0.12 gitpython-3.1.44 importlib-metadata-7.2.1 importlib-resources-6.4.5 kiwisolver-1.4.7 markdown-3.7 matplotlib-3.7.5 mlflow-2.10.0 oauthlib-3.2.2 packaging-23.2 pyjwt-2.9.0 pyparsing-3.1.4 pyyaml-6.0.2 querystring-parser-1.2.4 sagemaker-example-1.0 smmap-5.0.2 sqlalchemy-2.0.40 sqlparse-0.5.3 tabulate-0.9.0 typing-extensions-4.13.0 zipp-3.20.2[0m
[34m2025-03-29 09:19:46,287 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-03-29 09:19:46,290 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)[0m
[34m2025-03-29 09:19:46,309 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2025-03-29 09:19:46,312 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installe


2025-03-29 09:20:03 Uploading - Uploading generated training model
2025-03-29 09:20:03 Completed - Training job completed
Training seconds: 114
Billable seconds: 114
