# Train a scikit-learn model in SageMaker and track it with MLflow

## Setup Environment

In [None]:
# Optional: uncomment to install the pinned SageMaker SDK version.
# %pip (rather than !pip) installs into the environment of the running kernel.
# %pip install -q --upgrade pip
# %pip install -q --upgrade sagemaker==2.117.0

In [45]:
import sagemaker
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sagemaker.sklearn.estimator import SKLearn
from sklearn.model_selection import train_test_split
import boto3

# SageMaker session and its default S3 bucket; both are reused when uploading data below.
sess = sagemaker.Session()
bucket = sess.default_bucket()
# Resolve the IAM role ARN for the training job. If get_execution_role() raises
# ValueError, fall back to looking up the role named 'sm_execution' through IAM.
# NOTE(review): 'sm_execution' is account-specific — use a role that exists in your account.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sm_execution')['Role']['Arn']

# uri of your remote mlflow server
# NOTE(review): this value carries no URL scheme (e.g. 'http://') — confirm how train.py
# turns it into an MLflow tracking URI, and replace the host with your own server.
tracking_uri = 'mlflo-mlflo-bzup59rprwdi-206c68add5dac1b3.elb.eu-west-2.amazonaws.com'



## Prepare data
We load the California housing dataset from scikit-learn, split it into training and test sets, and upload both to S3.

In [40]:
# Load the California housing dataset. The 'boston_*' file names used below are
# only labels; they do not describe the data.
data = fetch_california_housing()

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data.data,
    data.target,
    test_size=0.25,
    random_state=42,
)

# One frame per split: the feature columns followed by a trailing 'target' column.
trainX = pd.DataFrame(X_train, columns=data.feature_names).assign(target=y_train)
testX = pd.DataFrame(X_test, columns=data.feature_names).assign(target=y_test)

# Write both splits to the working directory for the S3 upload.
# NOTE(review): to_csv also writes the row index as an unnamed first column;
# presumably train.py selects columns by name — confirm before changing this.
trainX.to_csv('boston_train.csv')
testX.to_csv('boston_test.csv')

In [41]:
# Upload both CSV files under the same S3 key prefix (train first, then test).
# upload_data returns the S3 location of each file, which is passed to the training job later.
train_path, test_path = [
    sess.upload_data(path=csv_file, bucket=bucket, key_prefix='sagemaker/sklearncontainer')
    for csv_file in ('boston_train.csv', 'boston_test.csv')
]

## Train

In [47]:
# Settings handed to the training job through the estimator's `hyperparameters` argument.
hyperparameters = {
    # MLflow tracking settings
    'tracking_uri': tracking_uri,
    'experiment_name': 'boston-housing',
    # Model settings (presumably consumed by train.py — verify against the script)
    'n-estimators': 100,
    'min-samples-leaf': 3,
    # Column selection: space-separated feature names plus the label column
    'features': ' '.join([
        'MedInc',
        'HouseAge',
        'AveRooms',
        'AveBedrms',
        'Population',
        'AveOccup',
        'Latitude',
        'Longitude',
    ]),
    'target': 'target',
}

# One metric, 'median-AE': the regex's capture group picks the number that follows
# 'AE-at-50th-percentile: ' in a line of text.
metric_definitions = [
    {
        'Name': 'median-AE',
        'Regex': "AE-at-50th-percentile: ([0-9.]+).*$",
    },
]

# Estimator that runs source_dir/train.py as a SageMaker scikit-learn training job.
estimator = SKLearn(
    # Training code and framework version
    source_dir='source_dir',
    entry_point='train.py',
    framework_version='1.0-1',
    # Execution role and compute
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    # Training job names start with this prefix
    base_job_name='mlflow',
    # Script settings and the metric to capture
    hyperparameters=hyperparameters,
    metric_definitions=metric_definitions,
)

In [48]:
# Start the training job with two named inputs, 'train' and 'test',
# each pointing at the S3 location of the corresponding uploaded CSV.
estimator.fit(
    {
        'train': train_path,
        'test': test_path,
    }
)

Using provided s3_resource


INFO:sagemaker:Creating training-job with name: mlflow-2023-05-05-13-40-51-573


2023-05-05 13:39:56 Starting - Starting the training job...
2023-05-05 13:40:15 Starting - Preparing the instances for training......
2023-05-05 13:41:27 Downloading - Downloading input data
2023-05-05 13:41:27 Training - Downloading the training image...
2023-05-05 13:41:58 Training - Training image download completed. Training in progress...[34m2023-05-05 13:42:05,071 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2023-05-05 13:42:05,074 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2023-05-05 13:42:05,083 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2023-05-05 13:42:05,286 sagemaker-training-toolkit INFO     Installing module with the following command:[0m
[34m/miniconda3/bin/python -m pip install . -r requirements.txt[0m
[34mProcessing /opt/ml/code
  Preparing metadata (setup.py): started[0m
[34m  Preparing metadata (setup.py): finished with 