# MLflow Training Demo

Creates a model for predicting the quality of wine using [sklearn.linear_model.ElasticNet](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html).  We perform a naive search of the hyper-parameter space in order to determine the optimal values.

The results of the model training runs are tracked in an MLflow experiment. The best performing model is then registered in the model registry and set to the `Production` stage for usage.

> This notebook is based on `train.py` from the MLflow example [sklearn_elasticnet_wine](https://github.com/mlflow/mlflow/tree/master/examples/sklearn_elasticnet_wine).

Attribution
* The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality and sourced from [here](https://raw.githubusercontent.com/mlflow/mlflow/master/examples/sklearn_elasticnet_wine/wine-quality.csv).
* P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.


In [None]:
import warnings

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from mlflow.models.signature import infer_signature
import mlflow.sklearn
import mlflow

from mlflow_adsp import create_unique_name, upsert_experiment

# Ignore every warning for the rest of the session (presumably to keep cell output uncluttered).
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator with a fixed value for repeatable runs.
np.random.seed(42)

In [None]:
"""
Load user specific configuration.
"""

from ae5_tools import load_ae5_user_secrets

# Helper from ae5_tools; per this cell's docstring it loads the user-specific configuration.
# NOTE(review): presumably this exposes the MLflow connection settings — confirm against ae5_tools.
load_ae5_user_secrets()

## Tracking Setup

Create the experiment in which all of our model training runs are tracked.

* This experiment is used across runs of the notebook and will not be recreated if it already exists.
* The name of the experiment is defined as an anaconda project variable located within `anaconda-project.yml`.
    * The variable name is `MLFLOW_EXPERIMENT_NAME`, and the default value is `demo_sklearn_elasticnet_wine`.

In [None]:
from mlflow.tracking import MlflowClient

# Create one MLflow tracking client; the registry and run-lookup cells further down all reuse it.
client = MlflowClient()

In [None]:
from mlflow.exceptions import MlflowException


def upsert_model_registry(client: MlflowClient) -> None:
    """
    Create the project's registered model unless it is already present.

    The registered model is named after the `MLFLOW_EXPERIMENT_NAME`
    environment variable.  An MLflow error whose code is
    `RESOURCE_ALREADY_EXISTS` is treated as success; every other MLflow
    error is re-raised to the caller.
    """
    try:
        registry_name = os.environ["MLFLOW_EXPERIMENT_NAME"]
        client.create_registered_model(name=registry_name)
    except MlflowException as error:
        # The model being there already is exactly the state we want.
        if error.error_code == "RESOURCE_ALREADY_EXISTS":
            return
        raise error

In [None]:
"""
Ensure that the experiment and model registry exist for reporting and tracking.
"""

import os
from mlflow.entities import Experiment

# upsert_experiment() supplies the experiment id; the Experiment returned by
# set_experiment is kept in `experiment` because later cells read its experiment_id.
experiment: Experiment = mlflow.set_experiment(
    experiment_id=upsert_experiment()
)
# Create the registered model up front; an already-existing model is tolerated.
upsert_model_registry(client=client)

# Training

In [None]:
"""
Data Preparation
Loads the data from csv file, and returns our train, test splits for training.
"""


def prepare_data(csv_url: str) -> dict:
    """
    Read the comma-separated data set at `csv_url` and split it for training.

    Every column except "quality" is used as a feature; the "quality" column
    alone forms the target frame.  The split uses `train_test_split` with its
    default settings.

    Returns a dict with the keys "X_train", "X_test", "y_train" and "y_test".
    """
    frame: pd.DataFrame = pd.read_csv(csv_url, sep=",")

    # The predicted column is "quality" which is a scalar from [3, 9]
    features: pd.DataFrame = frame.drop(columns=["quality"])
    target: pd.DataFrame = frame[["quality"]]

    # train_test_split yields the four pieces in exactly this order.
    split_names = ("X_train", "X_test", "y_train", "y_test")
    return dict(zip(split_names, train_test_split(features, target)))

In [None]:
"""
Training performance evaluation function
"""


def eval_metrics(actual, pred):
    """
    Score predictions against the observed values.

    Returns the tuple `(rmse, mae, r2)`: root mean squared error, mean
    absolute error and the R^2 score, in that order.
    """
    return (
        np.sqrt(mean_squared_error(actual, pred)),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [None]:
"""
Model Training Function
"""

import os
from pprint import pprint

import xgboost as xgb
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import mlflow
import mlflow.xgboost
from utils import fetch_logged_data


def train(training_data: dict) -> str:
    """
    Fit an XGBoost regressor inside a new MLflow run and return that run's id.

    `training_data` must provide the keys "X_train", "X_test", "y_train" and
    "y_test" (the shape returned by `prepare_data`).  The run is named via
    `create_unique_name` from the `MLFLOW_EXPERIMENT_NAME` environment
    variable.

    Besides whatever XGBoost autologging records, the hold-out scores are
    logged explicitly as the metrics "rmse", "mae" and "r2", because
    `get_best_run` compares runs on the plain "rmse" key.
    """
    X_train = training_data["X_train"]
    X_test = training_data["X_test"]
    y_train = training_data["y_train"]
    y_test = training_data["y_test"]

    # Start the MLflow run to track the model training.
    with mlflow.start_run(
        run_name=create_unique_name(name=os.environ["MLFLOW_EXPERIMENT_NAME"])
    ) as run:
        # enable auto logging
        # this includes xgboost.sklearn estimators
        mlflow.xgboost.autolog()

        regressor = xgb.XGBRegressor(n_estimators=20, reg_lambda=1, gamma=0, max_depth=3)
        regressor.fit(X_train, y_train, eval_set=[(X_test, y_test)])

        # NOTE(review): autologging appears to store evaluation metrics under
        # eval-set-prefixed names (e.g. "validation_0-rmse") — confirm.  Log the
        # hold-out scores under the plain keys the run comparison relies on.
        rmse, mae, r2 = eval_metrics(y_test, regressor.predict(X_test))
        mlflow.log_metrics({"rmse": float(rmse), "mae": float(mae), "r2": float(r2)})

        # The context manager already hands us the active run.
        run_id = run.info.run_id
        print(f"Logged data and model in run {run_id}")

        # show logged data
        for key, data in fetch_logged_data(run_id).items():
            print(f"\n---------- logged {key} ----------")
            pprint(data)

        # Return the run_id for training run comparisons.
        return run_id

# Train a single model

In [None]:
from mlflow.entities import Run

# Build the train/test splits from the local wine-quality CSV.
training_data: dict = prepare_data(csv_url="datasets/wine-quality.csv")

# Train once, then look the finished run up by its id so the metrics it logged can be shown.
run_id: str = train(training_data=training_data)
run: Run = client.search_runs(
    [experiment.experiment_id], f"attributes.run_id = '{run_id}'"
)[0]
print(run.data.metrics)

# Perform a naive search of the hyperparameter space

We will naively review model performance at specific intervals across the solution space.  There are many optimization functions which can be leveraged based on business needs.

In [None]:
# from tqdm import trange
#
# runs: list[str] = []
#
# for i in trange(5):
#     alpha: float = i * 0.1
#     for j in trange(5, leave=False):
#         l1_ratio: float = j * 0.1
#         run_id: str = train(
#             alpha=alpha,
#             l1_ratio=l1_ratio,
#             training_data=prepare_data(csv_url="datasets/wine-quality.csv"),
#         )
#         runs.append(run_id)

# Find and register the best model
Define our functions

In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import Run
from typing import Optional


def get_best_run(
    client: MlflowClient, experiment_id, runs: list[str]
) -> tuple[Optional[Run], dict]:
    """
    Return the run with the lowest "rmse" metric among `runs`, plus its metrics.

    Each id in `runs` is looked up in `experiment_id` through
    `client.search_runs`.  Ids that match no run, and runs that did not log an
    "rmse" metric, are skipped instead of aborting the whole comparison.

    Returns `(best_run, best_metrics)`.  When no run qualifies, `best_run` is
    `None` and `best_metrics` is a placeholder dict whose "rmse", "mae" and
    "r2" entries all hold the largest finite float64 value.
    """
    _inf = np.finfo(np.float64).max

    best_metrics: dict = {
        "rmse": _inf,
        "mae": _inf,
        "r2": _inf,
    }
    best_run: Optional[Run] = None

    for run_id in runs:
        matches = client.search_runs(
            [experiment_id], f"attributes.run_id = '{run_id}'"
        )
        if not matches:
            # Unknown id (or a run outside this experiment): nothing to compare.
            continue

        candidate: Run = matches[0]
        rmse = candidate.data.metrics.get("rmse")
        if rmse is None:
            # Without an rmse this run cannot take part in the ranking.
            continue

        if rmse < best_metrics["rmse"]:
            best_metrics = candidate.data.metrics
            best_run = candidate

    return best_run, best_metrics

In [None]:
from mlflow.entities.model_registry import ModelVersion


def register_best_model(client: MlflowClient, run: Run) -> ModelVersion:
    """
    Register the "model" artifact of `run` as a new model version.

    The version is created under the registered model named by the
    `MLFLOW_EXPERIMENT_NAME` environment variable, points at
    `<artifact_uri>/model` of the run, and is tagged with the run's id.

    Returns whatever `client.create_model_version` returns.
    """
    registry_name = os.environ["MLFLOW_EXPERIMENT_NAME"]
    model_source = f"{run.info.artifact_uri}/model"
    source_run_id = run.info.run_id

    return client.create_model_version(
        name=registry_name,
        source=model_source,
        run_id=source_run_id,
        tags={"run_id": source_run_id},
    )

## Review the runs for the best performing model and add it to the model registry

In [None]:
# Candidate run ids: use the hyperparameter-search results when that cell has
# been executed; otherwise `runs` does not exist and the single training run
# from earlier in the notebook is the only candidate.
try:
    candidate_run_ids: list[str] = runs
except NameError:
    candidate_run_ids = [run_id]

# Pick the winner BEFORE registering, so the registered version is the best
# run rather than whichever run happened to be bound to `run`.
best_run, metrics = get_best_run(
    client=client, experiment_id=experiment.experiment_id, runs=candidate_run_ids
)
if best_run is not None:
    run = best_run
else:
    # No candidate could be ranked; keep the run trained above and report its metrics.
    metrics = run.data.metrics

model_version: ModelVersion = register_best_model(client=client, run=run)

print(f"Run ID: {run.info.run_id}")
print(f"Report: {metrics}")

## Promote the latest model to the `Production` stage for usage.

In [None]:
# Move the version registered in the previous cell to the "Production" stage
# and rebind `model_version` to the value returned by the client.
# NOTE(review): archive_existing_versions=True presumably archives versions already
# in the target stage, and model stages are deprecated in recent MLflow releases —
# confirm both against the MLflow version this project pins.
model_version: ModelVersion = client.transition_model_version_stage(
    name=os.environ["MLFLOW_EXPERIMENT_NAME"],
    version=model_version.version,
    stage="Production",
    archive_existing_versions=True,
)