# MLflow Training Demo

Creates a model for predicting the quality of wine using [sklearn.linear_model.ElasticNet](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html).  We perform a naive search of the hyperparameter space in order to determine the optimal values.

The results of the model training runs are tracked in an MLflow experiment. The best performing model is then registered in the model registry and set to the `Production` stage for usage.

> This notebook is based on `train.py` from the MLflow example [sklearn_elasticnet_wine](https://github.com/mlflow/mlflow/tree/master/examples/sklearn_elasticnet_wine).

Attribution
* The data set used in this example is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality and sourced from [here](https://raw.githubusercontent.com/mlflow/mlflow/master/examples/sklearn_elasticnet_wine/wine-quality.csv).
* P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
* Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.


## Tracking Setup

Create our experiment to track all our model training runs in.

* This experiment is used across runs of the notebook and will not be recreated if it already exists.
* The name of the experiment is defined as an anaconda project variable located within `anaconda-project.yml`.
    * The variable name is `MLFLOW_EXPERIMENT_NAME`, and the default value is `demo_sklearn_elasticnet_wine`.

In [None]:
from environment_utils import init
import warnings
import numpy as np

# Install a blanket "ignore" filter so library warnings do not clutter cell output.
warnings.filterwarnings("ignore")
# Seed NumPy's global random number generator so NumPy-based randomness repeats between executions.
np.random.seed(42)

# `init` is a project helper (environment_utils); it supplies the experiment id and the
# tracking client that the cells below use.
# NOTE(review): presumably it also creates the experiment when it does not exist yet,
# as the notes above describe — confirm in environment_utils.
experiment_id, client = init()

# Training

In [None]:
"""
Training performance evaluation function
"""

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


def eval_metrics(actual, pred):
    """Score predictions against the observed values.

    Args:
        actual: Observed target values.
        pred: Predicted target values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple: the square root of the mean squared error,
        the mean absolute error, and the R² score.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [None]:
from data_utils import DataSet
from sklearn.linear_model import ElasticNet
from mlflow.models.signature import infer_signature

"""
Model Training Function
Note that the hyper-parameters `alpha` and `l1_ratio` are parameters.  This parameterization allows us to drive a hyper-parameter search for optimizing model performance.
"""

import os
from mlflow_adsp import create_unique_name
import mlflow


def train(alpha: float, l1_ratio: float, ds: DataSet) -> str:
    """Fit one ElasticNet model inside a tracked MLflow run.

    Args:
        alpha: Value passed to ``ElasticNet`` as ``alpha``.
        l1_ratio: Value passed to ``ElasticNet`` as ``l1_ratio``.
        ds: Data set providing ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Returns:
        The id of the MLflow run that recorded this training.

    Raises:
        KeyError: If the ``MLFLOW_EXPERIMENT_NAME`` environment variable is not set.
    """
    # The run name is derived from the experiment name configured in the environment.
    run_name = create_unique_name(name=os.environ["MLFLOW_EXPERIMENT_NAME"])

    with mlflow.start_run(run_name=run_name) as active_run:
        # Build and fit the estimator; the fixed random_state matches every other run.
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        model.fit(ds.X_train, ds.y_train)

        # Score the fitted model on the held-out split.
        rmse, mae, r2 = eval_metrics(ds.y_test, model.predict(ds.X_test))

        # Record the hyper-parameters that produced this model.
        for key, value in (("alpha", alpha), ("l1_ratio", l1_ratio)):
            mlflow.log_param(key, value)

        # Record the performance metrics.
        for key, value in (("rmse", rmse), ("r2", r2), ("mae", mae)):
            mlflow.log_metric(key, value)

        # The signature is inferred from the training inputs and the model's
        # predictions on those same inputs, then stored alongside the model.
        signature = infer_signature(ds.X_train, model.predict(ds.X_train))
        mlflow.sklearn.log_model(model, "model", signature=signature)

        # Callers use the run id to look the run up and compare it with others.
        return active_run.info.run_id

# Train a single model

In [None]:
from data_utils import prepare_data
from mlflow.entities import Run

# Hyper-parameters for the single baseline run.
alpha: float = 0.2
l1_ratio: float = 0.1

# Location of the wine-quality CSV handed to the data helper; the hyperparameter search
# further down reuses this constant.
DATA_SET_FILENAME: str = "datasets/winequality-white.csv"
# Load the data through the project helper (data_utils).
# NOTE(review): train() reads X_train / y_train / X_test / y_test from the result, so the
# helper presumably also performs the train/test split — confirm in data_utils.
data_set: DataSet = prepare_data(csv_url=DATA_SET_FILENAME)

# Train once, then read the stored run back from tracking: the filter selects the run by
# its id within our experiment and [0] takes the first match.
run_id: str = train(alpha=alpha, l1_ratio=l1_ratio, ds=data_set)
run: Run = client.search_runs([experiment_id], f"attributes.run_id = '{run_id}'")[0]

# Show the metrics recorded for this run.
print(run.data.metrics)

# Perform a naive search of the hyperparameter space

We will naively review model performance at specific intervals across the solution space.  There are many optimization functions that can be leveraged based on business needs.

In [None]:
from tqdm import trange

# Run ids of every model trained during the sweep, in training order.
runs: list[str] = []

# Prepare the data once for the whole sweep. The arguments never change between
# iterations, so rebuilding the data set for each of the 25 combinations only repeated
# the same work, and sharing one data set means every candidate is scored on the same data.
# NOTE(review): if prepare_data splits randomly, the runs previously saw different
# splits; a fixed split is what makes their metrics comparable — confirm in data_utils.
search_data_set = prepare_data(csv_url=DATA_SET_FILENAME)

# Walk a 5 x 5 grid: alpha and l1_ratio each take the values 0.0, 0.1, ..., 0.4.
for i in trange(5):
    alpha: float = i * 0.1
    # leave=False removes the inner progress bar once each alpha row is finished.
    for j in trange(5, leave=False):
        l1_ratio: float = j * 0.1
        run_id: str = train(
            alpha=alpha,
            l1_ratio=l1_ratio,
            ds=search_data_set,
        )
        runs.append(run_id)

# Find and register the best model

In [None]:
from mlflow.tracking import MlflowClient
from mlflow.entities import Run
from typing import Optional


def get_best_run(client: MlflowClient, experiment_id, runs: list[str]) -> tuple[Optional[Run], dict]:
    """Return the run with the lowest ``rmse`` among ``runs``, together with its metrics.

    Args:
        client: Tracking client used to look each run up.
        experiment_id: Experiment the runs are searched in.
        runs: Ids of the runs to compare.

    Returns:
        ``(best_run, best_metrics)``. When no run qualifies (``runs`` is empty, or no
        candidate carries a comparable ``rmse``), ``best_run`` is ``None`` and
        ``best_metrics`` holds worst-case placeholders: the largest float for the
        errors ``rmse`` / ``mae`` and the most negative float for ``r2``.
    """
    _max = np.finfo(np.float64).max

    # Worst-case placeholders: errors are "lower is better", r2 is "higher is better",
    # so its worst case is the negative extreme.
    best_metrics: dict = {
        "rmse": _max,
        "mae": _max,
        "r2": -_max,
    }
    best_run: Optional[Run] = None

    for run_id in runs:
        matches = client.search_runs([experiment_id], f"attributes.run_id = '{run_id}'")
        if not matches:
            # The id is not present in this experiment; there is nothing to compare.
            continue
        candidate: Run = matches[0]

        rmse = candidate.data.metrics.get("rmse")
        if rmse is None:
            # The run never logged an rmse, so it cannot take part in the ranking.
            continue

        # Strict comparison: on ties the earliest run wins, and a NaN rmse never wins.
        if rmse < best_metrics["rmse"]:
            # Copy so the caller's report is independent of the Run object's own dict.
            best_metrics = dict(candidate.data.metrics)
            best_run = candidate

    return best_run, best_metrics

## Review the runs for the best performing model and add it to the model registry

In [None]:
from mlflow_utils import register_best_model
from mlflow.entities.model_registry import ModelVersion

# Pick the run with the lowest rmse out of everything trained during the search.
(run, metrics) = get_best_run(client=client, experiment_id=experiment_id, runs=runs)

# get_best_run yields None for the run when nothing qualified; stop here with a clear
# message instead of failing on the attribute access below.
if run is None:
    raise RuntimeError("No best run found: the hyperparameter search produced no comparable runs.")

print(f"Run ID: {run.info.run_id}")
print(f"Report: {metrics}")

In [None]:
# Hand the selected run to the project helper (mlflow_utils) and keep the ModelVersion it returns.
# NOTE(review): presumably this registers the run's logged model in the model registry — confirm in mlflow_utils.
model_version: ModelVersion = register_best_model(client=client, run=run)

## Promote the latest model to the `Production` stage for usage.

In [None]:
# Move the version registered above into the "Production" stage.
# The registered-model name is taken from the ModelVersion itself rather than rebuilt
# from an environment variable, so the call always targets the model this version
# belongs to, whatever name it was registered under.
# archive_existing_versions=True asks the registry to archive versions already in that stage.
# NOTE(review): recent MLflow releases deprecate model stages in favour of aliases —
# check against the pinned MLflow version.
model_version: ModelVersion = client.transition_model_version_stage(
    name=model_version.name,
    version=model_version.version,
    stage="Production",
    archive_existing_versions=True,
)