# MLflow 101

In [1]:
from datetime import datetime
from random import randint
from shutil import rmtree
from urllib.request import urlopen

import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm

## Setup

In [2]:
# Every run started below is recorded under this experiment name.
experiment_name = 'Iris Classification'
mlflow.set_experiment(experiment_name)

INFO: 'Iris Classification' does not exist. Creating a new experiment


## Download Data

In [3]:
# NOTE(review): the dataset is fetched over plain HTTP - confirm whether an
# HTTPS endpoint is available for this bucket.
data_url = 'http://bodywork-ml-ops-project.s3.eu-west-2.amazonaws.com/data/iris_classification_data.csv'

# Use the response as a context manager so the underlying connection is
# closed as soon as the CSV has been parsed, instead of being left open.
with urlopen(data_url) as response:
    data = pd.read_csv(response)
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


## Data Preparation

In [4]:
# Target column and the integer class code assigned to each species name.
label_column = 'species'
species_to_class_map = {
    'setosa': 0,
    'versicolor': 1,
    'virginica': 2,
}

# The four measurement columns used as model inputs.
feature_columns = [
    'sepal length (cm)', 'sepal width (cm)',
    'petal length (cm)', 'petal width (cm)',
]

# Feature matrix and encoded label vector as plain arrays.
features_frame = data[feature_columns]
species_series = data[label_column]
X = features_frame.values
y = species_series.apply(lambda name: species_to_class_map[name]).values

## Split Data into Train and Test Subsets

In [5]:
# Hold out 20% of the rows for testing; stratify on the labels and fix the
# seed so the same split is produced on every run.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Define Task Metrics

In [6]:
def log_metrics_summary(y_actual: np.ndarray, y_predicted: np.ndarray) -> None:
    """Compute classification metrics and log them to the active MLflow run.

    Args:
        y_actual: True class labels.
        y_predicted: Class labels predicted by the model.

    Logs two metrics:
        accuracy: chance-adjusted balanced accuracy
            (``balanced_accuracy_score(..., adjusted=True)``).
        f1: F1 score averaged with ``average='weighted'``.
    """
    accuracy = balanced_accuracy_score(
        y_actual,
        y_predicted,
        adjusted=True
    )
    f1 = f1_score(
        y_actual,
        y_predicted,
        average='weighted'
    )
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('f1', f1)

## Train Model

In [7]:
def train_model(
    X: np.ndarray,
    y: np.ndarray,
    max_depth: int,
    random_state: int
) -> DecisionTreeClassifier:
    """Train a single decision-tree classifier, given hyper-parameters.

    Args:
        X: Feature matrix to fit the model on.
        y: Class labels aligned with the rows of ``X``.
        max_depth: Maximum depth of the decision tree.
        random_state: Seed passed to the classifier.

    Returns:
        The fitted classifier.
    """
    iris_tree_classifier = DecisionTreeClassifier(
        class_weight='balanced',
        random_state=random_state,
        max_depth=max_depth
    )
    # Fit on the arguments, not on the notebook-level X_train/y_train, so the
    # caller decides which data the model is estimated on.
    iris_tree_classifier.fit(X, y)
    return iris_tree_classifier


# Random hyper-parameter search: ten nested child runs are logged under a
# single parent run, then the best child's settings are used to fit and log
# the final model on the parent run.
with mlflow.start_run(run_name='DecisionTreeClassifier') as parent_run:
    for _ in tqdm(range(10)):
        with mlflow.start_run(nested=True):
            max_depth = randint(1, 4)
            random_state = randint(1, 100)
            mlflow.log_param('random_state', random_state)
            mlflow.log_param('max_depth', max_depth)
            trained_model = train_model(X_train, y_train, max_depth, random_state)
            log_metrics_summary(y_test, trained_model.predict(X_test))

    # Only consider the children of *this* parent run. An unfiltered search
    # would also return runs left in the experiment by earlier executions of
    # this cell, so the "best" run could come from a previous session.
    child_runs = mlflow.search_runs(
        experiment_ids=[parent_run.info.experiment_id],
        filter_string=f"tags.mlflow.parentRunId = '{parent_run.info.run_id}'"
    )
    # Highest f1 first, ties broken by accuracy. `.iloc[0]` yields the row
    # itself, so the lookups below are scalars rather than one-element Series
    # (calling float()/int() on a Series is deprecated in pandas).
    best_run = child_runs.sort_values(
        by=['metrics.f1', 'metrics.accuracy'],
        ascending=False
    ).iloc[0]
    best_f1 = float(best_run['metrics.f1'])
    best_accuracy = float(best_run['metrics.accuracy'])
    # Params come back from the tracking store as strings.
    best_max_depth = int(best_run['params.max_depth'])
    best_random_state = int(best_run['params.random_state'])
    best_model = train_model(X, y, best_max_depth, best_random_state)
    mlflow.log_param('best_max_depth', best_max_depth)
    mlflow.log_param('best_random_state', best_random_state)
    mlflow.log_metric('best_f1', best_f1)
    mlflow.log_metric('best_accuracy', best_accuracy)
    mlflow.set_tag('model_estimated_on_full_dataset', "true")
    mlflow.sklearn.log_model(best_model, 'iris_classifier')

100%|██████████| 10/10 [00:01<00:00,  5.53it/s]


## Explore with the MLflow UI

In [10]:
!mlflow u

[2020-12-22 01:12:23 +0000] [96550] [INFO] Starting gunicorn 20.0.4
[2020-12-22 01:12:23 +0000] [96550] [INFO] Listening at: http://127.0.0.1:5000 (96550)
[2020-12-22 01:12:23 +0000] [96550] [INFO] Using worker: sync
[2020-12-22 01:12:23 +0000] [96551] [INFO] Booting worker with pid: 96551
^C
[2020-12-22 01:13:50 +0000] [96550] [INFO] Handling signal: int
[2020-12-22 01:13:50 +0000] [96551] [INFO] Worker exiting (pid: 96551)


## Clean-Up

In [68]:
# Delete the local 'mlruns' directory (presumably where MLflow stored the runs
# above - confirm the tracking URI); a missing directory is not an error.
rmtree(
    'mlruns',
    ignore_errors=True,
)