In [1]:
import os
import pickle
import mlflow as mf
import numpy as np

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient

from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Point the MLflow client at the tracking server listening on 127.0.0.1:5000.
mf.set_tracking_uri(uri='http://127.0.0.1:5000')
# Make this the active experiment for runs started without selecting another one.
mf.set_experiment(experiment_name='raxi-data-experiment')
# Enable scikit-learn autologging, with dataset logging switched off.
mf.sklearn.autolog(log_datasets=False)

2024/05/29 17:22:04 INFO mlflow.tracking.fluent: Experiment with name 'raxi-data-experiment' does not exist. Creating a new experiment.


### Helper functions

In [3]:
def load_pickle(filename: str):
    """Read the pickle file at ``filename`` and return the object it contains.

    NOTE: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.
    """
    with open(filename, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload


### run train

In [4]:
def run_train(data_path: str):
    """Train a baseline random forest inside an MLflow run and score it.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``. Each
            file is unpickled and unpacked as a ``(features, target)`` pair.

    Returns:
        The validation RMSE of the fitted model (also logged to the run as
        the ``rmse`` metric).
    """
    # Load the data before opening the run so that a missing or corrupt file
    # does not leave a failed, empty run behind on the tracking server.
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mf.start_run():
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # sqrt(MSE) instead of `squared=False`: that keyword is deprecated in
        # scikit-learn 1.4 and removed in 1.6. For a single target the value
        # is identical. NOTE(review): assumes y is single-output - confirm.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

        # Previously the score was computed and then dropped; record it
        # explicitly so the run carries its validation result.
        mf.log_metric("rmse", rmse)

    return rmse


### run_optimization

In [5]:
def run_optimization(data_path: str, num_trials: int):
    """Tune random-forest hyperparameters with hyperopt, one MLflow run per trial.

    Args:
        data_path: Directory containing ``train.pkl`` and ``val.pkl``. Each
            file is unpickled and unpacked as a ``(features, target)`` pair.
        num_trials: Number of evaluations passed to ``fmin`` as ``max_evals``.

    Returns:
        Whatever ``fmin`` returns for the best trial (presumably the raw
        sampled values keyed by hyperparameter label - verify against the
        hyperopt documentation before relying on its exact shape).
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    # Select the experiment once up front rather than on every trial; the
    # selection persists for all runs started afterwards.
    mf.set_experiment("random-forest-hyperopt")

    def objective(params):
        """Fit one forest with ``params`` and report validation RMSE as the loss."""
        # The `with` block closes the run on exit, so no explicit end_run()
        # call is needed inside it.
        with mf.start_run():
            mf.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_val)

            # sqrt(MSE) instead of `squared=False`: that keyword is deprecated
            # in scikit-learn 1.4 and removed in 1.6. Identical value for a
            # single target. NOTE(review): assumes y is single-output - confirm.
            rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

            mf.log_metric("rmse", rmse)

        return {'loss': rmse, 'status': STATUS_OK}

    # Every tuned value is drawn on an integer grid (step 1) and cast to int;
    # random_state is held fixed so trials differ only in the tuned values.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )
    return best_params


### register model

In [6]:
# Experiment whose runs are searched for the best hyperparameter sets.
HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
# Experiment under which the retrained candidate models are looked up.
EXPERIMENT_NAME = "random-forest-best-models"
# RandomForestRegressor arguments read back (and cast to int) from a logged run.
RF_PARAMS = [
    'max_depth',
    'n_estimators',
    'min_samples_split',
    'min_samples_leaf',
    'random_state',
]

def train_and_log_model(data_path, params):
    """Retrain a random forest from logged hyperparameters and record its scores.

    Args:
        data_path: Directory containing ``train.pkl``, ``val.pkl`` and
            ``test.pkl``. Each file is unpickled and unpacked as a
            ``(features, target)`` pair.
        params: Mapping from hyperparameter name to value. It must contain
            every name in ``RF_PARAMS``; each value is cast with ``int`` (the
            caller passes ``run.data.params`` from an MLflow run).

    The run is recorded under ``EXPERIMENT_NAME`` with the metrics
    ``val_rmse`` and ``test_rmse``.
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    # The experiment must be selected BEFORE the run is opened. Selecting it
    # inside the `with` block only affects later runs, so the first candidate
    # used to land in whichever experiment was active beforehand and was then
    # missed when picking the best model from EXPERIMENT_NAME.
    mf.set_experiment(EXPERIMENT_NAME)

    rf_kwargs = {name: int(params[name]) for name in RF_PARAMS}

    with mf.start_run():
        # NOTE(review): the model is not logged explicitly here; the caller
        # later resolves it at "runs:/<run_id>/model", which presumably relies
        # on sklearn autologging being enabled - confirm.
        rf = RandomForestRegressor(**rf_kwargs)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets.
        # sqrt(MSE) instead of `squared=False`: that keyword is deprecated in
        # scikit-learn 1.4 and removed in 1.6. Identical value for a single
        # target. NOTE(review): assumes y is single-output - confirm.
        val_rmse = float(np.sqrt(mean_squared_error(y_val, rf.predict(X_val))))
        mf.log_metric("val_rmse", val_rmse)
        test_rmse = float(np.sqrt(mean_squared_error(y_test, rf.predict(X_test))))
        mf.log_metric("test_rmse", test_rmse)

def run_register_model(data_path: str, top_n: int):
    """Retrain the best hyperparameter-search runs and register the winner.

    Args:
        data_path: Directory with the pickled datasets, forwarded to
            ``train_and_log_model``.
        top_n: Maximum number of runs (lowest ``rmse`` first) taken from the
            ``HPO_EXPERIMENT_NAME`` experiment and retrained.

    Raises:
        ValueError: If either experiment does not exist, or if the
            ``EXPERIMENT_NAME`` experiment contains no active runs to pick
            the best model from.
    """
    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    hpo_experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    if hpo_experiment is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"MLflow experiment {HPO_EXPERIMENT_NAME!r} not found; "
            "run the hyperparameter search first."
        )
    runs = client.search_runs(
        experiment_ids=[hpo_experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.rmse ASC"]
    )
    for run in runs:
        train_and_log_model(data_path=data_path, params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    if experiment is None:
        raise ValueError(
            f"MLflow experiment {EXPERIMENT_NAME!r} not found; "
            "no candidate models were logged."
        )
    candidates = client.search_runs(
        experiment_ids=[experiment.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=1,
        order_by=["metrics.test_rmse ASC"]
    )
    if not candidates:
        # Guard the [0] lookup so an empty experiment gives a readable error.
        raise ValueError(
            f"MLflow experiment {EXPERIMENT_NAME!r} has no active runs to register."
        )
    best_run = candidates[0]

    # Register the best model
    model_uri = f"runs:/{best_run.info.run_id}/model"
    model_name = "nyc-taxi-regressor"
    mf.register_model(model_uri=model_uri, name=model_name)


### Q1 Check the version of MLflow

In [7]:
# Show the version string of the imported MLflow package as the cell's result.
mf.__version__

'2.13.0'

### Q2 Get and preprocess the data

In [8]:
# Shell out to the preprocessing script, passing ../data/ as the raw data path
# and ./output as the destination path.
!python toolbox/preprocess_data.py --raw_data_path ../data/ --dest_path ./output

4 files are created in the output folder

### Q3 Train the model

In [9]:
# Train the baseline model on the datasets stored under ./output.
run_train(data_path="./output")

Value of the `min_samples_split` parameter: 2

### Q4 Launch the tracking server locally

Full command: `mlflow server --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts`

### Q5. Tune model hyperparameters

In [10]:
# Run the hyperparameter search for 10 trials on the datasets under ./output.
run_optimization(data_path="./output", num_trials=10)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

2024/05/29 17:22:45 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.



100%|██████████| 10/10 [00:36<00:00,  3.64s/trial, best loss: 5.354700855292386]


best validation RMSE: 5.354700855292386 

### Q6 Promote the best model to the model registry

In [11]:
# Retrain the 5 best search runs and register the one with the lowest test RMSE.
run_register_model(data_path='./output', top_n=5)

2024/05/29 17:23:27 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'nyc-taxi-regressor'.
2024/05/29 17:23:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-regressor, version 1
Created version '1' of model 'nyc-taxi-regressor'.


test RMSE of the best model: 5.59