In [1]:
import os
import pickle
import click
import mlflow

from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

HPO_EXPERIMENT_NAME = "random-forest-hyperopt-final"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()



In [2]:
def load_pickle(filename):
    with open(filename, "rb") as f_in:
        return pickle.load(f_in)


def train_and_log_model(data_path, params):
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
    X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)

        # Evaluate model on the validation and test sets
        val_rmse = mean_squared_error(y_val, rf.predict(X_val), squared=False)
        mlflow.log_metric("val_rmse", val_rmse)
        test_rmse = mean_squared_error(y_test, rf.predict(X_test), squared=False)
        mlflow.log_metric("test_rmse", test_rmse)

In [3]:
client = MlflowClient()

# Retrieve the top_n model runs and log the models
experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
runs = client.search_runs(
    experiment_ids=experiment.experiment_id,
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=5,
    order_by=["metrics.rmse ASC"]
)

In [4]:
runs

[<Run: data=<RunData: metrics={'rmse': 5.335419588556921}, params={'max_depth': '19',
  'min_samples_leaf': '2',
  'min_samples_split': '2',
  'n_estimators': '11',
  'random_state': '42'}, tags={'mlflow.runName': 'wistful-stoat-569',
  'mlflow.source.name': 'hpo.py',
  'mlflow.source.type': 'LOCAL',
  'mlflow.user': 'abhis',
  'model': 'randomforestregressor'}>, info=<RunInfo: artifact_uri='mlflow-artifacts:/5/e74cf6951b784cc28f96df384f76291c/artifacts', end_time=1716681279330, experiment_id='5', lifecycle_stage='active', run_id='e74cf6951b784cc28f96df384f76291c', run_name='wistful-stoat-569', run_uuid='e74cf6951b784cc28f96df384f76291c', start_time=1716681274997, status='FINISHED', user_id='abhis'>, inputs=<RunInputs: dataset_inputs=[]>>,
 <Run: data=<RunData: metrics={'rmse': 5.354695072530291}, params={'max_depth': '15',
  'min_samples_leaf': '2',
  'min_samples_split': '3',
  'n_estimators': '40',
  'random_state': '42'}, tags={'mlflow.runName': 'sneaky-seal-102',
  'mlflow.source.

In [5]:
for run in runs:
    train_and_log_model(data_path="./output", params=run.data.params)



In [7]:
run.data.params

{'max_depth': 14,
 'n_estimators': 23,
 'min_samples_split': 6,
 'min_samples_leaf': 2,
 'random_state': 42}

In [12]:
best_run = runs[0]

In [13]:
run_id = best_run.info.run_id

In [14]:
run_id

'e74cf6951b784cc28f96df384f76291c'

In [15]:
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri= model_uri, name="nyc-taxi-randomforestregressor " )

Registered model 'nyc-taxi-randomforestregressor ' already exists. Creating a new version of this model...
2024/05/26 13:50:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-randomforestregressor , version 2
Created version '2' of model 'nyc-taxi-randomforestregressor '.


<ModelVersion: aliases=[], creation_timestamp=1716745843338, current_stage='None', description='', last_updated_timestamp=1716745843338, name='nyc-taxi-randomforestregressor\u2002', run_id='e74cf6951b784cc28f96df384f76291c', run_link='', source='mlflow-artifacts:/5/e74cf6951b784cc28f96df384f76291c/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>