In [None]:
!pip install mlflow -q
!pip install pyngrok -q

In [2]:
import mlflow
import subprocess
from pyngrok import ngrok, conf
import getpass

In [None]:
# Define the MLflow tracking URI with SQLite
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

# Start the MLflow server using subprocess
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI, "--port", "5000"])

In [4]:
# Set MLflow tracking URI
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [None]:
# Set up ngrok for exposing the MLflow UI
print("Enter your authtoken, which can be copied from https://dashboard.ngrok.com/auth")
conf.get_default().auth_token = getpass.getpass()

In [None]:
# Expose the MLflow UI on port 5000
port = 5000
public_url = ngrok.connect(port).public_url
print(f' * ngrok tunnel "{public_url}" -> "http://127.0.0.1:{port}"')

In [7]:
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

import sklearn
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

from mlflow import MlflowClient
from mlflow.entities import ViewType

In [8]:
mlflow.set_experiment('Exp_1')

2025/05/23 13:15:00 INFO mlflow.tracking.fluent: Experiment with name 'Exp_1' does not exist. Creating a new experiment.


<Experiment: artifact_location='/content/mlruns/1', creation_time=1748006100220, experiment_id='1', last_update_time=1748006100220, lifecycle_stage='active', name='Exp_1', tags={}>

**Que 1**. Mlflow version?

In [9]:
!mlflow --version

mlflow, version 2.22.0


In [10]:
def preprocess_data():

    def dump_pickle(obj, filename: str):
        with open(filename, "wb") as f_out:
            return pickle.dump(obj, f_out)

    def read_dataframe(filename: str):
        df = pd.read_parquet(filename)

        df['duration'] = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
        df.duration = df.duration.apply(lambda td: td.total_seconds() / 60)
        df = df[(df.duration >= 1) & (df.duration <= 60)]

        categorical = ['PULocationID', 'DOLocationID']
        df[categorical] = df[categorical].astype(str)

        return df

    def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):

        df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
        categorical = ['PU_DO']
        numerical = ['trip_distance']
        dicts = df[categorical + numerical].to_dict(orient='records')

        if fit_dv: X = dv.fit_transform(dicts)
        else: X = dv.transform(dicts)
        return X, dv

    def run_data_prep(raw_data_path='/content', dest_path='/content/Output', dataset: str = "green"):

        df_train = read_dataframe(os.path.join(raw_data_path, f"{dataset}_tripdata_2023-01.parquet"))
        df_val = read_dataframe(os.path.join(raw_data_path, f"{dataset}_tripdata_2023-02.parquet"))
        df_test = read_dataframe(os.path.join(raw_data_path, f"{dataset}_tripdata_2023-03.parquet"))

        target = 'duration'
        y_train = df_train[target].values
        y_val = df_val[target].values
        y_test = df_test[target].values

        dv = DictVectorizer()
        X_train, dv = preprocess(df_train, dv, fit_dv=True)
        X_val, _ = preprocess(df_val, dv, fit_dv=False)
        X_test, _ = preprocess(df_test, dv, fit_dv=False)

        os.makedirs(dest_path, exist_ok=True)

        dump_pickle(dv, os.path.join(dest_path, "dv.pkl"))
        dump_pickle((X_train, y_train), os.path.join(dest_path, "train.pkl"))
        dump_pickle((X_val, y_val), os.path.join(dest_path, "val.pkl"))
        dump_pickle((X_test, y_test), os.path.join(dest_path, "test.pkl"))

    run_data_prep()

In [12]:
preprocess_data()

**Que 2**. Number of files that were saved in output folder?

In [13]:
!ls /content/Output | wc -l

4


In [14]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [15]:
def train():

    def load_pickle(filename: str):
        with open(filename, "rb") as f_in:
            return pickle.load(f_in)

    def run_train(data_path='/content/Output'):

        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        mlflow.autolog()

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        rmse = root_mean_squared_error(y_val, y_pred)

    run_train()

In [16]:
train()

2025/05/23 13:16:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/05/23 13:16:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for statsmodels.
2025/05/23 13:16:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2025/05/23 13:16:25 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '6e36864218b64273a3fd769044f643b1', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


**Que 3**. Value of min_sample_slit?

In [17]:
run = client.search_runs(experiment_ids=[client.get_experiment_by_name('Exp_1').experiment_id])

In [18]:
splits = run[0].data.params['min_samples_split']

In [19]:
print(f'min_sample_split: {splits}')

min_sample_split: 2


In [20]:
# Define the MLflow tracking URI with SQLite
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
FOLDER = '/artifacts'

# Start the MLflow server using subprocess
subprocess.Popen(["mlflow", "ui", "--backend-store-uri", MLFLOW_TRACKING_URI, '--default-artifact-root', FOLDER, "--port", "5000"])

<Popen: returncode: None args: ['mlflow', 'ui', '--backend-store-uri', 'sqli...>

In [21]:
# Expose the MLflow UI on port 5000
port = 5000
public_url = ngrok.connect(port).public_url
print(f' * ngrok tunnel "{public_url}" -> "http://127.0.0.1:{port}"')

 * ngrok tunnel "https://7b45-35-194-151-48.ngrok-free.app" -> "http://127.0.0.1:5000"


In [22]:
def hpo():

    mlflow.set_tracking_uri("http://127.0.0.1:5000")
    mlflow.set_experiment("random-forest-hyperopt")

    def load_pickle(filename: str):
        with open(filename, "rb") as f_in:
            return pickle.load(f_in)

    def run_optimization(data_path='/content/Output', num_trials=15):

        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

        def objective(params):

            with mlflow.start_run():

                mlflow.log_params(params)

                rf = RandomForestRegressor(**params)
                rf.fit(X_train, y_train)
                y_pred = rf.predict(X_val)
                rmse = root_mean_squared_error(y_val, y_pred)

                mlflow.log_metric('RMSE', rmse)

                return {'loss': rmse, 'status': STATUS_OK}

        search_space = {
            'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
            'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
            'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
            'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
            'random_state': 42
        }

        rstate = np.random.default_rng(42)
        fmin(
            fn=objective,
            space=search_space,
            algo=tpe.suggest,
            max_evals=num_trials,
            trials=Trials(),
            rstate=rstate
        )

    run_optimization()

In [23]:
hpo()

2025/05/23 13:17:40 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-hyperopt' does not exist. Creating a new experiment.


🏃 View run capable-dolphin-712 at: http://127.0.0.1:5000/#/experiments/2/runs/0f2d91d499a74f9c9a2ac18b8f9522c2

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run wistful-dolphin-208 at: http://127.0.0.1:5000/#/experiments/2/runs/a7b4b590002848f4971ace482fcf9bc6

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run clumsy-gnu-0 at: http://127.0.0.1:5000/#/experiments/2/runs/732b73db6b7e40a18d5536e25f433938

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run peaceful-gnat-832 at: http://127.0.0.1:5000/#/experiments/2/runs/eb678ac06ef0468693db8cdf91fdf1c1

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run unruly-fish-721 at: http://127.0.0.1:5000/#/experiments/2/runs/2c2b8cbb45904f27be139595488ba0a9

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2

🏃 View run persistent-fowl-168 at: http://127.0.0.1:5000/#/experiments/2/runs/f5e2da6ebc6b40f5bcfcbe70dc49e2cc

🧪 View experiment at: http://127.0.0

**Que 5.** Minimum validation RMSE value?

In [24]:
runs = client.search_runs(experiment_ids=[client.get_experiment_by_name('random-forest-hyperopt').experiment_id])
valid_rmse = sorted([run.data.metrics['RMSE'] for run in runs])
print(f'minimum_validation_rmse: {valid_rmse[0]}')

minimum_validation_rmse: 5.335419588556921


In [25]:
def register_model():
    HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
    EXPERIMENT_NAME = "random-forest-best-models"
    RF_PARAMS = ['max_depth', 'n_estimators', 'min_samples_split', 'min_samples_leaf', 'random_state']

    mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
    mlflow.set_experiment(EXPERIMENT_NAME)
    mlflow.sklearn.autolog()


    def load_pickle(filename):
        with open(filename, "rb") as f_in:
            return pickle.load(f_in)


    def train_and_log_model(data_path, params):
        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))
        X_test, y_test = load_pickle(os.path.join(data_path, "test.pkl"))

        with mlflow.start_run():
            new_params = {}
            for param in RF_PARAMS:
                new_params[param] = int(params[param])

            rf = RandomForestRegressor(**new_params)
            rf.fit(X_train, y_train)

            # Evaluate model on the validation and test sets
            val_rmse = root_mean_squared_error(y_val, rf.predict(X_val))
            mlflow.log_metric("val_rmse", val_rmse)
            test_rmse = root_mean_squared_error(y_test, rf.predict(X_test))
            mlflow.log_metric("test_rmse", test_rmse)

    def run_register_model(data_path='/content/Output', top_n=5):

        client = MlflowClient()

        # Retrieve the top_n model runs and log the models
        experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
        runs = client.search_runs(
            experiment_ids=experiment.experiment_id,
            run_view_type=ViewType.ACTIVE_ONLY,
            max_results=top_n,
            order_by=["metrics.rmse ASC"]
        )
        for run in runs:
            train_and_log_model(data_path=data_path, params=run.data.params)

        # Select the model with the lowest test RMSE
        experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
        best_run = client.search_runs(
            experiment_ids=experiment.experiment_id,
            order_by=['metrics.test_rmse ASC']
        )[0]

        # Register the best model
        mlflow.register_model(
            model_uri=f'runs:/{best_run.info.run_id}/model',
            name='random-forest-model-registry'
        )

    run_register_model()


In [26]:
register_model()

2025/05/23 13:22:25 INFO mlflow.tracking.fluent: Experiment with name 'random-forest-best-models' does not exist. Creating a new experiment.
Successfully registered model 'random-forest-model-registry'.
Created version '1' of model 'random-forest-model-registry'.


**Que 6.** Test_rmse for regitered model?


In [27]:
model = client.get_registered_model('random-forest-model-registry')
run_id = model.latest_versions[0].run_id
test_rmse = client.get_run(run_id).data.metrics['test_rmse']
print(f'test_rmse: {test_rmse}')

test_rmse: 5.567408012462019
