In [1]:
# import library 
import pandas as pd 
import os
import pickle
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll import scope 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVR
from xgboost import XGBRegressor
from sklearn.ensemble import ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor
import xgboost as xgb
import mlflow.sklearn
import mlflow.xgboost
from sklearn.metrics import root_mean_squared_error


In [2]:
import mlflow

# Set the tracking URI to the same one used in your UI
mlflow.set_tracking_uri("http://127.0.0.1:5000")  

# Create or set the experiment
mlflow.set_experiment("chicago-taxi-experiment")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1753312612818, experiment_id='1', last_update_time=1753312612818, lifecycle_stage='active', name='chicago-taxi-experiment', tags={}>

In [3]:
def read_dataframe(filename: str) -> pd.DataFrame:
    # Define full file path using OneDrive
    local_path = os.path.abspath(os.path.join(os.getcwd(), "..", "Dataset"))
    file_path = os.path.join(local_path, filename)

    # Read the parquet file
    df = pd.read_parquet(file_path)

    # Convert timestamp columns to datetime
    df["trip_start_timestamp"] = pd.to_datetime(df["trip_start_timestamp"], errors="coerce")
    df["trip_end_timestamp"] = pd.to_datetime(df["trip_end_timestamp"], errors="coerce")

    # Convert trip_seconds to numeric and create duration
    df["trip_seconds"] = pd.to_numeric(df["trip_seconds"], errors="coerce")
    df["duration_minutes"] = df["trip_seconds"] / 60

    # Convert other relevant columns to numeric
    df["trip_miles"] = pd.to_numeric(df["trip_miles"], errors="coerce")
    df["duration_minutes"] = pd.to_numeric(df["duration_minutes"], errors="coerce")

    # Drop rows with missing or invalid duration or miles
    df = df.dropna(subset=["duration_minutes", "trip_miles"])
    df = df[df["duration_minutes"] > 0]

    # Extract time features
    df["hour"] = df["trip_start_timestamp"].dt.hour
    df["day_of_week"] = df["trip_start_timestamp"].dt.dayofweek
    df["is_weekend"] = df["day_of_week"] >= 5


    df["fare"] = pd.to_numeric(df["fare"], errors="coerce")
    df["trip_total"] = pd.to_numeric(df["trip_total"], errors="coerce")

    # Combine features
    df["PU_DO"] = df["pickup_community_area"].fillna("NA").astype(str) + "_" + df["dropoff_community_area"].fillna("NA").astype(str)

    # Filter invalid rows before computing derived features
    df = df[df["trip_miles"] > 0]
    df = df[df["duration_minutes"] > 0]

    # Compute derived features
    df["fare_per_mile"] = df["fare"] / df["trip_miles"]
    df["trip_speed"] = df["trip_miles"] / (df["duration_minutes"] / 60)

    # Handle infinite values in fare_per_mile and trip_speed
    df["fare_per_mile"].replace([float("inf"), -float("inf")], pd.NA, inplace=True)
    df["trip_speed"].replace([float("inf"), -float("inf")], pd.NA, inplace=True)

    df = df.dropna(subset=["fare_per_mile", "trip_speed"])

    return df

In [4]:
chicago_taxi= read_dataframe("chicago_taxi_2023_01.parquet")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["fare_per_mile"].replace([float("inf"), -float("inf")], pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["trip_speed"].replace([float("inf"), -float("inf")], pd.NA, inplace=True)


In [5]:
# Create train and validation split (e.g., 80% train, 20% validation)
X = chicago_taxi[["trip_miles", "PU_DO","is_weekend", "fare_per_mile", "trip_speed", "hour", "day_of_week"]]
y = chicago_taxi["duration_minutes"]

top_pudo = X["PU_DO"].value_counts().nlargest(1000).index
X["PU_DO"] = X["PU_DO"].where(X["PU_DO"].isin(top_pudo), "Other")


# Encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Split into train/test (validation)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["PU_DO"] = X["PU_DO"].where(X["PU_DO"].isin(top_pudo), "Other")


### Training model pipeline

In [6]:
with mlflow.start_run(run_name="XGBoostRegressor"):
    mlflow.set_tag("developer", "dario")
    mlflow.set_tag("model", "XGBoostRegressor")

    # Initialize and train model
    model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Save preprocessing metadata (e.g., column names)
    preprocessor = {"feature_names": list(X_train.columns)}
    os.makedirs("models", exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(preprocessor, f_out)

    # Log preprocessing to MLflow
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

    # Predict and evaluate
    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    # Log to MLflow
    mlflow.log_param("model_type", "XGBoostRegressor")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("max_depth", 6)
    mlflow.log_param("learning_rate", 0.1)
    mlflow.log_metric("rmse", rmse)

    mlflow.xgboost.log_model(model, "model")
    

    print(f"✅ Validation RMSE: {rmse:.2f}")



✅ Validation RMSE: 9.81
🏃 View run XGBoostRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/d454b8f893924b5b907e96c8fcfc34a1
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


### Train with several base  models 

In [12]:
models = {
    "LinearSVR": LinearSVR(),
    "ExtraTrees": ExtraTreesRegressor(n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(),
    "RandomForest": RandomForestRegressor(n_jobs=-1),
    "XGBoost": XGBRegressor(n_jobs=-1)
}

In [13]:
for name, model in models.items():
    with mlflow.start_run(run_name=name):
        #set the tag name for who response
        mlflow.set_tag("developer","Dario")
        
        # Train the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        
        # Log parameters and metrics
        mlflow.set_tag("model", name)
        mlflow.log_param("train_rows", X_train.shape[0])
        mlflow.log_metric("rmse", rmse)
        
        # Save preprocessing metadata (e.g., column names)
        preprocessor = {"feature_names": list(X_train.columns)}
        os.makedirs("models", exist_ok=True)
        with open("models/preprocessor.b", "wb") as f_out:
            pickle.dump(preprocessor, f_out)

        # Log preprocessing to MLflow
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        

        print(f"{name} RMSE: {rmse:.2f}")



LinearSVR RMSE: 293.67
🏃 View run LinearSVR at: http://127.0.0.1:5003/#/experiments/1/runs/4e6739c22ec04b348f990581f4d5f184
🧪 View experiment at: http://127.0.0.1:5003/#/experiments/1
ExtraTrees RMSE: 20.64
🏃 View run ExtraTrees at: http://127.0.0.1:5003/#/experiments/1/runs/0a738cc34b5f475eb6b88e2814dea0bc
🧪 View experiment at: http://127.0.0.1:5003/#/experiments/1
GradientBoosting RMSE: 5.69
🏃 View run GradientBoosting at: http://127.0.0.1:5003/#/experiments/1/runs/b65d549aeed84aaba8cb34e4f7fc2abf
🧪 View experiment at: http://127.0.0.1:5003/#/experiments/1
RandomForest RMSE: 4.83
🏃 View run RandomForest at: http://127.0.0.1:5003/#/experiments/1/runs/fa53d2cd314b4911b15c3b6f18a7d151
🧪 View experiment at: http://127.0.0.1:5003/#/experiments/1
XGBoost RMSE: 9.84
🏃 View run XGBoost at: http://127.0.0.1:5003/#/experiments/1/runs/c80ea21159eb44959f400c1f941db6c1
🧪 View experiment at: http://127.0.0.1:5003/#/experiments/1


### Hyperparameter tunning with XGBoost

In [6]:
# Create the train data 
train = xgb.DMatrix(X_train, label = y_train)
valid = xgb.DMatrix(X_val, label = y_val)

In [7]:
# define the objective function 
def objective(params):
    "Write the objective function"
    with mlflow.start_run(run_name="XGBoostRegressor"):
        mlflow.set_tag("developer", "dario")
        mlflow.set_tag("model", "xgboost")
        mlflow.log_param("model_type", "XGBoostRegressor")
        mlflow.log_params(params)
        booster = xgb.train(
            params = params,
            dtrain = train,
            num_boost_round = 1000, # n.o of iteration
            evals = [(valid, "validation")],
            early_stopping_rounds = 50
        )
        y_pred = booster.predict(valid)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)
    return {'loss': rmse, 'status': STATUS_OK} 

In [9]:
# define the search space 
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 10, 1)),             # Narrow from 4–100 → 4–10
    'learning_rate': hp.uniform('learning_rate', 0.05, 0.3),                # Narrow to [0.05, 0.3]
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),                             # L1 regularization: 0–1
    'reg_lambda': hp.uniform('reg_lambda', 0.5, 2),                         # L2 regularization: 0.5–2
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),          # Prevent overfitting
    'objective': 'reg:squarederror',                                        # Updated from deprecated 'reg:linear'
    'seed': 42
}

best_result = fmin(
    fn = objective,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 50,
    trials = Trials())

[0]	validation-rmse:27.50564                          
[1]	validation-rmse:25.60322                          
[2]	validation-rmse:23.86288                          
[3]	validation-rmse:22.33518                          
[4]	validation-rmse:20.98373                          
[5]	validation-rmse:19.69780                          
[6]	validation-rmse:18.56922                          
[7]	validation-rmse:17.57172                          
[8]	validation-rmse:16.68471                          
[9]	validation-rmse:15.88081                          
[10]	validation-rmse:15.15408                         
[11]	validation-rmse:14.55658                         
[12]	validation-rmse:14.00015                         
[13]	validation-rmse:13.51837                         
[14]	validation-rmse:13.08592                         
[15]	validation-rmse:12.69337                         
[16]	validation-rmse:12.37145                         
[17]	validation-rmse:12.08784                         
[18]	valid

### Hyperparameter tuning with RandomForest 

In [8]:
# create the globalt tracker 
best_rmse = float("inf")
best_model = None

# define the objective function
def objective(params):
    with mlflow.start_run(run_name="RandomForestRegressor"):
        mlflow.set_tag("developer", "dario")
        mlflow.set_tag("model", "random_forest")
        mlflow.log_params(params)

        # Create the model
        model = RandomForestRegressor(
            n_estimators=int(params["n_estimators"]),
            max_depth=int(params["max_depth"]),
            min_samples_split=int(params["min_samples_split"]),
            min_samples_leaf=int(params["min_samples_leaf"]),
            random_state=42,
            n_jobs=-1
        )

        # Fit and evaluate
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Save preprocessing metadata (e.g., column names)
        preprocessor = {"feature_names": list(X_train.columns)}
        os.makedirs("models", exist_ok=True)
        with open("models/preprocessor.b", "wb") as f_out:
            pickle.dump(preprocessor, f_out)

        # Log preprocessing to MLflow
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

        # Log metric
        mlflow.log_param("model_type", "RandomForestRegressor")
        mlflow.log_metric("rmse", rmse)
        mlflow.sklearn.log_model(model, "model")

        global best_rmse, best_model
        if rmse < best_rmse:
            best_rmse = rmse
            best_model = model

        return {"loss": rmse, "status": STATUS_OK}

In [9]:
# define the search space
search_space = {
    "n_estimators": scope.int(hp.quniform("n_estimators", 50, 100, 10)),      # Reduce upper bound from 200 → 100
    "max_depth": scope.int(hp.quniform("max_depth", 5, 10, 1)),               # Reduce upper bound from 20 → 10
    "min_samples_split": scope.int(hp.quniform("min_samples_split", 2, 5, 1)),# Narrow range
    "min_samples_leaf": scope.int(hp.quniform("min_samples_leaf", 1, 3, 1))   # Narrow range
}

In [10]:
# run hyperparameter tuning
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,
    trials=Trials()
)

# Save best model to disk
with open("models/rf_reg.bin", "wb") as f_out:
    pickle.dump(best_model, f_out)

# show the best result
print("✅ Best Result:", best_result)

  0%|          | 0/30 [00:00<?, ?trial/s, best loss=?]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/6da71d800bb74c90ad8fd4fef47a7f17

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1

  3%|▎         | 1/30 [02:43<1:18:54, 163.26s/trial, best loss: 9.486421183272594]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/2c25ba5ea78043d3bb15ecca742cbd5f

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

  7%|▋         | 2/30 [06:15<1:29:45, 192.33s/trial, best loss: 5.674678523233581]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/a3ddee4f19e94420902f4a685af217c1

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 10%|█         | 3/30 [11:13<1:48:08, 240.32s/trial, best loss: 5.2434785652382425]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/91cead90febc4c0690c78e72e7230cf4

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 13%|█▎        | 4/30 [16:25<1:56:25, 268.67s/trial, best loss: 5.028872192256805] 




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/71d4b851c3ac47e9b97bda5d92a4e40f

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 17%|█▋        | 5/30 [19:15<1:37:04, 232.96s/trial, best loss: 5.028872192256805]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/eed0134aca2a4e6c8550e99763a7c83a

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 20%|██        | 6/30 [21:50<1:22:39, 206.66s/trial, best loss: 5.028872192256805]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/8a1829c89fa94712a937c5d186e89299

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 23%|██▎       | 7/30 [26:49<1:30:48, 236.91s/trial, best loss: 5.028872192256805]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/96742099845d486daf990fb6261fd3d9

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 27%|██▋       | 8/30 [32:55<1:41:51, 277.78s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/1b1bf89c68a84db0ac43136f0d698b3c

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 30%|███       | 9/30 [35:57<1:26:45, 247.86s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/de5b450e209d4106ad851630a6aaa881

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 33%|███▎      | 10/30 [40:05<1:22:37, 247.90s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/c7b7eab8b8864ba9b6aa36bf42317c1d

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 37%|███▋      | 11/30 [45:15<1:24:29, 266.83s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/11275e6faab84f0994442ea2d9dec2ea

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 40%|████      | 12/30 [49:21<1:18:11, 260.65s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/b9169a63cc4a4c328e1fbe2b1e8620aa

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 43%|████▎     | 13/30 [52:43<1:08:46, 242.75s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/cda567537d394e0ea7aed256b0746428

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 47%|████▋     | 14/30 [55:16<57:34, 215.90s/trial, best loss: 4.969727301893083]  




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/cc6cb1cbd47443128cf7010550b32126

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                     

 50%|█████     | 15/30 [56:42<44:09, 176.64s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/b9dd9d702d774a4eac3b5e47e22c986a

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                     

 53%|█████▎    | 16/30 [58:13<35:10, 150.72s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/6c18b05de397436ca12755c8554bb064

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                     

 57%|█████▋    | 17/30 [59:34<28:09, 129.94s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/9c105e6e0570424983da88eb7d1e15db

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 60%|██████    | 18/30 [1:01:45<26:03, 130.30s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/e7dfa94515ec47d7baae19f2d9bba358

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 63%|██████▎   | 19/30 [1:03:15<21:39, 118.16s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/f1753684345949519de4146850d9568e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 67%|██████▋   | 20/30 [1:04:29<17:29, 104.93s/trial, best loss: 4.969727301893083]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/3a2edfbe13fe45999299159884c7a859

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 70%|███████   | 21/30 [1:07:13<18:23, 122.64s/trial, best loss: 4.841008684125414]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/993c16628a924f70ae9c0d447bd6a198

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 73%|███████▎  | 22/30 [1:09:57<18:00, 135.11s/trial, best loss: 4.841008684125413]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/ff469c1abb9c4bd5b94bda657a65d7f9

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 77%|███████▋  | 23/30 [1:12:41<16:46, 143.75s/trial, best loss: 4.841008684125413]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/c769c1e6f15440b5bf8cdc9124b823bf

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 80%|████████  | 24/30 [1:15:39<15:23, 153.91s/trial, best loss: 4.841008684125413]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/27bce70db2544a49b849b35bf3de2cce

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 83%|████████▎ | 25/30 [1:19:22<14:33, 174.71s/trial, best loss: 4.841008684125413]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/7342f5c27a0c419789bf814b8e1d339e

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                       

 87%|████████▋ | 26/30 [1:23:00<12:30, 187.69s/trial, best loss: 4.78491652878406] 




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/7a96ea92f86a4d419f9def0c44c91642

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 90%|█████████ | 27/30 [1:26:14<09:29, 189.67s/trial, best loss: 4.78491652878406]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/697852c9e9c04985894718c60b5afb1a

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 93%|█████████▎| 28/30 [1:29:31<06:23, 191.61s/trial, best loss: 4.78491652878406]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/2b57fe709ddb4745af5363a014393f3a

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

 97%|█████████▋| 29/30 [1:31:44<02:54, 174.16s/trial, best loss: 4.78491652878406]




🏃 View run RandomForestRegressor at: http://127.0.0.1:5000/#/experiments/1/runs/f2bbd6e89f604efab2dd02577493d0c2

🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1                      

100%|██████████| 30/30 [1:35:16<00:00, 190.53s/trial, best loss: 4.78491652878406]
✅ Best Result: {'max_depth': 10.0, 'min_samples_leaf': 2.0, 'min_samples_split': 2.0, 'n_estimators': 100.0}


### Use the MLFlow client classes 

In [18]:
# import the mlflow client library
from mlflow.tracking import MlflowClient 

# set the tracking URI
MLFLOW_TRACKING_URI = "sqlite:///mflow.db"

client = MlflowClient(tracking_uri = MLFLOW_TRACKING_URI)

In [19]:
# Listing all the experiments
client = MlflowClient()
experiments = client.search_experiments()
for exp in experiments:
    print(f"Name: {exp.name}, ID: {exp.experiment_id}")

Name: chicago-taxi-experiment, ID: 1
Name: Default, ID: 0


In [20]:
from mlflow.entities import ViewType 
# Show the best run models
runs = client.search_runs(
    experiment_ids = 1,
    filter_string = "metrics.rmse < 5",
    run_view_type = ViewType.ACTIVE_ONLY,
    max_results = 5, 
    order_by = ["metrics.rmse ASC"]
)

# show the result
for run in runs:
    print(f"run_id: {run.info.run_id}, rmse:{run.data.metrics['rmse']:.4f}")

run_id: 7342f5c27a0c419789bf814b8e1d339e, rmse:4.7849
run_id: 697852c9e9c04985894718c60b5afb1a, rmse:4.7849
run_id: 993c16628a924f70ae9c0d447bd6a198, rmse:4.8410
run_id: 27bce70db2544a49b849b35bf3de2cce, rmse:4.8410
run_id: c769c1e6f15440b5bf8cdc9124b823bf, rmse:4.8410


### Register the best performance model

In [14]:
# Use correct run ID and model URI
run_id = "7342f5c27a0c419789bf814b8e1d339e"
model_uri = f"runs:/{run_id}/model"

# Register the model
mlflow.register_model(model_uri=model_uri, name="chicago-taxi-experiment")

Successfully registered model 'chicago-taxi-experiment'.
2025/07/24 14:27:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: chicago-taxi-experiment, version 1
Created version '1' of model 'chicago-taxi-experiment'.


<ModelVersion: aliases=[], creation_timestamp=1753324052759, current_stage='None', description='', last_updated_timestamp=1753324052759, name='chicago-taxi-experiment', run_id='7342f5c27a0c419789bf814b8e1d339e', run_link='', source='mlflow-artifacts:/1/7342f5c27a0c419789bf814b8e1d339e/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>

### Transition model into another stage 

In [15]:
# get the lastest verion 
model_name = "chicago-taxi-experiment"
lastest_versions = client.get_latest_versions(name = model_name)

for version in lastest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

version: 1, stage: None


  lastest_versions = client.get_latest_versions(name = model_name)


In [16]:
model_version = 1
new_stage = "Production"
# transition model stage
client.transition_model_version_stage(
    name = model_name,
    version = model_version, 
    stage = new_stage,
    archive_existing_versions = False
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1753324052759, current_stage='Production', description='', last_updated_timestamp=1753324084733, name='chicago-taxi-experiment', run_id='7342f5c27a0c419789bf814b8e1d339e', run_link='', source='mlflow-artifacts:/1/7342f5c27a0c419789bf814b8e1d339e/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>

### Change the model description

In [17]:
from datetime import datetime

date = datetime.today().date()
client.update_model_version(
    name = model_name,
    version = model_version, 
    description = f"The model version {model_version} was transition to {new_stage} on {date}"
)

<ModelVersion: aliases=[], creation_timestamp=1753324052759, current_stage='Production', description='The model version 1 was transition to Production on 2025-07-24', last_updated_timestamp=1753324089360, name='chicago-taxi-experiment', run_id='7342f5c27a0c419789bf814b8e1d339e', run_link='', source='mlflow-artifacts:/1/7342f5c27a0c419789bf814b8e1d339e/artifacts/model', status='READY', status_message=None, tags={}, user_id='', version='1'>