In [1]:
import mlflow
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
import numpy as np  

In [5]:
# Point the MLflow client at the tracking server, then select the experiment
# that the first training run in this notebook is recorded under.
mlflow.set_tracking_uri(uri="http://localhost:8080/")
mlflow.set_experiment(experiment_name="my_first_experiment")

<Experiment: artifact_location='mlflow-artifacts:/737292920269680723', creation_time=1748519859311, experiment_id='737292920269680723', last_update_time=1748519859311, lifecycle_stage='active', name='my_first_experiment', tags={}>

In [7]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Load a trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off time minus pick-up time, expressed in
    minutes), keeps only the trips whose duration lies between 1 and 60
    minutes inclusive, and casts the pick-up / drop-off location IDs to
    strings.

    Args:
        filename: Path of the parquet file to read.

    Returns:
        A new, independent DataFrame holding the filtered rows.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion instead of a per-row lambda.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the frame they were sliced from,
    # so the column assignment below does not write into a slice
    # (which triggers SettingWithCopyWarning on pandas < 3).
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df

In [8]:
def preprocess(df: pd.DataFrame, dv: DictVectorizer, fit_dv: bool = False):
    """Turn a trips frame into a feature matrix via a DictVectorizer.

    A combined ``PU_DO`` column (pick-up ID + '_' + drop-off ID) is written
    onto ``df`` itself; ``PU_DO`` and ``trip_distance`` are then converted to
    one dict per row and passed through ``dv``.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and
            ``trip_distance`` columns. It is modified in place.
        dv: Vectorizer applied to the row dicts.
        fit_dv: When true ``dv.fit_transform`` is used, otherwise
            ``dv.transform``.

    Returns:
        ``(X, dv)``: the vectorizer's output and the same ``dv`` object.
    """
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    feature_columns = ['PU_DO', 'trip_distance']
    records = df[feature_columns].to_dict(orient='records')

    # Pick the vectorizer method once, then apply it to the records.
    vectorize = dv.fit_transform if fit_dv else dv.transform
    return vectorize(records), dv


In [9]:
# Three consecutive monthly green_tripdata files: 2023-01 is the training
# split, 2023-02 the validation split and 2023-03 the test split.
path = "data/green_tripdata_2023-01.parquet"
df_train = read_dataframe(path)  # reuse `path` instead of repeating the literal
df_val = read_dataframe('data/green_tripdata_2023-02.parquet')
df_test = read_dataframe('data/green_tripdata_2023-03.parquet')

In [10]:
# Pull the regression target out of each split.
target = 'duration'
y_train, y_val, y_test = (
    split[target].values for split in (df_train, df_val, df_test)
)

# Fit the DictVectorizer on the training split only, then reuse that fitted
# vectorizer to encode the validation and test splits.
dv = DictVectorizer()
X_train, dv = preprocess(df_train, dv, fit_dv=True)
X_val, _ = preprocess(df_val, dv, fit_dv=False)
X_test, _ = preprocess(df_test, dv, fit_dv=False)

In [11]:
# Sanity check: shapes of every feature matrix and target vector.
tuple(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test))

((65946, 5702), (62574, 5702), (69392, 5702), (65946,), (62574,), (69392,))

In [12]:
# Confirm the container types produced by the preprocessing step.
tuple(map(type, (X_train, y_train, dv)))

(scipy.sparse._csr.csr_matrix,
 numpy.ndarray,
 sklearn.feature_extraction._dict_vectorizer.DictVectorizer)

In [13]:
# Baseline run: a fixed-depth random forest trained on the training split and
# scored on the validation split, tracked as a single MLflow run.
mlflow.autolog()
with mlflow.start_run():
    mlflow.set_tag('developer', 'Emmanuel')

    rf = RandomForestRegressor(max_depth=10, random_state=0)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)

    rmse = root_mean_squared_error(y_val, y_pred)
    # Record the validation score under a fixed key; it was previously
    # computed and then discarded. Parameter and model logging is left to
    # autolog (presumably sufficient here; confirm in the MLflow UI).
    mlflow.log_metric('rmse', rmse)

2025/05/29 16:14:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


🏃 View run crawling-swan-13 at: http://localhost:8080/#/experiments/737292920269680723/runs/2911b84b61c7481099a98bf876431158
🧪 View experiment at: http://localhost:8080/#/experiments/737292920269680723


In [5]:

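# Tracking server address used by the cells below: http://127.0.0.1:8080
# Start the server from a shell before running them:
#   mlflow server --host 127.0.0.1 --port 8080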



In [3]:
# Switch the client to the experiment that collects the hyperopt tuning runs.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment(experiment_name="random-forest-hyperopt")

<Experiment: artifact_location='mlflow-artifacts:/651912994835839713', creation_time=1748536528132, experiment_id='651912994835839713', last_update_time=1748536528132, lifecycle_stage='active', name='random-forest-hyperopt', tags={}>

In [20]:
def objective(params):
    """Train one random forest for a hyperopt trial and score it.

    Each call opens its own MLflow run, fits ``RandomForestRegressor(**params)``
    on ``X_train`` / ``y_train`` and evaluates it on ``X_val`` / ``y_val``.

    Args:
        params: Keyword arguments forwarded to ``RandomForestRegressor``.

    Returns:
        Dict with ``loss`` (the validation RMSE) and ``status``
        (``STATUS_OK``), as consumed by ``fmin``.
    """
    with mlflow.start_run():
        mlflow.set_tag('developer', 'Emmanuel')
        mlflow.autolog()

        rf = RandomForestRegressor(**params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)

        # Store the value hyperopt minimises on the run itself, under a fixed
        # key, so trials can be compared and sorted by it in MLflow.
        mlflow.log_metric('rmse', rmse)

    return {'loss': rmse, 'status': STATUS_OK}




In [23]:
# (low, high) arguments handed to hp.quniform for each integer hyperparameter;
# every dimension uses a step of 1 and is cast to int via scope.int.
int_param_bounds = {
    'max_depth': (1, 20),
    'n_estimators': (10, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4),
}
search_space = {
    name: scope.int(hp.quniform(name, low, high, 1))
    for name, (low, high) in int_param_bounds.items()
}
# Constant entry: every trial passes the same random_state to the forest.
search_space['random_state'] = 42

rstate = np.random.default_rng(42)  # for reproducible results
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=20,
    trials=Trials(),
    rstate=rstate,
)

  0%|          | 0/20 [00:00<?, ?trial/s, best loss=?]

2025/05/29 17:06:58 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run orderly-rook-125 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/e129e5e952fa42fdbc2aa40ca233401c

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713

  5%|▌         | 1/20 [00:14<04:31, 14.31s/trial, best loss: 5.370086069268862]

2025/05/29 17:07:12 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run defiant-wasp-530 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/7c4573986aec49df9036e12921d41b04

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 10%|█         | 2/20 [00:19<02:38,  8.82s/trial, best loss: 5.370086069268862]

2025/05/29 17:07:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run fearless-midge-412 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/5120a36184b9464e97ac46625e303146

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 15%|█▌        | 3/20 [00:24<02:01,  7.15s/trial, best loss: 5.370086069268862]

2025/05/29 17:07:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run capable-goose-676 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/7c54933b22c2485f907c368cfb352c2b

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 20%|██        | 4/20 [00:34<02:15,  8.48s/trial, best loss: 5.357490752366866]

2025/05/29 17:07:33 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run rumbling-vole-700 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/1f5954840ac64f3192abe008758d82cf

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 25%|██▌       | 5/20 [00:42<02:03,  8.24s/trial, best loss: 5.357490752366866]

2025/05/29 17:07:41 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run gregarious-hawk-904 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/1f3a2677d2de43aa843c0cffc54588f0

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 30%|███       | 6/20 [00:57<02:25, 10.42s/trial, best loss: 5.354695072530291]

2025/05/29 17:07:55 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run funny-swan-586 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/7ae149f2bbcd46ad9028a1d3d4211534

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 35%|███▌      | 7/20 [01:12<02:34, 11.86s/trial, best loss: 5.354695072530291]

2025/05/29 17:08:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run masked-vole-675 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/d69e4cee78254d049e88a88355ac48fd

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 40%|████      | 8/20 [01:17<01:58,  9.89s/trial, best loss: 5.354695072530291]

2025/05/29 17:08:16 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run abundant-bug-410 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/29e1090a7bb64ab08f1eb9889a97608d

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 45%|████▌     | 9/20 [01:29<01:52, 10.27s/trial, best loss: 5.354695072530291]

2025/05/29 17:08:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run placid-mink-114 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/f28ce4d0b0b5440683b3ae21a9daf8cf

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713  

 50%|█████     | 10/20 [01:38<01:41, 10.15s/trial, best loss: 5.354695072530291]

2025/05/29 17:08:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run whimsical-shrike-188 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/3c762600596444299847592d5c48802c

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 55%|█████▌    | 11/20 [01:47<01:26,  9.65s/trial, best loss: 5.335419588556921]

2025/05/29 17:08:46 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run redolent-goose-264 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/7ea9ca1ef1384803876b0561a499720c

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 60%|██████    | 12/20 [01:55<01:13,  9.16s/trial, best loss: 5.335419588556921]

2025/05/29 17:08:53 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run legendary-shrike-482 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/847c4e38934a484ebe4c0b4f4b2a020a

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 65%|██████▌   | 13/20 [02:01<00:56,  8.09s/trial, best loss: 5.335419588556921]

2025/05/29 17:08:59 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run spiffy-perch-11 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/a70b854d74954e9aa6c7178002e27d53

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 70%|███████   | 14/20 [02:09<00:48,  8.11s/trial, best loss: 5.335419588556921]

2025/05/29 17:09:07 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run bouncy-jay-618 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/75c23793197842e7bacefc5a99092875

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 75%|███████▌  | 15/20 [02:19<00:43,  8.75s/trial, best loss: 5.335419588556921]

2025/05/29 17:09:17 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run agreeable-vole-160 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/f14b5fddd0dc4665a7e1677722995749

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 80%|████████  | 16/20 [02:27<00:34,  8.60s/trial, best loss: 5.335419588556921]

2025/05/29 17:09:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run shivering-sow-984 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/8e2f4de5186c4863ae8f8df69485d2db

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 85%|████████▌ | 17/20 [02:38<00:27,  9.29s/trial, best loss: 5.335419588556921]

2025/05/29 17:09:37 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run wistful-robin-188 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/1ed2dddf4b7c4a6982c65198fdf247d9

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 90%|█████████ | 18/20 [02:51<00:20, 10.44s/trial, best loss: 5.322418787243458]

2025/05/29 17:09:50 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run able-snake-838 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/4c1f56b29334479282f790749dad15e3

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

 95%|█████████▌| 19/20 [02:58<00:09,  9.40s/trial, best loss: 5.322418787243458]

2025/05/29 17:09:57 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.



🏃 View run loud-turtle-364 at: http://127.0.0.1:8080/#/experiments/651912994835839713/runs/8aecf638fb444855ac6358ee078838d4

🧪 View experiment at: http://127.0.0.1:8080/#/experiments/651912994835839713   

100%|██████████| 20/20 [03:04<00:00,  9.24s/trial, best loss: 5.322418787243458]
