In [1]:
import pathlib
import pickle
from datetime import datetime

import dagshub
import mlflow
import pandas as pd
import utils
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_percentage_error

In [2]:
# Initialise the DagsHub integration for this repository with MLflow enabled.
dagshub.init(url="https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction", mlflow=True)

# Keep the resolved tracking URI in a constant: later cells pass it to MlflowClient.
MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

# Re-apply the URI explicitly and select the experiment all runs below are logged to.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow


<Experiment: artifact_location='mlflow-artifacts:/63ce2eff028d42fda8747e5857239051', creation_time=1726773560535, experiment_id='0', last_update_time=1726773560535, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
# January 2024 is the training split, February 2024 the validation split.
# utils.read_dataframe is a project helper not shown in this notebook —
# presumably it also provides the `duration` column used as target below; verify in utils.
df_train = utils.read_dataframe('../data/green_tripdata_2024-01.parquet')
df_val = utils.read_dataframe('../data/green_tripdata_2024-02.parquet')

In [4]:
# Combine pickup and drop-off location IDs into a single "PU_DO" feature, joined by "_".
# NOTE(review): the `+ '_' +` concatenation needs both ID columns to be strings —
# presumably utils.read_dataframe casts them; confirm.
df_train['PU_DO'] = df_train['PULocationID'] + '_' + df_train['DOLocationID']
df_val['PU_DO'] = df_val['PULocationID'] + '_' + df_val['DOLocationID']

In [5]:
# Feature columns: the combined pickup/drop-off pair (categorical) and the trip distance (numerical).
categorical = ['PU_DO']  # the combined pair is used instead of the separate PULocationID / DOLocationID columns
numerical = ['trip_distance']
dv = DictVectorizer()

# Fit the vectorizer on the training records only ...
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# ... and reuse the fitted vectorizer (transform, not fit_transform) for validation.
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [6]:
# Regression target: the `duration` column of each split, extracted as plain arrays.
target = 'duration'
y_train = df_train[target].values
y_val = df_val[target].values

In [7]:
# Wrap both splits as MLflow dataset objects, named after their source files.
# NOTE(review): if X_train / X_val are SciPy sparse matrices (DictVectorizer's default
# output), `.data` is the flat array of stored values rather than a 2-D feature
# matrix — confirm this is what should be recorded.
# NOTE(review): neither dataset object is referenced again in the visible notebook.
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2024-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2024-02")

In [8]:
# Turn on MLflow's scikit-learn autologging for the estimators fitted in the cells below.
mlflow.sklearn.autolog()



# GradientBoosting

In [9]:
def objective(params):
    """Hyperopt objective for the GradientBoosting search.

    Every call opens its own nested MLflow run, tags it with the model
    family, logs the sampled hyper-parameters, fits a
    GradientBoostingRegressor on the training split, logs the fitted model
    under the artifact path "model" and records the validation RMSE.

    Args:
        params: Keyword arguments forwarded unchanged to
            GradientBoostingRegressor.

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "GradientBoost")
        mlflow.log_params(params)

        # Fit first, then persist the fitted scikit-learn estimator.
        model = GradientBoostingRegressor(**params)
        model.fit(X_train, y_train)
        mlflow.sklearn.log_model(model, artifact_path="model")

        # Score on the validation split and record the metric in this run.
        val_rmse = root_mean_squared_error(y_val, model.predict(X_val))
        mlflow.log_metric("rmse", val_rmse)

    return {'loss': val_rmse, 'status': STATUS_OK}

Definir el espacio de búsqueda

In [10]:
# Parent run for the GradientBoosting search; every trial evaluated by `objective`
# opens its own nested run inside this one.
with mlflow.start_run(run_name="GradientBoosting Hyper-parameter Optimization", nested=True):
    # max_depth and learning_rate are sampled; loss and random_state are held fixed.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'learning_rate': hp.loguniform('learning_rate', -3, 0),
        'loss': 'squared_error',
        'random_state': 20020906
    }

    # Ten TPE-guided evaluations of `objective`, with a fresh Trials store.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    # Rebuild a complete estimator configuration from the search result.
    # NOTE(review): presumably fmin returns only the sampled hp.* values (max_depth as
    # a float), hence the int cast and the re-added constants — confirm against hyperopt.
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["random_state"] = 20020906
    best_params["loss"] = "squared_error"

    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "GradientBoost",
            "feature_set_version": 1,
        }
    )

    # Refit on the training split with the best parameters found.
    # NOTE(review): there is no explicit log_model call here; the "model" artifact that is
    # registered from this run later presumably comes from the sklearn autologging
    # enabled earlier — confirm.
    booster = GradientBoostingRegressor(**best_params).fit(X_train, y_train)

    y_pred = booster.predict(X_val)

    # Validation RMSE of the refitted model, logged on the parent run.
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer locally and attach it to the run as an artifact.
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/19 20:32:49 INFO mlflow.tracking._tracking_service.client: 🏃 View run beautiful-croc-570 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f09529bfb06c4bf6896afbf0a987f977.

2024/09/19 20:32:49 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [03:31<31:40, 211.12s/trial, best loss: 5.4451349464189045]



2024/09/19 20:33:59 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-robin-361 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/dca26f968d2f4ae584ebbc09721d7380.

2024/09/19 20:33:59 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [04:41<17:06, 128.27s/trial, best loss: 5.2536230297782165]



2024/09/19 20:35:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run skillful-wasp-215 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/74ba6839920c4203875301ee4ae041f9.

2024/09/19 20:35:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [06:23<13:33, 116.20s/trial, best loss: 5.2536230297782165]



2024/09/19 20:37:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run youthful-grouse-368 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/9604706aefe84fe2b3738bf86803901c.

2024/09/19 20:37:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [07:46<10:18, 103.11s/trial, best loss: 5.2536230297782165]



2024/09/19 20:39:20 INFO mlflow.tracking._tracking_service.client: 🏃 View run thoughtful-carp-478 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a5a959269ce4429a853a1cfc4be94d47.

2024/09/19 20:39:20 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [10:02<09:35, 115.14s/trial, best loss: 5.2536230297782165]



2024/09/19 20:41:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-stag-644 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/321dfefd03cd4a06bef40b13e295633d.

2024/09/19 20:41:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [12:26<08:19, 124.81s/trial, best loss: 5.2536230297782165]



2024/09/19 20:42:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run charming-hound-429 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/a148803b9c824d358ade337a7c14a4ad.

2024/09/19 20:42:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [12:57<04:42, 94.06s/trial, best loss: 5.2536230297782165] 



2024/09/19 20:42:43 INFO mlflow.tracking._tracking_service.client: 🏃 View run worried-steed-376 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/39e22f552fa642a6b673cc1a4bc72627.

2024/09/19 20:42:43 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [13:25<02:26, 73.10s/trial, best loss: 5.2536230297782165]



2024/09/19 20:43:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run useful-flea-442 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/0c330af15eda4325ab6e40e657486c70.

2024/09/19 20:43:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [14:11<01:04, 64.83s/trial, best loss: 5.249297572869492] 



2024/09/19 20:44:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-mule-843 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/0aaa5022cab940b289016567ee065166.

2024/09/19 20:44:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [15:23<00:00, 92.36s/trial, best loss: 5.249297572869492]


2024/09/19 20:45:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run GradientBoosting Hyper-parameter Optimization at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/4eb841d914494fffb70c0eb5c7119ae2.
2024/09/19 20:45:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [11]:
# Ask which run to register under "nyc-taxi-model".
# Whitespace around a pasted ID is stripped. A blank answer falls back to the
# most recent MLflow run of this session instead of building the malformed
# URI "runs://model".
run_id = input("Ingrese el run_id").strip()
if not run_id:
    last_run = mlflow.last_active_run()
    if last_run is None:
        raise ValueError("No run_id was entered and this session has no previous MLflow run.")
    run_id = last_run.info.run_id
run_uri = f"runs:/{run_id}/model"

# Creates the registered model if needed, otherwise adds a new version to it.
result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/19 20:46:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 2
Created version '2' of model 'nyc-taxi-model'.


In [12]:
client = mlflow.MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "GradientChallenger"
date = datetime.today()
# Use the version that register_model just created instead of a hard-coded
# number, so the alias cannot land on the wrong version when the registry
# already holds a different number of versions.
model_version = str(result.version)

# Point the "GradientChallenger" alias at the freshly registered version.
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

# Record on the version itself when it received the alias; being the last
# expression, the updated ModelVersion is displayed as the cell output.
client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['GradientChallenger'], creation_timestamp=1726800408780, current_stage='None', description=('The model version 2 was transitioned to GradientChallenger on 2024-09-19 '
 '20:46:48.220837'), last_updated_timestamp=1726800409354, name='nyc-taxi-model', run_id='4eb841d914494fffb70c0eb5c7119ae2', run_link='', source='mlflow-artifacts:/63ce2eff028d42fda8747e5857239051/4eb841d914494fffb70c0eb5c7119ae2/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>

# RandomForest

In [13]:
def objective(params):
    """Hyperopt objective for the RandomForest search.

    Every call opens its own nested MLflow run, tags it with the model
    family, logs the sampled hyper-parameters, fits a RandomForestRegressor
    on the training split, logs the fitted model under the artifact path
    "model" and records the validation RMSE.

    Note: this definition replaces the GradientBoosting `objective` defined
    earlier in the notebook.

    Args:
        params: Keyword arguments forwarded unchanged to
            RandomForestRegressor.

    Returns:
        dict: ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "RandomForest")
        mlflow.log_params(params)

        # Fit first, then persist the fitted scikit-learn estimator.
        forest = RandomForestRegressor(**params)
        forest.fit(X_train, y_train)
        mlflow.sklearn.log_model(forest, artifact_path="model")

        # Score on the validation split and record the metric in this run.
        val_rmse = root_mean_squared_error(y_val, forest.predict(X_val))
        mlflow.log_metric("rmse", val_rmse)

    return {'loss': val_rmse, 'status': STATUS_OK}

In [14]:
# Parent run for the RandomForest search; every trial evaluated by `objective`
# opens its own nested run inside this one.
with mlflow.start_run(run_name="RandomForest Hyper-parameter Optimization", nested=True):
    # max_depth and n_estimators are sampled; criterion and random_state are held fixed.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 4, 100, 1)),
        'criterion': 'squared_error',
        'random_state': 20020906
    }

    # Ten TPE-guided evaluations of `objective`, with a fresh Trials store.
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    # Rebuild a complete estimator configuration from the search result.
    # NOTE(review): presumably fmin returns only the sampled hp.* values (as floats),
    # hence the int casts and the re-added constants — confirm against hyperopt.
    best_params["max_depth"] = int(best_params["max_depth"])
    best_params["n_estimators"] = int(best_params["n_estimators"])
    best_params["random_state"] = 20020906
    best_params["criterion"] = "squared_error"

    mlflow.log_params(best_params)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "NYC Taxi Time Prediction Project",
            "optimizer_engine": "hyper-opt",
            "model_family": "RandomForest",
            "feature_set_version": 1,
        }
    )

    # Refit on the training split with the best parameters found (the variable is
    # still called `booster` although it holds a random forest).
    # NOTE(review): there is no explicit log_model call here; the "model" artifact that is
    # registered from this run later presumably comes from the sklearn autologging
    # enabled earlier — confirm.
    booster = RandomForestRegressor(**best_params).fit(X_train, y_train)

    y_pred = booster.predict(X_val)

    # Validation RMSE of the refitted model, logged on the parent run.
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Persist the fitted DictVectorizer locally (overwriting any earlier copy at the
    # same path) and attach it to the run as an artifact.
    pathlib.Path("models").mkdir(exist_ok=True)
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)

    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]



2024/09/19 20:47:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run learned-newt-451 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/1d6944c1d84046b2bb15435d4592aed1.

2024/09/19 20:47:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 10%|█         | 1/10 [00:35<05:23, 35.97s/trial, best loss: 5.465890014385762]



2024/09/19 20:48:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run sneaky-lark-323 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/7390c025a47b4ec5b3ab127eb021e7a1.

2024/09/19 20:48:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 20%|██        | 2/10 [01:12<04:51, 36.45s/trial, best loss: 5.464003457608055]



2024/09/19 20:50:26 INFO mlflow.tracking._tracking_service.client: 🏃 View run clean-chimp-662 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/dc509c60a3ca40ea9358f4c6509eae7a.

2024/09/19 20:50:26 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 30%|███       | 3/10 [03:37<10:01, 85.93s/trial, best loss: 5.356859041160056]



2024/09/19 20:52:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run sassy-cod-644 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/6f68408a45584f77921c8301bee6a4ce.

2024/09/19 20:52:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 40%|████      | 4/10 [05:18<09:10, 91.72s/trial, best loss: 5.355434161417264]



2024/09/19 20:53:06 INFO mlflow.tracking._tracking_service.client: 🏃 View run thundering-seal-315 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/6b3dd69f6a934b7db553a218fb49eeed.

2024/09/19 20:53:06 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 50%|█████     | 5/10 [06:17<06:39, 79.99s/trial, best loss: 5.355434161417264]



2024/09/19 20:55:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run vaunted-rook-370 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/303de61d476d49dc8922bca63f96b194.

2024/09/19 20:55:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 60%|██████    | 6/10 [08:25<06:25, 96.30s/trial, best loss: 5.355434161417264]



2024/09/19 20:55:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run thoughtful-roo-64 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/985015b3abea4c16ab94be1c33728402.

2024/09/19 20:55:41 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 70%|███████   | 7/10 [08:52<03:41, 73.69s/trial, best loss: 5.355434161417264]



2024/09/19 21:00:04 INFO mlflow.tracking._tracking_service.client: 🏃 View run wistful-crow-800 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/53de4d44555b4c05b5c49e4017389179.

2024/09/19 21:00:04 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 80%|████████  | 8/10 [13:16<04:28, 134.20s/trial, best loss: 5.355434161417264]



2024/09/19 21:01:08 INFO mlflow.tracking._tracking_service.client: 🏃 View run shivering-ray-571 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/074cb82f058e4d8c8f97506520dedb39.

2024/09/19 21:01:08 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



 90%|█████████ | 9/10 [14:19<01:52, 112.05s/trial, best loss: 5.355434161417264]



2024/09/19 21:02:44 INFO mlflow.tracking._tracking_service.client: 🏃 View run angry-dove-139 at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/16bc4e6019a644998077f9a9c129f410.

2024/09/19 21:02:44 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.



100%|██████████| 10/10 [15:56<00:00, 95.61s/trial, best loss: 5.354853498846549] 


2024/09/19 21:04:15 INFO mlflow.tracking._tracking_service.client: 🏃 View run RandomForest Hyper-parameter Optimization at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0/runs/f62d1733bbfc4c89aaaf9f234759e2bf.
2024/09/19 21:04:15 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/G4ll4rd0/nyc-taxi-time-prediction.mlflow/#/experiments/0.


In [15]:
# Ask which run to register under "nyc-taxi-model".
# Whitespace around a pasted ID is stripped. A blank answer falls back to the
# most recent MLflow run of this session instead of building the malformed
# URI "runs://model".
run_id = input("Ingrese el run_id").strip()
if not run_id:
    last_run = mlflow.last_active_run()
    if last_run is None:
        raise ValueError("No run_id was entered and this session has no previous MLflow run.")
    run_id = last_run.info.run_id
run_uri = f"runs:/{run_id}/model"

# Creates the registered model if needed, otherwise adds a new version to it.
result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
2024/09/19 21:04:39 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: nyc-taxi-model, version 3
Created version '3' of model 'nyc-taxi-model'.


In [16]:
client = mlflow.MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="nyc-taxi-model",
    description="Model registry for the NYC Taxi Time Prediction Project",
)

new_alias = "ForestChallenger"
date = datetime.today()
# Use the version that register_model just created instead of a hard-coded
# number, so the alias cannot land on the wrong version when the registry
# already holds a different number of versions.
model_version = str(result.version)

# Point the "ForestChallenger" alias at the freshly registered version.
client.set_registered_model_alias(
    name="nyc-taxi-model",
    alias=new_alias,
    version=model_version
)

# Record on the version itself when it received the alias; being the last
# expression, the updated ModelVersion is displayed as the cell output.
client.update_model_version(
    name="nyc-taxi-model",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['ForestChallenger'], creation_timestamp=1726801480610, current_stage='None', description=('The model version 3 was transitioned to ForestChallenger on 2024-09-19 '
 '21:04:39.997590'), last_updated_timestamp=1726801481098, name='nyc-taxi-model', run_id='f62d1733bbfc4c89aaaf9f234759e2bf', run_link='', source='mlflow-artifacts:/63ce2eff028d42fda8747e5857239051/f62d1733bbfc4c89aaaf9f234759e2bf/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='3'>

# Champion vs Challengers

In [17]:
# Download the March 2024 green-taxi trip file; it is read below as the held-out test month.
!curl -o ../data/green_tripdata_2024-03.parquet https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2024-03.parquet

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 1340k    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1340k  100 1340k    0     0  1208k      0  0:00:01  0:00:01 --:--:-- 1217k


In [18]:
def read_dataframe(filename):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Steps:
      1. Parse the pickup / drop-off timestamps.
      2. Add a ``duration`` column (trip length in minutes).
      3. Keep only trips lasting between 1 and 60 minutes (inclusive).
      4. Cast ``PULocationID`` / ``DOLocationID`` to strings so they can be
         concatenated and treated as categorical values.

    Args:
        filename: Path (or anything ``pd.read_parquet`` accepts) of the file.

    Returns:
        pandas.DataFrame: The filtered trips, as an independent frame.
    """
    df = pd.read_parquet(filename)

    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() makes the filtered result an independent frame, so the column
    # assignment below does not write into a slice of the original
    # (which triggers pandas' SettingWithCopyWarning).
    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    df = df[in_range].copy()

    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    return df


def preprocess(df, dv):
    """Turn a trips frame into the feature matrix produced by a fitted vectorizer.

    Adds the combined ``PU_DO`` column to ``df`` in place (pickup and
    drop-off IDs joined by "_"), then feeds the ``PU_DO`` / ``trip_distance``
    records to ``dv.transform``.

    Args:
        df: Trips frame with string ``PULocationID`` / ``DOLocationID``
            columns and a ``trip_distance`` column. It is modified in place.
        dv: Already fitted vectorizer exposing ``transform``.

    Returns:
        Whatever ``dv.transform`` returns for the feature records.
    """
    feature_columns = ['PU_DO', 'trip_distance']
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']
    records = df[feature_columns].to_dict(orient='records')
    return dv.transform(records)


def test_model(name, alias, X_test, y_test):
    """Score the registered model ``name@alias`` on a test set.

    Loads the model through ``mlflow.pyfunc`` using the alias URI
    ``models:/<name>@<alias>``, predicts on ``X_test`` and compares the
    predictions with ``y_test``.

    Args:
        name: Registered model name.
        alias: Alias of the model version to load.
        X_test: Feature matrix passed to the model's ``predict``.
        y_test: True target values.

    Returns:
        dict: ``rmse``, ``r2`` and ``mape`` of the predictions.
    """
    model_uri = f"models:/{name}@{alias}"
    predictions = mlflow.pyfunc.load_model(model_uri).predict(X_test)
    return {
        "rmse": root_mean_squared_error(y_test, predictions),
        'r2': r2_score(y_test, predictions),
        'mape': mean_absolute_percentage_error(y_test, predictions),
    }

In [20]:
# Load the March 2024 trips with the notebook-local read_dataframe defined above.
df = read_dataframe("../data/green_tripdata_2024-03.parquet")

In [21]:
# Reload the DictVectorizer pickled by the training cells above (a file this notebook
# wrote itself; do not point this at untrusted pickle files).
with open("models/preprocessor.b", "rb") as f_in:
    dv = pickle.load(f_in)

In [22]:
# Build the test features with the reloaded vectorizer (this also adds PU_DO to `df`)
# and take the `duration` column as the test target.
X_test = preprocess(df, dv)
target = "duration"
y_test = df[target].values

In [23]:
# Score the model version behind the "champion" alias on the March test set, timing the call.
%time test_model(name="nyc-taxi-model", alias="champion", X_test=X_test, y_test=y_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

CPU times: total: 7.47 s
Wall time: 4.89 s


{'rmse': np.float64(5.226208716048294),
 'r2': 0.672599249427333,
 'mape': np.float64(0.2907673419338061)}

In [24]:
# Score the model version behind the "GradientChallenger" alias on the same test set.
%time test_model(name="nyc-taxi-model", alias="GradientChallenger", X_test=X_test, y_test=y_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

CPU times: total: 1.03 s
Wall time: 2.22 s


{'rmse': np.float64(5.300469297255932),
 'r2': 0.6632288985420741,
 'mape': np.float64(0.28121172742162676)}

In [25]:
# Score the model version behind the "ForestChallenger" alias on the same test set.
%time test_model(name="nyc-taxi-model", alias="ForestChallenger", X_test=X_test, y_test=y_test)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

CPU times: total: 1.42 s
Wall time: 3.01 s


{'rmse': np.float64(5.401799719101317),
 'r2': 0.6502295411224656,
 'mape': np.float64(0.28407109827423793)}

Ninguno de los dos challengers debe ser promovido: aunque ambos bajan el MAPE en ~1 punto porcentual, también se observa un descenso en la R<sup>2</sup>. Esto, sumado a un aumento en el RMSE, sugiere que se debe mantener el primer modelo (champion) en producción por el momento.