In [1]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import root_mean_squared_error

def read_dataframe(path):
    df = pd.read_parquet(path)
    df["duration"] = (df.lpep_dropoff_datetime - df.lpep_pickup_datetime).dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    df[["PULocationID", "DOLocationID"]] = df[["PULocationID", "DOLocationID"]].astype(str)
    return df

df_train = read_dataframe("data/green_tripdata_2025-01.parquet")
df_val = read_dataframe("data/green_tripdata_2025-02.parquet")

df_train["PU_DO"] = df_train["PULocationID"] + "_" + df_train["DOLocationID"]
df_val["PU_DO"] = df_val["PULocationID"] + "_" + df_val["DOLocationID"]

categorical = ["PU_DO"]
numerical = ["trip_distance"]

dv = DictVectorizer()
X_train = dv.fit_transform(df_train[categorical + numerical].to_dict(orient="records"))
X_val = dv.transform(df_val[categorical + numerical].to_dict(orient="records"))

y_train = df_train["duration"].values
y_val = df_val["duration"].values

In [3]:
df_train

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,cbd_congestion_fee,duration,PU_DO
0,2,2025-01-01 00:03:01,2025-01-01 00:17:12,N,1.0,75,235,1.0,5.93,24.70,...,0.00,,1.0,34.00,1.0,1.0,0.00,0.0,14.183333,75_235
1,2,2025-01-01 00:19:59,2025-01-01 00:25:52,N,1.0,166,75,1.0,1.32,8.60,...,0.00,,1.0,11.10,2.0,1.0,0.00,0.0,5.883333,166_75
2,2,2025-01-01 00:05:29,2025-01-01 00:07:21,N,5.0,171,73,1.0,0.41,25.55,...,0.00,,1.0,26.55,2.0,2.0,0.00,0.0,1.866667,171_73
3,2,2025-01-01 00:52:24,2025-01-01 01:07:52,N,1.0,74,223,1.0,4.12,21.20,...,6.94,,1.0,36.77,1.0,1.0,0.00,0.0,15.466667,74_223
4,2,2025-01-01 00:25:05,2025-01-01 01:01:10,N,1.0,66,158,1.0,4.71,33.80,...,0.00,,1.0,46.86,1.0,1.0,2.75,0.0,36.083333,66_158
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48321,2,2025-01-31 19:36:00,2025-01-31 20:05:00,,,179,132,,13.99,55.61,...,0.00,,1.0,68.53,,,,,29.000000,179_132
48322,2,2025-01-31 20:33:00,2025-01-31 20:41:00,,,166,75,,1.51,13.58,...,0.00,,1.0,17.34,,,,,8.000000,166_75
48323,2,2025-01-31 21:09:00,2025-01-31 21:30:00,,,41,42,,2.90,30.89,...,0.00,,1.0,32.39,,,,,21.000000,41_42
48324,2,2025-01-31 22:22:00,2025-01-31 22:25:00,,,75,43,,0.34,14.78,...,0.00,,1.0,18.72,,,,,3.000000,75_43


In [4]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("class-nyc-taxi-experiment")

with mlflow.start_run(run_name="lasso_alpha_0.1"):
    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    model = Lasso(alpha=alpha)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.sklearn.log_model(model, "model")

    print(f"✅ Run finalizado. RMSE = {rmse:.4f}")


2025/10/23 21:37:39 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/23 21:37:39 INFO mlflow.store.db.utils: Updating database tables
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
2025-10-23 21:37:39 INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
2025-10-23 21:37:39 INFO  [alembic.runtime.mig

✅ Run finalizado. RMSE = 8.9926


In [5]:
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR

In [6]:
import mlflow

In [7]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc-taxi-model-registry-example")

2025/10/23 21:39:35 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-model-registry-example' does not exist. Creating a new experiment.


<Experiment: artifact_location=('file:///c:/Users/canal/Documents/Semestre 7/Proyecto de Ciencia de '
 'Datos/nyc-predictions/nyc-taxi-predictions-2025-2/mlruns/2'), creation_time=1761277175133, experiment_id='2', last_update_time=1761277175133, lifecycle_stage='active', name='nyc-taxi-model-registry-example', tags={}>

In [8]:
mlflow.sklearn.autolog()

In [9]:
import pickle

In [10]:
training_dataset = mlflow.data.from_numpy(X_train.data, targets=y_train, name="green_tripdata_2025-01")
validation_dataset = mlflow.data.from_numpy(X_val.data, targets=y_val, name="green_tripdata_2025-02")

In [11]:
models = [
    
    {"model": GradientBoostingRegressor,
     "params": {"n_estimators": 100, "learning_rate": 0.3, "max_depth": 25, "random_state": 42},
     },
    
    {"model": ExtraTreesRegressor,
     "params": {"n_estimators": 100, "max_depth": 15, "random_state": 42},
     },
    
    {"model": LinearSVR,
     "params": {"C": 1.0, "epsilon": 0}, 
     },

]

In [12]:
with mlflow.start_run(run_name="Nested Runs"):
    for model in models:
        
        model_class = model["model"]
        model_name = model_class.__name__
        params = model["params"]
        
        with mlflow.start_run(run_name=model_name,nested=True):
            
            ml_model = model_class(**params)
           
            ml_model.fit(X_train, y_train)
    
            y_pred = ml_model.predict(X_val)
            
            rmse = root_mean_squared_error(y_val, y_pred)
            mlflow.log_metric("rmse", rmse)
            
            !mkdir models
            with open("models/preprocessor.b", "wb") as f_out:
                pickle.dump(dv, f_out)
                
            mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

A subdirectory or file models already exists.
A subdirectory or file models already exists.


In [13]:
from sklearn.ensemble import RandomForestRegressor


with mlflow.start_run(run_name="RandomForestRegressor"):
    ml_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        random_state=42
    )
    
    ml_model.fit(X_train, y_train)
    
    mlflow.sklearn.log_model(
        sk_model=model, 
        artifact_path="model",
        registered_model_name="nyc-taxi-model"
    )
    
    y_pred = ml_model.predict(X_val)
    
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)
    
    # !mkdir models
    with open("models/preprocessor.b", "wb") as f_out:
        pickle.dump(dv, f_out)
        
    mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")

2025/10/23 21:44:55 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/10/23 21:44:55 INFO mlflow.store.db.utils: Updating database tables
2025-10-23 21:44:55 INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
2025-10-23 21:44:55 INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'nyc-taxi-model'.
Created version '1' of model 'nyc-taxi-model'.


In [16]:
run_id = input("1b48e72195f94a198820f6492234e69b")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="nyc-taxi-model"
)

Registered model 'nyc-taxi-model' already exists. Creating a new version of this model...
Created version '2' of model 'nyc-taxi-model'.


In [17]:
from mlflow import MlflowClient

client = MlflowClient(tracking_uri="sqlite:///mlflow.db")

In [18]:
client.create_registered_model(name="nyc-taxi-model-client")

<RegisteredModel: aliases={}, creation_timestamp=1761277560365, deployment_job_id=None, deployment_job_state=None, description=None, last_updated_timestamp=1761277560365, latest_versions=[], name='nyc-taxi-model-client', tags={}>

In [19]:
# create "champion" alias for version 1 of model "example-model"
client.set_registered_model_alias(
    name="nyc-taxi-model", 
    alias="champion",
    version=1
)

# set the "challenger" alias to version 2
client.set_registered_model_alias(
    name="nyc-taxi-model", 
    alias="challenger",
    version=2
)

In [20]:
client.update_model_version(
    name="nyc-taxi-model",
    version=1,
    description="This model version is a scikit-learn XGBoost.",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1761277495742, current_stage='None', deployment_job_state=None, description='This model version is a scikit-learn XGBoost.', last_updated_timestamp=1761277586008, metrics=None, model_id=None, name='nyc-taxi-model', params=None, run_id='d9ee680c80be46a29e350b5b2cae1df1', run_link=None, source='models:/m-0a9693dc90cd436bb94d181bb1d65685', status='READY', status_message=None, tags={}, user_id=None, version=1>

In [24]:
from mlflow import MlflowClient

client = MlflowClient()
model_version_info = client.get_model_version(name=model_name, version=model_version)
print(model_version_info.source)

models:/m-0a9693dc90cd436bb94d181bb1d65685


In [25]:
import mlflow.pyfunc

model_name = "nyc-taxi-model"
model_version = 1

model_uri = f"models:/{model_name}/{model_version}"

model = mlflow.pyfunc.load_model(
    model_uri=model_uri,
)

model.predict(X_val)

MlflowException: Model does not have the "python_function" flavor