In [1]:
# Print the version of the `python` executable found on the shell PATH, so the
# environment used for this notebook is recorded alongside its outputs.
!python -V

Python 3.12.11


In [2]:
import pandas as pd
import pickle
import mlflow
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import root_mean_squared_error

In [None]:
# MLflow configuration: runs are tracked in a SQLite database file and grouped
# under one named experiment (MLflow creates the experiment if it is missing).
MLFLOW_TRACKING_URI = 'sqlite:///mlflow.db'
EXPERIMENT_NAME = 'my-nyc-taxi-experiment'

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

2025/07/17 10:34:58 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/17 10:34:58 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2025/07/17 10:34:58 INFO mlflow.tracking.fluent: Experiment with name 'my-nyc-taxi-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='/workspaces/mlops-zoomcamp/MLFlow/mlruns/9', creation_time=1752748498964, experiment_id='9', last_update_time=1752748498964, lifecycle_stage='active', name='my-nyc-taxi-experiment', tags={}>

[stderr noise removed: several "Bad pipe message" lines that echoed raw browser request headers, including a Jupyter login cookie and an _xsrf token. Redacted because cell outputs must not contain session secrets.]

In [7]:
def read_dataframe(filename):
    """Load a trip-record parquet file and derive the modelling columns.

    Parameters
    ----------
    filename : str or path-like
        Anything accepted by ``pd.read_parquet`` (local path or URL). The file
        must contain ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows whose trip duration lies in [1, 60] minutes, with:

        * ``duration`` - drop-off minus pick-up time, in minutes;
        * ``PULocationID`` / ``DOLocationID`` cast to ``str``;
        * ``PU_DO`` - the two location ids joined as ``"<PU>_<DO>"``.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # Keep trips between 1 and 60 minutes inclusive. The explicit .copy()
    # makes the result an independent frame, so the column assignments below
    # neither raise SettingWithCopyWarning nor risk writing to a temporary.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location ids are cast to strings so they can be concatenated below.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)

    # Combined pick-up/drop-off key, e.g. "74_75".
    df['PU_DO'] = df['PULocationID'] + '_' + df['DOLocationID']

    return df

In [8]:
# Source files: the 2023-01 file is used for training and the 2023-02 file
# for validation. Both are downloaded and pre-processed by read_dataframe.
TRAIN_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-01.parquet'
VAL_DATA_URL = 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2023-02.parquet'

df_train = read_dataframe(TRAIN_DATA_URL)
df_val = read_dataframe(VAL_DATA_URL)

In [9]:
# Model inputs: one categorical feature (the combined pick-up/drop-off key)
# and one numerical feature.
categorical = ['PU_DO']
numerical = ['trip_distance']
feature_columns = categorical + numerical

# One {column: value} dict per row, the input format DictVectorizer expects.
train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# The vectorizer is fitted on the training rows only; the validation rows are
# transformed with that same fitted mapping.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [10]:
# Regression target: the trip duration column, extracted as plain arrays for
# the training and validation frames respectively.
target = 'duration'
y_train, y_val = (frame[target].values for frame in (df_train, df_val))

In [13]:
import xgboost as xgb

In [15]:
from pathlib import Path

In [16]:
# Directory (relative to the working directory) that holds locally saved
# artifacts; creating it is a no-op when it already exists.
MODELS_DIRNAME = 'models'
models_folder = Path(MODELS_DIRNAME)
models_folder.mkdir(parents=False, exist_ok=True)

In [None]:
import mlflow.xgboost


# Train the XGBoost regressor inside one MLflow run so that the parameters,
# the validation RMSE, the fitted DictVectorizer and the model itself are all
# recorded together.
with mlflow.start_run():
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        # This key was misspelled 'pbjective': XGBoost ignored it (warning
        # "Parameters: { "pbjective" } are not used") and MLflow logged a
        # parameter that does not exist. 'reg:squarederror' is the current
        # name of the deprecated 'reg:linear' alias and is also XGBoost's
        # default objective, so the trained model is unchanged by this fix.
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }
    mlflow.log_params(best_params)

    # NOTE(review): early_stopping_rounds (50) is larger than num_boost_round
    # (30), so early stopping can never trigger and all 30 rounds always run.
    # Raise num_boost_round or lower the patience if early stopping is wanted.
    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=30,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    # Score on the validation set and record the metric on the run.
    y_pred = booster.predict(valid)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric('rmse', rmse)

    # Persist the fitted DictVectorizer next to the model: the same feature
    # encoding is needed to build inputs for the booster at prediction time.
    preprocessor_path = "models/preprocessor.b"
    with open(preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact(preprocessor_path, artifact_path='preprocessor')

    mlflow.xgboost.log_model(booster, name='models_mlflow')


Parameters: { "pbjective" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation-rmse:8.73788
[1]	validation-rmse:8.22960
[2]	validation-rmse:7.78914
[3]	validation-rmse:7.40823
[4]	validation-rmse:7.08398
[5]	validation-rmse:6.80130
[6]	validation-rmse:6.56559
[7]	validation-rmse:6.35942
[8]	validation-rmse:6.18716
[9]	validation-rmse:6.04364
[10]	validation-rmse:5.91994
[11]	validation-rmse:5.81441
[12]	validation-rmse:5.72701
[13]	validation-rmse:5.65236
[14]	validation-rmse:5.58821
[15]	validation-rmse:5.53629
[16]	validation-rmse:5.49451
[17]	validation-rmse:5.45443
[18]	validation-rmse:5.42147
[19]	validation-rmse:5.39347
[20]	validation-rmse:5.37267
[21]	validation-rmse:5.35128
[22]	validation-rmse:5.33257
[23]	validation-rmse:5.31780
[24]	validation-rmse:5.30449
[25]	validation-rmse:5.29542
[26]	validation-rmse:5.28365
[27]	validation-rmse:5.27505
[28]	validation-rmse:5.26725
[29]	validation-rmse:5.26004


  xgb_model.save_model(model_data_path)


In [21]:
import numpy as np  
# Sanity check: densify the first sparse training row and display it as a
# one-row 2-D array.
first_row_dense = X_train[0].toarray()[0]
np.array([first_row_dense])


array([[0.  , 0.  , 0.  , ..., 0.  , 0.  , 2.58]], shape=(1, 5702))