# Duration Prediction

In [1]:
import pickle
from pathlib import Path

import mlflow
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor, RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import root_mean_squared_error
from sklearn.svm import LinearSVR



In [2]:
# Opt in to pandas' copy_on_write mode for every DataFrame operation in this session.
pd.options.mode.copy_on_write = True

## Preprocessing

In [3]:
def preprocessing(df):
    """Derive the target and the model features from raw trip records.

    The input frame is never modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `lpep_pickup_datetime`, `lpep_dropoff_datetime`,
        `PULocationID` and `DOLocationID`.

    Returns
    -------
    pandas.DataFrame
        The rows whose duration lies in [1, 60] minutes (original index
        labels kept), with these columns added: `duration` (minutes, float),
        `trip_month`, `trip_dom`, `trip_hour` (categoricals) and `PU_DO`
        (string "<pickup>_<dropoff>").
    """
    # compute target variable: trip duration in minutes (vectorised, no per-row lambda)
    elapsed = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    trips = df.assign(duration=elapsed.dt.total_seconds() / 60)

    # keep only trips lasting between one minute and one hour (both bounds inclusive)
    trips = trips.loc[trips.duration.between(1, 60), :]

    pickup = trips.lpep_pickup_datetime

    # assign() builds a new frame, so neither the caller's frame nor the
    # filtered slice is written to in place
    return trips.assign(
        # trip date components
        trip_month=pd.Categorical(pickup.dt.month),
        trip_dom=pd.Categorical(pickup.dt.day),
        trip_hour=pd.Categorical(pickup.dt.hour),
        # concatenated pickup and dropoff locations
        PU_DO=trips['PULocationID'].astype(str) + '_' + trips['DOLocationID'].astype(str),
    )

In [4]:
def one_hot_encoding(df, numerical, categorical, dv=None):
    """Turn the selected columns into a feature matrix with a DictVectorizer.

    The caller's frame is left untouched: the string cast of the categorical
    columns is applied to a selected copy, not written back into `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns.
    numerical : list of str
        Columns passed through with their values unchanged.
    categorical : list of str
        Columns cast to `str` before vectorizing.
    dv : fitted vectorizer, optional
        When given, it is only used to transform (e.g. for validation data).
        When omitted, a new DictVectorizer is fitted on `df`.

    Returns
    -------
    tuple
        `(X, dv)`: the transformed features and the vectorizer that produced them.
    """
    features = df[categorical + numerical].astype({column: str for column in categorical})
    records = features.to_dict(orient='records')

    # explicit None check: whether to fit must not depend on the object's truthiness
    if dv is None:
        dv = DictVectorizer()
        dv.fit(records)

    return dv.transform(records), dv

In [5]:
def train_model(model, X_train, y_train):
    """Fit `model` on the training data and return the fitted model.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)`
        Fitted in place.
    X_train, y_train
        Training features and targets, passed straight through to `fit`.

    Returns
    -------
    The same `model` object, so a call can be written as
    `model = train_model(SomeEstimator(), X, y)`. Callers that ignore the
    return value keep working as before.
    """
    model.fit(X_train, y_train)
    return model

In [6]:
# read in the raw data: the 2021-01 file is used for training, the 2021-02 file for validation
df_train = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

In [7]:
# data cleaning
df_train = preprocessing(df_train)
df_val = preprocessing(df_val)

In [8]:
# variable selection
categorical = ['PU_DO']
numerical = ['trip_distance']

In [9]:
# categorical feature encoding
# The vectorizer is fitted on the training frame only; the validation frame is
# transformed with that same fitted instance (dv=dv), so both matrices share
# one feature layout. The vectorizer returned by the second call is the same
# object and is therefore discarded.
X_train, dv = one_hot_encoding(df_train, numerical, categorical)
X_val, _ = one_hot_encoding(df_val, numerical, categorical, dv=dv)

In [10]:
target = 'duration'
y_train = df_train[target]
y_val = df_val[target]

## Experiment Tracking

In [11]:
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc-taxi-experiment")

<Experiment: artifact_location='/Users/bastienwinant/Desktop/projects/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1748510026560, experiment_id='1', last_update_time=1748510026560, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [12]:
mlflow.autolog(disable=True)

In [13]:
# Persist the fitted vectorizer so every run can attach it as an artifact.
# The target directory is created first: on a fresh checkout `models/` may not
# exist, and opening the file would then fail with FileNotFoundError.
preprocessor_path = Path('models/preprocessor.b')
preprocessor_path.parent.mkdir(parents=True, exist_ok=True)

with preprocessor_path.open('wb') as f_out:
    pickle.dump(dv, f_out)

### Manual Logging
#### Lasso

In [14]:
alpha=.01

In [15]:
lasso = Lasso(alpha=alpha)
train_model(lasso, X_train, y_train)
y_pred = lasso.predict(X_val)

In [16]:
rmse = root_mean_squared_error(y_val, y_pred)

In [17]:
# Record the Lasso experiment as one MLflow run: tag, params, metric, artifacts.
with mlflow.start_run():
    mlflow.set_tag("developer", "Armand Winant")

    # training params
    mlflow.log_param("training-data", 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
    mlflow.log_param("testing-data", 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
    mlflow.log_param("alpha", alpha)

    # validation error
    mlflow.log_metric("rmse", rmse)

    # model artifacts: the pickled preprocessor plus the fitted model
    mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")
    mlflow.sklearn.log_model(lasso, artifact_path="models_mlflow")



#### Linear Regression

In [18]:
lr = LinearRegression()
train_model(lr, X_train, y_train)
y_pred = lr.predict(X_val)

In [19]:
rmse = root_mean_squared_error(y_val, y_pred)

In [20]:
# Record the Linear Regression experiment as one MLflow run.
with mlflow.start_run():
    mlflow.set_tag("developer", "Armand Winant")

    # training params
    # No "alpha" here: LinearRegression() is built without one, so logging the
    # notebook-level `alpha` would attach the Lasso setting to this run.
    mlflow.log_param("training-data", 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
    mlflow.log_param("testing-data", 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

    # validation error
    mlflow.log_metric("rmse", rmse)

    # model artifacts
    mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")
    mlflow.sklearn.log_model(lr, artifact_path="models_mlflow")



#### Ridge

In [21]:
alpha = .01

In [22]:
rr = Ridge(alpha=alpha, random_state=42)
train_model(rr, X_train, y_train)
y_pred = rr.predict(X_val)

In [23]:
rmse = root_mean_squared_error(y_val, y_pred)

In [24]:
# Record the Ridge experiment as one MLflow run: tag, params, metric, artifacts.
with mlflow.start_run():
    mlflow.set_tag("developer", "Armand Winant")

    # training params
    mlflow.log_param("training-data", 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
    mlflow.log_param("testing-data", 'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
    mlflow.log_param("alpha", alpha)

    # validation error
    mlflow.log_metric("rmse", rmse)

    # model artifacts: the pickled preprocessor plus the fitted model
    mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")
    mlflow.sklearn.log_model(rr, artifact_path="models_mlflow")



#### XGBoost

In [25]:
train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [26]:
def objective(params):
    """Hyperopt objective: train one booster and report its validation RMSE.

    Reads the notebook-level `train` / `valid` DMatrix objects and `y_val`.
    Every call is recorded as its own MLflow run holding the sampled
    parameters, the RMSE, the booster and the pickled preprocessor.

    Parameters
    ----------
    params : dict
        XGBoost training parameters sampled from the search space.

    Returns
    -------
    dict
        `{'loss': <validation rmse>, 'status': STATUS_OK}`, as consumed by `fmin`.
    """
    watchlist = [(valid, "validation")]
    model = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=200,
    )

    validation_rmse = root_mean_squared_error(y_val, model.predict(valid))

    # one MLflow run per trial; training has already finished at this point
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        mlflow.log_metric("rmse", validation_rmse)

        mlflow.xgboost.log_model(model, artifact_path="models_mlflow")
        mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")

    outcome = {'loss': validation_rmse, 'status': STATUS_OK}
    return outcome

In [27]:
# Hyperparameter search space handed to hyperopt's `fmin`.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are exponents, not the parameter values themselves.
search_space = {
    # integer tree depth between 4 and 100 (quniform yields floats, scope.int casts them)
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0)  ~  0.05 .. 1
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1)  ~  0.0067 .. 0.37
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1)  ~  0.0025 .. 0.37
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3)  ~  0.37 .. 20
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched) settings
    'objective': 'reg:squarederror',
    'seed': 42
}

In [None]:
# Run the TPE search over `search_space` for 50 evaluations. Every evaluation
# calls `objective`, which trains a booster and logs its own MLflow run.
# NOTE(review): no `rstate` is passed, so the sampled trials presumably differ
# from one execution to the next -- confirm whether reproducibility is needed.
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:11.13091                          
[200]	validation-rmse:6.40452                         
[400]	validation-rmse:6.37090                         
[600]	validation-rmse:6.36037                         
[800]	validation-rmse:6.35352                         
[944]	validation-rmse:6.35188                         
  0%|          | 0/50 [01:09<?, ?trial/s, best loss=?]





[0]	validation-rmse:7.78230                                                     
  2%|▏         | 1/50 [01:23<1:07:36, 82.79s/trial, best loss: 6.35179552921122]

### Autologging

In [29]:
mlflow.xgboost.autolog()
mlflow.sklearn.autolog()

In [30]:
params = {
    'learning_rate': 0.19030171678228142,
    'max_depth': 29,
    'min_child_weight': 1.000763936993607,
    'objective': 'reg:squarederror',
    'reg_alpha': 0.20924104314941339,
    'reg_lambda': 0.0025628242268120804,
    'seed': 42
}

In [31]:
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
    verbose_eval=20
)

y_pred = booster.predict(valid)
rmse = root_mean_squared_error(y_val, y_pred)

2025/05/31 13:37:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'b96cb058fd0541cba82ab4597075cf92', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:10.71397
[20]	validation-rmse:6.53625
[40]	validation-rmse:6.49724
[60]	validation-rmse:6.47088
[80]	validation-rmse:6.44681
[100]	validation-rmse:6.43238
[120]	validation-rmse:6.41863
[140]	validation-rmse:6.40743
[160]	validation-rmse:6.39713
[180]	validation-rmse:6.38836
[200]	validation-rmse:6.37959
[220]	validation-rmse:6.37267
[240]	validation-rmse:6.36569
[260]	validation-rmse:6.36134
[280]	validation-rmse:6.35735
[300]	validation-rmse:6.35217
[320]	validation-rmse:6.34867
[340]	validation-rmse:6.34376
[360]	validation-rmse:6.33996
[380]	validation-rmse:6.33685
[400]	validation-rmse:6.33479
[420]	validation-rmse:6.33094
[440]	validation-rmse:6.32919
[460]	validation-rmse:6.32751
[480]	validation-rmse:6.32493
[500]	validation-rmse:6.32294
[520]	validation-rmse:6.31874
[540]	validation-rmse:6.31647
[560]	validation-rmse:6.31506
[580]	validation-rmse:6.31557
[600]	validation-rmse:6.31512
[620]	validation-rmse:6.31329
[640]	validation-rmse:6.31306
[660]	validatio



In [None]:
# Attach the validation RMSE to the run that autologging created for the
# booster trained above. A bare start_run() would open a new, otherwise empty
# run, leaving the metric separated from the model and its parameters.
autolog_run = mlflow.last_active_run()
with mlflow.start_run(run_id=autolog_run.info.run_id):
    mlflow.log_metric("valid_rmse", rmse)

#### Linear SVR

In [32]:
# Fixed seed, matching the Ridge and XGBoost cells, so the solver and the
# resulting RMSE are reproducible on a re-run.
svr = LinearSVR(max_iter=100000000, random_state=42)
train_model(svr, X_train, y_train)
y_pred = svr.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)

2025/05/31 13:39:50 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '7bd20f55cb7643f5b02311babca40d4a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [None]:
# Attach the validation RMSE to the run that autologging created for the
# LinearSVR fit above, instead of opening a new run that holds only this metric.
autolog_run = mlflow.last_active_run()
with mlflow.start_run(run_id=autolog_run.info.run_id):
    mlflow.log_metric("valid_rmse", rmse)

#### Random Forest

In [33]:
# Fixed seed, matching the Ridge and XGBoost cells: the forest is randomised,
# so without it every re-run would produce a different model and RMSE.
rf = RandomForestRegressor(random_state=42)
train_model(rf, X_train, y_train)
y_pred = rf.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)

2025/05/31 13:57:34 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '953252a4048642c2b7f7866aff800b1a', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [None]:
# Attach the validation RMSE to the run that autologging created for the
# random forest fit above, instead of opening a new run that holds only this metric.
autolog_run = mlflow.last_active_run()
with mlflow.start_run(run_id=autolog_run.info.run_id):
    mlflow.log_metric("valid_rmse", rmse)

#### Extra Trees

In [34]:
# Fixed seed, matching the Ridge and XGBoost cells: the trees are randomised,
# so without it every re-run would produce a different model and RMSE.
extra = ExtraTreesRegressor(random_state=42)
train_model(extra, X_train, y_train)
y_pred = extra.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)

2025/05/31 14:11:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'dbd81ce15831422990c6bdc4d6f99210', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


KeyboardInterrupt: 

In [None]:
# Attach the validation RMSE to the run that autologging created for the
# extra-trees fit above, instead of opening a new run that holds only this metric.
autolog_run = mlflow.last_active_run()
with mlflow.start_run(run_id=autolog_run.info.run_id):
    mlflow.log_metric("valid_rmse", rmse)

#### Gradient Boosting

In [None]:
# Fixed seed, matching the Ridge and XGBoost cells, so the fitted ensemble and
# its RMSE are reproducible on a re-run.
gb = GradientBoostingRegressor(random_state=42)
train_model(gb, X_train, y_train)
y_pred = gb.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)

In [None]:
# Attach the validation RMSE to the run that autologging created for the
# gradient-boosting fit above, instead of opening a new run that holds only this metric.
autolog_run = mlflow.last_active_run()
with mlflow.start_run(run_id=autolog_run.info.run_id):
    mlflow.log_metric("valid_rmse", rmse)

### Model Management

In [32]:
mlflow.xgboost.autolog(disable=True)

In [33]:
# Log the tuned booster together with its own parameters and validation error.
with mlflow.start_run():
    mlflow.log_params(params)

    # Score `booster` here rather than reusing the notebook-level `rmse`: that
    # name is overwritten by every model cell in between, so on a top-to-bottom
    # run it holds the error of the last sklearn model, not of this booster.
    booster_rmse = root_mean_squared_error(y_val, booster.predict(valid))
    mlflow.log_metric('rmse', booster_rmse)

    # model artifacts: the pickled preprocessor plus the booster
    mlflow.log_artifact(local_path="models/preprocessor.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")



### Model Retrieval
#### Python

In [34]:
logged_model = 'runs:/65425f7ef71644588856d6163ad035a0/models_mlflow'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [35]:
loaded_model

mlflow.pyfunc.loaded_model:
  artifact_path: models_mlflow
  flavor: mlflow.xgboost
  run_id: 65425f7ef71644588856d6163ad035a0

#### XGBoost

In [36]:
xgboost_model = mlflow.xgboost.load_model(logged_model)

In [37]:
xgboost_model

<xgboost.core.Booster at 0x13a08b370>

In [38]:
xgboost_model.predict(valid)

array([14.496855 ,  7.109053 , 15.07815  , ..., 13.511075 ,  6.2299643,
        8.019186 ], dtype=float32)

### Model Registry