# Duration Prediction

In [1]:
import pickle

import mlflow
import numpy as np
import pandas as pd
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import root_mean_squared_error



In [2]:
# Opt in to pandas copy-on-write behaviour for the whole session.
pd.set_option("mode.copy_on_write", True)

In [3]:
# Track runs in a local SQLite database, grouped under one named experiment.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

# Global autologging is switched off at this point.
mlflow.autolog(disable=True)

2025/05/26 10:14:50 INFO mlflow.tracking.fluent: Experiment with name 'nyc-taxi-experiment' does not exist. Creating a new experiment.


In [4]:
def preprocessing(df):
    """Clean raw trip records and derive the modelling columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trips. The columns read here are ``lpep_pickup_datetime``,
        ``lpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Returns
    -------
    pandas.DataFrame
        The rows whose duration lies between 1 and 60 minutes (inclusive),
        with these columns added:

        * ``duration``   - trip duration in minutes (float)
        * ``trip_month`` / ``trip_dom`` / ``trip_hour`` - categorical pickup
          month, day of month and hour
        * ``PU_DO``      - ``"<PULocationID>_<DOLocationID>"`` string
    """
    # compute target variable: trip duration in minutes (vectorised, no per-row apply)
    duration = (df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']).dt.total_seconds() / 60

    # keep only trips lasting between one minute and one hour (both inclusive)
    keep = duration.between(1, 60)

    # Explicit copy: the column assignments below must neither write into the
    # caller's frame nor depend on a global pandas copy-on-write setting.
    out = df.loc[keep].copy()
    # Positional assignment; `out` and the filtered durations share the same rows.
    out['duration'] = duration[keep].to_numpy()

    # extract trip date components
    pickup = out['lpep_pickup_datetime'].dt
    out['trip_month'] = pd.Categorical(pickup.month)
    out['trip_dom'] = pd.Categorical(pickup.day)
    out['trip_hour'] = pd.Categorical(pickup.hour)

    # concatenate pickup and dropoff locations
    out['PU_DO'] = out['PULocationID'].astype(str) + '_' + out['DOLocationID'].astype(str)

    return out

In [5]:
def one_hot_encoding(df, numerical, categorical, dv=None):
    """Encode the selected columns into a feature matrix via a ``DictVectorizer``.

    The categorical columns are cast to ``str`` before vectorizing; the
    numerical columns are passed through unchanged. The cast is applied to a
    separate frame, so ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``categorical`` and ``numerical``.
    numerical : list of str
        Names of the numerical feature columns.
    categorical : list of str
        Names of the categorical feature columns.
    dv : DictVectorizer, optional
        An already fitted vectorizer to reuse (e.g. for validation data).
        When ``None``, a new one is created and fitted on ``df``.

    Returns
    -------
    tuple
        ``(X, dv)`` - the transformed feature matrix and the vectorizer used.
    """
    # Column selection yields a new frame, so the string cast stays local to it.
    features = df[categorical + numerical].astype({col: str for col in categorical})
    records = features.to_dict(orient='records')

    # Test for None explicitly rather than relying on the object's truthiness.
    if dv is None:
        dv = DictVectorizer()
        dv.fit(records)

    X = dv.transform(records)

    return X, dv

In [6]:
def train_model(model, X_train, y_train):
    """Fit ``model`` on the given training data.

    Calls ``model.fit(X_train, y_train)`` and returns ``None``.
    """
    model.fit(X_train, y_train)
    return None

In [7]:
# read in the raw data: the 2021-01 file is used for training, 2021-02 for validation
df_train = pd.read_parquet(
    path='https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet'
)
df_val = pd.read_parquet(
    path='https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet'
)

In [8]:
# data cleaning: run both frames through the same preprocessing step
df_train, df_val = preprocessing(df_train), preprocessing(df_val)

In [9]:
# variable selection: feature columns, grouped by how they are encoded
categorical, numerical = ['PU_DO'], ['trip_distance']

In [10]:
# categorical feature encoding
# The vectorizer is fitted on the training frame only and then reused for validation.
X_train, dv = one_hot_encoding(
    df=df_train, numerical=numerical, categorical=categorical
)
X_val, _ = one_hot_encoding(
    df=df_val, numerical=numerical, categorical=categorical, dv=dv
)

In [11]:
# regression target: the duration column (in minutes) added by preprocessing
target = 'duration'
y_train, y_val = df_train[target], df_val[target]

### Linear Regression

In [12]:
# Plain linear regression on the encoded features.
lr = LinearRegression()
train_model(model=lr, X_train=X_train, y_train=y_train)

In [13]:
# Predict durations for the validation set.
y_pred = lr.predict(X=X_val)

In [14]:
# Validation RMSE, shown as the cell output.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

7.758715200888857

### Lasso

In [15]:
# Fit a Lasso model and score it on the validation set.
alpha = .1
lasso = Lasso(alpha=alpha)
train_model(lasso, X_train, y_train)
y_pred = lasso.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)

# Record the experiment as one MLflow run.
with mlflow.start_run():
    mlflow.set_tag("developer", "Armand Winant")
    # Use the same "model" tag key as the XGBoost runs so that runs can be
    # filtered by model type in a consistent way.
    mlflow.set_tag("model", "lasso")

    # datasets used for training and validation, plus the hyperparameter,
    # logged in a single call
    mlflow.log_params({
        "train-data-path": "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet",
        "validation-data-path": "https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet",
        "alpha": alpha,
    })

    # validation performance
    mlflow.log_metric("rmse", rmse)

### XGBoost
#### Manual logging

In [16]:
# Keep XGBoost autologging off for this section; the runs are logged by hand in `objective`.
mlflow.xgboost.autolog(disable=True)

In [17]:
# Wrap the encoded features and their targets in XGBoost DMatrix objects.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [18]:
def objective(params):
    """Train one XGBoost model for a candidate parameter set and report its loss.

    Training uses the module-level ``train`` DMatrix, with ``valid`` as the
    evaluation set for early stopping. The validation RMSE is computed against
    the module-level ``y_val``, and the parameters and score are logged as one
    MLflow run tagged ``model=xgboost``.

    Parameters
    ----------
    params : dict
        Parameters forwarded to ``xgb.train`` and logged to MLflow.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}``.
    """
    watchlist = [(valid, "validation")]
    model = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=10,  # print the evaluation metric every 10 rounds
    )

    predictions = model.predict(valid)
    val_rmse = root_mean_squared_error(y_val, predictions)

    # One MLflow run per evaluated candidate.
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        mlflow.log_metric("rmse", val_rmse)

    return dict(loss=val_rmse, status=STATUS_OK)


In [19]:
# Hyperparameter search space.
# hp.loguniform(label, low, high) samples exp(uniform(low, high)), so the
# bounds below are given in log space.
search_space = dict(
    # integer tree depth between 4 and 100, in steps of 1
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    # exp(-3) .. exp(0), i.e. roughly 0.05 .. 1
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    # exp(-5) .. exp(-1), i.e. roughly 0.007 .. 0.37
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    # exp(-6) .. exp(-1), i.e. roughly 0.0025 .. 0.37
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    # exp(-1) .. exp(3), i.e. roughly 0.37 .. 20
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    # fixed (not searched) settings
    objective='reg:squarederror',
    seed=42,
)

In [20]:
# Keep the Trials object in a variable so the per-trial history can still be
# inspected once the search has finished.
trials = Trials()

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    # Seed the sampler so that the sequence of sampled candidates is repeatable
    # when the notebook is re-run.
    rstate=np.random.default_rng(42),
)

[0]	validation-rmse:11.64008                          
[10]	validation-rmse:8.27056                          
[20]	validation-rmse:7.21100                          
[30]	validation-rmse:6.89604                          
[40]	validation-rmse:6.78906                          
[50]	validation-rmse:6.74538                          
[60]	validation-rmse:6.72606                          
[70]	validation-rmse:6.71558                          
[80]	validation-rmse:6.71046                          
[90]	validation-rmse:6.70708                          
[100]	validation-rmse:6.70447                         
[110]	validation-rmse:6.70200                         
[120]	validation-rmse:6.69985                         
[130]	validation-rmse:6.69765                         
[140]	validation-rmse:6.69596                         
[150]	validation-rmse:6.69444                         
[160]	validation-rmse:6.69262                         
[170]	validation-rmse:6.69080                         
[180]	vali

#### Automatic logging

In [23]:
# Turn XGBoost autologging on for this section (it was explicitly disabled earlier in the notebook).
mlflow.xgboost.autolog()

In [24]:
# Hard-coded hyperparameters for this run.
# NOTE(review): presumably the best trial from the hyperopt search — confirm against the logged runs.
params = dict(
    learning_rate=0.12369251946620187,
    max_depth=44,
    min_child_weight=1.2385807487664957,
    objective='reg:squarederror',
    reg_alpha=0.031969256340342596,
    reg_lambda=0.21710337514239528,
    seed=42,
)

# No explicit mlflow.start_run() here; the captured output shows an autologging run being created.
booster = xgb.train(
    params=params,
    dtrain=train,
    num_boost_round=1000,
    evals=[(valid, "validation")],
    early_stopping_rounds=50,
)

# Score the trained booster on the validation set.
y_pred = booster.predict(valid)
rmse = root_mean_squared_error(y_val, y_pred)

2025/05/26 11:14:41 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '027bedb578624b79a660791e19948f86', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


[0]	validation-rmse:11.21802
[1]	validation-rmse:10.38334
[2]	validation-rmse:9.68612
[3]	validation-rmse:9.10744
[4]	validation-rmse:8.63030
[5]	validation-rmse:8.23862
[6]	validation-rmse:7.92146
[7]	validation-rmse:7.66066
[8]	validation-rmse:7.45002
[9]	validation-rmse:7.28043
[10]	validation-rmse:7.13873
[11]	validation-rmse:7.02567
[12]	validation-rmse:6.93412
[13]	validation-rmse:6.85814
[14]	validation-rmse:6.79754
[15]	validation-rmse:6.74767
[16]	validation-rmse:6.70587
[17]	validation-rmse:6.67101
[18]	validation-rmse:6.64254
[19]	validation-rmse:6.61742
[20]	validation-rmse:6.59624
[21]	validation-rmse:6.57802
[22]	validation-rmse:6.56199
[23]	validation-rmse:6.54890
[24]	validation-rmse:6.53701
[25]	validation-rmse:6.52745
[26]	validation-rmse:6.51916
[27]	validation-rmse:6.51139
[28]	validation-rmse:6.50445
[29]	validation-rmse:6.49785
[30]	validation-rmse:6.49321
[31]	validation-rmse:6.48776
[32]	validation-rmse:6.48406
[33]	validation-rmse:6.48048
[34]	validation-rmse:6

