In [1]:
# Import packages for preparing and training data
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error

In [2]:
# Import MLFlow for experiment tracking
import mlflow

# Point MLflow at the local SQLite backend, then select the experiment.
# set_experiment stays the last expression so the cell echoes the active experiment.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc-taxi-experiment")

<Experiment: artifact_location='/home/eduga_1514/mlops-zoomcamp-root/eduga-mlops-zoomcamp/02-experiment-tracking/training/experiment-tracking/mlruns/1', creation_time=1716587323151, experiment_id='1', last_update_time=1716587323151, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

In [3]:
# Import xgb and hyperopt

import xgboost as xgb

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [4]:
# Load a trip parquet file, compute the trip duration in minutes, drop outliers and cast the categorical columns to str
def prepare_taxi_data(file, target, categorical):
    """Load a taxi-trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off time minus pick-up time, expressed
    in minutes), removes rows whose ``target`` value is below 1 or above 60,
    and casts the ``categorical`` columns to strings.

    Args:
        file: Location of the parquet file, passed straight to ``pd.read_parquet``.
        target: Name of the column used for the outlier filter.
        categorical: List of column names to convert to ``str``.

    Returns:
        A new DataFrame holding the retained rows with their original index labels.
    """
    df = pd.read_parquet(file)

    # Subtracting the two timestamp columns already yields timedeltas;
    # dividing by a 60-second Timedelta converts them to float minutes.
    df["duration"] = (
        df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    ) / pd.Timedelta('60s')

    # Filter with a boolean mask rather than feeding positional indices from
    # np.where into df.drop (which expects index *labels*): the two only agree
    # when the frame has a default RangeIndex. Rows with a missing target compare
    # False on both sides and are therefore kept, exactly as before.
    is_outlier = (df[target] < 1) | (df[target] > 60)
    df = df.loc[~is_outlier].copy()

    df[categorical] = df[categorical].astype(str)

    return df

In [5]:
# Training split: yellow-taxi trips from the 2023-01 file
training_df = prepare_taxi_data(
    file='../../../data/yellow_tripdata_2023-01.parquet',
    target='duration',
    categorical=['PULocationID', 'DOLocationID'],
)

# Validation split: yellow-taxi trips from the 2023-02 file
validate_df = prepare_taxi_data(
    file='../../../data/yellow_tripdata_2023-02.parquet',
    target='duration',
    categorical=['PULocationID', 'DOLocationID'],
)

In [6]:
# Encode the pickup/dropoff location IDs with a DictVectorizer and collect the targets
feature_columns = ['PULocationID', 'DOLocationID']
dv = DictVectorizer()

# Training split: the vectorizer is fitted here
train_dicts = training_df[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = training_df['duration'].values

# Validation split: only transformed, reusing the mapping fitted on the training rows
val_dicts = validate_df[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = validate_df['duration'].values

In [7]:
# Wrap the feature matrices and their targets in the DMatrix objects passed to xgb.train
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [None]:
def objective(params):
    """Train one XGBoost candidate and report its validation RMSE.

    Each call opens its own MLflow run, tags it with ``model=xgboost``,
    logs ``params`` and records the resulting ``rmse`` metric.

    Args:
        params: Hyperparameter dict forwarded to ``xgb.train``.

    Returns:
        Dict with ``loss`` (the validation RMSE) and ``status`` (``STATUS_OK``).
    """
    with mlflow.start_run():
        mlflow.set_tag('model', 'xgboost')
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=250,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # Take the square root of the MSE explicitly: the `squared=False` flag is
        # deprecated since scikit-learn 1.4 and scheduled for removal in 1.6.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Hyperparameter ranges explored by hyperopt; 'objective' and 'seed' are fixed values
search_space = dict(
    max_depth=scope.int(hp.quniform('max_depth', 4, 100, 1)),
    learning_rate=hp.loguniform('learning_rate', -3, 0),
    reg_alpha=hp.loguniform('reg_alpha', -5, -1),
    reg_lambda=hp.loguniform('reg_lambda', -6, -1),
    min_child_weight=hp.loguniform('min_child_weight', -1, 3),
    objective='reg:squarederror',
    seed=42,
)

# Run 50 TPE-guided evaluations of `objective`, keeping the history in a fresh Trials object
search_trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=search_trials,
)

In [None]:
# Retrain with the hyperparameters of the run selected from the MLflow results
# (judged on both performance and optimization)
params = dict(
    learning_rate=0.7720684014557972,
    max_depth=58,
    min_child_weight=0.6989693091372148,
    objective='reg:squarederror',
    reg_alpha=0.015100098245236579,
    reg_lambda=0.03214589450610109,
    seed=42,
)

# Enable MLflow's XGBoost autologging before the training run starts
mlflow.xgboost.autolog()

with mlflow.start_run():
    mlflow.set_tag('model', 'xgboost')
    booster = xgb.train(
        params=params,
        dtrain=train,
        num_boost_round=250,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50,
    )