### Model building continued.

### Model 7: LightGBM with hyperparameter tuning

In this notebook we further improve our model with hyperparameter tuning.

In [5]:
import warnings
warnings.filterwarnings("ignore") # ignore warnings

Loading the data

In [6]:
import sys 
sys.path.append(r"C:\Users\User\capstone_project")

In [7]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')
df

Unnamed: 0,rides_previous_672_hour,rides_previous_671_hour,rides_previous_670_hour,rides_previous_669_hour,rides_previous_668_hour,rides_previous_667_hour,rides_previous_666_hour,rides_previous_665_hour,rides_previous_664_hour,rides_previous_663_hour,...,rides_previous_7_hour,rides_previous_6_hour,rides_previous_5_hour,rides_previous_4_hour,rides_previous_3_hour,rides_previous_2_hour,rides_previous_1_hour,pickup_hour,pickup_location_id,target_rides_next_hour
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,0.0,0.0,...,2.0,0.0,1.0,0.0,0.0,0.0,0.0,2022-01-29,1,0.0
1,0.0,0.0,0.0,0.0,0.0,4.0,1.0,2.0,1.0,2.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2022-01-30,1,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,1.0,2.0,0.0,0.0,0.0,0.0,2022-01-31,1,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,2.0,1.0,0.0,1.0,1.0,0.0,0.0,2022-02-01,1,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2022-02-02,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89300,3.0,0.0,2.0,3.0,2.0,3.0,13.0,8.0,9.0,9.0,...,6.0,5.0,3.0,1.0,6.0,1.0,3.0,2022-12-27,265,3.0
89301,6.0,4.0,0.0,0.0,2.0,0.0,14.0,7.0,8.0,4.0,...,4.0,2.0,1.0,2.0,2.0,2.0,8.0,2022-12-28,265,1.0
89302,7.0,2.0,3.0,4.0,7.0,4.0,10.0,9.0,7.0,11.0,...,2.0,3.0,5.0,1.0,1.0,0.0,8.0,2022-12-29,265,3.0
89303,6.0,5.0,4.0,3.0,0.0,3.0,11.0,12.0,9.0,10.0,...,3.0,3.0,1.0,2.0,0.0,1.0,2.0,2022-12-30,265,7.0


Splitting data into train and test sets.

In [8]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022, 6, 1, 0, 0, 0),
    target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32595, 674)
y_train.shape=(32595,)
X_test.shape=(56710, 674)
y_test.shape=(56710,)


Defining a function which maps a set of hyperparameter to a final validation error.

In [7]:
%pip install optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
     -------------------------------------- 413.4/413.4 kB 2.9 MB/s eta 0:00:00
Collecting alembic>=1.5.0
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
     -------------------------------------- 233.4/233.4 kB 7.2 MB/s eta 0:00:00
Collecting colorlog
  Downloading colorlog-6.8.0-py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
     ---------------------------------------- 78.6/78.6 kB 4.3 MB/s eta 0:00:00
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.13.1 colorlog-6.8.0 optuna-3.5.0
Note: you may need to restart the kernel to use updated packages.


In [9]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = TimeSeriesSplit(n_splits=2)
    scores = []
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

Create a study as per optuna.

In [10]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5) #run 5 iterations

[I 2024-01-07 05:29:25,855] A new study created in memory with name: no-name-40e47b93-acb6-44c4-98e2-8f417eaf6864
[I 2024-01-07 05:29:33,988] Trial 0 finished with value: 3.249227115700037 and parameters: {'num_leaves': 175, 'feature_fraction': 0.5552801252247332, 'bagging_fraction': 0.4181569445469541, 'min_child_samples': 50}. Best is trial 0 with value: 3.249227115700037.
[I 2024-01-07 05:29:40,705] Trial 1 finished with value: 3.388942159391527 and parameters: {'num_leaves': 129, 'feature_fraction': 0.550020146587656, 'bagging_fraction': 0.6427467028672802, 'min_child_samples': 29}. Best is trial 0 with value: 3.249227115700037.
[I 2024-01-07 05:29:46,172] Trial 2 finished with value: 3.334892987210454 and parameters: {'num_leaves': 155, 'feature_fraction': 0.33729195037050375, 'bagging_fraction': 0.5534262999884343, 'min_child_samples': 13}. Best is trial 0 with value: 3.249227115700037.
[I 2024-01-07 05:29:50,251] Trial 3 finished with value: 3.420145401588784 and parameters: {'n

Let us extract the best hyper-parameters optuna found.

In [11]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 175, 'feature_fraction': 0.5552801252247332, 'bagging_fraction': 0.4181569445469541, 'min_child_samples': 50}


Now let us re train the model on the entire training data.

In [12]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.062801 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154527
[LightGBM] [Info] Number of data points in the train set: 32595, number of used features: 675
[LightGBM] [Info] Start training from score 11.571069


Pipeline(steps=[('functiontransformer',
                 FunctionTransformer(func=<function average_rides_last_4_weeks at 0x00000135229B9EE0>)),
                ('temporalfeaturesengineer', TemporalFeaturesEngineer()),
                ('lgbmregressor',
                 LGBMRegressor(bagging_fraction=0.4181569445469541,
                               feature_fraction=0.5552801252247332,
                               min_child_samples=50, num_leaves=175))])

### Evaluating Model 7

In [13]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5425


**Observations and insights**

Tuning hyper-parameter further improved the model based on MAE = 2.54.

This is a pretty good error, let us print to see how the predictions look like.

Printing predictions.

In [14]:
from src.plot import plot_one_sample

plot_one_sample(
    example_id=2979,
    features=X_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)

In [15]:
plot_one_sample(
    example_id=3979,
    features=X_test,
    targets=y_test,
    predictions=pd.Series(predictions)
)

**Observations and insights**

As you can see, the model is relatively good at making predictions.

Model 7 is our best performing model.

Next, we can now build a batch scoring service to put the model to work.