In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.parquet')

In [3]:
from datetime import datetime
from src.data_split import train_test_split

X_train, y_train, X_test, y_test = train_test_split(
  df,
  cutoff_date=datetime(2023, 6, 1, 0, 0, 0),
  target_column_name='target_rides_next_hour'
)

print(f'{X_train.shape}')
print(f'{y_train.shape}')
print(f'{X_test.shape}')
print(f'{y_test.shape}')

(32595, 674)
(32595,)
(56710, 674)
(56710,)


In [4]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    # pick hyper-parameters
    hyperparams = {
        "metric": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100),   
    }
       
    tss = KFold(n_splits=3)
    scores = []
        
    for train_index, val_index in tss.split(X_train):

        # split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]
        
        # train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
        
        # evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)
   
    # Return the mean score
    return np.array(scores).mean()

In [5]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

[I 2024-04-22 14:39:41,066] A new study created in memory with name: no-name-2fef30a3-3f07-435b-947d-588c290bc9af
[I 2024-04-22 14:39:55,173] Trial 0 finished with value: 2.5723226993928967 and parameters: {'num_leaves': 163, 'feature_fraction': 0.6553407316276827, 'bagging_fraction': 0.996042901264252, 'min_child_samples': 27}. Best is trial 0 with value: 2.5723226993928967.
[I 2024-04-22 14:40:03,926] Trial 1 finished with value: 2.626393063705967 and parameters: {'num_leaves': 144, 'feature_fraction': 0.4673702626856997, 'bagging_fraction': 0.4487969441760048, 'min_child_samples': 98}. Best is trial 0 with value: 2.5723226993928967.
[I 2024-04-22 14:40:12,328] Trial 2 finished with value: 2.589102119487292 and parameters: {'num_leaves': 157, 'feature_fraction': 0.3815101452324327, 'bagging_fraction': 0.6483623916767264, 'min_child_samples': 29}. Best is trial 0 with value: 2.5723226993928967.
[I 2024-04-22 14:40:17,408] Trial 3 finished with value: 2.5842591735786744 and parameters:

In [6]:
best_params = study.best_trial.params
print(f'{best_params=}')

best_params={'num_leaves': 163, 'feature_fraction': 0.6553407316276827, 'bagging_fraction': 0.996042901264252, 'min_child_samples': 27}


In [7]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.065912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 156147
[LightGBM] [Info] Number of data points in the train set: 32595, number of used features: 675
[LightGBM] [Info] Start training from score 11.288664


In [8]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.9629


In [13]:
from src.plot import plot_one_sample

plot_one_sample(
  features=X_test, 
  targets=y_test,
  example_id=2000,
  predictions=pd.Series(predictions)
)