In [None]:
import warnings
warnings.filterwarnings("ignore")

In [1]:
import pandas as pd
from src.paths import TRANSFORMED_DATA_DIR

df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tabular_data.pq')

In [2]:
from src.data_split import train_test_split
from datetime import datetime

X_train, y_train, X_test, y_test =train_test_split(df, 
    cutoff_date=datetime(2022, 6, 1, 0, 0 ,0),
    target_column_name='target_ride_next_hour')

In [3]:
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna

from src.model import get_pipeline

def objective(trial: optuna.trial.Trial) -> float:

    hyperparams = {
        "metrics": 'mae',
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_float("min_child_samples", 3, 100),
    }

    tss = TimeSeriesSplit(n_splits=4)
    scores = []

    for train_index, val_index in tss.split(X_train):

        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index, :]
        y_train_, y_val_ = y_train.iloc[train_index, :], y_train.iloc[val_index, :]

        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)

        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        return np.array(scores).mean()


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=5)

[I 2024-07-28 20:42:54,008] A new study created in memory with name: no-name-b6c27ce7-1c2d-4001-8722-84c4c14e3203
[W 2024-07-28 20:42:54,287] Trial 0 failed with parameters: {'num_leaves': 99, 'feature_fraction': 0.7988659392773367, 'bagging_fraction': 0.7978312474320466, 'min_child_samples': 89.86275414889485} because of the following error: IndexingError('Too many indexers').
Traceback (most recent call last):
  File "c:\Users\ingfz\ML Projects\taxi_demand_predictor\taxienv\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\TEMP\ipykernel_6752\2897637355.py", line 26, in objective
    y_train_, y_val_ = y_train.iloc[train_index, :], y_train.iloc[val_index, :]
  File "c:\Users\ingfz\ML Projects\taxi_demand_predictor\taxienv\lib\site-packages\pandas\core\indexing.py", line 1097, in __getitem__
    return self._getitem_tuple(key)
  File "c:\Users\ingfz\ML Projects\taxi_demand_predictor\taxienv\lib\site-packages\pandas\core\

IndexingError: Too many indexers

In [6]:
# study.best_trial.params

In [7]:
best_params = {'num_leaves': 161, 
               'feature_fraction':0.6402569, 
               'bagging_fraction':0.6253098, 
               'min_child_samples':34}

In [8]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.214299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 154527
[LightGBM] [Info] Number of data points in the train set: 32226, number of used features: 675
[LightGBM] [Info] Start training from score 11.703562


In [9]:
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, predictions)
print(f'{test_mae=:.4f}')

test_mae=2.5673


In [10]:
from src.plot import plot_one_sample

plot_one_sample(X_test,
                y_test,
                example_id=2979,
                predictions=pd.Series(predictions))