In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.pardir)))
import lightgbm as LGB

In [4]:
import pandas as pd
import numpy  as np
from datetime import datetime
from sklearn.metrics import mean_absolute_error
from src.paths import TRANSFORMED_DATA_DIR
from src.data_split import train_test_split

In [5]:
import numpy as np
import optuna
from optuna.visualization import plot_intermediate_values
from src.model import get_pipeline
from sklearn.model_selection import KFold, TimeSeriesSplit

In [6]:
df = pd.read_parquet(TRANSFORMED_DATA_DIR / 'tarbular_data.parquet')

In [7]:
df['hour'] = df['pickup_hours'].dt.hour
df['day_of_week'] = df['pickup_hours'].dt.dayofweek
#df.drop(columns=['pickup_location_id'], inplace= True)

In [8]:
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022,6,1,0,0,0),
    target_column_name='target_rides_next_hour'
)
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 676)
y_train.shape=(32226,)
X_test.shape=(56068, 676)
y_test.shape=(56068,)


In [9]:
X_train.drop(columns = ['pickup_hours'],inplace=True)
X_test.drop(columns=['pickup_hours'],inplace=True)

In [11]:
def objective(trial: optuna.trial.Trial) -> float:
    """
    Given a set of hyper-parameters, it trains a model and computes an average
    validation error based on a TimeSeriesSplit
    """
    ## Pick the hyper-parameters
    hyperparams = {
        "metric":"mae",
        "verbose": -1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 3, 100)
    }

    tss = TimeSeriesSplit(n_splits=4)
    scores = []
    for train_index, val_index in tss.split(X_train):

        ## Split data for training and validation
        X_train_, X_val_ = X_train.iloc[train_index, :], X_train.iloc[val_index,:]
        y_train_, y_val_ = y_train.iloc[train_index], y_train.iloc[val_index]

        ## Train the model
        pipeline = get_pipeline(**hyperparams)
        pipeline.fit(X_train_, y_train_)
       

        ## Evaluate the model
        y_pred = pipeline.predict(X_val_)
        mae = mean_absolute_error(y_val_, y_pred)

        scores.append(mae)

    ## Return the mean mae
    print(scores)
    return np.array(scores).mean()
    

In [12]:
## Now we create a study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=4)

[I 2025-06-09 22:17:16,584] A new study created in memory with name: no-name-57278704-794d-40af-bbec-8b76f800993a


[I 2025-06-09 22:17:23,194] Trial 0 finished with value: 1.5967130808467889 and parameters: {'num_leaves': 91, 'feature_fraction': 0.34859703142626836, 'bagging_fraction': 0.48760522461662614, 'min_child_samples': 42}. Best is trial 0 with value: 1.5967130808467889.


[5.76631456835511, 0.33886465753995093, 0.17493596234094266, 0.10673713515115277]


[I 2025-06-09 22:17:37,458] Trial 1 finished with value: 1.4729461873034815 and parameters: {'num_leaves': 191, 'feature_fraction': 0.841346772173563, 'bagging_fraction': 0.5401731683135856, 'min_child_samples': 75}. Best is trial 1 with value: 1.4729461873034815.


[5.417881602191903, 0.27546916990543796, 0.1446489911436907, 0.05378498597289356]


[I 2025-06-09 22:17:53,822] Trial 2 finished with value: 1.4556988702906983 and parameters: {'num_leaves': 244, 'feature_fraction': 0.7033044874190797, 'bagging_fraction': 0.46474381292847255, 'min_child_samples': 55}. Best is trial 2 with value: 1.4556988702906983.


[5.348134936042027, 0.27112594206844687, 0.1462886320553524, 0.05724597099696703]


[I 2025-06-09 22:18:00,355] Trial 3 finished with value: 1.5849978807394638 and parameters: {'num_leaves': 71, 'feature_fraction': 0.49867498849812775, 'bagging_fraction': 0.22312716332919091, 'min_child_samples': 28}. Best is trial 2 with value: 1.4556988702906983.


[5.6052901690547365, 0.39719878760024224, 0.2122303802406524, 0.12527218606222412]


In [13]:
best_params = study.best_trial.params
print(f"{best_params=}")

best_params={'num_leaves': 244, 'feature_fraction': 0.7033044874190797, 'bagging_fraction': 0.46474381292847255, 'min_child_samples': 55}


In [14]:
pipeline = get_pipeline(**best_params)
pipeline.fit(X_train,y_train)

In [15]:
predicted = pipeline.predict(X_test)
tuned_mae = mean_absolute_error(y_test, predicted)
print(f"MAE After tuning: {tuned_mae}")

MAE After tuning: 2.5676662762430786


In [16]:
from src.plot import plot_one_sample

In [17]:
X_train, y_train, X_test, y_test = train_test_split(
    df,
    cutoff_date=datetime(2022,6,1,0,0,0),
    target_column_name='target_rides_next_hour'
)
print(f'{X_train.shape=}')
print(f'{y_train.shape=}')
print(f'{X_test.shape=}')
print(f'{y_test.shape=}')

X_train.shape=(32226, 676)
y_train.shape=(32226,)
X_test.shape=(56068, 676)
y_test.shape=(56068,)


In [22]:
plot_one_sample(
    features= X_test,
    targets = y_test,
    example_id = 2979,
    predictions = pd.Series(predicted)
)

In [18]:
plot_one_sample(
    features= X_test,
    targets = y_test,
    example_id = 199,
    predictions = pd.Series(predicted)
)