# Train a gradient boosting model using Bayesian optimization to find the best hyperparameters

# 1. Imports

## 1.1 Packages

In [1]:
import sys

import pandas as pd


## 1.2 Options

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the project sources are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [3]:
# Make the project package under ../src importable from this notebook.
sys.path.append('../src')

# Disabled import: create_mlflow_experiment is only referenced by commented-out code
# further down (creation of the MLflow experiment).
# from velib_prediction.pipelines.train_model.mlflow import (  # noqa: E402
#     create_mlflow_experiment,
# )
from velib_prediction.pipelines.train_model.nodes import (  # noqa: E402
    add_lags_sma,
    # get_split_train_val_cv,
    split_train_valid_last_hours,
    train_model_bayesian_opti,
)
# Disabled import from the same module (used only by commented-out code): train_model_mlflow


In [4]:
# Lag values forwarded to add_lags_sma when the lag features are built.
lags_to_try = [1]

In [5]:
# Name of the timestamp column; handed to add_lags_sma as its `feat_date` argument.
feat_date = "duedate"

## 1.3 Datasets

In [6]:
# Load the training feature table and preview two randomly drawn rows.
df_training = pd.read_parquet("../data/04_feature/df_feat_train.parquet")
df_training.sample(n=2)

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
3,130071730443606,13007,1,48,42,4,2,2,1,1,2024-11-01 06:46:46+00:00,75056,2024,11,1,4,0
1,90201730541178,9020,1,21,10,11,8,3,1,1,2024-11-02 09:52:58+00:00,75056,2024,11,2,5,1


In [7]:
# Expose the number of available bikes under the generic name "target".
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
df_training = df_training.rename(columns={"numbikesavailable": "target"})

In [8]:
# Peek at the last five rows of the training frame.
df_training.tail(n=5)

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
5,170411729666463,17041,1,36,28,8,8,0,1,1,2024-10-23 06:54:23+00:00,75056,2024,10,23,2,0
6,150471729666488,15047,1,52,48,2,2,0,1,1,2024-10-23 06:54:48+00:00,75056,2024,10,23,2,0
7,121091729666278,12109,1,30,22,8,6,2,1,1,2024-10-23 06:51:18+00:00,75056,2024,10,23,2,0
8,141081729666376,14108,1,21,17,3,2,1,1,1,2024-10-23 06:52:56+00:00,75056,2024,10,23,2,0
9,161381729666193,16138,1,30,15,14,10,4,1,1,2024-10-23 06:49:53+00:00,75056,2024,10,23,2,0


# 2. Prepare datasets

In [9]:
# Add lags defined
df_training = add_lags_sma(df_training, lags_to_try, feat_id='stationcode', feat_date=feat_date, feat_target="target", n_shift=5)

In [10]:
# Order the rows chronologically. The sort key is `feat_date`, the same setting
# used to build the lag features, so the two cannot drift apart if the date
# column is ever changed in the options section.
df_training = df_training.sort_values(by=feat_date)

In [11]:
# Disabled alternative split (presumably cross-validation folds) built with
# get_split_train_val_cv, whose import is commented out as well; only the first
# entry of the returned list was used as the train/validation pair.
# df_training.drop(columns="duedate", inplace=True)
# list_df = get_split_train_val_cv(df_training, n_splits=2)

# df_train, df_valid = list_df[0][0], list_df[0][1]

In [12]:
# Inspect every row of one station (code "11025") to eyeball the new lag column.
df_training.loc[df_training["stationcode"].eq("11025")]

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend,sma_1_lag
7,110251729352699,11025,1,43,15,26,15,11,1,1,2024-10-19 15:44:59+00:00,75056,2024,10,19,5,1,
6,110251729356310,11025,1,43,17,22,17,5,1,1,2024-10-19 16:45:10+00:00,75056,2024,10,19,5,1,
5,110251729363427,11025,1,43,9,30,23,7,1,1,2024-10-19 18:43:47+00:00,75056,2024,10,19,5,1,
6,110251729377872,11025,1,43,7,34,28,6,1,1,2024-10-19 22:44:32+00:00,75056,2024,10,19,5,1,
6,110251729388654,11025,1,43,5,35,27,8,1,1,2024-10-20 01:44:14+00:00,75056,2024,10,20,6,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,110251730656452,11025,1,43,14,29,21,8,1,1,2024-11-03 17:54:12+00:00,75056,2024,11,3,6,1,28.0
6,110251730657905,11025,1,43,5,38,27,11,1,1,2024-11-03 18:18:25+00:00,75056,2024,11,3,6,1,21.0
5,110251730710732,11025,1,43,11,31,27,4,1,1,2024-11-04 08:58:52+00:00,75056,2024,11,4,0,0,14.0
9,110251730721334,11025,1,43,13,30,27,3,1,1,2024-11-04 11:55:34+00:00,75056,2024,11,4,0,0,28.0


In [13]:
# Split into training and validation frames. n_hours=24 presumably holds out the
# most recent 24 hours for validation — confirm in split_train_valid_last_hours.
df_train, df_valid = split_train_valid_last_hours(
    df_training,
    n_hours=24,
)

In [14]:
# Columns handed to the training routine through its `feat_cat` argument.
feat_cat = [
    # Station status flags
    "is_installed", "is_renting", "is_returning",
    # Commune code and weekend indicator
    "code_insee_commune", "duedate_weekend",
]

# 3. Train model

In [15]:
# Experiment ID passed to the training call below. It is pinned to a fixed value,
# presumably an experiment that already exists locally — TODO confirm. To create
# the experiment instead, re-enable the create_mlflow_experiment import and run:
#
#     experiment_id = create_mlflow_experiment(
#         experiment_folder_path="../data/06_models/mlruns",
#         experiment_name="velib_prediction"
#     )
#     experiment_id
experiment_id = "587291553688351204"

In [16]:
# Hyperparameter search space given to train_model_bayesian_opti.
# Each pair is presumably a (lower, upper) bound — confirm against the project code.
search_params = dict(
    iterations=(100, 500),
    depth=(2, 7),
)

# Fixed hyperparameters for the disabled single-run path (train_model_mlflow):
# params = {
#     "iterations": 100,
#     "depth": 4
# }

In [17]:
# Launch the hyperparameter search (10 trials) over `search_params`, using the
# train/validation frames and categorical feature list prepared above.
# NOTE(review): the output recorded for this cell is a TypeError raised on trial 0
# inside the project code: "optimize_hyperparams() got multiple values for argument
# 'search_params'". The objective call in train_model_bayesian_opti presumably
# supplies `search_params` both positionally and by keyword — verify and fix it in
# velib_prediction.pipelines.train_model.nodes; the call below is not at fault.
train_model_bayesian_opti(
    run_name="Test_catboost",
    experiment_id=experiment_id,
    search_params=search_params,
    df_train=df_train,
    df_valid=df_valid,
    feat_cat=feat_cat,
    n_trials=10
)

# Disabled single-run alternative: trains one model with the fixed `params`
# (also commented out) and prints its train/validation RMSE.
# model, rmse_train, rmse_valid = train_model_mlflow(
#     experiment_id=experiment_id,
#     parent_run_id=None,
#     df_train=df_train,
#     df_valid=df_valid,
#     feat_cat=feat_cat,
#     **params
# )

# print("RMSE train:", rmse_train)
# print("RMSE valid:", rmse_valid)

[I 2024-12-01 21:08:19,738] A new study created in memory with name: 


  0%|          | 0/10 [00:00<?, ?it/s]

[W 2024-12-01 21:08:19,752] Trial 0 failed with parameters: {} because of the following error: TypeError("optimize_hyperparams() got multiple values for argument 'search_params'").
Traceback (most recent call last):
  File "/Users/benjaminwallyn/Git/velib-prediction/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
TypeError: optimize_hyperparams() got multiple values for argument 'search_params'
[W 2024-12-01 21:08:19,753] Trial 0 failed with value None.


TypeError: optimize_hyperparams() got multiple values for argument 'search_params'