# Train gradient boosting model using Bayesian optimization to find best hyperparameters

# 1. Imports

## 1.1 Packages

In [1]:
import sys

import pandas as pd


## 1.2 Options

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
sys.path.append('../src')

# from velib_prediction.pipelines.train_model.mlflow import (  # noqa: E402
#     create_mlflow_experiment,
# )
from velib_prediction.pipelines.train_model.nodes import (  # noqa: E402
    add_lags_sma,
    # get_split_train_val_cv,
    split_train_valid_last_hours,
    train_model_bayesian_opti,
)
    # train_model_mlflow,
# )


In [4]:
lags_to_try = [1,]

In [5]:
feat_date = "duedate"

## 1.3 Datasets

In [6]:
df_training = pd.read_parquet("../data/04_feature/df_feat_train.parquet")
df_training.sample(2)

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
5,152021730671980,15202,1,30,23,6,3,3,1,1,2024-11-03 22:13:00+00:00,75056,2024,11,3,6,1
1,60031730613082,6003,1,21,3,17,13,4,1,1,2024-11-03 05:51:22+00:00,75056,2024,11,3,6,1


In [7]:
df_training.rename(columns={"numbikesavailable": "target"}, inplace=True)

In [8]:
df_training.tail()

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
5,170411729666463,17041,1,36,28,8,8,0,1,1,2024-10-23 06:54:23+00:00,75056,2024,10,23,2,0
6,150471729666488,15047,1,52,48,2,2,0,1,1,2024-10-23 06:54:48+00:00,75056,2024,10,23,2,0
7,121091729666278,12109,1,30,22,8,6,2,1,1,2024-10-23 06:51:18+00:00,75056,2024,10,23,2,0
8,141081729666376,14108,1,21,17,3,2,1,1,1,2024-10-23 06:52:56+00:00,75056,2024,10,23,2,0
9,161381729666193,16138,1,30,15,14,10,4,1,1,2024-10-23 06:49:53+00:00,75056,2024,10,23,2,0


# 2. Prepare datasets

In [9]:
# Add lags defined
df_training = add_lags_sma(df_training, lags_to_try, feat_id='stationcode', feat_date=feat_date, feat_target="target", n_shift=5)

In [10]:
df_training.sort_values(by="duedate", inplace=True)

In [11]:
# df_training.drop(columns="duedate", inplace=True)
# list_df = get_split_train_val_cv(df_training, n_splits=2)

# df_train, df_valid = list_df[0][0], list_df[0][1]

In [12]:
df_training.loc[df_training["stationcode"] == '11025']

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend,sma_1_lag
7,110251729352699,11025,1,43,15,26,15,11,1,1,2024-10-19 15:44:59+00:00,75056,2024,10,19,5,1,
6,110251729356310,11025,1,43,17,22,17,5,1,1,2024-10-19 16:45:10+00:00,75056,2024,10,19,5,1,
5,110251729363427,11025,1,43,9,30,23,7,1,1,2024-10-19 18:43:47+00:00,75056,2024,10,19,5,1,
6,110251729377872,11025,1,43,7,34,28,6,1,1,2024-10-19 22:44:32+00:00,75056,2024,10,19,5,1,
6,110251729388654,11025,1,43,5,35,27,8,1,1,2024-10-20 01:44:14+00:00,75056,2024,10,20,6,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,110251730656452,11025,1,43,14,29,21,8,1,1,2024-11-03 17:54:12+00:00,75056,2024,11,3,6,1,28.0
6,110251730657905,11025,1,43,5,38,27,11,1,1,2024-11-03 18:18:25+00:00,75056,2024,11,3,6,1,21.0
5,110251730710732,11025,1,43,11,31,27,4,1,1,2024-11-04 08:58:52+00:00,75056,2024,11,4,0,0,14.0
9,110251730721334,11025,1,43,13,30,27,3,1,1,2024-11-04 11:55:34+00:00,75056,2024,11,4,0,0,28.0


In [13]:
df_train, df_valid = split_train_valid_last_hours(df_training, n_hours=24)

In [14]:
feat_cat = [
    "is_installed",
    "is_renting",
    "is_returning",
    "code_insee_commune",
    "duedate_weekend",
]

# 3. Train model

In [15]:
# experiment_id = create_mlflow_experiment(
#     experiment_folder_path="../data/06_models/mlruns",
#     experiment_name="velib_prediction"
# )
# experiment_id

experiment_id = '587291553688351204'

In [16]:
search_params = {
    "iterations": {
        "min": 100,
        "max": 500,
        "sampling_type": "int",
    },
    "depth": {
        "min": 2,
        "max": 7,
        "sampling_type": "int",
    },
    "learning_rate": {
        "min": 0.0001,
        "max": 0.5,
        "sampling_type": "loguniform"
    },
}

# params = {
#     "iterations": 100,
#     "depth": 4
# }

In [17]:
cols_to_drop = [
    "idx", "duedate",
]

In [18]:
df_train.drop(columns=cols_to_drop)

Unnamed: 0,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend,sma_1_lag
6,6108,1,17,11,6,5,1,1,1,75056,2024,10,19,5,1,
1,9020,1,21,13,7,4,3,1,1,75056,2024,10,19,5,1,
1,9020,1,21,13,7,4,3,1,1,75056,2024,10,19,5,1,
2,14111,1,25,21,2,1,1,1,1,75056,2024,10,19,5,1,
7,15202,1,30,26,4,1,3,1,1,75056,2024,10,19,5,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,14014,1,60,45,12,5,7,1,1,75056,2024,11,3,6,1,17.0
4,6003,1,21,2,18,12,6,1,1,75056,2024,11,3,6,1,17.0
3,6003,1,21,4,16,13,3,1,1,75056,2024,11,3,6,1,17.0
3,8026,1,12,9,2,1,1,1,1,75056,2024,11,3,6,1,3.0


In [19]:
train_model_bayesian_opti(
    run_name="Test_catboost",
    experiment_id=experiment_id,
    search_params=search_params,
    df_train=df_train.drop(columns=cols_to_drop),
    df_valid=df_valid.drop(columns=cols_to_drop),
    feat_cat=feat_cat,
    n_trials=10
)

# model, rmse_train, rmse_valid = train_model_mlflow(
#     experiment_id=experiment_id,
#     parent_run_id=None,
#     df_train=df_train,
#     df_valid=df_valid,
#     feat_cat=feat_cat,
#     **params
# )

# print("RMSE train:", rmse_train)
# print("RMSE valid:", rmse_valid)

[I 2024-12-09 08:01:50,852] A new study created in memory with name: 


  0%|          | 0/10 [00:00<?, ?it/s]



0:	learn: 9.5568163	test: 8.4866391	best: 8.4866391 (0)	total: 59.2ms	remaining: 13s
100:	learn: 9.1623635	test: 8.1718421	best: 8.1718421 (100)	total: 126ms	remaining: 150ms
200:	learn: 8.7861010	test: 7.8816464	best: 7.8816464 (200)	total: 199ms	remaining: 19.8ms
220:	learn: 8.7129440	test: 7.8238123	best: 7.8238123 (220)	total: 212ms	remaining: 0us

bestTest = 7.82381234
bestIteration = 220





[I 2024-12-09 08:01:52,924] Trial 0 finished with value: 7.823812340388705 and parameters: {'iterations': 221, 'depth': 7, 'learning_rate': 0.000505031897840583}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.4640479	test: 8.4070991	best: 8.4070991 (0)	total: 618us	remaining: 124ms
100:	learn: 3.5220691	test: 3.7413236	best: 3.7413236 (100)	total: 56.1ms	remaining: 55.6ms
200:	learn: 1.4450281	test: 2.3048224	best: 2.3048224 (200)	total: 110ms	remaining: 0us

bestTest = 2.304822444
bestIteration = 200





[I 2024-12-09 08:01:54,361] Trial 1 finished with value: 2.304822443910079 and parameters: {'iterations': 201, 'depth': 6, 'learning_rate': 0.012186164003373176}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.2041322	test: 8.1848497	best: 8.1848497 (0)	total: 696us	remaining: 110ms
100:	learn: 0.4552195	test: 1.7312920	best: 1.7312920 (100)	total: 101ms	remaining: 57.7ms
158:	learn: 0.2645356	test: 1.5756336	best: 1.5756336 (158)	total: 137ms	remaining: 0us

bestTest = 1.575633621
bestIteration = 158

[I 2024-12-09 08:01:55,744] Trial 2 finished with value: 1.5756336214696478 and parameters: {'iterations': 159, 'depth': 6, 'learning_rate': 0.04503021469470911}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.3818460	test: 8.3411336	best: 8.3411336 (0)	total: 751us	remaining: 353ms
100:	learn: 1.5735048	test: 2.4326247	best: 2.4326247 (100)	total: 67.8ms	remaining: 248ms
200:	learn: 0.4422857	test: 1.8549683	best: 1.8549683 (200)	total: 134ms	remaining: 179ms




300:	learn: 0.2621052	test: 1.7485755	best: 1.7485755 (300)	total: 199ms	remaining: 112ms
400:	learn: 0.2086949	test: 1.7108558	best: 1.7108558 (400)	total: 264ms	remaining: 46ms
470:	learn: 0.1846612	test: 1.7003358	best: 1.7003358 (470)	total: 308ms	remaining: 0us

bestTest = 1.700335778
bestIteration = 470

[I 2024-12-09 08:01:57,366] Trial 3 finished with value: 1.7003357778462258 and parameters: {'iterations': 471, 'depth': 7, 'learning_rate': 0.02302610518951525}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.5349406	test: 8.4684328	best: 8.4684328 (0)	total: 812us	remaining: 369ms
100:	learn: 7.2373776	test: 6.6595595	best: 6.6595595 (100)	total: 68.9ms	remaining: 242ms
200:	learn: 5.5271250	test: 5.3539057	best: 5.3539057 (200)	total: 143ms	remaining: 181ms




300:	learn: 4.2395752	test: 4.3876799	best: 4.3876799 (300)	total: 243ms	remaining: 125ms
400:	learn: 3.2746374	test: 3.6800242	best: 3.6800242 (400)	total: 320ms	remaining: 43.9ms
455:	learn: 2.8451338	test: 3.3784863	best: 3.3784863 (455)	total: 364ms	remaining: 0us

bestTest = 3.378486299
bestIteration = 455





[I 2024-12-09 08:01:59,101] Trial 4 finished with value: 3.3784862987448654 and parameters: {'iterations': 456, 'depth': 7, 'learning_rate': 0.0033177433743880463}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 8.9993232	test: 8.0117074	best: 8.0117074 (0)	total: 562us	remaining: 105ms
100:	learn: 0.3235499	test: 1.5570000	best: 1.5570000 (100)	total: 48.8ms	remaining: 41.6ms
186:	learn: 0.2026672	test: 1.4962110	best: 1.4962110 (186)	total: 88.5ms	remaining: 0us

bestTest = 1.496210969
bestIteration = 186

[I 2024-12-09 08:02:00,418] Trial 5 finished with value: 1.4962109689442773 and parameters: {'iterations': 187, 'depth': 5, 'learning_rate': 0.06913985147144269}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.3950331	test: 8.3485492	best: 8.3485492 (0)	total: 572us	remaining: 267ms
100:	learn: 1.9403626	test: 2.5288179	best: 2.5288179 (100)	total: 49.5ms	remaining: 179ms
200:	learn: 0.5867084	test: 1.7178588	best: 1.7178588 (200)	total: 96ms	remaining: 127ms




400:	learn: 0.2454822	test: 1.5055625	best: 1.5055625 (400)	total: 199ms	remaining: 32.7ms
466:	learn: 0.2220021	test: 1.4859419	best: 1.4859355 (465)	total: 233ms	remaining: 0us

bestTest = 1.485935486
bestIteration = 465

Shrink model to first 466 iterations.




[I 2024-12-09 08:02:02,113] Trial 6 finished with value: 1.4859354864397998 and parameters: {'iterations': 467, 'depth': 5, 'learning_rate': 0.020346777100231937}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.2300053	test: 8.2069503	best: 8.2069503 (0)	total: 771us	remaining: 128ms
100:	learn: 0.5137076	test: 1.7841841	best: 1.7841841 (100)	total: 56.6ms	remaining: 37ms
166:	learn: 0.2742931	test: 1.6404761	best: 1.6404761 (166)	total: 93.3ms	remaining: 0us

bestTest = 1.64047609
bestIteration = 166

[I 2024-12-09 08:02:03,423] Trial 7 finished with value: 1.6404760900411484 and parameters: {'iterations': 167, 'depth': 6, 'learning_rate': 0.04175477482488175}. Best is trial 0 with value: 7.823812340388705.
0:	learn: 9.5595719	test: 8.4889328	best: 8.4889328 (0)	total: 836us	remaining: 219ms
100:	learn: 9.4398545	test: 8.3933667	best: 8.3933667 (100)	total: 67.7ms	remaining: 109ms
200:	learn: 9.3220917	test: 8.3012251	best: 8.3012251 (200)	total: 135ms	remaining: 41.6ms




262:	learn: 9.2496899	test: 8.2435344	best: 8.2435344 (262)	total: 179ms	remaining: 0us

bestTest = 8.243534426
bestIteration = 262

[I 2024-12-09 08:02:04,812] Trial 8 finished with value: 8.243534425601819 and parameters: {'iterations': 263, 'depth': 7, 'learning_rate': 0.00015078459524436603}. Best is trial 8 with value: 8.243534425601819.
0:	learn: 9.2074235	test: 8.1887186	best: 8.1887186 (0)	total: 596us	remaining: 253ms
100:	learn: 0.5292652	test: 1.6838851	best: 1.6838851 (100)	total: 49.4ms	remaining: 159ms
200:	learn: 0.2715914	test: 1.4847505	best: 1.4847505 (200)	total: 97.2ms	remaining: 108ms
300:	learn: 0.2048121	test: 1.4549661	best: 1.4547176 (280)	total: 146ms	remaining: 60ms




400:	learn: 0.1570922	test: 1.4454197	best: 1.4446821 (391)	total: 194ms	remaining: 11.6ms
424:	learn: 0.1485193	test: 1.4435872	best: 1.4435398 (418)	total: 211ms	remaining: 0us

bestTest = 1.443539816
bestIteration = 418

Shrink model to first 419 iterations.
[I 2024-12-09 08:02:06,317] Trial 9 finished with value: 1.4435398162573434 and parameters: {'iterations': 425, 'depth': 5, 'learning_rate': 0.0434421882674713}. Best is trial 8 with value: 8.243534425601819.


{'iterations': 263, 'depth': 7, 'learning_rate': 0.00015078459524436603}

In [1]:
import pandas as pd

df = pd.read_parquet("../data/04_feature/df_feat_train.parquet")

df.head(1000).to_parquet("../tests/data/df_train.parquet")

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
0,90201730580768,9020,1,21,13,8,0,8,1,1,2024-11-02 20:52:48+00:00,75056,2024,11,2,5,1
1,141111730580684,14111,1,25,18,3,0,3,1,1,2024-11-02 20:51:24+00:00,75056,2024,11,2,5,1
2,140141730580750,14014,1,60,47,11,2,9,1,1,2024-11-02 20:52:30+00:00,75056,2024,11,2,5,1
3,60031730580840,6003,1,21,1,20,10,10,1,1,2024-11-02 20:54:00+00:00,75056,2024,11,2,5,1
4,61081730580670,6108,1,17,12,4,1,3,1,1,2024-11-02 20:51:10+00:00,75056,2024,11,2,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,61081729796238,6108,1,17,13,4,3,1,1,1,2024-10-24 18:57:18+00:00,75056,2024,10,24,3,0
6,70031729796295,7003,1,60,55,2,2,0,1,1,2024-10-24 18:58:15+00:00,75056,2024,10,24,3,0
7,110251729796230,11025,1,43,6,31,20,11,1,1,2024-10-24 18:57:10+00:00,75056,2024,10,24,3,0
8,150471729796149,15047,1,52,42,7,3,4,1,1,2024-10-24 18:55:49+00:00,75056,2024,10,24,3,0
