# Train a gradient boosting model using Bayesian optimization to find the best hyperparameters

# 1. Imports

## 1.1 Packages

In [14]:
import sys

import pandas as pd


## 1.2 Options

In [15]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell runs, so edits to the project source are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Make the project package under ../src importable from this notebook.
# This must run before the velib_prediction imports below (hence the E402 waivers).
sys.path.append('../src')

# from velib_prediction.pipelines.train_model.mlflow import (  # noqa: E402
#     create_mlflow_experiment,
# )
from velib_prediction.pipelines.train_model.nodes import (  # noqa: E402
    add_lags_sma,
    get_split_train_val_cv,
    split_train_valid_last_hours,
    # train_model_bayesian_opti,
    train_model_mlflow,
)


In [17]:
# Lag values handed to add_lags_sma when building the lag features
lags_to_try = [1]

In [18]:
# Name of the timestamp column; passed to add_lags_sma as `feat_date`
feat_date = "duedate"

## 1.3 Datasets

In [19]:
# Load the training feature table from the 04_feature data layer
df_training = pd.read_parquet("../data/04_feature/df_feat_train.parquet")
# Seed the preview so the displayed rows are the same on every run
df_training.sample(2, random_state=42)

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,numbikesavailable,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
0,141111730400463,14111,1,25,14,8,2,6,1,1,2024-10-31 18:47:43+00:00,75056,2024,10,31,3,0
5,130071730450963,13007,1,48,42,4,2,2,1,1,2024-11-01 08:49:23+00:00,75056,2024,11,1,4,0


In [20]:
# The number of available bikes is the quantity to predict: expose it as "target"
df_training = df_training.rename(columns={"numbikesavailable": "target"})

In [21]:
# Preview the last rows; the bike-count column should now appear as "target"
df_training.tail()

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend
5,170411729666463,17041,1,36,28,8,8,0,1,1,2024-10-23 06:54:23+00:00,75056,2024,10,23,2,0
6,150471729666488,15047,1,52,48,2,2,0,1,1,2024-10-23 06:54:48+00:00,75056,2024,10,23,2,0
7,121091729666278,12109,1,30,22,8,6,2,1,1,2024-10-23 06:51:18+00:00,75056,2024,10,23,2,0
8,141081729666376,14108,1,21,17,3,2,1,1,1,2024-10-23 06:52:56+00:00,75056,2024,10,23,2,0
9,161381729666193,16138,1,30,15,14,10,4,1,1,2024-10-23 06:49:53+00:00,75056,2024,10,23,2,0


# 2. Prepare datasets

In [22]:
# Add one lag feature per entry of lags_to_try (e.g. `sma_1_lag`)
df_training = add_lags_sma(
    df_training,
    lags_to_try,
    feat_id='stationcode',
    feat_date=feat_date,
    feat_target="target",
    n_shift=5,
)

In [23]:
# Order rows chronologically. Use the configured date column (feat_date) instead
# of repeating its name, and reassign rather than mutating in place.
df_training = df_training.sort_values(by=feat_date)

In [None]:
# NOTE(review): disabled alternative split based on get_split_train_val_cv;
# nothing in this cell runs. Kept for reference only.
# df_training.drop(columns="duedate", inplace=True)
# list_df = get_split_train_val_cv(df_training, n_splits=2)

# df_train, df_valid = list_df[0][0], list_df[0][1]

In [35]:
# Inspect a single station to eyeball the generated lag column
df_training.loc[df_training["stationcode"].eq('11025')]

Unnamed: 0,idx,stationcode,is_installed,capacity,numdocksavailable,target,mechanical,ebike,is_renting,is_returning,duedate,code_insee_commune,duedate_year,duedate_month,duedate_day,duedate_weekday,duedate_weekend,sma_1_lag
7,110251729352699,11025,1,43,15,26,15,11,1,1,2024-10-19 15:44:59+00:00,75056,2024,10,19,5,1,
6,110251729356310,11025,1,43,17,22,17,5,1,1,2024-10-19 16:45:10+00:00,75056,2024,10,19,5,1,
5,110251729363427,11025,1,43,9,30,23,7,1,1,2024-10-19 18:43:47+00:00,75056,2024,10,19,5,1,
6,110251729377872,11025,1,43,7,34,28,6,1,1,2024-10-19 22:44:32+00:00,75056,2024,10,19,5,1,
6,110251729388654,11025,1,43,5,35,27,8,1,1,2024-10-20 01:44:14+00:00,75056,2024,10,20,6,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9,110251730656452,11025,1,43,14,29,21,8,1,1,2024-11-03 17:54:12+00:00,75056,2024,11,3,6,1,28.0
6,110251730657905,11025,1,43,5,38,27,11,1,1,2024-11-03 18:18:25+00:00,75056,2024,11,3,6,1,21.0
5,110251730710732,11025,1,43,11,31,27,4,1,1,2024-11-04 08:58:52+00:00,75056,2024,11,4,0,0,14.0
9,110251730721334,11025,1,43,13,30,27,3,1,1,2024-11-04 11:55:34+00:00,75056,2024,11,4,0,0,28.0


In [None]:
# Split into train / validation sets.
# `split_train_valid` is neither imported nor defined in this notebook, so the
# original call raises NameError on a fresh kernel; the helper imported at the
# top is `split_train_valid_last_hours`.
# NOTE(review): assumes the helper accepts (df, n_hours=...) — confirm against nodes.py.
df_train, df_valid = split_train_valid_last_hours(df_training, n_hours=5)

In [33]:
# Latest timestamp per station in the training set
(
    df_train
    .groupby("stationcode")["duedate"]
    .max()
    .sort_index()
)

stationcode
11025   2024-11-04 11:55:34+00:00
11104   2024-11-04 11:58:43+00:00
12109   2024-11-04 08:59:02+00:00
13007   2024-11-03 21:47:11+00:00
13101   2024-11-03 22:21:38+00:00
14014   2024-11-03 22:21:42+00:00
14108   2024-11-02 18:45:12+00:00
14111   2024-11-03 21:36:08+00:00
15047   2024-11-03 22:22:42+00:00
15068   2024-10-21 00:43:19+00:00
15202   2024-11-03 22:13:00+00:00
16107   2024-11-03 17:56:09+00:00
16138   2024-11-03 22:18:38+00:00
17025   2024-11-01 21:50:07+00:00
17026   2024-11-03 14:55:56+00:00
17041   2024-11-03 21:41:28+00:00
17044   2024-11-04 08:58:58+00:00
19027   2024-10-28 23:05:35+00:00
19033   2024-11-03 22:19:14+00:00
20143   2024-11-01 04:49:34+00:00
2022    2024-11-03 22:10:00+00:00
21010   2024-11-03 18:14:12+00:00
5016    2024-11-04 11:58:14+00:00
5110    2024-11-04 12:54:32+00:00
6003    2024-11-04 11:59:16+00:00
6021    2024-11-03 22:26:30+00:00
6108    2024-11-04 11:46:01+00:00
7002    2024-11-04 08:57:35+00:00
7003    2024-11-04 12:59:32+00:00
80

In [32]:
# Earliest timestamp per station in the validation set
(
    df_valid
    .groupby("stationcode")["duedate"]
    .min()
    .sort_index()
)

stationcode
10013   2024-10-30 19:14:45+00:00
11025   2024-11-04 18:59:26+00:00
11104   2024-11-04 16:53:49+00:00
12109   2024-11-04 16:59:08+00:00
13007   2024-11-04 11:54:49+00:00
13101   2024-11-04 16:57:40+00:00
13118   2024-10-30 19:08:50+00:00
14014   2024-11-04 18:58:00+00:00
14108   2024-11-03 15:56:03+00:00
14111   2024-11-04 11:55:27+00:00
15047   2024-11-04 12:50:17+00:00
15068   2024-10-28 23:03:52+00:00
15202   2024-11-04 12:58:09+00:00
16107   2024-11-04 11:49:37+00:00
16138   2024-11-04 08:58:17+00:00
17025   2024-11-02 07:45:38+00:00
17026   2024-11-03 18:16:49+00:00
17041   2024-11-04 08:57:30+00:00
17044   2024-11-04 16:58:19+00:00
19027   2024-11-01 04:49:41+00:00
19033   2024-11-04 12:51:46+00:00
20143   2024-11-03 06:46:10+00:00
2017    2024-10-30 19:13:25+00:00
2022    2024-11-04 12:59:07+00:00
21010   2024-11-04 11:58:22+00:00
5016    2024-11-04 16:54:08+00:00
5110    2024-11-04 18:59:20+00:00
6003    2024-11-04 19:00:06+00:00
6021    2024-11-04 16:58:33+00:00
61

In [27]:
# Columns passed to train_model_mlflow as `feat_cat` (categorical features)
feat_cat = [
    # Station status flags
    "is_installed",
    "is_renting",
    "is_returning",
    # Commune code
    "code_insee_commune",
    # Weekend indicator derived from the date
    "duedate_weekend",
]

# 3. Train model

In [28]:
# experiment_id = create_mlflow_experiment(
#     experiment_folder_path="../data/06_models/mlruns",
#     experiment_name="velib_prediction"
# )
# experiment_id

# Hard-coded MLflow experiment ID, used in place of the disabled
# create_mlflow_experiment call above.
# NOTE(review): presumably this ID only exists in the local mlruns store —
# confirm before running in another environment.
experiment_id = '587291553688351204'

In [29]:
# Search space for the (currently disabled) Bayesian optimisation run:
# search_params = {
#     "iterations": (100, 500),
#     "depth": (2, 7),
# }

# Fixed hyperparameters forwarded to train_model_mlflow
params = dict(iterations=100, depth=4)

In [30]:
# Bayesian optimisation entry point, kept for reference (disabled):
# train_model_bayesian_opti(
#     run_name="Test_catboost",
#     experiment_id=experiment_id,
#     search_params=search_params,
#     df_train=df_train,
#     df_valid=df_valid,
#     feat_cat=feat_cat,
#     n_trials=10
# )

# Single training run using the fixed hyperparameters in `params`
model, rmse_train, rmse_valid = train_model_mlflow(
    experiment_id=experiment_id,
    parent_run_id=None,
    df_train=df_train,
    df_valid=df_valid,
    feat_cat=feat_cat,
    **params,
)

# Report the error on both splits
print(f"RMSE train: {rmse_train!s}")
print(f"RMSE valid: {rmse_valid!s}")



Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

RMSE train: 0.2992554793036776
RMSE valid: 0.7423017662117376
