# Example usage notebook
Objects and functions in this notebook are listed with all paramaters to ilustrate their capabillities. Most of the paramaters have default values in the implementation

## Imports

In [1]:
import os
import sys

sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)

In [2]:
import os
import json
from typing import Literal
import pandas as pd

from churn_pred.preprocessing.preprocess import PreprocessData
from churn_pred.training.trainer import Trainer
from churn_pred.training.optuna_optimizer import LGBOptunaOptimizer

import dill
import numpy as np
from churn_pred.utils import dill_dump
from churn_pred.utils import dill_load
from sklearn.model_selection import train_test_split

from pprint import pprint

## Train a testing model 

In [3]:
# 1. get the data
df_pd = pd.read_parquet("data/dataset_auxiliary_features_cleaned.parquet")
df_pd.head()

Unnamed: 0,CustomerId,CreditScore,Country,Gender,Age,Tenure,Balance (EUR),NumberOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited,CustomerFeedback_sentiment3,CustomerFeedback_sentiment5,Surname_Country,Surname_Country_region,Surname_Country_subregion,Country_region,Country_subregion,is_native,Country_hemisphere,Country_gdp_per_capita,Country_IncomeGroup,Surname_Country_gdp_per_capita,Surname_Country_IncomeGroup,working_class,stage_of_life,generation
0,15787619,844,France,Male,18,2,160980.03,1,0,0,145936.28,0,neutral,4 stars,Taiwan,Asia,Eastern Asia,Europe,Western Europe,0,northern,57594.03402,High income,32756.0,,working_age,teen,gen_z
1,15770309,656,France,Male,18,10,151762.74,1,0,1,127014.32,0,neutral,1 star,United States,Americas,Northern America,Europe,Western Europe,0,northern,57594.03402,High income,76329.58227,High income,working_age,teen,gen_z
2,15569178,570,France,Female,18,4,82767.42,1,1,0,71811.9,0,neutral,2 stars,Russian Federation,Europe,Eastern Europe,Europe,Western Europe,0,northern,57594.03402,High income,34637.76172,Upper middle income,working_age,teen,gen_z
3,15795519,716,Germany,Female,18,3,128743.8,1,0,0,197322.13,0,neutral,2 stars,Russian Federation,Europe,Eastern Europe,Europe,Western Europe,0,northern,66616.02225,High income,34637.76172,Upper middle income,working_age,teen,gen_z
4,15621893,727,France,Male,18,4,133550.67,1,1,1,46941.41,0,positive,1 star,Italy,Europe,Southern Europe,Europe,Western Europe,0,northern,57594.03402,High income,55442.07843,High income,working_age,teen,gen_z


In [4]:
target_col = "Exited"
id_cols = ["CustomerId"]
cat_cols = [
    "Country",
    "Gender",
    "HasCreditCard",
    "IsActiveMember",
    "CustomerFeedback_sentiment3",
    "CustomerFeedback_sentiment5",
    "Surname_Country",
    "Surname_Country_region",
    "Surname_Country_subregion",
    "Country_region",
    "Country_subregion",
    "is_native",
    "Country_hemisphere",
    "Country_IncomeGroup",
    "Surname_Country_IncomeGroup",
    "working_class",
    "stage_of_life",
    "generation",
]
cont_cols = df_pd.drop(
    columns=id_cols + cat_cols + [target_col]
).columns.values.tolist()

In [5]:
df_pd[cat_cols] = df_pd[cat_cols].astype(str)

In [6]:
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
    df_pd, test_size=valid_size, stratify=df_pd[target_col], random_state=random_state
)
df_valid, df_test = train_test_split(
    df_valid,
    test_size=test_size,
    stratify=df_valid[target_col],
    random_state=random_state,
)

In [7]:
prepare_data = PreprocessData(
    id_cols=id_cols,
    target_col=target_col,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
)
# this should be fitted only on training data
_ = prepare_data.fit(df=df_pd)

In [8]:
optimizer = LGBOptunaOptimizer(
    objective="binary",
    n_class=2,
)

trainer = Trainer(
    cat_cols=prepare_data.cat_cols,
    target_col=prepare_data.target_col,
    id_cols=id_cols,
    objective="binary",
    n_class=2,
    optimizer=optimizer,
    preprocessors=[prepare_data],
)

metrics_dict = trainer.fit(
    df_train=df_train,
    df_valid=df_valid,
    df_test=df_test,
)

[I 2024-05-01 21:23:45,400] A new study created in memory with name: no-name-d630027d-067f-416b-b1eb-7c87a894bc0a
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844:  14%|#4        | 1/7 [00:07<00:43,  7.21s/it][I 2024-05-01 21:23:52,617] Trial 0 finished with value: 0.37984401975406856 and parameters: {'feature_fraction': 0.7}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844:  14%|#4        | 1/7 [00:07<00:43,  7.21s/it]

Early stopping, best iteration is:
[163]	valid_0's binary_logloss: 0.379844
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844:  29%|##8       | 2/7 [00:09<00:21,  4.27s/it][I 2024-05-01 21:23:54,829] Trial 1 finished with value: 0.38409999598941535 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844:  29%|##8       | 2/7 [00:09<00:21,  4.27s/it]

Early stopping, best iteration is:
[234]	valid_0's binary_logloss: 0.3841
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844:  43%|####2     | 3/7 [00:11<00:12,  3.15s/it][I 2024-05-01 21:23:56,638] Trial 2 finished with value: 0.3823353357777058 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844:  43%|####2     | 3/7 [00:11<00:12,  3.15s/it]

Early stopping, best iteration is:
[183]	valid_0's binary_logloss: 0.382335
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844:  57%|#####7    | 4/7 [00:13<00:07,  2.62s/it][I 2024-05-01 21:23:58,444] Trial 3 finished with value: 0.3853827423103823 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844:  57%|#####7    | 4/7 [00:13<00:07,  2.62s/it]

Early stopping, best iteration is:
[174]	valid_0's binary_logloss: 0.385383
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844:  71%|#######1  | 5/7 [00:14<00:04,  2.32s/it][I 2024-05-01 21:24:00,240] Trial 4 finished with value: 0.38473704277343407 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844:  71%|#######1  | 5/7 [00:14<00:04,  2.32s/it]

Early stopping, best iteration is:
[227]	valid_0's binary_logloss: 0.384737
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844:  86%|########5 | 6/7 [00:16<00:02,  2.21s/it][I 2024-05-01 21:24:02,245] Trial 5 finished with value: 0.3882168120094672 and parameters: {'feature_fraction': 0.8}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844:  86%|########5 | 6/7 [00:16<00:02,  2.21s/it]

Early stopping, best iteration is:
[209]	valid_0's binary_logloss: 0.388217
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.379844: 100%|##########| 7/7 [00:17<00:00,  1.68s/it][I 2024-05-01 21:24:02,840] Trial 6 finished with value: 0.410300061224442 and parameters: {'feature_fraction': 1.0}. Best is trial 0 with value: 0.37984401975406856.
feature_fraction, val_score: 0.379844: 100%|##########| 7/7 [00:17<00:00,  2.49s/it]


Early stopping, best iteration is:
[10]	valid_0's binary_logloss: 0.4103


num_leaves, val_score: 0.379844:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373154:   5%|5         | 1/20 [00:06<01:59,  6.27s/it][I 2024-05-01 21:24:09,119] Trial 7 finished with value: 0.3731542988236144 and parameters: {'num_leaves': 89}. Best is trial 7 with value: 0.3731542988236144.
num_leaves, val_score: 0.373154:   5%|5         | 1/20 [00:06<01:59,  6.27s/it]

Early stopping, best iteration is:
[86]	valid_0's binary_logloss: 0.373154
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373154:  10%|#         | 2/20 [00:08<01:08,  3.83s/it][I 2024-05-01 21:24:11,231] Trial 8 finished with value: 0.3787072948288647 and parameters: {'num_leaves': 34}. Best is trial 7 with value: 0.3731542988236144.
num_leaves, val_score: 0.373154:  10%|#         | 2/20 [00:08<01:08,  3.83s/it]

Early stopping, best iteration is:
[185]	valid_0's binary_logloss: 0.378707
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  15%|#5        | 3/20 [00:11<01:00,  3.54s/it][I 2024-05-01 21:24:14,422] Trial 9 finished with value: 0.37027853532829447 and parameters: {'num_leaves': 182}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  15%|#5        | 3/20 [00:11<01:00,  3.54s/it]

Early stopping, best iteration is:
[50]	valid_0's binary_logloss: 0.370279
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  20%|##        | 4/20 [00:14<00:55,  3.45s/it][I 2024-05-01 21:24:17,738] Trial 10 finished with value: 0.3738566569324325 and parameters: {'num_leaves': 251}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  20%|##        | 4/20 [00:14<00:55,  3.45s/it]

Early stopping, best iteration is:
[30]	valid_0's binary_logloss: 0.373857
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  25%|##5       | 5/20 [00:17<00:46,  3.07s/it][I 2024-05-01 21:24:20,129] Trial 11 finished with value: 0.37562094257700807 and parameters: {'num_leaves': 146}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  25%|##5       | 5/20 [00:17<00:46,  3.07s/it]

Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.375621
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  30%|###       | 6/20 [00:19<00:41,  2.94s/it][I 2024-05-01 21:24:22,830] Trial 12 finished with value: 0.37355185545083636 and parameters: {'num_leaves': 128}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  30%|###       | 6/20 [00:19<00:41,  2.94s/it]

Early stopping, best iteration is:
[54]	valid_0's binary_logloss: 0.373552
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  35%|###5      | 7/20 [00:25<00:47,  3.69s/it][I 2024-05-01 21:24:28,112] Trial 13 finished with value: 0.37749137269814065 and parameters: {'num_leaves': 137}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  35%|###5      | 7/20 [00:25<00:47,  3.69s/it]

Early stopping, best iteration is:
[66]	valid_0's binary_logloss: 0.377491
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  40%|####      | 8/20 [00:27<00:38,  3.24s/it][I 2024-05-01 21:24:30,327] Trial 14 finished with value: 0.37670628993111843 and parameters: {'num_leaves': 71}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  40%|####      | 8/20 [00:27<00:38,  3.24s/it]

Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.376706
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.370279:  45%|####5     | 9/20 [00:30<00:35,  3.23s/it][I 2024-05-01 21:24:33,532] Trial 15 finished with value: 0.37171561556175114 and parameters: {'num_leaves': 211}. Best is trial 9 with value: 0.37027853532829447.
num_leaves, val_score: 0.370279:  45%|####5     | 9/20 [00:30<00:35,  3.23s/it]

Early stopping, best iteration is:
[38]	valid_0's binary_logloss: 0.371716
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367726:  50%|#####     | 10/20 [00:34<00:34,  3.40s/it][I 2024-05-01 21:24:37,332] Trial 16 finished with value: 0.36772645943071486 and parameters: {'num_leaves': 218}. Best is trial 16 with value: 0.36772645943071486.
num_leaves, val_score: 0.367726:  50%|#####     | 10/20 [00:34<00:34,  3.40s/it]

Early stopping, best iteration is:
[52]	valid_0's binary_logloss: 0.367726
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367726:  55%|#####5    | 11/20 [00:37<00:29,  3.25s/it][I 2024-05-01 21:24:40,238] Trial 17 finished with value: 0.3735775248435483 and parameters: {'num_leaves': 195}. Best is trial 16 with value: 0.36772645943071486.
num_leaves, val_score: 0.367726:  55%|#####5    | 11/20 [00:37<00:29,  3.25s/it]

Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.373578
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  60%|######    | 12/20 [00:40<00:26,  3.29s/it][I 2024-05-01 21:24:43,621] Trial 18 finished with value: 0.3631654866465407 and parameters: {'num_leaves': 198}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  60%|######    | 12/20 [00:40<00:26,  3.29s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  65%|######5   | 13/20 [00:44<00:24,  3.45s/it][I 2024-05-01 21:24:47,442] Trial 19 finished with value: 0.3658250333668991 and parameters: {'num_leaves': 255}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  65%|######5   | 13/20 [00:44<00:24,  3.45s/it]

Early stopping, best iteration is:
[42]	valid_0's binary_logloss: 0.365825
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  70%|#######   | 14/20 [00:50<00:24,  4.10s/it][I 2024-05-01 21:24:53,028] Trial 20 finished with value: 0.37765964111476913 and parameters: {'num_leaves': 250}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  70%|#######   | 14/20 [00:50<00:24,  4.10s/it]

Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.37766
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  75%|#######5  | 15/20 [00:56<00:23,  4.64s/it][I 2024-05-01 21:24:58,936] Trial 21 finished with value: 0.3635935909969657 and parameters: {'num_leaves': 224}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  75%|#######5  | 15/20 [00:56<00:23,  4.64s/it]

Early stopping, best iteration is:
[45]	valid_0's binary_logloss: 0.363594
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  80%|########  | 16/20 [01:00<00:18,  4.63s/it][I 2024-05-01 21:25:03,542] Trial 22 finished with value: 0.3762392972653625 and parameters: {'num_leaves': 256}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  80%|########  | 16/20 [01:00<00:18,  4.63s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.376239
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  85%|########5 | 17/20 [01:06<00:15,  5.07s/it][I 2024-05-01 21:25:09,619] Trial 23 finished with value: 0.372364943808219 and parameters: {'num_leaves': 221}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  85%|########5 | 17/20 [01:06<00:15,  5.07s/it]

Early stopping, best iteration is:
[45]	valid_0's binary_logloss: 0.372365
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  90%|######### | 18/20 [01:19<00:14,  7.30s/it][I 2024-05-01 21:25:22,123] Trial 24 finished with value: 0.3648830332169861 and parameters: {'num_leaves': 171}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  90%|######### | 18/20 [01:19<00:14,  7.30s/it]

Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.364883
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165:  95%|#########5| 19/20 [01:22<00:06,  6.19s/it][I 2024-05-01 21:25:25,739] Trial 25 finished with value: 0.36907375798189823 and parameters: {'num_leaves': 168}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165:  95%|#########5| 19/20 [01:22<00:06,  6.19s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.369074
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.363165: 100%|##########| 20/20 [01:25<00:00,  5.14s/it][I 2024-05-01 21:25:28,426] Trial 26 finished with value: 0.3684827888676411 and parameters: {'num_leaves': 165}. Best is trial 18 with value: 0.3631654866465407.
num_leaves, val_score: 0.363165: 100%|##########| 20/20 [01:25<00:00,  4.28s/it]


Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.368483


bagging, val_score: 0.363165:   0%|          | 0/10 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  10%|#         | 1/10 [00:03<00:27,  3.08s/it][I 2024-05-01 21:25:31,522] Trial 27 finished with value: 0.37409818995074706 and parameters: {'bagging_fraction': 0.6874386187665034, 'bagging_freq': 5}. Best is trial 27 with value: 0.37409818995074706.
bagging, val_score: 0.363165:  10%|#         | 1/10 [00:03<00:27,  3.08s/it]

Early stopping, best iteration is:
[52]	valid_0's binary_logloss: 0.374098
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  20%|##        | 2/10 [00:09<00:39,  4.99s/it][I 2024-05-01 21:25:37,841] Trial 28 finished with value: 0.37471757657090604 and parameters: {'bagging_fraction': 0.9956160383196109, 'bagging_freq': 1}. Best is trial 27 with value: 0.37409818995074706.
bagging, val_score: 0.363165:  20%|##        | 2/10 [00:09<00:39,  4.99s/it]

Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.374718
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  30%|###       | 3/10 [00:11<00:24,  3.48s/it][I 2024-05-01 21:25:39,531] Trial 29 finished with value: 0.3881328954029099 and parameters: {'bagging_fraction': 0.4000466628465502, 'bagging_freq': 7}. Best is trial 27 with value: 0.37409818995074706.
bagging, val_score: 0.363165:  30%|###       | 3/10 [00:11<00:24,  3.48s/it]

Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.388133
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  40%|####      | 4/10 [00:14<00:20,  3.41s/it][I 2024-05-01 21:25:42,830] Trial 30 finished with value: 0.37112386819072435 and parameters: {'bagging_fraction': 0.9660001811651555, 'bagging_freq': 1}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165:  40%|####      | 4/10 [00:14<00:20,  3.41s/it]

Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.371124
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  50%|#####     | 5/10 [00:17<00:15,  3.15s/it][I 2024-05-01 21:25:45,527] Trial 31 finished with value: 0.3814333892163908 and parameters: {'bagging_fraction': 0.4086955649462155, 'bagging_freq': 4}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165:  50%|#####     | 5/10 [00:17<00:15,  3.15s/it]

Early stopping, best iteration is:
[60]	valid_0's binary_logloss: 0.381433
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  60%|######    | 6/10 [00:20<00:12,  3.13s/it][I 2024-05-01 21:25:48,621] Trial 32 finished with value: 0.37164548042615214 and parameters: {'bagging_fraction': 0.6956261973827507, 'bagging_freq': 3}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165:  60%|######    | 6/10 [00:20<00:12,  3.13s/it]

Early stopping, best iteration is:
[48]	valid_0's binary_logloss: 0.371645
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  70%|#######   | 7/10 [00:22<00:08,  2.99s/it][I 2024-05-01 21:25:51,319] Trial 33 finished with value: 0.3734658479856046 and parameters: {'bagging_fraction': 0.8307663196126824, 'bagging_freq': 7}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165:  70%|#######   | 7/10 [00:22<00:08,  2.99s/it]

Early stopping, best iteration is:
[31]	valid_0's binary_logloss: 0.373466
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  80%|########  | 8/10 [00:25<00:05,  2.97s/it][I 2024-05-01 21:25:54,243] Trial 34 finished with value: 0.3796650018224596 and parameters: {'bagging_fraction': 0.5660326067670218, 'bagging_freq': 3}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165:  80%|########  | 8/10 [00:25<00:05,  2.97s/it]

Early stopping, best iteration is:
[52]	valid_0's binary_logloss: 0.379665
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165:  90%|######### | 9/10 [00:29<00:03,  3.07s/it][I 2024-05-01 21:25:57,612] Trial 35 finished with value: 0.38350780701414616 and parameters: {'bagging_fraction': 0.5512934223382902, 'bagging_freq': 5}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165:  90%|######### | 9/10 [00:29<00:03,  3.07s/it]

Early stopping, best iteration is:
[60]	valid_0's binary_logloss: 0.383508
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.363165: 100%|##########| 10/10 [00:32<00:00,  3.13s/it][I 2024-05-01 21:26:00,815] Trial 36 finished with value: 0.3787565611977642 and parameters: {'bagging_fraction': 0.8839668872496622, 'bagging_freq': 2}. Best is trial 30 with value: 0.37112386819072435.
bagging, val_score: 0.363165: 100%|##########| 10/10 [00:32<00:00,  3.24s/it]


Early stopping, best iteration is:
[38]	valid_0's binary_logloss: 0.378757


feature_fraction_stage2, val_score: 0.363165:   0%|          | 0/6 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.363165:  17%|#6        | 1/6 [00:04<00:21,  4.31s/it][I 2024-05-01 21:26:05,135] Trial 37 finished with value: 0.37778620877327024 and parameters: {'feature_fraction': 0.652}. Best is trial 37 with value: 0.37778620877327024.
feature_fraction_stage2, val_score: 0.363165:  17%|#6        | 1/6 [00:04<00:21,  4.31s/it]

Early stopping, best iteration is:
[45]	valid_0's binary_logloss: 0.377786
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.363165:  33%|###3      | 2/6 [00:09<00:18,  4.54s/it][I 2024-05-01 21:26:09,833] Trial 38 finished with value: 0.36316548664654064 and parameters: {'feature_fraction': 0.716}. Best is trial 38 with value: 0.36316548664654064.
feature_fraction_stage2, val_score: 0.363165:  33%|###3      | 2/6 [00:09<00:18,  4.54s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.363165:  50%|#####     | 3/6 [00:19<00:21,  7.17s/it][I 2024-05-01 21:26:20,131] Trial 39 finished with value: 0.3818026498511844 and parameters: {'feature_fraction': 0.7799999999999999}. Best is trial 38 with value: 0.36316548664654064.
feature_fraction_stage2, val_score: 0.363165:  50%|#####     | 3/6 [00:19<00:21,  7.17s/it]

Early stopping, best iteration is:
[29]	valid_0's binary_logloss: 0.381803
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.363165:  67%|######6   | 4/6 [00:22<00:11,  5.65s/it][I 2024-05-01 21:26:23,444] Trial 40 finished with value: 0.37231189943704796 and parameters: {'feature_fraction': 0.62}. Best is trial 38 with value: 0.36316548664654064.
feature_fraction_stage2, val_score: 0.363165:  67%|######6   | 4/6 [00:22<00:11,  5.65s/it]

Early stopping, best iteration is:
[45]	valid_0's binary_logloss: 0.372312
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.363165:  83%|########3 | 5/6 [00:26<00:04,  4.98s/it][I 2024-05-01 21:26:27,240] Trial 41 finished with value: 0.36316548664654064 and parameters: {'feature_fraction': 0.6839999999999999}. Best is trial 38 with value: 0.36316548664654064.
feature_fraction_stage2, val_score: 0.363165:  83%|########3 | 5/6 [00:26<00:04,  4.98s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.363165: 100%|##########| 6/6 [00:29<00:00,  4.44s/it][I 2024-05-01 21:26:30,640] Trial 42 finished with value: 0.37306027041183026 and parameters: {'feature_fraction': 0.748}. Best is trial 38 with value: 0.36316548664654064.
feature_fraction_stage2, val_score: 0.363165: 100%|##########| 6/6 [00:29<00:00,  4.97s/it]


Early stopping, best iteration is:
[47]	valid_0's binary_logloss: 0.37306


regularization_factors, val_score: 0.363165:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:   5%|5         | 1/20 [00:06<02:03,  6.48s/it][I 2024-05-01 21:26:37,128] Trial 43 finished with value: 0.3693160167829919 and parameters: {'lambda_l1': 0.010839648774434362, 'lambda_l2': 1.841325727627302e-08}. Best is trial 43 with value: 0.3693160167829919.
regularization_factors, val_score: 0.363165:   5%|5         | 1/20 [00:06<02:03,  6.48s/it]

Early stopping, best iteration is:
[54]	valid_0's binary_logloss: 0.369316
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  10%|#         | 2/20 [00:10<01:30,  5.03s/it][I 2024-05-01 21:26:41,134] Trial 44 finished with value: 0.3776993048423959 and parameters: {'lambda_l1': 1.8449710008006576e-08, 'lambda_l2': 8.096058854091515}. Best is trial 43 with value: 0.3693160167829919.
regularization_factors, val_score: 0.363165:  10%|#         | 2/20 [00:10<01:30,  5.03s/it]

Early stopping, best iteration is:
[89]	valid_0's binary_logloss: 0.377699
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  15%|#5        | 3/20 [00:16<01:31,  5.38s/it][I 2024-05-01 21:26:46,932] Trial 45 finished with value: 0.3825146969283817 and parameters: {'lambda_l1': 5.222945182730914, 'lambda_l2': 0.0005511762357956978}. Best is trial 43 with value: 0.3693160167829919.
regularization_factors, val_score: 0.363165:  15%|#5        | 3/20 [00:16<01:31,  5.38s/it]

Early stopping, best iteration is:
[151]	valid_0's binary_logloss: 0.382515
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  20%|##        | 4/20 [00:19<01:14,  4.68s/it][I 2024-05-01 21:26:50,531] Trial 46 finished with value: 0.36316548666967746 and parameters: {'lambda_l1': 2.38821495370382e-08, 'lambda_l2': 1.2234306473380587e-08}. Best is trial 46 with value: 0.36316548666967746.
regularization_factors, val_score: 0.363165:  20%|##        | 4/20 [00:19<01:14,  4.68s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  25%|##5       | 5/20 [00:25<01:13,  4.90s/it][I 2024-05-01 21:26:55,826] Trial 47 finished with value: 0.3631654866650527 and parameters: {'lambda_l1': 1.742301640602742e-08, 'lambda_l2': 1.4757341530288764e-08}. Best is trial 47 with value: 0.3631654866650527.
regularization_factors, val_score: 0.363165:  25%|##5       | 5/20 [00:25<01:13,  4.90s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  30%|###       | 6/20 [00:28<01:01,  4.42s/it][I 2024-05-01 21:26:59,330] Trial 48 finished with value: 0.36316548663033577 and parameters: {'lambda_l1': 1.3466481734549487e-08, 'lambda_l2': 1.4328721374059667e-08}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  30%|###       | 6/20 [00:28<01:01,  4.42s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  35%|###5      | 7/20 [00:32<00:53,  4.09s/it][I 2024-05-01 21:27:02,722] Trial 49 finished with value: 0.37054045070773345 and parameters: {'lambda_l1': 3.088163108098451e-06, 'lambda_l2': 2.7879778968249513e-06}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  35%|###5      | 7/20 [00:32<00:53,  4.09s/it]

Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.37054
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  40%|####      | 8/20 [00:35<00:47,  3.97s/it][I 2024-05-01 21:27:06,441] Trial 50 finished with value: 0.37054045078383646 and parameters: {'lambda_l1': 8.658464209873121e-06, 'lambda_l2': 5.7532449843552794e-06}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  40%|####      | 8/20 [00:35<00:47,  3.97s/it]

Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.37054
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  45%|####5     | 9/20 [00:39<00:42,  3.88s/it][I 2024-05-01 21:27:10,138] Trial 51 finished with value: 0.36316548664521475 and parameters: {'lambda_l1': 1.9014991506245228e-08, 'lambda_l2': 1.228912182998111e-08}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  45%|####5     | 9/20 [00:39<00:42,  3.88s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  50%|#####     | 10/20 [00:43<00:38,  3.83s/it][I 2024-05-01 21:27:13,840] Trial 52 finished with value: 0.3631654866579665 and parameters: {'lambda_l1': 2.130936000793227e-08, 'lambda_l2': 1.0208411546562743e-08}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  50%|#####     | 10/20 [00:43<00:38,  3.83s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  55%|#####5    | 11/20 [00:47<00:34,  3.88s/it][I 2024-05-01 21:27:17,910] Trial 53 finished with value: 0.36317380360951507 and parameters: {'lambda_l1': 9.881443065281692e-07, 'lambda_l2': 8.743449696624595e-07}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  55%|#####5    | 11/20 [00:47<00:34,  3.88s/it]

Early stopping, best iteration is:
[56]	valid_0's binary_logloss: 0.363174
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  60%|######    | 12/20 [00:50<00:29,  3.73s/it][I 2024-05-01 21:27:21,222] Trial 54 finished with value: 0.3637709163338463 and parameters: {'lambda_l1': 2.881752239078986e-07, 'lambda_l2': 1.8792756540431085e-07}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  60%|######    | 12/20 [00:50<00:29,  3.73s/it]

Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.363771
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  65%|######5   | 13/20 [00:53<00:24,  3.54s/it][I 2024-05-01 21:27:24,321] Trial 55 finished with value: 0.37120916631743206 and parameters: {'lambda_l1': 0.00020721071716331315, 'lambda_l2': 0.00014298218599046993}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  65%|######5   | 13/20 [00:53<00:24,  3.54s/it]

Early stopping, best iteration is:
[38]	valid_0's binary_logloss: 0.371209
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  70%|#######   | 14/20 [00:57<00:21,  3.53s/it][I 2024-05-01 21:27:27,835] Trial 56 finished with value: 0.36316548668822407 and parameters: {'lambda_l1': 1.057995755643754e-08, 'lambda_l2': 1.3909980688169465e-07}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  70%|#######   | 14/20 [00:57<00:21,  3.53s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  75%|#######5  | 15/20 [01:00<00:17,  3.46s/it][I 2024-05-01 21:27:31,214] Trial 57 finished with value: 0.3657911523095971 and parameters: {'lambda_l1': 5.12545520043369e-07, 'lambda_l2': 0.044035931319284856}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  75%|#######5  | 15/20 [01:00<00:17,  3.46s/it]

Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.365791
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  80%|########  | 16/20 [01:03<00:13,  3.26s/it][I 2024-05-01 21:27:34,010] Trial 58 finished with value: 0.37358486758113685 and parameters: {'lambda_l1': 3.58530228752031e-05, 'lambda_l2': 3.334572516297209e-05}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  80%|########  | 16/20 [01:03<00:13,  3.26s/it]

Early stopping, best iteration is:
[29]	valid_0's binary_logloss: 0.373585
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  85%|########5 | 17/20 [01:07<00:10,  3.46s/it][I 2024-05-01 21:27:37,845] Trial 59 finished with value: 0.37121279787563816 and parameters: {'lambda_l1': 1.984329418685077e-07, 'lambda_l2': 1.6307206573021137e-07}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  85%|########5 | 17/20 [01:07<00:10,  3.46s/it]

Early stopping, best iteration is:
[58]	valid_0's binary_logloss: 0.371213
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  90%|######### | 18/20 [01:10<00:06,  3.37s/it][I 2024-05-01 21:27:41,032] Trial 60 finished with value: 0.37045292764892235 and parameters: {'lambda_l1': 0.007568084108802888, 'lambda_l2': 1.0129981332826581e-08}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  90%|######### | 18/20 [01:10<00:06,  3.37s/it]

Early stopping, best iteration is:
[47]	valid_0's binary_logloss: 0.370453
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165:  95%|#########5| 19/20 [01:14<00:03,  3.66s/it][I 2024-05-01 21:27:45,339] Trial 61 finished with value: 0.3646160826147363 and parameters: {'lambda_l1': 9.591305582762461e-08, 'lambda_l2': 7.830943342805554e-08}. Best is trial 48 with value: 0.36316548663033577.
regularization_factors, val_score: 0.363165:  95%|#########5| 19/20 [01:14<00:03,  3.66s/it]

Early stopping, best iteration is:
[38]	valid_0's binary_logloss: 0.364616
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.363165: 100%|##########| 20/20 [01:18<00:00,  3.60s/it][I 2024-05-01 21:27:48,813] Trial 62 finished with value: 0.3631654866159358 and parameters: {'lambda_l1': 1.3569118215629687e-08, 'lambda_l2': 1.1426930988999945e-08}. Best is trial 62 with value: 0.3631654866159358.
regularization_factors, val_score: 0.363165: 100%|##########| 20/20 [01:18<00:00,  3.91s/it]


Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.363165


min_child_samples, val_score: 0.363165:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.363165:  20%|##        | 1/5 [00:03<00:12,  3.01s/it][I 2024-05-01 21:27:51,833] Trial 63 finished with value: 0.3752495834229246 and parameters: {'min_child_samples': 5}. Best is trial 63 with value: 0.3752495834229246.
min_child_samples, val_score: 0.363165:  20%|##        | 1/5 [00:03<00:12,  3.01s/it]

Early stopping, best iteration is:
[32]	valid_0's binary_logloss: 0.37525
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.363165:  40%|####      | 2/5 [00:05<00:08,  2.93s/it][I 2024-05-01 21:27:54,713] Trial 64 finished with value: 0.3806994493558711 and parameters: {'min_child_samples': 50}. Best is trial 63 with value: 0.3752495834229246.
min_child_samples, val_score: 0.363165:  40%|####      | 2/5 [00:05<00:08,  2.93s/it]

Early stopping, best iteration is:
[81]	valid_0's binary_logloss: 0.380699
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.363165:  60%|######    | 3/5 [00:08<00:05,  2.57s/it][I 2024-05-01 21:27:56,840] Trial 65 finished with value: 0.3920818505283064 and parameters: {'min_child_samples': 100}. Best is trial 63 with value: 0.3752495834229246.
min_child_samples, val_score: 0.363165:  60%|######    | 3/5 [00:08<00:05,  2.57s/it]

Early stopping, best iteration is:
[109]	valid_0's binary_logloss: 0.392082
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.363165:  80%|########  | 4/5 [00:11<00:02,  2.89s/it][I 2024-05-01 21:28:00,224] Trial 66 finished with value: 0.3757703556020652 and parameters: {'min_child_samples': 10}. Best is trial 63 with value: 0.3752495834229246.
min_child_samples, val_score: 0.363165:  80%|########  | 4/5 [00:11<00:02,  2.89s/it]

Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.37577
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.363165: 100%|##########| 5/5 [00:14<00:00,  2.90s/it][I 2024-05-01 21:28:03,136] Trial 67 finished with value: 0.37433125024309605 and parameters: {'min_child_samples': 25}. Best is trial 67 with value: 0.37433125024309605.
min_child_samples, val_score: 0.363165: 100%|##########| 5/5 [00:14<00:00,  2.86s/it]

Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.374331





In [9]:
# save/load trainer
dill_dump("data/model/lgbm_trainer.dill", trainer)
trainer = dill_load("data/model/lgbm_trainer.dill")

In [10]:
pprint(metrics_dict)

{'cls_report': {'0': {'f1-score': 0.9143206854345166,
                      'precision': 0.8914081145584726,
                      'recall': 0.9384422110552764,
                      'support': 796},
                '1': {'f1-score': 0.6174863387978142,
                      'precision': 0.6975308641975309,
                      'recall': 0.553921568627451,
                      'support': 204},
                'accuracy': 0.86,
                'macro avg': {'f1-score': 0.7659035121161655,
                              'precision': 0.7944694893780018,
                              'recall': 0.7461818898413637,
                              'support': 1000},
                'weighted avg': {'f1-score': 0.8537664787206294,
                                 'precision': 0.8518571554848405,
                                 'recall': 0.86,
                                 'support': 1000}},
 'cm': [[747, 49], [91, 113]],
 'prec_rec_curve': [[0.204, 0.6975308641975309, 1.0],
                 

In [11]:
pprint(trainer.compute_metrics(df_test, with_dynamic_binary_threshold=True))

{'cls_report': {'0': {'f1-score': 0.9143920595533498,
                      'precision': 0.9031862745098039,
                      'recall': 0.9258793969849246,
                      'support': 796},
                '1': {'f1-score': 0.6443298969072165,
                      'precision': 0.6793478260869565,
                      'recall': 0.6127450980392157,
                      'support': 204},
                'accuracy': 0.862,
                'macro avg': {'f1-score': 0.7793609782302832,
                              'precision': 0.7912670502983803,
                              'recall': 0.7693122475120702,
                              'support': 1000},
                'weighted avg': {'f1-score': 0.8592993783735386,
                                 'precision': 0.8575232310315432,
                                 'recall': 0.862,
                                 'support': 1000}},
 'cm': [[737, 59], [79, 125]],
 'prec_rec_curve': [[0.204, 0.6793478260869565, 1.0],
              