# Example usage notebook
Objects and functions in this notebook are listed with all paramaters to ilustrate their capabillities. Most of the paramaters have default values in the implementation

## Imports

In [1]:
import os
import sys

sys.path.append(os.getcwd())
os.chdir("..")

import pandas as pd
import mlflow

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 300)

In [2]:
import os
import json
from typing import Literal
import pandas as pd

from churn_pred.preprocessing.preprocess import PreprocessData
from churn_pred.training.trainer import Trainer
from churn_pred.training.optuna_optimizer import LGBOptunaOptimizer
from churn_pred.training.utils import flatten_dict, get_or_create_experiment

import dill
import numpy as np
from churn_pred.utils import dill_dump, dill_load
from sklearn.model_selection import train_test_split

from pprint import pprint

## Train a testing model 

In [3]:
# 1. get the data
df_pd = pd.read_parquet("data/dataset_auxiliary_features_cleaned.parquet")
df_pd.head()

Unnamed: 0,CustomerId,CreditScore,Country,Gender,Age,Tenure,Balance (EUR),NumberOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited,CustomerFeedback_sentiment3,CustomerFeedback_sentiment5,Surname_Country,Surname_Country_region,Surname_Country_subregion,Country_region,Country_subregion,is_native,Country_hemisphere,Country_gdp_per_capita,Country_IncomeGroup,Surname_Country_gdp_per_capita,Surname_Country_IncomeGroup,working_class,stage_of_life,generation
0,15787619,844,France,Male,18,2,160980.03,1,0,0,145936.28,0,neutral,4 stars,Taiwan,Asia,Eastern Asia,Europe,Western Europe,0,northern,57594.03402,High income,32756.0,,working_age,teen,gen_z
1,15770309,656,France,Male,18,10,151762.74,1,0,1,127014.32,0,neutral,1 star,United States,Americas,Northern America,Europe,Western Europe,0,northern,57594.03402,High income,76329.58227,High income,working_age,teen,gen_z
2,15569178,570,France,Female,18,4,82767.42,1,1,0,71811.9,0,neutral,2 stars,Russian Federation,Europe,Eastern Europe,Europe,Western Europe,0,northern,57594.03402,High income,34637.76172,Upper middle income,working_age,teen,gen_z
3,15795519,716,Germany,Female,18,3,128743.8,1,0,0,197322.13,0,neutral,2 stars,Russian Federation,Europe,Eastern Europe,Europe,Western Europe,0,northern,66616.02225,High income,34637.76172,Upper middle income,working_age,teen,gen_z
4,15621893,727,France,Male,18,4,133550.67,1,1,1,46941.41,0,positive,1 star,Italy,Europe,Southern Europe,Europe,Western Europe,0,northern,57594.03402,High income,55442.07843,High income,working_age,teen,gen_z


In [4]:
target_col = "Exited"
id_cols = ["CustomerId"]
cat_cols = [
    "Country",
    "Gender",
    "HasCreditCard",
    "IsActiveMember",
    "CustomerFeedback_sentiment3",
    "CustomerFeedback_sentiment5",
    "Surname_Country",
    "Surname_Country_region",
    "Surname_Country_subregion",
    "Country_region",
    "Country_subregion",
    "is_native",
    "Country_hemisphere",
    "Country_IncomeGroup",
    "Surname_Country_IncomeGroup",
    "working_class",
    "stage_of_life",
    "generation",
]
cont_cols = df_pd.drop(
    columns=id_cols + cat_cols + [target_col]
).columns.values.tolist()

In [5]:
df_pd[cat_cols] = df_pd[cat_cols].astype(str)

In [6]:
valid_size = 0.2
test_size = 0.5
random_state = 1
df_train, df_valid = train_test_split(
    df_pd, test_size=valid_size, stratify=df_pd[target_col], random_state=random_state
)
df_valid, df_test = train_test_split(
    df_valid,
    test_size=test_size,
    stratify=df_valid[target_col],
    random_state=random_state,
)

In [7]:
prepare_data = PreprocessData(
    id_cols=id_cols,
    target_col=target_col,
    cat_cols=cat_cols,
    cont_cols=cont_cols,
)
# this should be fitted only on training data
_ = prepare_data.fit(df=df_pd)

In [8]:
optimizer = LGBOptunaOptimizer(
    objective="binary",
    n_class=2,
)

trainer = Trainer(
    cat_cols=prepare_data.cat_cols,
    target_col=prepare_data.target_col,
    id_cols=id_cols,
    objective="binary",
    n_class=2,
    optimizer=optimizer,
    preprocessors=[prepare_data],
)

metrics_dict = trainer.fit(
    df_train=df_train,
    df_valid=df_valid,
    df_test=df_test,
)

[I 2024-05-17 10:13:50,499] A new study created in memory with name: no-name-fa589f64-d8f0-4cbb-b929-f47ba65e30cf
feature_fraction, val_score: inf:   0%|          | 0/7 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.384886:  14%|#4        | 1/7 [00:03<00:18,  3.09s/it][I 2024-05-17 10:13:53,604] Trial 0 finished with value: 0.3848856550710792 and parameters: {'feature_fraction': 0.5}. Best is trial 0 with value: 0.3848856550710792.
feature_fraction, val_score: 0.384886:  14%|#4        | 1/7 [00:03<00:18,  3.09s/it]

Early stopping, best iteration is:
[256]	valid_0's binary_logloss: 0.384886
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.384886:  29%|##8       | 2/7 [00:09<00:24,  4.93s/it][I 2024-05-17 10:13:59,819] Trial 1 finished with value: 0.38876581289974715 and parameters: {'feature_fraction': 0.4}. Best is trial 0 with value: 0.3848856550710792.
feature_fraction, val_score: 0.384886:  29%|##8       | 2/7 [00:09<00:24,  4.93s/it]

Early stopping, best iteration is:
[203]	valid_0's binary_logloss: 0.388766
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.384886:  43%|####2     | 3/7 [00:10<00:13,  3.45s/it][I 2024-05-17 10:14:01,500] Trial 2 finished with value: 0.3871060537924552 and parameters: {'feature_fraction': 0.6}. Best is trial 0 with value: 0.3848856550710792.
feature_fraction, val_score: 0.384886:  43%|####2     | 3/7 [00:10<00:13,  3.45s/it]

Early stopping, best iteration is:
[136]	valid_0's binary_logloss: 0.387106
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.376373:  57%|#####7    | 4/7 [00:13<00:08,  2.95s/it][I 2024-05-17 10:14:03,700] Trial 3 finished with value: 0.37637252943396876 and parameters: {'feature_fraction': 0.8}. Best is trial 3 with value: 0.37637252943396876.
feature_fraction, val_score: 0.376373:  57%|#####7    | 4/7 [00:13<00:08,  2.95s/it]

Early stopping, best iteration is:
[182]	valid_0's binary_logloss: 0.376373
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.376373:  71%|#######1  | 5/7 [00:18<00:07,  3.70s/it][I 2024-05-17 10:14:08,731] Trial 4 finished with value: 0.38580891587959387 and parameters: {'feature_fraction': 1.0}. Best is trial 3 with value: 0.37637252943396876.
feature_fraction, val_score: 0.376373:  71%|#######1  | 5/7 [00:18<00:07,  3.70s/it]

Early stopping, best iteration is:
[196]	valid_0's binary_logloss: 0.385809
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.376373:  86%|########5 | 6/7 [00:20<00:03,  3.13s/it][I 2024-05-17 10:14:10,735] Trial 5 finished with value: 0.38034803511400994 and parameters: {'feature_fraction': 0.7}. Best is trial 3 with value: 0.37637252943396876.
feature_fraction, val_score: 0.376373:  86%|########5 | 6/7 [00:20<00:03,  3.13s/it]

Early stopping, best iteration is:
[178]	valid_0's binary_logloss: 0.380348
Training until validation scores don't improve for 50 rounds


feature_fraction, val_score: 0.376373: 100%|##########| 7/7 [00:25<00:00,  3.83s/it][I 2024-05-17 10:14:16,006] Trial 6 finished with value: 0.38397832605349824 and parameters: {'feature_fraction': 0.8999999999999999}. Best is trial 3 with value: 0.37637252943396876.
feature_fraction, val_score: 0.376373: 100%|##########| 7/7 [00:25<00:00,  3.64s/it]


Early stopping, best iteration is:
[166]	valid_0's binary_logloss: 0.383978


num_leaves, val_score: 0.376373:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.376373:   5%|5         | 1/20 [00:02<00:51,  2.69s/it][I 2024-05-17 10:14:18,709] Trial 7 finished with value: 0.38099924239489236 and parameters: {'num_leaves': 176}. Best is trial 7 with value: 0.38099924239489236.
num_leaves, val_score: 0.376373:   5%|5         | 1/20 [00:02<00:51,  2.69s/it]

Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.380999
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  10%|#         | 2/20 [00:06<01:05,  3.64s/it][I 2024-05-17 10:14:23,016] Trial 8 finished with value: 0.37385793384339394 and parameters: {'num_leaves': 222}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  10%|#         | 2/20 [00:07<01:05,  3.64s/it]

Early stopping, best iteration is:
[40]	valid_0's binary_logloss: 0.373858
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  15%|#5        | 3/20 [00:13<01:21,  4.81s/it][I 2024-05-17 10:14:29,219] Trial 9 finished with value: 0.38119071611784033 and parameters: {'num_leaves': 93}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  15%|#5        | 3/20 [00:13<01:21,  4.81s/it]

Early stopping, best iteration is:
[74]	valid_0's binary_logloss: 0.381191
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  20%|##        | 4/20 [00:22<01:46,  6.66s/it][I 2024-05-17 10:14:38,717] Trial 10 finished with value: 0.3773282550132434 and parameters: {'num_leaves': 249}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  20%|##        | 4/20 [00:22<01:46,  6.66s/it]

Early stopping, best iteration is:
[40]	valid_0's binary_logloss: 0.377328
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  25%|##5       | 5/20 [00:24<01:13,  4.88s/it][I 2024-05-17 10:14:40,431] Trial 11 finished with value: 0.39044042093457787 and parameters: {'num_leaves': 19}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  25%|##5       | 5/20 [00:24<01:13,  4.88s/it]

Early stopping, best iteration is:
[244]	valid_0's binary_logloss: 0.39044
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  30%|###       | 6/20 [00:28<01:02,  4.44s/it][I 2024-05-17 10:14:44,030] Trial 12 finished with value: 0.38026069867454104 and parameters: {'num_leaves': 251}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  30%|###       | 6/20 [00:28<01:02,  4.44s/it]

Early stopping, best iteration is:
[28]	valid_0's binary_logloss: 0.380261
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  35%|###5      | 7/20 [00:31<00:52,  4.07s/it][I 2024-05-17 10:14:47,328] Trial 13 finished with value: 0.3753274511816769 and parameters: {'num_leaves': 165}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  35%|###5      | 7/20 [00:31<00:52,  4.07s/it]

Early stopping, best iteration is:
[48]	valid_0's binary_logloss: 0.375327
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.373858:  40%|####      | 8/20 [00:38<00:59,  4.99s/it][I 2024-05-17 10:14:54,309] Trial 14 finished with value: 0.3739214777794338 and parameters: {'num_leaves': 162}. Best is trial 8 with value: 0.37385793384339394.
num_leaves, val_score: 0.373858:  40%|####      | 8/20 [00:38<00:59,  4.99s/it]

Early stopping, best iteration is:
[52]	valid_0's binary_logloss: 0.373921
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.371332:  45%|####5     | 9/20 [00:41<00:49,  4.54s/it][I 2024-05-17 10:14:57,829] Trial 15 finished with value: 0.3713317512304886 and parameters: {'num_leaves': 191}. Best is trial 15 with value: 0.3713317512304886.
num_leaves, val_score: 0.371332:  45%|####5     | 9/20 [00:41<00:49,  4.54s/it]

Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.371332
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.371332:  50%|#####     | 10/20 [00:45<00:43,  4.31s/it][I 2024-05-17 10:15:01,634] Trial 16 finished with value: 0.37619505728984265 and parameters: {'num_leaves': 215}. Best is trial 15 with value: 0.3713317512304886.
num_leaves, val_score: 0.371332:  50%|#####     | 10/20 [00:45<00:43,  4.31s/it]

Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.376195
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.371332:  55%|#####5    | 11/20 [00:48<00:35,  3.94s/it][I 2024-05-17 10:15:04,721] Trial 17 finished with value: 0.3714154578813961 and parameters: {'num_leaves': 105}. Best is trial 15 with value: 0.3713317512304886.
num_leaves, val_score: 0.371332:  55%|#####5    | 11/20 [00:48<00:35,  3.94s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.371415
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.371332:  60%|######    | 12/20 [00:51<00:27,  3.47s/it][I 2024-05-17 10:15:07,131] Trial 18 finished with value: 0.38587964344293435 and parameters: {'num_leaves': 96}. Best is trial 15 with value: 0.3713317512304886.
num_leaves, val_score: 0.371332:  60%|######    | 12/20 [00:51<00:27,  3.47s/it]

Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.38588
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  65%|######5   | 13/20 [00:54<00:24,  3.45s/it][I 2024-05-17 10:15:10,519] Trial 19 finished with value: 0.36740957414352104 and parameters: {'num_leaves': 109}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  65%|######5   | 13/20 [00:54<00:24,  3.45s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.36741
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  70%|#######   | 14/20 [00:56<00:18,  3.10s/it][I 2024-05-17 10:15:12,812] Trial 20 finished with value: 0.38156230494038623 and parameters: {'num_leaves': 54}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  70%|#######   | 14/20 [00:56<00:18,  3.10s/it]

Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.381562
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  75%|#######5  | 15/20 [00:59<00:15,  3.01s/it][I 2024-05-17 10:15:15,610] Trial 21 finished with value: 0.3792670191593437 and parameters: {'num_leaves': 116}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  75%|#######5  | 15/20 [00:59<00:15,  3.01s/it]

Early stopping, best iteration is:
[41]	valid_0's binary_logloss: 0.379267
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  80%|########  | 16/20 [01:02<00:11,  2.98s/it][I 2024-05-17 10:15:18,526] Trial 22 finished with value: 0.37058611292960486 and parameters: {'num_leaves': 132}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  80%|########  | 16/20 [01:02<00:11,  2.98s/it]

Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.370586
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  85%|########5 | 17/20 [01:05<00:08,  2.95s/it][I 2024-05-17 10:15:21,421] Trial 23 finished with value: 0.372527054012802 and parameters: {'num_leaves': 139}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  85%|########5 | 17/20 [01:05<00:08,  2.95s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.372527
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  90%|######### | 18/20 [01:08<00:05,  2.85s/it][I 2024-05-17 10:15:24,028] Trial 24 finished with value: 0.38036745420691503 and parameters: {'num_leaves': 64}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  90%|######### | 18/20 [01:08<00:05,  2.85s/it]

Early stopping, best iteration is:
[105]	valid_0's binary_logloss: 0.380367
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410:  95%|#########5| 19/20 [01:11<00:02,  2.96s/it][I 2024-05-17 10:15:27,232] Trial 25 finished with value: 0.3818090274252709 and parameters: {'num_leaves': 194}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410:  95%|#########5| 19/20 [01:11<00:02,  2.96s/it]

Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.381809
Training until validation scores don't improve for 50 rounds


num_leaves, val_score: 0.367410: 100%|##########| 20/20 [01:14<00:00,  3.15s/it][I 2024-05-17 10:15:30,831] Trial 26 finished with value: 0.37252705401280195 and parameters: {'num_leaves': 139}. Best is trial 19 with value: 0.36740957414352104.
num_leaves, val_score: 0.367410: 100%|##########| 20/20 [01:14<00:00,  3.74s/it]


Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.372527


bagging, val_score: 0.367410:   0%|          | 0/10 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  10%|#         | 1/10 [00:02<00:19,  2.20s/it][I 2024-05-17 10:15:33,110] Trial 27 finished with value: 0.3954670757458217 and parameters: {'bagging_fraction': 0.6247871962774826, 'bagging_freq': 1}. Best is trial 27 with value: 0.3954670757458217.
bagging, val_score: 0.367410:  10%|#         | 1/10 [00:02<00:19,  2.20s/it]

Early stopping, best iteration is:
[26]	valid_0's binary_logloss: 0.395467
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  20%|##        | 2/10 [00:04<00:19,  2.43s/it][I 2024-05-17 10:15:35,712] Trial 28 finished with value: 0.383350674150879 and parameters: {'bagging_fraction': 0.923285663502434, 'bagging_freq': 7}. Best is trial 28 with value: 0.383350674150879.
bagging, val_score: 0.367410:  20%|##        | 2/10 [00:04<00:19,  2.43s/it]

Early stopping, best iteration is:
[53]	valid_0's binary_logloss: 0.383351
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  30%|###       | 3/10 [00:07<00:17,  2.47s/it][I 2024-05-17 10:15:38,221] Trial 29 finished with value: 0.37923553921285863 and parameters: {'bagging_fraction': 0.420843204313341, 'bagging_freq': 4}. Best is trial 29 with value: 0.37923553921285863.
bagging, val_score: 0.367410:  30%|###       | 3/10 [00:07<00:17,  2.47s/it]

Early stopping, best iteration is:
[48]	valid_0's binary_logloss: 0.379236
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  40%|####      | 4/10 [00:10<00:16,  2.75s/it][I 2024-05-17 10:15:41,400] Trial 30 finished with value: 0.37765693537413436 and parameters: {'bagging_fraction': 0.9878318604557115, 'bagging_freq': 7}. Best is trial 30 with value: 0.37765693537413436.
bagging, val_score: 0.367410:  40%|####      | 4/10 [00:10<00:16,  2.75s/it]

Early stopping, best iteration is:
[54]	valid_0's binary_logloss: 0.377657
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  50%|#####     | 5/10 [00:13<00:13,  2.78s/it][I 2024-05-17 10:15:44,233] Trial 31 finished with value: 0.37280801864840957 and parameters: {'bagging_fraction': 0.7481148208250323, 'bagging_freq': 2}. Best is trial 31 with value: 0.37280801864840957.
bagging, val_score: 0.367410:  50%|#####     | 5/10 [00:13<00:13,  2.78s/it]

Early stopping, best iteration is:
[49]	valid_0's binary_logloss: 0.372808
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  60%|######    | 6/10 [00:14<00:09,  2.38s/it][I 2024-05-17 10:15:45,825] Trial 32 finished with value: 0.3842362749848752 and parameters: {'bagging_fraction': 0.4391676469093167, 'bagging_freq': 4}. Best is trial 31 with value: 0.37280801864840957.
bagging, val_score: 0.367410:  60%|######    | 6/10 [00:14<00:09,  2.38s/it]

Early stopping, best iteration is:
[19]	valid_0's binary_logloss: 0.384236
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  70%|#######   | 7/10 [00:17<00:07,  2.38s/it][I 2024-05-17 10:15:48,226] Trial 33 finished with value: 0.38045738974897325 and parameters: {'bagging_fraction': 0.7379269104820864, 'bagging_freq': 5}. Best is trial 31 with value: 0.37280801864840957.
bagging, val_score: 0.367410:  70%|#######   | 7/10 [00:17<00:07,  2.38s/it]

Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.380457
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  80%|########  | 8/10 [00:20<00:05,  2.55s/it][I 2024-05-17 10:15:51,131] Trial 34 finished with value: 0.39307944210588885 and parameters: {'bagging_fraction': 0.5892054794337009, 'bagging_freq': 2}. Best is trial 31 with value: 0.37280801864840957.
bagging, val_score: 0.367410:  80%|########  | 8/10 [00:20<00:05,  2.55s/it]

Early stopping, best iteration is:
[70]	valid_0's binary_logloss: 0.393079
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410:  90%|######### | 9/10 [00:23<00:02,  2.75s/it][I 2024-05-17 10:15:54,323] Trial 35 finished with value: 0.37565759644445834 and parameters: {'bagging_fraction': 0.8753010476839909, 'bagging_freq': 5}. Best is trial 31 with value: 0.37280801864840957.
bagging, val_score: 0.367410:  90%|######### | 9/10 [00:23<00:02,  2.75s/it]

Early stopping, best iteration is:
[61]	valid_0's binary_logloss: 0.375658
Training until validation scores don't improve for 50 rounds


bagging, val_score: 0.367410: 100%|##########| 10/10 [00:25<00:00,  2.64s/it][I 2024-05-17 10:15:56,727] Trial 36 finished with value: 0.38695330483645435 and parameters: {'bagging_fraction': 0.5402623279814973, 'bagging_freq': 3}. Best is trial 31 with value: 0.37280801864840957.
bagging, val_score: 0.367410: 100%|##########| 10/10 [00:25<00:00,  2.58s/it]


Early stopping, best iteration is:
[54]	valid_0's binary_logloss: 0.386953


feature_fraction_stage2, val_score: 0.367410:   0%|          | 0/6 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.367410:  17%|#6        | 1/6 [00:02<00:14,  2.91s/it][I 2024-05-17 10:15:59,715] Trial 37 finished with value: 0.3750569994664222 and parameters: {'feature_fraction': 0.88}. Best is trial 37 with value: 0.3750569994664222.
feature_fraction_stage2, val_score: 0.367410:  17%|#6        | 1/6 [00:02<00:14,  2.91s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.375057
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.367410:  33%|###3      | 2/6 [00:07<00:15,  3.84s/it][I 2024-05-17 10:16:04,209] Trial 38 finished with value: 0.36740957414352116 and parameters: {'feature_fraction': 0.784}. Best is trial 38 with value: 0.36740957414352116.
feature_fraction_stage2, val_score: 0.367410:  33%|###3      | 2/6 [00:07<00:15,  3.84s/it]

Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.36741
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.367410:  50%|#####     | 3/6 [00:10<00:10,  3.42s/it][I 2024-05-17 10:16:07,121] Trial 39 finished with value: 0.37661326532050776 and parameters: {'feature_fraction': 0.7520000000000001}. Best is trial 38 with value: 0.36740957414352116.
feature_fraction_stage2, val_score: 0.367410:  50%|#####     | 3/6 [00:10<00:10,  3.42s/it]

Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.376613
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.367410:  67%|######6   | 4/6 [00:18<00:10,  5.27s/it][I 2024-05-17 10:16:15,229] Trial 40 finished with value: 0.3766132653205076 and parameters: {'feature_fraction': 0.7200000000000001}. Best is trial 38 with value: 0.36740957414352116.
feature_fraction_stage2, val_score: 0.367410:  67%|######6   | 4/6 [00:18<00:10,  5.27s/it]

Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.376613
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.367410:  83%|########3 | 5/6 [00:24<00:05,  5.57s/it][I 2024-05-17 10:16:21,324] Trial 41 finished with value: 0.37205822085312645 and parameters: {'feature_fraction': 0.8160000000000001}. Best is trial 38 with value: 0.36740957414352116.
feature_fraction_stage2, val_score: 0.367410:  83%|########3 | 5/6 [00:24<00:05,  5.57s/it]

Early stopping, best iteration is:
[48]	valid_0's binary_logloss: 0.372058
Training until validation scores don't improve for 50 rounds


feature_fraction_stage2, val_score: 0.367410: 100%|##########| 6/6 [00:33<00:00,  6.63s/it][I 2024-05-17 10:16:30,027] Trial 42 finished with value: 0.375056999466422 and parameters: {'feature_fraction': 0.8480000000000001}. Best is trial 38 with value: 0.36740957414352116.
feature_fraction_stage2, val_score: 0.367410: 100%|##########| 6/6 [00:33<00:00,  5.54s/it]


Early stopping, best iteration is:
[75]	valid_0's binary_logloss: 0.375057


regularization_factors, val_score: 0.367410:   0%|          | 0/20 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:   5%|5         | 1/20 [00:04<01:19,  4.20s/it][I 2024-05-17 10:16:34,305] Trial 43 finished with value: 0.4000754351948232 and parameters: {'lambda_l1': 9.735476402093866, 'lambda_l2': 1.1885650284878641e-05}. Best is trial 43 with value: 0.4000754351948232.
regularization_factors, val_score: 0.367410:   5%|5         | 1/20 [00:04<01:19,  4.20s/it]

Early stopping, best iteration is:
[175]	valid_0's binary_logloss: 0.400075
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  10%|#         | 2/20 [00:06<00:59,  3.33s/it][I 2024-05-17 10:16:37,029] Trial 44 finished with value: 0.3842358602524559 and parameters: {'lambda_l1': 1.3624272314789979e-08, 'lambda_l2': 4.53719639040893}. Best is trial 44 with value: 0.3842358602524559.
regularization_factors, val_score: 0.367410:  10%|#         | 2/20 [00:06<00:59,  3.33s/it]

Early stopping, best iteration is:
[63]	valid_0's binary_logloss: 0.384236
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  15%|#5        | 3/20 [00:09<00:51,  3.04s/it][I 2024-05-17 10:16:39,718] Trial 45 finished with value: 0.3768392688529174 and parameters: {'lambda_l1': 0.0006052551309667524, 'lambda_l2': 1.0805462176025796e-08}. Best is trial 45 with value: 0.3768392688529174.
regularization_factors, val_score: 0.367410:  15%|#5        | 3/20 [00:09<00:51,  3.04s/it]

Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.376839
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  20%|##        | 4/20 [00:13<00:52,  3.26s/it][I 2024-05-17 10:16:43,314] Trial 46 finished with value: 0.38300850920886864 and parameters: {'lambda_l1': 1.8958239622988977e-08, 'lambda_l2': 6.020371560207324}. Best is trial 45 with value: 0.3768392688529174.
regularization_factors, val_score: 0.367410:  20%|##        | 4/20 [00:13<00:52,  3.26s/it]

Early stopping, best iteration is:
[102]	valid_0's binary_logloss: 0.383009
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  25%|##5       | 5/20 [00:15<00:45,  3.03s/it][I 2024-05-17 10:16:45,926] Trial 47 finished with value: 0.3979727901107469 and parameters: {'lambda_l1': 9.028443418520673, 'lambda_l2': 0.0008471787212032551}. Best is trial 45 with value: 0.3768392688529174.
regularization_factors, val_score: 0.367410:  25%|##5       | 5/20 [00:15<00:45,  3.03s/it]

Early stopping, best iteration is:
[176]	valid_0's binary_logloss: 0.397973
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  30%|###       | 6/20 [00:18<00:38,  2.78s/it][I 2024-05-17 10:16:48,219] Trial 48 finished with value: 0.38142112717658005 and parameters: {'lambda_l1': 0.0002499516385751432, 'lambda_l2': 0.0010381660654697966}. Best is trial 45 with value: 0.3768392688529174.
regularization_factors, val_score: 0.367410:  30%|###       | 6/20 [00:18<00:38,  2.78s/it]

Early stopping, best iteration is:
[43]	valid_0's binary_logloss: 0.381421
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  35%|###5      | 7/20 [00:20<00:36,  2.79s/it][I 2024-05-17 10:16:51,026] Trial 49 finished with value: 0.3827689612973024 and parameters: {'lambda_l1': 0.00010166999713623285, 'lambda_l2': 1.606135170483702e-08}. Best is trial 45 with value: 0.3768392688529174.
regularization_factors, val_score: 0.367410:  35%|###5      | 7/20 [00:20<00:36,  2.79s/it]

Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.382769
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  40%|####      | 8/20 [00:23<00:34,  2.85s/it][I 2024-05-17 10:16:54,004] Trial 50 finished with value: 0.3717680065875255 and parameters: {'lambda_l1': 0.05452714002013461, 'lambda_l2': 4.088790430612965e-06}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  40%|####      | 8/20 [00:23<00:34,  2.85s/it]

Early stopping, best iteration is:
[72]	valid_0's binary_logloss: 0.371768
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  45%|####5     | 9/20 [00:26<00:29,  2.72s/it][I 2024-05-17 10:16:56,427] Trial 51 finished with value: 0.3761342067809677 and parameters: {'lambda_l1': 0.0369066143307944, 'lambda_l2': 3.2663806162439897e-06}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  45%|####5     | 9/20 [00:26<00:29,  2.72s/it]

Early stopping, best iteration is:
[54]	valid_0's binary_logloss: 0.376134
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  50%|#####     | 10/20 [00:29<00:27,  2.74s/it][I 2024-05-17 10:16:59,219] Trial 52 finished with value: 0.37432924731676614 and parameters: {'lambda_l1': 0.054816809029998775, 'lambda_l2': 0.024999499348712872}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  50%|#####     | 10/20 [00:29<00:27,  2.74s/it]

Early stopping, best iteration is:
[70]	valid_0's binary_logloss: 0.374329
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  55%|#####5    | 11/20 [00:32<00:25,  2.79s/it][I 2024-05-17 10:17:02,121] Trial 53 finished with value: 0.3802377964900833 and parameters: {'lambda_l1': 2.316914347130571e-06, 'lambda_l2': 5.795374367862176e-06}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  55%|#####5    | 11/20 [00:32<00:25,  2.79s/it]

Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.380238
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  60%|######    | 12/20 [00:34<00:21,  2.73s/it][I 2024-05-17 10:17:04,708] Trial 54 finished with value: 0.3824576314527321 and parameters: {'lambda_l1': 0.06822102509516187, 'lambda_l2': 8.792255957217181e-07}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  60%|######    | 12/20 [00:34<00:21,  2.73s/it]

Early stopping, best iteration is:
[59]	valid_0's binary_logloss: 0.382458
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  65%|######5   | 13/20 [00:37<00:19,  2.78s/it][I 2024-05-17 10:17:07,622] Trial 55 finished with value: 0.3746797506374062 and parameters: {'lambda_l1': 0.004203813235193917, 'lambda_l2': 0.0746628734826108}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  65%|######5   | 13/20 [00:37<00:19,  2.78s/it]

Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.37468
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  70%|#######   | 14/20 [00:40<00:16,  2.79s/it][I 2024-05-17 10:17:10,426] Trial 56 finished with value: 0.380237800076126 and parameters: {'lambda_l1': 3.697476359113887e-06, 'lambda_l2': 2.228738169703633e-07}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  70%|#######   | 14/20 [00:40<00:16,  2.79s/it]

Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.380238
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  75%|#######5  | 15/20 [00:43<00:14,  2.88s/it][I 2024-05-17 10:17:13,525] Trial 57 finished with value: 0.3819070958826241 and parameters: {'lambda_l1': 0.810771895442005, 'lambda_l2': 6.377701924975795e-05}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  75%|#######5  | 15/20 [00:43<00:14,  2.88s/it]

Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.381907
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  80%|########  | 16/20 [00:46<00:11,  2.95s/it][I 2024-05-17 10:17:16,619] Trial 58 finished with value: 0.3722284134939273 and parameters: {'lambda_l1': 7.86173966634575e-06, 'lambda_l2': 0.009668553543780791}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  80%|########  | 16/20 [00:46<00:11,  2.95s/it]

Early stopping, best iteration is:
[80]	valid_0's binary_logloss: 0.372228
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  85%|########5 | 17/20 [00:48<00:08,  2.73s/it][I 2024-05-17 10:17:18,897] Trial 59 finished with value: 0.3728074176393718 and parameters: {'lambda_l1': 0.6306947740321641, 'lambda_l2': 1.013819292434261e-07}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  85%|########5 | 17/20 [00:48<00:08,  2.73s/it]

Early stopping, best iteration is:
[56]	valid_0's binary_logloss: 0.372807
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  90%|######### | 18/20 [00:50<00:05,  2.57s/it][I 2024-05-17 10:17:21,098] Trial 60 finished with value: 0.383089180978512 and parameters: {'lambda_l1': 0.005507471755818934, 'lambda_l2': 5.575100781579365e-05}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  90%|######### | 18/20 [00:50<00:05,  2.57s/it]

Early stopping, best iteration is:
[39]	valid_0's binary_logloss: 0.383089
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410:  95%|#########5| 19/20 [00:53<00:02,  2.69s/it][I 2024-05-17 10:17:24,001] Trial 61 finished with value: 0.37251331352332 and parameters: {'lambda_l1': 2.1575640685639232e-07, 'lambda_l2': 0.5104002702883277}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410:  95%|#########5| 19/20 [00:53<00:02,  2.69s/it]

Early stopping, best iteration is:
[65]	valid_0's binary_logloss: 0.372513
Training until validation scores don't improve for 50 rounds


regularization_factors, val_score: 0.367410: 100%|##########| 20/20 [00:56<00:00,  2.64s/it][I 2024-05-17 10:17:26,515] Trial 62 finished with value: 0.3751039420293567 and parameters: {'lambda_l1': 0.8971910981660706, 'lambda_l2': 0.00011633445362150633}. Best is trial 50 with value: 0.3717680065875255.
regularization_factors, val_score: 0.367410: 100%|##########| 20/20 [00:56<00:00,  2.82s/it]


Early stopping, best iteration is:
[66]	valid_0's binary_logloss: 0.375104


min_child_samples, val_score: 0.367410:   0%|          | 0/5 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.367410:  20%|##        | 1/5 [00:07<00:31,  7.91s/it][I 2024-05-17 10:17:34,430] Trial 63 finished with value: 0.37088802049847946 and parameters: {'min_child_samples': 5}. Best is trial 63 with value: 0.37088802049847946.
min_child_samples, val_score: 0.367410:  20%|##        | 1/5 [00:07<00:31,  7.91s/it]

Early stopping, best iteration is:
[54]	valid_0's binary_logloss: 0.370888
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.367410:  40%|####      | 2/5 [00:10<00:14,  4.77s/it][I 2024-05-17 10:17:37,008] Trial 64 finished with value: 0.3893329770050149 and parameters: {'min_child_samples': 50}. Best is trial 63 with value: 0.37088802049847946.
min_child_samples, val_score: 0.367410:  40%|####      | 2/5 [00:10<00:14,  4.77s/it]

Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.389333
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.367410:  60%|######    | 3/5 [00:13<00:07,  3.88s/it][I 2024-05-17 10:17:39,898] Trial 65 finished with value: 0.3788845205095279 and parameters: {'min_child_samples': 10}. Best is trial 63 with value: 0.37088802049847946.
min_child_samples, val_score: 0.367410:  60%|######    | 3/5 [00:13<00:07,  3.88s/it]

Early stopping, best iteration is:
[68]	valid_0's binary_logloss: 0.378885
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.367410:  80%|########  | 4/5 [00:15<00:03,  3.21s/it][I 2024-05-17 10:17:42,014] Trial 66 finished with value: 0.39615198116336536 and parameters: {'min_child_samples': 100}. Best is trial 63 with value: 0.37088802049847946.
min_child_samples, val_score: 0.367410:  80%|########  | 4/5 [00:15<00:03,  3.21s/it]

Early stopping, best iteration is:
[97]	valid_0's binary_logloss: 0.396152
Training until validation scores don't improve for 50 rounds


min_child_samples, val_score: 0.367410: 100%|##########| 5/5 [00:17<00:00,  2.92s/it][I 2024-05-17 10:17:44,414] Trial 67 finished with value: 0.3804781158926489 and parameters: {'min_child_samples': 25}. Best is trial 63 with value: 0.37088802049847946.
min_child_samples, val_score: 0.367410: 100%|##########| 5/5 [00:17<00:00,  3.58s/it]

Early stopping, best iteration is:
[55]	valid_0's binary_logloss: 0.380478





In [9]:
# save/load trainer
dill_dump("lgbm_trainer.dill", trainer)
trainer = dill_load("lgbm_trainer.dill")

In [10]:
pprint(metrics_dict)

{'cls_report': {'0': {'f1-score': 0.9218559218559219,
                      'precision': 0.8966745843230404,
                      'recall': 0.9484924623115578,
                      'support': 796},
                '1': {'f1-score': 0.6464088397790054,
                      'precision': 0.740506329113924,
                      'recall': 0.5735294117647058,
                      'support': 204},
                'accuracy': 0.872,
                'macro avg': {'f1-score': 0.7841323808174636,
                              'precision': 0.8185904567184822,
                              'recall': 0.7610109370381318,
                              'support': 1000},
                'weighted avg': {'f1-score': 0.8656647171122308,
                                 'precision': 0.8648162602603806,
                                 'recall': 0.872,
                                 'support': 1000}},
 'cm': [[755, 41], [87, 117]],
 'prec_rec_curve': [[0.204, 0.740506329113924, 1.0],
                

In [11]:
pprint(trainer.compute_metrics(df_test, with_dynamic_binary_threshold=True))

{'cls_report': {'0': {'f1-score': 0.9244823386114495,
                      'precision': 0.8971631205673759,
                      'recall': 0.9535175879396985,
                      'support': 796},
                '1': {'f1-score': 0.6536312849162011,
                      'precision': 0.7597402597402597,
                      'recall': 0.5735294117647058,
                      'support': 204},
                'accuracy': 0.876,
                'macro avg': {'f1-score': 0.7890568117638252,
                              'precision': 0.8284516901538178,
                              'recall': 0.7635234998522022,
                              'support': 1000},
                'weighted avg': {'f1-score': 0.8692287236576187,
                                 'precision': 0.8691288569586443,
                                 'recall': 0.876,
                                 'support': 1000}},
 'cm': [[759, 37], [87, 117]],
 'prec_rec_curve': [[0.204, 0.7597402597402597, 1.0],
              

# Mlflow
* tracking
* model registration in Minio

In [12]:
mlflow_host = "10.152.183.54"
mlflow_host_url = "mlflow.mlflow.svc.cluster.local"
mlflow_port = "5000"
os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
os.environ["MLFLOW_S3_ENDPOINT_URL"] = f"http://10.152.183.156:9000"

mlflow.set_tracking_uri("http://" + mlflow_host + ":" + mlflow_port)
experiment_id = get_or_create_experiment("ecovadis_assignment")
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location='s3://mlflow/2', creation_time=1715939157478, experiment_id='2', last_update_time=1715939157478, lifecycle_stage='active', name='ecovadis_assignment', tags={}>

In [13]:
metrics_dict_flattened = flatten_dict(metrics_dict)

In [14]:
# mlflow metrics can be only int, float not list
del metrics_dict_flattened["cm"]
del metrics_dict_flattened["prec_rec_curve"]

In [15]:
run_name = "init_run"
with mlflow.start_run(
    experiment_id=experiment_id, run_name=run_name, nested=True
) as run:
    mlflow.log_params(trainer.optimizer.best)
    mlflow.log_metrics(metrics_dict_flattened)

    # Log tags
    mlflow.set_tags(
        tags={
            "project": "SUCCESS6G",
            "optimizer_engine": "optuna",
            "model_family": "ligtgbm",
            "feature_set_version": 1,
        }
    )
    # Log figure - for future fun
    # mlflow.log_figure(figure=correlation_plot, artifact_file="correlation_plot.png")

    artifact_path = "ecovadis_model"
    registered_model_name = "ecovadis_lgbm_model"
    mlflow.pyfunc.log_model(
        python_model=trainer,
        artifact_path=artifact_path,
        registered_model_name=registered_model_name,
    )
    model_uri = mlflow.get_artifact_uri(artifact_path)
    print(f"Run ID:\n{run.info.run_id}\nModel uri:\n{model_uri}")

2024/05/17 10:18:43 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference
2024/05/17 10:18:43 INFO mlflow.types.utils: Unsupported type hint: <class 'pandas.core.frame.DataFrame'>, skipping schema inference
Registered model 'ecovadis_lgbm_model' already exists. Creating a new version of this model...
2024/05/17 10:18:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: ecovadis_lgbm_model, version 2


Run ID:
18b4ab4d40bb4476b2cbb70b4b6a72a0
Model uri:
s3://mlflow/2/18b4ab4d40bb4476b2cbb70b4b6a72a0/artifacts/ecovadis_model


Created version '2' of model 'ecovadis_lgbm_model'.


## Screenshots from MLflow GUI (Nodeport service in Kubernetes)

![mlflow_parameters](img/mlflow_parameters.png)
![mlflow_parameters](img/mlflow_metrics.png)

![mlflow_parameters](img/mlflow_artifacts.png)

![mlflow_parameters](img/mlflow_artifacts_registered_model.png)

## Predictions testing

**IMPORTANT**: the predictions are in raw_score format(for testing to see if I get same predictions), i.e. strange numbers and not classes in t

### Example data

In [16]:
df_pd.iloc[:2]

Unnamed: 0,CustomerId,CreditScore,Country,Gender,Age,Tenure,Balance (EUR),NumberOfProducts,HasCreditCard,IsActiveMember,EstimatedSalary,Exited,CustomerFeedback_sentiment3,CustomerFeedback_sentiment5,Surname_Country,Surname_Country_region,Surname_Country_subregion,Country_region,Country_subregion,is_native,Country_hemisphere,Country_gdp_per_capita,Country_IncomeGroup,Surname_Country_gdp_per_capita,Surname_Country_IncomeGroup,working_class,stage_of_life,generation
0,15787619,844,France,Male,18,2,160980.03,1,0,0,145936.28,0,neutral,4 stars,Taiwan,Asia,Eastern Asia,Europe,Western Europe,0,northern,57594.03402,High income,32756.0,,working_age,teen,gen_z
1,15770309,656,France,Male,18,10,151762.74,1,0,1,127014.32,0,neutral,1 star,United States,Americas,Northern America,Europe,Western Europe,0,northern,57594.03402,High income,76329.58227,High income,working_age,teen,gen_z


In [17]:
df_pd.iloc[:2].transpose().to_dict(orient="index")

{'CustomerId': {0: 15787619, 1: 15770309},
 'CreditScore': {0: 844, 1: 656},
 'Country': {0: 'France', 1: 'France'},
 'Gender': {0: 'Male', 1: 'Male'},
 'Age': {0: 18, 1: 18},
 'Tenure': {0: 2, 1: 10},
 'Balance (EUR)': {0: 160980.03, 1: 151762.74},
 'NumberOfProducts': {0: 1, 1: 1},
 'HasCreditCard': {0: '0', 1: '0'},
 'IsActiveMember': {0: '0', 1: '1'},
 'EstimatedSalary': {0: 145936.28, 1: 127014.32},
 'Exited': {0: 0, 1: 0},
 'CustomerFeedback_sentiment3': {0: 'neutral', 1: 'neutral'},
 'CustomerFeedback_sentiment5': {0: '4 stars', 1: '1 star'},
 'Surname_Country': {0: 'Taiwan', 1: 'United States'},
 'Surname_Country_region': {0: 'Asia', 1: 'Americas'},
 'Surname_Country_subregion': {0: 'Eastern Asia', 1: 'Northern America'},
 'Country_region': {0: 'Europe', 1: 'Europe'},
 'Country_subregion': {0: 'Western Europe', 1: 'Western Europe'},
 'is_native': {0: '0', 1: '0'},
 'Country_hemisphere': {0: 'northern', 1: 'northern'},
 'Country_gdp_per_capita': {0: 57594.03402, 1: 57594.03402},

after some notepad magic:
```
{
    "CustomerId": [15787619, 15770309],
    "CreditScore": [844, 656],
    "Country": ["France", "France"],
    "Gender": ["Male", "Male"],
    "Age": [18, 18],
    "Tenure": [2, 10],
    "Balance (EUR)": [160980.03, 151762.74],
    "NumberOfProducts": [1, 1],
    "HasCreditCard": ["0", "0"],
    "IsActiveMember": ["0", "1"],
    "EstimatedSalary": [145936.28, 127014.32],
    "Exited": [0, 0],
    "CustomerFeedback_sentiment3": ["neutral", "neutral"],
    "CustomerFeedback_sentiment5": ["4 stars", "1 star"],
    "Surname_Country": ["Taiwan", "United States"],
    "Surname_Country_region": ["Asia", "Americas"],
    "Surname_Country_subregion": ["Eastern Asia", "Northern America"],
    "Country_region": ["Europe", "Europe"],
    "Country_subregion": ["Western Europe", "Western Europe"],
    "is_native": ["0", "0"],
    "Country_hemisphere": ["northern", "northern"],
    "Country_gdp_per_capita": [57594.03402, 57594.03402],
    "Country_IncomeGroup": ["High income", "High income"],
    "Surname_Country_gdp_per_capita": [32756.0, 76329.58227],
    "Surname_Country_IncomeGroup": ["None", "High income"],
    "working_class": ["working_age", "working_age"],
    "stage_of_life": ["teen", "teen"],
    "generation": ["gen_z", "gen_z"],
}
```

### Trainer

In [18]:
trainer.predict(df=df_pd.iloc[:2].drop(columns=["Exited"]), context={})

Unnamed: 0,Exited
0,-17.300401
1,-15.238104


### Downloaded Trainer

In [19]:
model_uri

's3://mlflow/2/18b4ab4d40bb4476b2cbb70b4b6a72a0/artifacts/ecovadis_model'

In [20]:
loaded_trainer = mlflow.pyfunc.load_model(model_uri)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

In [21]:
loaded_trainer.predict(df_pd.iloc[:2].drop(columns=["Exited"]))

Unnamed: 0,Exited
0,-17.300401
1,-15.238104


### Downloaded/Served Trainer
* i.e. testing [model locally](https://mlflow.org/docs/latest/deployment/deploy-model-to-kubernetes/tutorial.html?highlight=kserve#step-6-testing-model-serving-locally)

In [22]:
model_uri

's3://mlflow/2/18b4ab4d40bb4476b2cbb70b4b6a72a0/artifacts/ecovadis_model'

In [23]:
! mlflow models serve -m "s3://mlflow/2/18b4ab4d40bb4476b2cbb70b4b6a72a0/artifacts/ecovadis_model" --env-manager local -p 5000

Downloading artifacts: 100%|█████████████████████| 1/1 [00:00<00:00, 577.81it/s]
2024/05/17 10:19:38 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
Downloading artifacts: 100%|██████████████████████| 9/9 [00:00<00:00, 44.62it/s]
2024/05/17 10:19:38 INFO mlflow.pyfunc.backend: === Running command 'exec gunicorn --timeout=60 -b 127.0.0.1:5000 -w 1 ${GUNICORN_CMD_ARGS} -- mlflow.pyfunc.scoring_server.wsgi:app'
[2024-05-17 10:19:39 +0000] [8811] [INFO] Starting gunicorn 22.0.0
[2024-05-17 10:19:39 +0000] [8811] [INFO] Listening at: http://127.0.0.1:5000 (8811)
[2024-05-17 10:19:39 +0000] [8811] [INFO] Using worker: sync
[2024-05-17 10:19:39 +0000] [8812] [INFO] Booting worker with pid: 8812
^C
[2024-05-17 10:23:52 +0000] [8811] [INFO] Handling signal: int
[2024-05-17 10:23:52 +0000] [8812] [INFO] Worker exiting (pid: 8812)


output from terminal:

```
root@jupyter-5uperpalo:~# curl -X POST -H "Content-Type:application/json" --data '{"inputs": {"CustomerId": [15787619, 15770309], "CreditScore": [844, 656], "Country": ["France", "France"], "Gender": ["Male", "Male"], "Age": [18, 18], "Tenure": [2, 10], "Balance (EUR)": [160980.03, 151762.74], "NumberOfProducts": [1, 1], "HasCreditCard": ["0", "0"], "IsActiveMember": ["0", "1"], "EstimatedSalary": [145936.28, 127014.32], "CustomerFeedback_sentiment3": ["neutral", "neutral"], "CustomerFeedback_sentiment5": ["4 stars", "1 star"], "Surname_Country": ["Taiwan", "United States"], "Surname_Country_region": ["Asia", "Americas"], "Surname_Country_subregion": ["Eastern Asia", "Northern America"], "Country_region": ["Europe", "Europe"], "Country_subregion": ["Western Europe", "Western Europe"], "is_native": ["0", "0"], "Country_hemisphere": ["northern", "northern"], "Country_gdp_per_capita": [57594.03402, 57594.03402], "Country_IncomeGroup": ["High income", "High income"], "Surname_Country_gdp_per_capita": [32756.0, 76329.58227], "Surname_Country_IncomeGroup": ["None", "High income"], "working_class": ["working_age", "working_age"], "stage_of_life": ["teen", "teen"], "generation": ["gen_z", "gen_z"]}}' http://127.0.0.1:5000/invocations
{"predictions": [{"Exited": -17.300400783182656}, {"Exited": -15.2381037279264}]}
```

# [TBD] Model deployment using Kserve
* https://mlflow.org/docs/latest/deployment/deploy-model-to-kubernetes/tutorial.html?highlight=kserve#step-7-deploying-the-model-to-kserve

**Current ISSUE**: the readiness probe is killing the pod: `Readiness probe failed: Get "http://10.1.4.209:8012/": context deadline exceeded (Client.Timeout exceeded while awaiting headers)`