In [1]:
# On affiche les graphiques dans le notebook en statique
%matplotlib inline

In [2]:
import numpy as np
import logging
import re
import os
import gc
import joblib
from joblib import Parallel, delayed
import multiprocessing
import lightgbm as lgb
import optuna
import plotly
import kaleido
import mlflow
import time


import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, make_scorer, f1_score

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from src.p7_constantes import DATA_INTERIM, DATA_BASE, MODEL_DIR
from src.p7_constantes import LOCAL_HOST, PORT_MLFLOW
from src.p7_constantes import NUM_THREADS
from src.p7_simple_kernel import (
    reduce_memory,
    get_memory_consumed,
    get_available_memory,
    get_batch_size,
)

# from src.p7_hyper_param import CONFIG_SEARCH
from src.p7_hyper_param import ExperimentSearch

# from src.p7_hyper_param import get_train, get_sorted_features_by_importance
from src.p7_evaluate import lgb_cross_evaluate
from src.p7_util import timer, clean_ram, format_time
from src.p7_regex import sel_var

In [5]:
print("mlflow", mlflow.__version__)
print("optuna", optuna.__version__)
print("numpy", np.__version__)
print("plotly", plotly.__version__)
print("kaleido", kaleido.__version__)

mlflow 2.12.1
optuna 3.6.1
numpy 1.26.4
plotly 5.21.0
kaleido 0.1.0


# Tests d'interrogation des bases, etc.

In [6]:
import optuna
from src.p7_constantes import (
    LOCAL_HOST,
    PORT_MLFLOW,
    PORT_POSTGRE,
    PASSWORD_POSTGRE,
    USER_POSTGRE,
)

In [48]:
# Optuna storage backend: PostgreSQL database "optuna_db" on localhost.
# NOTE(review): PORT_POSTGRE is imported but not used here, so the URL relies on
# the driver's default port; the password is also interpolated without
# URL-escaping — confirm both are acceptable.
storage = f"postgresql://{USER_POSTGRE}:{PASSWORD_POSTGRE}@localhost/optuna_db"

# Fetch the summary of every study recorded in the storage
studies = optuna.study.get_all_study_summaries(storage=storage, include_best_trial=True)
print("Nb studies :", len(studies))

# Show details for the first two studies only
for i, study in enumerate(studies):
    if i >= 2:
        continue
    print(type(study))
    print(
        f"Study ID: {study._study_id}, "
        f"Study Name: {study.study_name}, "
        f"Number of Trials: {study.n_trials}, "
        f"Best trial: {study.best_trial}"
    )

Nb studies : 0


In [47]:
"""for study in studies:
    optuna.delete_study(study_name=study.study_name, storage=storage)"""

In [45]:
loaded_study = optuna.load_study(study_name="example-study-5", storage=storage)
# assert len(loaded_study.trials) == len(study.trials)
assert len(loaded_study.trials) == 10
print(loaded_study.trials)

[FrozenTrial(number=0, state=TrialState.COMPLETE, values=[140.18074835506775], datetime_start=datetime.datetime(2024, 5, 13, 20, 45, 32, 199991), datetime_complete=datetime.datetime(2024, 5, 13, 20, 45, 32, 446047), params={'x': -9.839795114573045}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'x': FloatDistribution(high=10.0, log=False, low=-10.0, step=None)}, trial_id=1, value=None), FrozenTrial(number=1, state=TrialState.COMPLETE, values=[8.107840455877692], datetime_start=datetime.datetime(2024, 5, 13, 20, 45, 32, 205993), datetime_complete=datetime.datetime(2024, 5, 13, 20, 45, 32, 422041), params={'x': 4.847426988682535}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'x': FloatDistribution(high=10.0, log=False, low=-10.0, step=None)}, trial_id=2, value=None), FrozenTrial(number=2, state=TrialState.COMPLETE, values=[4.089854263322307], datetime_start=datetime.datetime(2024, 5, 13, 20, 45, 32, 249004), datetime_complete=datetime.da

In [33]:
from src.p7_db import DbOptuna

In [21]:
exp = ExperimentSearch(frac_sample=0.50, n_predictors=780)

In [22]:
exp.init_config()

Les paramètres [] ont été mis à jour
Chargement des données. n_predictors=780, frac_sample=0.5
Forme initiale de X : (246008, 788)
Après réduction des prédicteurs, forme de X : (246008, 780), conso mémoire : 1232 Mo
Après réduction des lignes (échantillon 50%), forme de X : (123004, 780), conso mémoire 617 Mo
Nom de l'expérience (nouvelle - numéro augmenté) : search_lgbm_binary_014
Description de 'search_lgbm_binary_014' :
Recherche bayesienne d'hyperparamètres pour modèle lgbm
Tags :
	'task' : search
	'model' : lgbm
	'objective' : binary
	'metric' : auc
	'num' : 014
	'db' : optuna


True

In [23]:
exp.create_or_load()

[I 2024-05-20 21:37:17,366] A new study created in RDB with name: search_lgbm_binary_014


Création de l'expérience MLFlow 'search_lgbm_binary_014', ID = 284978663754198379


('284978663754198379', <optuna.study.study.Study at 0x20f167e1300>)

In [30]:
exp.mlflow_id

'796218381130353653'

In [24]:
exp.run_lgb(10, 10)

Optimisation 100 trials...
Durée de l'optimisation (hh:mm:ss) : 2:11:03


In [18]:
exp.output_dir

'models/'

In [17]:
stu = exp.study
fig = optuna.visualization.plot_param_importances(stu)
fig.write_html(os.path.join(MODEL_DIR, "hyperparam_importance.html"))

In [22]:
exp.create_best_run()

In [63]:
exp.run_trials(5, n_jobs=1)

Optimisation 5 trials...


[I 2024-05-20 08:33:10,171] Trial 37 finished with value: 0.7443437712918791 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 1.444749056571956e-07, 'lambda_l2': 0.0011951839644942864, 'num_leaves': 133, 'feature_fraction': 0.6221643063929464, 'bagging_fraction': 0.7057420006955167, 'bagging_freq': 4, 'min_child_samples': 75, 'learning_rate': 0.002417853348599845, 'max_bin': 384, 'n_estimators': 300}. Best is trial 24 with value: 0.7469969299597559.
[I 2024-05-20 08:33:13,037] Trial 36 finished with value: 0.7447622122857914 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 1.412963273371639e-07, 'lambda_l2': 0.02992746505215185, 'num_leaves': 136, 'feature_fraction': 0.624544889137139, 'bagging_fraction': 0.6766040847046277, 'bagging_freq': 4, 'min_child_samples': 73, 'learning_rate': 0.002417529384727407, 'max_bin': 448, 'n_estimators': 300}. Best is trial 24 with value: 0.7469969299597559.
[I 2024-05-20 08:33:13,406] Trial 35 finished with value: 0.745101707600446 and par

Durée de l'optimisation (hh:mm:ss) : 99.69939279556274


In [34]:
DbOptuna.del_studies_from_mlflow()

Liste des studys dans mlflow : [<Experiment: artifact_location='file:///E:/Mes Documents/_Open Classroom/Code/p7/models/light_simple', creation_time=1715534680676, experiment_id='924445687952880873', last_update_time=1715534680676, lifecycle_stage='active', name='light_simple_788x60000_auc_40trials', tags={'metric': 'auc',
 'mlflow.note.content': "Recherche d'hyperparamètres pour le modèle "
                        'light_simple, impact du nombre de features\n'
                        'Recherche Bayesienne - mono objectif - Hyperband',
 'model': 'lightgbm',
 'task': 'hyperparam'}>, <Experiment: artifact_location='file:///E:/Mes Documents/_Open Classroom/Code/p7/models/light_simple', creation_time=1715533753389, experiment_id='499169421546781099', last_update_time=1715533753389, lifecycle_stage='active', name='light_simple_700x246008_auc_40trials', tags={'metric': 'auc',
 'mlflow.note.content': "Recherche d'hyperparamètres pour le modèle "
                        'light_simple, impact d

# Test services

In [6]:
experiment = ExperimentSearch()

In [7]:
experiment.check_mlflow_server()

La connexion MLFlow a échoué. Vérifiez que le serveur est démarré.
Pour le démarrer, vous pouver utiliser experiment.start_mlflow_server()
Error: HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000134483DFA90>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))


In [12]:
experiment.start_mlflow_server()

<Popen: returncode: None args: ['mlflow', 'server', '--host', '127.0.0.1', '...>

In [13]:
experiment.check_mlflow_server()

Le serveur MLFlow est Ok


In [8]:
experiment.check_postgresql_server()

La connexion Postgresql est OK


In [9]:
from src.p7_hyper_param import ExperimentSearch

In [10]:
# experiment = ExperimentSearch()

In [11]:
experiment.check_mlflow_server()

La connexion MLFlow a échoué. Vérifiez que le serveur est démarré.
Pour le démarrer, vous pouver utiliser experiment.start_mlflow_server()
Error: HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000134483DFE80>: Failed to establish a new connection: [WinError 10061] Aucune connexion n’a pu être établie car l’ordinateur cible l’a expressément refusée'))


# Test multiprocess

In [42]:
# Experiment used by the multiprocess tests
experiment = ExperimentSearch()
# Bound to `id_` (not `id`) so the builtin id() is not shadowed in the notebook
name, id_ = experiment.init_name()
experiment.init_description(
    add_description="Test parallel et mlflow", add_tags={"debug": "True"}
)
_ = experiment.init_data(frac_sample=0.2, n_predictors=10)
# NOTE(review): the recorded output shows init_config() printing the description
# again without the extra text / "debug" tag — confirm it does not reset them.
_ = experiment.init_config()

Nom de l'expérience (nouvelle) : search_lgbm_binary_000
Description de 'search_lgbm_binary_000' :
Recherche bayesienne d'hyperparamètres pour modèle lgbm
Test parallel et mlflow
Tags :
	'task' : search
	'model' : lgbm
	'objective' : binary
	'metric' : auc
	'num' : 000
	'db' : optuna
	'debug' : True
Chargement des données. n_predictors=10, frac_sample=0.2
Forme initiale de X : (246008, 788)
Après réduction des prédicteurs, forme de X : (246008, 10), conso mémoire : 19 Mo
Après réduction des lignes (échantillon 20%), forme de X : (49201, 10), conso mémoire 4 Mo
Les paramètres [] ont été mis à jour
Nom de l'expérience (nouvelle) : search_lgbm_binary_000
Description de 'search_lgbm_binary_000' :
Recherche bayesienne d'hyperparamètres pour modèle lgbm
Tags :
	'task' : search
	'model' : lgbm
	'objective' : binary
	'metric' : auc
	'num' : 000
	'db' : optuna


In [15]:
def run_trials(experiment, n_trials):
    """Run ``n_trials`` optimisation trials on the experiment's study.

    Calls ``experiment.study.optimize`` with ``experiment.objective_lgb`` as
    the objective and ``gc_after_trial=True``. Returns ``None``.
    """
    study = experiment.study
    study.optimize(experiment.objective_lgb, n_trials=n_trials, gc_after_trial=True)

In [44]:
# create_or_load() returns a (MLflow experiment id, Optuna study) pair (see the
# recorded output), so unpack it instead of binding the whole tuple to `study`.
mlflow_experiment_id, study = experiment.create_or_load()

[I 2024-05-19 14:28:04,559] Using an existing study with name 'search_lgbm_binary_000' instead of creating a new one.


Création de l'expérience MLFlow 'search_lgbm_binary_000', ID = 423303752209919308


('423303752209919308', <optuna.study.study.Study at 0x1344a157d60>)

In [45]:
experiment.run_parallel(2, 10)

In [22]:
type(experiment.study)

optuna.study.study.Study

In [25]:
"""import time
from src.p7_util import format_time"""

Séquentielle

In [26]:
t0 = time.time()
run_trials(experiment, 20)
duration = time.time() - t0
print("duration ", format_time(duration))

[I 2024-05-18 13:58:05,553] Trial 25 finished with value: 0.7450172265037776 and parameters: {'boosting_type': 'dart', 'lambda_l1': 0.00036196303921289045, 'lambda_l2': 1.9844383651241966e-07, 'num_leaves': 246, 'feature_fraction': 0.8138074791824516, 'bagging_fraction': 0.8743617830196556, 'bagging_freq': 5, 'min_child_samples': 34, 'learning_rate': 0.08789862156076977, 'max_bin': 320, 'n_estimators': 160}. Best is trial 5 with value: 0.7551103733755931.
[I 2024-05-18 13:58:18,103] Trial 26 finished with value: 0.7386469763259206 and parameters: {'boosting_type': 'dart', 'lambda_l1': 1.67824342179514e-05, 'lambda_l2': 1.35459102885411e-07, 'num_leaves': 39, 'feature_fraction': 0.45156438232897816, 'bagging_fraction': 0.6403162262475226, 'bagging_freq': 3, 'min_child_samples': 49, 'learning_rate': 0.23544604576008898, 'max_bin': 480, 'n_estimators': 80}. Best is trial 5 with value: 0.7551103733755931.
[I 2024-05-18 13:58:34,573] Trial 27 finished with value: 0.7572477226081137 and para

duration  0:05:32


Faux parallèle

In [29]:
from src.p7_hyper_param import ExperimentSearch

In [30]:
t0 = time.time()
experiment.run_trials(n_trials=20, n_jobs=4)
duration = time.time() - t0
print("duration ", format_time(duration))

[I 2024-05-18 14:48:09,105] Trial 65 finished with value: 0.7456771444056016 and parameters: {'boosting_type': 'dart', 'lambda_l1': 0.0001307961954225366, 'lambda_l2': 5.8165424184548594e-05, 'num_leaves': 9, 'feature_fraction': 0.4300474527180473, 'bagging_fraction': 0.8428478866166653, 'bagging_freq': 4, 'min_child_samples': 39, 'learning_rate': 0.025845661644766264, 'max_bin': 384, 'n_estimators': 200}. Best is trial 27 with value: 0.7572477226081137.
[I 2024-05-18 14:48:10,247] Trial 68 finished with value: 0.7415571485277823 and parameters: {'boosting_type': 'dart', 'lambda_l1': 0.0005795395489309084, 'lambda_l2': 4.971757567353869e-05, 'num_leaves': 9, 'feature_fraction': 0.5841809832964028, 'bagging_fraction': 0.8328002671443379, 'bagging_freq': 4, 'min_child_samples': 39, 'learning_rate': 0.023816918911055005, 'max_bin': 384, 'n_estimators': 200}. Best is trial 27 with value: 0.7572477226081137.
[I 2024-05-18 14:48:12,257] Trial 66 finished with value: 0.7518232771881521 and pa

duration  0:02:58


Parallèle

In [27]:
t0 = time.time()
# Exécuter les essais en parallèle
n_workers = 4
Parallel(n_jobs=n_workers)(delayed(run_trials)(experiment, 5) for _ in range(n_workers))
duration = time.time() - t0
print("duration ", format_time(duration))

duration  0:01:24


In [31]:
t0 = time.time()
# Exécuter les essais en parallèle
n_workers = 4
Parallel(n_jobs=n_workers)(
    delayed(experiment.run_trials)(5, 1) for _ in range(n_workers)
)
duration = time.time() - t0
print("duration ", format_time(duration))

duration  0:01:47


In [32]:
t0 = time.time()
# Exécuter les essais en parallèle
n_workers = 4
experiment.run_parallel(n_workers, n_trials_per_worker=5)
duration = time.time() - t0
print("duration ", format_time(duration))

duration  0:01:46


Test

In [64]:
experiment = ExperimentSearch()
# Bound to `id_` (not `id`) so the builtin id() is not shadowed in the notebook
name, id_ = experiment.init_name()
name

Nom de l'expérience (nouvelle - numéro augmenté) : search_lgbm_binary_014


'search_lgbm_binary_014'

In [65]:
experiment.init_description(
    add_description="Test parallel et mlflow", add_tags={"debug": "True"}
)

Description de 'search_lgbm_binary_014' :
Recherche bayesienne d'hyperparamètres pour modèle lgbm
Test parallel et mlflow
Tags :
	'task' : search
	'model' : lgbm
	'objective' : binary
	'metric' : auc
	'num' : 014
	'debug' : True


("Recherche bayesienne d'hyperparamètres pour modèle lgbm\nTest parallel et mlflow",
 {'task': 'search',
  'model': 'lgbm',
  'objective': 'binary',
  'metric': 'auc',
  'num': '014',
  'debug': 'True'})

In [66]:
_ = experiment.init_data(frac_sample=0.4, n_predictors=20)

Chargement des données. n_predictors=20, frac_sample=0.4
Forme initiale de X : (246008, 788)
Après réduction des prédicteurs, forme de X : (246008, 20), conso mémoire : 38 Mo
Après réduction des lignes (échantillon 40%), forme de X : (98403, 20), conso mémoire 16 Mo


In [67]:
_ = experiment.init_config(n_terminals=2, n_jobs_1_terminal=1, n_trials_1_terminal=10)

Les paramètres ['n_terminals', 'n_jobs_1_terminal', 'n_trials_1_terminal'] ont été mis à jour
Nom de l'expérience (nouvelle) : search_lgbm_binary_014
Description de 'search_lgbm_binary_014' :
Recherche bayesienne d'hyperparamètres pour modèle lgbm
Tags :
	'task' : search
	'model' : lgbm
	'objective' : binary
	'metric' : auc
	'num' : 014
20 trials, à exécuter dans 2 process en parallèle de 10 trials chacun
(1 jobs par process)


Vrai multi-threading

In [63]:
experiment.optimize()

[I 2024-05-17 15:33:06,804] A new study created in RDB with name: search_lgbm_binary_013


Création de l'expérience MLFlow 'search_lgbm_binary_013', ID = 986255984568275937
Experience 'search_lgbm_binary_013' activée

Optimisation...


AttributeError: 'ActiveRun' object has no attribute 'run_id'

Fausse parallélisation (n_jobs)

In [49]:
_ = experiment.init_config(n_terminals=1, n_jobs_1_terminal=16, n_trials_1_terminal=20)

Les paramètres ['n_terminals', 'n_jobs_1_terminal', 'n_trials_1_terminal'] ont été mis à jour
Nom de l'expérience (nouvelle) : search_lgbm_binary_011
Description de 'search_lgbm_binary_011' :
Recherche bayesienne d'hyperparamètres pour modèle lgbm
Tags :
	'task' : search
	'model' : lgbm
	'objective' : binary
	'metric' : auc
	'num' : 011
20 trials, à exécuter dans 1 process en parallèle de 20 trials chacun
(16 jobs par process)


In [50]:
experiment.optimize()

[I 2024-05-17 14:17:51,415] A new study created in RDB with name: search_lgbm_binary_011


Création de l'expérience MLFlow 'search_lgbm_binary_011', ID = 847649907960096330
Experience 'search_lgbm_binary_011' activée

Optimisation...


[I 2024-05-17 14:18:40,119] Trial 14 finished with value: 0.7494266648624275 and parameters: {'boosting_type': 'dart', 'lambda_l1': 1.9887595462963325e-05, 'lambda_l2': 3.3737450046391923e-05, 'num_leaves': 102, 'feature_fraction': 0.8134584426564329, 'bagging_fraction': 0.6339347082567082, 'bagging_freq': 1, 'min_child_samples': 79, 'learning_rate': 0.04951334526015166, 'max_bin': 480, 'n_estimators': 40}. Best is trial 14 with value: 0.7494266648624275.
[I 2024-05-17 14:18:57,259] Trial 15 finished with value: 0.7454630651447134 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.0022827958736154, 'lambda_l2': 0.9894738269020589, 'num_leaves': 201, 'feature_fraction': 0.671188426994366, 'bagging_fraction': 0.9704181592670296, 'bagging_freq': 7, 'min_child_samples': 45, 'learning_rate': 0.0010021390079885848, 'max_bin': 256, 'n_estimators': 40}. Best is trial 14 with value: 0.7494266648624275.
[I 2024-05-17 14:19:03,631] Trial 3 finished with value: 0.735271109379851 and paramete

Durée de l'optimisation : 0:04:07
Number of finished trials:  20
Best trial:
  Value:  0.7592598032043425
  Params: 
    boosting_type: gbdt
    lambda_l1: 4.736031327347091e-05
    lambda_l2: 0.2536129046023039
    num_leaves: 236
    feature_fraction: 0.48871084807234233
    bagging_fraction: 0.5969292499657121
    bagging_freq: 3
    min_child_samples: 67
    learning_rate: 0.005600096263012194
    max_bin: 192
    n_estimators: 260


Aucune parallélisation

In [77]:
df = experiment.track_best_run()
display(df)

len(runs) 21 type <class 'pandas.core.frame.DataFrame'>
len(nested_runs) (20, 31)


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.accuracy,metrics.balanced_accuracy,metrics.auc,metrics.weighted_recall,...,params.learning_rate,params.num_leaves,params.bagging_freq,params.feature_fraction,params.lambda_l2,tags.mlflow.user,tags.mlflow.source.type,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.parentRunId
9,115f3ddacadf47368147a55f224f9f05,132863171492550832,FINISHED,file:///E:/Mes Documents/_Open Classroom/Code/...,2024-05-17 13:45:45.962000+00:00,2024-05-17 13:46:17.517000+00:00,0.911507,0.547041,0.762332,0.911507,...,"(0.046944854745911146,)","(93,)","(1,)","(0.41038246226306374,)","(5.3409598364934014,)",Ariane,LOCAL,N_10,e:\Mes Documents\_Open Classroom\Code\p7\p7env...,8cf65c5bba0947a7be4618c935988650


In [73]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 31 columns):
 #   Column                      Non-Null Count  Dtype              
---  ------                      --------------  -----              
 0   run_id                      21 non-null     object             
 1   experiment_id               21 non-null     object             
 2   status                      21 non-null     object             
 3   artifact_uri                21 non-null     object             
 4   start_time                  21 non-null     datetime64[ns, UTC]
 5   end_time                    21 non-null     datetime64[ns, UTC]
 6   metrics.accuracy            20 non-null     float64            
 7   metrics.balanced_accuracy   20 non-null     float64            
 8   metrics.auc                 20 non-null     float64            
 9   metrics.weighted_recall     20 non-null     float64            
 10  metrics.precision           20 non-null     float64            


In [19]:
experiment.X.shape

(123004, 500)

Démarrer

In [5]:
# Work on a shallow copy: later cells assign config["metric"], config["n_trials"],
# etc., and aliasing CONFIG_SEARCH directly would mutate the shared constant.
config = dict(CONFIG_SEARCH)
for k, v in config.items():
    print(f"{k} : {v}")

model_dir : models/
model_type : lightgbm
subdir : light_simple/
data_dir : data/interim/
train_filename : train.csv
feature_importance_filename : feature_importance.csv
n_predictors : 20
moo_objective : False
metric : weighted_recall
n_trials : 40


# Essai avec toutes les données

In [6]:
config["metric"] = "auc"

In [21]:
data = get_train()
print("Forme de data :", data.shape)
data.head()

0 variables à inclure correspondant au motif 'Unnamed' : []
0 variables à exclure correspondant au motif 'None' : []
0 variables sélectionnées : []
Forme de train.csv : (246008, 790)
Forme de data : (246008, 790)


Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Sentproposal_SUM,CC_NAME_CONTRACT_STATUS_Sentproposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT,TARGET
0,450053,0.0,0,0,0,225000.0,270000.0,13500.0,270000.0,0.018029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0
1,420697,1.0,0,0,0,99000.0,254700.0,17019.0,225000.0,0.020246,...,,,,,,,,,,0
2,338811,0.0,0,0,0,157500.0,450000.0,22500.0,450000.0,0.022625,...,,,,,,,,,,0
3,303494,0.0,1,1,0,225000.0,1372500.0,40131.0,1372500.0,0.011703,...,,,,,,,,,,0
4,153319,1.0,0,1,0,292500.0,314055.0,13963.5,238500.0,0.00823,...,,,,,,,,,,0


In [22]:
sorted_features = get_sorted_features_by_importance(config=config)

In [9]:
"""not_predictors = [
    'TARGET',
    "SK_ID_CURR",
    "SK_ID_BUREAU",
    "SK_ID_PREV",
    "index",
    "level_0",
    ]
predictors = list(filter(lambda v: v not in not_predictors, data.columns))
len(predictors)"""

'not_predictors = [\n    \'TARGET\',\n    "SK_ID_CURR",\n    "SK_ID_BUREAU",\n    "SK_ID_PREV",\n    "index",\n    "level_0",\n    ]\npredictors = list(filter(lambda v: v not in not_predictors, data.columns))\nlen(predictors)'

In [23]:
predictors = joblib.load(
    os.path.join(DATA_INTERIM, "features_sorted_by_importance.pkl")
)

In [24]:
print("Nombre de features :", len(predictors))
X, y = (
    data[predictors],
    data["TARGET"],
)

# categorical_features = list(X.select_dtypes(include="object").columns)
# categorical_features

Nombre de features : 788


In [25]:
X.shape

(246008, 788)

In [26]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246008 entries, 0 to 246007
Columns: 788 entries, PAYMENT_RATE to CC_CNT_DRAWINGS_OTHER_CURRENT_MIN
dtypes: bool(141), float64(607), int64(40)
memory usage: 1.2 GB


In [14]:
# from src.p7_simple_kernel import reduce_memory

In [27]:
X_mem = reduce_memory(X)
X_mem.info()

Memory usage of dataframe is 1247.43 MB
Memory usage after optimization is: 1247.43 MB
Decreased by 0.0%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246008 entries, 0 to 246007
Columns: 788 entries, PAYMENT_RATE to CC_CNT_DRAWINGS_OTHER_CURRENT_MIN
dtypes: bool(141), float64(607), int64(40)
memory usage: 1.2 GB


In [28]:
del X_mem
del data
gc.collect()

466

In [29]:
config["n_trials"] = 40

In [30]:
study = sk_single_search(
    X,
    y,
    experiment_name="huge_auc_by_10_trials",
    num="010",
    by_10_trials=True,
    config=config,
)

[I 2024-05-14 12:38:07,856] A new study created in memory with name: study_huge_auc_by_10_trials


Création de l'expérience 'huge_auc_by_10_trials'
Experience 'huge_auc_by_10_trials' activée


[I 2024-05-14 12:41:16,917] Trial 6 finished with value: 0.7276113978157557 and parameters: {'boosting_type': 'dart', 'lambda_l1': 1.0491003430107929e-08, 'lambda_l2': 0.0027744793790150657, 'num_leaves': 11, 'feature_fraction': 0.6991934817579307, 'bagging_fraction': 0.8380647763272756, 'bagging_freq': 5, 'min_child_samples': 26, 'learning_rate': 0.005751432275744837, 'max_bin': 448, 'n_estimators': 100}. Best is trial 6 with value: 0.7276113978157557.
[I 2024-05-14 12:44:47,601] Trial 8 finished with value: 0.7458267010810352 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 1.5554385411750817e-07, 'lambda_l2': 3.929682691819298e-07, 'num_leaves': 8, 'feature_fraction': 0.5747392784567269, 'bagging_fraction': 0.978366192625776, 'bagging_freq': 1, 'min_child_samples': 64, 'learning_rate': 0.005033900714215973, 'max_bin': 256, 'n_estimators': 400}. Best is trial 8 with value: 0.7458267010810352.
[I 2024-05-14 12:46:42,865] Trial 7 finished with value: 0.7499101800906665 and parame

Optimize hyperparameters - duration (hh:mm:ss) : 2:19:41


# Petite étude pour debug

In [31]:
data = get_train()
print("Forme de data :", data.shape)

0 variables à inclure correspondant au motif 'Unnamed' : []
0 variables à exclure correspondant au motif 'None' : []
0 variables sélectionnées : []
Forme de train.csv : (246008, 790)
Forme de data : (246008, 790)


In [35]:
predictors = joblib.load(
    os.path.join(DATA_INTERIM, "features_sorted_by_importance.pkl")
)
X, y = (
    data[predictors[:20]].head(30_000),
    data["TARGET"].head(30_000),
)
config["metric"] = "balanced_accuracy"
config["n_trials"] = 10
study = sk_single_search(
    X,
    y,
    experiment_name="debug_balanced_acc2",
    num="010",
    by_10_trials=False,
    config=config,
)

[I 2024-05-14 15:49:34,684] A new study created in memory with name: study_debug_balanced_acc2


Création de l'expérience 'debug_balanced_acc2'
Experience 'debug_balanced_acc2' activée


[I 2024-05-14 15:49:45,293] Trial 8 finished with value: 0.5 and parameters: {'boosting_type': 'dart', 'lambda_l1': 0.034693657785455745, 'lambda_l2': 0.0033301602612120017, 'num_leaves': 10, 'feature_fraction': 0.6159333025167415, 'bagging_fraction': 0.5996262986516481, 'bagging_freq': 4, 'min_child_samples': 35, 'learning_rate': 0.00019405383917253262, 'max_bin': 288, 'n_estimators': 260}. Best is trial 8 with value: 0.5.
[I 2024-05-14 15:49:50,417] Trial 0 finished with value: 0.4999818807755029 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 9.293323772818397, 'lambda_l2': 0.20579149010326644, 'num_leaves': 26, 'feature_fraction': 0.5754945014366256, 'bagging_fraction': 0.5951237570785475, 'bagging_freq': 2, 'min_child_samples': 83, 'learning_rate': 0.009128012156059228, 'max_bin': 192, 'n_estimators': 300}. Best is trial 8 with value: 0.5.
[I 2024-05-14 15:49:52,309] Trial 7 finished with value: 0.5 and parameters: {'boosting_type': 'dart', 'lambda_l1': 1.0802649193860532e-

Optimize hyperparameters - duration (hh:mm:ss) : 0:00:47


# Vieux

In [26]:
# Load all_data_simple_kernel_ohe.csv from the DATA_INTERIM directory
csv_path = os.path.join(DATA_INTERIM, "all_data_simple_kernel_ohe.csv")
df = pd.read_csv(csv_path)

# Drop the columns that sel_var selects for the "Unnamed" pattern, if any
to_drop = sel_var(df.columns, "Unnamed")
if to_drop:
    df = df.drop(columns=to_drop)

# Strip every character other than ASCII letters, digits and "_" from column names
df = df.rename(columns=lambda col: re.sub("[^A-Za-z0-9_]+", "", col))

# Keep only the rows whose TARGET is known
data = df.loc[df["TARGET"].notna()]

# Release the full frame
del df
gc.collect()

print("Forme de data avec Target :", data.shape)
data.head()

0 variables à inclure correspondant au motif 'Unnamed' : []
0 variables à exclure correspondant au motif 'None' : []
0 variables sélectionnées : []
Forme de data avec Target : (307507, 794)


Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CC_NAME_CONTRACT_STATUS_Sentproposal_MEAN,CC_NAME_CONTRACT_STATUS_Sentproposal_SUM,CC_NAME_CONTRACT_STATUS_Sentproposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,,,,,,,,,,
1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,...,,,,,,,,,,
2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,135000.0,...,,,,,,,,,,
3,100006,0.0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,...,,,,,,,,,,


In [8]:
# from src.p7_util import clean_ram

In [9]:
dic_local = locals()
to_del = ["to_drop", "selected_features"]
clean_ram(to_del, dic_local)

1 variables détruites : ['to_drop']


Récupération des features par ordre d'importance

In [27]:
# Read the "feature" column of the feature-importance CSV, keeping file order
# (presumably sorted by decreasing importance, as the variable name suggests —
# not verified here).
importance_path = os.path.join(
    config["model_dir"], config["subdir"], config["feature_importance_filename"]
)
sorted_features_by_importance = pd.read_csv(importance_path)["feature"].tolist()
print(len(sorted_features_by_importance))
sorted_features_by_importance[:10]

792


['PAYMENT_RATE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_3',
 'EXT_SOURCE_2',
 'DAYS_BIRTH',
 'AMT_ANNUITY',
 'DAYS_EMPLOYED',
 'APPROVED_CNT_PAYMENT_MEAN',
 'DAYS_ID_PUBLISH',
 'ACTIVE_DAYS_CREDIT_MAX']

In [35]:
# Columns excluded from the predictors (target, identifiers, leftover indexes)
not_predictors = [
    "TARGET",
    "SK_ID_CURR",
    "SK_ID_BUREAU",
    "SK_ID_PREV",
    "index",
    "level_0",
]
# NOTE(review): predictors follow the order of data.columns, not the order of
# sorted_features_by_importance — confirm this is intended before truncating
# the list to the first N features.
predictors = [col for col in data.columns if col not in not_predictors]

In [36]:
len(predictors)

792

In [37]:
config["n_predictors"] = 400

In [38]:
# Keep the first config["n_predictors"] entries of the predictor list
n_keep = config["n_predictors"]
predictors = predictors[:n_keep]
print("Nombre de features :", len(predictors))
X_train = data[predictors]
y_train = data["TARGET"]

# Columns stored with the "object" dtype are declared as categorical
categorical_features = [
    col_name for col_name, dtype in X_train.dtypes.items() if dtype == "object"
]

# Build the LightGBM dataset and construct it immediately
train_set = lgb.Dataset(
    X_train,
    label=y_train,
    feature_name=predictors,
    categorical_feature=categorical_features,
    free_raw_data=True,
).construct()

Nombre de features : 400


In [14]:
train_set.get_feature_name()

['CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL']

In [15]:
# from src.p7_evaluate import lgb_cross_evaluate

In [16]:
def lgb_single_objective(lgb_dataset, config=CONFIG_SEARCH):
    """Build the Optuna objective of a single-objective LightGBM search.

    Parameters
    ----------
    lgb_dataset
        Training data passed unchanged to ``lgb_cross_evaluate``
        (presumably a constructed ``lgb.Dataset`` — confirm against the caller).
    config : dict, optional
        Search configuration. Kept for interface compatibility; the objective
        itself does not read it.

    Returns
    -------
    callable
        ``_objective(trial)``: samples the hyper-parameters, cross-validates
        them, logs the last value of every CV series and the parameters to a
        nested MLflow run, and returns the last value of
        ``"valid weighted_recall-mean"``.
    """

    def _objective(trial):
        # One nested MLflow run per Optuna trial
        child_run_name = f"N_{trial.number}_T_{trial._trial_id}"
        with mlflow.start_run(run_name=child_run_name, nested=True):
            # Hyper-parameter search space plus the fixed training settings
            params = {
                "boosting_type": trial.suggest_categorical(
                    "boosting_type", ["dart", "gbdt"]
                ),
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 2, 256),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
                "learning_rate": trial.suggest_float(
                    "learning_rate", 0.0001, 0.5, log=True
                ),
                "max_bin": trial.suggest_int("max_bin", 128, 512, step=32),
                "n_estimators": trial.suggest_int("n_estimators", 40, 400, step=20),
                "objective": "binary",
                "metric": ["binary_logloss", "auc"],
                "num_threads": NUM_THREADS,
                "verbose": -1,
            }

            # LightGBM does not support early stopping in DART mode
            early_stopping = params["boosting_type"] != "dart"

            cv_results = lgb_cross_evaluate(
                lgb_dataset, params, early_stopping=early_stopping, verbose=False
            )
            # Log the final value of each cross-validation series
            for metric_name, history in cv_results.items():
                mlflow.log_metric(metric_name, history[-1])
            mlflow.log_params(params)
        # NOTE(review): the optimised value is always the weighted recall,
        # whatever config["metric"] says — confirm this is intended.
        return cv_results["valid weighted_recall-mean"][-1]

    return _objective

In [17]:
# from src.p7_hyper_param import build_experiment, build_parent_run_name

In [18]:
def lgb_single_search(train_df, experiment_name=None, config=CONFIG_SEARCH):
    """Run a single-objective Optuna search and track it with MLflow.

    A parent MLflow run is opened in the experiment returned by
    ``build_experiment``; the Optuna study is optimized inside it, then the
    best parameters, the best score and two HTML figures are logged.

    Parameters
    ----------
    train_df :
        Training data. Only ``train_df.shape[0]`` is read here; the object is
        forwarded unchanged to ``lgb_single_objective`` as ``lgb_dataset``.
    experiment_name : str or None
        Forwarded to ``build_experiment``.
    config : dict
        Search configuration. Keys read in this function: ``"n_trials"``,
        ``"model_dir"``, ``"subdir"`` and ``"metric"``. The whole dict is also
        forwarded to ``build_experiment``, ``build_parent_run_name`` and
        ``lgb_single_objective``.

    Returns
    -------
    optuna.study.Study
        The optimized study (kept in memory, no storage backend is passed).
    """
    with timer("Optimize hyperparameters"):
        # TPE (Tree-structured Parzen Estimator) proposes the hyperparameter
        # values tried by each trial.
        sampler = optuna.samplers.TPESampler()

        # Hyperband gave the best results in Optuna's own benchmarks, see
        # https://github.com/optuna/optuna/wiki/Benchmarks-with-Kurobako
        # reduction_factor controls how many trials are promoted at each
        # halving round.
        # NOTE(review): a pruner only acts when the objective reports
        # intermediate values — confirm that the objective does so.
        pruner = optuna.pruners.HyperbandPruner(
            min_resource=10, max_resource=400, reduction_factor=3
        )

        # Activate the experiment.
        n_rows = train_df.shape[0]
        experiment_id = build_experiment(
            n_rows=n_rows, experiment_name=experiment_name, config=config
        )
        experiment_metadata = mlflow.set_experiment(experiment_id=experiment_id)
        print(f"Experience '{experiment_metadata.name}' activée")

        # Name and description of the parent MLflow run.
        parent_run_name, parent_run_description = build_parent_run_name(config=config)

        try:
            with mlflow.start_run(
                experiment_id=experiment_id, run_name=parent_run_name, nested=True
            ):
                # Run description shown in the MLflow UI.
                mlflow.set_tag("mlflow.note.content", parent_run_description)

                study = optuna.create_study(
                    direction="maximize",
                    sampler=sampler,
                    pruner=pruner,
                    study_name=f"study_{experiment_metadata.name}",
                )

                # gc_after_trial runs the garbage collector after each trial.
                # NOTE(review): n_jobs=NUM_THREADS runs trials concurrently;
                # if each trial also trains with NUM_THREADS threads the CPU
                # may be oversubscribed — confirm this is intended.
                study.optimize(
                    lgb_single_objective(lgb_dataset=train_df, config=config),
                    n_trials=config["n_trials"],
                    gc_after_trial=True,
                    n_jobs=NUM_THREADS,
                )

                best_params = study.best_trial.params
                best_score = study.best_trial.value
                # Logged once: the original logged the same params twice.
                mlflow.log_params(best_params)
                mlflow.log_metric(config["metric"], best_score)

                # Make sure the output directory exists before writing figures.
                im_dir = os.path.join(config["model_dir"], config["subdir"])
                os.makedirs(im_dir, exist_ok=True)
                parallel_path = os.path.join(
                    im_dir, "single_parallel_coordinates.html"
                )
                importance_path = os.path.join(
                    im_dir, "single_hyperparam_importance.html"
                )

                fig = optuna.visualization.plot_parallel_coordinate(
                    study,
                    params=[
                        "boosting_type",
                        "num_leaves",
                        "learning_rate",
                        "n_estimators",
                    ],
                )
                fig.write_html(parallel_path)
                fig = optuna.visualization.plot_param_importances(study)
                fig.write_html(importance_path)

                mlflow.log_artifact(parallel_path)
                mlflow.log_artifact(importance_path)
        finally:
            # Force MLflow to close the run even when an error occurred inside.
            mlflow.end_run()
    return study

In [31]:
# Start from the project-wide search configuration and display it.
# Plain assignment, not a copy: later in-place edits to `config` also modify
# CONFIG_SEARCH.
config = CONFIG_SEARCH
config

{'model_dir': 'models/',
 'model_type': 'lightgbm',
 'subdir': 'light_simple/',
 'data_dir': 'data/interim/',
 'train_filename': 'all_data_simple_kernel_ohe.csv',
 'feature_importance_filename': 'feature_importance.csv',
 'n_predictors': 400,
 'moo_objective': False,
 'metric': 'weighted_recall',
 'n_trials': 40}

In [20]:
# Number of Optuna trials for the next search (edits the config in place).
config["n_trials"] = 40

In [32]:
# None is forwarded as `experiment_name` to the search call in a later cell.
experiment_name = None

In [33]:
# Sanity check: display the (rows, columns) shape of the predictor columns.
data[predictors].shape

(307507, 20)

In [39]:
# Launch the search on the selected predictors plus the TARGET column.
# NOTE(review): `sk_single_search` is not defined in the visible cells (the
# function defined in this section is `lgb_single_search`) — confirm where it
# comes from before re-running on a fresh kernel.
study = sk_single_search(
    data[predictors + ["TARGET"]], experiment_name=experiment_name, config=config
)

[I 2024-05-10 10:59:57,484] A new study created in memory with name: study_light_simple_400x307507_40trials


experiment_name light_simple_400x307507_40trials
Création de l'expérience 'light_simple_400x307507_40trials'
Experience 'light_simple_400x307507_40trials' activée


[I 2024-05-10 11:00:58,041] Trial 10 finished with value: 0.9192701304309328 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.010236245979906347, 'lambda_l2': 0.012472878005614916, 'num_leaves': 142, 'feature_fraction': 0.9995589586074928, 'bagging_fraction': 0.51568527166329, 'bagging_freq': 3, 'min_child_samples': 38, 'learning_rate': 0.011661606594194181, 'max_bin': 224, 'n_estimators': 40}. Best is trial 10 with value: 0.9192701304309328.
[I 2024-05-10 11:04:28,092] Trial 9 finished with value: 1.0 and parameters: {'boosting_type': 'dart', 'lambda_l1': 7.345055931263538, 'lambda_l2': 0.0022676098344615305, 'num_leaves': 57, 'feature_fraction': 0.6264447572786682, 'bagging_fraction': 0.9623764482744351, 'bagging_freq': 3, 'min_child_samples': 85, 'learning_rate': 0.07485224422944901, 'max_bin': 128, 'n_estimators': 140}. Best is trial 9 with value: 1.0.
[I 2024-05-10 11:04:36,227] Trial 2 finished with value: 0.9192701304309328 and parameters: {'boosting_type': 'dart', 'lamb

Optimize hyperparameters - duration (hh:mm:ss) : 0:45:46


In [24]:
# Use the fluent API to set the tracking uri and the active experiment
# NOTE(review): the import cells at the top of the notebook bring in
# LOCAL_HOST and PORT_MLFLOW, not LOCAL_PORT — confirm that LOCAL_PORT is
# still defined on a fresh kernel.
mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")

In [24]:
# study = lgb_single_search(data[predictors + ['TARGET']], experiment_name=experiment_name, config=config)

## Run A Study

In [15]:
"""
Démarrer un serveur mlflow local en ligne de commande :
mlflow server --host 127.0.0.1 --port 8080
"""

'\nDémarrer un serveur mlflow local en ligne de commande :\nmlflow server --host 127.0.0.1 --port 8080\n'

In [16]:
"""# Use the fluent API to set the tracking uri and the active experiment
mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")"""

'# Use the fluent API to set the tracking uri and the active experiment\nmlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")'

In [17]:
# Same search, relying on the default experiment name and default config.
# NOTE(review): `sk_single_search` is not defined in the visible cells —
# confirm its origin.
study = sk_single_search(data[predictors + ["TARGET"]])

experiment_name light_simple_20x307507_100trials
Création de l'expérience 'light_simple_20x307507_100trials'
Experience 'light_simple_20x307507_100trials' activée


[I 2024-05-06 23:05:50,403] A new study created in memory with name: study_light_simple_20x307507_100trials
[I 2024-05-06 23:06:04,928] Trial 18 finished with value: 1.0 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 7.064863870788589, 'lambda_l2': 3.9451936584622644e-06, 'num_leaves': 174, 'feature_fraction': 0.8429258549197521, 'bagging_fraction': 0.7719731044862875, 'bagging_freq': 5, 'min_child_samples': 100, 'learning_rate': 0.3496027616599878, 'max_bin': 480, 'n_estimators': 80}. Best is trial 18 with value: 1.0.
[I 2024-05-06 23:06:05,750] Trial 8 finished with value: 0.9192701304309328 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.0018216133950195634, 'lambda_l2': 0.07680533717405039, 'num_leaves': 139, 'feature_fraction': 0.9342887198519151, 'bagging_fraction': 0.8859913125362429, 'bagging_freq': 3, 'min_child_samples': 10, 'learning_rate': 0.00010223793692724258, 'max_bin': 416, 'n_estimators': 40}. Best is trial 18 with value: 1.0.
[I 2024-05-06 23:06:32,6

Optimize hyperparameters - duration (hh:mm:ss) : 0:15:43


In [94]:
# End the currently active MLflow run, if any.
mlflow.end_run()

In [20]:
# Show the best trial, then list its hyperparameters one per line.
# NOTE(review): the recorded best value is exactly 1.0, which looks
# suspiciously perfect — verify the metric computation.
print(study.best_trial)
best_params = study.best_trial.params
for k, v in best_params.items():
    print(f"{k} : {v}")

FrozenTrial(number=18, state=TrialState.COMPLETE, values=[1.0], datetime_start=datetime.datetime(2024, 5, 6, 23, 5, 50, 415841), datetime_complete=datetime.datetime(2024, 5, 6, 23, 6, 4, 928118), params={'boosting_type': 'gbdt', 'lambda_l1': 7.064863870788589, 'lambda_l2': 3.9451936584622644e-06, 'num_leaves': 174, 'feature_fraction': 0.8429258549197521, 'bagging_fraction': 0.7719731044862875, 'bagging_freq': 5, 'min_child_samples': 100, 'learning_rate': 0.3496027616599878, 'max_bin': 480, 'n_estimators': 80}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'boosting_type': CategoricalDistribution(choices=('dart', 'gbdt')), 'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistribution(high=1.0, 

In [22]:
# Persist the study object to disk under MODEL_DIR/<config["subdir"]>.
# The file name is hard-coded and is not derived from the study name.
joblib.dump(
    study,
    os.path.join(
        MODEL_DIR, config["subdir"], "study_light_simple_20x307507_100trials.pkl"
    ),
)

['models/light_simple/study_light_simple_20x307507_100trials.pkl']

# Avec 100 predictors

In [None]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)
# See the FAQ on objective functions that take their own arguments (as used with optimize): https://optuna.readthedocs.io/en/stable/faq.html#how-to-define-objective-functions-that-have-own-arguments

In [23]:
# Switch the configuration to 100 predictors (edits the config in place).
config["n_predictors"] = 100

In [25]:
# Use the fluent API to set the tracking uri and the active experiment
# NOTE(review): the import cells at the top of the notebook bring in
# LOCAL_HOST and PORT_MLFLOW, not LOCAL_PORT — confirm that LOCAL_PORT is
# still defined on a fresh kernel.
mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")

In [26]:
# Search with the 100-predictor configuration.
# NOTE(review): neither `lgbm_single_search` nor `train` is defined in the
# visible cells (other cells use `lgb_single_search` / `sk_single_search` and
# `data`) — confirm. The recorded run ended with an MlflowException
# (log-batch request to the tracking server failed with HTTP 500 responses).
study = lgbm_single_search(train[predictors + ["TARGET"]], config=config)

[I 2024-05-07 01:12:26,176] A new study created in memory with name: study_light_simple_100x307507_100trials


experiment_name light_simple_100x307507_100trials
Création de l'expérience 'light_simple_100x307507_100trials'
Experience 'light_simple_100x307507_100trials' activée


[I 2024-05-07 01:13:00,351] Trial 11 finished with value: 1.0 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 2.935980124674102, 'lambda_l2': 0.027376404563782868, 'num_leaves': 91, 'feature_fraction': 0.9404956710315201, 'bagging_fraction': 0.5041372749174218, 'bagging_freq': 5, 'min_child_samples': 67, 'learning_rate': 0.08092938441307712, 'max_bin': 448, 'n_estimators': 200}. Best is trial 11 with value: 1.0.
[I 2024-05-07 01:13:02,629] Trial 4 finished with value: 1.0 and parameters: {'boosting_type': 'dart', 'lambda_l1': 2.532785225351618, 'lambda_l2': 3.463972231226103e-08, 'num_leaves': 140, 'feature_fraction': 0.8186779378093619, 'bagging_fraction': 0.8365114890964351, 'bagging_freq': 5, 'min_child_samples': 29, 'learning_rate': 0.045264340268737394, 'max_bin': 416, 'n_estimators': 40}. Best is trial 11 with value: 1.0.
[I 2024-05-07 01:13:13,615] Trial 26 finished with value: 1.0 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 9.150622579971083, 'lambda_l2': 0.00

MlflowException: API request to http://127.0.0.1:8080/api/2.0/mlflow/runs/log-batch failed with exception HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: /api/2.0/mlflow/runs/log-batch (Caused by ResponseError('too many 500 error responses'))

## Understanding Parameters

In [72]:
# Reload a previously saved study from MODEL_DIR/<config["subdir"]>.
# joblib.load unpickles the file: only load files from a trusted source.
study = joblib.load(
    os.path.join(MODEL_DIR, config["subdir"], "opt_lightgbm_single_03.pkl")
)

In [73]:
# Requires plotly and Kaleido to be installed (the kernel must be restarted
# after installing plotly; kaleido version 0.1.0 and a recent nbformat version)
# pip install kaleido==0.1.0
# pip install --upgrade nbformat
# Add the path of kaleido.cmd to the Windows PATH
# Restart everything
fig = optuna.visualization.plot_parallel_coordinate(
    study, params=["boosting_type", "num_leaves", "learning_rate", "n_estimators"]
)
# Pickle the plotly figure to MODEL_DIR, then show its type.
joblib.dump(fig, os.path.join(MODEL_DIR, "fig_plotly.pkl"))
print(type(fig))

<class 'plotly.graph_objs._figure.Figure'>


In [74]:
# Directory where the figures are written.
# NOTE(review): `subdir` is a bare name here, whereas other cells read
# config["subdir"] — confirm it is defined on a fresh kernel.
im_dir = os.path.join(MODEL_DIR, subdir)

In [75]:
# Export the current figure as a PNG at 6x scale.
fig.write_image(
    file=os.path.join(im_dir, "single_parallel_coordinates.png"), format="png", scale=6
)

In [76]:
# Render the figure in the notebook.
fig.show()

In [44]:
# Faster as HTML
im_path = os.path.join(im_dir, "single_parallel_coordinates.html")
fig.write_html(im_path)
fig.show()

In [78]:
# Hyperparameter importance plot for the study: rebinds `fig`, exports it as
# a PNG at 6x scale into im_dir, then renders it in the notebook.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image(
    file=os.path.join(im_dir, "single_hyperparam_importance.png"), format="png", scale=6
)
fig.show()

## Multi-objective optimization

In [22]:
def moo_objective(trial):
    """Two-objective Optuna objective: (learning rate, mean macro-F1).

    Only ``learning_rate`` is sampled (log-uniform in [0.0001, 0.5]); every
    other LightGBM hyperparameter is fixed. The model is scored with
    ``cross_val_score`` on the notebook-level ``X`` and ``y`` using the
    ``"f1_macro"`` scorer.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the learning rate.

    Returns
    -------
    tuple
        ``(learning_rate, mean cross-validated f1_macro)``.
    """
    # Plain float: the original had a stray trailing comma that wrapped the
    # value in a 1-tuple, which was then handed to LGBMClassifier as-is.
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log=True)

    model = lgb.LGBMClassifier(
        force_row_wise=True,
        boosting_type="gbdt",
        n_estimators=200,
        lambda_l1=3.298803078077973e-07,
        lambda_l2=8.938532783741386e-07,
        num_leaves=6,
        feature_fraction=0.5133218336120866,
        bagging_fraction=0.9660809666082303,
        bagging_freq=7,
        min_child_samples=91,
        learning_rate=learning_rate,
        max_bin=320,
        verbose=-1,
    )
    scores = cross_val_score(model, X, y, scoring="f1_macro")
    return learning_rate, scores.mean()

In [23]:
# Multi-objective study: both values returned by moo_objective
# (learning rate, mean macro-F1) are maximized over 100 trials.
study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(moo_objective, n_trials=100)

[32m[I 2023-04-27 10:47:35,501][0m A new study created in memory with name: no-name-eb122531-f2ee-4cb5-92d1-8efba6a28eef[0m
[32m[I 2023-04-27 10:47:36,221][0m Trial 0 finished with values: [0.018276625572235056, 0.7229728382357858] and parameters: {'learning_rate': 0.018276625572235056}. [0m
[32m[I 2023-04-27 10:47:36,922][0m Trial 1 finished with values: [0.012281350875017864, 0.713129498994182] and parameters: {'learning_rate': 0.012281350875017864}. [0m
[32m[I 2023-04-27 10:47:37,601][0m Trial 2 finished with values: [0.3179931543219117, 0.7181352358216049] and parameters: {'learning_rate': 0.3179931543219117}. [0m
[32m[I 2023-04-27 10:47:38,317][0m Trial 3 finished with values: [0.3317048028124761, 0.7182462105095208] and parameters: {'learning_rate': 0.3317048028124761}. [0m
[32m[I 2023-04-27 10:47:39,042][0m Trial 4 finished with values: [0.010097499163932589, 0.689595604293249] and parameters: {'learning_rate': 0.010097499163932589}. [0m
[32m[I 2023-04-27 10:4

In [24]:
# Pareto front of the two objectives, with axis labels in the same order as
# the values returned by the objective.
fig = optuna.visualization.plot_pareto_front(
    study, target_names=["learning_rate", "f1"]
)
# NOTE(review): hard-coded path relative to the working directory; presumably
# the "figures/" folder must already exist — confirm.
fig.write_image(file="figures/ch5_pareto.png", format="png", scale=6)
fig.show()