In [1]:
# On affiche les graphiques dans le notebook en statique
%matplotlib inline

In [2]:
import numpy as np
import logging
import re
import os
import gc
import joblib
import lightgbm as lgb
import optuna
import plotly
import kaleido
import mlflow

import pandas as pd
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, make_scorer, f1_score

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from src.p7_constantes import DATA_INTERIM, DATA_BASE, MODEL_DIR
from src.p7_constantes import LOCAL_HOST, LOCAL_PORT
from src.p7_constantes import NUM_THREADS
from src.p7_simple_kernel import reduce_memory, get_memory_consumed, get_available_memory, get_batch_size
from src.p7_hyper_param import CONFIG_SEARCH
from src.p7_hyper_param import sk_single_search, build_experiment, build_parent_run_name, get_train, get_sorted_features_by_importance
from src.p7_evaluate import lgb_cross_evaluate
from src.p7_util import timer, clean_ram
from src.p7_regex import sel_var

Démarrer

In [5]:
config = CONFIG_SEARCH
for k, v in config.items():
    print(f"{k} : {v}")

model_dir : models/
model_type : lightgbm
subdir : light_simple/
data_dir : data/interim/
train_filename : train.csv
feature_importance_filename : feature_importance.csv
n_predictors : 20
moo_objective : False
metric : weighted_recall
n_trials : 40


# Essai avec toutes les données

In [6]:
config['metric'] = 'auc'

In [7]:
data = get_train()
print("Forme de data :", data.shape)
data.head()

0 variables à inclure correspondant au motif 'Unnamed' : []
0 variables à exclure correspondant au motif 'None' : []
0 variables sélectionnées : []
Forme de train.csv : (246008, 790)
Forme de data : (246008, 790)


Unnamed: 0,SK_ID_CURR,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CC_NAME_CONTRACT_STATUS_Sentproposal_SUM,CC_NAME_CONTRACT_STATUS_Sentproposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT,TARGET
0,450053,0.0,0,0,0,225000.0,270000.0,13500.0,270000.0,0.018029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,0
1,420697,1.0,0,0,0,99000.0,254700.0,17019.0,225000.0,0.020246,...,,,,,,,,,,0
2,338811,0.0,0,0,0,157500.0,450000.0,22500.0,450000.0,0.022625,...,,,,,,,,,,0
3,303494,0.0,1,1,0,225000.0,1372500.0,40131.0,1372500.0,0.011703,...,,,,,,,,,,0
4,153319,1.0,0,1,0,292500.0,314055.0,13963.5,238500.0,0.00823,...,,,,,,,,,,0


In [8]:
sorted_features = get_sorted_features_by_importance(config=config)

In [9]:
"""not_predictors = [
    'TARGET',
    "SK_ID_CURR",
    "SK_ID_BUREAU",
    "SK_ID_PREV",
    "index",
    "level_0",
    ]
predictors = list(filter(lambda v: v not in not_predictors, data.columns))
len(predictors)"""

'not_predictors = [\n    \'TARGET\',\n    "SK_ID_CURR",\n    "SK_ID_BUREAU",\n    "SK_ID_PREV",\n    "index",\n    "level_0",\n    ]\npredictors = list(filter(lambda v: v not in not_predictors, data.columns))\nlen(predictors)'

In [10]:
predictors = joblib.load(os.path.join(DATA_INTERIM, "features_sorted_by_importance.pkl"))

In [11]:
print("Nombre de features :", len(predictors))
X, y = (
    data[predictors],
    data["TARGET"],
)

#categorical_features = list(X.select_dtypes(include="object").columns)
#categorical_features

Nombre de features : 788


In [12]:
X.shape

(30000, 50)

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 50 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   PAYMENT_RATE                          29998 non-null  float64
 1   EXT_SOURCE_3                          23957 non-null  float64
 2   EXT_SOURCE_1                          13029 non-null  float64
 3   EXT_SOURCE_2                          29928 non-null  float64
 4   DAYS_BIRTH                            30000 non-null  int64  
 5   DAYS_EMPLOYED                         24582 non-null  float64
 6   AMT_ANNUITY                           29998 non-null  float64
 7   APPROVED_CNT_PAYMENT_MEAN             28284 non-null  float64
 8   DAYS_ID_PUBLISH                       30000 non-null  int64  
 9   ACTIVE_DAYS_CREDIT_MAX                21051 non-null  float64
 10  INSTAL_DPD_MEAN                       28431 non-null  float64
 11  INSTAL_DAYS_ENT

In [14]:
#from src.p7_simple_kernel import reduce_memory

In [15]:
X_mem = reduce_memory(X)
X_mem.info()

Memory usage of dataframe is 11.44 MB
Memory usage after optimization is: 11.44 MB
Decreased by 0.0%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 50 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   PAYMENT_RATE                          29998 non-null  float64
 1   EXT_SOURCE_3                          23957 non-null  float64
 2   EXT_SOURCE_1                          13029 non-null  float64
 3   EXT_SOURCE_2                          29928 non-null  float64
 4   DAYS_BIRTH                            30000 non-null  int64  
 5   DAYS_EMPLOYED                         24582 non-null  float64
 6   AMT_ANNUITY                           29998 non-null  float64
 7   APPROVED_CNT_PAYMENT_MEAN             28284 non-null  float64
 8   DAYS_ID_PUBLISH                       30000 non-null  int64  
 9   ACTIVE_DAYS_CREDIT_MAX                21051 non-

In [16]:
del X_mem
del data
gc.collect()

0

In [17]:
config['n_trials'] = 40

In [19]:

study = sk_single_search(X, y, experiment_name="huge_by_10_trials", num="010", by_10_trials=True, config=config)

[I 2024-05-14 12:27:43,778] A new study created in memory with name: study_by_10_2


Création de l'expérience 'by_10_2'
Experience 'by_10_2' activée


[I 2024-05-14 12:27:56,536] Trial 5 finished with value: 0.7386595314843186 and parameters: {'boosting_type': 'dart', 'lambda_l1': 3.7698602730548696e-05, 'lambda_l2': 0.0038733052326654035, 'num_leaves': 75, 'feature_fraction': 0.4623463454273016, 'bagging_fraction': 0.4296779680977219, 'bagging_freq': 4, 'min_child_samples': 61, 'learning_rate': 0.007173835888078745, 'max_bin': 416, 'n_estimators': 60}. Best is trial 5 with value: 0.7386595314843186.
[I 2024-05-14 12:28:09,327] Trial 1 finished with value: 0.7219273882437498 and parameters: {'boosting_type': 'dart', 'lambda_l1': 4.043060109284792e-06, 'lambda_l2': 0.3600007549071652, 'num_leaves': 38, 'feature_fraction': 0.9726584657663307, 'bagging_fraction': 0.6587140645438139, 'bagging_freq': 1, 'min_child_samples': 31, 'learning_rate': 0.0001835966957922136, 'max_bin': 320, 'n_estimators': 160}. Best is trial 5 with value: 0.7386595314843186.
[I 2024-05-14 12:28:11,094] Trial 7 finished with value: 0.7363304012334634 and paramete

Optimize hyperparameters - duration (hh:mm:ss) : 0:03:27


# Vieux

In [26]:
df = pd.read_csv(os.path.join(DATA_INTERIM, "all_data_simple_kernel_ohe.csv"))

to_drop = sel_var(df.columns, 'Unnamed')
if to_drop:
    df = df.drop(to_drop, axis=1)

df = df.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x))
    
data = df[df['TARGET'].notnull()]

del df
gc.collect()

print("Forme de data avec Target :", data.shape)
data.head()

0 variables à inclure correspondant au motif 'Unnamed' : []
0 variables à exclure correspondant au motif 'None' : []
0 variables sélectionnées : []
Forme de data avec Target : (307507, 794)


Unnamed: 0,SK_ID_CURR,TARGET,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,...,CC_NAME_CONTRACT_STATUS_Sentproposal_MEAN,CC_NAME_CONTRACT_STATUS_Sentproposal_SUM,CC_NAME_CONTRACT_STATUS_Sentproposal_VAR,CC_NAME_CONTRACT_STATUS_Signed_MEAN,CC_NAME_CONTRACT_STATUS_Signed_SUM,CC_NAME_CONTRACT_STATUS_Signed_VAR,CC_NAME_CONTRACT_STATUS_nan_MEAN,CC_NAME_CONTRACT_STATUS_nan_SUM,CC_NAME_CONTRACT_STATUS_nan_VAR,CC_COUNT
0,100002,1.0,0,0,0,0,202500.0,406597.5,24700.5,351000.0,...,,,,,,,,,,
1,100003,0.0,1,0,1,0,270000.0,1293502.5,35698.5,1129500.0,...,,,,,,,,,,
2,100004,0.0,0,1,0,0,67500.0,135000.0,6750.0,135000.0,...,,,,,,,,,,
3,100006,0.0,1,0,0,0,135000.0,312682.5,29686.5,297000.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
4,100007,0.0,0,0,0,0,121500.0,513000.0,21865.5,513000.0,...,,,,,,,,,,


In [8]:
#from src.p7_util import clean_ram

In [9]:
dic_local = locals()
to_del = ['to_drop', 'selected_features']
clean_ram(to_del, dic_local)

1 variables détruites : ['to_drop']


Récupération des features par ordre d'importance

In [27]:
sorted_features_by_importance = pd.read_csv(os.path.join(config['model_dir'], config['subdir'], config['feature_importance_filename'])).set_index("feature").index.tolist()
print(len(sorted_features_by_importance))
sorted_features_by_importance[:10]

792


['PAYMENT_RATE',
 'EXT_SOURCE_1',
 'EXT_SOURCE_3',
 'EXT_SOURCE_2',
 'DAYS_BIRTH',
 'AMT_ANNUITY',
 'DAYS_EMPLOYED',
 'APPROVED_CNT_PAYMENT_MEAN',
 'DAYS_ID_PUBLISH',
 'ACTIVE_DAYS_CREDIT_MAX']

In [35]:
not_predictors = [
    'TARGET',
    "SK_ID_CURR",
    "SK_ID_BUREAU",
    "SK_ID_PREV",
    "index",
    "level_0",
    ]
predictors = list(filter(lambda v: v not in not_predictors, data.columns))

In [36]:
len(predictors)

792

In [37]:
config['n_predictors'] = 400

In [38]:
predictors = predictors[:config['n_predictors']]
print("Nombre de features :", len(predictors))
X_train, y_train = (
    data[predictors],
    data["TARGET"],
)

categorical_features = list(X_train.loc[:, X_train.dtypes == "object"].columns.values)

# Conversion en dataset lgbm
train_set = lgb.Dataset(
    X_train,
    y_train,
    feature_name=predictors,
    categorical_feature=categorical_features,
    free_raw_data=True,
).construct()

Nombre de features : 400


In [14]:
train_set.get_feature_name()

['CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'CNT_CHILDREN',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'DAYS_BIRTH',
 'DAYS_EMPLOYED',
 'DAYS_REGISTRATION',
 'DAYS_ID_PUBLISH',
 'OWN_CAR_AGE',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL']

In [15]:
#from src.p7_evaluate import lgb_cross_evaluate

In [16]:
def lgb_single_objective(lgb_dataset, config=CONFIG_SEARCH):
    def _objective(trial):
        parent_run_name, parent_run_description = build_parent_run_name(config=config)
        child_run_name = f"N_{trial.number}_T_{trial._trial_id}"
        with mlflow.start_run(run_name=child_run_name, nested=True):
            # On définit les hyperparamètres
            params = {
                "boosting_type": trial.suggest_categorical(
                    "boosting_type", ["dart", "gbdt"]
                ),
                "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
                "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
                "num_leaves": trial.suggest_int("num_leaves", 2, 256),
                "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
                "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
                "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
                "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
                "learning_rate": trial.suggest_float(
                    "learning_rate", 0.0001, 0.5, log=True
                ),
                "max_bin": trial.suggest_int("max_bin", 128, 512, step=32),
                "n_estimators": trial.suggest_int("n_estimators", 40, 400, step=20),
                
                "objective": "binary",
                
                "metric": ["binary_logloss", "auc"],
                "num_threads":NUM_THREADS,
                "verbose": -1,
            }
            
            if params['boosting_type'] == 'dart':
                early_stopping=False
            else:
                early_stopping=True
                
            other_params = {}
            cv_results = lgb_cross_evaluate(lgb_dataset, params, early_stopping=early_stopping, verbose=False)
            for k in cv_results.keys():
                mlflow.log_metric(k, cv_results[k][-1])
            mlflow.log_params(params)
        return cv_results['valid weighted_recall-mean'][-1]
    return _objective


In [17]:
#from src.p7_hyper_param import build_experiment, build_parent_run_name

In [18]:
def lgb_single_search(train_df, experiment_name=None, config=CONFIG_SEARCH):
    # Use the fluent API to set the tracking uri and the active experiment
    #mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")

    with timer("Optimize hyperparameters"):
        # Utilise l'algorithme d'optimisation TPE (Tree-structured Parzen Estimator) comme méthode d'échantillonnage
        # Il s'agit de l'algo qui génère les valeurs des hyperparams lors de chaque essai d'optimisation
        # Ici il est utilisé en conjonction avec le pruning Hyperband
        # => Le sampler choisit les params à essayer, le pruner les arrête prématurément si non performants
        sampler = optuna.samplers.TPESampler()

        # Optuna a réalisé plusieurs études empiriques avec différents algorithmes de pruning.
        # Empiriquement, l'algorithme Hyperband a donné les meilleurs résultats
        # Voir : https://github.com/optuna/optuna/wiki/Benchmarks-with-Kurobako
        # reduction_factor contrôle combien de trials sont proposés dans chaque Halving Round
        pruner = optuna.pruners.HyperbandPruner(
            min_resource=10, max_resource=400, reduction_factor=3
        )

        # On active l'expérience
        n_rows = train_df.shape[0]
        experiment_id = build_experiment(
            n_rows=n_rows, experiment_name=experiment_name, config=config
        )
        experiment_metadata = mlflow.set_experiment(experiment_id=experiment_id)
        print(f"Experience '{experiment_metadata.name}' activée")

        # On crèe une run MLFlow
        parent_run_name, parent_run_description = build_parent_run_name(config=config)

        with mlflow.start_run(
            experiment_id=experiment_id, run_name=parent_run_name, nested=True
        ) as run:
            # description du run
            mlflow.set_tag("mlflow.note.content", parent_run_description)

            study = optuna.create_study(
                direction="maximize",
                sampler=sampler,
                pruner=pruner,
                study_name=f"study_{experiment_metadata.name}",
                # storage=os.path.join(MODEL_DIR, subdir)
            )

            # gc appelle le garbage collector après chaque trial
            study.optimize(
                lgb_single_objective(lgb_dataset=train_df, config=config),
                n_trials=config["n_trials"],
                gc_after_trial=True,
                n_jobs=NUM_THREADS,
            )
            
            best_params = study.best_trial.params
            best_score = study.best_trial.value
            mlflow.log_params(best_params)

            fig = optuna.visualization.plot_parallel_coordinate(
                study,
                params=["boosting_type", "num_leaves", "learning_rate", "n_estimators"],
            )
            im_dir = os.path.join(config["model_dir"], config["subdir"])
            # fig.write_image(file=os.path.join(im_dir, "single_parallel_coordinates.png"), format="png", scale=6)
            fig.write_html(os.path.join(im_dir, "single_parallel_coordinates.html"))
            fig = optuna.visualization.plot_param_importances(study)
            # fig.write_image(file=os.path.join(im_dir, "single_hyperparam_importance.png"), format="png", scale=1)
            fig.write_html(os.path.join(im_dir, "single_hyperparam_importance.html"))

            mlflow.log_params(best_params)
            mlflow.log_metric(config["metric"], best_score)
            mlflow.log_artifact(
                os.path.join(im_dir, "single_parallel_coordinates.html")
            )
            mlflow.log_artifact(
                os.path.join(im_dir, "single_hyperparam_importance.html")
            )
            # mlflow.log_artifact(os.path.join(im_dir, "single_hyperparam_importance.png"))
        # Force mlflow à terminer le run même s'il y a une erreur dedans
        mlflow.end_run()
    return study


In [31]:
config = CONFIG_SEARCH
config

{'model_dir': 'models/',
 'model_type': 'lightgbm',
 'subdir': 'light_simple/',
 'data_dir': 'data/interim/',
 'train_filename': 'all_data_simple_kernel_ohe.csv',
 'feature_importance_filename': 'feature_importance.csv',
 'n_predictors': 400,
 'moo_objective': False,
 'metric': 'weighted_recall',
 'n_trials': 40}

In [20]:
config['n_trials'] = 40

In [32]:
experiment_name = None

In [33]:
data[predictors].shape

(307507, 20)

In [39]:
study = sk_single_search(data[predictors + ['TARGET']], experiment_name=experiment_name, config=config)

[I 2024-05-10 10:59:57,484] A new study created in memory with name: study_light_simple_400x307507_40trials


experiment_name light_simple_400x307507_40trials
Création de l'expérience 'light_simple_400x307507_40trials'
Experience 'light_simple_400x307507_40trials' activée


[I 2024-05-10 11:00:58,041] Trial 10 finished with value: 0.9192701304309328 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.010236245979906347, 'lambda_l2': 0.012472878005614916, 'num_leaves': 142, 'feature_fraction': 0.9995589586074928, 'bagging_fraction': 0.51568527166329, 'bagging_freq': 3, 'min_child_samples': 38, 'learning_rate': 0.011661606594194181, 'max_bin': 224, 'n_estimators': 40}. Best is trial 10 with value: 0.9192701304309328.
[I 2024-05-10 11:04:28,092] Trial 9 finished with value: 1.0 and parameters: {'boosting_type': 'dart', 'lambda_l1': 7.345055931263538, 'lambda_l2': 0.0022676098344615305, 'num_leaves': 57, 'feature_fraction': 0.6264447572786682, 'bagging_fraction': 0.9623764482744351, 'bagging_freq': 3, 'min_child_samples': 85, 'learning_rate': 0.07485224422944901, 'max_bin': 128, 'n_estimators': 140}. Best is trial 9 with value: 1.0.
[I 2024-05-10 11:04:36,227] Trial 2 finished with value: 0.9192701304309328 and parameters: {'boosting_type': 'dart', 'lamb

Optimize hyperparameters - duration (hh:mm:ss) : 0:45:46


In [24]:
# Use the fluent API to set the tracking uri and the active experiment
mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")

In [24]:
#study = lgb_single_search(data[predictors + ['TARGET']], experiment_name=experiment_name, config=config)

## Run A Study

In [15]:
"""
Démarrer un serveur mlflow local en ligne de commande :
mlflow server --host 127.0.0.1 --port 8080
"""

'\nDémarrer un serveur mlflow local en ligne de commande :\nmlflow server --host 127.0.0.1 --port 8080\n'

In [16]:
"""# Use the fluent API to set the tracking uri and the active experiment
mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")"""

'# Use the fluent API to set the tracking uri and the active experiment\nmlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")'

In [17]:
study = sk_single_search(data[predictors + ['TARGET']])

experiment_name light_simple_20x307507_100trials
Création de l'expérience 'light_simple_20x307507_100trials'
Experience 'light_simple_20x307507_100trials' activée


[I 2024-05-06 23:05:50,403] A new study created in memory with name: study_light_simple_20x307507_100trials
[I 2024-05-06 23:06:04,928] Trial 18 finished with value: 1.0 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 7.064863870788589, 'lambda_l2': 3.9451936584622644e-06, 'num_leaves': 174, 'feature_fraction': 0.8429258549197521, 'bagging_fraction': 0.7719731044862875, 'bagging_freq': 5, 'min_child_samples': 100, 'learning_rate': 0.3496027616599878, 'max_bin': 480, 'n_estimators': 80}. Best is trial 18 with value: 1.0.
[I 2024-05-06 23:06:05,750] Trial 8 finished with value: 0.9192701304309328 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 0.0018216133950195634, 'lambda_l2': 0.07680533717405039, 'num_leaves': 139, 'feature_fraction': 0.9342887198519151, 'bagging_fraction': 0.8859913125362429, 'bagging_freq': 3, 'min_child_samples': 10, 'learning_rate': 0.00010223793692724258, 'max_bin': 416, 'n_estimators': 40}. Best is trial 18 with value: 1.0.
[I 2024-05-06 23:06:32,6

Optimize hyperparameters - duration (hh:mm:ss) : 0:15:43


In [94]:
mlflow.end_run()

In [20]:
print(study.best_trial)
best_params = study.best_trial.params
for k, v in best_params.items():
    print(f"{k} : {v}")

FrozenTrial(number=18, state=TrialState.COMPLETE, values=[1.0], datetime_start=datetime.datetime(2024, 5, 6, 23, 5, 50, 415841), datetime_complete=datetime.datetime(2024, 5, 6, 23, 6, 4, 928118), params={'boosting_type': 'gbdt', 'lambda_l1': 7.064863870788589, 'lambda_l2': 3.9451936584622644e-06, 'num_leaves': 174, 'feature_fraction': 0.8429258549197521, 'bagging_fraction': 0.7719731044862875, 'bagging_freq': 5, 'min_child_samples': 100, 'learning_rate': 0.3496027616599878, 'max_bin': 480, 'n_estimators': 80}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'boosting_type': CategoricalDistribution(choices=('dart', 'gbdt')), 'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistribution(high=1.0, 

In [22]:
joblib.dump(study, os.path.join(MODEL_DIR, config['subdir'], "study_light_simple_20x307507_100trials.pkl"))

['models/light_simple/study_light_simple_20x307507_100trials.pkl']

# Avec 100 predictors

In [None]:
# override Optuna's default logging to ERROR only
optuna.logging.set_verbosity(optuna.logging.ERROR)
# voir la faq pour les args dans optimize : https://optuna.readthedocs.io/en/stable/faq.html#how-to-define-objective-functions-that-have-own-arguments

In [23]:
config['n_predictors'] = 100

In [25]:
# Use the fluent API to set the tracking uri and the active experiment
mlflow.set_tracking_uri(f"{LOCAL_HOST}:{LOCAL_PORT}")

In [26]:
study = lgbm_single_search(train[predictors + ['TARGET']], config=config)

[I 2024-05-07 01:12:26,176] A new study created in memory with name: study_light_simple_100x307507_100trials


experiment_name light_simple_100x307507_100trials
Création de l'expérience 'light_simple_100x307507_100trials'
Experience 'light_simple_100x307507_100trials' activée


[I 2024-05-07 01:13:00,351] Trial 11 finished with value: 1.0 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 2.935980124674102, 'lambda_l2': 0.027376404563782868, 'num_leaves': 91, 'feature_fraction': 0.9404956710315201, 'bagging_fraction': 0.5041372749174218, 'bagging_freq': 5, 'min_child_samples': 67, 'learning_rate': 0.08092938441307712, 'max_bin': 448, 'n_estimators': 200}. Best is trial 11 with value: 1.0.
[I 2024-05-07 01:13:02,629] Trial 4 finished with value: 1.0 and parameters: {'boosting_type': 'dart', 'lambda_l1': 2.532785225351618, 'lambda_l2': 3.463972231226103e-08, 'num_leaves': 140, 'feature_fraction': 0.8186779378093619, 'bagging_fraction': 0.8365114890964351, 'bagging_freq': 5, 'min_child_samples': 29, 'learning_rate': 0.045264340268737394, 'max_bin': 416, 'n_estimators': 40}. Best is trial 11 with value: 1.0.
[I 2024-05-07 01:13:13,615] Trial 26 finished with value: 1.0 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 9.150622579971083, 'lambda_l2': 0.00

MlflowException: API request to http://127.0.0.1:8080/api/2.0/mlflow/runs/log-batch failed with exception HTTPConnectionPool(host='127.0.0.1', port=8080): Max retries exceeded with url: /api/2.0/mlflow/runs/log-batch (Caused by ResponseError('too many 500 error responses'))

## Understanding Parameters

In [72]:
study = joblib.load(os.path.join(MODEL_DIR, config['subdir'], "opt_lightgbm_single_03.pkl"))

In [73]:
# Nécessite l'installation de plotly et de Kaleido (redémarrer le kernel impérativement après plotly, kaleido version 0.1.0 et nformat version récente)
# pip install kaleido==0.1.0
# pip install --upgrade nbformat
# Ajouter le path de kaleido.cmd au PATH windows
# Tout redémarrer
fig = optuna.visualization.plot_parallel_coordinate(study, params=["boosting_type", "num_leaves", "learning_rate", "n_estimators"])
joblib.dump(fig, os.path.join(MODEL_DIR,'fig_plotly.pkl'))
print(type(fig))

<class 'plotly.graph_objs._figure.Figure'>


In [74]:
im_dir = os.path.join(MODEL_DIR, subdir)

In [75]:
fig.write_image(file=os.path.join(im_dir, "single_parallel_coordinates.png"), format="png", scale=6)

In [76]:
fig.show()

In [44]:
# Plus rapide en html
im_path = os.path.join(im_dir, "single_parallel_coordinates.html")
fig.write_html(im_path)
fig.show()

In [78]:
fig = optuna.visualization.plot_param_importances(study)
fig.write_image(file=os.path.join(im_dir, "single_hyperparam_importance.png"), format="png", scale=6)
fig.show()

## Multi-objective optimization

In [22]:
def moo_objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log=True),

    model = lgb.LGBMClassifier(
        force_row_wise=True,
        boosting_type='gbdt',
        n_estimators=200,
        lambda_l1=3.298803078077973e-07,
        lambda_l2=8.938532783741386e-07,
        num_leaves=6,
        feature_fraction=0.5133218336120866,
        bagging_fraction=0.9660809666082303,
        bagging_freq=7,
        min_child_samples=91,
        learning_rate=learning_rate,
        max_bin=320,
        verbose=-1,
    )
    scores = cross_val_score(model, X, y, scoring="f1_macro")
    return learning_rate[0], scores.mean()

In [23]:
study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(moo_objective, n_trials=100)

[32m[I 2023-04-27 10:47:35,501][0m A new study created in memory with name: no-name-eb122531-f2ee-4cb5-92d1-8efba6a28eef[0m
[32m[I 2023-04-27 10:47:36,221][0m Trial 0 finished with values: [0.018276625572235056, 0.7229728382357858] and parameters: {'learning_rate': 0.018276625572235056}. [0m
[32m[I 2023-04-27 10:47:36,922][0m Trial 1 finished with values: [0.012281350875017864, 0.713129498994182] and parameters: {'learning_rate': 0.012281350875017864}. [0m
[32m[I 2023-04-27 10:47:37,601][0m Trial 2 finished with values: [0.3179931543219117, 0.7181352358216049] and parameters: {'learning_rate': 0.3179931543219117}. [0m
[32m[I 2023-04-27 10:47:38,317][0m Trial 3 finished with values: [0.3317048028124761, 0.7182462105095208] and parameters: {'learning_rate': 0.3317048028124761}. [0m
[32m[I 2023-04-27 10:47:39,042][0m Trial 4 finished with values: [0.010097499163932589, 0.689595604293249] and parameters: {'learning_rate': 0.010097499163932589}. [0m
[32m[I 2023-04-27 10:4

In [24]:
fig = optuna.visualization.plot_pareto_front(study, target_names=["learning_rate", "f1"])
fig.write_image(file="figures/ch5_pareto.png", format="png", scale=6)
fig.show()