In [1]:
# Render matplotlib figures as static images inside the notebook
%matplotlib inline

In [2]:
import numpy as np
import logging
import sys
import os
import gc
import joblib
import lightgbm as lgb
import optuna
import pandas as pd
from sklearn.model_selection import cross_val_score

In [3]:
from src.p7_constantes import DATA_INTERIM, DATA_BASE, MODEL_DIR
from src.p7_util import timer

Démarrer

In [4]:
# Read the interim training set; the shape is printed before any column is dropped.
df = pd.read_csv(os.path.join(DATA_INTERIM, "train.csv"))
print("Forme de train.csv :", df.shape)
# Discard the stray index column when the CSV carries one; no-op otherwise.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Forme de train.csv : (307506, 18)


Unnamed: 0,SK_ID_CURR,TARGET,ORGANIZATION_TYPE,CREDIT_TO_ANNUITY_RATIO,EXT_SOURCES_MEAN,EXT_SOURCE_3,EXT_SOURCES_MIN,BUREAU_CONSUMER_DAYS_CREDIT_ENDDATE_MAX,AMT_ANNUITY,CREDIT_TO_GOODS_RATIO,OCCUPATION_TYPE,DAYS_ID_PUBLISH,DAYS_EMPLOYED,BUREAU_CREDIT_DEBT_CREDIT_DIFF_MEAN,BUREAU_ACTIVE_DEBT_CREDIT_DIFF_MEAN,ANNUITY_TO_INCOME_RATIO,AGE
0,100002,1.0,0,16.47,0.1617,0.1394,0.083,780.0,24700.5,1.158,0,-2120,-637.0,15994.282,118103.78,0.12195,25.920548
1,100003,0.0,1,36.22,0.4668,,0.3113,-420.0,35698.5,1.1455,1,-291,-1188.0,461250.0,810000.0,0.1322,45.931507
2,100004,0.0,2,20.0,0.6426,0.7295,0.556,-382.0,6750.0,1.0,0,-2531,-225.0,,,0.1,52.180822
3,100006,0.0,0,10.53,0.6504,,0.6504,,29686.5,1.053,0,-2437,-3040.0,,,0.2198,52.068493
4,100007,0.0,3,23.47,0.3228,,0.3228,-783.0,21865.5,1.0,1,-3458,-3038.0,,,0.1799,54.608219


In [5]:
# Columns that arrive integer-encoded in train.csv; they are tagged with an
# "_old" suffix so the raw string versions can be merged in under the same names.
encoded_features_application = ['ORGANIZATION_TYPE', 'OCCUPATION_TYPE']
mapper = dict(
    (name, name + '_old') for name in encoded_features_application
)
df = df.rename(columns=mapper)


In [6]:
# Bring back the raw (string) categorical columns from the original application
# file. Only the join key and the needed columns are parsed (usecols) instead of
# loading the whole CSV; the trailing selection keeps the original column order,
# because usecols returns columns in file order.
application_columns = ['SK_ID_CURR'] + encoded_features_application
initial_application = pd.read_csv(
    os.path.join(DATA_BASE, "application_train.csv"),
    usecols=application_columns,
)[application_columns]
# Default inner join on the applicant id.
df = pd.merge(left=df, right=initial_application, on=['SK_ID_CURR'])
# The integer-encoded "_old" columns are superseded by the raw ones.
df = df.drop(columns=list(mapper.values()))
# Free the temporary objects before the memory-hungry search.
del initial_application
del mapper
del application_columns
gc.collect()
df.head()

Unnamed: 0,SK_ID_CURR,TARGET,CREDIT_TO_ANNUITY_RATIO,EXT_SOURCES_MEAN,EXT_SOURCE_3,EXT_SOURCES_MIN,BUREAU_CONSUMER_DAYS_CREDIT_ENDDATE_MAX,AMT_ANNUITY,CREDIT_TO_GOODS_RATIO,DAYS_ID_PUBLISH,DAYS_EMPLOYED,BUREAU_CREDIT_DEBT_CREDIT_DIFF_MEAN,BUREAU_ACTIVE_DEBT_CREDIT_DIFF_MEAN,ANNUITY_TO_INCOME_RATIO,AGE,ORGANIZATION_TYPE,OCCUPATION_TYPE
0,100002,1.0,16.47,0.1617,0.1394,0.083,780.0,24700.5,1.158,-2120,-637.0,15994.282,118103.78,0.12195,25.920548,Business Entity Type 3,Laborers
1,100003,0.0,36.22,0.4668,,0.3113,-420.0,35698.5,1.1455,-291,-1188.0,461250.0,810000.0,0.1322,45.931507,School,Core staff
2,100004,0.0,20.0,0.6426,0.7295,0.556,-382.0,6750.0,1.0,-2531,-225.0,,,0.1,52.180822,Government,Laborers
3,100006,0.0,10.53,0.6504,,0.6504,,29686.5,1.053,-2437,-3040.0,,,0.2198,52.068493,Business Entity Type 3,Laborers
4,100007,0.0,23.47,0.3228,,0.3228,-783.0,21865.5,1.0,-3458,-3038.0,,,0.1799,54.608219,Religion,Core staff


In [7]:
# "XNA" is turned into a missing value. Series.mask does this in one vectorized
# pass (entries where the condition holds become NaN) instead of calling a
# Python lambda once per row.
df['ORGANIZATION_TYPE'] = df['ORGANIZATION_TYPE'].mask(df['ORGANIZATION_TYPE'] == "XNA")
df['ORGANIZATION_TYPE'].value_counts(dropna=False)

ORGANIZATION_TYPE
Business Entity Type 3    67991
NaN                       55374
Self-employed             38412
Other                     16683
Medicine                  11192
Business Entity Type 2    10553
Government                10404
School                     8893
Trade: type 7              7831
Kindergarten               6879
Construction               6721
Business Entity Type 1     5983
Transport: type 4          5398
Trade: type 3              3492
Industry: type 9           3368
Industry: type 3           3277
Security                   3247
Housing                    2958
Industry: type 11          2704
Military                   2634
Bank                       2507
Agriculture                2454
Police                     2341
Transport: type 2          2204
Postal                     2157
Security Ministries        1974
Trade: type 2              1900
Restaurant                 1811
Services                   1575
University                 1327
Industry: type 7      

In [8]:
# Frequency of each occupation label; dropna=False keeps missing values as their own row.
df['OCCUPATION_TYPE'].value_counts(dropna=False)

OCCUPATION_TYPE
NaN                      96389
Laborers                 55185
Sales staff              32102
Core staff               27569
Managers                 21371
Drivers                  18603
High skill tech staff    11380
Accountants               9813
Medicine staff            8537
Security staff            6721
Cooking staff             5946
Cleaning staff            4653
Private service staff     2652
Low-skill Laborers        2092
Waiters/barmen staff      1348
Secretaries               1305
Realty agents              751
HR staff                   563
IT staff                   526
Name: count, dtype: int64

In [9]:
# Names of every object-dtype column, then cast each one to the pandas
# "category" dtype.
cat_features = df.columns[(df.dtypes == 'object').to_numpy()].tolist()
for feature in cat_features:
    df[feature] = df[feature].astype("category")

In [10]:
# Try on a sample first: when n_rows is truthy only the first n_rows rows are kept.
n_rows = 30_000
X = df.drop(columns=["SK_ID_CURR", "TARGET"])
y = df["TARGET"]
if n_rows:
    X, y = X.head(n_rows), y.head(n_rows)

In [11]:
# Feature names, in the column order of X.
predictors = X.columns.tolist()
predictors

['CREDIT_TO_ANNUITY_RATIO',
 'EXT_SOURCES_MEAN',
 'EXT_SOURCE_3',
 'EXT_SOURCES_MIN',
 'BUREAU_CONSUMER_DAYS_CREDIT_ENDDATE_MAX',
 'AMT_ANNUITY',
 'CREDIT_TO_GOODS_RATIO',
 'DAYS_ID_PUBLISH',
 'DAYS_EMPLOYED',
 'BUREAU_CREDIT_DEBT_CREDIT_DIFF_MEAN',
 'BUREAU_ACTIVE_DEBT_CREDIT_DIFF_MEAN',
 'ANNUITY_TO_INCOME_RATIO',
 'AGE',
 'ORGANIZATION_TYPE',
 'OCCUPATION_TYPE']

In [12]:
# Summarise the dtype and non-null count of every feature column.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 15 columns):
 #   Column                                   Non-Null Count  Dtype   
---  ------                                   --------------  -----   
 0   CREDIT_TO_ANNUITY_RATIO                  30000 non-null  float64 
 1   EXT_SOURCES_MEAN                         29977 non-null  float64 
 2   EXT_SOURCE_3                             24078 non-null  float64 
 3   EXT_SOURCES_MIN                          29977 non-null  float64 
 4   BUREAU_CONSUMER_DAYS_CREDIT_ENDDATE_MAX  24064 non-null  float64 
 5   AMT_ANNUITY                              30000 non-null  float64 
 6   CREDIT_TO_GOODS_RATIO                    29976 non-null  float64 
 7   DAYS_ID_PUBLISH                          30000 non-null  int64   
 8   DAYS_EMPLOYED                            24577 non-null  float64 
 9   BUREAU_CREDIT_DEBT_CREDIT_DIFF_MEAN      15663 non-null  float64 
 10  BUREAU_ACTIVE_DEBT_CREDIT_DIFF_MEA

## Run A Study

In [13]:
# Requires the optuna-integration package to be installed
def objective(optimize_boosting_type=True):
    """Build the Optuna objective used to tune the LightGBM classifier.

    Args:
        optimize_boosting_type: When true, the boosting type ("dart" or
            "gbdt") is itself a searched hyperparameter; otherwise "gbdt"
            is always used.

    Returns:
        A callable that takes an Optuna trial, samples one hyperparameter
        configuration, and returns the mean macro-F1 obtained by 5-fold
        cross-validation on the notebook-level ``X`` and ``y``.
    """
    def _objective(trial):
        if optimize_boosting_type:
            boosting_type = trial.suggest_categorical("boosting_type", ["dart", "gbdt"])
        else:
            boosting_type = "gbdt"

        # Each suggestion is stored as a plain scalar. The previous version
        # ended these assignments with a trailing comma, which wrapped every
        # value in a 1-tuple before it was handed to the estimator.
        lambda_l1 = trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True)
        lambda_l2 = trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True)
        num_leaves = trial.suggest_int('num_leaves', 2, 256)
        feature_fraction = trial.suggest_float('feature_fraction', 0.4, 1.0)
        bagging_fraction = trial.suggest_float('bagging_fraction', 0.4, 1.0)
        bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
        min_child_samples = trial.suggest_int('min_child_samples', 5, 100)
        learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log=True)
        max_bin = trial.suggest_int("max_bin", 128, 512, step=32)
        n_estimators = trial.suggest_int("n_estimators", 40, 400, step=20)

        # NOTE(review): the callback is handed to the estimator constructor,
        # not to fit(). The trials logged in this notebook show
        # intermediate_values={}, which suggests nothing is reported to the
        # pruner - confirm, and if pruning is wanted pass the callback through
        # fit() together with an eval_set and a metric name LightGBM reports.
        pruning_callback = optuna.integration.LightGBMPruningCallback(trial, "binary")

        model = lgb.LGBMClassifier(
            force_row_wise=True,
            boosting_type=boosting_type,
            n_estimators=n_estimators,
            lambda_l1=lambda_l1,
            lambda_l2=lambda_l2,
            num_leaves=num_leaves,
            feature_fraction=feature_fraction,
            bagging_fraction=bagging_fraction,
            bagging_freq=bagging_freq,
            min_child_samples=min_child_samples,
            learning_rate=learning_rate,
            max_bin=max_bin,
            callbacks=[pruning_callback],
            verbose=-1,
            )

        # Score of the trial: macro-F1 averaged over the 5 folds.
        scores = cross_val_score(model, X, y, scoring="f1_macro", cv=5)
        return scores.mean()

    return _objective

In [14]:
# Display the list of categorical feature names.
cat_features

['ORGANIZATION_TYPE', 'OCCUPATION_TYPE']

In [15]:
# Create the study and run the search; everything inside the block is timed.
with timer("Optimize hyperparameters"):
    # TPE sampler; no seed is passed here.
    sampler = optuna.samplers.TPESampler()
    # Hyperband pruner configured for a resource range of 10 to 400.
    # NOTE(review): the logged trials show intermediate_values={}, so the
    # pruner may have nothing to act on - confirm.
    pruner = optuna.pruners.HyperbandPruner(
        min_resource=10, max_resource=400, reduction_factor=3)

    # The objective returns a mean macro-F1, hence direction='maximize'.
    study = optuna.create_study(
        direction='maximize', sampler=sampler,
        pruner=pruner
    )
    # 30 trials with n_jobs=-1; the log below shows trials finishing out of order.
    study.optimize(objective(), n_trials=30, gc_after_trial=True, n_jobs=-1)

[I 2024-04-27 08:30:37,758] A new study created in memory with name: no-name-b5531032-13a8-4e70-9878-dc8d40ef2b8c
[I 2024-04-27 08:30:45,111] Trial 13 finished with value: 0.4791485806059554 and parameters: {'boosting_type': 'gbdt', 'lambda_l1': 2.5434052781410035e-07, 'lambda_l2': 0.0005809141064153906, 'num_leaves': 3, 'feature_fraction': 0.7896642847527977, 'bagging_fraction': 0.7357466819476344, 'bagging_freq': 2, 'min_child_samples': 96, 'learning_rate': 0.005883644208119656, 'max_bin': 416, 'n_estimators': 340}. Best is trial 13 with value: 0.4791485806059554.
[I 2024-04-27 08:30:52,087] Trial 19 finished with value: 0.4791485806059554 and parameters: {'boosting_type': 'dart', 'lambda_l1': 3.050334122435856e-06, 'lambda_l2': 4.350924027127744e-08, 'num_leaves': 6, 'feature_fraction': 0.48634489370002093, 'bagging_fraction': 0.9746296699996962, 'bagging_freq': 2, 'min_child_samples': 98, 'learning_rate': 0.00015026369838780335, 'max_bin': 256, 'n_estimators': 260}. Best is trial 1

Optimize hyperparameters - done in 82s


In [16]:
# Show the best trial, then list its hyperparameters one per line.
print(study.best_trial)
best_params = study.best_trial.params
for param_name, param_value in best_params.items():
    print(f"{param_name} : {param_value}")

FrozenTrial(number=26, state=TrialState.COMPLETE, values=[0.560365316044187], datetime_start=datetime.datetime(2024, 4, 27, 8, 30, 37, 805472), datetime_complete=datetime.datetime(2024, 4, 27, 8, 31, 34, 798284), params={'boosting_type': 'dart', 'lambda_l1': 8.525930055598907, 'lambda_l2': 2.1656304353873055, 'num_leaves': 153, 'feature_fraction': 0.9969889380894155, 'bagging_fraction': 0.6553199106521793, 'bagging_freq': 2, 'min_child_samples': 25, 'learning_rate': 0.24241611757080983, 'max_bin': 352, 'n_estimators': 260}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'boosting_type': CategoricalDistribution(choices=('dart', 'gbdt')), 'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistribut

## Understanding Parameters

In [17]:
import plotly

In [18]:
# Show the installed plotly version.
plotly.__version__

'5.21.0'

In [19]:
import kaleido
# Show the installed kaleido version.
kaleido.__version__

'0.1.0'

In [20]:
# Prefer the study persisted on disk for the visualisations below. The file is
# only written further down in this notebook, so on a fresh run it may not exist
# yet; in that case the in-memory `study` is kept instead of failing.
# NOTE: joblib.load unpickles the file - only load files you trust.
study_path = os.path.join(MODEL_DIR, "lgbm-optuna-study.pkl")
if os.path.exists(study_path):
    study = joblib.load(study_path)

In [21]:
# Requires plotly and Kaleido to be installed (the kernel must be restarted after installing plotly; kaleido version 0.1.0 and a recent nbformat version)
# pip install kaleido==0.1.0
# pip install --upgrade nbformat
# Add the path of kaleido.cmd to the Windows PATH
# Restart everything
# Parallel-coordinate plot of the study restricted to four hyperparameters.
fig = optuna.visualization.plot_parallel_coordinate(study, params=["boosting_type", "num_leaves", "learning_rate", "n_estimators"])
# Keep a pickled copy of the figure object under MODEL_DIR.
joblib.dump(fig, os.path.join(MODEL_DIR,'fig_plotly.pkl'))
print(type(fig))

<class 'plotly.graph_objs._figure.Figure'>


In [22]:
# Export the current figure as a PNG under MODEL_DIR (scale=6).
fig.write_image(file=os.path.join(MODEL_DIR, "try1_parallel_coordinates.png"), format="png", scale=6)

In [23]:
# Render the current figure in the notebook output.
fig.show()

In [24]:
# Faster as HTML
im_path = os.path.join(MODEL_DIR, 'fig_plotly.html')
fig.write_html(im_path)
fig.show()

In [25]:
# Hyperparameter-importance plot for the study, exported as a PNG under
# MODEL_DIR and then displayed. Note that `fig` is reassigned here.
fig = optuna.visualization.plot_param_importances(study)
fig.write_image(file=os.path.join(MODEL_DIR, "try1_hyperparam_importance.png"), format="png", scale=6)
fig.show()

## Save and Resume a Study

In [26]:
# Persist the study object under MODEL_DIR; the same path is read back by the joblib.load calls in this notebook.
joblib.dump(study, os.path.join(MODEL_DIR, "lgbm-optuna-study.pkl"))

['models/lgbm-optuna-study.pkl']

In [28]:
# Reload the persisted study and continue it with 20 additional trials.
# NOTE: joblib.load unpickles the file - only load files you trust.
study = joblib.load(os.path.join(MODEL_DIR, "lgbm-optuna-study.pkl"))
study.optimize(objective(), n_trials=20, gc_after_trial=True, n_jobs=-1)

[I 2024-04-27 08:53:45,530] Trial 32 finished with value: 0.5116440498214068 and parameters: {'boosting_type': 'dart', 'lambda_l1': 6.792818041547189e-05, 'lambda_l2': 3.1954892717159225e-07, 'num_leaves': 75, 'feature_fraction': 0.8946164805173675, 'bagging_fraction': 0.8658874438569492, 'bagging_freq': 4, 'min_child_samples': 30, 'learning_rate': 0.39566782548107093, 'max_bin': 320, 'n_estimators': 380}. Best is trial 23 with value: 0.5907814807779238.
[I 2024-04-27 08:53:48,454] Trial 38 finished with value: 0.5150143552605523 and parameters: {'boosting_type': 'dart', 'lambda_l1': 3.735265790987351e-05, 'lambda_l2': 1.5702432762430575e-07, 'num_leaves': 75, 'feature_fraction': 0.9028907370621844, 'bagging_fraction': 0.8293905296790435, 'bagging_freq': 4, 'min_child_samples': 32, 'learning_rate': 0.41619962296496005, 'max_bin': 320, 'n_estimators': 400}. Best is trial 23 with value: 0.5907814807779238.
[I 2024-04-27 08:53:49,022] Trial 31 finished with value: 0.5163946218571953 and p

In [29]:
# Print the best trial of the resumed study.
print(study.best_trial)

FrozenTrial(number=23, state=TrialState.COMPLETE, values=[0.5907814807779238], datetime_start=datetime.datetime(2024, 4, 26, 7, 18, 28, 352868), datetime_complete=datetime.datetime(2024, 4, 26, 7, 19, 27, 743267), params={'boosting_type': 'dart', 'lambda_l1': 0.025780771002814715, 'lambda_l2': 2.1103357195834125, 'num_leaves': 72, 'feature_fraction': 0.746310813407126, 'bagging_fraction': 0.9601299271053441, 'bagging_freq': 6, 'min_child_samples': 44, 'learning_rate': 0.043556425276861634, 'max_bin': 256, 'n_estimators': 320}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'boosting_type': CategoricalDistribution(choices=('dart', 'gbdt')), 'lambda_l1': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'lambda_l2': FloatDistribution(high=10.0, log=True, low=1e-08, step=None), 'num_leaves': IntDistribution(high=256, log=False, low=2, step=1), 'feature_fraction': FloatDistribution(high=1.0, log=False, low=0.4, step=None), 'bagging_fraction': FloatDistri

## Multi-objective optimization

In [22]:
def moo_objective(trial):
    """Two-valued Optuna objective: learning rate and cross-validated macro-F1.

    Only the learning rate is sampled; every other LightGBM hyperparameter is
    fixed to the constants below.

    Args:
        trial: The Optuna trial used to sample the learning rate.

    Returns:
        A ``(learning_rate, mean_f1_macro)`` pair, where the second value is
        the mean of ``cross_val_score`` on the notebook-level ``X`` and ``y``.
    """
    # Stored as a plain float. The previous version had a trailing comma here,
    # which passed a 1-tuple to the model and forced a `[0]` on return.
    learning_rate = trial.suggest_float("learning_rate", 0.0001, 0.5, log=True)

    model = lgb.LGBMClassifier(
        force_row_wise=True,
        boosting_type='gbdt',
        n_estimators=200,
        lambda_l1=3.298803078077973e-07,
        lambda_l2=8.938532783741386e-07,
        num_leaves=6,
        feature_fraction=0.5133218336120866,
        bagging_fraction=0.9660809666082303,
        bagging_freq=7,
        min_child_samples=91,
        learning_rate=learning_rate,
        max_bin=320,
        verbose=-1,
    )
    # No cv argument: cross_val_score falls back to its default splitting.
    scores = cross_val_score(model, X, y, scoring="f1_macro")
    return learning_rate, scores.mean()

In [23]:
# Two-objective study: both values returned by moo_objective are maximised.
# Note that this rebinds `study`, replacing the single-objective study above.
study = optuna.create_study(directions=["maximize", "maximize"])
study.optimize(moo_objective, n_trials=100)

[32m[I 2023-04-27 10:47:35,501][0m A new study created in memory with name: no-name-eb122531-f2ee-4cb5-92d1-8efba6a28eef[0m
[32m[I 2023-04-27 10:47:36,221][0m Trial 0 finished with values: [0.018276625572235056, 0.7229728382357858] and parameters: {'learning_rate': 0.018276625572235056}. [0m
[32m[I 2023-04-27 10:47:36,922][0m Trial 1 finished with values: [0.012281350875017864, 0.713129498994182] and parameters: {'learning_rate': 0.012281350875017864}. [0m
[32m[I 2023-04-27 10:47:37,601][0m Trial 2 finished with values: [0.3179931543219117, 0.7181352358216049] and parameters: {'learning_rate': 0.3179931543219117}. [0m
[32m[I 2023-04-27 10:47:38,317][0m Trial 3 finished with values: [0.3317048028124761, 0.7182462105095208] and parameters: {'learning_rate': 0.3317048028124761}. [0m
[32m[I 2023-04-27 10:47:39,042][0m Trial 4 finished with values: [0.010097499163932589, 0.689595604293249] and parameters: {'learning_rate': 0.010097499163932589}. [0m
[32m[I 2023-04-27 10:4

In [24]:
# Pareto front of the two-objective study, with the axes named after the two
# values returned by the objective.
fig = optuna.visualization.plot_pareto_front(study, target_names=["learning_rate", "f1"])
# Nothing else in the notebook creates the relative "figures" directory, so
# make sure it exists before writing the image into it.
os.makedirs("figures", exist_ok=True)
fig.write_image(file="figures/ch5_pareto.png", format="png", scale=6)
fig.show()