# Optuna lightGBM

https://github.com/optuna/optuna  

In [1]:
from sklearn.datasets import load_boston
import lightgbm as lgb # conda install -c conda-forge lightgbm
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split

import optuna

In [2]:
data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [3]:
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [4]:
# FYI: Objective functions can take additional arguments
# (https://optuna.readthedocs.io/en/stable/faq.html#objective-func-additional-args).

def objective(trial):
    data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
    train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(valid_x)
    pred_labels = np.rint(preds)
    accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
    return accuracy


In [5]:
%%time
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

[I 2020-10-08 13:44:07,901] A new study created in memory with name: no-name-ed42ef8b-46dd-4027-895f-504363c6bf53
[I 2020-10-08 13:44:07,993] Trial 0 finished with value: 0.965034965034965 and parameters: {'lambda_l1': 0.2735899378237925, 'lambda_l2': 2.7259170270177638e-05, 'num_leaves': 246, 'feature_fraction': 0.9839665770059358, 'bagging_fraction': 0.6009408914064643, 'bagging_freq': 7, 'min_child_samples': 46}. Best is trial 0 with value: 0.965034965034965.
[I 2020-10-08 13:44:08,103] Trial 1 finished with value: 0.9790209790209791 and parameters: {'lambda_l1': 0.000737348505216848, 'lambda_l2': 1.3672841163631296e-06, 'num_leaves': 252, 'feature_fraction': 0.4668670819569275, 'bagging_fraction': 0.6229994930151244, 'bagging_freq': 4, 'min_child_samples': 17}. Best is trial 1 with value: 0.9790209790209791.
[I 2020-10-08 13:44:08,180] Trial 2 finished with value: 0.965034965034965 and parameters: {'lambda_l1': 1.045425779511504e-06, 'lambda_l2': 0.0018688567187433878, 'num_leaves'

[I 2020-10-08 13:44:13,855] Trial 45 finished with value: 0.965034965034965 and parameters: {'lambda_l1': 0.0040056776549609145, 'lambda_l2': 7.023762815374617e-07, 'num_leaves': 94, 'feature_fraction': 0.509378801384265, 'bagging_fraction': 0.7942868572923538, 'bagging_freq': 1, 'min_child_samples': 84}. Best is trial 4 with value: 0.993006993006993.
[I 2020-10-08 13:44:13,993] Trial 46 finished with value: 0.9790209790209791 and parameters: {'lambda_l1': 0.06632809987367773, 'lambda_l2': 8.401627616173359, 'num_leaves': 55, 'feature_fraction': 0.6226963472208445, 'bagging_fraction': 0.5537639967817853, 'bagging_freq': 2, 'min_child_samples': 75}. Best is trial 4 with value: 0.993006993006993.
[I 2020-10-08 13:44:14,121] Trial 47 finished with value: 0.9790209790209791 and parameters: {'lambda_l1': 2.3365989874231137e-07, 'lambda_l2': 1.3223824589694781e-05, 'num_leaves': 82, 'feature_fraction': 0.5534564399390709, 'bagging_fraction': 0.8479173995109347, 'bagging_freq': 4, 'min_child_

[I 2020-10-08 13:44:20,193] Trial 91 finished with value: 0.993006993006993 and parameters: {'lambda_l1': 0.016674472668262833, 'lambda_l2': 0.0006282597523511958, 'num_leaves': 85, 'feature_fraction': 0.6428836926864397, 'bagging_fraction': 0.8064821792320519, 'bagging_freq': 5, 'min_child_samples': 38}. Best is trial 4 with value: 0.993006993006993.
[I 2020-10-08 13:44:20,312] Trial 92 finished with value: 0.972027972027972 and parameters: {'lambda_l1': 0.0017993893691568388, 'lambda_l2': 0.0002567345180804827, 'num_leaves': 86, 'feature_fraction': 0.6476624572188711, 'bagging_fraction': 0.8099071512479044, 'bagging_freq': 5, 'min_child_samples': 92}. Best is trial 4 with value: 0.993006993006993.
[I 2020-10-08 13:44:20,459] Trial 93 finished with value: 0.972027972027972 and parameters: {'lambda_l1': 0.01544774140061836, 'lambda_l2': 0.0005896700589864609, 'num_leaves': 100, 'feature_fraction': 0.6035150311598853, 'bagging_fraction': 0.8892592139142246, 'bagging_freq': 6, 'min_child

Wall time: 13.4 s


In [6]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

Number of finished trials: 100
Best trial:
  Value: 0.993006993006993
  Params: 
    lambda_l1: 4.3509450228679825e-08
    lambda_l2: 0.00014385483603431102
    num_leaves: 87
    feature_fraction: 0.5742998413613465
    bagging_fraction: 0.6616367173720703
    bagging_freq: 1
    min_child_samples: 55


## Evaluation

In [7]:
data, target = sklearn.datasets.load_breast_cancer(return_X_y=True)
train_x, valid_x, train_y, valid_y = train_test_split(data, target, test_size=0.25)
dtrain = lgb.Dataset(train_x, label=train_y)

In [14]:
# default parameters
param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt"
}

In [17]:
gbm = lgb.train(param, dtrain)
preds = gbm.predict(valid_x)
pred_labels = np.rint(preds)
accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
print(accuracy)

0.958041958041958


In [18]:
# Optuna最適化パラメーター
param = {
        "objective": "binary",
        "metric": "binary_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "lambda_l1": 4.3509450228679825e-08,
        "lambda_l2": 0.00014385483603431102,
        "num_leaves": 87,
        "feature_fraction": 0.5742998413613465,
        "bagging_fraction": 0.6616367173720703,
        "bagging_freq": 1,
        "min_child_samples": 55,
}

In [19]:
gbm = lgb.train(param, dtrain)
preds = gbm.predict(valid_x)
pred_labels = np.rint(preds)
accuracy = sklearn.metrics.accuracy_score(valid_y, pred_labels)
print(accuracy)

0.965034965034965
