## lightgbm 패키지 설치
```
pip install lightgbm
```

In [1]:
from lightgbm import LGBMClassifier
import pandas as pd
import numpy as np 

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [2]:
cancer = load_breast_cancer()
data = cancer.data
target = cancer.target 

X_train, X_test, y_train, y_test = train_test_split(data, target, 
                                                    test_size=0.2, random_state=10)

lgbm = LGBMClassifier(n_estimators=400)

lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)], eval_metric='logloss')
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)

# [LightGBM] [Warning] No further splits with positive gain, best gain: -inf
# 분할로 인해 더 이상 긍정적인 값이 나오지 않을 때 나오는 경고

[LightGBM] [Info] Number of positive: 282, number of negative: 173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000567 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.619780 -> initscore=0.488615
[LightGBM] [Info] Start training from score 0.488615


In [3]:
print(pred_proba)

[[9.99993802e-01 6.19816306e-06]
 [3.57344323e-08 9.99999964e-01]
 [8.32478297e-09 9.99999992e-01]
 [9.99999989e-01 1.07316708e-08]
 [4.41181675e-06 9.99995588e-01]
 [6.85032588e-04 9.99314967e-01]
 [5.44686468e-08 9.99999946e-01]
 [1.88067514e-03 9.98119325e-01]
 [9.99999998e-01 2.08942377e-09]
 [9.99999838e-01 1.61734932e-07]
 [6.18628924e-08 9.99999938e-01]
 [9.33698419e-09 9.99999991e-01]
 [9.99997731e-01 2.26923515e-06]
 [3.33622692e-07 9.99999666e-01]
 [9.99999790e-01 2.09733957e-07]
 [9.99999692e-01 3.08307103e-07]
 [2.27987592e-08 9.99999977e-01]
 [7.59340607e-05 9.99924066e-01]
 [1.94042065e-05 9.99980596e-01]
 [9.98758252e-01 1.24174797e-03]
 [9.99763183e-01 2.36817265e-04]
 [9.99990796e-01 9.20387692e-06]
 [1.93515875e-05 9.99980648e-01]
 [1.54850116e-06 9.99998451e-01]
 [9.99999974e-01 2.55192535e-08]
 [9.99999982e-01 1.75551833e-08]
 [1.98812990e-05 9.99980119e-01]
 [9.99999978e-01 2.17602480e-08]
 [1.42690861e-08 9.99999986e-01]
 [1.65725734e-06 9.99998343e-01]
 [1.116154

In [4]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, pred)

0.9736842105263158

>튜닝

In [5]:
from sklearn.model_selection import GridSearchCV

param = {
    'min_child_samples':range(10, 400, 50),
    'num_leaves':range(10, 200, 30),
    'reg_alpha':[0.1, 0.3, 0.5, 0.7, 1],
    'reg_lambda':[0.1, 0.3, 0.5, 0.7, 1],
}

grid_cv = GridSearchCV(lgbm, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적의 파라미터 : ', grid_cv.best_params_)
print('최고 정확도 : ', grid_cv.best_score_)

Fitting 2 folds for each of 1400 candidates, totalling 2800 fits
[LightGBM] [Info] Number of positive: 282, number of negative: 173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.619780 -> initscore=0.488615
[LightGBM] [Info] Start training from score 0.488615
최적의 파라미터 :  {'min_child_samples': 60, 'num_leaves': 10, 'reg_alpha': 0.1, 'reg_lambda': 0.3}
최고 정확도 :  0.9604200479171496


In [6]:
param = {
    'min_child_samples':range(40, 90, 10),
    'num_leaves':range(8, 13, 1),
    'reg_alpha':[0.05, 0.1, 0.15, 0.2],
    'reg_lambda':[0.2, 0.3, 0.4],
}

grid_cv = GridSearchCV(lgbm, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적의 파라미터 : ', grid_cv.best_params_)
print('최고 정확도 : ', grid_cv.best_score_)

Fitting 2 folds for each of 300 candidates, totalling 600 fits
[LightGBM] [Info] Number of positive: 282, number of negative: 173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000644 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.619780 -> initscore=0.488615
[LightGBM] [Info] Start training from score 0.488615
최적의 파라미터 :  {'min_child_samples': 70, 'num_leaves': 8, 'reg_alpha': 0.05, 'reg_lambda': 0.2}
최고 정확도 :  0.969201638457377


In [7]:
param = {
    'min_child_samples':range(65, 80, 5),
    'num_leaves':range(5, 10, 1),
    'reg_alpha':[0.04, 0.05, 0.06],
    'reg_lambda':[0.1, 0.15, 0.2],
}

grid_cv = GridSearchCV(lgbm, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적의 파라미터 : ', grid_cv.best_params_)
print('최고 정확도 : ', grid_cv.best_score_)

Fitting 2 folds for each of 135 candidates, totalling 270 fits
[LightGBM] [Info] Number of positive: 282, number of negative: 173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.619780 -> initscore=0.488615
[LightGBM] [Info] Start training from score 0.488615
최적의 파라미터 :  {'min_child_samples': 70, 'num_leaves': 5, 'reg_alpha': 0.05, 'reg_lambda': 0.2}
최고 정확도 :  0.969201638457377


In [8]:
param = {
    'min_child_samples':range(68, 73, 1),
    'num_leaves':range(3, 7, 1),
    'reg_alpha':[0.05],
    'reg_lambda':[0.2],
}

grid_cv = GridSearchCV(lgbm, param_grid=param, cv=2, verbose=1, n_jobs=-1)
grid_cv.fit(X_train, y_train)
print('최적의 파라미터 : ', grid_cv.best_params_)
print('최고 정확도 : ', grid_cv.best_score_)

Fitting 2 folds for each of 20 candidates, totalling 40 fits
[LightGBM] [Info] Number of positive: 282, number of negative: 173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000306 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.619780 -> initscore=0.488615
[LightGBM] [Info] Start training from score 0.488615
최적의 파라미터 :  {'min_child_samples': 70, 'num_leaves': 3, 'reg_alpha': 0.05, 'reg_lambda': 0.2}
최고 정확도 :  0.969201638457377


In [13]:
# {'min_child_samples': 70, 'num_leaves': 5, 'reg_alpha': 0.05, 'reg_lambda': 0.2}
lgbm = LGBMClassifier(n_estimators=400, min_child_samples=70, num_leaves=3, reg_alpha=0.05, reg_lambda=0.2)
lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test)])
pred = lgbm.predict(X_test)
pred_proba = lgbm.predict_proba(X_test)

[LightGBM] [Info] Number of positive: 282, number of negative: 173
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000378 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4542
[LightGBM] [Info] Number of data points in the train set: 455, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.619780 -> initscore=0.488615
[LightGBM] [Info] Start training from score 0.488615


In [10]:
accuracy_score(y_test, pred)

0.9649122807017544

> min_child_samples는 최종 결정 클래스인 Leaf Node가 되기 위해서 최소한으로 필요한 데이터 개체의 수를 의미하며, 과적합을 제어하는 파라미터이다. 이 파라미터의 최적값은 훈련 데이터의 개수와 num_leaves에 의해 결정된다. 너무 큰 숫자로 설정하면 예측률이 떨어지는 과소적합(under-fitting)이 일어날 수 있으며, 아주 큰 데이터셋이라면 적어도 수백~수천 정도로 가정하는 것이 편리하다

> num_leaves는 개별 트리가 가질 수 있는 최대 리프의 개수이고 LightGBM 모델의 복잡도를 제어하는 주요 파라미터이다. 일반적으로 계수를 높이면 정확도가 올라가지만 트리의 깊이가 깊어지고 모델이 복잡도가 커져 과적합이 될 가능성이 높다.

> reg_alpha, reg_lambda는 피처 개수가 많을 경우 적용을 검토하며 값이 클수록 과적합 감소 효과가 있다.

> 결국 다음과 같이 파라미터를 설정하는 것이 예측률을 높일 수 있다.
- min_child_samples= 10, num_leaves= 5, reg_alpha= 0.05, reg_lambda= 0.25

In [11]:
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95        39
           1       0.99      0.96      0.97        75

    accuracy                           0.96       114
   macro avg       0.96      0.97      0.96       114
weighted avg       0.97      0.96      0.97       114

