In [1]:
import numpy as np
import pandas as pd
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner

In [2]:
df_data = pd.read_csv('data/iris.csv', sep=',')
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [3]:
from sklearn.preprocessing import LabelEncoder

items=['Setosa', 'Versicolor', 'Virginica']
encoder = LabelEncoder()
encoder.fit(items)
df_data['variety'] = encoder.transform(df_data['variety'])
df_data.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_data, test_size=0.2)

In [5]:
df_train.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
52,6.9,3.1,4.9,1.5,1
19,5.1,3.8,1.5,0.3,0
116,6.5,3.0,5.5,1.8,2
81,5.5,2.4,3.7,1.0,1
100,6.3,3.3,6.0,2.5,2


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 52 to 35
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  120 non-null    float64
 1   sepal.width   120 non-null    float64
 2   petal.length  120 non-null    float64
 3   petal.width   120 non-null    float64
 4   variety       120 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.2 KB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 74 to 25
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  30 non-null     float64
 1   sepal.width   30 non-null     float64
 2   petal.length  30 non-null     float64
 3   petal.width   30 non-null     float64
 4   variety       30 non-null     int32  
dtypes: float64(4), int32(1)
memory usage: 1.3 KB


In [8]:
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
target = 'variety'

In [9]:
df_train.reset_index()

Unnamed: 0,index,sepal.length,sepal.width,petal.length,petal.width,variety
0,52,6.9,3.1,4.9,1.5,1
1,19,5.1,3.8,1.5,0.3,0
2,116,6.5,3.0,5.5,1.8,2
3,81,5.5,2.4,3.7,1.0,1
4,100,6.3,3.3,6.0,2.5,2
...,...,...,...,...,...,...
115,51,6.4,3.2,4.5,1.5,1
116,42,4.4,3.2,1.3,0.2,0
117,6,4.6,3.4,1.4,0.3,0
118,55,5.7,2.8,4.5,1.3,1


In [10]:
df_trains = []
df_valids = []
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
for train_index, valid_index in skf.split(df_train[features], df_train[target]):
    train = df_train.iloc[train_index]
    valid = df_train.iloc[valid_index]
    df_trains.append(train)
    df_valids.append(valid)

In [11]:
import optuna
def accuracy(true, pred):
    return np.mean(true==pred)

def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
        'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False), 
        'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True), 
        'n_estimators': trial.suggest_int('n_estimators', 8, 1024, step=1, log=True), 
        'objective': 'multiclass', 
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False), 
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
        'random_state': 0
    }
    
    score = []
    for df_train, df_valid in zip(df_trains, df_valids):
        clf = LGBMClassifier(**params)
        clf.fit(df_train[features], df_train[target])
        
        pred = clf.predict(df_valid[features])
        true = df_valid[target].values
        score.append(accuracy(true, pred))
    score = np.mean(score)
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=50)

[32m[I 2023-01-05 17:19:39,235][0m A new study created in memory with name: no-name-55d56567-5cac-408c-b8e7-4119c2cc62b0[0m
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
  'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
  'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
[32m[I 2023-01-05 17:19:39,360][0m Trial 0 finished with value: 0.9166666666666666 and parameters: {'num_leaves': 54, 'max_depth': 8, 'learning_rate': 0.006431172050131994, 'n_estimators': 109, 'class_weight': None, 'min_child_samples': 27, 'subsample': 0.9675319002346239, 'colsample_bytree': 0.9890988281503088, 'reg_alpha': 0.3834415188257777, 'reg_lambda': 7.917250380826646}. Best is trial 0 with value: 0.9166666666666666.[0m
  'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
  'reg_alpha': trial.suggest_uniform('reg

In [12]:
# Best Parameter로 모델 훈련
clfs = []
for df_train in df_trains:
    clf = LGBMClassifier(**study.best_params)
    clf.fit(df_train[features], df_train[target])
    clfs.append(clf)
    

# 예측 수행 (soft voting)
pred = [clf.predict_proba(df_test[features]) for clf in clfs]
pred = np.mean(pred, axis=0)
pred = np.argmax(pred, axis=1)

In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(pred, df_test[target])

array([[11,  0,  0],
       [ 0,  8,  0],
       [ 0,  0, 11]], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score
print("예측 정확도 : {0:.4f}".format(accuracy_score(df_test[target], pred)))

예측 정확도 : 1.0000
