In [1]:
import numpy as np
import pandas as pd
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner

In [2]:
df_data = pd.read_csv('data/iris.csv', sep=',')
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [3]:
from sklearn.preprocessing import LabelEncoder

items=['Setosa', 'Versicolor', 'Virginica']
encoder = LabelEncoder()
encoder.fit(items)
df_data['variety'] = encoder.transform(df_data['variety'])
df_data.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_data, test_size=0.2)

In [5]:
df_train.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
76,6.8,2.8,4.8,1.4,1
85,6.0,3.4,4.5,1.6,1
137,6.4,3.1,5.5,1.8,2
8,4.4,2.9,1.4,0.2,0
127,6.1,3.0,4.9,1.8,2


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 76 to 130
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  120 non-null    float64
 1   sepal.width   120 non-null    float64
 2   petal.length  120 non-null    float64
 3   petal.width   120 non-null    float64
 4   variety       120 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.2 KB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 82 to 120
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  30 non-null     float64
 1   sepal.width   30 non-null     float64
 2   petal.length  30 non-null     float64
 3   petal.width   30 non-null     float64
 4   variety       30 non-null     int32  
dtypes: float64(4), int32(1)
memory usage: 1.3 KB


In [8]:
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
target = 'variety'

In [9]:
df_train.reset_index()

Unnamed: 0,index,sepal.length,sepal.width,petal.length,petal.width,variety
0,76,6.8,2.8,4.8,1.4,1
1,85,6.0,3.4,4.5,1.6,1
2,137,6.4,3.1,5.5,1.8,2
3,8,4.4,2.9,1.4,0.2,0
4,127,6.1,3.0,4.9,1.8,2
...,...,...,...,...,...,...
115,108,6.7,2.5,5.8,1.8,2
116,35,5.0,3.2,1.2,0.2,0
117,29,4.7,3.2,1.6,0.2,0
118,26,5.0,3.4,1.6,0.4,0


In [10]:
df_trains = []
df_valids = []
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
for train_index, valid_index in skf.split(df_train[features], df_train[target]):
    train = df_train.iloc[train_index]
    valid = df_train.iloc[valid_index]
    df_trains.append(train)
    df_valids.append(valid)

In [11]:
import optuna
def accuracy(true, pred):
    return np.mean(true==pred)

def objective(trial):
    # params = {
    #     'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True), 
    #     'max_depth': trial.suggest_int('max_depth', 1, 10, step=1, log=False), 
    #     'learning_rate': trial.suggest_float('learning_rate', 0.0001, 0.1, log=True), 
    #     'n_estimators': trial.suggest_int('n_estimators', 8, 1024, step=1, log=True), 
    #     'objective': 'multiclass', 
    #     'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
    #     'min_child_samples': trial.suggest_int('min_child_samples', 10, 50, step=1, log=False), 
    #     'subsample': trial.suggest_uniform('subsample', 0.7, 1.0), 
    #     'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
    #     'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
    #     'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 10.0),
    #     'random_state': 0
    # }
    params = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": "exact",
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    if params["booster"] in ["gbtree", "dart"]:
        # maximum depth of the tree, signifies complexity of the tree.
        params["max_depth"] = trial.suggest_int("max_depth", 3, 9, step=2)
        # minimum child weight, larger the term more conservative the tree.
        params["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 10)
        params["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
        # defines how selective algorithm is.
        params["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        params["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if params["booster"] == "dart":
        params["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        params["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        params["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        params["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)


    score = []
    for df_train, df_valid in zip(df_trains, df_valids):
        clf = XGBClassifier(**params)
        clf.fit(df_train[features], df_train[target])
        
        pred = clf.predict(df_valid[features])
        true = df_valid[target].values
        score.append(accuracy(true, pred))
    score = np.mean(score)
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=50)

[32m[I 2023-01-11 15:30:53,493][0m A new study created in memory with name: no-name-37e4e44f-0423-4621-8ab1-f77df97cc7cc[0m
[32m[I 2023-01-11 15:30:54,538][0m Trial 0 finished with value: 0.9666666666666668 and parameters: {'booster': 'gblinear', 'lambda': 0.00022859433522173892, 'alpha': 2.4504079607415994e-05, 'subsample': 0.7167152904533249, 'colsample_bytree': 0.550069769010154}. Best is trial 0 with value: 0.9666666666666668.[0m
[32m[I 2023-01-11 15:30:54,645][0m Trial 1 finished with value: 0.9666666666666668 and parameters: {'booster': 'gblinear', 'lambda': 0.021567524728780586, 'alpha': 0.0001702783205227671, 'subsample': 0.6544356488751459, 'colsample_bytree': 0.9404773106341289}. Best is trial 0 with value: 0.9666666666666668.[0m
[32m[I 2023-01-11 15:30:54,747][0m Trial 2 finished with value: 0.9666666666666668 and parameters: {'booster': 'gblinear', 'lambda': 0.04580983888874032, 'alpha': 0.016797861943261036, 'subsample': 0.8960097185974554, 'colsample_bytree': 0

In [12]:
# Best Parameter로 모델 훈련
clfs = []
for df_train in df_trains:
    clf = XGBClassifier(**study.best_params)
    clf.fit(df_train[features], df_train[target])
    clfs.append(clf)
    

# 예측 수행 (soft voting)
pred = [clf.predict_proba(df_test[features]) for clf in clfs]
pred = np.mean(pred, axis=0)
pred = np.argmax(pred, axis=1)

In [13]:
from sklearn.metrics import confusion_matrix
confusion_matrix(pred, df_test[target])

array([[ 7,  0,  0],
       [ 0, 10,  1],
       [ 0,  0, 12]], dtype=int64)

In [14]:
from sklearn.metrics import accuracy_score
print("예측 정확도 : {0:.4f}".format(accuracy_score(df_test[target], pred)))

예측 정확도 : 0.9667
