In [1]:
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import StratifiedKFold
from optuna.samplers import TPESampler
from optuna.pruners import SuccessiveHalvingPruner
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm

In [2]:
df_data = pd.read_csv('data/iris.csv', sep=',')
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  150 non-null    float64
 1   sepal.width   150 non-null    float64
 2   petal.length  150 non-null    float64
 3   petal.width   150 non-null    float64
 4   variety       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [3]:
from sklearn.preprocessing import LabelEncoder

items=['Setosa', 'Versicolor', 'Virginica']
encoder = LabelEncoder()
encoder.fit(items)
df_data['variety'] = encoder.transform(df_data['variety'])
df_data.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df_data, test_size=0.2)

In [5]:
df_train.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
5,5.4,3.9,1.7,0.4,0
78,6.0,2.9,4.5,1.5,1
85,6.0,3.4,4.5,1.6,1
77,6.7,3.0,5.0,1.7,1
91,6.1,3.0,4.6,1.4,1


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 120 entries, 5 to 28
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  120 non-null    float64
 1   sepal.width   120 non-null    float64
 2   petal.length  120 non-null    float64
 3   petal.width   120 non-null    float64
 4   variety       120 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.2 KB


In [7]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30 entries, 118 to 51
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal.length  30 non-null     float64
 1   sepal.width   30 non-null     float64
 2   petal.length  30 non-null     float64
 3   petal.width   30 non-null     float64
 4   variety       30 non-null     int32  
dtypes: float64(4), int32(1)
memory usage: 1.3 KB


In [8]:
features = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
target = 'variety'

In [9]:
df_train.reset_index()

Unnamed: 0,index,sepal.length,sepal.width,petal.length,petal.width,variety
0,5,5.4,3.9,1.7,0.4,0
1,78,6.0,2.9,4.5,1.5,1
2,85,6.0,3.4,4.5,1.6,1
3,77,6.7,3.0,5.0,1.7,1
4,91,6.1,3.0,4.6,1.4,1
...,...,...,...,...,...,...
115,21,5.1,3.7,1.5,0.4,0
116,100,6.3,3.3,6.0,2.5,2
117,56,6.3,3.3,4.7,1.6,1
118,27,5.2,3.5,1.5,0.2,0


In [10]:
df_trains = []
df_valids = []
skf = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
for train_index, valid_index in skf.split(df_train[features], df_train[target]):
    train = df_train.iloc[train_index]
    valid = df_train.iloc[valid_index]
    df_trains.append(train)
    df_valids.append(valid)

In [15]:
import optuna
def accuracy(true, pred):
    return np.mean(true==pred)

def objective(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 32, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 4, 20, log=True),
        }
    score = []
    for df_train, df_valid in zip(df_trains, df_valids):
        clf = sklearn.ensemble.RandomForestClassifier(**params)

        clf.fit(df_train[features], df_train[target])
        
        pred = clf.predict(df_valid[features])
        true = df_valid[target].values
        score.append(accuracy(true, pred))
    score = np.mean(score)
    return score

# Hyperparameter Tuning
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0), pruner=SuccessiveHalvingPruner())
study.optimize(objective, n_trials=50)

[32m[I 2023-01-11 15:51:37,347][0m A new study created in memory with name: no-name-e221bee5-fa60-42d4-ac26-0a1313625954[0m
[32m[I 2023-01-11 15:51:37,462][0m Trial 0 finished with value: 0.925 and parameters: {'max_depth': 8, 'n_estimators': 12}. Best is trial 0 with value: 0.925.[0m
[32m[I 2023-01-11 15:51:37,548][0m Trial 1 finished with value: 0.9333333333333332 and parameters: {'max_depth': 10, 'n_estimators': 9}. Best is trial 1 with value: 0.9333333333333332.[0m
[32m[I 2023-01-11 15:51:37,636][0m Trial 2 finished with value: 0.9333333333333333 and parameters: {'max_depth': 6, 'n_estimators': 11}. Best is trial 2 with value: 0.9333333333333333.[0m
[32m[I 2023-01-11 15:51:37,760][0m Trial 3 finished with value: 0.95 and parameters: {'max_depth': 6, 'n_estimators': 17}. Best is trial 3 with value: 0.95.[0m
[32m[I 2023-01-11 15:51:37,836][0m Trial 4 finished with value: 0.9583333333333334 and parameters: {'max_depth': 29, 'n_estimators': 7}. Best is trial 4 with val

In [16]:
# Best Parameter로 모델 훈련
clfs = []
for df_train in df_trains:
    clf = sklearn.ensemble.RandomForestClassifier(**study.best_params)

    clf.fit(df_train[features], df_train[target])
    clfs.append(clf)
    

# 예측 수행 (soft voting)
pred = [clf.predict_proba(df_test[features]) for clf in clfs]
pred = np.mean(pred, axis=0)
pred = np.argmax(pred, axis=1)

In [17]:
from sklearn.metrics import confusion_matrix
confusion_matrix(pred, df_test[target])

array([[ 7,  0,  0],
       [ 0, 11,  0],
       [ 0,  1, 11]], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score
print("예측 정확도 : {0:.4f}".format(accuracy_score(df_test[target], pred)))

예측 정확도 : 0.9667
