# Bayesian Optimization

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Load the training and test sets
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Min-max scaling of 'fixed acidity': the scaler is fitted on the training
# data only, and the same fitted scaler is then applied to the test data.
scaler = MinMaxScaler()
train['Scaled fixed acidity'] = scaler.fit_transform(train[['fixed acidity']])
test['Scaled fixed acidity'] = scaler.transform(test[['fixed acidity']])

# One-hot encoding of the categorical 'type' column.
# The encoder is fitted on the training data only and reused for the test data.
encoder = OneHotEncoder()
encoder.fit(train[['type']])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(encoder, 'get_feature_names_out'):
    onehot_columns = encoder.get_feature_names_out()
else:
    onehot_columns = encoder.get_feature_names()

# Build each one-hot frame on the source frame's index so that the
# column-wise concat below cannot misalign rows.
onehot = pd.DataFrame(
    encoder.transform(train[['type']]).toarray(),
    columns=onehot_columns,
    index=train.index,
)
train = pd.concat([train, onehot], axis=1)
train = train.drop(columns=['type'])

onehot = pd.DataFrame(
    encoder.transform(test[['type']]).toarray(),
    columns=onehot_columns,
    index=test.index,
)
test = pd.concat([test, onehot], axis=1)
test = test.drop(columns=['type'])

In [3]:
from bayes_opt import BayesianOptimization

In [4]:
# y: target variable (the 'quality' column)
y = train['quality']
# x: training features (every column except 'index' and 'quality')
x = train.drop(['index', 'quality'], axis=1)

In [5]:
# Search bounds (lower, upper) for the random-forest hyperparameters
rf_parameter_bounds = dict(
    max_depth=(1, 3),  # tree depth
    n_estimators=(30, 100),
)

In [6]:
# Objective function for the random-forest hyperparameter search
def rf_bo(max_depth, n_estimators, random_state=0):
    """Train a random forest and return its hold-out accuracy.

    Uses the module-level `x` (features) and `y` (target). The data is split
    80/20 into training and validation parts, the forest is fitted on the
    training part, and accuracy on the validation part is returned.

    Parameters
    ----------
    max_depth : float
        Proposed tree depth; rounded to the nearest integer before use.
    n_estimators : float
        Proposed number of trees; rounded to the nearest integer before use.
    random_state : int, default 0
        Seed used for both the train/validation split and the forest, so the
        same hyperparameters always yield the same score.

    Returns
    -------
    float
        Accuracy on the validation part.
    """
    rf_params = {
        'max_depth': int(round(max_depth)),
        'n_estimators': int(round(n_estimators)),
    }
    rf = RandomForestClassifier(random_state=random_state, **rf_params)

    # A fixed split keeps scores comparable across optimizer iterations;
    # with a fresh random split per call, score differences are mostly noise.
    x_train, x_valid, y_train, y_valid = train_test_split(
        x, y, test_size=0.2, random_state=random_state
    )
    rf.fit(x_train, y_train)
    score = accuracy_score(y_valid, rf.predict(x_valid))
    return score

In [7]:
# Create the Bayesian optimization object for the rf_bo objective
BO_rf = BayesianOptimization(
    f=rf_bo,
    pbounds=rf_parameter_bounds,
    random_state=0,
)

In [8]:
# Run the Bayesian optimization (init_points=5, then n_iter=5)
BO_rf.maximize(
    init_points=5,
    n_iter=5,
)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.4945  [0m | [0m 2.098   [0m | [0m 80.06   [0m |
| [0m 2       [0m | [0m 0.4882  [0m | [0m 2.206   [0m | [0m 68.14   [0m |
| [0m 3       [0m | [0m 0.4736  [0m | [0m 1.847   [0m | [0m 75.21   [0m |
| [95m 4       [0m | [95m 0.5091  [0m | [95m 1.875   [0m | [95m 92.42   [0m |
| [0m 5       [0m | [0m 0.5082  [0m | [0m 2.927   [0m | [0m 56.84   [0m |
| [95m 6       [0m | [95m 0.5164  [0m | [95m 1.906   [0m | [95m 88.77   [0m |
| [95m 7       [0m | [95m 0.5182  [0m | [95m 2.874   [0m | [95m 47.47   [0m |
| [0m 8       [0m | [0m 0.5155  [0m | [0m 1.0     [0m | [0m 39.21   [0m |
| [95m 9       [0m | [95m 0.5273  [0m | [95m 3.0     [0m | [95m 30.0    [0m |
| [0m 10      [0m | [0m 0.4664  [0m | [0m 1.0     [0m | [0m 32.87   [0m |


In [9]:
# Store the best hyperparameters found by the search.
# Work on a copy so the optimizer's own result dict is left untouched.
max_params = dict(BO_rf.max['params'])
# Convert with int(round(...)), exactly as rf_bo does, so the final model uses
# the same integer values that were actually evaluated during the search
# (plain int() would truncate, e.g. 2.874 -> 2 instead of 3).
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params['n_estimators'] = int(round(max_params['n_estimators']))
print(max_params)

{'max_depth': 3, 'n_estimators': 30}


In [10]:
# Build the final random forest from the tuned hyperparameters (not fitted in this cell).
# NOTE(review): the name looks like a typo of "BO_tuned_rf"; left unchanged in
# case later cells refer to it.
BO_tuend_rf = RandomForestClassifier(**max_params)