# Beyesian Optimization

하이퍼파라미터 튜닝과 관련된 내용

In [1]:
!pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading bayesian-optimization-1.2.0.tar.gz (14 kB)
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-py3-none-any.whl size=11685 sha256=4091b56c5efe2ad0892b560822daa61553cda6916607a9cd947f0d2f2731202c
  Stored in directory: /root/.cache/pip/wheels/fd/9b/71/f127d694e02eb40bcf18c7ae9613b88a6be4470f57a8528c5b
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [2]:
# 데이터 다운로드 링크로 데이터를 코랩에 불러옵니다.

!wget 'https://bit.ly/3i4n1QB'

import zipfile
with zipfile.ZipFile('3i4n1QB', 'r') as existing_zip:
    existing_zip.extractall('data')

--2021-09-16 19:15:02--  https://bit.ly/3i4n1QB
Resolving bit.ly (bit.ly)... 67.199.248.10, 67.199.248.11
Connecting to bit.ly (bit.ly)|67.199.248.10|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://drive.google.com/uc?export=download&id=1emLrrpFWT8dCoj5BJb12-5QMG2-nruUw [following]
--2021-09-16 19:15:02--  https://drive.google.com/uc?export=download&id=1emLrrpFWT8dCoj5BJb12-5QMG2-nruUw
Resolving drive.google.com (drive.google.com)... 173.194.202.100, 173.194.202.102, 173.194.202.101, ...
Connecting to drive.google.com (drive.google.com)|173.194.202.100|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-10-10-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/mhbhdkk03m62tqteslktuqrjpn2frjhp/1631819700000/17946651057176172524/*/1emLrrpFWT8dCoj5BJb12-5QMG2-nruUw?e=download [following]
--2021-09-16 19:15:04--  https://doc-10-10-docs.googleusercontent.com/docs/secur

In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

In [5]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# scaling
scaler = MinMaxScaler()
scaler.fit(train[['fixed acidity']])
train['Scaled fixed acidity'] = scaler.transform(train[['fixed acidity']])
test['Scaled fixed acidity'] = scaler.transform(test[['fixed acidity']])

# encoding
encoder = OneHotEncoder()
encoder.fit(train[['type']])
onehot = encoder.transform(train[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
train = pd.concat([train, onehot], axis = 1)
train = train.drop(columns = ['type'])

onehot = encoder.transform(test[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
test = pd.concat([test, onehot], axis = 1)
test = test.drop(columns = ['type'])

test.head()

Unnamed: 0,index,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,Scaled fixed acidity,x0_red,x0_white
0,0,9.0,0.31,0.48,6.6,0.043,11.0,73.0,0.9938,2.9,0.38,11.6,0.429752,0.0,1.0
1,1,13.3,0.43,0.58,1.9,0.07,15.0,40.0,1.0004,3.06,0.49,9.0,0.785124,1.0,0.0
2,2,6.5,0.28,0.27,5.2,0.04,44.0,179.0,0.9948,3.19,0.69,9.4,0.22314,0.0,1.0
3,3,7.2,0.15,0.39,1.8,0.043,21.0,159.0,0.9948,3.52,0.47,10.0,0.280992,0.0,1.0
4,4,6.8,0.26,0.26,2.0,0.019,23.5,72.0,0.99041,3.16,0.47,11.8,0.247934,0.0,1.0


In [6]:
from bayes_opt import BayesianOptimization

In [7]:
X = train.drop(columns = ['index', 'quality'])
y = train['quality']

# 랜덤포레스트의 하이퍼 파라미터
rf_parameter_bounds = {
                      'max_depth' : (1,3), # 나무의 깊이
                      'n_estimators' : (30,100),
                      }

In [8]:
# 함수
def rf_bo(max_depth, n_estimators):
  rf_params = {
              'max_depth' : int(round(max_depth)),
               'n_estimators' : int(round(n_estimators)),      
              }
  rf = RandomForestClassifier(**rf_params)

  X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size = 0.2, )

  rf.fit(X_train,y_train)
  score = accuracy_score(y_valid, rf.predict(X_valid))
  return score

In [9]:
BO_rf = BayesianOptimization(f=rf_bo, pbounds=rf_parameter_bounds, random_state=0)
BO_rf.maximize(init_points=5, n_iter=5)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.5364  [0m | [0m 2.098   [0m | [0m 80.06   [0m |
| [0m 2       [0m | [0m 0.4991  [0m | [0m 2.206   [0m | [0m 68.14   [0m |
| [0m 3       [0m | [0m 0.4864  [0m | [0m 1.847   [0m | [0m 75.21   [0m |
| [0m 4       [0m | [0m 0.5245  [0m | [0m 1.875   [0m | [0m 92.42   [0m |
| [0m 5       [0m | [0m 0.5336  [0m | [0m 2.927   [0m | [0m 56.84   [0m |
| [0m 6       [0m | [0m 0.5309  [0m | [0m 2.606   [0m | [0m 30.01   [0m |
| [0m 7       [0m | [0m 0.5082  [0m | [0m 2.07    [0m | [0m 80.1    [0m |
| [0m 8       [0m | [0m 0.52    [0m | [0m 2.406   [0m | [0m 49.15   [0m |
| [0m 9       [0m | [0m 0.5082  [0m | [0m 2.309   [0m | [0m 37.97   [0m |
| [0m 10      [0m | [0m 0.5273  [0m | [0m 2.973   [0m | [0m 99.93   [0m |


In [10]:
# max_params의 변수에 하이퍼파라미터 값 
max_params = BO_rf.max['params']

max_params['max_depth'] = int(max_params['max_depth'])
max_params['n_estimators'] = int(max_params['n_estimators'])
print(max_params)

{'max_depth': 2, 'n_estimators': 80}


In [11]:
BO_tuend_rf = RandomForestClassifier(**max_params)

In [12]:
BO_tuend_rf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)