In [9]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from bayes_opt import BayesianOptimization

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# RandomForest 튜닝
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [10]:
# Scailing
scaler = MinMaxScaler()
scaler.fit(train[['fixed acidity']])
train['Scaled fixed acidity'] = scaler.transform(train[['fixed acidity']])
test['Scaled fixed acidity'] = scaler.transform(test[['fixed acidity']])

In [11]:
#Encoding
encoder = OneHotEncoder()
encoder.fit(train[['type']])

onehot = encoder.transform(train[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
train = pd.concat([train, onehot], axis = 1)
train = train.drop(columns = ['type'])

onehot = encoder.transform(test[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
test = pd.concat([test, onehot], axis = 1)
test = test.drop(columns = ['type'])

In [12]:
X = train.drop(columns = ['index', 'quality'])
y = train['quality']

In [13]:
rf_parameter_bounds = {
    'max_depth': (1, 3),
    'n_estimators': (30, 100)
}

In [14]:
def rf_bo(max_depth, n_estimators):
    rf_params = {
        'max_depth': int(round(max_depth)),
        'n_estimators': int(round(n_estimators))
    }
    rf = RandomForestClassifier(**rf_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    
    rf.fit(X_train, y_train)
    score = accuracy_score(y_valid, rf.predict(X_valid))
    return score

In [15]:
BO_rf = BayesianOptimization(f = rf_bo, pbounds = rf_parameter_bounds,random_state = 0)
BO_rf.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.5445  [0m | [0m 2.098   [0m | [0m 80.06   [0m |
| [0m 2       [0m | [0m 0.5164  [0m | [0m 2.206   [0m | [0m 68.14   [0m |
| [0m 3       [0m | [0m 0.5127  [0m | [0m 1.847   [0m | [0m 75.21   [0m |
| [0m 4       [0m | [0m 0.5191  [0m | [0m 1.875   [0m | [0m 92.42   [0m |
| [0m 5       [0m | [0m 0.5155  [0m | [0m 2.927   [0m | [0m 56.84   [0m |
| [0m 6       [0m | [0m 0.4518  [0m | [0m 1.018   [0m | [0m 48.01   [0m |
| [0m 7       [0m | [0m 0.54    [0m | [0m 1.963   [0m | [0m 82.94   [0m |
| [0m 8       [0m | [0m 0.54    [0m | [0m 3.0     [0m | [0m 30.0    [0m |
| [0m 9       [0m | [0m 0.4436  [0m | [0m 1.0     [0m | [0m 35.23   [0m |
| [95m 10      [0m | [95m 0.5527  [0m | [95m 3.0     [0m | [95m 86.77   [0m |


In [21]:
# XGBoost 튜닝
from xgboost import XGBClassifier

X = train.drop(columns = ['index', 'quality'])
y = train['quality']

In [22]:
xgb_parameter_bounds = {
    'gamma': (0, 10),
    'max_depth': (1, 3),
    'subsample': (0.5, 1)
}

In [23]:
def xgb_bo(gamma, max_depth, subsample):
    xgb_params = {
        'gamma': int(round(gamma)),
        'max_depth': int(round(max_depth)),
        'subsample': int(round(subsample))
    }
    xgb = XGBClassifier(**xgb_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    
    xgb.fit(X_train, y_train)
    score = accuracy_score(y_valid, xgb.predict(X_valid))
    return score

In [24]:
BO_xgb = BayesianOptimization(f = xgb_bo, pbounds = xgb_parameter_bounds,random_state = 0)
BO_xgb.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   |   gamma   | max_depth | subsample |
-------------------------------------------------------------




| [0m 1       [0m | [0m 0.5445  [0m | [0m 5.488   [0m | [0m 2.43    [0m | [0m 0.8014  [0m |




| [95m 2       [0m | [95m 0.5482  [0m | [95m 5.449   [0m | [95m 1.847   [0m | [95m 0.8229  [0m |




| [95m 3       [0m | [95m 0.5882  [0m | [95m 4.376   [0m | [95m 2.784   [0m | [95m 0.9818  [0m |




| [0m 4       [0m | [0m 0.5745  [0m | [0m 3.834   [0m | [0m 2.583   [0m | [0m 0.7644  [0m |




| [0m 5       [0m | [0m 0.57    [0m | [0m 5.68    [0m | [0m 2.851   [0m | [0m 0.5355  [0m |




| [0m 6       [0m | [0m 0.5518  [0m | [0m 4.387   [0m | [0m 3.0     [0m | [0m 0.5544  [0m |




| [0m 7       [0m | [0m 0.5782  [0m | [0m 2.303   [0m | [0m 2.915   [0m | [0m 0.6916  [0m |




| [0m 8       [0m | [0m 0.5527  [0m | [0m 5.411   [0m | [0m 2.106   [0m | [0m 0.8663  [0m |




| [0m 9       [0m | [0m 0.5818  [0m | [0m 4.196   [0m | [0m 2.621   [0m | [0m 1.0     [0m |
| [0m 10      [0m | [0m 0.5536  [0m | [0m 3.72    [0m | [0m 1.231   [0m | [0m 0.9286  [0m |




In [25]:
# LightGBM 튜닝
from lightgbm import LGBMClassifier

X = train.drop(columns = ['index', 'quality'])
y = train['quality']

In [26]:
lgbm_parameter_bounds = {
    'n_estimators': (30, 100),
    'max_depth': (1, 3),
    'subsample': (0.5, 1)
}

In [27]:
def lgbm_bo(n_estimators, max_depth, subsample):
    lgbm_params = {
        'n_estimators': int(round(n_estimators)),
        'max_depth': int(round(max_depth)),
        'subsample': int(round(subsample))
    }
    lgbm = LGBMClassifier(**lgbm_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    
    lgbm.fit(X_train, y_train)
    score = accuracy_score(y_valid, lgbm.predict(X_valid))
    return score

In [28]:
BO_lgbm = BayesianOptimization(f = lgbm_bo, pbounds = lgbm_parameter_bounds,random_state = 0)
BO_lgbm.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... | subsample |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.5655  [0m | [0m 2.098   [0m | [0m 80.06   [0m | [0m 0.8014  [0m |
| [95m 2       [0m | [95m 0.57    [0m | [95m 2.09    [0m | [95m 59.66   [0m | [95m 0.8229  [0m |
| [95m 3       [0m | [95m 0.5736  [0m | [95m 1.875   [0m | [95m 92.42   [0m | [95m 0.9818  [0m |
| [95m 4       [0m | [95m 0.5773  [0m | [95m 1.767   [0m | [95m 85.42   [0m | [95m 0.7644  [0m |
| [0m 5       [0m | [0m 0.5564  [0m | [0m 2.136   [0m | [0m 94.79   [0m | [0m 0.5355  [0m |
| [0m 6       [0m | [0m 0.5391  [0m | [0m 1.829   [0m | [0m 85.39   [0m | [0m 0.8606  [0m |
| [0m 7       [0m | [0m 0.5418  [0m | [0m 1.461   [0m | [0m 97.03   [0m | [0m 0.6916  [0m |
| [0m 8       [0m | [0m 0.5473  [0m | [0m 2.082   [0m | [0m 68.72   [0m | [0m 0.8663  [0m |
| [0m 9       [0m | [0m 0.5591  [0m | [0

In [30]:
print(BO_rf.max['params'])
print(BO_xgb.max['params'])
print(BO_lgbm.max['params'])

{'max_depth': 3.0, 'n_estimators': 86.77395944328902}
{'gamma': 4.375872112626925, 'max_depth': 2.7835460015641598, 'subsample': 0.9818313802505146}
{'max_depth': 1.7668830376515554, 'n_estimators': 85.42075266578652, 'subsample': 0.7644474598764522}


In [49]:
LGBM = LGBMClassifier(max_depth = 2, n_estimators = 85, subsample = 0.7644)
XGB = XGBClassifier(gamma = 4.375, max_depth = 3, subsample = 0.9818)
RF = RandomForestClassifier(max_depth = 3, n_estimators = 87)

In [50]:
from sklearn.ensemble import VotingClassifier

VC = VotingClassifier(estimators=[('rf',RF),('xgb',XGB),('lgbm',LGBM)],voting = 'soft')

In [51]:
train_one = pd.get_dummies(train)
test_one = pd.get_dummies(test)

X = train_one.drop('quality', axis = 1)
y = train_one['quality']

In [52]:
VC.fit(X, y)





VotingClassifier(estimators=[('rf',
                              RandomForestClassifier(max_depth=3,
                                                     n_estimators=87)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, gamma=4.375,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=None,
                                            max_delta_step=None, max_depth=3,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints=None,
                                            n_estim

In [53]:
pred = VC.predict(test_one)

In [55]:
submission = pd.read_csv('data/sample_submission.csv')
submission['quality'] = pred
submission.to_csv('tune_voting.csv',index = False)