In [49]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [50]:
# Scailing
scaler = MinMaxScaler()
scaler.fit(train[['fixed acidity']])
train['Scaled fixed acidity'] = scaler.transform(train[['fixed acidity']])
test['Scaled fixed acidity'] = scaler.transform(test[['fixed acidity']])

In [51]:
#Encoding
encoder = OneHotEncoder()
encoder.fit(train[['type']])

onehot = encoder.transform(train[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
train = pd.concat([train, onehot], axis = 1)
train = train.drop(columns = ['type'])

onehot = encoder.transform(test[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
test = pd.concat([test, onehot], axis = 1)
test = test.drop(columns = ['type'])

In [52]:
# Bayesian Optimization
from bayes_opt import BayesianOptimization

In [53]:
def rf_bo(max_depth, n_estimators):
    rf_params = {
        'max_depth': int(round(max_depth)),
        'n_estimators': int(round(n_estimators))
    }
    rf = RandomForestClassifier(**rf_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    
    rf.fit(X_train, y_train)
    score = accuracy_score(y_valid, rf.predict(X_valid))
    return score

In [54]:
X = train.drop(columns = ['index', 'quality'])
y = train['quality']

rf_parameter_bounds = {
    'max_depth': (1, 3),
    'n_estimators': (30, 100)
}

In [55]:
BO_rf = BayesianOptimization(f = rf_bo, pbounds = rf_parameter_bounds, random_state = 0)
BO_rf.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m 1       [0m | [0m 0.5073  [0m | [0m 2.098   [0m | [0m 80.06   [0m |
| [95m 2       [0m | [95m 0.5218  [0m | [95m 2.206   [0m | [95m 68.14   [0m |
| [0m 3       [0m | [0m 0.5209  [0m | [0m 1.847   [0m | [0m 75.21   [0m |
| [0m 4       [0m | [0m 0.5164  [0m | [0m 1.875   [0m | [0m 92.42   [0m |
| [95m 5       [0m | [95m 0.5445  [0m | [95m 2.927   [0m | [95m 56.84   [0m |
| [95m 6       [0m | [95m 0.5627  [0m | [95m 2.604   [0m | [95m 52.96   [0m |
| [0m 7       [0m | [0m 0.5218  [0m | [0m 2.539   [0m | [0m 46.04   [0m |
| [0m 8       [0m | [0m 0.4782  [0m | [0m 1.0     [0m | [0m 53.91   [0m |
| [0m 9       [0m | [0m 0.5318  [0m | [0m 2.639   [0m | [0m 52.99   [0m |
| [0m 10      [0m | [0m 0.5436  [0m | [0m 2.657   [0m | [0m 53.03   [0m |


In [56]:
max_params = BO_rf.max['params']
max_params['max_depth'] = int(max_params['max_depth'])
max_params['n_estimators'] = int(max_params['n_estimators'])
print(max_params)

{'max_depth': 2, 'n_estimators': 52}


In [57]:
BO_tuend_rf = RandomForestClassifier(**max_params)

In [66]:
# Bayesian Optimization을 이용해 XGBoost 모델 튜닝
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [67]:
# Scailing
scaler = MinMaxScaler()
scaler.fit(train[['fixed acidity']])
train['Scaled fixed acidity'] = scaler.transform(train[['fixed acidity']])
test['Scaled fixed acidity'] = scaler.transform(test[['fixed acidity']])

In [68]:
#Encoding
encoder = OneHotEncoder()
encoder.fit(train[['type']])

onehot = encoder.transform(train[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
train = pd.concat([train, onehot], axis = 1)
train = train.drop(columns = ['type'])

onehot = encoder.transform(test[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
test = pd.concat([test, onehot], axis = 1)
test = test.drop(columns = ['type'])

In [69]:
from xgboost import XGBClassifier

def xgb_bo(gamma, max_depth, subsample):
    xgb_params = {
        'gamma': int(round(gamma)),
        'max_depth': int(round(max_depth)),
        'subsample': int(round(subsample))
    }
    xgb = XGBClassifier(**xgb_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    
    xgb.fit(X_train, y_train)
    score = accuracy_score(y_valid, xgb.predict(X_valid))
    return score

In [70]:
X = train.drop(columns = ['index', 'quality'])
y = train['quality']

xgb_parameter_bounds = {
    'gamma': (0, 10),
    'max_depth': (1, 3),
    'subsample': (0.5, 1)
}

BO_xgb = BayesianOptimization(f = xgb_bo, pbounds = xgb_parameter_bounds, random_state = 0)
BO_xgb.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   |   gamma   | max_depth | subsample |
-------------------------------------------------------------




| [0m 1       [0m | [0m 0.5345  [0m | [0m 5.488   [0m | [0m 2.43    [0m | [0m 0.8014  [0m |




| [95m 2       [0m | [95m 0.55    [0m | [95m 5.449   [0m | [95m 1.847   [0m | [95m 0.8229  [0m |




| [95m 3       [0m | [95m 0.5573  [0m | [95m 4.376   [0m | [95m 2.784   [0m | [95m 0.9818  [0m |




| [95m 4       [0m | [95m 0.5745  [0m | [95m 3.834   [0m | [95m 2.583   [0m | [95m 0.7644  [0m |




| [0m 5       [0m | [0m 0.5609  [0m | [0m 5.68    [0m | [0m 2.851   [0m | [0m 0.5355  [0m |




| [0m 6       [0m | [0m 0.5455  [0m | [0m 6.692   [0m | [0m 2.421   [0m | [0m 0.9232  [0m |




| [95m 7       [0m | [95m 0.59    [0m | [95m 2.303   [0m | [95m 2.915   [0m | [95m 0.6916  [0m |




| [0m 8       [0m | [0m 0.5482  [0m | [0m 5.411   [0m | [0m 2.106   [0m | [0m 0.8663  [0m |




| [0m 9       [0m | [0m 0.5782  [0m | [0m 2.303   [0m | [0m 2.95    [0m | [0m 0.7461  [0m |
| [0m 10      [0m | [0m 0.5409  [0m | [0m 3.72    [0m | [0m 1.231   [0m | [0m 0.9286  [0m |




In [73]:
print(BO_xgb.max['params'])

{'gamma': 2.3028884906844747, 'max_depth': 2.915034887195457, 'subsample': 0.6916244375247732}


In [74]:
xgb_tune = XGBClassifier(gamma = 2.302, max_depth = 2, subsample = 0.6916)
xgb_tune.fit(X, y)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=2.302, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.6916,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [75]:
pred = xgb_tune.predict(test.drop(columns = ['index']))

In [76]:
submission = pd.read_csv('data/sample_submission.csv')
submission['quality'] = pred
submission.to_csv('tune_xgb.csv', index = False)

In [77]:
# Bayesian Optimization을 이용해 LightGBM 모델 튜닝
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [78]:
# Scailing
scaler = MinMaxScaler()
scaler.fit(train[['fixed acidity']])
train['Scaled fixed acidity'] = scaler.transform(train[['fixed acidity']])
test['Scaled fixed acidity'] = scaler.transform(test[['fixed acidity']])

In [79]:
#Encoding
encoder = OneHotEncoder()
encoder.fit(train[['type']])

onehot = encoder.transform(train[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
train = pd.concat([train, onehot], axis = 1)
train = train.drop(columns = ['type'])

onehot = encoder.transform(test[['type']])
onehot = onehot.toarray()
onehot = pd.DataFrame(onehot)
onehot.columns = encoder.get_feature_names()
test = pd.concat([test, onehot], axis = 1)
test = test.drop(columns = ['type'])

In [81]:
from lightgbm import LGBMClassifier

def lgbm_bo(n_estimators, max_depth, subsample):
    lgbm_params = {
        'n_estimators': int(round(n_estimators)),
        'max_depth': int(round(max_depth)),
        'subsample': int(round(subsample))
    }
    lgbm = LGBMClassifier(**lgbm_params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2)
    
    lgbm.fit(X_train, y_train)
    score = accuracy_score(y_valid, lgbm.predict(X_valid))
    return score

In [82]:
X = train.drop(columns = ['index', 'quality'])
y = train['quality']

lgbm_parameter_bounds = {
    'n_estimators': (30, 100),
    'max_depth': (1, 3),
    'subsample': (0.5, 1)
}

In [83]:
BO_lgbm = BayesianOptimization(f = lgbm_bo, pbounds = lgbm_parameter_bounds, random_state = 0)
BO_lgbm.maximize(init_points = 5, n_iter = 5)

|   iter    |  target   | max_depth | n_esti... | subsample |
-------------------------------------------------------------
| [0m 1       [0m | [0m 0.5618  [0m | [0m 2.098   [0m | [0m 80.06   [0m | [0m 0.8014  [0m |
| [95m 2       [0m | [95m 0.5691  [0m | [95m 2.09    [0m | [95m 59.66   [0m | [95m 0.8229  [0m |
| [95m 3       [0m | [95m 0.5973  [0m | [95m 1.875   [0m | [95m 92.42   [0m | [95m 0.9818  [0m |
| [0m 4       [0m | [0m 0.5764  [0m | [0m 1.767   [0m | [0m 85.42   [0m | [0m 0.7644  [0m |
| [0m 5       [0m | [0m 0.56    [0m | [0m 2.136   [0m | [0m 94.79   [0m | [0m 0.5355  [0m |
| [0m 6       [0m | [0m 0.5509  [0m | [0m 2.338   [0m | [0m 79.72   [0m | [0m 0.9232  [0m |
| [0m 7       [0m | [0m 0.5718  [0m | [0m 1.977   [0m | [0m 91.89   [0m | [0m 0.9572  [0m |
| [0m 8       [0m | [0m 0.5345  [0m | [0m 2.082   [0m | [0m 68.72   [0m | [0m 0.8663  [0m |
| [0m 9       [0m | [0m 0.5836  [0m | [0m 1.9

In [85]:
print(BO_lgbm.max['params'])

{'max_depth': 1.875174422525385, 'n_estimators': 92.42411005474558, 'subsample': 0.9818313802505146}


In [88]:
lgbm_tune = LGBMClassifier(n_estimators = 92, max_depth = 2, subsample = 0.9818)
lgbm_tune.fit(X, y)

LGBMClassifier(max_depth=2, n_estimators=92, subsample=0.9818)

In [89]:
pred = lgbm_tune.predict(test.drop(columns = ['index']))

In [90]:
submission = pd.read_csv('data/sample_submission.csv')
submission['quality'] = pred
submission.to_csv('tune_lgbm.csv', index = False)