# MultiOutput LGBM Regressor with Bayesian Optimization
- MultiOutput Regressor에 Bayesian Optimization을 적용하여 하이퍼 파라미터 튜닝을 실시하였습니다.
- 현재 MultiOutput Regressor에 대해서 Bayesian Optimization을 적용한 코드 예시가 없기 때문에 이를 만들었습니다.
- 검증은 5-Fold 교차검증(KFold, k=5)으로 수행합니다.
- 점수가 가장 잘 나온 하이퍼 파라미터 조합 2개를 이용하여 추후에 앙상블시켰습니다.

### 패키지 로딩

In [3]:
# Data Handling
import pandas as pd
import numpy as np

# Model selection and evaluation
from sklearn.model_selection import KFold
import sklearn.metrics as metrics

# Modeling
from lightgbm import LGBMRegressor
from sklearn.multioutput import MultiOutputRegressor
from bayes_opt import BayesianOptimization

### 데이터 로딩

In [4]:
# Read the training table from the local competition data directory.
data = pd.read_csv('C:/LG_Aimers/LG_Aimers_NEW/data/train.csv')

# Split the table by column position: columns 1-56 become the model inputs
# and columns 57 onward become the regression targets.  Column 0 is left out
# of both (presumably the 'ID' column -- confirm against the CSV header).
X = data.iloc[:, 1:57]
y = data.iloc[:, 57:]

### 평가함수 정의

In [5]:
def lg_nrmse(gt, preds):
    """Return the weighted sum of per-target NRMSE values (lower is better).

    For each of the first 14 columns, the RMSE between ``gt`` and ``preds``
    is divided by the mean absolute ground-truth value of that column.  The
    first 8 columns are weighted by 1.2 (a 20% extra weight) and the
    remaining 6 columns by 1.0; the weighted values are summed.

    Args:
        gt: 2-D array-like of ground-truth values with at least 14 columns.
        preds: 2-D array-like of predictions laid out like ``gt``.

    Returns:
        The weighted NRMSE sum as a NumPy float.

    Raises:
        IndexError: If either input is not 2-D or has fewer than 14 columns.
    """
    n_targets = 14
    n_weighted = 8  # leading targets that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)
    for name, arr in (("gt", gt), ("preds", preds)):
        if arr.ndim != 2 or arr.shape[1] < n_targets:
            raise IndexError(
                f"{name} must be 2-D with at least {n_targets} columns, "
                f"got shape {arr.shape}"
            )

    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]

    # RMSE is computed directly with NumPy instead of
    # mean_squared_error(..., squared=False): the `squared` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])
    return score

### 모델 적합함수 정의

In [6]:
def fit_model(X, y, X_test, i, num_leaves, learning_rate, n_estimators, subsample, colsample_bytree):
    """Fit a multi-output LightGBM regressor and predict on ``X_test``.

    One ``LGBMRegressor`` is trained per target column through
    ``MultiOutputRegressor``.

    Args:
        X: Training features.
        y: Training targets (one column per output).
        X_test: Features to predict on.
        i: Fold counter supplied by the caller; not used by this function.
        num_leaves: Number of leaves per tree; truncated to ``int``.
        learning_rate: Boosting learning rate.
        n_estimators: Number of boosting rounds; truncated to ``int``.
        subsample: Row subsampling fraction.
        colsample_bytree: Column subsampling fraction per tree.

    Returns:
        Predictions for ``X_test`` as returned by ``MultiOutputRegressor.predict``.
    """
    base_estimator = LGBMRegressor(
        num_leaves=int(num_leaves),
        learning_rate=learning_rate,
        n_estimators=int(n_estimators),
        subsample=subsample,
        # LightGBM only applies row subsampling when the bagging frequency is
        # non-zero; with the default of 0 the tuned `subsample` value would
        # be silently ignored.
        subsample_freq=1,
        colsample_bytree=colsample_bytree,
    )
    lgbm_model = MultiOutputRegressor(base_estimator)
    lgbm_model.fit(X, y)
    return lgbm_model.predict(X_test)

### Cross Validation 함수 정의

In [7]:
def LGBM_cv(num_leaves, learning_rate, n_estimators, subsample, colsample_bytree):
    """Return the negated mean 5-fold CV score for one hyperparameter set.

    The module-level ``X`` and ``y`` frames are split with a shuffled
    5-fold ``KFold`` (fixed seed 1339).  For every fold a model is trained via
    ``fit_model`` and scored on the held-out part with ``lg_nrmse``.  The
    mean score is negated so that an optimizer which maximizes its objective
    ends up minimizing the error.

    Args:
        num_leaves: Forwarded to ``fit_model``.
        learning_rate: Forwarded to ``fit_model``.
        n_estimators: Forwarded to ``fit_model``.
        subsample: Forwarded to ``fit_model``.
        colsample_bytree: Forwarded to ``fit_model``.

    Returns:
        The negative of the fold-averaged ``lg_nrmse`` score.
    """
    # K-fold cross-validation splitter
    splitter = KFold(5, shuffle=True, random_state=1339)

    fold_scores = []
    for fold_no, (train_idx, val_idx) in enumerate(splitter.split(X), start=1):
        train_features = np.array(X.iloc[train_idx])
        train_targets = np.array(y.iloc[train_idx])
        val_features = np.array(X.iloc[val_idx])
        val_targets = np.array(y.iloc[val_idx])

        val_predictions = fit_model(
            train_features,
            train_targets,
            val_features,
            fold_no,
            num_leaves,
            learning_rate,
            n_estimators,
            subsample,
            colsample_bytree,
        )
        fold_scores.append(lg_nrmse(val_targets, val_predictions))

    return -sum(fold_scores) / 5

### 베이지안 옵티마이저

In [8]:
# Search space for the Bayesian optimizer: each key is a keyword argument of
# LGBM_cv and each value is its (lower, upper) bound.
pbounds = {
    'num_leaves': (16, 1024),        # num_leaves, range 16-1024
    'learning_rate': (0.0001, 0.1),  # learning_rate, range 0.0001-0.1
    'n_estimators': (16, 1024),      # n_estimators, range 16-1024
    # LightGBM requires both sampling fractions to be strictly greater than
    # zero, and the optimizer may probe a boundary value exactly, so the
    # lower bounds start at 0.1 instead of 0.
    'subsample': (0.1, 1),           # subsample, range 0.1-1
    'colsample_bytree': (0.1, 1),    # colsample_bytree, range 0.1-1
}

In [None]:
# Set up the optimizer over the search space with LGBM_cv as the objective.
bo = BayesianOptimization(
    f=LGBM_cv,
    pbounds=pbounds,
    random_state=1,
    verbose=2,
)

# 50 initial points followed by 200 optimization iterations.
# NOTE(review): passing `acq`/`xi` to maximize() depends on the installed
# bayes_opt version -- confirm it is still accepted before upgrading.
bo.maximize(
    init_points=50,
    n_iter=200,
    acq='ei',
    xi=0.01,
)

In [None]:
# Show the optimizer's best recorded result (its target value and parameters).
print(bo.max)

{'target': -0.9884710321697995, 'params': {'colsample_bytree': 0.8749504388228964, 'gamma': 10.083312203045946, 'learning_rate': 0.08251735312986809, 'max_depth': 6.448478384832381, 'min_child_weight': 2.472122241236701, 'n_estimators': 206.1616899422121, 'subsample': 0.8088199427005741}}
