# MultiOutput Catboost Regressor with Bayesian Optimization
- MultiOutput Regressor에 Bayesian Optimization을 적용하여 하이퍼 파라미터 튜닝을 실시하였습니다.
- 현재 MultiOutput Regressor에 대해서 Bayesian Optimization을 적용한 코드 예시가 없기 때문에 이를 만들었습니다.
- 검증은 5-Fold 교차검증(KFold, k=5)으로 수행합니다.
- 점수가 가장 좋았던 하이퍼 파라미터 조합 2개를 이용하여 추후에 앙상블하였습니다.

### 패키지 로딩

In [15]:
# Data Handling
import pandas as pd
import numpy as np

# Model Validation
from sklearn.model_selection import KFold
import sklearn.metrics as metrics

# Model Building
from sklearn.multioutput import MultiOutputRegressor
from catboost import CatBoostRegressor
from bayes_opt import BayesianOptimization

### 데이터 로딩

In [16]:
# Read the training table from disk.
data = pd.read_csv('C:/LG_Aimers_NEW/data/train.csv')

# Split by column position: column 0 is skipped (presumably the ID column --
# confirm against the CSV), columns 1..56 are the model inputs and every
# column from 57 onward is a regression target.
X, y = data.iloc[:, 1:57], data.iloc[:, 57:]

### 평가 함수 만들기

In [17]:
def lg_nrmse(gt, preds):
    """Return the weighted sum of per-target NRMSE over the first 14 targets.

    For each of the first 14 columns, the RMSE between ``gt`` and ``preds`` is
    divided by the mean absolute value of that ``gt`` column. The first 8
    columns are weighted by 1.2 (a 20% extra weight) and the remaining 6 by
    1.0. Columns beyond the 14th are ignored.

    The RMSE is computed directly with NumPy instead of
    ``sklearn.metrics.mean_squared_error(..., squared=False)``, because the
    ``squared`` argument was deprecated and later removed from scikit-learn.

    Args:
        gt: Ground-truth values, array-like of shape (n_samples, >= 14).
        preds: Predicted values, array-like with the same number of rows and
            at least 14 columns.

    Returns:
        The weighted NRMSE sum (lower is better). A target column whose
        ground truth is all zeros yields a division by zero (inf/nan).

    Raises:
        ValueError: If either input is not 2-D, the row counts differ, or
            fewer than 14 columns are supplied.
    """
    n_targets = 14   # number of target columns that are scored
    n_weighted = 8   # leading columns that receive the 1.2 weight

    gt = np.asarray(gt, dtype=float)
    preds = np.asarray(preds, dtype=float)

    if gt.ndim != 2 or preds.ndim != 2:
        raise ValueError("gt and preds must both be 2-D (n_samples, n_targets)")
    if gt.shape[0] != preds.shape[0]:
        raise ValueError("gt and preds must have the same number of rows")
    if gt.shape[1] < n_targets or preds.shape[1] < n_targets:
        raise ValueError("gt and preds must have at least 14 target columns")

    gt = gt[:, :n_targets]
    preds = preds[:, :n_targets]

    # Column-wise RMSE, normalised by the mean magnitude of each target.
    rmse = np.sqrt(np.mean((gt - preds) ** 2, axis=0))
    all_nrmse = rmse / np.mean(np.abs(gt), axis=0)

    score = 1.2 * np.sum(all_nrmse[:n_weighted]) + 1.0 * np.sum(all_nrmse[n_weighted:])
    return score

### 모델 적합 함수 만들기

In [18]:
def fit_model(X, y, X_test, i, depth, bagging_temperature, learning_rate, subsample):
    """Fit a multi-output CatBoost regressor and predict on ``X_test``.

    Args:
        X: Training features.
        y: Training targets (multiple columns).
        X_test: Features to predict on.
        i: Fold counter supplied by the caller; not used inside this function.
        depth: Tree depth; truncated to ``int`` because the optimizer
            proposes floats.
        bagging_temperature: Passed through to ``CatBoostRegressor``.
        learning_rate: Passed through to ``CatBoostRegressor``.
        subsample: Passed through to ``CatBoostRegressor``.

    Returns:
        The predictions of the fitted model for ``X_test``.
    """
    base_regressor = CatBoostRegressor(
        depth=int(depth),
        bagging_temperature=bagging_temperature,
        learning_rate=learning_rate,
        random_state=1339,  # fixed seed
        verbose=0,          # silence per-iteration training logs
        subsample=subsample,
    )
    # MultiOutputRegressor wraps the single-target estimator for multi-column y.
    multi_target_model = MultiOutputRegressor(base_regressor)
    multi_target_model.fit(X, y)
    return multi_target_model.predict(X_test)

### Cross Validation 함수 정의

In [19]:
def CAT_cv(depth, bagging_temperature, learning_rate, subsample):
    """Return the negated mean 5-fold ``lg_nrmse`` for one hyperparameter set.

    Uses the module-level ``X`` and ``y`` frames. The score is negated so that
    an optimizer which maximizes its objective ends up minimizing the error.

    Args:
        depth: Tree depth candidate (forwarded to ``fit_model``).
        bagging_temperature: Forwarded to ``fit_model``.
        learning_rate: Forwarded to ``fit_model``.
        subsample: Forwarded to ``fit_model``.

    Returns:
        ``-(sum of the fold scores) / 5``.
    """
    # Shuffled K-fold split with a fixed seed so every call sees the same folds.
    splitter = KFold(5, shuffle=True, random_state=1339)

    fold_scores = []
    for fold_no, (train_rows, val_rows) in enumerate(splitter.split(X), start=1):
        fold_pred = fit_model(
            np.array(X.iloc[train_rows]),
            np.array(y.iloc[train_rows]),
            np.array(X.iloc[val_rows]),
            fold_no,
            depth,
            bagging_temperature,
            learning_rate,
            subsample,
        )
        fold_scores.append(lg_nrmse(np.array(y.iloc[val_rows]), fold_pred))

    return -sum(fold_scores) / 5

### Bayesian Optimization

In [20]:
# Search ranges (lower, upper) for each CatBoost hyperparameter being tuned.
# `depth` is explored as a float and truncated to int inside fit_model.
pbounds = dict(
    depth=(3, 10),
    bagging_temperature=(3, 10),
    learning_rate=(0.01, 1.0),
    subsample=(0.5, 1),
)

In [None]:
# Optimizer over the pbounds ranges; CAT_cv returns a negated error, so
# maximizing it minimizes the cross-validated lg_nrmse.
bo = BayesianOptimization(
    f=CAT_cv,
    pbounds=pbounds,
    random_state=1,
    verbose=2,
)
# 50 initial probes followed by 200 guided iterations using the 'ei'
# acquisition setting with xi = 0.01.
# NOTE(review): the `acq`/`xi` keyword arguments depend on the installed
# bayes_opt version -- confirm they are still accepted by `maximize`.
bo.maximize(
    init_points=50,
    n_iter=200,
    acq='ei',
    xi=0.01,
)

In [None]:
# Show the optimizer's `max` entry (per bayes_opt, the best target value found
# and the parameters that produced it). The target is the negated CV score
# returned by CAT_cv.
print(bo.max)