## seed ensemble

- seed에 따라 예측값이 조금씩 달라지므로, 이 변동의 영향을 줄이기 위해 6개의 seed(0부터 5) 각각으로 훈련·예측한 뒤 6개 예측값의 평균을 최종 예측값으로 사용함

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the CSV paths used below live under this mount point.
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm

from xgboost import XGBRegressor

In [3]:
# Load the preprocessed train/test sets, dropping the "hour" column from each as soon as it is read.
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/XGBoost/data/train_preprocessed.csv").drop(columns="hour")
# Missing "rain" values in the training set are replaced with 0.
train["rain"] = train["rain"].fillna(0)
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/XGBoost/data/test_preprocessed.csv").drop(columns="hour")

# Hyperparameter table; the training loop indexes it by row position (one row per building).
xgb_params = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/XGBoost/data/hyperparameter_xgb_final.csv")

In [4]:
def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    The returned callable takes ``(label, pred)`` and returns ``(grad, hess)``:
    the first and second derivatives, with respect to ``pred``, of the squared
    residual ``(label - pred) ** 2``. Where the residual is positive
    (``label > pred``, i.e. the model under-predicts) both derivatives are
    multiplied by ``alpha``; elsewhere the plain squared-error derivatives
    are used.
    """
    def weighted_mse_fixed(label, pred):
        error = (label - pred).astype("float")
        # Per-element weight: alpha for under-predictions, 1 otherwise.
        weight = np.where(error > 0, alpha, 1)
        grad = -2 * weight * error
        hess = 2.0 * weight
        return grad, hess
    return weighted_mse_fixed

def SMAPE(true, pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computed as ``mean(|true - pred| / (|true| + |pred|)) * 100``. The
    denominator is not halved, so the result ranges from 0 to 100.
    """
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    return np.mean(abs_error / scale) * 100

In [5]:
# Seed ensemble: for each of the 100 buildings (num = 1..100), train one XGBoost model
# per seed and use the row-wise mean of the per-seed predictions as the building's forecast.
seeds = [0, 1, 2, 3, 4, 5]
building_preds = []   # one array of averaged predictions per building, in building order

for i in tqdm(range(100)):

    # The train/test subsets depend only on the building, not on the seed,
    # so they are built once per building instead of once per seed.
    is_train_building = train.num == i+1
    y_train = train.loc[is_train_building, 'power']
    x_train = train.loc[is_train_building].drop(["num", "date_time", "power"], axis = 1)
    x_test = test.loc[test.num == i+1].drop(["num", "date_time"], axis = 1)
    x_test = x_test[x_train.columns]   # same feature columns, in the same order, as training

    # Scalar .iloc[i, j] lookups are used (rather than pulling the whole row as a Series)
    # so that each hyperparameter keeps its own column dtype.
    building_alpha = xgb_params.iloc[i, 6]

    seed_preds = {}   # seed -> predictions for this building
    for seed in seeds:
        xgb = XGBRegressor(seed = seed, n_estimators = xgb_params.iloc[i, 7], eta = 0.01,
                           min_child_weight = xgb_params.iloc[i, 2], max_depth = xgb_params.iloc[i, 3],
                           colsample_bytree=xgb_params.iloc[i, 4], subsample=xgb_params.iloc[i, 5])

        if building_alpha != 0:  # a non-zero alpha selects the asymmetric weighted_mse objective
            xgb.set_params(**{'objective':weighted_mse(building_alpha)})

        xgb.fit(x_train, y_train)
        seed_preds[seed] = xgb.predict(x_test)

    # Build the frame in one step (one column per seed) rather than enlarging an empty
    # DataFrame column by column, then average across seeds for each test row.
    pred = pd.DataFrame(seed_preds).mean(axis=1)
    building_preds.append(pred.to_numpy())

# Concatenate once at the end instead of re-copying the array with np.append on every
# iteration; the result is a flat float64 array as before.
preds = np.concatenate(building_preds).astype("float64")

100%|██████████| 100/100 [18:02<00:00, 10.82s/it]


In [6]:
# Fill the sample submission's 'answer' column with the ensemble predictions and save the result.
submission = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/XGBoost/data/sample_submission.csv').assign(answer = preds)
submission.to_csv('/content/drive/MyDrive/Colab Notebooks/XGBoost/data/submission.csv', index = False)