In [1]:
import pandas as pd
import numpy as np

from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import gc

import shap

In [2]:
DEBUG = False

if DEBUG:
    NFOLDS = 2
    N_ESTIMATORS = 50
else:
    NFOLDS = 5
    N_ESTIMATORS = 500

In [3]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Data Preparation

In [4]:
# 모든 값이 동일한 column 제거
train = train.loc[:, train.max() != train.min()]

train.csv 
-     30일 간의 기상청 데이터 (X00\~X39) 및 센서데이터 (Y00\~Y17)
-     이후 3일 간의 기상청 데이터 (X00\~X39) 및 센서데이터 (Y18)

10분 간격으로 측정되었기 때문에 하루에 총 144번 측정됨(6\*24 = 144)  
총 33일치 데이터기 때문에 4,752개의 데이터가 존재함(144\*33 = 4752)

이중, 30일간 측정된 4,320개의 데이터는 (Y00\~Y17)의 센서데이터만 존재하며, 이후 3일간 측정된 432개의 데이터는 Y18의 센서데이터만 존재함

In [5]:
6*24*33, 6*24*30, 6*24*3

(4752, 4320, 432)

In [6]:
train['Y18'].isnull().sum(), train['Y18'].notnull().sum()

(4320, 432)

In [7]:
train_first_30 = train[train['Y18'].isnull()]
train_first_30.shape

(4320, 57)

In [8]:
train_later_3 = train[train['Y18'].notnull()]
train_later_3.shape

(432, 57)

In [9]:
train_first_30

Unnamed: 0,id,X00,X01,X02,X03,X04,X05,X06,X07,X08,...,Y09,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17,Y18
0,0,9.7,988.8,1.2,0.6,0.0,1009.3,989.6,12.2,1009.9,...,7.0,7.5,7.0,9.0,10.0,9.5,9.0,8.0,9.0,
1,1,9.3,988.9,1.7,1.9,0.0,1009.3,989.6,12.1,1010.0,...,6.5,7.5,7.0,8.5,10.0,9.5,9.0,7.5,9.0,
2,2,9.4,989.0,1.1,2.3,0.0,1009.2,989.7,12.1,1010.1,...,6.5,7.5,6.5,8.0,9.5,9.5,8.5,7.5,8.5,
3,3,9.4,988.9,1.5,0.7,0.0,1009.2,989.6,12.0,1010.0,...,6.0,7.0,6.0,8.0,9.5,9.0,8.5,7.5,8.5,
4,4,9.2,988.9,0.8,1.7,0.0,1009.2,989.7,12.0,1010.1,...,6.0,7.0,6.0,7.5,9.5,9.0,8.5,7.5,8.5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,4315,19.5,987.8,1.7,0.4,0.0,1007.8,988.0,21.3,1007.8,...,19.5,20.0,19.0,20.5,20.5,20.0,20.0,20.5,20.0,
4316,4316,19.3,987.7,2.1,0.9,0.0,1007.8,988.1,21.3,1008.0,...,19.5,20.0,19.0,20.5,20.5,20.0,19.5,20.5,19.5,
4317,4317,19.5,987.8,0.9,1.3,0.0,1007.8,988.3,21.2,1008.1,...,19.5,20.0,19.0,20.0,20.0,19.5,19.5,20.0,19.0,
4318,4318,20.0,987.8,1.4,0.5,0.0,1007.8,988.3,21.1,1008.1,...,19.5,20.0,18.5,20.0,20.0,19.5,19.5,20.0,19.0,


[본 공유코드](https://dacon.io/competitions/official/235584/codeshare/707)를 통해 Y15, Y16이 Y18과 연관성이 높다는 것을 알 수 있습니다.

In [10]:
X_train = train_first_30.loc[:, 'X00':'X39']
X_train

Unnamed: 0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39
0,9.7,988.8,1.2,0.6,0.0,1009.3,989.6,12.2,1009.9,1009.8,...,69.1,8.2,10.7,1010.1,0.00,256.4,0.0,77.2,62.6,0.0
1,9.3,988.9,1.7,1.9,0.0,1009.3,989.6,12.1,1010.0,1009.9,...,70.3,8.3,10.3,1010.1,0.00,215.4,0.0,77.3,63.5,0.0
2,9.4,989.0,1.1,2.3,0.0,1009.2,989.7,12.1,1010.1,1010.1,...,71.5,8.0,9.7,1010.0,0.00,235.2,0.0,77.3,63.9,0.0
3,9.4,988.9,1.5,0.7,0.0,1009.2,989.6,12.0,1010.0,1010.0,...,73.2,7.7,9.4,1010.1,0.00,214.0,0.0,77.5,64.5,0.0
4,9.2,988.9,0.8,1.7,0.0,1009.2,989.7,12.0,1010.1,1010.0,...,74.3,7.4,9.4,1010.1,0.00,174.9,0.0,78.0,65.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4315,19.5,987.8,1.7,0.4,0.0,1007.8,988.0,21.3,1007.8,1007.5,...,82.3,18.8,19.4,1008.2,16.68,272.9,0.0,82.3,74.3,0.0
4316,19.3,987.7,2.1,0.9,0.0,1007.8,988.1,21.3,1008.0,1007.4,...,85.8,18.6,18.5,1008.4,16.68,315.9,0.0,82.4,74.8,0.0
4317,19.5,987.8,0.9,1.3,0.0,1007.8,988.3,21.2,1008.1,1007.5,...,84.1,18.4,19.1,1008.4,16.68,343.3,0.0,82.8,75.4,0.0
4318,20.0,987.8,1.4,0.5,0.0,1007.8,988.3,21.1,1008.1,1007.5,...,85.4,18.2,19.0,1008.4,16.68,341.7,0.0,82.8,75.8,0.0


'Y15', 'Y16'을 각각 target 데이터로 하는 훈련데이터를 만들기 위해 X_train을 두배로 늘려준 뒤, 'Y18'을 target 데이터로 하기 위해 train_later_3를 훈련데이터로 더 추가해준다.

In [11]:
X_train = pd.concat([X_train, X_train], axis=0)
X_train = pd.concat([X_train, train_later_3.loc[:, 'X00':'X39']], axis=0)
X_train

Unnamed: 0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X30,X31,X32,X33,X34,X35,X36,X37,X38,X39
0,9.7,988.8,1.2,0.6,0.0,1009.3,989.6,12.2,1009.9,1009.8,...,69.1,8.2,10.7,1010.1,0.00,256.4,0.0,77.2,62.6,0.0
1,9.3,988.9,1.7,1.9,0.0,1009.3,989.6,12.1,1010.0,1009.9,...,70.3,8.3,10.3,1010.1,0.00,215.4,0.0,77.3,63.5,0.0
2,9.4,989.0,1.1,2.3,0.0,1009.2,989.7,12.1,1010.1,1010.1,...,71.5,8.0,9.7,1010.0,0.00,235.2,0.0,77.3,63.9,0.0
3,9.4,988.9,1.5,0.7,0.0,1009.2,989.6,12.0,1010.0,1010.0,...,73.2,7.7,9.4,1010.1,0.00,214.0,0.0,77.5,64.5,0.0
4,9.2,988.9,0.8,1.7,0.0,1009.2,989.7,12.0,1010.1,1010.0,...,74.3,7.4,9.4,1010.1,0.00,174.9,0.0,78.0,65.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747,19.9,987.6,0.9,0.8,0.0,1006.9,987.7,21.7,1007.5,1007.4,...,89.9,17.7,19.1,1007.5,22.16,218.6,0.0,82.3,58.6,0.5
4748,19.9,987.6,0.5,0.7,0.0,1006.8,987.7,21.6,1007.5,1007.4,...,91.3,17.7,19.2,1007.5,22.16,161.7,0.0,82.5,59.1,0.5
4749,19.7,987.7,0.9,0.6,0.0,1006.9,987.6,21.4,1007.4,1007.5,...,90.2,17.8,19.2,1007.5,22.16,254.2,0.0,83.0,58.9,0.5
4750,19.4,987.7,0.9,0.8,0.0,1006.9,987.8,21.3,1007.6,1007.5,...,90.1,17.7,19.3,1007.6,22.16,300.0,0.0,83.2,59.8,0.5


## Standardization

In [12]:
#표준화 함수 생성
def standardization(df):
    mean = np.mean(df)
    std = np.std(df)
    norm = (df - mean) / (std - 1e-07)
    return norm, mean, std

In [13]:
# 학습데이터 표준화
X_train_norm, mean, std = standardization(X_train)

In [14]:
y_train = train_first_30.loc[:, 'Y15']
y_train = pd.concat([y_train, y_train], axis=0)
y_train = pd.concat([y_train, train_later_3.loc[:, 'Y18']], axis=0)
y_train

0        9.0
1        9.0
2        8.5
3        8.5
4        8.5
        ... 
4747    21.5
4748    21.5
4749    21.5
4750    21.5
4751    21.0
Length: 9072, dtype: float64

In [15]:
X_test = test.loc[:, 'X00':'X39']

# 모든 값이 동일한 column 제거
X_test = X_test.loc[:, X_test.max() != X_test.min()]

# 테스트데이터 표준화
X_test_norm = (X_test - mean) / (std - 1e-07)

In [16]:
# ID로 시간 변수 생성
minute = pd.Series((X_train_norm.index%144).astype(int))
hour= pd.Series((X_train_norm.index%144/6).astype(int))

# 삼각함수를 이용한 시간변수 생성
min_in_day = 24*6
hour_in_day = 24

minute_sin = np.sin(np.pi*minute/min_in_day) 
minute_cos = np.cos(np.pi*minute/min_in_day)

hour_sin  = np.sin(np.pi*hour/hour_in_day)
hour_cos  = np.cos(np.pi*hour/hour_in_day)

In [17]:
X_train_norm['minute_sin'] = minute_sin
X_train_norm['minute_cos'] = minute_cos

X_train_norm['hour_sin'] = hour_sin
X_train_norm['hour_cos'] = hour_cos
X_train_norm

Unnamed: 0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X34,X35,X36,X37,X38,X39,minute_sin,minute_cos,hour_sin,hour_cos
0,-2.200858,0.021446,-0.302269,-0.901206,-0.294097,0.250176,0.140271,-2.125617,0.278398,0.359492,...,-1.012858,0.492439,-0.281954,0.584950,0.253448,-0.239790,0.000000,1.000000,0.000000,1.000000
1,-2.285482,0.050911,0.160625,0.473953,-0.294097,0.250176,0.140271,-2.148344,0.306617,0.387481,...,-1.012858,0.076828,-0.281954,0.590018,0.302031,-0.239790,0.021815,0.999762,0.000000,1.000000
2,-2.264326,0.080376,-0.394848,0.897079,-0.294097,0.220989,0.169400,-2.148344,0.334836,0.443458,...,-1.012858,0.277538,-0.281954,0.590018,0.323624,-0.239790,0.043619,0.999048,0.000000,1.000000
3,-2.264326,0.050911,-0.024533,-0.795425,-0.294097,0.220989,0.140271,-2.171071,0.306617,0.415469,...,-1.012858,0.062636,-0.281954,0.600155,0.356013,-0.239790,0.065403,0.997859,0.000000,1.000000
4,-2.306638,0.050911,-0.672584,0.262390,-0.294097,0.220989,0.169400,-2.171071,0.334836,0.415469,...,-1.012858,-0.333715,-0.281954,0.625499,0.383004,-0.239790,0.087156,0.996195,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4747,-0.042946,-0.332135,-0.580005,-0.689643,-0.294097,-0.450307,-0.413183,0.033452,-0.398857,-0.312228,...,0.927250,0.109266,-0.281954,0.843450,0.037521,-0.133855,0.108867,-0.994056,0.130526,-0.991445
4748,-0.042946,-0.332135,-0.950320,-0.795425,-0.294097,-0.479494,-0.413183,0.010725,-0.398857,-0.312228,...,0.927250,-0.467522,-0.281954,0.853587,0.064512,-0.133855,0.087156,-0.996195,0.130526,-0.991445
4749,-0.085258,-0.302670,-0.580005,-0.901206,-0.294097,-0.450307,-0.442312,-0.034729,-0.427076,-0.284240,...,0.927250,0.470138,-0.281954,0.878930,0.053716,-0.133855,0.065403,-0.997859,0.130526,-0.991445
4750,-0.148726,-0.302670,-0.580005,-0.689643,-0.294097,-0.450307,-0.384054,-0.057456,-0.370638,-0.284240,...,0.927250,0.934407,-0.281954,0.889067,0.102299,-0.133855,0.043619,-0.999048,0.130526,-0.991445


In [18]:
# ID로 시간 변수 생성
minute = pd.Series((test.id%144).astype(int))
hour= pd.Series((test.id%144/6).astype(int))

# 삼각함수를 이용한 시간변수 생성
min_in_day = 24*6
hour_in_day = 24

minute_sin = np.sin(np.pi*minute/min_in_day) 
minute_cos = np.cos(np.pi*minute/min_in_day)

hour_sin  = np.sin(np.pi*hour/hour_in_day)
hour_cos  = np.cos(np.pi*hour/hour_in_day)

In [19]:
X_test_norm['minute_sin'] = minute_sin
X_test_norm['minute_cos'] = minute_cos

X_test_norm['hour_sin'] = hour_sin
X_test_norm['hour_cos'] = hour_cos
X_test_norm

Unnamed: 0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X34,X35,X36,X37,X38,X39,minute_sin,minute_cos,hour_sin,hour_cos
0,-0.296818,-0.391065,-0.024533,-0.160736,-0.294097,-0.479494,-0.413183,-0.102911,-0.398857,-0.368205,...,-1.012858,-0.133005,-0.281954,0.929616,0.107697,-0.239790,0.000000,1.000000,0.000000,1.000000
1,-0.275662,-0.391065,-0.394848,-0.266518,-0.294097,-0.508680,-0.413183,-0.080183,-0.398857,-0.368205,...,-1.012858,-0.351962,-0.281954,0.929616,0.140086,-0.239790,0.021815,0.999762,0.000000,1.000000
2,-0.233350,-0.391065,-0.209690,-0.266518,-0.294097,-0.537867,-0.442312,-0.080183,-0.427076,-0.368205,...,-1.012858,0.007897,-0.281954,0.980303,0.156281,-0.239790,0.043619,0.999048,0.000000,1.000000
3,-0.296818,-0.391065,0.716097,-0.689643,-0.294097,-0.537867,-0.442312,-0.102911,-0.427076,-0.368205,...,-1.012858,-0.231333,-0.281954,1.020852,0.177873,-0.239790,0.065403,0.997859,0.000000,1.000000
4,-0.360286,-0.391065,-0.394848,-0.795425,-0.294097,-0.508680,-0.413183,-0.148365,-0.398857,-0.368205,...,-1.012858,-0.456371,-0.281954,1.096881,0.204864,-0.239790,0.087156,0.996195,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11515,1.184102,1.317909,2.197357,2.483800,-0.294097,0.950658,1.363694,1.078896,1.237842,1.143166,...,0.699620,-0.387441,-0.281954,0.711665,0.517958,-0.133855,0.108867,-0.994056,0.130526,-0.991445
11516,1.162946,1.317909,1.364149,1.637549,-0.294097,0.950658,1.363694,1.078896,1.237842,1.143166,...,0.699620,-0.323578,-0.281954,0.706597,0.539550,-0.133855,0.087156,-0.996195,0.130526,-0.991445
11517,1.162946,1.317909,2.104778,1.108641,-0.294097,1.009031,1.334565,1.056169,1.209623,1.171155,...,0.699620,-0.031637,-0.281954,0.686322,0.561143,-0.133855,0.065403,-0.997859,0.130526,-0.991445
11518,1.141790,1.317909,0.808676,1.425986,-0.294097,0.979845,1.334565,1.033442,1.209623,1.143166,...,0.699620,-0.340811,-0.281954,0.731940,0.588134,-0.133855,0.043619,-0.999048,0.130526,-0.991445


In [20]:
# 훈련, 테스트 데이터의 column 개수가 동일한지 확인
len(X_train_norm.columns) == len(X_test_norm.columns)

True

In [21]:
X_train_norm_Y18 = X_train_norm.iloc[4320+4320:, :]

In [22]:
y_train_Y18 = y_train[4320+4320:]

# Feature Selection

In [31]:
# SHAP와 Feature_importances_에 의해서 선택된 상위 13개 피처
selected_columns = [
                    'X00',
                    'X07',
                    'X12',
                    'X20',
                    'X28',
                    'X30',
                    'X31',
                    'X32',
                    'X34',
                    'X37',
                    'hour_sin',
                    'minute_cos',
                    'minute_sin'
                    ]

In [24]:
# Y15, Y16, Y18를 활용한 훈련 데이터 세트
X_train_norm = X_train_norm[selected_columns]

# Y18만 활용한 훈련 데이터 세트
X_train_norm_Y18 = X_train_norm_Y18[selected_columns]

X_test_norm = X_test_norm[selected_columns]

# Modeling

In [25]:
# 실제값과 예측값의 차이가 1보다 작은 건 무시하는 Custom MSE Metric
def mse_ignoring_diff_less_than_one(y_pred, y_true):

    diff = abs(y_true - y_pred)
    mask_less_than_one = np.array([0 if x < 1 else 1 for x in diff])

    y_pred = mask_less_than_one * y_pred
    y_true = mask_less_than_one *y_true
    
    score =  mean_squared_error(y_true, y_pred)
    
    return 'score', score, False

In [26]:
# Y15, Y16, Y18을 활용한 훈련데이터 세트 모델링

feature_importance = pd.DataFrame()
feature_importance['feature'] = X_train_norm.columns

folds = KFold(n_splits=NFOLDS, shuffle=True)
y_preds = np.zeros(X_test_norm.shape[0])

model = LGBMRegressor(n_estimators=N_ESTIMATORS,
                       boosting_type='gbdt',
                       num_leaves=60,
                       max_depth=-1,
                       learning_rate=0.01,
                       objective='regression',
                       sub_sample=0.7,
                       colsample_bytree=0.8,
                       min_data_in_leaf=100,
                       bagging_seed=11,
                       # metric=mse_ignoring_diff_less_than_one,
                       verbosity=-1,
                       random_state=47)

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train_norm, y_train)):
    print('Fold: ', fold_n+1)
    X_train_fold, X_valid_fold = X_train_norm.iloc[train_index], X_train_norm.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    evals = [(X_train_fold, y_train_fold), (X_valid_fold,  y_valid_fold)]
    model.fit(X_train_fold, 
              y_train_fold, 
              eval_metric=mse_ignoring_diff_less_than_one,
              eval_set=evals,
              early_stopping_rounds=50,
              verbose=100
             )
    
    shap_values = shap.TreeExplainer(model.booster_).shap_values(X_train_fold)
    fold_importance = pd.DataFrame()
    fold_importance['feature'] = X_train_norm.columns
    fold_importance['shap_values'] = abs(np.array(shap_values)[:, :].mean(0))
    fold_importance['feat_importance'] = model.feature_importances_
    feature_importance = pd.concat([feature_importance, fold_importance], sort=False)
        
    y_preds += model.predict(X_test_norm) / NFOLDS
    
    del X_train_fold, X_valid_fold, y_train_fold, y_valid_fold
    gc.collect()

feature_importance = feature_importance.groupby('feature').mean().sort_values('shap_values', ascending=False).reset_index()

Fold:  1
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.4332	training's score: 7.35256	valid_1's l2: 7.71806	valid_1's score: 7.6382
[200]	training's l2: 1.74821	training's score: 1.54372	valid_1's l2: 1.96548	valid_1's score: 1.76462
[300]	training's l2: 0.803207	training's score: 0.622977	valid_1's l2: 1.02513	valid_1's score: 0.846123
[400]	training's l2: 0.574127	training's score: 0.430804	valid_1's l2: 0.822886	valid_1's score: 0.672947
[500]	training's l2: 0.468428	training's score: 0.33888	valid_1's l2: 0.720335	valid_1's score: 0.578521
Did not meet early stopping. Best iteration is:
[500]	training's l2: 0.468428	training's score: 0.33888	valid_1's l2: 0.720335	valid_1's score: 0.578521
Fold:  2
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.50885	training's score: 7.42464	valid_1's l2: 7.42851	valid_1's score: 7.35026
[200]	training's l2: 1.77774	training's score: 1.57299	valid_1's l2: 1.81805	valid_1's 

In [27]:
# Y18만 활용한 훈련데이터 세트 모델링

feature_importance_Y18 = pd.DataFrame()
feature_importance_Y18['feature'] = X_train_norm_Y18.columns

folds = KFold(n_splits=NFOLDS, shuffle=True)
y_preds_Y18 = np.zeros(X_test_norm.shape[0])

model_Y18 = LGBMRegressor(n_estimators=N_ESTIMATORS,
                       boosting_type='gbdt',
                       num_leaves=70,
                       max_depth=-1,
                       learning_rate=0.01,
                       objective='regression',
                       sub_sample=0.8,
                       colsample_bytree=0.8,
                       min_data_in_leaf=50,
                       bagging_seed=11,
                       # metric=mse_ignoring_diff_less_than_one,
                       verbosity=-1,
                       random_state=47)

for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train_norm_Y18, y_train_Y18)):
    print('Fold: ', fold_n+1)
    X_train_fold, X_valid_fold = X_train_norm_Y18.iloc[train_index], X_train_norm_Y18.iloc[valid_index]
    y_train_fold, y_valid_fold = y_train_Y18.iloc[train_index], y_train_Y18.iloc[valid_index]
    
    evals = [(X_train_fold, y_train_fold), (X_valid_fold,  y_valid_fold)]
    model_Y18.fit(X_train_fold, 
              y_train_fold, 
              eval_metric=mse_ignoring_diff_less_than_one,
              eval_set=evals,
              early_stopping_rounds=50,
              verbose=100
             )
    
    shap_values = shap.TreeExplainer(model_Y18.booster_).shap_values(X_train_fold)
    fold_importance = pd.DataFrame()
    fold_importance['feature'] = X_train_norm_Y18.columns
    fold_importance['shap_values'] = abs(np.array(shap_values)[:, :].mean(0))
    fold_importance['feat_importance'] = model_Y18.feature_importances_
    feature_importance_Y18 = pd.concat([feature_importance_Y18, fold_importance], sort=False)
        
    y_preds_Y18 += model_Y18.predict(X_test_norm) / NFOLDS
    
    del X_train_fold, X_valid_fold, y_train_fold, y_valid_fold
    gc.collect()

feature_importance_Y18 = feature_importance_Y18.groupby('feature').mean().sort_values('shap_values', ascending=False).reset_index()

Fold:  1
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.32084	training's score: 7.25458	valid_1's l2: 7.89388	valid_1's score: 7.83743
[200]	training's l2: 1.9026	training's score: 1.66571	valid_1's l2: 2.31041	valid_1's score: 2.10388
[300]	training's l2: 0.941205	training's score: 0.772161	valid_1's l2: 1.35276	valid_1's score: 1.16931
[400]	training's l2: 0.740768	training's score: 0.572517	valid_1's l2: 1.16905	valid_1's score: 1.04453
[500]	training's l2: 0.630583	training's score: 0.450464	valid_1's l2: 1.08124	valid_1's score: 0.971446
Did not meet early stopping. Best iteration is:
[500]	training's l2: 0.630583	training's score: 0.450464	valid_1's l2: 1.08124	valid_1's score: 0.971446
Fold:  2
Training until validation scores don't improve for 50 rounds
[100]	training's l2: 7.40761	training's score: 7.36616	valid_1's l2: 7.46743	valid_1's score: 7.38879
[200]	training's l2: 1.94833	training's score: 1.70764	valid_1's l2: 2.28775	valid_1's sc

# Submission

In [28]:
ensembled_preds = 0.3 * y_preds + 0.7 * y_preds_Y18

submit = pd.DataFrame({'id':range(144*33, 144*113),
              'Y18':ensembled_preds})
submit.to_csv('ensemble_lgbm_result.csv', index = False)

- 피처간의 pairplot 그려보기

In [29]:
feature_importance_Y18

Unnamed: 0,feature,shap_values,feat_importance
0,minute_sin,0.111328,394.0
1,X31,0.073695,145.8
2,hour_sin,0.060172,195.6
3,X30,0.053614,242.8
4,X32,0.05114,106.6
5,X07,0.03709,130.2
6,minute_cos,0.036687,116.6
7,X20,0.03477,138.0
8,X28,0.032425,150.0
9,X12,0.021422,119.0


In [30]:
feature_importance

Unnamed: 0,feature,shap_values,feat_importance
0,hour_sin,0.110418,865.2
1,X07,0.103427,3693.0
2,X32,0.078661,2134.4
3,X31,0.073496,2231.8
4,minute_sin,0.062032,2136.8
5,X28,0.04619,1718.6
6,X00,0.044434,2074.8
7,minute_cos,0.028576,2347.8
8,X12,0.020957,1814.8
9,X34,0.010633,2681.4


`ensembled_preds = 0.4 * y_preds + 0.6 * y_preds_Y18`  
`num_leaves=70` for Y15, Y16, Y18 Full Data set  
`selected_columns` = 16개 피처  
**Score: 4.6456712856**

`ensembled_preds = 0.3 * y_preds + 0.7 * y_preds_Y18`  
`num_leaves=60` for Y15, Y16, Y18 Full Data set  
`selected_columns = ['X00','X07','X12','X20','X28','X30','X31','X32','X34','X37','hour_sin','minute_cos','minute_sin']`  
**Score: 4.3257990839**

### To do 
- Pairplot을 통한 피처간의 상관관계 조사 
- Y15, Y16, Y18 간의 상관관계를 통해 Y15, Y16을 더 조정할 수 있는지 여부 확인
- RNN활용시 Augmentation, Dropout 적용 여부 확인