## Dacon  5 회 생체 광학 데이터 분석 AI 모델링 경진대회
## 초보
## 2020년 7월 03일

## 1. 라이브러리 및 데이터
## Library & Data

In [1]:
# LOAD LIBRARIES
import pandas as pd
import numpy as np
import pickle, os
import joblib
from tqdm import trange, tqdm, tqdm_notebook

# DATA SPLIT
from sklearn.model_selection import KFold, StratifiedKFold

# EVALUATE
from sklearn.metrics import mean_absolute_error

# MODEL
import lightgbm as lgb

# ELSE
import matplotlib.pyplot as plt
import warnings
import gc
warnings.filterwarnings('ignore')
%matplotlib inline

# NOTE(review): the return value of getcwd() is discarded, so this line has no effect.
os.getcwd()
# Move into the project folder so the relative './...' paths used below resolve.
# NOTE(review): absolute, machine-specific Windows path; must be edited to run elsewhere.
os.chdir('C:\\Users\\ParkGiChan\\Desktop\\DataAnalysis\\DACON_BIO')

## 2. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling

In [2]:
# Load the competition files from the current working directory.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
submission = pd.read_csv('./sample_submission.csv')

In [None]:
# Column labels whose name contains 'src' / 'dst'. They are computed once here and
# read as module-level globals by simple_fe and rolling_fe.
src_col=train.columns[train.columns.str.contains('src')]
dst_col = train.columns[train.columns.str.contains('dst')]

def simple_fe(train):
    """
    Add whole-spectrum summary features to ``train`` in place and return it.

    For the row-wise mean and then the row-wise standard deviation, taken over
    the module-level ``src_col`` / ``dst_col`` column selections, this adds
    ``src_<stat>``, ``dst_<stat>`` and their ratio ``Trans_<stat>`` (dst / src).
    """
    for stat in ('mean', 'std'):
        src_name = 'src_%s' % stat
        dst_name = 'dst_%s' % stat
        train[src_name] = getattr(train[src_col], stat)(1)
        train[dst_name] = getattr(train[dst_col], stat)(1)
        train['Trans_%s' % stat] = train[dst_name] / train[src_name]

    return train

train = simple_fe(train)
test = simple_fe(test)

############################################################

def binning_fe(train, src_col, dst_col, size=50):
    """
    Add binned transmittance features to ``train`` in place and return it.

    Bins start at 650 and advance by ``size`` while the start is below
    ``1000 - size``. For each bin the src and dst columns are averaged over the
    label slice ``<start>_src`` .. ``<start+size>_src`` (both ends included, so
    the columns must appear in wavelength order), and two columns are added:

    * ``Trans_<a>_to_<b>_mean``  = mean(dst) / mean(src)
    * ``Concen_<a>_to_<b>_mean`` = log(mean(dst) / mean(src)) / rho

    ``src_col`` and ``dst_col`` are accepted for call compatibility but unused.
    """
    for start in range(650, 1000 - size, size):
        stop = start + size
        src_avg = train.loc[:, '{}_src'.format(start):'{}_src'.format(stop)].mean(axis=1)
        dst_avg = train.loc[:, '{}_dst'.format(start):'{}_dst'.format(stop)].mean(axis=1)
        ratio = dst_avg / src_avg
        train['Trans_{}_to_{}_mean'.format(start, stop)] = ratio
        train['Concen_{}_to_{}_mean'.format(start, stop)] = np.log(ratio) / train['rho']

    return train

# Build the binned features for every bin width in the list, on train and test alike.
for i in [10, 20,30,40, 50,100]:
    train = binning_fe(train, src_col, dst_col, size=i)
    test = binning_fe(test, src_col, dst_col, size=i)
############################################################

def rolling_fe(train):
    """
    Add rolling-window concentration features to ``train`` in place and return it.

    For window sizes 3 and 5, the src and dst spectra (module-level ``src_col`` /
    ``dst_col`` selections) are smoothed with a trailing rolling mean across the
    columns, using ``min_periods=1`` so the first columns are averaged over
    fewer values. For every wavelength w in 650, 660, ..., 990 the column
    ``<w>_rolling_size_<k>`` = log(dst_smooth / src_smooth) / rho is added.
    """
    wavelengths = range(650, 1000, 10)
    for k in [3, 5]:  # window size
        # DataFrame.rolling(axis=1) is deprecated since pandas 2.1, so roll over the
        # transposed frame (wavelengths become rows) and transpose back.
        src_smooth = train[src_col].T.rolling(window=k, min_periods=1).mean().T
        dst_smooth = train[dst_col].T.rolling(window=k, min_periods=1).mean().T
        for w in wavelengths:
            train['%s_rolling_size_%s'%(w, k)] = np.log(dst_smooth['%s_dst'%w]/src_smooth['%s_src'%w])/train['rho']

    return train

train = rolling_fe(train)
test = rolling_fe(test)

############################################################

def rolling_fe2(train):
    """
    Add, for every wavelength w in 650, 660, ..., 990, the ratio of the
    window-3 rolling feature to the window-5 rolling feature
    (``<w>_rolling_size_3/5``) to ``train`` in place, then return ``train``.
    """
    for w in range(650, 1000, 10):
        numerator = train['%s_rolling_size_3' % w]
        denominator = train['%s_rolling_size_5' % w]
        train['%s_rolling_size_3/5' % w] = numerator / denominator

    return train

# Same transformation on both splits; requires the rolling_size_3 / rolling_size_5 columns.
train = rolling_fe2(train)
test = rolling_fe2(test)

############################################################
def rolling_fe3(train, near):
    """
    Add ratios between rolling features of wavelengths that lie ``near`` apart.

    For every start wavelength w in 650, 660, ... below ``1000 - near`` and for
    window sizes 3 and 5, the column ``near_<w>_<w+near>_size_<k>`` is set to
    ``<w>_rolling_size_<k> / <w+near>_rolling_size_<k>``. ``train`` is modified
    in place and returned.
    """
    for lo in range(650, 1000 - near, 10):
        hi = lo + near
        for win in (3, 5):
            ratio = train['%s_rolling_size_%s' % (lo, win)] / train['%s_rolling_size_%s' % (hi, win)]
            train['near_%s_%s_size_%s' % (lo, hi, win)] = ratio
    return train

# near takes the values 10 and 20 (range(10, 30, 10)).
for i in range(10, 30, 10):
    train = rolling_fe3(train,i)
    test = rolling_fe3(test, i)
############################################################

def core_fe(train):
    """
    Add per-wavelength transmittance and concentration-related features.

    For every wavelength w in 650, 660, ..., 990 this adds, in place:

    * ``Trans_<w>``  = <w>_dst / <w>_src
    * ``Concen_<w>`` = log(<w>_dst / <w>_src) / rho

    and returns ``train``.
    """
    for w in range(650, 1000, 10):
        transmittance = train['%s_dst' % w] / train['%s_src' % w]
        train['Trans_%s' % w] = transmittance
        train['Concen_%s' % w] = np.log(transmittance) / train['rho']
    return train

# Creates the Trans_<w> / Concen_<w> columns that core_fe2 and core_fe3 read.
train = core_fe(train)
test = core_fe(test)

############################################################

def core_fe2(train):
    """
    Add the ratio of every ordered pair of distinct ``Concen_<w>`` columns
    (w = 650, 660, ..., 990) to ``train`` in place and return it.

    Each new column is named ``<a>_<b>_near_all_div`` and holds
    ``train[a] / train[b]``.
    """
    concen_cols = ['Concen_%s' % w for w in range(650, 1000, 10)]
    ordered_pairs = [(a, b) for a in concen_cols for b in concen_cols if a != b]

    for numer, denom in ordered_pairs:
        train['%s_%s_near_all_div' % (numer, denom)] = train[numer] / train[denom]

    return train

# Adds 35 * 34 pairwise ratio columns to each split.
train = core_fe2(train)
test = core_fe2(test)

############################################################

def core_fe3(train):
    """
    Add the difference of every ordered pair of distinct ``Concen_<w>`` columns
    (w = 650, 660, ..., 990) to ``train`` in place and return it.

    Each new column is named ``<a>_<b>_near_all_sub`` and holds
    ``train[a] - train[b]``.
    """
    concen_cols = ['Concen_%s' % w for w in range(650, 1000, 10)]
    ordered_pairs = [(a, b) for a in concen_cols for b in concen_cols if a != b]

    for minuend, subtrahend in ordered_pairs:
        train['%s_%s_near_all_sub' % (minuend, subtrahend)] = train[minuend] - train[subtrahend]

    return train

# Adds 35 * 34 pairwise difference columns to each split.
train = core_fe3(train)
test = core_fe3(test)
############################################################
def core_fe4(train, size=10):
    """
    Combine the binned ``Concen_<a>_to_<b>_mean`` columns of one bin width pairwise.

    The bins are the ones starting at 650 and advancing by ``size`` while the
    start is below ``1000 - size``. For every ordered pair of distinct bin
    columns (a, b) two columns are added to ``train`` in place:

    * ``temp_<a>_<b>``  = a / b
    * ``temp2_<a>_<b>`` = a - b

    ``train`` is returned.
    """
    bin_cols = ['Concen_%s_to_%s_mean' % (lo, lo + size) for lo in range(650, 1000 - size, size)]

    for first in bin_cols:
        for second in bin_cols:
            if first == second:
                continue
            train['temp_%s_%s' % (first, second)] = train[first] / train[second]
            train['temp2_%s_%s' % (first, second)] = train[first] - train[second]
    return train

# Bin widths combined pairwise; width 100 is binned by binning_fe but not listed here.
for i in [10,20,30,40,50]:
    train = core_fe4(train, size=i)
    test = core_fe4(test, size=i)

In [None]:
# Everything except the id and the four target columns is used as a model feature.
excluded_features=['id','hhb', 'hbo2', 'ca', 'na']
col = [x for x in train.columns if x not in excluded_features]

x_train = train[col]
# Label slice from 'hhb' to 'na' inclusive; assumes the targets are adjacent columns
# in that order -- TODO confirm against train.csv.
y_train = train.loc[:, 'hhb':'na']
test = test[col]

# The ratio features can produce +/-inf (division by zero); store those as NaN instead.
x_train=x_train.replace([np.inf, -np.inf], np.nan)
test=test.replace([np.inf, -np.inf], np.nan)

## 3. 모델 학습 및 검증
## Model Tuning & Evaluation

## permutation

In [None]:
from sklearn.model_selection import train_test_split
from eli5.permutation_importance import get_score_importances

# Independent copy of the feature matrix, passed to the permutation screening below.
X_train = x_train.copy()

In [None]:
def my_permutation_badfeatures(X_train, y_train, col):
    """
    Collect, over 5 CV folds, the features whose permutation barely changes the validation MAE.

    For each fold a LightGBM regressor is fitted on the training part for target
    ``y_train[col]``; eli5's ``get_score_importances`` is then run on the validation
    part with the local ``score`` function. Features whose first-iteration
    "score decrease" is greater than ``-threshold[0]`` are appended to the result.

    Returns a flat list of feature names; a name appears once per fold that flagged it.
    """
    y_train_temp = y_train[col].copy()
    threshold = [0.0001]
    bad_features1= []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
        
    # Scoring callback for eli5: mean absolute error of the current fold's model.
    # `reg` is resolved at call time, i.e. it is the model fitted in the running fold.
    def score(X, y):
        y_pred = reg.predict(X)
        return abs(y-y_pred).mean() 
    
    for n_fold, (trn_idx, val_idx) in enumerate(kf.split(X_train)):
        trn_x, trn_y = X_train.iloc[trn_idx], y_train_temp.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], y_train_temp.iloc[val_idx]
        
        # NOTE(review): `gamma` does not appear to be a LightGBM parameter -- confirm it is not silently ignored.
        reg= lgb.LGBMRegressor(boosting_type ='gbdt',n_estimators=20000,num_leaves=32, max_depth=-1, min_child_weight=5, 
                                 subsample=0.7, colsample_bytree =1, learning_rate=0.01, gamma = 0 , n_jobs=-1,
                            random_state=42,reg_alpha=0.1, reg_lambda=0.1)

        # NOTE(review): early_stopping_rounds / verbose as fit() keywords depend on the installed
        # LightGBM version -- verify before upgrading.
        reg.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y),(val_x, val_y)], 
                  early_stopping_rounds=50 ,verbose=-1, eval_metric='mae')
        
        
        
        # n_iter=1: a single shuffle per feature.
        base_score, score_decreases = get_score_importances(score,np.array(val_x), np.array(val_y), n_iter=1)
        
        # `score` is an error (lower is better), so presumably the "decrease" is negative when
        # shuffling a feature hurts; values above -threshold mean "no real harm" -- TODO confirm
        # the sign convention against the eli5 documentation.
        bad_features1.extend(list(val_x.columns[score_decreases[0] > -threshold[0]]))
                             
    return bad_features1
    

In [None]:
# Run the permutation screening once per target and store, for each target, how many
# CV folds flagged every feature (one "feature,count" row per feature).
os.makedirs('./bad_features', exist_ok=True)  # to_csv does not create missing directories
for col in y_train.columns:
    bad_features = my_permutation_badfeatures(X_train, y_train, col)
    # header=False: the cells that read these files pass names=['feature','count'] and
    # therefore expect no header row; a header line would be parsed as a data record.
    # pd.Series (instead of DataFrame(...)[0]) also copes with an empty list.
    pd.Series(bad_features, dtype=object).value_counts().to_csv("./bad_features/%s_bad_features.csv"%col, header=False)


## permutation 후 모델 학습

In [26]:
def permute_train_model(x_train, y_train, x_test, label):
    """
    Fit (or load) one LightGBM model per CV fold for a single target.

    Parameters
    ----------
    x_train : DataFrame of training features (rows are selected with ``.iloc``).
    y_train : Series with the target values, aligned row-for-row with ``x_train``.
    x_test : DataFrame of test features with the same columns as ``x_train``.
    label : target name, used in the model file name and in progress messages.

    Returns
    -------
    (models, oof, pred)
        ``models`` -- the fold models in fold order, whether trained or loaded;
        ``oof``    -- out-of-fold predictions for every training row;
        ``pred``   -- test predictions averaged over the folds.

    A fold whose file ``./models/<label>_<fold>fold_0.78230.pkl`` exists is loaded
    instead of trained.
    """
    models = []
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    n_splits = kf.get_n_splits()  # keeps the test-prediction average tied to the splitter
    oof = np.zeros(x_train.shape[0])
    pred = np.zeros(x_test.shape[0])

    for n_fold, (trn_idx, val_idx) in enumerate(kf.split(x_train)):
        trn_x, trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

        model_path = './models/%s_%sfold_0.78230.pkl' % (label, n_fold)
        if os.path.isfile(model_path):  # reuse a previously saved fold model
            print('%s model load..'%label)
            # joblib.load unpickles arbitrary objects: only load files you created yourself.
            model = joblib.load(model_path)
        else:  # no saved model for this fold: train one
            print('train %s '%label)
            # NOTE(review): LightGBM documents early stopping as unavailable in dart mode,
            # so early_stopping_rounds may have no effect here -- confirm.
            model = lgb.LGBMRegressor(boosting_type ='dart',n_estimators=50000,num_leaves=64, max_depth=-1, min_child_weight=5,
                                      subsample=0.7, colsample_bytree = 0.2, learning_rate=0.01, gamma = 0 , n_jobs=-1,
                                      random_state=42,reg_alpha=0.1, reg_lambda=0.1)

            model.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y),(val_x, val_y)],
                      early_stopping_rounds=50 ,verbose=50000, eval_metric='mae')

        # Record the fold model in both branches; previously loaded models were dropped
        # and the caller received an empty list.
        models.append(model)

        # out-of-fold predictions
        oof[val_idx] = model.predict(val_x)

        # test predictions, averaged over the folds
        pred += model.predict(x_test) / n_splits

    return models, oof, pred
        


In [27]:
# The bad_features files are published at ~~ (GitHub address; placeholder left by the author)
permute_models = {}
permute_oofs=[]
permute_preds=[]
for label in y_train.columns:
    # NOTE(review): this zero vector is overwritten by the permute_train_model result a few lines down.
    pred = np.zeros(test.shape[0])
    
    # names= supplies the column labels, so the file is read as having no header row.
    permute_features = pd.read_csv('./bad_features/%s_bad_features.csv'%label,  names=['feature','count'])   
    # Drop only the features whose recorded count is greater than 1.
    bad_features = permute_features[permute_features['count'] >1]['feature'].values 
    
    x_reduced_trn = x_train.drop(bad_features, axis=1).copy()
    x_reduced_test = test.drop(bad_features, axis=1).copy()

    ms, oof, pred =permute_train_model(x_reduced_trn, y_train[label], x_reduced_test, label)

    # Results are stored per target, in y_train.columns order.
    permute_models[label] = ms
    permute_oofs.append(oof)
    permute_preds.append(pred)
    

280
hhb model load
hhb model load
hhb model load
hhb model load
hhb model load
236
hbo2 model load
hbo2 model load
hbo2 model load
hbo2 model load
hbo2 model load
278
ca model load
ca model load
ca model load
ca model load
ca model load
162
na model load
na model load
na model load
na model load
na model load


In [28]:
# Per-target out-of-fold MAE and its unweighted mean over the four targets.
# Assumes permute_oofs is ordered like `labels` -- it was filled in y_train.columns order.
labels = ['hhb', 'hbo2', 'ca', 'na']

a=[]
for label,i in zip(labels,permute_oofs):
    a.append(mean_absolute_error(train[label], i))
print(a)
print('oof mae %.5f'% np.mean(a))

[0.49469949544364755, 0.3951968527521723, 1.2320512842905125, 1.007247908003161]
oof mae 0.78230


In [29]:
# Submission for the permutation-selected models.
# The positional indices assume permute_preds is ordered hhb, hbo2, ca, na -- TODO confirm
# against the y_train column order.
submission = pd.read_csv('./sample_submission.csv')
submission['hhb'] = permute_preds[0]
submission['hbo2'] = permute_preds[1]
submission['ca'] = permute_preds[2]
submission['na'] = permute_preds[3]
submission

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.240995,4.874584,9.376731,3.102264
1,10001,7.436999,2.485243,8.694831,2.330186
2,10002,9.766639,5.094046,9.731256,3.201371
3,10003,8.112722,4.391882,9.359735,4.386559
4,10004,6.892838,2.795613,9.154041,3.434072
...,...,...,...,...,...
9995,19995,3.650341,4.473906,12.157714,5.282470
9996,19996,10.640541,2.935168,8.345288,2.959774
9997,19997,6.784586,3.711216,9.008868,3.962321
9998,19998,12.838950,4.575061,8.045664,4.044267


In [59]:
# Written without the index column; this same file is read back later as `stack_sub`.
submission.to_csv('./submission/sub_0.78230.csv', index=False)
submission

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.385180,4.951731,9.617316,2.969566
1,10001,7.517709,2.465483,8.854714,2.409327
2,10002,9.706019,5.112048,9.844248,3.254742
3,10003,8.162751,4.326323,9.363771,4.632428
4,10004,6.549421,2.967990,8.919735,3.397089
...,...,...,...,...,...
9995,19995,3.572231,4.502444,11.946385,5.093107
9996,19996,10.622650,2.911157,8.060776,3.144995
9997,19997,6.858170,3.687651,8.885298,3.943446
9998,19998,12.892524,4.579880,8.060315,3.944395


In [43]:
def save_model(permute_models, cv):
    """
    Dump every fold model with joblib.

    ``permute_models`` maps a target label to its list of fold models; model
    number n of a label is written to ``./models/<label>_<n>fold_<cv>.pkl``.
    """
    for label, fold_models in permute_models.items():
        for fold_no, fitted in enumerate(fold_models):
            joblib.dump(fitted, './models/%s_%sfold_%s.pkl'%(label,fold_no,cv))

In [44]:
# The '0.78230' tag matches the file suffix that permute_train_model and predict_model look for.
save_model(permute_models, '0.78230')

## prob값을 feature로 사용하여 모델 재학습

In [34]:
def predict_model(x_train, y_train,label):
    """
    Rebuild the out-of-fold predictions of the saved first-stage models.

    The same 5-fold split (shuffle, random_state=42) as in training is recreated
    and, for each fold, ``./models/<label>_<fold>fold_0.78230.pkl`` is loaded and
    used to predict that fold's validation rows.

    Parameters
    ----------
    x_train : DataFrame of training features (rows are selected with ``.iloc``).
    y_train : accepted for call compatibility; not needed for prediction.
    label : target name used in the model file name.

    Returns
    -------
    numpy array with one out-of-fold prediction per row of ``x_train``.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(x_train.shape[0])

    # Only the validation rows are needed; the training slices and the never-returned
    # model list of the earlier version were dead work.
    for n_fold, (_, val_idx) in enumerate(kf.split(x_train)):
        # joblib.load unpickles arbitrary objects: only load files you created yourself.
        model = joblib.load('./models/%s_%sfold_0.78230.pkl'%(label,n_fold))
        oof[val_idx] = model.predict(x_train.iloc[val_idx])

    return oof

In [36]:
# Recompute the first-stage out-of-fold predictions per target from the saved models.
labels = ['hhb', 'hbo2', 'ca', 'na']
# NOTE(review): `threshold` is not used anywhere in this cell.
threshold = [0.001, 0.0005, 0.0001]
oofs=[]
for label in labels:
    # Same feature filter as at training time: drop features with a recorded count above 1.
    permute_features = pd.read_csv('./bad_features/%s_bad_features.csv'%label,  names=['feature','count'])   
    bad_features = permute_features[permute_features['count'] >1]['feature'].values     
    
    x_reduced_trn = x_train.drop(bad_features, axis=1).copy()
    # NOTE(review): x_reduced_test is built but not used in this cell.
    x_reduced_test = test.drop(bad_features, axis=1).copy()
    y_train2 = y_train[label].copy()
    
    oof=predict_model(x_reduced_trn, y_train2, label)
    oofs.append(oof)

In [38]:
# Mean of the per-target MAE of the rebuilt out-of-fold predictions.
a=[]
for label,i in zip(labels,oofs):
    a.append(mean_absolute_error(train[label], i))
np.mean(a)

0.7822988851223733

In [39]:
def prob_features(x_train, oofs, data='train'):
    """
    Add first-stage predictions and their pairwise ratios as stacking features.

    Parameters
    ----------
    x_train : DataFrame that receives the new columns (modified in place).
    oofs : predictions for the four targets.
        With ``data='train'`` it is indexed by position in the order
        hhb, hbo2, ca, na; with ``data='test'`` it is indexed by those names
        (for example a DataFrame read from a submission file).
    data : ``'train'`` or ``'test'``; selects how ``oofs`` is indexed.

    Returns
    -------
    ``x_train`` with ``<label>_prob`` columns followed by the six ratio columns
    ``hhb/hbo2``, ``hhb/ca``, ``hhb/na``, ``hbo2/ca``, ``hbo2/na``, ``ca/na``.

    Raises
    ------
    ValueError
        If ``data`` is neither ``'train'`` nor ``'test'``.
    """
    labels = ['hhb', 'hbo2', 'ca', 'na']

    if data == 'train':
        preds = {label: oofs[pos] for pos, label in enumerate(labels)}
    elif data == 'test':
        preds = {label: oofs[label] for label in labels}
    else:
        # Fail with a clear message instead of an unbound-variable error further down.
        raise ValueError("data must be 'train' or 'test', got %r" % (data,))

    for label in labels:
        x_train['%s_prob' % label] = preds[label]

    # Ratios of every unordered pair, in the order the labels are listed.
    for pos, numer in enumerate(labels):
        for denom in labels[pos + 1:]:
            x_train['%s/%s' % (numer, denom)] = preds[numer] / preds[denom]

    return x_train

In [40]:
# train: stack the out-of-fold predictions (list ordered like `labels`)
x_train = prob_features(x_train, oofs, data='train')

# test: stack the first-stage test predictions stored in the earlier submission file
stack_sub = pd.read_csv('./submission/sub_0.78230.csv')
test = prob_features(test, stack_sub, data='test')

In [50]:
def retrain_model(x_train, y_train, x_test, seed, label, cv):
    """
    Fit (or load) one second-stage LightGBM model per CV fold for a single target.

    Parameters
    ----------
    x_train : DataFrame of training features (rows are selected with ``.iloc``).
    y_train : Series with the target values, aligned row-for-row with ``x_train``.
    x_test : DataFrame of test features with the same columns as ``x_train``.
    seed : random_state of the 5-fold split; also part of the model file name.
    label : target name, used in the model file name and in progress messages.
    cv : tag formatted into the model file name.

    Returns
    -------
    (models, oof, pred)
        ``models`` -- the fold models in fold order, whether trained or loaded;
        ``oof``    -- out-of-fold predictions for every training row;
        ``pred``   -- test predictions averaged over the folds.

    A fold whose file ``./models/<label>_<fold>fold_<cv>_seed<seed>.pkl`` exists
    is loaded instead of trained.
    """
    models = []
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    n_splits = kf.get_n_splits()  # keeps the test-prediction average tied to the splitter
    oof = np.zeros(x_train.shape[0])
    pred = np.zeros(x_test.shape[0])

    for n_fold, (trn_idx, val_idx) in enumerate(kf.split(x_train)):
        trn_x, trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
        val_x, val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]

        model_path = './models/%s_%sfold_%s_seed%s.pkl' % (label, n_fold, cv, seed)
        if os.path.isfile(model_path):  # reuse a previously saved fold model
            print('%s model load..'%label)
            # joblib.load unpickles arbitrary objects: only load files you created yourself.
            model = joblib.load(model_path)
        else:  # no saved model for this fold: train one
            print('train %s '%label)
            # NOTE(review): random_state stays 42 for every `seed`, so only the fold split
            # varies between seeds -- confirm this is intended.
            # NOTE(review): LightGBM documents early stopping as unavailable in dart mode.
            model = lgb.LGBMRegressor(boosting_type ='dart', n_estimators=20000,num_leaves=64, max_depth=-1,
                                      min_child_weight=5, subsample=0.7, colsample_bytree = 0.2, learning_rate=0.01, gamma = 0 , n_jobs=-1,
                                      random_state=42,reg_alpha=0.1, reg_lambda=0.1)

            model.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y),(val_x, val_y)],
                      early_stopping_rounds=50 ,verbose=20000, eval_metric='mae')

        # Record the fold model in both branches; previously loaded models were dropped
        # and the caller received an empty list.
        models.append(model)

        # out-of-fold predictions
        oof[val_idx] = model.predict(val_x)

        # test predictions, averaged over the folds
        pred += model.predict(x_test) / n_splits

    return models, oof, pred

In [51]:
def train_model(seed, cv):
    """
    Run the second-stage training for all four targets with one seed.

    For each target the features listed with a count above 1 in
    ``./bad_features/<label>_bad_features.csv`` are dropped from the module-level
    ``x_train`` / ``test`` frames before ``retrain_model`` is called.

    Returns ``(models, oofs, preds)``: a dict of fold models keyed by target, and
    two lists (out-of-fold and test predictions) in the order hhb, hbo2, ca, na.
    """
    fold_models_by_label = {}
    oof_list = []
    pred_list = []

    for label in ['hhb', 'hbo2', 'ca', 'na']:
        print('train column : ', label)
        counts = pd.read_csv('./bad_features/%s_bad_features.csv'%label,  names=['feature','count'])
        bad_features = counts.loc[counts['count'] > 1, 'feature'].values

        x_reduced_trn = x_train.drop(bad_features, axis=1).copy()
        x_reduced_test = test.drop(bad_features, axis=1).copy()

        print(x_reduced_trn.shape[1])
        fold_models, oof, pred = retrain_model(x_reduced_trn, y_train[label], x_reduced_test, seed, label, cv)

        fold_models_by_label[label] = fold_models
        oof_list.append(oof)
        pred_list.append(pred)

    return fold_models_by_label, oof_list, pred_list

In [52]:
# One (seed, cv tag) pair per run; both are formatted into the model file names that
# retrain_model looks up.
# NOTE(review): the cv tags are floats here but strings in the later save_model calls;
# '%s' renders both the same only as long as no tag ends in a trailing zero.
seed=[42,92,2020] ; cv = [0.79025, 0.77943, 0.78182]
models_re1, oofs_re1, preds_re1 = train_model(seed[0], cv[0])
models_re2, oofs_re2, preds_re2 = train_model(seed[1], cv[1])
models_re3, oofs_re3, preds_re3 = train_model(seed[2], cv[2])

train column :  hhb
290
hhb model load..
hhb model load..
hhb model load..
hhb model load..
hhb model load..
train column :  hbo2
246
hbo2 model load..
hbo2 model load..
hbo2 model load..
hbo2 model load..
hbo2 model load..
train column :  ca
288
ca model load..
ca model load..
ca model load..
ca model load..
ca model load..
train column :  na
172
na model load..
na model load..
na model load..
na model load..
na model load..
train column :  hhb
290
hhb model load..
hhb model load..
hhb model load..
hhb model load..
hhb model load..
train column :  hbo2
246
hbo2 model load..
hbo2 model load..
hbo2 model load..
hbo2 model load..
hbo2 model load..
train column :  ca
288
ca model load..
ca model load..
ca model load..
ca model load..
ca model load..
train column :  na
172
na model load..
na model load..
na model load..
na model load..
na model load..
train column :  hhb
290
hhb model load..
hhb model load..
hhb model load..
hhb model load..
hhb model load..
train column :  hbo2
246
hbo2 m

In [53]:
# Per-target out-of-fold MAE of the seed-42 run and its mean.
labels = ['hhb', 'hbo2', 'ca', 'na']
a=[]
for label,i in zip(labels,oofs_re1):
    a.append(mean_absolute_error(train[label], i))
print(a)
print('42 seed oof mae %.5f'% np.mean(a))

[0.49621931148554294, 0.4027651102521572, 1.251376674299484, 1.01063331532915]
42 seed oof mae 0.79025


In [54]:
# Per-target out-of-fold MAE of the seed-92 run and its mean.
a=[]
for label,i in zip(labels,oofs_re2):
    a.append(mean_absolute_error(train[label], i))
print(a)
print('92 seed oof mae %.5f'% np.mean(a))


[0.48738207291747193, 0.3963998804157671, 1.2345040134839655, 0.9994471082980501]
92 seed oof mae 0.77943


In [55]:
# Per-target out-of-fold MAE of the seed-2020 run and its mean.
a=[]
for label,i in zip(labels,oofs_re3):
    a.append(mean_absolute_error(train[label], i))
print(a)
print('2020 seed oof mae %.5f'% np.mean(a))

[0.48610003505209726, 0.39494450071581094, 1.2415459878944304, 1.0046876782622796]
2020 seed oof mae 0.78182


In [56]:
# Final out-of-fold performance: average the three seeds' OOF predictions per target.
final_oof=[]
for i in range(4):
    temp_oof=np.zeros(train.shape[0])
    temp_oof+= (oofs_re1[i]+oofs_re2[i]+oofs_re3[i])/3.0
    final_oof.append(temp_oof)
# MAE of the seed-averaged predictions, per target and averaged over targets
a=[]
for label,i in zip(labels,final_oof):
    a.append(mean_absolute_error(train[label], i))
print(a)
print('oof mae %.5f'% np.mean(a))

[0.4848126801392114, 0.3951887481523925, 1.2348176409768281, 1.000027085577363]
oof mae 0.77871


In [57]:
# submission: per-target mean of the three seeds' test predictions
# (list positions 0..3 correspond to hhb, hbo2, ca, na, the order train_model appends them in)
submission_gwang = pd.read_csv('./sample_submission.csv')
submission_gwang['hhb'] = (preds_re1[0] +preds_re2[0] +preds_re3[0]) /3
submission_gwang['hbo2'] = (preds_re1[1] +preds_re2[1] +preds_re3[1]) /3
submission_gwang['ca'] = (preds_re1[2] +preds_re2[2] +preds_re3[2]) /3
submission_gwang['na'] = (preds_re1[3] +preds_re2[3] +preds_re3[3]) /3

submission_gwang

Unnamed: 0,id,hhb,hbo2,ca,na
0,10000,8.139906,5.050496,9.591441,2.897216
1,10001,7.832381,2.404754,8.636410,2.364584
2,10002,9.797005,5.144637,9.655771,3.127631
3,10003,8.033548,4.509453,8.850021,4.441217
4,10004,7.119349,2.595708,9.248665,3.407534
...,...,...,...,...,...
9995,19995,3.628231,4.391384,12.157612,5.364728
9996,19996,10.684080,2.923970,8.306135,3.141520
9997,19997,6.668286,3.735521,9.247005,4.048194
9998,19998,12.813112,4.595704,7.960818,4.061691


In [112]:
# Write the seed-averaged submission without the index column.
submission_gwang.to_csv('./submission/0.77871_gwang_final.csv', index=False)

In [114]:
"""모델 저장"""

def save_model(permute_models, cv, seed=42):
    """
    Dump every fold model with joblib, tagging the file with the CV score and seed.

    ``permute_models`` maps a target label to its list of fold models; model
    number n of a label is written to
    ``./models/<label>_<n>fold_<cv>_seed<seed>.pkl``.
    """
    for label, fold_models in permute_models.items():
        for fold_no, fitted in enumerate(fold_models):
            joblib.dump(fitted, './models/%s_%sfold_%s_seed%s.pkl'%(label,fold_no, cv, seed))

In [115]:
# Tag/seed pairs follow the file-name pattern retrain_model checks before training.
save_model(models_re1, '0.79025', seed=42)
save_model(models_re2, '0.77943', seed=92)
save_model(models_re3, '0.78182', seed=2020)

# 기찬

## 1. 라이브러리 및 데이터
## Library & Data

In [6]:
import pandas as pd
import numpy as np
import pickle, os
import joblib
from tqdm import trange, tqdm, tqdm_notebook

# DATA SPLIT
from sklearn.model_selection import KFold, StratifiedKFold

# EVALUATE
from sklearn.metrics import log_loss
from sklearn.metrics import mean_absolute_error

# MODEL
import lightgbm as lgbm
from sklearn.ensemble import RandomForestRegressor
from catboost import CatBoostRegressor
import xgboost as xgb
# ELSE
import matplotlib.pyplot as plt
import warnings
import gc
warnings.filterwarnings('ignore')
%matplotlib inline


In [7]:
# The first CSV column becomes the index of both frames.
train = pd.read_csv("./train.csv", index_col = 0)
test = pd.read_csv("./test.csv", index_col=0)

# Stack train on top of test. DataFrame.append was removed in pandas 2.0;
# pd.concat([...]) is the equivalent call and keeps the original index labels.
df = pd.concat([train, test])
df = df[train.columns]
submission = pd.read_csv('./sample_submission.csv')

## 2. 변수 선택 및 모델 구축
## Feature Engineering & Initial Modeling

In [8]:
# Column labels whose name contains 'src' / 'dst'; read as module-level globals by
# fe3 and window_rollinng_first.
src_col=train.columns[train.columns.str.contains('src')]
dst_col = train.columns[train.columns.str.contains('dst')]

def fe3(train):
    """
    Add whole-spectrum summary features to ``train`` in place and return it.

    Row-wise mean and standard deviation over the module-level ``src_col`` /
    ``dst_col`` selections are stored as ``src_<stat>`` / ``dst_<stat>``; the
    dst/src ratio of the means goes to ``div_feature_src2`` and the ratio of the
    standard deviations to ``div_feature_dst2``.
    """
    for stat, ratio_name in (('mean', 'div_feature_src2'), ('std', 'div_feature_dst2')):
        src_name = 'src_%s' % stat
        dst_name = 'dst_%s' % stat
        train[src_name] = getattr(train[src_col], stat)(1)
        train[dst_name] = getattr(train[dst_col], stat)(1)
        train[ratio_name] = train[dst_name] / train[src_name]

    return train

train = fe3(train)
test = fe3(test)

############################################################

def fe_temp(train, src_col, dst_col, size=50):
    """
    Add binned transmittance features to ``train`` in place and return it.

    Bins start at 650 and advance by ``size`` while the start is below
    ``1000 - size``. Each bin averages the columns in the label slice
    ``<start>_src`` .. ``<start+size>_src`` (both ends included) and likewise for
    dst, then adds:

    * ``<a>_to_<b>_mean_src_dst_div``     = mean(dst) / mean(src)
    * ``<a>_to_<b>_mean_src_dst_div_rho`` = log(mean(dst) / mean(src)) / rho

    ``src_col`` and ``dst_col`` are accepted for call compatibility but unused.
    """
    for start in range(650, 1000 - size, size):
        stop = start + size
        src_avg = train.loc[:, '{}_src'.format(start):'{}_src'.format(stop)].mean(axis=1)
        dst_avg = train.loc[:, '{}_dst'.format(start):'{}_dst'.format(stop)].mean(axis=1)
        ratio = dst_avg / src_avg
        train['{}_to_{}_mean_src_dst_div'.format(start, stop)] = ratio
        train['{}_to_{}_mean_src_dst_div_rho'.format(start, stop)] = np.log(ratio) / train['rho']

    return train

# Build the binned features for each listed bin width, on train and test alike.
for i in [10,30,50,100]:
    train = fe_temp(train, src_col, dst_col, size=i)
    test = fe_temp(test, src_col, dst_col, size=i)
############################################################
# Wavelengths 650, 660, ..., 990; read as a module-level global by the function below.
temp = [x for x in range(650,1000,10)]

def window_rollinng_first(train):
    """
    Add rolling-window concentration features to ``train`` in place and return it.

    For window sizes 3 and 5, the src and dst spectra (module-level ``src_col`` /
    ``dst_col`` selections) are smoothed with a trailing rolling mean across the
    columns, using ``min_periods=1``. For every wavelength w in the module-level
    ``temp`` list the column ``<w>_rolling_win_<k>`` =
    log(dst_smooth / src_smooth) / rho is added.
    """
    for k in [3, 5]:  # window size
        # DataFrame.rolling(axis=1) is deprecated since pandas 2.1, so roll over the
        # transposed frame (wavelengths become rows) and transpose back.
        temp1 = train[src_col].T.rolling(window=k, min_periods=1).mean().T
        temp2 = train[dst_col].T.rolling(window=k, min_periods=1).mean().T
        for i in temp:
            train['%s_rolling_win_%s'%(i, k)] = np.log(temp2['%s_dst'%i]/temp1['%s_src'%i])/train['rho']

    return train

train = window_rollinng_first(train)
test = window_rollinng_first(test)

############################################################
# Wavelengths 650, 660, ..., 990; read as a module-level global by the function below.
temp = list(range(650, 1000, 10))

def window_rolling_div(train):
    """
    Add, for every wavelength w in the module-level ``temp`` list, the ratio of
    the window-3 rolling feature to the window-5 rolling feature
    (``<w>_rolling_win3/win5``) to ``train`` in place, then return ``train``.
    """
    for w in temp:
        numerator = train['%s_rolling_win_3' % w]
        denominator = train['%s_rolling_win_5' % w]
        train['%s_rolling_win3/win5' % w] = numerator / denominator

    return train

# Requires the <w>_rolling_win_3 / <w>_rolling_win_5 columns on both splits.
train = window_rolling_div(train)
test = window_rolling_div(test)

############################################################
def window_rolling_div_near(train, near):
    """
    Add ratios between rolling features of wavelengths that lie ``near`` apart.

    For every start wavelength w in 650, 660, ... below ``1000 - near`` and for
    window sizes 3 and 5, the column ``near_<w>_<w+near>_win_<k>`` is set to
    ``<w>_rolling_win_<k> / <w+near>_rolling_win_<k>``. ``train`` is modified in
    place and returned.
    """
    for lo in range(650, 1000 - near, 10):
        hi = lo + near
        for win in (3, 5):
            ratio = train['%s_rolling_win_%s' % (lo, win)] / train['%s_rolling_win_%s' % (hi, win)]
            train['near_%s_%s_win_%s' % (lo, hi, win)] = ratio
    return train

# near takes the values 10 and 20 (range(10, 30, 10)).
for i in range(10, 30, 10):
    train = window_rolling_div_near(train,i)
    test = window_rolling_div_near(test, i)
############################################################
# Wavelengths 650, 660, ..., 990; read as a module-level global by the function below.
temp = list(range(650, 1000, 10))

def temp_fe(train):
    """
    Add per-wavelength ratio features to ``train`` in place and return it.

    For every wavelength w in the module-level ``temp`` list:

    * ``<w>_dd``  = <w>_dst / <w>_src
    * ``<w>_dd2`` = log(<w>_dst / <w>_src) / rho
    """
    for w in temp:
        ratio = train['%s_dst' % w] / train['%s_src' % w]
        train['%s_dd' % w] = ratio
        train['%s_dd2' % w] = np.log(ratio) / train['rho']
    return train

# Creates the <w>_dd / <w>_dd2 columns that near_all reads.
train = temp_fe(train)
test = temp_fe(test)

############################################################
def near_all(train):
    """
    Add the ratio of every ordered pair of distinct ``<w>_dd2`` columns
    (w = 650, 660, ..., 990) to ``train`` in place and return it.

    Each new column is named ``<a>_<b>_near_all_dd2`` and holds
    ``train[a] / train[b]``.
    """
    dd2_cols = ['%s_dd2' % w for w in range(650, 1000, 10)]
    ordered_pairs = [(a, b) for a in dd2_cols for b in dd2_cols if a != b]

    for numer, denom in ordered_pairs:
        train['%s_%s_near_all_dd2' % (numer, denom)] = train[numer] / train[denom]

    return train

# Uses the ratio version of near_all defined directly above (adds the *_near_all_dd2 columns).
train = near_all(train)
test = near_all(test)

############################################################
def near_all(train):
    """
    Add the difference of every ordered pair of distinct ``<w>_dd2`` columns
    (w = 650, 660, ..., 990) to ``train`` in place and return it.

    Each new column is named ``<a>_<b>_near_all_dd3`` and holds
    ``train[a] - train[b]``. This definition reuses the name ``near_all`` and so
    replaces the ratio version from the point where it is executed.
    """
    dd2_cols = ['%s_dd2' % w for w in range(650, 1000, 10)]
    ordered_pairs = [(a, b) for a in dd2_cols for b in dd2_cols if a != b]

    for minuend, subtrahend in ordered_pairs:
        train['%s_%s_near_all_dd3' % (minuend, subtrahend)] = train[minuend] - train[subtrahend]

    return train

#train = near_all(train)
#test = near_all(test)
############################################################

def exp1(train, size=10):
    """
    Add pairwise ratios of the binned ``<a>_to_<b>_mean_src_dst_div_rho`` columns.

    The bins are the ones starting at 650 and advancing by ``size`` while the
    start is below ``1000 - size``. For every ordered pair of distinct bin
    columns (a, b) one column holding ``a / b`` is added to ``train`` in place;
    its name is the existing prefix followed by ``<a>_<b>``. ``train`` is returned.
    """
    bin_cols = ['%s_to_%s_mean_src_dst_div_rho' % (lo, lo + size) for lo in range(650, 1000 - size, size)]

    for first in bin_cols:
        for second in bin_cols:
            if first == second:
                continue
            train['fucking_%s_%s' % (first, second)] = train[first] / train[second]
    return train

# Pairwise bin ratios for each bin width that fe_temp produced (10, 30, 50, 100).
train = exp1(train, size=10)
test = exp1(test, size=10)

train = exp1(train, size=30)
test = exp1(test, size=30)
train = exp1(train, size=50)
test = exp1(test, size=50)
train = exp1(train, size=100)
test = exp1(test, size=100)


In [9]:
# The ratio features can produce +/-inf (division by zero); store those as NaN instead.
test=test.replace([np.inf, -np.inf], np.nan)
train=train.replace([np.inf, -np.inf], np.nan)

# First feature set: train stacked on top of test. DataFrame.append was removed in
# pandas 2.0; pd.concat([...]) is the equivalent call and keeps the index labels.
df1 = pd.concat([train, test])

## Feature Engineering & Initial Modeling 2

In [10]:
# Reload the raw files for the second feature set; the first CSV column becomes the index.
train = pd.read_csv("./train.csv", index_col = 0)
test = pd.read_csv("./test.csv", index_col=0)

# Stack train on top of test. DataFrame.append was removed in pandas 2.0;
# pd.concat([...]) is the equivalent call and keeps the original index labels.
df = pd.concat([train, test])
df = df[train.columns]
submission = pd.read_csv('./sample_submission.csv')

In [11]:
# Select the columns whose name has 'dst' / 'src' starting at character position 4
# (names of the form '650_dst', '650_src').
kind = ['dst','src']
X_dst = df.iloc[:,np.where(df.columns.str.find(kind[0]) == 4)[0]].copy()
X_src = df.iloc[:,np.where(df.columns.str.find(kind[1]) == 4)[0]].copy()

In [12]:
def get_absor_rho(df, X_dst, X_src, window):
    """
    Build rho-normalised log-ratio features from rolling-averaged spectra.

    Both spectra are smoothed per row with a trailing rolling mean of width
    ``window`` across the columns (``min_periods=1``). Columns are paired by
    position and each feature is ``log(dst_smooth / src_smooth) / rho``.

    Parameters
    ----------
    df : DataFrame with a ``rho`` column; rows correspond to those of ``X_dst``.
    X_dst, X_src : DataFrames with the same shape, one column per wavelength.
    window : rolling window width.

    Returns
    -------
    DataFrame indexed like ``X_dst`` with one column
    ``<dst>/<src>_rho_<window>`` per wavelength, plus ``rolling_mean_<window>``
    and ``rolling_std_<window>``, the row-wise mean and standard deviation of
    those wavelength columns. Infinite values are replaced by NaN beforehand.
    """
    # Roll along the wavelength axis: transpose so wavelengths become rows, roll,
    # transpose back. This replaces a Python loop over every row.
    dst_smooth = X_dst.T.rolling(window=window, min_periods=1).mean().T
    src_smooth = X_src.T.rolling(window=window, min_periods=1).mean().T

    # Take rho for exactly the rows of X_dst instead of using a positional counter
    # as an index label, which only worked for an index of 0..n-1.
    if df.index.equals(X_dst.index):
        rho = df['rho'].to_numpy(dtype=float)
    else:
        rho = df['rho'].reindex(X_dst.index).to_numpy(dtype=float)

    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = dst_smooth.to_numpy(dtype=float) / src_smooth.to_numpy(dtype=float)
        values = np.log(ratio) / rho[:, None]

    absor_rolling = pd.DataFrame(
        values,
        index=X_dst.index,
        columns=X_dst.columns + "/" + X_src.columns + "_rho_" + str(window),
    )
    absor_rolling = absor_rolling.replace([np.inf, -np.inf], np.nan)

    # Compute both summaries before adding either one, so the standard deviation
    # covers the wavelength columns only and not the freshly added mean column.
    row_mean = absor_rolling.mean(axis=1)
    row_std = absor_rolling.std(axis=1)
    absor_rolling["rolling_mean_" + str(window)] = row_mean
    absor_rolling["rolling_std_" + str(window)] = row_std

    return absor_rolling

In [13]:
def get_absor(df, X_dst, X_src, window):
    """
    Build dst/src ratio features from rolling-averaged spectra.

    Both spectra are smoothed per row with a trailing rolling mean of width
    ``window`` across the columns (``min_periods=1``). Columns are paired by
    position and each feature is ``dst_smooth / src_smooth``.

    Parameters
    ----------
    df : accepted for call compatibility; not used by this function.
    X_dst, X_src : DataFrames with the same shape, one column per wavelength.
    window : rolling window width.

    Returns
    -------
    DataFrame indexed like ``X_dst`` with one column ``<dst>/<src>_<window>``
    per wavelength, plus ``rolling_mean_<window>`` and ``rolling_std_<window>``,
    the row-wise mean and standard deviation of those wavelength columns.
    Infinite values are replaced by NaN beforehand.
    """
    # Roll along the wavelength axis: transpose so wavelengths become rows, roll,
    # transpose back. This replaces a Python loop over every row.
    dst_smooth = X_dst.T.rolling(window=window, min_periods=1).mean().T
    src_smooth = X_src.T.rolling(window=window, min_periods=1).mean().T

    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = dst_smooth.to_numpy(dtype=float) / src_smooth.to_numpy(dtype=float)

    absor_rolling = pd.DataFrame(
        ratio,
        index=X_dst.index,
        columns=X_dst.columns + "/" + X_src.columns + "_" + str(window),
    )
    absor_rolling = absor_rolling.replace([np.inf, -np.inf], np.nan)

    # Compute both summaries before adding either one, so the standard deviation
    # covers the wavelength columns only and not the freshly added mean column.
    row_mean = absor_rolling.mean(axis=1)
    row_std = absor_rolling.std(axis=1)
    absor_rolling["rolling_mean_" + str(window)] = row_mean
    absor_rolling["rolling_std_" + str(window)] = row_std

    return absor_rolling

In [14]:
def divide_each_col(df, stride):
    """
    Return the ratios of columns that lie ``stride`` positions apart.

    For each column at position k that has a partner at position k + stride, a
    column named ``<col_k>/<col_k+stride>`` holding their quotient is produced.
    ``df`` itself is left unchanged; only the new columns are returned.
    """
    base_names = list(df.columns)
    work = df.copy()
    for left, right in zip(base_names, base_names[stride:]):
        work[left + "/" + right] = work[left] / work[right]

    return work.iloc[:, len(base_names):]

In [15]:
def subtract_each_col(df, stride):
    """
    Return the differences of columns that lie ``stride`` positions apart.

    For each column at position k that has a partner at position k + stride, a
    column named ``<col_k>-<col_k+stride>`` holding ``col_k - col_k+stride`` is
    produced. ``df`` itself is left unchanged; only the new columns are returned.
    """
    base_names = list(df.columns)
    work = df.copy()
    for left, right in zip(base_names, base_names[stride:]):
        work[left + "-" + right] = work[left] - work[right]

    return work.iloc[:, len(base_names):]

In [16]:
def divide_df(df1, df2):
    """
    Divide two frames column-by-column, pairing columns by position.

    Both inputs are copied and their columns renumbered 0..n-1 so that the
    division matches columns by position rather than by name (rows are still
    matched on the index). Result column k is named ``<df1 col k>/<df2 col k>``.
    """
    numer = df1.copy()
    denom = df2.copy()
    numer.columns = range(numer.shape[1])
    denom.columns = range(denom.shape[1])

    quotient = numer.div(denom)
    quotient.columns = df1.columns + "/" + df2.columns
    return quotient

In [17]:
# Plain dst/src ratio features for rolling windows 1, 3, 5 and 10.
absor_rolling1 = get_absor(df, X_dst, X_src, 1)
absor_rolling2 = get_absor(df, X_dst, X_src, 3)
absor_rolling3 = get_absor(df, X_dst, X_src, 5)
absor_rolling4 = get_absor(df, X_dst, X_src, 10)

# log(dst/src)/rho features for the same four windows.
absor_rho_rolling1 = get_absor_rho(df, X_dst, X_src, 1)
absor_rho_rolling2 = get_absor_rho(df, X_dst, X_src, 3)
absor_rho_rolling3 = get_absor_rho(df, X_dst, X_src, 5)
absor_rho_rolling4 = get_absor_rho(df, X_dst, X_src, 10)

In [18]:
# Ratios between columns 1, 3, 5 and 10 positions apart, for each plain-ratio frame.
# In the names, "rollingN" is the source frame and "strideM" the M-th stride of (1, 3, 5, 10).
absor_rolling1_divide_stride1 = divide_each_col(absor_rolling1, 1)
absor_rolling1_divide_stride2 = divide_each_col(absor_rolling1, 3)
absor_rolling1_divide_stride3 = divide_each_col(absor_rolling1, 5)
absor_rolling1_divide_stride4 = divide_each_col(absor_rolling1, 10)

absor_rolling2_divide_stride1 = divide_each_col(absor_rolling2, 1)
absor_rolling2_divide_stride2 = divide_each_col(absor_rolling2, 3)
absor_rolling2_divide_stride3 = divide_each_col(absor_rolling2, 5)
absor_rolling2_divide_stride4 = divide_each_col(absor_rolling2, 10)

absor_rolling3_divide_stride1 = divide_each_col(absor_rolling3, 1)
absor_rolling3_divide_stride2 = divide_each_col(absor_rolling3, 3)
absor_rolling3_divide_stride3 = divide_each_col(absor_rolling3, 5)
absor_rolling3_divide_stride4 = divide_each_col(absor_rolling3, 10)

absor_rolling4_divide_stride1 = divide_each_col(absor_rolling4, 1)
absor_rolling4_divide_stride2 = divide_each_col(absor_rolling4, 3)
absor_rolling4_divide_stride3 = divide_each_col(absor_rolling4, 5)
absor_rolling4_divide_stride4 = divide_each_col(absor_rolling4, 10)




# Same stride ratios for the rho-normalised frames.



absor_rho_rolling1_divide_stride1 = divide_each_col(absor_rho_rolling1, 1)
absor_rho_rolling1_divide_stride2 = divide_each_col(absor_rho_rolling1, 3)
absor_rho_rolling1_divide_stride3 = divide_each_col(absor_rho_rolling1, 5)
absor_rho_rolling1_divide_stride4 = divide_each_col(absor_rho_rolling1, 10)

absor_rho_rolling2_divide_stride1 = divide_each_col(absor_rho_rolling2, 1)
absor_rho_rolling2_divide_stride2 = divide_each_col(absor_rho_rolling2, 3)
absor_rho_rolling2_divide_stride3 = divide_each_col(absor_rho_rolling2, 5)
absor_rho_rolling2_divide_stride4 = divide_each_col(absor_rho_rolling2, 10)

absor_rho_rolling3_divide_stride1 = divide_each_col(absor_rho_rolling3, 1)
absor_rho_rolling3_divide_stride2 = divide_each_col(absor_rho_rolling3, 3)
absor_rho_rolling3_divide_stride3 = divide_each_col(absor_rho_rolling3, 5)
absor_rho_rolling3_divide_stride4 = divide_each_col(absor_rho_rolling3, 10)

absor_rho_rolling4_divide_stride1 = divide_each_col(absor_rho_rolling4, 1)
absor_rho_rolling4_divide_stride2 = divide_each_col(absor_rho_rolling4, 3)
absor_rho_rolling4_divide_stride3 = divide_each_col(absor_rho_rolling4, 5)
absor_rho_rolling4_divide_stride4 = divide_each_col(absor_rho_rolling4, 10)

In [19]:
# Position-wise ratios between frames of consecutive window sizes (1/3, 3/5, 5/10),
# first for the rho-normalised frames, then for the plain-ratio frames.
absor_rho_rolling_1_divide_rolling_2 = divide_df(absor_rho_rolling1, absor_rho_rolling2)
absor_rho_rolling_2_divide_rolling_3 = divide_df(absor_rho_rolling2, absor_rho_rolling3)
absor_rho_rolling_3_divide_rolling_4 = divide_df(absor_rho_rolling3, absor_rho_rolling4)

absor_rolling_1_divide_rolling_2 = divide_df(absor_rolling1, absor_rolling2)
absor_rolling_2_divide_rolling_3 = divide_df(absor_rolling2, absor_rolling3)
absor_rolling_3_divide_rolling_4 = divide_df(absor_rolling3, absor_rolling4)

In [20]:
# Attach the feature frames to df one by one, joining on 'id'.
# Presumably 'id' is the index name that came from index_col=0 -- TODO confirm.
# NOTE(review): the plain and the rho frames both carry rolling_mean_<w> / rolling_std_<w>
# columns, so merge will rename the clashing ones with _x / _y suffixes -- verify.
df = pd.merge(df, absor_rolling1, on='id')
df = pd.merge(df, absor_rolling2, on='id')
df = pd.merge(df, absor_rolling3, on='id')
df = pd.merge(df, absor_rolling4, on='id')

df = pd.merge(df, absor_rho_rolling1, on='id')
df = pd.merge(df, absor_rho_rolling2, on='id')
df = pd.merge(df, absor_rho_rolling3, on='id')
df = pd.merge(df, absor_rho_rolling4, on='id')

# Stride ratios of the rho frames only; the absor_rolling*_divide_stride* frames are
# not merged in this cell.
df = pd.merge(df, absor_rho_rolling1_divide_stride1, on='id')
df = pd.merge(df, absor_rho_rolling1_divide_stride2, on='id')
df = pd.merge(df, absor_rho_rolling1_divide_stride3, on='id')
df = pd.merge(df, absor_rho_rolling1_divide_stride4, on='id')

df = pd.merge(df, absor_rho_rolling2_divide_stride1, on='id')
df = pd.merge(df, absor_rho_rolling2_divide_stride2, on='id')
df = pd.merge(df, absor_rho_rolling2_divide_stride3, on='id')
df = pd.merge(df, absor_rho_rolling2_divide_stride4, on='id')

df = pd.merge(df, absor_rho_rolling3_divide_stride1, on='id')
df = pd.merge(df, absor_rho_rolling3_divide_stride2, on='id')
df = pd.merge(df, absor_rho_rolling3_divide_stride3, on='id')
df = pd.merge(df, absor_rho_rolling3_divide_stride4, on='id')

df = pd.merge(df, absor_rho_rolling4_divide_stride1, on='id')
df = pd.merge(df, absor_rho_rolling4_divide_stride2, on='id')
df = pd.merge(df, absor_rho_rolling4_divide_stride3, on='id')
df = pd.merge(df, absor_rho_rolling4_divide_stride4, on='id')

###


# Window-to-window ratios of the rho frames only; the absor_rolling_*_divide_rolling_*
# frames are not merged in this cell.

df = pd.merge(df, absor_rho_rolling_1_divide_rolling_2, on='id')
df = pd.merge(df, absor_rho_rolling_2_divide_rolling_3, on='id')
df = pd.merge(df, absor_rho_rolling_3_divide_rolling_4, on='id')

In [21]:
# The ratio-of-ratio features can produce +/-inf; store those as NaN.
df = df.replace(np.inf, np.nan)
df = df.replace(-np.inf, np.nan)

# Second feature set, kept as an independent copy.
df2 = df.copy()

## 3. 모델 학습 및 검증
## Model Tuning & Evaluation

In [22]:
def get_train_test(df):
    """Split a combined feature table into train/test features and targets.

    Rows are selected by index *label* with ``.loc`` (both bounds inclusive):
    labels 0..9999 form the training part and labels 10000..20000 the test
    part. The four target columns are removed from the feature frames and
    returned separately.

    Returns:
        ``(X_train, X_test, y_train, y_test)``, each an independent copy.
    """
    target_cols = ['hhb', 'hbo2', 'ca', 'na']
    train_rows = slice(0, 9999)
    test_rows = slice(10000, 20000)

    features = df.drop(target_cols, axis=1)
    targets = df[target_cols]

    X_train = features.loc[train_rows].copy()
    X_test = features.loc[test_rows].copy()
    y_train = targets.loc[train_rows].copy()
    y_test = targets.loc[test_rows].copy()

    return X_train, X_test, y_train, y_test

# Split both feature tables the same way; suffix 1 comes from df1 and suffix 2 from df2.
# NOTE(review): df1 is built outside this excerpt -- confirm it shares df2's row index.
X_train1, X_test1, y_train1, y_test1 = get_train_test(df1)
X_train2, X_test2, y_train2, y_test2 = get_train_test(df2)

In [24]:
from sklearn.model_selection import train_test_split
from eli5.permutation_importance import get_score_importances

Using TensorFlow backend.


In [25]:
# LightGBM settings for the permutation-importance runs. my_permutation_badfeatures
# reads this dict as the global `param` rather than taking it as an argument.
param = {'objective':'regression','n_estimators':10000, 'learning_rate':0.01,'random_state':42,
        'early_stopping_rounds':50,'colsample_bytree':0.5, 'metric':'l1'}  # metric 'l1' = MAE

# 5-fold shuffled split with a fixed seed so fold membership is reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=777)

def my_permutation_badfeatures(X_train, y_train, col, kf):
    """Collect, per CV fold, the features whose shuffling barely hurts validation MAE.

    For every fold of ``kf`` a LightGBM regressor is fitted on the training part
    (using the global ``param`` dict) and eli5's ``get_score_importances`` is run
    on the validation part with MAE as the score. Because the score is an error,
    an important feature yields a *negative* "score decrease" when permuted; a
    feature is flagged as bad when its mean decrease over the permutation
    iterations is greater than ``-threshold``.

    Args:
        X_train: feature frame.
        y_train: target frame; only column ``col`` is used.
        col: name of the target column to model.
        kf: splitter exposing ``split(X)``.

    Returns:
        A flat list of flagged column names, concatenated over folds. A name
        appears once for every fold in which it was flagged, so callers can
        count occurrences.

    NOTE(review): the notebook's first cell imports lightgbm as ``lgb``; this
    function relies on a global named ``lgbm`` -- confirm it is bound.
    """
    target = y_train[col].copy()
    # Shuffling a "bad" feature raises validation MAE by less than this amount.
    threshold = 0.001
    bad_features = []

    def score(X, y):
        # `reg` is resolved when score() is called, i.e. it is the model of the
        # fold currently being processed.
        y_pred = reg.predict(X)
        return abs(y - y_pred).mean()

    for trn_idx, val_idx in kf.split(X_train):
        trn_x, trn_y = X_train.iloc[trn_idx], target.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], target.iloc[val_idx]

        reg = lgbm.LGBMRegressor(**param)
        reg.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)],
                verbose=True, eval_metric='mae', early_stopping_rounds=50)

        _, score_decreases = get_score_importances(
            score, np.array(val_x), np.array(val_y), n_iter=2)

        # Average over all permutation iterations instead of looking only at the
        # first one; otherwise the second iteration is computed and thrown away.
        mean_decrease = np.mean(score_decreases, axis=0)
        bad_features.extend(list(val_x.columns[mean_decrease > -threshold]))

    return bad_features
    

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=777)

# For each feature table and each target, count in how many folds every feature
# was flagged as "bad" and store the counts as CSV (index = feature name).
# The files are written to the working directory because that is where the
# training cells load them from ("./bad_features_<tag>_<target>_1.csv");
# the previous "./bad_features/" sub-directory did not match any reader.
for tag, X_tr, y_tr in (("gwang", X_train1, y_train1), ("gichan", X_train2, y_train2)):
    for col in y_tr.columns:
        bad_features1 = my_permutation_badfeatures(X_tr, y_tr, col, kf)
        pd.DataFrame(bad_features1)[0].value_counts().to_csv(
            "./bad_features_" + tag + "_" + col + "_1.csv")

In [26]:
# Rebinds the global `param` (the dict my_permutation_badfeatures reads).
# NOTE(review): 'reg:squarederror', 'eval_metric', 'tree_method':'gpu_hist' and 'n_gpus'
# look like XGBoost-style settings, while the functions in this notebook build
# LGBMRegressor models; the visible training cells pass `param3` or a later `param`
# instead -- confirm whether this dict is still needed.
param = {'objective':'reg:squarederror','eval_metric':'mae','colsample_bytree':0.7,'learning_rate':0.01,
          'n_estimators':20000,'random_state':42, 'tree_method':'gpu_hist','n_gpus':1,'early_stopping_rounds':50}

def my_permutation_lgb_for_stacking(X_train, X_test, y_train, y_test, param, kf, col):
    """Cross-validated LightGBM fit for one target, returning stacking inputs.

    For every fold of ``kf`` a regressor is fitted on the training part and used
    to predict (a) the held-out part, giving out-of-fold predictions for the
    training rows, and (b) ``X_test``, whose predictions are averaged over folds.

    Args:
        X_train, X_test: feature frames with identical columns.
        y_train: target frame; column ``col`` is the regression target.
        y_test: frame whose index labels the test predictions.
        param: keyword arguments for ``LGBMRegressor``.
        kf: splitter exposing ``split(X)`` and ``get_n_splits()``.
        col: target column name.

    Returns:
        ``(test_pred, mean_valid_mae, oof_pred)`` where ``test_pred`` is a Series
        indexed like ``y_test``, ``oof_pred`` a Series indexed like ``y_train``
        (both named ``col``), and ``mean_valid_mae`` the mean of the per-fold
        validation MAEs.

    NOTE(review): the notebook's first cell imports lightgbm as ``lgb``; this
    function relies on a global named ``lgbm`` -- confirm it is bound.
    """
    # Average over the splitter's actual fold count instead of a hard-coded 5.
    n_splits = kf.get_n_splits()
    target = y_train[col]

    oof_pred = pd.Series(0.0, index=y_train.index, name=col)
    test_pred = pd.Series(0.0, index=y_test.index, name=col)
    fold_mae = []

    for trn_idx, val_idx in kf.split(X_train):
        trn_x, trn_y = X_train.iloc[trn_idx], target.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], target.iloc[val_idx]

        reg = lgbm.LGBMRegressor(**param)
        reg.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                early_stopping_rounds=50, verbose=100)

        test_pred += reg.predict(X_test) / n_splits

        # Predict the held-out fold once and reuse it for both the OOF column
        # and the fold score. val_idx holds positions, so write by position;
        # label-based .loc is only right when the index happens to be 0..n-1.
        val_pred = reg.predict(val_x)
        oof_pred.iloc[val_idx] = val_pred
        fold_mae.append(abs(val_pred - val_y).mean())

    return test_pred, np.array(fold_mae).mean(), oof_pred

In [35]:
# Output frames for the first-level models: test predictions (y_submit_lgb*) and
# out-of-fold training predictions (y_val_pred*, zero-initialised).
y_submit_lgb1 = y_test1.copy()
y_val_pred1 = y_train1.copy()
y_val_pred1.iloc[:, :] = 0

y_submit_lgb2 = y_test2.copy()
y_val_pred2 = y_train2.copy()
y_val_pred2.iloc[:, :] = 0

# Seed ensembling is planned next: random_state = 777, 777^2, 777^4.
param3 = {
    'objective': 'regression',
    'n_estimators': 10000,
    'learning_rate': 0.15,
    'random_state': 777,
    'early_stopping_rounds': 50,
    'colsample_bytree': 0.2,
    'metric': 'l1',
    'boosting': 'dart',
}

kf = KFold(n_splits=5, shuffle=True, random_state=777)

# One entry per feature table: bad-feature file prefix, then the split frames and
# the two output frames filled in place. Run 1 uses df1's split, run 2 df2's.
_stacking_runs = [
    ("./bad_features_gwang_", X_train1, X_test1, y_train1, y_test1, y_submit_lgb1, y_val_pred1),
    ("./bad_features_gichan_", X_train2, X_test2, y_train2, y_test2, y_submit_lgb2, y_val_pred2),
]

for _prefix, _X_tr, _X_te, _y_tr, _y_te, _submit, _oof in _stacking_runs:
    # Scores are restarted for every run, so only the last run's remain afterwards.
    valid_score = []
    for col in ['hhb', 'hbo2', 'ca', 'na']:
        bad_features = pd.read_csv(_prefix + col + "_" + str(1) + ".csv", index_col=0).copy()
        # Drop only the features whose stored count equals 5.
        bad_features = bad_features[bad_features.iloc[:, 0] == 5].index
        List = list(bad_features)

        _submit[col], valid_mae, _oof[col] = my_permutation_lgb_for_stacking(
            _X_tr.drop(List, axis=1), _X_te.drop(List, axis=1),
            _y_tr, _y_te, param3, kf, col)
        valid_score.append(valid_mae)
        print(valid_mae)

2.106439119498514
0.7655641973038987
2.274564558091105
1.4641019687347696
2.1057222429480076
0.7683808929313594
2.288236002426401
1.4585054585418429


In [36]:
# Stack out-of-fold train predictions on top of the averaged test predictions so
# each frame covers train rows followed by test rows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
y_stacking1 = pd.concat([y_val_pred1, y_submit_lgb1])
y_stacking1.columns = y_stacking1.columns + "_stacking"

y_stacking2 = pd.concat([y_val_pred2, y_submit_lgb2])
y_stacking2.columns = y_stacking2.columns + "_stacking"

# Persist both frames (index included) so the final model can reload them.
y_stacking1.to_csv('./y_stacking1.csv')
y_stacking2.to_csv('./y_stacking2.csv')

# Permutation + Stacking + dart + final

In [38]:
def divide_all(y_stacking, *, with_ratios=False):
    """Return stacking columns, optionally extended with all pairwise ratios.

    Historically this function built every ratio ``col_i / col_j`` (i < j) and
    then returned only the first four columns, so the ratios never reached the
    caller. That default result is kept for backward compatibility, but the
    discarded work is no longer done, and the ratios can be requested explicitly.

    Args:
        y_stacking: frame of stacking predictions.
        with_ratios: when False (default) return a copy of the first four
            columns, exactly as before. When True return a copy of *all* input
            columns followed by one ``"<a>/<b>"`` column per column pair, in
            the order (0,1), (0,2), ..., (n-2,n-1).

    Returns:
        A new DataFrame; ``y_stacking`` itself is never modified.
    """
    if not with_ratios:
        return y_stacking.iloc[:, 0:4].copy()

    from itertools import combinations

    result = y_stacking.copy()
    for numer, denom in combinations(list(y_stacking.columns), 2):
        result[numer + "/" + denom] = y_stacking[numer] / y_stacking[denom]
    return result

In [39]:
def my_permutation_lgb(X_train, X_test, y_train, y_test, param, kf, col):
    """Cross-validated LightGBM fit for one target (final-stage model).

    For every fold of ``kf`` a regressor is fitted on the training part and used
    to predict the held-out part (out-of-fold predictions) and ``X_test``
    (averaged over folds).

    Args:
        X_train, X_test: feature frames with identical columns.
        y_train: target frame; column ``col`` is the regression target.
        y_test: frame whose index labels the test predictions.
        param: keyword arguments for ``LGBMRegressor``.
        kf: splitter exposing ``split(X)`` and ``get_n_splits()``.
        col: target column name.

    Returns:
        ``(test_pred, mean_valid_mae, oof_pred)`` where ``test_pred`` is a Series
        indexed like ``y_test``, ``oof_pred`` a Series indexed like ``y_train``
        (both named ``col``), and ``mean_valid_mae`` the mean of the per-fold
        validation MAEs.

    NOTE(review): the notebook's first cell imports lightgbm as ``lgb``; this
    function relies on a global named ``lgbm`` -- confirm it is bound.
    """
    # Average over the splitter's actual fold count instead of a hard-coded 5.
    n_splits = kf.get_n_splits()
    target = y_train[col]

    oof_pred = pd.Series(0.0, index=y_train.index, name=col)
    test_pred = pd.Series(0.0, index=y_test.index, name=col)
    fold_mae = []

    for trn_idx, val_idx in kf.split(X_train):
        trn_x, trn_y = X_train.iloc[trn_idx], target.iloc[trn_idx]
        val_x, val_y = X_train.iloc[val_idx], target.iloc[val_idx]

        reg = lgbm.LGBMRegressor(**param)
        reg.fit(trn_x, trn_y,
                eval_set=[(trn_x, trn_y), (val_x, val_y)],
                early_stopping_rounds=50, verbose=100)

        test_pred += reg.predict(X_test) / n_splits

        # Predict the held-out fold once and reuse it for both the OOF column
        # and the fold score. val_idx holds positions, so write by position;
        # label-based .loc is only right when the index happens to be 0..n-1.
        val_pred = reg.predict(val_x)
        oof_pred.iloc[val_idx] = val_pred
        fold_mae.append(abs(val_pred - val_y).mean())

    return test_pred, np.array(fold_mae).mean(), oof_pred

# df1 stacking + df2 stacking -> dart

In [30]:
# Remove the four target columns from df2 in place.
# Re-running this cell raises KeyError because the columns are already gone.
# NOTE(review): presumably done so only df1 contributes the targets when df1 and
# df2 are merged for the final model -- confirm.
df2.drop(['hhb','hbo2','ca','na'], axis=1, inplace=True)

In [40]:
# Reload the saved first-level predictions; the first CSV column becomes the index.
# NOTE(review): the later merges use on='id', so the restored index is presumably
# named 'id' -- confirm.
y_stacking1= pd.read_csv('./y_stacking1.csv', index_col=0)
y_stacking2 = pd.read_csv('./y_stacking2.csv', index_col=0)

#y_stacking1.index.name ='id'
#y_stacking2.index.name ='id'

In [44]:
#X_train, X_test, y_train_no, y_test = get_train_test(df_temp_final)
y_submit_lgb_final = y_test1.copy()
y_val_pred = y_train1.copy(); y_val_pred.iloc[:,:] = 0
kf = KFold(n_splits=5, shuffle=True, random_state=777)

#param = {'objective':'regression','n_estimators':10000, 'learning_rate':0.005, 'random_state':777,
#        'early_stopping_rounds':50,'colsample_bytree':0.7, 'metric':'l1','n_jobs':12}
param = {'objective':'regression','n_estimators':10000, 'learning_rate':0.15, 'random_state':777**2,
        'early_stopping_rounds':50,'colsample_bytree':0.2, 'metric':'l1','n_jobs':12,'boosting':'dart'} 

# seed : 777, 777*2, 777**2

valid_score = []
for col in ['hhb','hbo2','ca','na']:
    
    Y0 = y_stacking1
    Y1 = divide_all(y_stacking1)
    
    Y2 = y_stacking2
    Y3 = divide_all(y_stacking2)    
    
    bad_features1 = pd.read_csv("./bad_features_gwang_" + col + "_"+ str(1) + ".csv", index_col=0).copy()
    bad_features1 = bad_features1[bad_features1.iloc[:,0] == 5].index
    List1 = list(bad_features1)
    
    bad_features2 = pd.read_csv("./bad_features_gichan_" + col + "_"+ str(1) + ".csv", index_col=0).copy()
    bad_features2 = bad_features2[bad_features2.iloc[:,0] == 5].index
    List2 = list(bad_features2)
    
    df_temp = df1.drop(List1, axis=1)
    df_temp2 = df2.drop(List2, axis=1)
    
    # 합체!!
    df_temp_final = pd.merge(df_temp, df_temp2, on='id')
    df_temp_final = pd.merge(df_temp_final, Y0, on='id')
    df_temp_final = pd.merge(df_temp_final, Y1, on='id')
    df_temp_final = pd.merge(df_temp_final, Y2, on='id')
    df_temp_final = pd.merge(df_temp_final, Y3, on='id')
    
    X_train, X_test, y_train, y_test = get_train_test(df_temp_final) 
    
    y_submit_lgb_final[col], valid_mae, y_val_pred[col] = my_permutation_lgb(X_train,
                                                      X_test,y_train,  y_test, param, kf, col)
    valid_score.append(valid_mae)
    print(valid_mae)

[100]	training's l1: 0.565774	valid_1's l1: 0.669097


KeyboardInterrupt: 

In [None]:
# Save the final test predictions; the frame's index is written as the first CSV column.
y_submit_lgb_final.to_csv("./submission/submission_gichan.csv")

## 광한 기찬 서브미션 앙상블

In [None]:
# Reload both submissions to be blended. No index_col is given, so each frame
# gets a fresh 0..n-1 index and any saved index comes back as an ordinary column.
# NOTE(review): the second file is presumably the teammate's final submission -- confirm.
y_submit_lgb_final = pd.read_csv('./submission/submission_gichan.csv')
submission_gwang = pd.read_csv('./submission/0.77871_gwang_final.csv')

In [None]:
# Blend the two submissions with equal weight: every target column becomes the
# mean of both files. Rows are paired by index, so both files must list the
# samples in the same order. All other columns are taken from submission_gwang.
_blend_cols = ['hhb','hbo2','ca', 'na']

final = submission_gwang.copy()
_blend_sum = submission_gwang[_blend_cols] + y_submit_lgb_final[_blend_cols]
final[_blend_cols] = _blend_sum / 2
final.to_csv('./submission/final_submission.csv', index=False)