In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
# import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn import base
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFE


# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVR
from lightgbm import LGBMClassifier
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import BayesianRidge

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

# Utility
import os
import time
import random
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")
from IPython.display import Image
# import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from tensorflow import keras

# from bayes_opt import BayesianOptimization

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
X_new = pd.read_csv(os.path.abspath("../input")+"/1round_third_select_499_train.csv" , encoding = 'utf-8')
X_te_new = pd.read_csv(os.path.abspath("../input")+"/1round_third_select_499_test.csv" , encoding = 'utf-8')

In [3]:
num_features_train = pd.read_csv(os.path.abspath("../input")+"/X_train.csv" , encoding ='cp949')
num_features_test = pd.read_csv(os.path.abspath("../input")+"/X_test.csv" , encoding ='cp949')

In [4]:
target = pd.read_csv(os.path.abspath("../input")+'/y_train.csv' , encoding = 'cp949').group

# LGBM_BO Modeling

In [5]:
# LGBM_BO Modeling

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_new, target, test_size=0.3, random_state = 0, stratify = target)

In [9]:
import pickle
import joblib

IDtest = num_features_train.custid.unique()

pikle_data = (np.array(X_train), np.array(X_val), y_train, y_val, np.array(X_te_new), np.array(IDtest))

with open('DNN_features.pkl', 'wb') as f:
    pickle.dump(pikle_data, f)

In [7]:
from bayes_opt import BayesianOptimization

In [8]:
bayesian_params = {
    'n_estimators':(50, 500),
    'learning_rate':(0.001, 0.1),
    'max_depth':(8, 16),
    'num_leaves':(24, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}

In [9]:
def lgb_log_loss_eval(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree, max_bin, reg_lambda, reg_alpha):
    
    params = {
        "n_estimators":int(round(n_estimators)), 
        'learning_rate': learning_rate,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0),
        'random_state':1000,
        'n_jobs':-1
    }
    
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric= 'logloss', verbose= 100, 
                early_stopping_rounds= 100)
    valid_pred = lgb_model.predict_proba(X_val)
    LL = log_loss(y_val, valid_pred)
    
    return LL

In [10]:
lgbBO = BayesianOptimization(f = lgb_log_loss_eval, pbounds=bayesian_params, random_state=1000)
lgbBO.maximize(init_points=5, n_iter=50)

|   iter    |  target   | colsam... | learni... |  max_bin  | max_depth | min_ch... | min_ch... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 1.618   [0m | [0m 0.8268  [0m | [0m 0.01239 [0m | [0m 475.6   [0m | [0m 11.86   [0m | [0m 175.8   [0m | [0m 11.4    [0m | [0m 68.32   [0m | [0m 39.89   [0m | [0m 11.66   [0m | [0m 8.418   [0m | [0m 0.6035  [0m |
[100]	training's multi_logloss: 1.36263	valid_1's multi_logloss: 1.50144
[200]	training's multi_logloss: 1.24786	valid_1's multi_logloss: 1.48094
[300]	training's multi_logloss: 1.17572	valid_1's multi_logloss: 1.47725
[400]	training's multi_logloss: 1.13489	valid_1's multi_logloss: 1.478
| [0m 2       [0m | [0m 1.477   [0m | [0m 0.8712  [0m | [0m 0.03982 [0m | [0m 99.31   [0m | [0m 13.95   [0m | [0m 23.22

[100]	training's multi_logloss: 1.40283	valid_1's multi_logloss: 1.52235
| [0m 22      [0m | [0m 1.516   [0m | [0m 0.7965  [0m | [0m 0.03763 [0m | [0m 435.9   [0m | [0m 9.637   [0m | [0m 83.84   [0m | [0m 2.188   [0m | [0m 115.3   [0m | [0m 37.38   [0m | [0m 41.65   [0m | [0m 6.016   [0m | [0m 0.8648  [0m |
| [0m 23      [0m | [0m 1.566   [0m | [0m 0.5648  [0m | [0m 0.03875 [0m | [0m 65.62   [0m | [0m 15.36   [0m | [0m 88.48   [0m | [0m 1.118   [0m | [0m 51.35   [0m | [0m 35.48   [0m | [0m 38.55   [0m | [0m 8.34    [0m | [0m 0.9788  [0m |
| [0m 24      [0m | [0m 1.591   [0m | [0m 0.8235  [0m | [0m 0.01878 [0m | [0m 470.1   [0m | [0m 14.84   [0m | [0m 175.2   [0m | [0m 10.43   [0m | [0m 60.61   [0m | [0m 32.98   [0m | [0m 16.55   [0m | [0m 8.837   [0m | [0m 0.5008  [0m |
[100]	training's multi_logloss: 1.14812	valid_1's multi_logloss: 1.47153
[200]	training's multi_logloss: 0.930613	valid_1's multi_logloss: 1.

[100]	training's multi_logloss: 1.57853	valid_1's multi_logloss: 1.61829
| [0m 44      [0m | [0m 1.61    [0m | [0m 0.7846  [0m | [0m 0.011   [0m | [0m 283.1   [0m | [0m 15.53   [0m | [0m 191.9   [0m | [0m 17.14   [0m | [0m 108.9   [0m | [0m 54.02   [0m | [0m 39.89   [0m | [0m 4.221   [0m | [0m 0.6714  [0m |
[100]	training's multi_logloss: 1.48559	valid_1's multi_logloss: 1.58113
[200]	training's multi_logloss: 1.34494	valid_1's multi_logloss: 1.51776
[300]	training's multi_logloss: 1.24893	valid_1's multi_logloss: 1.49299
| [0m 45      [0m | [0m 1.483   [0m | [0m 0.5358  [0m | [0m 0.0126  [0m | [0m 246.8   [0m | [0m 13.32   [0m | [0m 117.5   [0m | [0m 5.412   [0m | [0m 383.1   [0m | [0m 41.53   [0m | [0m 13.23   [0m | [0m 6.24    [0m | [0m 0.5461  [0m |
[100]	training's multi_logloss: 0.798755	valid_1's multi_logloss: 1.47969
| [0m 46      [0m | [0m 1.477   [0m | [0m 0.8433  [0m | [0m 0.06378 [0m | [0m 352.4   [0m | [0m 15.

In [11]:
# dictionary에 있는 target값을 모두 추출
target_list = []

for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmin(np.array(target_list)))

[1.6180766132143152, 1.4771480931188807, 1.4986236823777845, 1.4865929511101474, 1.4864684198990108, 1.4761457599925478, 1.4732015788119721, 1.481429941517693, 1.4779121866594425, 1.571835428077632, 1.5684302235359127, 1.4927936611443855, 1.554248309366624, 1.4968960772406232, 1.7047804606751118, 1.5183652754293264, 1.653741736036426, 1.493191892742638, 1.4985408851282322, 1.4965054169759506, 1.5456786858646698, 1.5161884829622883, 1.5661772260059434, 1.5912441544058429, 1.468628214769404, 1.5037108485938704, 1.5069424997581757, 1.4715535991096913, 1.5147208647206722, 1.5018880256054477, 1.4871269194637353, 1.4794706466045988, 1.5160707358615568, 1.475861739780996, 1.480647345395523, 1.4766480233963675, 1.4901128297295496, 1.5013526930356011, 1.499800473009222, 1.4902029572632227, 1.5154260725926423, 1.5009286342511807, 1.5073718377024625, 1.6095818613813475, 1.483321475963915, 1.4773403259494458, 1.4956001395437006, 1.5034134247978768, 1.5066672712970568, 1.5909097497759395, 1.4991613

In [12]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmin(np.array(target_list))]
print(max_dict)

{'target': 1.468628214769404, 'params': {'colsample_bytree': 0.7332583469184908, 'learning_rate': 0.05376702188186737, 'max_bin': 84.74524468918648, 'max_depth': 10.69908199214511, 'min_child_samples': 180.21851856592102, 'min_child_weight': 45.29455927988186, 'n_estimators': 250.61717140680972, 'num_leaves': 50.677188681346735, 'reg_alpha': 0.34933638595536765, 'reg_lambda': 9.987015785769142, 'subsample': 0.5655322532561203}}


# *OOF 스태킹*

In [20]:
target = pd.read_csv(os.path.abspath("../input")+'/y_train.csv' , encoding = 'cp949').group

In [21]:
from sklearn.preprocessing import LabelEncoder

In [22]:
encoder = LabelEncoder()

In [23]:
y = encoder.fit_transform(target)

In [24]:
X_new.shape, X_te_new.shape

((21587, 499), (14380, 499))

In [25]:
lgb_oof_ver2 = np.zeros((X_new.shape[0], 19))
lgb_oof_ver2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from time import time
import datetime
import gc

# FE
from scipy.signal import find_peaks, peak_widths, peak_prominences

# Model
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import xgboost as xgb

# Ensemble
from scipy.optimize import minimize
from sklearn.metrics import log_loss

***

{'target': 1.468628214769404, 'params': {'colsample_bytree': 0.7332583469184908, 'learning_rate': 0.05376702188186737, 'max_bin': 84.74524468918648, 'max_depth': 10.69908199214511, 'min_child_samples': 180.21851856592102, 'min_child_weight': 45.29455927988186, 'n_estimators': 250.61717140680972, 'num_leaves': 50.677188681346735, 'reg_alpha': 0.34933638595536765, 'reg_lambda': 9.987015785769142, 'subsample': 0.5655322532561203}}

***

In [27]:
clf = LGBMClassifier(
                objective='multiclass',
                metric='multi_logloss',
                nthread=4,
                n_estimators=250,
                learning_rate=0.05376702188186737,
                max_bin=85,
                max_depth=11,
                num_leaves=51,
                colsample_bytree=0.7332583469184908,
                subsample=0.5655322532561203,
                reg_alpha=0.34933638595536765,
                reg_lambda=9.987015785769142,
                min_child_samples=180,
                min_child_weight=45.29455927988186,
                silent=-1,
                verbose=-1,
                random_state=1000
                )

In [28]:
n_splits = 10
mlogloss = []
lgb_oof_ver2 = np.zeros((X_new.shape[0], 8))
lgb_pred_ver2 = np.zeros((X_te_new.shape[0], 8))

for X, X_test in [(X_new,X_te_new)]:
    X= X.reset_index(drop=True)
    for seed in [0,1000]:
        kfold = StratifiedKFold(n_splits=n_splits, random_state= seed, shuffle=True)
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(X, y)):
            X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
            y_train, y_valid = y[trn_idx], y[val_idx]

            clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'logloss', verbose= 200, 
                early_stopping_rounds= 200)
            # Predict
            lgb_pred_ver2 += clf.predict_proba(X_test) / (n_splits * 4)
            lgb_oof_ver2[val_idx] += clf.predict_proba(X_valid) / 4
            print('*'* 85)
            print('Training has finished.')
        print('lgb ver2 logloss= ', log_loss(y, lgb_oof_ver2))

[200]	training's multi_logloss: 0.946542	valid_1's multi_logloss: 1.46057
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 0.943317	valid_1's multi_logloss: 1.47205
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 0.950352	valid_1's multi_logloss: 1.46583
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 0.947374	valid_1's multi_logloss: 1.48242
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 0.953174	valid_1's multi_logloss: 1.44846
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 0.950957	valid_1's multi_logloss: 1.45808
***********

In [29]:
all_oof = np.column_stack([lgb_oof_ver2])
all_test = np.column_stack([lgb_pred_ver2])

In [30]:
pd.DataFrame(all_test)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.067822,0.054536,0.018386,0.077573,0.037053,0.065385,0.013187,0.166058
1,0.013510,0.034492,0.262041,0.007544,0.001991,0.016504,0.149249,0.014670
2,0.416722,0.056078,0.004218,0.002852,0.010712,0.006276,0.001243,0.001900
3,0.272008,0.066794,0.019654,0.021418,0.015980,0.075622,0.018960,0.009564
4,0.441558,0.024129,0.004873,0.003393,0.017990,0.004705,0.001499,0.001853
...,...,...,...,...,...,...,...,...
14375,0.032353,0.221874,0.022800,0.008792,0.007031,0.158272,0.039870,0.009008
14376,0.196193,0.105315,0.067057,0.034056,0.008402,0.036092,0.025006,0.027880
14377,0.150580,0.085071,0.045800,0.039253,0.069070,0.056512,0.018296,0.035419
14378,0.062192,0.184805,0.125750,0.005162,0.004608,0.062049,0.053561,0.001872


In [31]:
X_train.shape

(19429, 499)

In [32]:
all_test.shape

(14380, 8)

In [33]:
mlogloss = []
n_splits = 10

stk_oof_pred = np.zeros((all_oof.shape[0], 8))
stk_test_pred = np.zeros((all_test.shape[0], 8))

kfold = StratifiedKFold(n_splits=n_splits, random_state=1000, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kfold.split(all_oof, y)):
    X_train, X_valid = all_oof[trn_idx], all_oof[val_idx]
    y_train, y_valid = y[trn_idx], y[val_idx]
            
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'logloss', verbose= 200, 
                early_stopping_rounds= 200)


    stk_test_pred += clf.predict_proba(all_test) / n_splits
    stk_oof_pred[val_idx] = clf.predict_proba(X_valid)
    
print('mean logloss= ',np.mean(mlogloss))

[200]	training's multi_logloss: 1.26729	valid_1's multi_logloss: 1.47699
[200]	training's multi_logloss: 1.26749	valid_1's multi_logloss: 1.47308
[200]	training's multi_logloss: 1.26524	valid_1's multi_logloss: 1.49973
[200]	training's multi_logloss: 1.26925	valid_1's multi_logloss: 1.45977
[200]	training's multi_logloss: 1.26416	valid_1's multi_logloss: 1.5157
[200]	training's multi_logloss: 1.26757	valid_1's multi_logloss: 1.48055
[200]	training's multi_logloss: 1.26722	valid_1's multi_logloss: 1.47002
[200]	training's multi_logloss: 1.26681	valid_1's multi_logloss: 1.47921
[200]	training's multi_logloss: 1.26606	valid_1's multi_logloss: 1.48875
[200]	training's multi_logloss: 1.2637	valid_1's multi_logloss: 1.49921
mean logloss=  nan


In [34]:
stk_test_pred = pd.DataFrame(stk_test_pred)

In [35]:
stk_test_pred = stk_test_pred.rename(columns = {0:'F20',1:'F30',2:'F40',3:'F50',4:'M20',5:'M30',6:'M40',7:'M50'})

In [36]:
stk_test_pred

Unnamed: 0,F20,F30,F40,F50,M20,M30,M40,M50
0,0.156663,0.089070,0.053010,0.171149,0.122487,0.148980,0.029384,0.229257
1,0.007516,0.055356,0.602576,0.006623,0.002759,0.010515,0.284459,0.030196
2,0.850770,0.087259,0.011270,0.006768,0.022990,0.011907,0.002247,0.006788
3,0.350098,0.312041,0.032786,0.050447,0.124847,0.076881,0.020503,0.032398
4,0.839838,0.078835,0.017156,0.007975,0.033409,0.010379,0.002850,0.009559
...,...,...,...,...,...,...,...,...
14375,0.049020,0.482403,0.027145,0.025468,0.009461,0.290934,0.105399,0.010172
14376,0.310158,0.211895,0.235567,0.061233,0.022786,0.084297,0.033561,0.040503
14377,0.266065,0.162950,0.070723,0.058905,0.198516,0.141810,0.036061,0.064971
14378,0.134219,0.349731,0.256458,0.004877,0.015971,0.073414,0.158630,0.006701


In [37]:
test_id_pd = pd.read_csv(os.path.abspath("../submission")+"/bbi_cat.csv" , encoding ='cp949')

In [38]:
tst_id = test_id_pd['ID']

In [39]:
submission = pd.concat([pd.DataFrame({'ID':tst_id}),stk_test_pred],axis = 1)
submission.to_csv('1round_third_lgbm_stk_oof_10.csv',index = False,encoding = 'utf-8')

In [40]:
submission

Unnamed: 0,ID,F20,F30,F40,F50,M20,M30,M40,M50
0,30001,0.156663,0.089070,0.053010,0.171149,0.122487,0.148980,0.029384,0.229257
1,30002,0.007516,0.055356,0.602576,0.006623,0.002759,0.010515,0.284459,0.030196
2,30003,0.850770,0.087259,0.011270,0.006768,0.022990,0.011907,0.002247,0.006788
3,30005,0.350098,0.312041,0.032786,0.050447,0.124847,0.076881,0.020503,0.032398
4,30007,0.839838,0.078835,0.017156,0.007975,0.033409,0.010379,0.002850,0.009559
...,...,...,...,...,...,...,...,...,...
14375,49988,0.049020,0.482403,0.027145,0.025468,0.009461,0.290934,0.105399,0.010172
14376,49990,0.310158,0.211895,0.235567,0.061233,0.022786,0.084297,0.033561,0.040503
14377,49992,0.266065,0.162950,0.070723,0.058905,0.198516,0.141810,0.036061,0.064971
14378,49993,0.134219,0.349731,0.256458,0.004877,0.015971,0.073414,0.158630,0.006701
