In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
# import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn import base
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFE


# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVR
from lightgbm import LGBMClassifier
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.linear_model import BayesianRidge

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

# Utility
import os
import time
import random
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")
from IPython.display import Image
# import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from tensorflow import keras

# from bayes_opt import BayesianOptimization

  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)
  _numeric_index_types = (pd.Int64Index, pd.Float64Index, pd.UInt64Index)


In [2]:
X_new = pd.read_csv(os.path.abspath("../input")+"/choi_select_547_train.csv" , encoding = 'utf-8')
X_te_new = pd.read_csv(os.path.abspath("../input")+"/choi_select_547_test.csv" , encoding = 'utf-8')

In [3]:
num_features_train = pd.read_csv(os.path.abspath("../input")+"/choi_num_features_train.csv" , encoding = 'utf-8')
num_features_test = pd.read_csv(os.path.abspath("../input")+"/choi_num_features_test.csv" , encoding = 'utf-8')

In [4]:
target = pd.read_csv(os.path.abspath("../input")+'/y_train.csv' , encoding = 'cp949').group

# LGBM_BO Modeling

In [5]:
# LGBM_BO Modeling

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X_new, target, test_size=0.3, random_state = 0, stratify = target)

In [7]:
import pickle
import joblib

IDtest = num_features_train.custid.unique()

pikle_data = (np.array(X_train), np.array(X_val), y_train, y_val, np.array(X_te_new), np.array(IDtest))

with open('DNN_features.pkl', 'wb') as f:
    pickle.dump(pikle_data, f)

In [8]:
from bayes_opt import BayesianOptimization

In [9]:
bayesian_params = {
    'n_estimators':(50, 500),
    'learning_rate':(0.001, 0.1),
    'max_depth':(8, 16),
    'num_leaves':(24, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}

In [10]:
def lgb_log_loss_eval(n_estimators, learning_rate, max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree, max_bin, reg_lambda, reg_alpha):
    
    params = {
        "n_estimators":int(round(n_estimators)), 
        'learning_rate': learning_rate,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0),
        'random_state':1000,
        'n_jobs':-1
    }
    
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric= 'logloss', verbose= 100, 
                early_stopping_rounds= 100)
    valid_pred = lgb_model.predict_proba(X_val)
    LL = log_loss(y_val, valid_pred)
    
    return LL

In [11]:
lgbBO = BayesianOptimization(f = lgb_log_loss_eval, pbounds=bayesian_params, random_state=1000)
lgbBO.maximize(init_points=5, n_iter=50)

|   iter    |  target   | colsam... | learni... |  max_bin  | max_depth | min_ch... | min_ch... | n_esti... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 1.606   [0m | [0m 0.8268  [0m | [0m 0.01239 [0m | [0m 475.6   [0m | [0m 11.86   [0m | [0m 175.8   [0m | [0m 11.4    [0m | [0m 68.32   [0m | [0m 39.89   [0m | [0m 11.66   [0m | [0m 8.418   [0m | [0m 0.6035  [0m |
[100]	training's multi_logloss: 1.25423	valid_1's multi_logloss: 1.47967
[200]	training's multi_logloss: 1.07755	valid_1's multi_logloss: 1.45989
[300]	training's multi_logloss: 0.956931	valid_1's multi_logloss: 1.456
[400]	training's multi_logloss: 0.868089	valid_1's multi_logloss: 1.45687
| [0m 2       [0m | [0m 1.456   [0m | [0m 0.8712  [0m | [0m 0.03982 [0m | [0m 99.31   [0m | [0m 13.95   [0m | [0m 23.

[200]	training's multi_logloss: 1.11805	valid_1's multi_logloss: 1.46686
| [0m 21      [0m | [0m 1.467   [0m | [0m 0.6516  [0m | [0m 0.09152 [0m | [0m 158.6   [0m | [0m 11.56   [0m | [0m 124.0   [0m | [0m 15.84   [0m | [0m 258.0   [0m | [0m 37.98   [0m | [0m 39.34   [0m | [0m 8.326   [0m | [0m 0.9828  [0m |
| [0m 22      [0m | [0m 1.477   [0m | [0m 0.7061  [0m | [0m 0.08002 [0m | [0m 473.9   [0m | [0m 12.39   [0m | [0m 182.5   [0m | [0m 17.52   [0m | [0m 56.32   [0m | [0m 49.1    [0m | [0m 21.55   [0m | [0m 7.584   [0m | [0m 0.8887  [0m |
| [0m 23      [0m | [0m 1.543   [0m | [0m 0.5648  [0m | [0m 0.03875 [0m | [0m 65.62   [0m | [0m 15.36   [0m | [0m 88.48   [0m | [0m 1.118   [0m | [0m 51.35   [0m | [0m 35.48   [0m | [0m 38.55   [0m | [0m 8.34    [0m | [0m 0.9788  [0m |
| [0m 24      [0m | [0m 1.576   [0m | [0m 0.8235  [0m | [0m 0.01878 [0m | [0m 470.1   [0m | [0m 14.84   [0m | [0m 175.2   [0m |

[100]	training's multi_logloss: 1.40958	valid_1's multi_logloss: 1.56489
[200]	training's multi_logloss: 1.222	valid_1's multi_logloss: 1.49696
[300]	training's multi_logloss: 1.08903	valid_1's multi_logloss: 1.47076
| [0m 45      [0m | [0m 1.461   [0m | [0m 0.5358  [0m | [0m 0.0126  [0m | [0m 246.8   [0m | [0m 13.32   [0m | [0m 117.5   [0m | [0m 5.412   [0m | [0m 383.1   [0m | [0m 41.53   [0m | [0m 13.23   [0m | [0m 6.24    [0m | [0m 0.5461  [0m |
[100]	training's multi_logloss: 0.601097	valid_1's multi_logloss: 1.46548
| [0m 46      [0m | [0m 1.463   [0m | [0m 0.8433  [0m | [0m 0.06378 [0m | [0m 352.4   [0m | [0m 15.43   [0m | [0m 139.0   [0m | [0m 6.911   [0m | [0m 262.4   [0m | [0m 45.78   [0m | [0m 1.28    [0m | [0m 8.062   [0m | [0m 0.5683  [0m |
[100]	training's multi_logloss: 1.20798	valid_1's multi_logloss: 1.47065
| [0m 47      [0m | [0m 1.47    [0m | [0m 0.7734  [0m | [0m 0.08886 [0m | [0m 293.6   [0m | [0m 10.28

In [12]:
# dictionary에 있는 target값을 모두 추출
target_list = []

for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmin(np.array(target_list)))

[1.6060982858125845, 1.4556068675499798, 1.4655825658321555, 1.4688808847928827, 1.4591822611433771, 1.4669657412254045, 1.452348853289662, 1.4645161000335565, 1.4626696571767683, 1.4799900664906072, 1.5443871109560476, 1.4671639551435856, 1.5274123757440137, 1.4675921068085074, 1.6950039702477975, 1.4933062656887393, 1.638815821205718, 1.4627763950780488, 1.4680463155627095, 1.4662920486477267, 1.4668566526359093, 1.4767312387673706, 1.5428294883105653, 1.5758729620943188, 1.4624938648355332, 1.4781610996959282, 1.4854362789477158, 1.468151754343313, 1.459436314821971, 1.4713478269480162, 1.4730684078905438, 1.4642859113913247, 1.4891334602953101, 1.4998583551808504, 1.461139181123826, 1.4627578493200197, 1.4619787961882376, 1.4690911586541557, 1.467681102742748, 1.4632863093789008, 1.4727158703805407, 1.4774825834921883, 1.4877343294300907, 1.5913676784636825, 1.4607328430612407, 1.4634965955517054, 1.4699443732746134, 1.470698345132294, 1.4753763253485417, 1.4921084861715843, 1.4671

In [13]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmin(np.array(target_list))]
print(max_dict)

{'target': 1.452348853289662, 'params': {'colsample_bytree': 0.5489297647691318, 'learning_rate': 0.033947270113616715, 'max_bin': 64.96058512468377, 'max_depth': 13.838623612170684, 'min_child_samples': 65.50145788216062, 'min_child_weight': 37.87286066499082, 'n_estimators': 355.00257308928775, 'num_leaves': 48.02022146460943, 'reg_alpha': 21.197890290451262, 'reg_lambda': 9.521528307448555, 'subsample': 0.5990212685265623}}


# *OOF 스태킹*

In [24]:
target = pd.read_csv(os.path.abspath("../input")+'/y_train.csv' , encoding = 'cp949').group

In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
encoder = LabelEncoder()

In [27]:
y = encoder.fit_transform(target)

In [28]:
X_new.shape, X_te_new.shape

((21587, 547), (14380, 547))

In [29]:
lgb_oof_ver2 = np.zeros((X_new.shape[0], 19))
lgb_oof_ver2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from time import time
import datetime
import gc

# FE
from scipy.signal import find_peaks, peak_widths, peak_prominences

# Model
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
import xgboost as xgb

# Ensemble
from scipy.optimize import minimize
from sklearn.metrics import log_loss

***

{'target': 1.452348853289662, 'params': {'colsample_bytree': 0.5489297647691318, 'learning_rate': 0.033947270113616715, 'max_bin': 64.96058512468377, 'max_depth': 13.838623612170684, 'min_child_samples': 65.50145788216062, 'min_child_weight': 37.87286066499082, 'n_estimators': 355.00257308928775, 'num_leaves': 48.02022146460943, 'reg_alpha': 21.197890290451262, 'reg_lambda': 9.521528307448555, 'subsample': 0.5990212685265623}}

***

In [34]:
clf = LGBMClassifier(
                objective='multiclass',
                metric='multi_logloss',
                nthread=4,
                n_estimators=355,
                learning_rate=0.033947270113616715,
                max_bin=65,
                max_depth=14,
                num_leaves=48,
                colsample_bytree=0.5489297647691318,
                subsample=0.5990212685265623,
                reg_alpha=21.197890290451262,
                reg_lambda=9.521528307448555,
                min_child_samples=66,
                min_child_weight=37.87286066499082,
                silent=-1,
                verbose=-1,
                random_state=1000
                )

In [35]:
n_splits = 10
mlogloss = []
lgb_oof_ver2 = np.zeros((X_new.shape[0], 8))
lgb_pred_ver2 = np.zeros((X_te_new.shape[0], 8))

for X, X_test in [(X_new,X_te_new)]:
    X= X.reset_index(drop=True)
    for seed in [0,1000]:
        kfold = StratifiedKFold(n_splits=n_splits, random_state= seed, shuffle=True)
        for fold, (trn_idx, val_idx) in enumerate(kfold.split(X, y)):
            X_train, X_valid = X.iloc[trn_idx], X.iloc[val_idx]
            y_train, y_valid = y[trn_idx], y[val_idx]

            clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'logloss', verbose= 200, 
                early_stopping_rounds= 200)
            # Predict
            lgb_pred_ver2 += clf.predict_proba(X_test) / (n_splits * 4)
            lgb_oof_ver2[val_idx] += clf.predict_proba(X_valid) / 4
            print('*'* 85)
            print('Training has finished.')
        print('lgb ver2 logloss= ', log_loss(y, lgb_oof_ver2))

[200]	training's multi_logloss: 1.15848	valid_1's multi_logloss: 1.44976
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 1.15476	valid_1's multi_logloss: 1.46388
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 1.15675	valid_1's multi_logloss: 1.46218
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 1.15422	valid_1's multi_logloss: 1.47452
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 1.15665	valid_1's multi_logloss: 1.44548
*************************************************************************************
Training has finished.
[200]	training's multi_logloss: 1.15857	valid_1's multi_logloss: 1.43763
*****************

In [36]:
all_oof = np.column_stack([lgb_oof_ver2])
all_test = np.column_stack([lgb_pred_ver2])

In [37]:
pd.DataFrame(all_test)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.043863,0.056851,0.018624,0.076121,0.043695,0.174054,0.016139,0.070654
1,0.014689,0.079079,0.157967,0.008213,0.002926,0.016560,0.196939,0.023625
2,0.382456,0.089427,0.007827,0.004077,0.005917,0.004133,0.001771,0.004393
3,0.296074,0.051171,0.022629,0.019292,0.015418,0.071908,0.011358,0.012151
4,0.436762,0.026952,0.006998,0.003268,0.014175,0.005745,0.003516,0.002585
...,...,...,...,...,...,...,...,...
14375,0.034244,0.214441,0.040246,0.012426,0.012738,0.128336,0.047645,0.009925
14376,0.173447,0.130976,0.048655,0.028464,0.020169,0.048308,0.028673,0.021308
14377,0.177087,0.081713,0.051533,0.045639,0.032437,0.052109,0.020833,0.038649
14378,0.069617,0.199166,0.047143,0.010714,0.015145,0.115048,0.035688,0.007477


In [38]:
X_train.shape

(19429, 547)

In [39]:
all_test.shape

(14380, 8)

In [41]:
mlogloss = []
n_splits = 10

stk_oof_pred = np.zeros((all_oof.shape[0], 8))
stk_test_pred = np.zeros((all_test.shape[0], 8))

kfold = StratifiedKFold(n_splits=n_splits, random_state=1000, shuffle=True)
for fold, (trn_idx, val_idx) in enumerate(kfold.split(all_oof, y)):
    X_train, X_valid = all_oof[trn_idx], all_oof[val_idx]
    y_train, y_valid = y[trn_idx], y[val_idx]
            
    clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], eval_metric= 'logloss', verbose= 200, 
                early_stopping_rounds= 200)


    stk_test_pred += clf.predict_proba(all_test) / n_splits
    stk_oof_pred[val_idx] = clf.predict_proba(X_valid)
    
print('mean logloss= ',np.mean(mlogloss))

[200]	training's multi_logloss: 1.41369	valid_1's multi_logloss: 1.43158
[200]	training's multi_logloss: 1.41256	valid_1's multi_logloss: 1.44315
[200]	training's multi_logloss: 1.40926	valid_1's multi_logloss: 1.46161
[200]	training's multi_logloss: 1.4128	valid_1's multi_logloss: 1.43378
[200]	training's multi_logloss: 1.40961	valid_1's multi_logloss: 1.47002
[200]	training's multi_logloss: 1.41109	valid_1's multi_logloss: 1.45214
[200]	training's multi_logloss: 1.41402	valid_1's multi_logloss: 1.42645
[200]	training's multi_logloss: 1.41238	valid_1's multi_logloss: 1.43991
[200]	training's multi_logloss: 1.41158	valid_1's multi_logloss: 1.44692
[200]	training's multi_logloss: 1.40957	valid_1's multi_logloss: 1.46346
mean logloss=  nan


In [42]:
stk_test_pred = pd.DataFrame(stk_test_pred)

In [43]:
stk_test_pred = stk_test_pred.rename(columns = {0:'F20',1:'F30',2:'F40',3:'F50',4:'M20',5:'M30',6:'M40',7:'M50'})

In [44]:
stk_test_pred

Unnamed: 0,F20,F30,F40,F50,M20,M30,M40,M50
0,0.091214,0.107264,0.036091,0.157093,0.085900,0.320539,0.028990,0.172909
1,0.014220,0.145901,0.428837,0.022665,0.007769,0.029788,0.316705,0.034115
2,0.765611,0.160336,0.013934,0.011439,0.012696,0.020701,0.006567,0.008716
3,0.545416,0.147426,0.047545,0.041071,0.074535,0.096943,0.023613,0.023451
4,0.858288,0.055778,0.014141,0.011092,0.026631,0.016289,0.006075,0.011706
...,...,...,...,...,...,...,...,...
14375,0.073224,0.401464,0.083367,0.019107,0.015923,0.283476,0.109812,0.013627
14376,0.381390,0.268226,0.100962,0.046113,0.035342,0.086820,0.046323,0.034823
14377,0.345633,0.174871,0.103964,0.099055,0.061212,0.113580,0.035984,0.065700
14378,0.127541,0.428154,0.077704,0.016760,0.022703,0.224028,0.088861,0.014249


In [46]:
tst_id = num_features_test['custid']

In [47]:
submission = pd.concat([pd.DataFrame({'ID':tst_id}),stk_test_pred],axis = 1)
submission.to_csv('choi_lgbm_stk_oof_10.csv',index = False,encoding = 'utf-8')

In [48]:
submission

Unnamed: 0,ID,F20,F30,F40,F50,M20,M30,M40,M50
0,30001,0.091214,0.107264,0.036091,0.157093,0.085900,0.320539,0.028990,0.172909
1,30002,0.014220,0.145901,0.428837,0.022665,0.007769,0.029788,0.316705,0.034115
2,30003,0.765611,0.160336,0.013934,0.011439,0.012696,0.020701,0.006567,0.008716
3,30005,0.545416,0.147426,0.047545,0.041071,0.074535,0.096943,0.023613,0.023451
4,30007,0.858288,0.055778,0.014141,0.011092,0.026631,0.016289,0.006075,0.011706
...,...,...,...,...,...,...,...,...,...
14375,49988,0.073224,0.401464,0.083367,0.019107,0.015923,0.283476,0.109812,0.013627
14376,49990,0.381390,0.268226,0.100962,0.046113,0.035342,0.086820,0.046323,0.034823
14377,49992,0.345633,0.174871,0.103964,0.099055,0.061212,0.113580,0.035984,0.065700
14378,49993,0.127541,0.428154,0.077704,0.016760,0.022703,0.224028,0.088861,0.014249
