In [30]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [31]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

# FE
from scipy.signal import find_peaks, peak_widths, peak_prominences

from sklearn.model_selection import train_test_split

import gc
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import contextlib

from sklearn.metrics import accuracy_score

from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold

In [32]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

# 데이터 로드

In [33]:
data_dir = Path('../input/dankook')
sub_dir = Path('../output/')

train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
sub_file = sub_dir / 'submission.csv'

SEED = 2020

In [34]:
df_train = pd.read_csv(train_file, index_col=0)
df_test = pd.read_csv(test_file, index_col=0)

# 변수 생성

ver1: 앞뒤 컬럼의 차이를 변수로 생성

ver2: 5개 컬럼 전의 컬럼과의 차이까지 변수로 생성

In [35]:
wave_columns = df_train.columns.drop(['nObserve','nDetect','class','redshift'])

## Ver1

In [36]:
for j in range(14):
    name = 'diff_' + str(wave_columns[j+1]) + '_' + str(wave_columns[j])
    df_train[name] = df_train[wave_columns[j+1]] - df_train[wave_columns[j]]
    df_test[name] = df_test[wave_columns[j+1]] - df_test[wave_columns[j]]
    print(wave_columns[j+1], ' - ', wave_columns[j], j)

g  -  u 0
r  -  g 1
i  -  r 2
z  -  i 3
dered_u  -  z 4
dered_g  -  dered_u 5
dered_r  -  dered_g 6
dered_i  -  dered_r 7
dered_z  -  dered_i 8
airmass_u  -  dered_z 9
airmass_g  -  airmass_u 10
airmass_r  -  airmass_g 11
airmass_i  -  airmass_r 12
airmass_z  -  airmass_i 13


In [37]:
# 15포인트 랭킹

mag_rank_tr = df_train[wave_columns].rank(axis=1)
mag_rank_tt = df_test[wave_columns].rank(axis=1)

rank_col = []
for col in df_train[wave_columns].columns:
    col = col + '_rank'
    rank_col.append(col)
mag_rank_tr.columns = rank_col
mag_rank_tt.columns = rank_col

df_train = pd.concat([df_train, mag_rank_tr], axis=1)
df_test = pd.concat([df_test,mag_rank_tt], axis=1)

In [38]:
# 측정방법별 파장 차이 비교 변수

diff_col = []
for col in ['u','g','r','i','z']:
    for i in range(2):
        diff_col.append(col + '_' + str(i))

mag_wave_diff_tr = pd.DataFrame(np.zeros((df_train.shape[0], 10)), index=df_train.index)
mag_wave_diff_tt = pd.DataFrame(np.zeros((df_test.shape[0],10)), index=df_test.index)

for i in range(0,10,5):
    for j in range(5):
        mag_wave_diff_tr.loc[:, j+i] = df_train[wave_columns[j]] - df_train[wave_columns[5+j+i]]
        mag_wave_diff_tt.loc[:, j+i] = df_test[wave_columns[j]] - df_test[wave_columns[5+j+i]]
        print(wave_columns[j], ' - ', wave_columns[5+j+i],i+j)

u  -  dered_u 0
g  -  dered_g 1
r  -  dered_r 2
i  -  dered_i 3
z  -  dered_z 4
u  -  airmass_u 5
g  -  airmass_g 6
r  -  airmass_r 7
i  -  airmass_i 8
z  -  airmass_z 9


In [39]:
mag_wave_diff_tr.columns = diff_col
mag_wave_diff_tt.columns = diff_col

df_train = pd.concat([df_train, mag_wave_diff_tr], axis=1)
df_test = pd.concat([df_test, mag_wave_diff_tt], axis=1)



In [40]:
df_train['nObserve'] = df_train['nObserve'].apply(np.log1p)
df_test['nObserve'] = df_test['nObserve'].apply(np.log1p)

df_train['d_obs_det'] = df_train['nObserve'] - df_train['nDetect']
df_test['d_obs_det'] = df_test['nObserve'] - df_test['nDetect']

In [41]:
delete_column=['u','g','r','i','nDetect',
               'airmass_g', 'airmass_r', 'airmass_i', 'airmass_z',
               'dered_u','dered_g','dered_r','dered_i',
               'diff_airmass_g_airmass_u','diff_airmass_r_airmass_g',
               'diff_airmass_i_airmass_r',
               'airmass_u_rank', 'airmass_g_rank', 'airmass_r_rank',
               'airmass_i_rank','i_1','z_1','u_1','i_0','r_rank','dered_z_rank','dered_g_rank','z_rank','dered_i_rank','airmass_z_rank','i_rank','dered_u_rank','u_rank','g_rank','dered_r_rank','airmass_u','diff_airmass_z_airmass_i'
              ]

In [42]:
df_train.drop(delete_column, axis=1, inplace=True)
df_test.drop(delete_column, axis=1, inplace=True)

# 데이터셋 생성

In [43]:
features = df_train.columns.drop(['class'])
scaler = MinMaxScaler()

# train set
X = scaler.fit_transform(df_train.drop(columns=['class'], axis=1))
X = pd.DataFrame(X, columns=features)
y = df_train.loc[:,'class']

# test set
test = scaler.transform(df_test)
test = pd.DataFrame(test, columns=features)

In [44]:
X = pd.DataFrame(X, columns=features)
test = pd.DataFrame(test,columns=features)

In [45]:
# train set split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=2020)

# feature 선택

In [46]:
# Xgboost
xgb_params = {
    'n_jobs' : -1, 
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'tree_method': 'auto',
    'objective': 'multi:softmax',
    'num_class': 3,
    'random_state': 2020
}

In [47]:
xgb_clf = xgb.XGBClassifier(**xgb_params)

In [19]:
# Classifier
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb_clf,
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'max_bin': (100, 1000),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'min_child_weight': (0, 10),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'scale_pos_weight': (1e-6, 500, 'log-uniform'),
        'n_estimators': (50, 100),
    },    
    cv = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=2020
    ),
    scoring='neg_log_loss',
    n_jobs = -1,
    n_iter = 1000,   
    verbose = 0,
    refit = True,
    random_state = 2020
)

def status_print(optim_result):
    """Status callback durring bayesian hyperparameter search"""
    
    # Get all the models tested so far in DataFrame format
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)    
    
    # Get current parameters and the best parameters    
    best_params = pd.Series(bayes_cv_tuner.best_params_)
    print('Model #{}\nBest ROC-AUC: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))
    
    # Save all model results
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")

In [20]:

result = bayes_cv_tuner.fit(X.values, y.values, callback=status_print)

Model #1
Best ROC-AUC: -0.2117
Best params: OrderedDict([('colsample_bylevel', 0.20284208990056435), ('colsample_bytree', 0.4100434869006526), ('gamma', 1.3370880967690957e-07), ('learning_rate', 0.616247102917486), ('max_bin', 583), ('max_delta_step', 1), ('max_depth', 12), ('min_child_weight', 2), ('n_estimators', 76), ('reg_alpha', 0.04669386473350748), ('reg_lambda', 631), ('scale_pos_weight', 52), ('subsample', 0.03193070143140899)])

Model #2
Best ROC-AUC: -0.2117
Best params: OrderedDict([('colsample_bylevel', 0.20284208990056435), ('colsample_bytree', 0.4100434869006526), ('gamma', 1.3370880967690957e-07), ('learning_rate', 0.616247102917486), ('max_bin', 583), ('max_delta_step', 1), ('max_depth', 12), ('min_child_weight', 2), ('n_estimators', 76), ('reg_alpha', 0.04669386473350748), ('reg_lambda', 631), ('scale_pos_weight', 52), ('subsample', 0.03193070143140899)])

Model #3
Best ROC-AUC: -0.1604
Best params: OrderedDict([('colsample_bylevel', 0.6460711463783065), ('colsample_

ValueError: All integer values shouldbe greater than 0.000000

In [50]:
X.max(), X.min(), y.min(), y.max()

(z                         1.0
 redshift                  1.0
 dered_z                   1.0
 nObserve                  1.0
 diff_g_u                  1.0
 diff_r_g                  1.0
 diff_i_r                  1.0
 diff_z_i                  1.0
 diff_dered_u_z            1.0
 diff_dered_g_dered_u      1.0
 diff_dered_r_dered_g      1.0
 diff_dered_i_dered_r      1.0
 diff_dered_z_dered_i      1.0
 diff_airmass_u_dered_z    1.0
 u_0                       1.0
 g_0                       1.0
 g_1                       1.0
 r_0                       1.0
 r_1                       1.0
 z_0                       1.0
 d_obs_det                 1.0
 dtype: float64,
 z                         0.0
 redshift                  0.0
 dered_z                   0.0
 nObserve                  0.0
 diff_g_u                  0.0
 diff_r_g                  0.0
 diff_i_r                  0.0
 diff_z_i                  0.0
 diff_dered_u_z            0.0
 diff_dered_g_dered_u      0.0
 diff_dered_r_dered_g 