In [1]:
# (Re)load IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%reload_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb

from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold

In [3]:
# Display options: show up to 100 columns and 4 decimal places.
# The fully qualified option name is required: the bare 'max_columns' pattern
# is ambiguous on pandas versions that also define 'styler.render.max_columns',
# and set_option raises OptionError when a pattern matches more than one key.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Default figure size as (width, height).
rcParams['figure.figsize'] = (16, 8)
# Apply the 'fivethirtyeight' matplotlib style sheet.
plt.style.use('fivethirtyeight')
# Scale seaborn fonts by 2.5x.
# NOTE(review): sns.set() also applies seaborn's own default theme, which
# presumably overrides parts of the style selected on the previous line —
# confirm the statement order gives the intended look.
sns.set(font_scale=2.5)

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below; consider narrowing the filter.
warnings.filterwarnings('ignore')

# 데이터 로드

In [4]:
# Global random seed shared by the split and the models.
SEED = 2020

# Input / output locations (relative to the notebook's working directory).
data_dir = Path('../input/dankook')
sub_dir = Path('../output/')

# Competition files inside the input directory.
train_file, test_file, sample_file = (
    data_dir / file_name
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
# Where the submission file is written.
sub_file = sub_dir / 'submission.csv'

In [5]:
# Load train and test sets; the first CSV column is used as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_file, test_file)
)

# 이상치 제거

In [6]:
# Remove training rows whose value lies outside the [min, max] range observed
# in the test set, checked column by column over the first 18 columns.
train_shape = len(df_train)

for col in df_train.columns[:18]:
    lower, upper = df_test[col].min(), df_test[col].max()
    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    within_test_range = df_train[col].between(lower, upper)
    df_train = df_train.loc[within_test_range]

print('제거된 행 개수 :', train_shape - len(df_train))

제거된 행 개수 : 77


In [7]:
# Summary statistics of the training set after the range-based row removal.
df_train.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class
count,319923.0,319923.0,319923.0,319923.0,319923.0,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0
mean,19.8525,18.4318,17.7352,17.3037,17.0603,0.060481,19.6298,18.2591,17.6158,17.2151,16.9944,6.3334,6.1396,1.1758,1.1765,1.1751,1.1754,1.1761,1.1165
std,1.9392,1.6598,1.4586,1.3141,1.331,0.2911,1.9114,1.6594,1.4672,1.325,1.3232,8.8817,8.5522,0.1163,0.1181,0.1147,0.1155,0.1171,0.9234
min,2.2651,-12.4441,7.7314,7.7115,-9.2548,-25.915,-30.6337,-18.656,-8.756,-3.6973,0.2159,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0002,0.0
25%,18.7244,17.475,16.8773,16.524,16.289,3.5724e-05,18.5639,17.3486,16.7874,16.4532,16.2343,1.0,1.0,1.0883,1.0885,1.0878,1.0881,1.0883,0.0
50%,19.4195,18.1405,17.5259,17.1498,16.9177,0.047153,19.2647,18.0225,17.4434,17.0874,16.8694,2.0,2.0,1.1794,1.1792,1.1794,1.1794,1.1793,1.0
75%,20.432,19.0728,18.4279,18.0074,17.7288,0.094606,20.1976,18.8883,18.2908,17.907,17.6555,5.0,5.0,1.2275,1.226,1.2292,1.2286,1.2268,2.0
max,49.1436,46.3383,45.1299,32.8634,52.6127,44.62,30.7779,30.6132,31.294,30.5509,28.571,44.0,42.0,2.0491,2.0786,2.0205,2.0347,2.0637,2.0


In [8]:
# Summary statistics of the test set, for comparison with the training set.
df_test.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,19.8598,18.4371,17.7397,17.3065,17.149,0.060083,19.635,18.2638,17.6193,17.2178,16.8702,6.3499,6.1578,1.176,1.1767,1.1753,1.1756,1.1763
std,1.9543,1.703,1.4786,1.3207,24.6431,0.34684,1.921,1.6657,1.476,1.3324,35.4367,8.8728,8.5509,0.1164,0.1182,0.1148,0.1156,0.1173
min,-0.3385,-51.1753,-5.4387,5.3267,-39.5272,-30.149,-30.6337,-18.656,-8.756,-3.6973,-9999.0,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0001
25%,18.7273,17.478,16.882,16.5255,16.2882,3.3645e-05,18.5709,17.3522,16.7906,16.4562,16.231,1.0,1.0,1.0881,1.0883,1.0877,1.0879,1.0882
50%,19.4261,18.1451,17.5299,17.156,16.9212,0.047115,19.2674,18.0247,17.4487,17.0912,16.8733,2.0,2.0,1.1794,1.1793,1.1794,1.1794,1.1793
75%,20.4344,19.0793,18.4345,18.008,17.7333,0.094769,20.1999,18.8948,18.2952,17.9109,17.6578,5.0,5.0,1.2278,1.2262,1.2294,1.2289,1.2269
max,56.8471,94.3591,46.6913,33.0259,6976.3922,46.39,30.8899,30.9529,31.6536,30.9478,28.6441,44.0,42.0,2.0502,2.0797,2.0216,2.0358,2.0648


# 변수 생성

ver1: 앞뒤 컬럼의 차이를 변수로 생성

## Ver1

In [9]:
# Columns used for the per-band features: every training column except the
# observation counts, the target and redshift.
wave_columns = df_train.columns.drop(['nObserve','nDetect','class','redshift'])

In [10]:
# For every adjacent pair in wave_columns, add "next column - previous column"
# as a new feature on both train and test.
# NOTE(review): pairs that straddle measurement families (e.g. dered_u - z,
# airmass_u - dered_z) are generated too — confirm they are intended.
adjacent_pairs = zip(wave_columns[:-1], wave_columns[1:])
for j, (prev_col, next_col) in enumerate(adjacent_pairs):
    name = 'diff_{}_{}'.format(next_col, prev_col)
    df_train[name] = df_train[next_col] - df_train[prev_col]
    df_test[name] = df_test[next_col] - df_test[prev_col]
    print(next_col, ' - ', prev_col, j)

g  -  u 0
r  -  g 1
i  -  r 2
z  -  i 3
dered_u  -  z 4
dered_g  -  dered_u 5
dered_r  -  dered_g 6
dered_i  -  dered_r 7
dered_z  -  dered_i 8
airmass_u  -  dered_z 9
airmass_g  -  airmass_u 10
airmass_r  -  airmass_g 11
airmass_i  -  airmass_r 12
airmass_z  -  airmass_i 13


In [11]:
# Rank of each wave column within its own row (rank taken across columns),
# appended to both frames as '<column>_rank'.

mag_rank_tr = df_train[wave_columns].rank(axis=1).add_suffix('_rank')
mag_rank_tt = df_test[wave_columns].rank(axis=1).add_suffix('_rank')

# Names of the rank columns that were just created.
rank_col = list(mag_rank_tr.columns)

df_train = pd.concat((df_train, mag_rank_tr), axis=1)
df_test = pd.concat((df_test, mag_rank_tt), axis=1)

In [12]:
# Per-band differences between measurement types:
# columns 0-4 hold band - dered_band, columns 5-9 hold band - airmass_band.

# Labels that a later cell assigns to the ten columns built below.
# NOTE(review): the label order here (u_0, u_1, g_0, g_1, ...) does not match
# the order in which the columns are filled (u-dered_u, g-dered_g, ...,
# z-airmass_z), so e.g. the label 'u_1' ends up on the g - dered_g column.
# Left unchanged because the later drop list refers to these labels — fix the
# labels and the drop list together.
diff_col = [band + '_' + str(k) for band in ['u','g','r','i','z'] for k in range(2)]

mag_wave_diff_tr = pd.DataFrame(np.zeros((len(df_train), 10)), index=df_train.index)
mag_wave_diff_tt = pd.DataFrame(np.zeros((len(df_test), 10)), index=df_test.index)

for k in range(10):
    band = wave_columns[k % 5]    # u, g, r, i, z (cycled twice)
    other = wave_columns[5 + k]   # dered_* for k < 5, airmass_* afterwards
    mag_wave_diff_tr.loc[:, k] = df_train[band] - df_train[other]
    mag_wave_diff_tt.loc[:, k] = df_test[band] - df_test[other]
    print(band, ' - ', other, k)

u  -  dered_u 0
g  -  dered_g 1
r  -  dered_r 2
i  -  dered_i 3
z  -  dered_z 4
u  -  airmass_u 5
g  -  airmass_g 6
r  -  airmass_r 7
i  -  airmass_i 8
z  -  airmass_z 9


In [13]:
# Label the difference columns, then append them to the train / test frames.
# Re-running this cell appends the same columns again.
for diff_frame in (mag_wave_diff_tr, mag_wave_diff_tt):
    diff_frame.columns = diff_col

df_train = pd.concat((df_train, mag_wave_diff_tr), axis=1)
df_test = pd.concat((df_test, mag_wave_diff_tt), axis=1)

In [14]:
# Difference between the number of observations and the number of detections.
# It is computed on the raw counts, before nObserve is log-transformed:
# previously it was log1p(nObserve) - nDetect, which mixed a log-scaled count
# with a raw one. 'd_obs_det' is listed in drop_columns, so the model input is
# unaffected by this correction.
df_train['d_obs_det'] = df_train['nObserve'] - df_train['nDetect']
df_test['d_obs_det'] = df_test['nObserve'] - df_test['nDetect']

# Log-transform the observation count (vectorised, instead of a per-element apply).
# NOTE: re-running this cell applies log1p a second time.
df_train['nObserve'] = np.log1p(df_train['nObserve'])
df_test['nObserve'] = np.log1p(df_test['nObserve'])

# dataset 생성

In [15]:
# Columns excluded from the model input (raw and engineered features).
drop_columns = [
    'd_obs_det',
    'g_0',
    'diff_airmass_z_airmass_i',
    'u',
    'airmass_g',
    'airmass_z',
    'nDetect',
    'dered_i_rank',
    'diff_airmass_r_airmass_g',
    'dered_r_rank',
    'dered_g_rank',
    'g_rank',
    'airmass_i_rank',
    'airmass_r_rank',
    'airmass_g_rank',
    'airmass_z_rank',
    'dered_u_rank',
    'r_rank',
    'diff_airmass_u_dered_z',
    'u_rank',
    'z_rank',
    'dered_z_rank',
    'airmass_u_rank',
    'diff_airmass_i_airmass_r',
    'i_rank',
    'airmass_r',
    'z',
]


In [16]:
# Remove the excluded columns from both frames.
df_train = df_train.drop(columns=drop_columns).copy()
df_test = df_test.drop(columns=drop_columns).copy()

In [17]:
# Training features and target
y = df_train['class']
X = df_train.drop(columns='class')

# Test features (same object as df_test, not a copy)
test = df_test

# Hold out 30% of the training data for validation.
# NOTE(review): this split is not stratified, whereas the hyper-parameter
# search below uses StratifiedKFold — confirm whether stratify=y is wanted here.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.3,
    random_state=SEED,
)

# 모델 학습 

In [18]:
# Baseline XGBoost configuration, passed to xgb.XGBClassifier(**xgb_params).
xgb_params = {
    'n_jobs' : -1,
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
    # Spelled 'learning_rate' (not its alias 'eta') so that the hyper-parameter
    # search below, which tunes 'learning_rate', overrides this value instead
    # of leaving a fixed 'eta' alongside the tuned one.
    'learning_rate': 0.3,
    'booster': 'gbtree',
    'tree_method': 'auto',
    # 'multi:softprob' trains the same model as 'multi:softmax' but exposes
    # class probabilities, which the neg_log_loss scoring of the search needs;
    # predict() still returns class labels.
    'objective': 'multi:softprob',
    'num_class': 3,
    'random_state': 2020  # same value as SEED
}

In [19]:
# Baseline model: fit on the training split and report hold-out accuracy.
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)

p = xgb_clf.predict(X_val)
# Arguments in scikit-learn's (y_true, y_pred) order; accuracy is the same either way.
print(accuracy_score(y_val, p))

0.9344217885535077


In [20]:
# # 0.9356

# xgb_clf.fit(X,y)
# p = xgb_clf.predict(test)

# submission = pd.read_csv('../input/dankook/sample_submission.csv')
# submission['class'] = p
# submission.to_csv('submission.csv',index=False, encoding='utf-8-sig')

# 파라미터 튜닝

In [21]:
# Bayesian hyper-parameter search over the baseline XGBoost classifier,
# scored by negative log-loss with 5-fold stratified cross-validation.
bayes_cv_tuner = BayesSearchCV(
    estimator = xgb_clf,
    search_spaces = {
        'learning_rate': (0.01, 1.0),
        # Lower bound is 1: in XGBoost max_depth=0 means "no depth limit",
        # which is not a useful point to sample on a data set of this size.
        'max_depth': (1, 50),
        'max_delta_step': (0, 20),
        'max_bin': (100, 1000),
        'gamma': (0.001, 0.5),
        'subsample': (0.01, 1.0),
        'colsample_bytree': (0.01, 1.0),
        'colsample_bylevel': (0.01, 1.0),
        'min_child_weight': (0, 10),
        'reg_lambda': (0.01, 1000),
        'reg_alpha': (0.01, 1.0),
        # 'scale_pos_weight' is intentionally not searched: it balances
        # positive vs. negative weights for binary targets and would only add
        # a wasted dimension for this 3-class problem.
        'n_estimators': (50, 100),
    },
    cv = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=SEED
    ),
    scoring='neg_log_loss',
    # NOTE(review): the estimator itself is configured with n_jobs=-1 as well;
    # nested parallelism may oversubscribe the CPU — confirm.
    n_jobs = -1,
    # 1000 iterations x 5 folds: expect a very long run.
    n_iter = 1000,
    verbose = 0,
    refit = True,
    random_state = SEED
)

def status_print(optim_result):
    """Status callback during the Bayesian hyper-parameter search.

    Prints how many models have been evaluated so far together with the best
    score and parameters, then writes every CV result collected so far to
    '<EstimatorClassName>_cv_results.csv' in the working directory.

    Parameters
    ----------
    optim_result
        Value handed over by the search on each callback. It is not used here;
        the state is read from the module-level ``bayes_cv_tuner`` instead.
    """
    # All models tested so far, as a DataFrame
    all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

    # The tuner is configured with scoring='neg_log_loss', so the score is
    # labelled as such (it was previously mislabelled as ROC-AUC).
    print('Model #{}\nBest neg_log_loss: {}\nBest params: {}\n'.format(
        len(all_models),
        np.round(bayes_cv_tuner.best_score_, 4),
        bayes_cv_tuner.best_params_
    ))

    # Save all model results; the file is overwritten on every callback
    clf_name = bayes_cv_tuner.estimator.__class__.__name__
    all_models.to_csv(clf_name+"_cv_results.csv")

In [None]:
# Launch the search on the full training data (passed as NumPy arrays);
# status_print is supplied as the progress callback.
result = bayes_cv_tuner.fit(
    X.values,
    y.values,
    callback=status_print,
)