In [67]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [68]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

# FE
from scipy.signal import find_peaks, peak_widths, peak_prominences

from sklearn.model_selection import train_test_split

import gc
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from sklearn.metrics import log_loss, matthews_corrcoef, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import contextlib

from sklearn.metrics import accuracy_score

from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold

In [69]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

# 데이터 로드

In [70]:
data_dir = Path('../input/dankook')
sub_dir = Path('../output/')

train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
sub_file = sub_dir / 'submission.csv'

SEED = 2020

In [71]:
df_train = pd.read_csv(train_file, index_col=0)
df_test = pd.read_csv(test_file, index_col=0)

# 이상치 제거

In [63]:
# test의 MinMax 범위 넘는 행은 train에서 제거
train_shape = df_train.shape[0]
print('original raws count:', train_shape)

for col in df_train.columns[:18]:
    df_train = df_train.loc[np.logical_and(df_train[col]>df_test[col].min(),
                            df_train[col]<df_test[col].max())]

print('제거된 행 개수 :', train_shape - df_train.shape[0])

original raws count: 320000
제거된 행 개수 : 106590


In [64]:
df_train.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class
count,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0
mean,20.0045,18.5556,17.8432,17.403,17.1534,0.058677,19.7917,18.3895,17.7284,17.3178,17.0898,8.973,8.7045,1.1835,1.1845,1.1826,1.183,1.184,1.037
std,1.9489,1.646,1.443,1.3047,1.313,0.25417,1.896,1.6224,1.4325,1.2993,1.2997,9.8583,9.4803,0.1093,0.1111,0.1076,0.1084,0.1102,0.9307
min,6.4308,11.8227,11.0519,7.7115,-9.2548,-19.245,-22.204,-13.1527,-4.7775,0.071,3.8435,2.0,2.0,1.0003,1.0002,1.0006,1.0004,1.0002,0.0
25%,18.8132,17.5576,16.9529,16.5881,16.3491,5.8421e-06,18.6537,17.4289,16.8602,16.5173,16.2944,2.0,2.0,1.1103,1.1101,1.1095,1.1101,1.1102,0.0
50%,19.5555,18.2459,17.6135,17.2326,16.9989,0.04151,19.3847,18.1175,17.5268,17.1654,16.9478,3.0,3.0,1.1861,1.1864,1.1859,1.186,1.1862,1.0
75%,20.7681,19.3689,18.7027,18.2387,17.9381,0.094244,20.5102,19.1788,18.5661,18.1316,17.8596,16.0,16.0,1.2299,1.2296,1.2305,1.2306,1.2296,2.0
max,47.3354,46.3383,31.9638,31.8865,46.2408,44.62,30.7779,30.318,30.4149,30.5509,28.571,43.0,41.0,2.0491,2.0786,2.0205,2.0347,2.0637,2.0


In [65]:
df_test.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,19.8598,18.4371,17.7397,17.3065,17.149,0.060083,19.635,18.2638,17.6193,17.2178,16.8702,6.3499,6.1578,1.176,1.1767,1.1753,1.1756,1.1763
std,1.9543,1.703,1.4786,1.3207,24.6431,0.34684,1.921,1.6657,1.476,1.3324,35.4367,8.8728,8.5509,0.1164,0.1182,0.1148,0.1156,0.1173
min,-0.3385,-51.1753,-5.4387,5.3267,-39.5272,-30.149,-30.6337,-18.656,-8.756,-3.6973,-9999.0,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0001
25%,18.7273,17.478,16.882,16.5255,16.2882,3.3645e-05,18.5709,17.3522,16.7906,16.4562,16.231,1.0,1.0,1.0881,1.0883,1.0877,1.0879,1.0882
50%,19.4261,18.1451,17.5299,17.156,16.9212,0.047115,19.2674,18.0247,17.4487,17.0912,16.8733,2.0,2.0,1.1794,1.1793,1.1794,1.1794,1.1793
75%,20.4344,19.0793,18.4345,18.008,17.7333,0.094769,20.1999,18.8948,18.2952,17.9109,17.6578,5.0,5.0,1.2278,1.2262,1.2294,1.2289,1.2269
max,56.8471,94.3591,46.6913,33.0259,6976.3922,46.39,30.8899,30.9529,31.6536,30.9478,28.6441,44.0,42.0,2.0502,2.0797,2.0216,2.0358,2.0648


# 변수 생성

ver1: 앞뒤 컬럼의 차이를 변수로 생성

ver2: 5개 컬럼 전의 컬럼과의 차이까지 변수로 생성

In [7]:
wave_columns = df_train.columns.drop(['nObserve','nDetect','class','redshift'])

## Ver1

In [8]:
for j in range(14):
    name = 'diff_' + str(wave_columns[j+1]) + '_' + str(wave_columns[j])
    df_train[name] = df_train[wave_columns[j+1]] - df_train[wave_columns[j]]
    df_test[name] = df_test[wave_columns[j+1]] - df_test[wave_columns[j]]
    print(wave_columns[j+1], ' - ', wave_columns[j], j)

g  -  u 0
r  -  g 1
i  -  r 2
z  -  i 3
dered_u  -  z 4
dered_g  -  dered_u 5
dered_r  -  dered_g 6
dered_i  -  dered_r 7
dered_z  -  dered_i 8
airmass_u  -  dered_z 9
airmass_g  -  airmass_u 10
airmass_r  -  airmass_g 11
airmass_i  -  airmass_r 12
airmass_z  -  airmass_i 13


In [9]:
# 15포인트 랭킹

mag_rank_tr = df_train[wave_columns].rank(axis=1)
mag_rank_tt = df_test[wave_columns].rank(axis=1)

rank_col = []
for col in df_train[wave_columns].columns:
    col = col + '_rank'
    rank_col.append(col)
mag_rank_tr.columns = rank_col
mag_rank_tt.columns = rank_col

df_train = pd.concat([df_train, mag_rank_tr], axis=1)
df_test = pd.concat([df_test,mag_rank_tt], axis=1)

In [10]:
# 측정방법별 파장 차이 비교 변수

diff_col = []
for col in ['u','g','r','i','z']:
    for i in range(2):
        diff_col.append(col + '_' + str(i))

mag_wave_diff_tr = pd.DataFrame(np.zeros((df_train.shape[0], 10)), index=df_train.index)
mag_wave_diff_tt = pd.DataFrame(np.zeros((df_test.shape[0],10)), index=df_test.index)

for i in range(0,10,5):
    for j in range(5):
        mag_wave_diff_tr.loc[:, j+i] = df_train[wave_columns[j]] - df_train[wave_columns[5+j+i]]
        mag_wave_diff_tt.loc[:, j+i] = df_test[wave_columns[j]] - df_test[wave_columns[5+j+i]]
        print(wave_columns[j], ' - ', wave_columns[5+j+i],i+j)

u  -  dered_u 0
g  -  dered_g 1
r  -  dered_r 2
i  -  dered_i 3
z  -  dered_z 4
u  -  airmass_u 5
g  -  airmass_g 6
r  -  airmass_r 7
i  -  airmass_i 8
z  -  airmass_z 9


In [11]:
mag_wave_diff_tr.columns = diff_col
mag_wave_diff_tt.columns = diff_col

df_train = pd.concat([df_train, mag_wave_diff_tr], axis=1)
df_test = pd.concat([df_test, mag_wave_diff_tt], axis=1)



In [12]:
df_train['nObserve'] = df_train['nObserve'].apply(np.log1p)
df_test['nObserve'] = df_test['nObserve'].apply(np.log1p)

df_train['d_obs_det'] = df_train['nObserve'] - df_train['nDetect']
df_test['d_obs_det'] = df_test['nObserve'] - df_test['nDetect']

# feature 선택

In [13]:
df_train.head()

Unnamed: 0_level_0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class,diff_g_u,diff_r_g,diff_i_r,diff_z_i,diff_dered_u_z,diff_dered_g_dered_u,diff_dered_r_dered_g,diff_dered_i_dered_r,diff_dered_z_dered_i,diff_airmass_u_dered_z,diff_airmass_g_airmass_u,diff_airmass_r_airmass_g,diff_airmass_i_airmass_r,diff_airmass_z_airmass_i,u_rank,g_rank,r_rank,i_rank,z_rank,dered_u_rank,dered_g_rank,dered_r_rank,dered_i_rank,dered_z_rank,airmass_u_rank,airmass_g_rank,airmass_r_rank,airmass_i_rank,airmass_z_rank,u_0,u_1,g_0,g_1,r_0,r_1,i_0,i_1,z_0,z_1,d_obs_det
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1
0,23.264,20.3368,19.0095,17.6724,16.9396,-8.1086e-05,23.1243,20.2578,18.9551,17.6321,16.9089,2.9444,18,1.1898,1.1907,1.189,1.1894,1.1902,0,-2.9272,-1.3273,-1.3371,-0.7328,6.1847,-2.8665,-1.3027,-1.323,-0.7232,-15.7192,0.000917,-0.001702,0.000376,0.000851,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,3.0,5.0,1.0,2.0,4.0,0.1397,0.079,0.0544,0.0403,0.0307,22.0742,19.1461,17.8205,16.4831,15.7494,-15.0556
2,16.7864,15.8254,15.5363,15.3935,15.35,0.00047198,16.6076,15.6866,15.44,15.3217,15.2961,1.0986,2,1.0225,1.0241,1.021,1.0217,1.0233,0,-0.961,-0.2891,-0.1428,-0.0435,1.2576,-0.9211,-0.2466,-0.1183,-0.0257,-14.2736,0.001606,-0.003122,0.000747,0.001561,15.0,13.0,11.0,9.0,8.0,14.0,12.0,10.0,7.0,6.0,3.0,5.0,1.0,2.0,4.0,0.1787,0.1388,0.0963,0.0718,0.054,15.7639,14.8013,14.5153,14.3718,14.3267,-0.9014
3,25.6606,21.1887,20.2212,19.8949,19.6346,5.8143e-06,25.3536,20.9947,20.0873,19.7947,19.5552,1.6094,3,1.2054,1.2061,1.2049,1.2051,1.2057,0,-4.4719,-0.9676,-0.3262,-0.2603,5.719,-4.359,-0.9074,-0.2926,-0.2395,-18.3498,0.000659,-0.001184,0.000246,0.000592,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,3.0,5.0,1.0,2.0,4.0,0.307,0.1941,0.1339,0.1003,0.0795,24.4552,19.9827,19.0163,18.6898,18.4289,-1.3906
4,24.4534,20.6992,19.0424,18.3242,17.9826,-3.3247e-05,23.7714,20.4338,18.863,18.1903,17.8759,2.6391,12,1.1939,1.1943,1.1937,1.1938,1.1941,0,-3.7543,-1.6568,-0.7182,-0.3415,5.7888,-3.3376,-1.5709,-0.6727,-0.3144,-16.682,0.000339,-0.000547,8.8e-05,0.000273,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,3.0,5.0,1.0,2.0,4.0,0.682,0.2653,0.1794,0.1339,0.1067,23.2595,19.5049,17.8486,17.1303,16.7885,-9.3609
6,23.9063,20.8472,19.9784,19.4952,19.0526,-4.2489e-05,23.8458,20.7,19.8766,19.4205,19.004,1.0986,2,1.2019,1.202,1.202,1.2019,1.202,0,-3.0591,-0.8688,-0.4832,-0.4426,4.7933,-3.1458,-0.8234,-0.4561,-0.4165,-17.8021,9.5e-05,-5.6e-05,-3.6e-05,2.7e-05,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,1.0,5.0,4.0,2.0,3.0,0.0604,0.1472,0.1018,0.0747,0.0486,22.7044,19.6452,18.7765,18.2933,17.8506,-0.9014


In [14]:
delete_column=['u','g','r','i','nDetect',
               'airmass_g', 'airmass_r', 'airmass_i', 'airmass_z',
               'dered_u','dered_g','dered_r','dered_i',
               'diff_airmass_g_airmass_u','diff_airmass_r_airmass_g',
               'diff_airmass_i_airmass_r',
               'airmass_u_rank', 'airmass_g_rank', 'airmass_r_rank',
               'airmass_i_rank',
              ]

In [15]:
df_train.drop(delete_column, axis=1, inplace=True)
df_test.drop(delete_column, axis=1, inplace=True)

In [16]:
df_train.shape

(213410, 39)

# dataset 생성

In [17]:
# train set
X = df_train.drop(columns=['class'], axis=1)
y = df_train.loc[:,'class']

# test set
test = df_test

# train set split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=SEED)

# test

In [18]:
X.describe()

Unnamed: 0,z,redshift,dered_z,nObserve,airmass_u,diff_g_u,diff_r_g,diff_i_r,diff_z_i,diff_dered_u_z,diff_dered_g_dered_u,diff_dered_r_dered_g,diff_dered_i_dered_r,diff_dered_z_dered_i,diff_airmass_u_dered_z,diff_airmass_z_airmass_i,u_rank,g_rank,r_rank,i_rank,z_rank,dered_u_rank,dered_g_rank,dered_r_rank,dered_i_rank,dered_z_rank,airmass_z_rank,u_0,u_1,g_0,g_1,r_0,r_1,i_0,i_1,z_0,z_1,d_obs_det
count,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0,213410.0
mean,17.1534,0.058677,17.0898,1.8749,1.1835,-1.4489,-0.7124,-0.4402,-0.2496,2.6383,-1.4021,-0.6611,-0.4106,-0.228,-15.9063,0.0009,14.8001,12.7945,10.9384,9.1474,7.5303,13.9136,11.6924,9.7783,7.9018,6.503,3.2838,0.2128,0.166,0.1147,0.0851,0.0636,18.821,17.3711,16.6605,16.2199,15.9694,-6.8296
std,1.313,0.25417,1.2997,0.8787,0.1093,0.772,0.516,0.5009,0.4294,1.5794,0.7186,0.4978,0.4921,0.3874,1.295,0.0041,0.8525,1.0261,0.5238,0.8266,1.5734,0.9995,1.2738,0.7943,0.89,1.5255,0.9354,0.4871,0.3137,0.2196,0.179,0.2163,1.9402,1.6388,1.4369,1.2991,1.3079,8.6368
min,-9.2548,-19.245,3.8435,1.0986,1.0003,-28.6788,-31.5613,-13.6538,-25.5422,-40.0274,-13.0835,-12.4786,-12.377,-14.3327,-27.4961,-0.011,6.0,6.0,6.0,6.0,1.0,1.0,2.0,3.0,4.0,6.0,1.0,-18.5095,-10.2906,-10.8168,-16.2843,-33.894,5.2695,10.5299,9.8624,6.3893,-10.5755,-37.2388
25%,16.3491,5.8421e-06,16.2944,1.0986,1.1103,-1.6402,-0.8877,-0.4586,-0.3209,1.6931,-1.6021,-0.841,-0.4339,-0.3016,-16.6569,-0.0012,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,2.0,0.0769,0.0701,0.0486,0.0359,0.0231,17.6403,16.3829,15.7794,15.4132,15.1717,-12.909
50%,16.9989,0.04151,16.9478,1.3863,1.1861,-1.2808,-0.6338,-0.3612,-0.2194,2.3219,-1.2401,-0.5896,-0.3375,-0.2002,-15.7785,0.0008,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,4.0,0.1484,0.112,0.0774,0.0577,0.0442,18.3877,17.0785,16.448,16.0649,15.8305,-1.6137
75%,17.9381,0.094244,17.8596,2.8332,1.2299,-1.073,-0.4306,-0.2374,-0.102,3.1257,-1.0382,-0.3851,-0.2097,-0.0801,-15.1164,0.0032,15.0,13.0,11.0,9.0,7.0,14.0,12.0,10.0,8.0,6.0,4.0,0.2573,0.1817,0.1252,0.0933,0.076,19.56,18.1588,17.4956,17.0307,16.7342,-0.9014
max,46.2408,44.62,28.571,3.7842,2.0491,15.5273,14.1219,14.1786,27.886,26.7982,12.164,11.7109,13.5664,13.6082,-2.6569,0.029,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,8.0,47.7979,36.7719,25.285,18.7776,23.4402,45.8892,44.8819,30.9159,30.6921,45.2331,1.6109


In [19]:
# Xgboost
xgb_params = {
    'n_jobs' : -1, 
    'n_estimators': 100,
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'gamma': 0,
    'tree_method': 'auto',
    'objective': 'reg:squarederror',
    'random_state': 2020
}



In [20]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train,y_train)
p = xgb_clf.predict(X_test)
print(accuracy_score(p,y_test))

0.9343829561251425


In [21]:
# 0.9333 제출파일

xgb_clf.fit(X,y)
p = xgb_clf.predict(test)
submission = pd.read_csv('../input/dankook/sample_submission.csv')

submission['class'] = p
submission.to_csv('submission.csv',index=False, encoding='utf-8-sig')