In [1]:
# IPython setup: (re)load the autoreload extension in mode 2 and render
# matplotlib figures inline in the notebook output.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb


import eli5
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances



In [3]:
import pandas as pd # 데이터 분석
import numpy as np # 행렬 연산, version: 1.6.1

import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import sklearn
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD, PCA, FastICA, FactorAnalysis, KernelPCA, DictionaryLearning
from sklearn.decomposition import IncrementalPCA, LatentDirichletAllocation,MiniBatchSparsePCA, SparsePCA

import itertools

In [4]:
# Notebook-wide display and plotting defaults.
# The fully qualified key is required: the bare 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns',
# which makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

# NOTE(review): this hides every warning for the rest of the notebook,
# including pandas SettingWithCopy and library deprecation warnings.
warnings.filterwarnings('ignore')

# 데이터 로드

In [5]:
# Input / output directories (relative to the notebook's working directory).
data_dir = Path('../input/dankook')
sub_dir = Path('../output/')

# Competition files live under data_dir; the submission is written to sub_dir.
train_file, test_file, sample_file = (
    data_dir.joinpath(file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
sub_file = sub_dir.joinpath('submission.csv')

# Name of the label column and the random seed used for the run.
target_column, SEED = 'class', 2020

In [6]:
# Read the train and test tables; the first CSV column becomes the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_file, test_file)
)

# 이상치 제거

In [7]:
# Remove from train every row that falls outside the [min, max] range that
# test covers, checked column by column over the first 18 columns.
train_shape = df_train.shape[0]

for col in df_train.columns[:18]:
    lower, upper = df_test[col].min(), df_test[col].max()
    within_test_range = (df_train[col] >= lower) & (df_train[col] <= upper)
    df_train = df_train.loc[within_test_range]

# Report how many rows were dropped (message kept in its original wording).
print('제거된 행 개수 :', train_shape - df_train.shape[0])

제거된 행 개수 : 77


In [8]:
# Summary statistics of the training frame after the range filter.
df_train.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class
count,319923.0,319923.0,319923.0,319923.0,319923.0,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0
mean,19.8525,18.4318,17.7352,17.3037,17.0603,0.060481,19.6298,18.2591,17.6158,17.2151,16.9944,6.3334,6.1396,1.1758,1.1765,1.1751,1.1754,1.1761,1.1165
std,1.9392,1.6598,1.4586,1.3141,1.331,0.2911,1.9114,1.6594,1.4672,1.325,1.3232,8.8817,8.5522,0.1163,0.1181,0.1147,0.1155,0.1171,0.9234
min,2.2651,-12.4441,7.7314,7.7115,-9.2548,-25.915,-30.6337,-18.656,-8.756,-3.6973,0.2159,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0002,0.0
25%,18.7244,17.475,16.8773,16.524,16.289,3.5724e-05,18.5639,17.3486,16.7874,16.4532,16.2343,1.0,1.0,1.0883,1.0885,1.0878,1.0881,1.0883,0.0
50%,19.4195,18.1405,17.5259,17.1498,16.9177,0.047153,19.2647,18.0225,17.4434,17.0874,16.8694,2.0,2.0,1.1794,1.1792,1.1794,1.1794,1.1793,1.0
75%,20.432,19.0728,18.4279,18.0074,17.7288,0.094606,20.1976,18.8883,18.2908,17.907,17.6555,5.0,5.0,1.2275,1.226,1.2292,1.2286,1.2268,2.0
max,49.1436,46.3383,45.1299,32.8634,52.6127,44.62,30.7779,30.6132,31.294,30.5509,28.571,44.0,42.0,2.0491,2.0786,2.0205,2.0347,2.0637,2.0


In [9]:
# Summary statistics of the test frame, for side-by-side comparison with train.
df_test.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,19.8598,18.4371,17.7397,17.3065,17.149,0.060083,19.635,18.2638,17.6193,17.2178,16.8702,6.3499,6.1578,1.176,1.1767,1.1753,1.1756,1.1763
std,1.9543,1.703,1.4786,1.3207,24.6431,0.34684,1.921,1.6657,1.476,1.3324,35.4367,8.8728,8.5509,0.1164,0.1182,0.1148,0.1156,0.1173
min,-0.3385,-51.1753,-5.4387,5.3267,-39.5272,-30.149,-30.6337,-18.656,-8.756,-3.6973,-9999.0,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0001
25%,18.7273,17.478,16.882,16.5255,16.2882,3.3645e-05,18.5709,17.3522,16.7906,16.4562,16.231,1.0,1.0,1.0881,1.0883,1.0877,1.0879,1.0882
50%,19.4261,18.1451,17.5299,17.156,16.9212,0.047115,19.2674,18.0247,17.4487,17.0912,16.8733,2.0,2.0,1.1794,1.1793,1.1794,1.1794,1.1793
75%,20.4344,19.0793,18.4345,18.008,17.7333,0.094769,20.1999,18.8948,18.2952,17.9109,17.6578,5.0,5.0,1.2278,1.2262,1.2294,1.2289,1.2269
max,56.8471,94.3591,46.6913,33.0259,6976.3922,46.39,30.8899,30.9529,31.6536,30.9478,28.6441,44.0,42.0,2.0502,2.0797,2.0216,2.0358,2.0648


# 변수 생성 

In [10]:
# Copy each raw band into a 'ugriz_'-prefixed column (appended at the end of
# the frame), then drop the unprefixed originals from both frames.
raw_bands = ['u', 'g', 'r', 'i', 'z']
for frame in (df_train, df_test):
    for band in raw_bands:
        frame[f'ugriz_{band}'] = frame[band]
    frame.drop(columns=raw_bands, inplace=True)

In [11]:
# Group the training columns by family, matching on a substring of the name.
ugriz_col, dered_col, airmass_col = (
    [name for name in df_train.columns if token in name]
    for token in ('ugriz', 'dered', 'airmass')
)

In [12]:
# For each column family, add the row-wise max, min, range (max - min),
# standard deviation and sum to both train and test.
family_columns = (
    ('ugriz', ugriz_col),
    ('dered', dered_col),
    ('airmass', airmass_col),
)
for prefix, members in family_columns:
    for frame in (df_train, df_test):
        # Snapshot of the family's columns, taken before new columns are added.
        values = frame[members]
        frame[f'{prefix}_max'] = values.max(axis=1)
        frame[f'{prefix}_min'] = values.min(axis=1)
        frame[f'{prefix}_diff'] = frame[f'{prefix}_max'] - frame[f'{prefix}_min']
        frame[f'{prefix}_std'] = values.std(axis=1)
        frame[f'{prefix}_sum'] = values.sum(axis=1)

In [13]:
# Add pairwise difference features, e.g. z - i.
# itertools.combinations enumerates every pair across all magnitude columns
# (105 combinations according to the original note); the weak ones are meant
# to be pruned later with permutation importance.
magnitude_cols = ugriz_col[::-1] + dered_col[::-1] + airmass_col[::-1]
diff_feature = []
for c1, c2 in itertools.combinations(magnitude_cols, 2):
    new_c = f'{c1}_{c2}_diff'
    for frame in (df_train, df_test):
        frame[new_c] = frame[c1] - frame[c2]
    diff_feature.append(new_c)
    

In [14]:
# For every pair of column families, take the difference of their row-wise
# max, min and sum (max - max, min - min, sum - sum).
for first, second in itertools.combinations(['ugriz', 'dered', 'airmass'], 2):
    for stat in ('max', 'min', 'sum'):
        for frame in (df_train, df_test):
            frame[f'{first}_{second}_{stat}_diff'] = (
                frame[f'{first}_{stat}'] - frame[f'{second}_{stat}']
            )

In [15]:
# Implements the formulas listed at
# http://classic.sdss.org/dr4/algorithms/sdssUBVRITransform.html

def make_2flux_feature(train, test, c1, c2, func, mag_list=None):
    """Add a two-band feature to both frames, in place.

    For every prefix ``c`` in ``mag_list`` the columns ``f'{c}_{c1}'`` and
    ``f'{c}_{c2}'`` are passed (as NumPy arrays) to ``func`` and the result is
    stored in the new column ``f'{c}_{func.__name__}'`` of ``train`` and
    ``test``.

    Parameters
    ----------
    train, test : DataFrame
        Frames to extend; both are modified in place.
    c1, c2 : str
        Band suffixes of the two input columns.
    func : callable
        ``func(x, y)`` receiving the two arrays; its ``__name__`` becomes the
        suffix of the output column.
    mag_list : iterable of str, optional
        Column-name prefixes to process. ``None`` falls back to ``['ugriz']``,
        the prefix every call in this notebook uses (previously ``None`` was
        iterated directly and raised ``TypeError``).

    Returns
    -------
    None
    """
    if mag_list is None:
        mag_list = ['ugriz']

    for c in mag_list:
        out_col = f'{c}_{func.__name__}'
        for frame in (train, test):
            x = frame[f'{c}_{c1}'].values
            y = frame[f'{c}_{c2}'].values
            frame[out_col] = func(x, y)
        
def UB_jester(x1,x2):
    """Linear colour transform ``0.75 * (x1 - x2) - 0.81``.

    The notebook applies it to the u and g columns.
    """
    colour = x1 - x2
    return 0.75 * colour - 0.81

# Adds 'ugriz_UB_jester' to train and test from the u and g columns.
make_2flux_feature(df_train, df_test, 'u', 'g', UB_jester, mag_list=['ugriz'])

def BV_jester(x1,x2):
    """Linear colour transform ``0.62 * (x1 - x2) + 0.15``.

    The notebook applies it to the g and r columns.
    """
    colour = x1 - x2
    return 0.62 * colour + 0.15

# Adds 'ugriz_BV_jester' to train and test from the g and r columns.
make_2flux_feature(df_train, df_test, 'g', 'r', BV_jester, mag_list=['ugriz'])

def VR_jester(x1,x2):
    """Linear colour transform ``0.38 * (x1 - x2) + 0.27``.

    The notebook applies it to the r and i columns.
    """
    colour = x1 - x2
    return 0.38 * colour + 0.27

# Adds 'ugriz_VR_jester' to train and test from the r and i columns.
make_2flux_feature(df_train, df_test, 'r', 'i', VR_jester, mag_list=['ugriz'])

def RcIc_jester(x1,x2):
    """Linear colour transform ``0.72 * (x1 - x2) + 0.27``.

    The notebook applies it to the r and i columns.
    """
    colour = x1 - x2
    return 0.72 * colour + 0.27

# Adds 'ugriz_RcIc_jester' to train and test from the r and i columns.
make_2flux_feature(df_train, df_test, 'r', 'i', RcIc_jester, mag_list=['ugriz'])

def B_jester(x1,x2):
    """Magnitude transform ``x2 + 0.17 * (x1 - x2) + 0.11``.

    The notebook applies it with x1 = u and x2 = g.
    """
    colour = x1 - x2
    return x2 + 0.17 * colour + 0.11

# Adds 'ugriz_B_jester' to train and test from the u and g columns.
make_2flux_feature(df_train, df_test, 'u', 'g', B_jester, mag_list=['ugriz'])

def V_jester(x1,x2):
    """Magnitude transform ``x1 - 0.52 * (x1 - x2) - 0.03``.

    The notebook applies it with x1 = g and x2 = r.
    """
    colour = x1 - x2
    return x1 - 0.52 * colour - 0.03

# Adds 'ugriz_V_jester' to train and test from the g and r columns.
make_2flux_feature(df_train, df_test, 'g', 'r', V_jester, mag_list=['ugriz'])

In [16]:
# https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy

# All ugriz magnitudes listed on that page are dereddened PSF magnitudes
# unless stated otherwise, so dered_X is probably the right input. Not sure
# yet: compare the ugriz_X version with a dered_X version.

# ugriz_X version

# Stack train on top of test. The fill replaces every NaN with -1, so test
# rows (which have no 'class' value) end up with class == -1.
all_data = pd.concat([df_train, df_test], axis=0).fillna(-1)

mag_u, mag_g, mag_r, mag_i = (all_data[f'ugriz_{band}'] for band in 'ugri')

all_data['ugriz_icolor'] = (
    mag_u * (-0.436) + mag_g * (1.129) + mag_r * (-0.119) + mag_i * (-0.574) + 0.1984
)

all_data['ugriz_ucolor'] = mag_u * (-0.249) + mag_g * (0.794) + mag_r * (-0.555) + 0.234

all_data['ugriz_p1'] = (mag_u - mag_g) * (0.91) + (mag_g - mag_r) * (0.415) - 1.280

# r band divided by its standard deviation over the combined train + test rows.
all_data['ugriz_r_std_div'] = mag_r / mag_r.std()

In [17]:
# https://www.sdss.org/dr16/algorithms/legacy_target_selection/
# These are said to be related to extinction.
# In some sources dered_X was the extinction-related data.
# Since ugriz_X is used for the reddening-related features, dered_X is used
# here for the extinction-related ones.

dered_g_r = all_data['dered_g'] - all_data['dered_r']
dered_r_i = all_data['dered_r'] - all_data['dered_i']

all_data['dered_orthogonal'] = dered_r_i - dered_g_r / 4 - 0.18

all_data['dered_parallel'] = 0.7 * dered_g_r + 1.2 * (dered_r_i - 0.18)

In [18]:
# https://www.sdss.org/dr12/algorithms/magnitudes/
# The page describes psfMag; the ugriz columns are used in its place here.

color_list = ['u', 'g', 'r', 'i', 'z']
# Per-band softening parameter b, i.e. 1.4 x 10^-10 etc. The previous
# `1.4*10e-10` spelling evaluated to 1.4e-9 (10e-10 == 1e-9), ten times too big.
b_list = [1.4e-10, 0.9e-10, 1.2e-10, 1.8e-10, 7.4e-10]
# NOTE(review): these look like the per-band zero-flux magnitudes from the
# same SDSS table rather than a reference flux, and the inputs below are
# magnitudes rather than flux ratios -- confirm this normalisation is intended.
f0_list = [24.63, 25.11, 24.80, 24.36, 22.83]
for c, b, f0 in zip(color_list, b_list, f0_list):
    ratio = all_data[f'ugriz_{c}'] / f0
    # asinh magnitude: m = -2.5 / ln(10) * (asinh(ratio / (2b)) + ln(b)).
    # The factor is a division by ln(10); it was previously multiplied.
    all_data[f'ugriz_{c}_asinh'] = -2.5 / np.log(10) * (np.arcsinh(ratio / (2 * b)) + np.log(b))

In [19]:
# Take the observe/detect gap on the raw counts first. It used to be computed
# after nObserve had been log-transformed, which subtracted a raw count
# (nDetect) from a log value.
all_data['d_obs_det'] = all_data['nObserve'] - all_data['nDetect']
# NOTE(review): this overwrites nObserve, so re-running the cell applies
# log1p a second time.
all_data['nObserve'] = np.log1p(all_data['nObserve'])

# feature 선택

In [20]:
# Columns kept for modelling; the 'class' target is the last entry.
selected_columns = [
    # raw columns and row-wise aggregates
    'redshift',
    'dered_i',
    'nObserve',
    'airmass_u',
    'airmass_g',
    'airmass_i',
    'ugriz_sum',
    'dered_sum',
    'airmass_min',
    'airmass_sum',
    # pairwise differences
    'ugriz_z_ugriz_i_diff',
    'ugriz_z_ugriz_u_diff',
    'ugriz_z_dered_z_diff',
    'ugriz_z_dered_i_diff',
    'ugriz_z_dered_r_diff',
    'ugriz_i_ugriz_u_diff',
    'ugriz_i_dered_z_diff',
    'ugriz_i_dered_g_diff',
    'ugriz_i_dered_u_diff',
    'ugriz_r_ugriz_g_diff',
    'ugriz_r_dered_i_diff',
    'ugriz_r_dered_u_diff',
    'ugriz_r_airmass_i_diff',
    'ugriz_g_ugriz_u_diff',
    'ugriz_g_dered_u_diff',
    'dered_z_dered_i_diff',
    'dered_z_dered_g_diff',
    'dered_z_airmass_g_diff',
    'dered_i_dered_r_diff',
    'dered_i_dered_u_diff',
    'dered_r_dered_g_diff',
    'dered_r_dered_u_diff',
    'dered_g_dered_u_diff',
    'dered_g_airmass_z_diff',
    'airmass_z_airmass_i_diff',
    # colour transforms
    'ugriz_B_jester',
    'ugriz_V_jester',
    'ugriz_icolor',
    'ugriz_ucolor',
    'ugriz_p1',
    'dered_orthogonal',
    # target
    'class',
]

In [21]:
# Keep only the selected columns, as an independent copy.
# NOTE(review): all_data is overwritten, so the dropped columns are gone from
# this name afterwards.
all_data = all_data.loc[:, selected_columns].copy()

In [22]:
# (rows, columns) of the combined train + test frame after column selection.
all_data.shape

(399923, 42)

In [23]:
# Summary statistics of the combined frame, restricted to the selected columns.
all_data.describe()

Unnamed: 0,redshift,dered_i,nObserve,airmass_u,airmass_g,airmass_i,ugriz_sum,dered_sum,airmass_min,airmass_sum,ugriz_z_ugriz_i_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_i_diff,ugriz_z_dered_r_diff,ugriz_i_ugriz_u_diff,ugriz_i_dered_z_diff,ugriz_i_dered_g_diff,ugriz_i_dered_u_diff,ugriz_r_ugriz_g_diff,ugriz_r_dered_i_diff,ugriz_r_dered_u_diff,ugriz_r_airmass_i_diff,ugriz_g_ugriz_u_diff,ugriz_g_dered_u_diff,dered_z_dered_i_diff,dered_z_dered_g_diff,dered_z_airmass_g_diff,dered_i_dered_r_diff,dered_i_dered_u_diff,dered_r_dered_g_diff,dered_r_dered_u_diff,dered_g_dered_u_diff,dered_g_airmass_z_diff,airmass_z_airmass_i_diff,ugriz_B_jester,ugriz_V_jester,ugriz_icolor,ugriz_ucolor,ugriz_p1,dered_orthogonal,class
count,399920.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0,399923.0
mean,0.060401,17.2156,1.487,1.1758,1.1766,1.1755,90.4052,89.6925,1.1725,5.8792,-0.2262,-2.7759,0.1085,-0.1375,-0.5384,-2.5497,0.3347,-0.9558,-2.3265,-0.6968,0.5205,-1.8947,16.5606,-1.4211,-1.198,-0.246,-1.2905,15.793,-0.4009,-2.4152,-0.6436,-2.0143,-1.3708,17.0839,0.0007,18.7844,18.0405,0.3095,0.0825,0.3023,0.06,0.6931
std,0.30307,1.3264,0.906,0.1163,0.1181,0.1155,13.0856,17.409,0.1143,0.5816,11.0252,11.1283,26.8449,11.0268,11.0445,1.4103,15.8386,0.9324,1.3837,0.6056,0.6235,1.1223,1.4582,0.8107,0.8981,15.8369,15.8692,15.8934,0.5352,1.3472,0.5412,1.0501,0.7285,1.6548,0.0044,1.6909,1.5353,0.5926,0.3428,0.8331,0.5031,1.1828
min,-30.149,-3.6973,0.6931,1.0001,1.0001,1.0002,-52.0917,-9946.0385,1.0001,5.0008,-51.1116,-54.5181,-62.2878,-51.0261,-51.4462,-35.8869,-17.4604,-14.995,-17.2026,-79.9156,-25.7374,-27.6043,-6.6318,-66.1663,-72.2208,-10011.3,-10012.3326,-10000.4312,-14.6183,-17.0175,-15.2991,-16.4937,-13.0846,-19.8418,-0.011,-39.817,-18.3358,-72.1962,-50.8115,-25.4325,-17.4669,-1.0
25%,3.5279e-05,16.454,0.6931,1.0882,1.0885,1.088,86.1133,85.617,1.0859,5.4412,-0.315,-3.2235,0.0208,-0.2543,-0.6702,-2.9146,0.1564,-1.1684,-2.7323,-0.8573,0.3057,-2.2692,15.7069,-1.5986,-1.4307,-0.2965,-1.5268,15.0579,-0.4268,-2.8028,-0.8122,-2.3608,-1.5538,16.1758,-0.0022,17.8089,17.1439,0.1092,0.0021,-0.1298,-0.0837,0.0
50%,0.047148,17.0881,1.0986,1.1794,1.1792,1.1794,89.2995,88.8508,1.1782,5.8969,-0.2196,-2.4909,0.0406,-0.1543,-0.4864,-2.2707,0.2661,-0.8504,-2.0917,-0.6261,0.4203,-1.7264,16.3591,-1.2742,-1.1142,-0.2012,-1.118,15.7038,-0.3379,-2.1603,-0.5822,-1.8173,-1.2325,16.8562,0.0006,18.4731,17.7978,0.25,0.0652,0.1375,0.0094,1.0
75%,0.094638,17.9075,1.7918,1.2276,1.226,1.2286,93.9936,93.2465,1.2231,6.1382,-0.1067,-1.8892,0.0699,-0.0258,-0.2492,-1.7643,0.3684,-0.5282,-1.5758,-0.4276,0.5273,-1.3181,17.2282,-1.0717,-0.8945,-0.0845,-0.6961,16.4608,-0.214,-1.6499,-0.3821,-1.418,-1.0372,17.6987,0.003,19.4211,18.7176,0.3752,0.1324,0.5229,0.0569,2.0
max,46.39,30.9478,3.8067,2.0502,2.0797,2.0358,7029.6812,136.5719,2.0216,10.2521,6964.0436,6961.6056,16975.3922,6964.0922,6963.7347,15.1183,10011.3486,36.7597,48.7374,63.2106,30.6619,51.2309,45.4601,62.9862,63.4692,13.6082,18.8719,27.4969,13.5664,26.9364,14.8045,21.8777,14.5549,29.7932,0.029,83.7615,52.773,83.3487,59.3271,32.6989,17.7349,2.0


# 데이터셋 분리

In [24]:
# train set: rows with a real label (test rows carry the -1 fill value)
is_labelled = all_data['class'] != -1
# drop() returns a new frame, which avoids mutating a slice of all_data in place.
X = all_data.loc[is_labelled].drop(columns='class')
# The cast result is now kept; `y.astype(int)` on its own line was discarded,
# which left the labels as floats.
y = all_data.loc[is_labelled, 'class'].astype(int)

# test set
test = all_data.loc[~is_labelled].drop(columns='class')

# train set split
SEED = 2020
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [25]:
# Shapes of the engineered source frames and of the model inputs, in the
# order: df_train, df_test, X, y, test.
df_train.shape, df_test.shape, X.shape, y.shape, test.shape

((319923, 154), (80000, 153), (319923, 41), (319923,), (80000, 41))

In [26]:
# Summary statistics of the labelled feature matrix.
X.describe()

Unnamed: 0,redshift,dered_i,nObserve,airmass_u,airmass_g,airmass_i,ugriz_sum,dered_sum,airmass_min,airmass_sum,ugriz_z_ugriz_i_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_i_diff,ugriz_z_dered_r_diff,ugriz_i_ugriz_u_diff,ugriz_i_dered_z_diff,ugriz_i_dered_g_diff,ugriz_i_dered_u_diff,ugriz_r_ugriz_g_diff,ugriz_r_dered_i_diff,ugriz_r_dered_u_diff,ugriz_r_airmass_i_diff,ugriz_g_ugriz_u_diff,ugriz_g_dered_u_diff,dered_z_dered_i_diff,dered_z_dered_g_diff,dered_z_airmass_g_diff,dered_i_dered_r_diff,dered_i_dered_u_diff,dered_r_dered_g_diff,dered_r_dered_u_diff,dered_g_dered_u_diff,dered_g_airmass_z_diff,airmass_z_airmass_i_diff,ugriz_B_jester,ugriz_V_jester,ugriz_icolor,ugriz_ucolor,ugriz_p1,dered_orthogonal
count,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0
mean,0.060481,17.2151,1.4862,1.1758,1.1765,1.1754,90.3835,89.7141,1.1725,5.8791,-0.2434,-2.7921,0.0659,-0.1547,-0.5554,-2.5487,0.3093,-0.9554,-2.3261,-0.6966,0.5201,-1.8946,16.5597,-1.4207,-1.198,-0.2206,-1.2647,15.8179,-0.4007,-2.4147,-0.6434,-2.014,-1.3707,17.083,0.0007,18.7833,18.0396,0.3094,0.0826,0.3019,0.0599
std,0.2911,1.325,0.9061,0.1163,0.1181,0.1155,7.1184,7.1531,0.1143,0.5815,0.5068,1.59,0.2909,0.5392,0.823,1.4048,0.5069,0.9286,1.379,0.5761,0.6143,1.1115,1.4542,0.7878,0.8763,0.4517,1.1078,1.3186,0.5339,1.3446,0.5376,1.0457,0.7255,1.6537,0.0044,1.6847,1.5316,0.5612,0.313,0.8226,0.5009
min,-25.915,-3.6973,0.6931,1.0001,1.0001,1.0002,45.8488,-61.5271,1.0001,5.0008,-28.3223,-33.975,-33.894,-31.0682,-30.872,-35.8869,-13.7933,-14.995,-17.2026,-31.5613,-13.9519,-16.7362,6.4311,-35.3807,-32.3721,-14.3327,-17.234,-0.97,-14.6183,-14.8136,-15.2991,-14.9929,-13.0846,-19.8418,-0.011,-7.0215,5.7123,-39.077,-26.8336,-15.3031,-17.4669
25%,3.5724e-05,16.4532,0.6931,1.0883,1.0885,1.0881,86.111,85.6105,1.086,5.4415,-0.3149,-3.223,0.0207,-0.2541,-0.6698,-2.9139,0.1563,-1.1677,-2.7314,-0.8568,0.3054,-2.2687,15.7068,-1.5978,-1.4301,-0.2965,-1.5263,15.0579,-0.4266,-2.8022,-0.8119,-2.3602,-1.553,16.1752,-0.0022,17.8087,17.1433,0.1091,0.002,-0.1303,-0.0839
50%,0.047153,17.0874,1.0986,1.1794,1.1792,1.1794,89.295,88.8465,1.1782,5.8969,-0.2195,-2.4892,0.0405,-0.1542,-0.4858,-2.2695,0.266,-0.8495,-2.0907,-0.6254,0.4202,-1.7259,16.3579,-1.2741,-1.114,-0.2009,-1.117,15.703,-0.3376,-2.1591,-0.5815,-1.8169,-1.2325,16.8553,0.0006,18.4719,17.7974,0.2499,0.0651,0.1372,0.0093
75%,0.094606,17.907,1.7918,1.2275,1.226,1.2286,93.986,93.2392,1.223,6.1379,-0.1067,-1.8881,0.0699,-0.0259,-0.2491,-1.7638,0.3683,-0.5278,-1.5755,-0.4273,0.5271,-1.3176,17.2267,-1.0715,-0.8945,-0.0845,-0.6953,16.4602,-0.2137,-1.649,-0.3819,-1.4173,-1.0372,17.6972,0.003,19.42,18.7172,0.3752,0.1324,0.522,0.0568
max,44.62,30.5509,3.8067,2.0491,2.0786,2.0347,164.4997,136.5719,2.0205,10.2467,35.1829,29.8174,30.5841,35.4233,31.1726,14.7756,17.8855,36.7574,48.7351,34.9739,28.8781,51.2227,44.0092,16.8637,54.5216,13.6082,18.8719,27.4969,13.5664,26.9364,11.7109,21.8777,12.3501,29.509,0.029,46.6178,35.1872,22.8994,17.0388,30.901,16.9173


In [27]:
# Summary statistics of the unlabelled (test) feature matrix.
test.describe()

Unnamed: 0,redshift,dered_i,nObserve,airmass_u,airmass_g,airmass_i,ugriz_sum,dered_sum,airmass_min,airmass_sum,ugriz_z_ugriz_i_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_i_diff,ugriz_z_dered_r_diff,ugriz_i_ugriz_u_diff,ugriz_i_dered_z_diff,ugriz_i_dered_g_diff,ugriz_i_dered_u_diff,ugriz_r_ugriz_g_diff,ugriz_r_dered_i_diff,ugriz_r_dered_u_diff,ugriz_r_airmass_i_diff,ugriz_g_ugriz_u_diff,ugriz_g_dered_u_diff,dered_z_dered_i_diff,dered_z_dered_g_diff,dered_z_airmass_g_diff,dered_i_dered_r_diff,dered_i_dered_u_diff,dered_r_dered_g_diff,dered_r_dered_u_diff,dered_g_dered_u_diff,dered_g_airmass_z_diff,airmass_z_airmass_i_diff,ugriz_B_jester,ugriz_V_jester,ugriz_icolor,ugriz_ucolor,ugriz_p1,dered_orthogonal
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,0.060083,17.2178,1.4903,1.176,1.1767,1.1756,90.4921,89.6061,1.1727,5.8799,-0.1575,-2.7109,0.2787,-0.0689,-0.4704,-2.5533,0.4363,-0.9573,-2.3285,-0.6974,0.5219,-1.8952,16.5641,-1.4227,-1.1978,-0.3476,-1.3935,15.6935,-0.4015,-2.4171,-0.6444,-2.0156,-1.3712,17.0875,0.0007,18.789,18.0445,0.31,0.0824,0.3041,0.0604
std,0.34684,1.3324,0.9055,0.1164,0.1182,0.1156,25.5608,36.2002,0.1144,0.5822,24.6299,24.6773,60.0184,24.6307,24.6389,1.4322,35.3983,0.9475,1.4022,0.7116,0.6589,1.1647,1.4738,0.8966,0.9802,35.3975,35.412,35.4374,0.5405,1.3579,0.5555,1.0679,0.7405,1.6596,0.0044,1.7155,1.55,0.7041,0.4422,0.8741,0.5114
min,-30.149,-3.6973,0.6931,1.0001,1.0001,1.0002,-52.0917,-9946.0385,1.0001,5.0008,-51.1116,-54.5181,-62.2878,-51.0261,-51.4462,-35.612,-17.4604,-13.3576,-16.9809,-79.9156,-25.7374,-27.6043,-6.6318,-66.1663,-72.2208,-10011.3,-10012.3326,-10000.4312,-14.2138,-17.0175,-13.0229,-16.4937,-11.2314,-19.8418,-0.011,-39.817,-18.3358,-72.1962,-50.8115,-25.4325,-14.3826
25%,3.3645e-05,16.4562,0.6931,1.0881,1.0883,1.0879,86.1228,85.6429,1.0858,5.4406,-0.3154,-3.2249,0.021,-0.255,-0.6715,-2.9173,0.1567,-1.1714,-2.7348,-0.8593,0.3071,-2.2711,15.707,-1.6022,-1.433,-0.2968,-1.5288,15.0581,-0.4273,-2.8048,-0.8136,-2.3636,-1.5568,16.1776,-0.0022,17.81,17.1463,0.1099,0.0023,-0.1276,-0.0828
50%,0.047115,17.0912,1.0986,1.1794,1.1793,1.1794,89.3158,88.8656,1.1782,5.8969,-0.2202,-2.4986,0.0406,-0.1547,-0.4886,-2.2754,0.2666,-0.854,-2.0948,-0.6289,0.4207,-1.7289,16.3638,-1.275,-1.1151,-0.2023,-1.122,15.7068,-0.3388,-2.1645,-0.5848,-1.819,-1.2327,16.86,0.0006,18.4785,17.7997,0.2504,0.0655,0.1386,0.0097
75%,0.094769,17.9109,1.7918,1.2278,1.2262,1.2289,94.034,93.2762,1.2233,6.1393,-0.1067,-1.893,0.07,-0.0255,-0.2499,-1.7662,0.3688,-0.5301,-1.5768,-0.4286,0.5283,-1.3199,17.2346,-1.0723,-0.8945,-0.0849,-0.6987,16.4633,-0.2149,-1.6533,-0.3828,-1.4208,-1.0377,17.7048,0.003,19.4251,18.7195,0.3752,0.1323,0.5261,0.0571
max,46.39,30.9478,3.8067,2.0502,2.0797,2.0358,7029.6812,129.0685,2.0216,10.2521,6964.0436,6961.6056,16975.3922,6964.0922,6963.7347,15.1183,10011.3486,36.7597,48.7374,63.2106,30.6619,51.2309,45.4601,62.9862,63.4692,12.417,18.8719,27.3017,11.5039,26.9364,14.8045,21.8777,14.5549,29.7932,0.029,83.7615,52.773,83.3487,59.3271,32.6989,17.7349


# 모델 학습

In [28]:
# XGBoost hyper-parameters, unpacked into xgb.XGBClassifier.
# NOTE(review): 'objective' and 'num_class' are set by hand; confirm that
# 'multi:softmax' is what is wanted if class probabilities are needed later.
xgb_params = {
    'n_jobs' : -1,
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'tree_method': 'auto',
    'objective': 'multi:softmax',
    'num_class': 3,
    # Shares the notebook-wide seed instead of repeating the literal 2020.
    'random_state': SEED
}

In [29]:
# Fit on the training split and report accuracy on the hold-out split.
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)
p = xgb_clf.predict(X_val)
# accuracy_score takes (y_true, y_pred); the arguments used to be swapped.
# The score is the same either way, but the call now matches the signature.
print(accuracy_score(y_val, p))

0.9344738843681298
