In [48]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [49]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import TruncatedSVD, FastICA

import xgboost as xgb


import eli5
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances

In [50]:
import pandas as pd # 데이터 분석
import numpy as np # 행렬 연산, version: 1.6.1

import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import sklearn
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD, PCA, FastICA, FactorAnalysis, KernelPCA, DictionaryLearning
from sklearn.decomposition import IncrementalPCA, LatentDirichletAllocation,MiniBatchSparsePCA, SparsePCA

import itertools

In [51]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

# 데이터 로드

In [52]:
data_dir = Path('../input/dankook')
sub_dir = Path('../output/')

train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
sub_file = sub_dir / 'submission.csv'

target_column = 'class'
SEED = 2020

In [53]:
df_train = pd.read_csv(train_file, index_col=0)
df_test = pd.read_csv(test_file, index_col=0)

# 이상치 제거

In [54]:
# test의 MinMax 범위 넘는 행은 train에서 제거
train_shape = df_train.shape[0]

for col in df_train.columns[:18]:
    df_train = df_train.loc[np.logical_and(df_train[col]>=df_test[col].min(),
                            df_train[col]<=df_test[col].max())]

print('제거된 행 개수 :', train_shape - df_train.shape[0])

제거된 행 개수 : 77


In [55]:
df_train.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class
count,319923.0,319923.0,319923.0,319923.0,319923.0,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0
mean,19.8525,18.4318,17.7352,17.3037,17.0603,0.060481,19.6298,18.2591,17.6158,17.2151,16.9944,6.3334,6.1396,1.1758,1.1765,1.1751,1.1754,1.1761,1.1165
std,1.9392,1.6598,1.4586,1.3141,1.331,0.2911,1.9114,1.6594,1.4672,1.325,1.3232,8.8817,8.5522,0.1163,0.1181,0.1147,0.1155,0.1171,0.9234
min,2.2651,-12.4441,7.7314,7.7115,-9.2548,-25.915,-30.6337,-18.656,-8.756,-3.6973,0.2159,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0002,0.0
25%,18.7244,17.475,16.8773,16.524,16.289,3.5724e-05,18.5639,17.3486,16.7874,16.4532,16.2343,1.0,1.0,1.0883,1.0885,1.0878,1.0881,1.0883,0.0
50%,19.4195,18.1405,17.5259,17.1498,16.9177,0.047153,19.2647,18.0225,17.4434,17.0874,16.8694,2.0,2.0,1.1794,1.1792,1.1794,1.1794,1.1793,1.0
75%,20.432,19.0728,18.4279,18.0074,17.7288,0.094606,20.1976,18.8883,18.2908,17.907,17.6555,5.0,5.0,1.2275,1.226,1.2292,1.2286,1.2268,2.0
max,49.1436,46.3383,45.1299,32.8634,52.6127,44.62,30.7779,30.6132,31.294,30.5509,28.571,44.0,42.0,2.0491,2.0786,2.0205,2.0347,2.0637,2.0


In [56]:
df_test.describe()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,19.8598,18.4371,17.7397,17.3065,17.149,0.060083,19.635,18.2638,17.6193,17.2178,16.8702,6.3499,6.1578,1.176,1.1767,1.1753,1.1756,1.1763
std,1.9543,1.703,1.4786,1.3207,24.6431,0.34684,1.921,1.6657,1.476,1.3324,35.4367,8.8728,8.5509,0.1164,0.1182,0.1148,0.1156,0.1173
min,-0.3385,-51.1753,-5.4387,5.3267,-39.5272,-30.149,-30.6337,-18.656,-8.756,-3.6973,-9999.0,1.0,1.0,1.0001,1.0001,1.0002,1.0002,1.0001
25%,18.7273,17.478,16.882,16.5255,16.2882,3.3645e-05,18.5709,17.3522,16.7906,16.4562,16.231,1.0,1.0,1.0881,1.0883,1.0877,1.0879,1.0882
50%,19.4261,18.1451,17.5299,17.156,16.9212,0.047115,19.2674,18.0247,17.4487,17.0912,16.8733,2.0,2.0,1.1794,1.1793,1.1794,1.1794,1.1793
75%,20.4344,19.0793,18.4345,18.008,17.7333,0.094769,20.1999,18.8948,18.2952,17.9109,17.6578,5.0,5.0,1.2278,1.2262,1.2294,1.2289,1.2269
max,56.8471,94.3591,46.6913,33.0259,6976.3922,46.39,30.8899,30.9529,31.6536,30.9478,28.6441,44.0,42.0,2.0502,2.0797,2.0216,2.0358,2.0648


# 변수 생성 

In [57]:
for i in ['u','g','r','i','z']:
    df_train[f'ugriz_{i}'] = df_train[i]
    df_test[f'ugriz_{i}'] = df_test[i]
df_train.drop(columns=['u','g','r','i','z'], axis=1, inplace=True)
df_test.drop(columns=['u','g','r','i','z'], axis=1, inplace=True)

In [58]:
ugriz_col = [c for c in df_train.columns if c.find('ugriz') != -1]
dered_col = [c for c in df_train.columns if c.find('dered') != -1]
airmass_col = [c for c in df_train.columns if c.find('airmass') != -1]

In [59]:
# zip 함수를 이용하여 각 Row별, Magnitude별, max, min, max-min, std, sum을 구한다.
for prefix , g in zip(['ugriz','dered','airmass'], [ugriz_col, dered_col, airmass_col]):
    df_train[f'{prefix}_max'] = df_train[g].max(axis=1)
    df_test[f'{prefix}_max'] = df_test[g].max(axis=1)
    
    df_train[f'{prefix}_min'] = df_train[g].min(axis=1)
    df_test[f'{prefix}_min'] = df_test[g].min(axis=1)
    
    df_train[f'{prefix}_diff'] = df_train[f'{prefix}_max'] - df_train[f'{prefix}_min']
    df_test[f'{prefix}_diff'] = df_test[f'{prefix}_max'] - df_test[f'{prefix}_min']
    
    df_train[f'{prefix}_std'] = df_train[g].std(axis=1)
    df_test[f'{prefix}_std'] = df_test[g].std(axis=1)
    
    df_train[f'{prefix}_sum'] = df_train[g].sum(axis=1)
    df_test[f'{prefix}_sum'] = df_test[g].sum(axis=1)

In [60]:
# diff feature 추가 예: z - i
# itertools combinations을 활용하여 전체 magnitude에서 diff를 구함
# 총 105가지 조합이 나옴. 여기서 안 좋은 것은 permutation importance를 활용하여 제거할 예정
diff_feature = []
for c1, c2 in itertools.combinations(ugriz_col[::-1]+dered_col[::-1]
                                     +airmass_col[::-1],2):
    new_c = f'{c1}_{c2}_diff'
    df_train[new_c] = df_train[c1]-df_train[c2]
    df_test[new_c] = df_test[c1]-df_test[c2]
    diff_feature.append(new_c)
    

In [61]:
# 각 maginitude 별 max-max, min-min, sum-sum 을 구함

for c in itertools.combinations(['ugriz','dered','airmass'],2):
    df_train[f'{c[0]}_{c[1]}_max_diff'] = df_train[f'{c[0]}_max'] - df_train[f'{c[1]}_max']
    df_test[f'{c[0]}_{c[1]}_max_diff'] = df_test[f'{c[0]}_max'] - df_test[f'{c[1]}_max']
    
    df_train[f'{c[0]}_{c[1]}_min_diff'] = df_train[f'{c[0]}_min'] - df_train[f'{c[1]}_min']
    df_test[f'{c[0]}_{c[1]}_min_diff'] = df_test[f'{c[0]}_min'] - df_test[f'{c[1]}_min']
    
    df_train[f'{c[0]}_{c[1]}_sum_diff'] = df_train[f'{c[0]}_sum'] - df_train[f'{c[1]}_sum']
    df_test[f'{c[0]}_{c[1]}_sum_diff'] = df_test[f'{c[0]}_sum'] - df_test[f'{c[1]}_sum']

In [62]:
# 아래와 같은 공식들을 구현함
# http://classic.sdss.org/dr4/algorithms/sdssUBVRITransform.html 

def make_2flux_feature(train, test, c1, c2, func, mag_list=None):
    
    for c in mag_list:
        x=train[f'{c}_{c1}'].values
        y=train[f'{c}_{c2}'].values
        train[f'{c}_{func.__name__}'] = func(x,y)
        
        x=test[f'{c}_{c1}'].values
        y=test[f'{c}_{c2}'].values
        test[f'{c}_{func.__name__}'] = func(x,y)
        
def UB_jester(x1,x2):
    return 0.75*(x1-x2)-0.81

make_2flux_feature(df_train,df_test, 'u','g',UB_jester,['ugriz'])

def BV_jester(x1,x2):
    return 0.62*(x1-x2)+0.15

make_2flux_feature(df_train,df_test, 'g','r',BV_jester,['ugriz'])

def VR_jester(x1,x2):
    return 0.38*(x1-x2)+0.27

make_2flux_feature(df_train, df_test, 'r','i',VR_jester,['ugriz'])

def RcIc_jester(x1,x2):
    return 0.72*(x1-x2)+0.27

make_2flux_feature(df_train, df_test, 'r','i', RcIc_jester, ['ugriz'])

def B_jester(x1,x2):
    return x2+0.17*(x1-x2)+0.11

make_2flux_feature(df_train,df_test, 'u','g', B_jester, ['ugriz'])

def V_jester(x1,x2):
    return x1-0.52*(x1-x2)-0.03

make_2flux_feature(df_train,df_test, 'g','r',V_jester, ['ugriz'])

In [63]:
# https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy

# 아래 나열된 모든 ugriz 등급은 달리 명시되지 않는 한 dereddened PSF mag입니다.
# dered_X 사용하면 될듯, 일단은 잘 몰라서 ugriz_X 버전과 dered_X 버전을 비교해서 사용하면 될듯.

# ugriz_X 버전

all_data = pd.concat([df_train,df_test], axis=0)
all_data.fillna(-1, inplace=True)

all_data['ugriz_icolor'] = all_data['ugriz_u']*(-0.436) + all_data['ugriz_g']*(1.129) + all_data['ugriz_r']*(-0.119) + all_data['ugriz_i']*(-0.574) + 0.1984

all_data['ugriz_ucolor'] = all_data['ugriz_u']*(-0.249) + all_data['ugriz_g']*(0.794) + all_data['ugriz_r']*(-0.555) + 0.234

all_data['ugriz_p1'] = (all_data['ugriz_u']-all_data['ugriz_g'])*(0.91) + (all_data['ugriz_g']-all_data['ugriz_r'])*(0.415)- 1.280

all_data['ugriz_r_std_div'] = all_data['ugriz_r']/all_data['ugriz_r'].std()

In [64]:
# https://www.sdss.org/dr16/algorithms/legacy_target_selection/
# 멸망과 관련 있는 거라고 함.
# dered_X 가 어떤 곳에서는 멸망과 관련 있는 데이터였음.
# ugriz_X를 붉어짐에 대한 걸로 사용하다면, 여기서는 dered_X를 멸망과 관련 있는 것과 사용하면 될듯

all_data['dered_orthogonal'] = (all_data['dered_r']-all_data['dered_i'])-(all_data['dered_g']-all_data['dered_r'])/4-0.18

all_data['dered_parallel'] = 0.7*(all_data['dered_g']-all_data['dered_r']) + 1.2*((all_data['dered_r']-all_data['dered_i'])-0.18)

In [65]:
# https://www.sdss.org/dr12/algorithms/magnitudes/
# 문서에는 psfMag에 대한 데이터인데, 그것을 나는 ugriz로 사용하고 있으니까 urgiz로 사용

color_list = ['u', 'g', 'r', 'i', 'z']
b_list = [1.4*10e-10, 0.9*10e-10, 1.2*10e-10, 1.8*10e-10, 7.4*10e-10]
f0_list = [24.63, 25.11, 24.80, 24.36, 22.83]
for c, b, f0 in zip(color_list, b_list, f0_list):
    all_data[f'ugriz_{c}_asinh'] = -2.5*np.log(10)*(np.arcsinh((all_data[f'ugriz_{c}']/f0)/(2*b))+np.log(b))

In [66]:
all_data['nObserve'] = all_data['nObserve'].apply(np.log1p)
all_data['d_obs_det'] = all_data['nObserve'] - all_data['nDetect']

# 데이터셋 분리

In [20]:
# train set
X = all_data.loc[all_data['class'] != -1 , :]
X.drop(columns='class',inplace=True,axis=1)
y = all_data.loc[all_data['class'] != -1, 'class']
y.astype(int)

# test set
test = all_data.loc[all_data['class'] == -1, :]
test.drop(columns='class', inplace=True,axis=1)

# train set split
SEED = 2020
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state = SEED)

In [21]:
df_train.shape, df_test.shape, X.shape, y.shape, test.shape

((319923, 154), (80000, 153), (319923, 165), (319923,), (80000, 165))

In [22]:
X.describe()

Unnamed: 0,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,ugriz_u,ugriz_g,ugriz_r,ugriz_i,ugriz_z,ugriz_max,ugriz_min,ugriz_diff,ugriz_std,ugriz_sum,dered_max,dered_min,dered_diff,dered_std,dered_sum,airmass_max,airmass_min,airmass_diff,airmass_std,airmass_sum,ugriz_z_ugriz_i_diff,ugriz_z_ugriz_r_diff,ugriz_z_ugriz_g_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_i_diff,ugriz_z_dered_r_diff,ugriz_z_dered_g_diff,ugriz_z_dered_u_diff,ugriz_z_airmass_z_diff,ugriz_z_airmass_i_diff,ugriz_z_airmass_r_diff,ugriz_z_airmass_g_diff,ugriz_z_airmass_u_diff,ugriz_i_ugriz_r_diff,ugriz_i_ugriz_g_diff,ugriz_i_ugriz_u_diff,...,dered_r_airmass_g_diff,dered_r_airmass_u_diff,dered_g_dered_u_diff,dered_g_airmass_z_diff,dered_g_airmass_i_diff,dered_g_airmass_r_diff,dered_g_airmass_g_diff,dered_g_airmass_u_diff,dered_u_airmass_z_diff,dered_u_airmass_i_diff,dered_u_airmass_r_diff,dered_u_airmass_g_diff,dered_u_airmass_u_diff,airmass_z_airmass_i_diff,airmass_z_airmass_r_diff,airmass_z_airmass_g_diff,airmass_z_airmass_u_diff,airmass_i_airmass_r_diff,airmass_i_airmass_g_diff,airmass_i_airmass_u_diff,airmass_r_airmass_g_diff,airmass_r_airmass_u_diff,airmass_g_airmass_u_diff,ugriz_dered_max_diff,ugriz_dered_min_diff,ugriz_dered_sum_diff,ugriz_airmass_max_diff,ugriz_airmass_min_diff,ugriz_airmass_sum_diff,dered_airmass_max_diff,dered_airmass_min_diff,dered_airmass_sum_diff,ugriz_UB_jester,ugriz_BV_jester,ugriz_VR_jester,ugriz_RcIc_jester,ugriz_B_jester,ugriz_V_jester,ugriz_icolor,ugriz_ucolor,ugriz_p1,ugriz_r_std_div,dered_orthogonal,dered_parallel,ugriz_u_asinh,ugriz_g_asinh,ugriz_r_asinh,ugriz_i_asinh,ugriz_z_asinh,d_obs_det
count,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319920.0,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,...,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0
mean,0.060481,19.6298,18.2591,17.6158,17.2151,16.9944,1.4862,6.1396,1.1758,1.1765,1.1751,1.1754,1.1761,19.8525,18.4318,17.7352,17.3037,17.0603,19.8847,17.0145,2.8701,1.1633,90.3835,19.6647,16.9415,2.7232,1.1041,89.7141,1.1792,1.1725,0.0067146,0.0026544,5.8791,-0.2434,-0.6749,-1.3715,-2.7921,0.0659,-0.1547,-0.5554,-1.1988,-2.5694,15.8842,15.8849,15.8852,15.8838,15.8846,-0.4315,-1.1281,-2.5487,...,16.4392,16.44,-1.3707,17.083,17.0837,17.084,17.0826,17.0833,18.4536,18.4543,18.4546,18.4532,18.454,0.0007,0.001,-0.0004,0.0004,0.0003,-0.0011,-0.0003,-0.0014,-0.0006,0.0008,0.22,0.0731,0.6694,18.7055,15.8421,84.5044,18.4855,15.769,83.8351,0.2555,0.5819,0.434,0.5807,18.7833,18.0396,0.3094,0.0826,0.3019,12.1259,0.0599,0.7152,1.2671,1.8034,1.9489,1.9851,1.6964,-4.6534
std,0.2911,1.9114,1.6594,1.4672,1.325,1.3232,0.9061,8.5522,0.1163,0.1181,0.1147,0.1155,0.1171,1.9392,1.6598,1.4586,1.3141,1.331,1.9642,1.2381,1.5576,0.6366,7.1184,1.9254,1.2544,1.4738,0.5997,7.1531,0.1183,0.1143,0.0058399,0.0023081,0.5815,0.5068,0.8029,1.1437,1.59,0.2909,0.5392,0.823,1.153,1.571,1.327,1.327,1.327,1.327,1.327,0.5494,0.9157,1.4048,...,1.462,1.462,0.7255,1.6537,1.6536,1.6536,1.6537,1.6536,1.9046,1.9046,1.9046,1.9046,1.9046,0.0044,0.0066,0.0022,0.0022,0.0022,0.0066,0.0022,0.0088,0.0044,0.0044,0.6246,0.2656,1.7878,1.9591,1.2335,7.0936,1.9189,1.249,7.1231,0.5908,0.3572,0.2088,0.3956,1.6847,1.5316,0.5612,0.313,0.8226,0.9972,0.5009,0.8528,0.5376,0.7641,0.4618,0.4308,0.8514,7.6978
min,-25.915,-30.6337,-18.656,-8.756,-3.6973,0.2159,0.6931,1.0,1.0001,1.0001,1.0002,1.0002,1.0002,2.2651,-12.4441,7.7314,7.7115,-9.2548,11.0908,-12.4441,0.0844,0.0342,45.8488,0.2159,-30.6337,0.0779,0.0365,-61.5271,1.0002,1.0001,1.9e-05,8.7006e-06,5.0008,-28.3223,-27.294,-36.3828,-33.975,-33.894,-31.0682,-30.872,-38.7162,-26.7982,-10.5755,-10.566,-10.5614,-10.5804,-10.5707,-28.8183,-30.9932,-35.8869,...,-9.9419,-9.9418,-13.0846,-19.8418,-19.8418,-19.8419,-19.8419,-19.8418,-31.8195,-31.8195,-31.8196,-31.8196,-31.8195,-0.011,-0.0166,-0.0149,-0.0055,-0.0056,-0.0439,-0.0144,-0.0581,-0.0286,-0.0109,-11.1856,-32.3721,-72.1912,9.8281,-13.4702,40.609,-0.97,-31.8195,-67.4564,-13.4578,-21.5338,-5.6792,-11.0021,-7.0215,5.7123,-39.077,-26.8336,-15.3031,5.2861,-17.4669,-16.2214,-3.9765,-3.527,-3.4464,-1.7236,-4.806,-38.1933
25%,3.5724e-05,18.5639,17.3486,16.7874,16.4532,16.2343,0.6931,1.0,1.0883,1.0885,1.0878,1.0881,1.0883,18.7244,17.475,16.8773,16.524,16.289,18.7403,16.2816,1.9019,0.7716,86.111,18.582,16.224,1.7669,0.7189,85.6105,1.0906,1.086,0.001878,0.00074273,5.4415,-0.3149,-0.7559,-1.6103,-3.223,0.0207,-0.2541,-0.6698,-1.477,-3.0409,15.1144,15.1151,15.1154,15.1138,15.1148,-0.4501,-1.3007,-2.9139,...,15.6125,15.6134,-1.553,16.1752,16.1763,16.1766,16.1749,16.1759,17.3909,17.3918,17.3921,17.3907,17.3914,-0.0022,-0.0033,-0.0015,-0.0011,-0.0011,-0.0045,-0.0015,-0.006,-0.0029,-0.0021,0.0684,0.0227,0.23,17.5646,15.1107,80.2482,17.4036,15.0517,79.7333,-0.0064,0.415,0.3621,0.4445,17.8087,17.1433,0.1091,0.002,-0.1303,11.5394,-0.0839,0.3154,1.0757,1.583,1.7095,1.7394,1.4557,-3.2082
50%,0.047153,19.2647,18.0225,17.4434,17.0874,16.8694,1.0986,2.0,1.1794,1.1792,1.1794,1.1794,1.1793,19.4195,18.1405,17.5259,17.1498,16.9177,19.4335,16.9106,2.5007,1.0064,89.295,19.2795,16.8603,2.3725,0.9554,88.8465,1.1806,1.1782,0.005264,0.0020807,5.8969,-0.2195,-0.5751,-1.2016,-2.4892,0.0405,-0.1542,-0.4858,-1.0675,-2.3099,15.7516,15.752,15.752,15.7514,15.7519,-0.3602,-0.9832,-2.2695,...,16.2739,16.2746,-1.2325,16.8553,16.8559,16.8562,16.855,16.8557,18.0995,18.0999,18.1001,18.0992,18.0999,0.0006,0.0009,-0.0004,0.0003,0.0003,-0.001,-0.0003,-0.0013,-0.0006,0.0007,0.1323,0.0426,0.4004,18.2657,15.7474,83.4663,18.1119,15.6975,83.0003,0.1455,0.5377,0.4069,0.5294,18.4719,17.7974,0.2499,0.0651,0.1372,11.9828,0.0093,0.5943,1.3682,1.8715,1.9984,2.0203,1.7253,-0.9014
75%,0.094606,20.1976,18.8883,18.2908,17.907,17.6555,1.7918,5.0,1.2275,1.226,1.2292,1.2286,1.2268,20.432,19.0728,18.4279,18.0074,17.7288,20.4767,17.6943,3.2427,1.3044,93.986,20.2453,17.6149,3.1128,1.2521,93.2392,1.2323,1.223,0.010346,0.0040897,6.1379,-0.1067,-0.3555,-0.7902,-1.8881,0.0699,-0.0259,-0.2491,-0.6406,-1.695,16.5339,16.5351,16.5357,16.5334,16.5345,-0.2423,-0.6743,-1.7638,...,17.0906,17.0925,-1.0372,17.6972,17.6978,17.6983,17.6967,17.6974,19.0142,19.015,19.0154,19.0137,19.0146,0.003,0.0044,0.0011,0.0015,0.0015,0.0032,0.0011,0.0043,0.0022,0.003,0.2328,0.0733,0.6742,19.283,16.5046,87.9772,19.0567,16.4277,87.2366,0.3883,0.6812,0.4411,0.5941,19.42,18.7172,0.3752,0.1324,0.522,12.5996,0.0568,0.8593,1.5781,2.0867,2.2155,2.2343,1.9433,-0.3069
max,44.62,30.7779,30.6132,31.294,30.5509,28.571,3.8067,42.0,2.0491,2.0786,2.0205,2.0347,2.0637,49.1436,46.3383,45.1299,32.8634,52.6127,52.6127,23.8072,37.4356,16.8871,164.4997,31.294,23.1122,30.8496,12.4473,136.5719,2.0786,2.0205,0.058064,0.02295,10.2467,35.1829,31.1184,34.3737,29.8174,30.5841,35.4233,31.1726,35.0874,47.0651,51.4029,51.3988,51.3967,51.405,51.4009,15.6557,37.4356,14.7756,...,30.2152,30.2189,12.3501,29.509,29.5046,29.5023,29.5111,29.5068,29.6683,29.6671,29.6664,29.6689,29.6677,0.029,0.0432,0.0054,0.0146,0.0142,0.0164,0.0055,0.022,0.0111,0.0295,26.2366,47.0651,164.5976,51.3967,22.661,157.2684,30.2152,21.8963,131.021,25.7255,19.718,11.2209,21.0192,46.6178,35.1872,22.8994,17.0388,30.901,30.8562,16.9173,17.6126,13.7369,235.7573,6.7095,6.6213,210.3448,2.7377


In [23]:
test.describe()

Unnamed: 0,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,ugriz_u,ugriz_g,ugriz_r,ugriz_i,ugriz_z,ugriz_max,ugriz_min,ugriz_diff,ugriz_std,ugriz_sum,dered_max,dered_min,dered_diff,dered_std,dered_sum,airmass_max,airmass_min,airmass_diff,airmass_std,airmass_sum,ugriz_z_ugriz_i_diff,ugriz_z_ugriz_r_diff,ugriz_z_ugriz_g_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_i_diff,ugriz_z_dered_r_diff,ugriz_z_dered_g_diff,ugriz_z_dered_u_diff,ugriz_z_airmass_z_diff,ugriz_z_airmass_i_diff,ugriz_z_airmass_r_diff,ugriz_z_airmass_g_diff,ugriz_z_airmass_u_diff,ugriz_i_ugriz_r_diff,ugriz_i_ugriz_g_diff,ugriz_i_ugriz_u_diff,...,dered_r_airmass_g_diff,dered_r_airmass_u_diff,dered_g_dered_u_diff,dered_g_airmass_z_diff,dered_g_airmass_i_diff,dered_g_airmass_r_diff,dered_g_airmass_g_diff,dered_g_airmass_u_diff,dered_u_airmass_z_diff,dered_u_airmass_i_diff,dered_u_airmass_r_diff,dered_u_airmass_g_diff,dered_u_airmass_u_diff,airmass_z_airmass_i_diff,airmass_z_airmass_r_diff,airmass_z_airmass_g_diff,airmass_z_airmass_u_diff,airmass_i_airmass_r_diff,airmass_i_airmass_g_diff,airmass_i_airmass_u_diff,airmass_r_airmass_g_diff,airmass_r_airmass_u_diff,airmass_g_airmass_u_diff,ugriz_dered_max_diff,ugriz_dered_min_diff,ugriz_dered_sum_diff,ugriz_airmass_max_diff,ugriz_airmass_min_diff,ugriz_airmass_sum_diff,dered_airmass_max_diff,dered_airmass_min_diff,dered_airmass_sum_diff,ugriz_UB_jester,ugriz_BV_jester,ugriz_VR_jester,ugriz_RcIc_jester,ugriz_B_jester,ugriz_V_jester,ugriz_icolor,ugriz_ucolor,ugriz_p1,ugriz_r_std_div,dered_orthogonal,dered_parallel,ugriz_u_asinh,ugriz_g_asinh,ugriz_r_asinh,ugriz_i_asinh,ugriz_z_asinh,d_obs_det
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,...,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,0.060083,19.635,18.2638,17.6193,17.2178,16.8702,1.4903,6.1578,1.176,1.1767,1.1753,1.1756,1.1763,19.8598,18.4371,17.7397,17.3065,17.149,19.9839,17.0133,2.9706,1.2082,90.4921,19.6729,16.8157,2.8572,1.164,89.6061,1.1794,1.1727,0.0067318,0.0026612,5.8799,-0.1575,-0.5908,-1.2882,-2.7109,0.2787,-0.0689,-0.4704,-1.1148,-2.486,15.9726,15.9733,15.9736,15.9722,15.973,-0.4333,-1.1306,-2.5533,...,16.4426,16.4434,-1.3712,17.0875,17.0882,17.0885,17.087,17.0878,18.4586,18.4593,18.4596,18.4582,18.459,0.0007,0.001,-0.0004,0.0004,0.0003,-0.0011,-0.0003,-0.0014,-0.0006,0.0008,0.3109,0.1975,0.886,18.8045,15.8406,84.6122,18.4936,15.6431,83.7262,0.2571,0.5824,0.4346,0.5819,18.789,18.0445,0.31,0.0824,0.3041,12.129,0.0604,0.717,1.2678,1.806,1.9531,1.9843,1.6981,-4.6675
std,0.34684,1.921,1.6657,1.476,1.3324,35.4367,0.9055,8.5509,0.1164,0.1182,0.1148,0.1156,0.1173,1.9543,1.703,1.4786,1.3207,24.6431,24.6764,1.2616,24.6651,11.0261,25.5608,1.9296,35.4342,35.425,15.8383,36.2002,0.1185,0.1144,0.00586,0.002316,0.5822,24.6299,24.6382,24.6502,24.6773,60.0184,24.6307,24.6389,24.6522,24.6755,24.642,24.642,24.642,24.6419,24.642,0.582,0.9922,1.4322,...,1.4704,1.4703,0.7405,1.6596,1.6596,1.6596,1.6596,1.6596,1.9139,1.9139,1.9139,1.9139,1.9139,0.0044,0.0066,0.0022,0.0022,0.0022,0.0066,0.0022,0.0088,0.0044,0.0044,24.6229,35.3974,60.0523,24.6751,1.2568,25.5492,1.9228,35.4349,36.1981,0.6725,0.4412,0.2211,0.419,1.7155,1.55,0.7041,0.4422,0.8741,1.0109,0.5114,0.8599,0.9151,1.2688,1.2135,0.4324,1.1297,7.6974
min,-30.149,-30.6337,-18.656,-8.756,-3.6973,-9999.0,0.6931,1.0,1.0001,1.0001,1.0002,1.0002,1.0001,-0.3385,-51.1753,-5.4387,5.3267,-39.5272,13.3546,-51.1753,0.1139,0.0419,-52.0917,0.2159,-9999.0,0.1126,0.0461,-9946.0385,1.0002,1.0001,2.1e-05,9.0277e-06,5.0008,-51.1116,-51.5625,-24.4105,-54.5181,-62.2878,-51.0261,-51.4462,-64.4691,-60.5727,-40.7395,-40.7382,-40.7377,-40.7402,-40.7389,-30.6336,-80.4502,-35.612,...,-9.9419,-9.9418,-11.2314,-19.8418,-19.8418,-19.8419,-19.8419,-19.8418,-31.8195,-31.8195,-31.8196,-31.8196,-31.8195,-0.011,-0.0166,-0.0149,-0.0055,-0.0056,-0.0439,-0.0144,-0.0581,-0.0286,-0.0109,-10.6594,-62.6742,-144.2578,11.9876,-52.3858,-58.1505,-0.97,-10000.4062,-9953.1315,-48.0496,-39.0406,-9.1022,-17.4878,-39.817,-18.3358,-72.1962,-50.8115,-25.4325,-3.7185,-14.3826,-13.2165,-4.8147,-7.6206,-3.6422,-1.752,-32.9397,-38.1933
25%,3.3645e-05,18.5709,17.3522,16.7906,16.4562,16.231,0.6931,1.0,1.0881,1.0883,1.0877,1.0879,1.0882,18.7273,17.478,16.882,16.5255,16.2882,18.7428,16.2793,1.9076,0.7743,86.1228,18.5863,16.2232,1.7724,0.7218,85.6429,1.0905,1.0858,0.001881,0.00074431,5.4406,-0.3154,-0.7566,-1.6134,-3.2249,0.021,-0.255,-0.6715,-1.4804,-3.0447,15.114,15.1152,15.1155,15.1134,15.1145,-0.4508,-1.303,-2.9173,...,15.6131,15.6139,-1.5568,16.1776,16.1781,16.1788,16.1774,16.1778,17.3965,17.3975,17.398,17.3959,17.3968,-0.0022,-0.0033,-0.0015,-0.0011,-0.0011,-0.0046,-0.0015,-0.006,-0.003,-0.0021,0.0684,0.0229,0.2308,17.5703,15.1098,80.2652,17.4075,15.0527,79.7463,-0.0058,0.4157,0.3624,0.445,17.81,17.1463,0.1099,0.0023,-0.1276,11.5426,-0.0828,0.3175,1.075,1.5811,1.7075,1.7392,1.4543,-3.2082
50%,0.047115,19.2674,18.0247,17.4487,17.0912,16.8733,1.0986,2.0,1.1794,1.1793,1.1794,1.1794,1.1793,19.4261,18.1451,17.5299,17.156,16.9212,19.4419,16.9142,2.5099,1.0103,89.3158,19.2848,16.8648,2.3794,0.9584,88.8656,1.1806,1.1782,0.005295,0.0020931,5.8969,-0.2202,-0.5772,-1.2076,-2.4986,0.0406,-0.1547,-0.4886,-1.0703,-2.3161,15.7563,15.7568,15.7567,15.7563,15.7566,-0.3613,-0.9872,-2.2754,...,16.2793,16.2793,-1.2327,16.86,16.8606,16.8606,16.8597,16.8603,18.1018,18.1019,18.102,18.1019,18.1018,0.0006,0.0009,-0.0004,0.0003,0.0003,-0.001,-0.0003,-0.0013,-0.0006,0.0007,0.1323,0.0427,0.4005,18.2741,15.7523,83.4951,18.1145,15.7019,83.0353,0.1463,0.5399,0.4073,0.5301,18.4785,17.7997,0.2504,0.0655,0.1386,11.9856,0.0097,0.5977,1.3663,1.8701,1.9971,2.0182,1.7241,-0.9014
75%,0.094769,20.1999,18.8948,18.2952,17.9109,17.6578,1.7918,5.0,1.2278,1.2262,1.2294,1.2289,1.2269,20.4344,19.0793,18.4345,18.008,17.7333,20.4825,17.6966,3.2476,1.3063,94.034,20.2549,17.6172,3.1196,1.2545,93.2762,1.2325,1.2233,0.010345,0.0040893,6.1393,-0.1067,-0.3562,-0.7938,-1.893,0.07,-0.0255,-0.2499,-0.6429,-1.6985,16.5364,16.5375,16.5372,16.5359,16.5373,-0.2431,-0.6768,-1.7662,...,17.0991,17.1001,-1.0377,17.7048,17.7054,17.7055,17.7039,17.7052,19.0148,19.0157,19.0156,19.0141,19.0152,0.003,0.0045,0.001,0.0015,0.0015,0.0032,0.0011,0.0043,0.0022,0.0031,0.2326,0.0733,0.6749,19.288,16.5039,88.0251,19.063,16.4264,87.2768,0.3916,0.6828,0.4413,0.5946,19.4251,18.7195,0.3752,0.1323,0.5261,12.604,0.0571,0.8616,1.5772,2.0857,2.2139,2.2337,1.9436,-0.3069
max,46.39,30.8899,30.9529,31.6536,30.9478,28.6441,3.8067,42.0,2.0502,2.0797,2.0216,2.0358,2.0648,56.8471,94.3591,46.6913,33.0259,6976.3922,6976.3922,23.2894,6964.0436,3113.9797,7029.6812,31.6536,22.7629,10013.6713,4477.6101,129.0685,2.0797,2.0216,0.058093,0.022962,10.2521,6964.0436,6963.6681,6962.9626,6961.6056,16975.3922,6964.0922,6963.7347,6963.0596,6961.7209,6974.9675,6974.9799,6974.9861,6974.961,6974.9737,24.6637,62.7597,15.1183,...,30.3648,30.3592,14.5549,29.7932,29.7876,29.7848,29.796,29.7904,29.6498,29.6429,29.6394,29.6532,29.6463,0.029,0.0432,0.0054,0.0146,0.0142,0.0164,0.0055,0.0219,0.0111,0.0295,6961.7209,10011.3486,16975.7197,6974.961,22.0102,7022.5882,30.3536,21.6493,123.8048,48.8147,49.6976,11.9108,22.3262,83.7615,52.773,83.3487,59.3271,32.6989,31.9239,17.7349,15.2929,210.033,243.897,227.752,8.751,218.7023,2.6636


# feature 선택

In [24]:
# Xgboost
xgb_params = {
    'n_jobs' : -1, 
    'n_estimators': 100,
    'eval_metric': 'mlogloss',
    'eta': 0.3, # learning_rate
    'booster': 'gbtree',
    'tree_method': 'auto',
    'objective': 'multi:softmax',
    'num_class': 3,
    'random_state': 2020
}

In [25]:
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train,y_train)
p = xgb_clf.predict(X_val)
print(accuracy_score(p,y_val))

0.9344113693905831


In [26]:
perm = PermutationImportance(xgb_clf, random_state=2020).fit(X_val,y_val)

In [27]:
eli5.show_weights(perm, top = 200, feature_names = X_val.columns.tolist())

Weight,Feature
0.4358  ± 0.0010,redshift
0.0637  ± 0.0012,dered_orthogonal
0.0167  ± 0.0013,dered_g_dered_u_diff
0.0036  ± 0.0002,dered_z_dered_i_diff
0.0035  ± 0.0007,ugriz_r_dered_u_diff
0.0030  ± 0.0003,ugriz_ucolor
0.0026  ± 0.0002,dered_r_dered_g_diff
0.0025  ± 0.0007,dered_r_dered_u_diff
0.0018  ± 0.0002,dered_i
0.0009  ± 0.0002,ugriz_icolor


In [28]:
feature_importances=pd.DataFrame(perm.feature_importances_, index=X_val.columns.tolist())
top = feature_importances.loc[feature_importances[0]>0.0000]
best_feature = top.index.tolist()

print(top.shape, best_feature)

(111, 1) ['redshift', 'dered_g', 'dered_i', 'dered_z', 'nObserve', 'airmass_u', 'airmass_g', 'airmass_r', 'airmass_i', 'ugriz_r', 'ugriz_i', 'ugriz_z', 'ugriz_max', 'ugriz_diff', 'ugriz_std', 'ugriz_sum', 'dered_max', 'dered_diff', 'dered_sum', 'airmass_max', 'airmass_min', 'airmass_diff', 'airmass_std', 'airmass_sum', 'ugriz_z_ugriz_i_diff', 'ugriz_z_ugriz_r_diff', 'ugriz_z_ugriz_u_diff', 'ugriz_z_dered_z_diff', 'ugriz_z_dered_i_diff', 'ugriz_z_dered_r_diff', 'ugriz_z_dered_g_diff', 'ugriz_z_dered_u_diff', 'ugriz_z_airmass_z_diff', 'ugriz_z_airmass_i_diff', 'ugriz_z_airmass_r_diff', 'ugriz_z_airmass_u_diff', 'ugriz_i_ugriz_u_diff', 'ugriz_i_dered_z_diff', 'ugriz_i_dered_i_diff', 'ugriz_i_dered_g_diff', 'ugriz_i_dered_u_diff', 'ugriz_i_airmass_z_diff', 'ugriz_i_airmass_i_diff', 'ugriz_i_airmass_u_diff', 'ugriz_r_ugriz_g_diff', 'ugriz_r_ugriz_u_diff', 'ugriz_r_dered_i_diff', 'ugriz_r_dered_g_diff', 'ugriz_r_dered_u_diff', 'ugriz_r_airmass_i_diff', 'ugriz_g_ugriz_u_diff', 'ugriz_g_dered_

In [29]:
feature_importances=pd.DataFrame(perm.feature_importances_, index=X_val.columns.tolist())
top = feature_importances.loc[feature_importances[0]>0.0001]
best_feature = top.index.tolist()

print(top.shape, best_feature)

(67, 1) ['redshift', 'dered_g', 'dered_i', 'dered_z', 'nObserve', 'airmass_u', 'airmass_g', 'airmass_r', 'airmass_i', 'ugriz_r', 'ugriz_i', 'ugriz_z', 'ugriz_max', 'ugriz_diff', 'ugriz_std', 'ugriz_sum', 'dered_max', 'dered_sum', 'airmass_max', 'airmass_min', 'airmass_std', 'airmass_sum', 'ugriz_z_ugriz_i_diff', 'ugriz_z_ugriz_r_diff', 'ugriz_z_ugriz_u_diff', 'ugriz_z_dered_z_diff', 'ugriz_z_dered_i_diff', 'ugriz_z_dered_r_diff', 'ugriz_i_ugriz_u_diff', 'ugriz_i_dered_z_diff', 'ugriz_i_dered_g_diff', 'ugriz_i_dered_u_diff', 'ugriz_i_airmass_i_diff', 'ugriz_r_ugriz_g_diff', 'ugriz_r_ugriz_u_diff', 'ugriz_r_dered_i_diff', 'ugriz_r_dered_u_diff', 'ugriz_r_airmass_i_diff', 'ugriz_g_ugriz_u_diff', 'ugriz_g_dered_r_diff', 'ugriz_g_dered_g_diff', 'ugriz_g_dered_u_diff', 'ugriz_u_dered_z_diff', 'ugriz_u_dered_r_diff', 'dered_z_dered_i_diff', 'dered_z_dered_g_diff', 'dered_z_airmass_g_diff', 'dered_i_dered_r_diff', 'dered_i_dered_g_diff', 'dered_i_dered_u_diff', 'dered_i_airmass_z_diff', 'dered

In [30]:
feature_importances=pd.DataFrame(perm.feature_importances_, index=X_val.columns.tolist())
top = feature_importances.loc[feature_importances[0]>0.0002]
best_feature = top.index.tolist()

print(top.shape, best_feature)

(41, 1) ['redshift', 'dered_i', 'nObserve', 'airmass_u', 'airmass_g', 'airmass_i', 'ugriz_sum', 'dered_sum', 'airmass_min', 'airmass_sum', 'ugriz_z_ugriz_i_diff', 'ugriz_z_ugriz_u_diff', 'ugriz_z_dered_z_diff', 'ugriz_z_dered_i_diff', 'ugriz_z_dered_r_diff', 'ugriz_i_ugriz_u_diff', 'ugriz_i_dered_z_diff', 'ugriz_i_dered_g_diff', 'ugriz_i_dered_u_diff', 'ugriz_r_ugriz_g_diff', 'ugriz_r_dered_i_diff', 'ugriz_r_dered_u_diff', 'ugriz_r_airmass_i_diff', 'ugriz_g_ugriz_u_diff', 'ugriz_g_dered_u_diff', 'dered_z_dered_i_diff', 'dered_z_dered_g_diff', 'dered_z_airmass_g_diff', 'dered_i_dered_r_diff', 'dered_i_dered_u_diff', 'dered_r_dered_g_diff', 'dered_r_dered_u_diff', 'dered_g_dered_u_diff', 'dered_g_airmass_z_diff', 'airmass_z_airmass_i_diff', 'ugriz_B_jester', 'ugriz_V_jester', 'ugriz_icolor', 'ugriz_ucolor', 'ugriz_p1', 'dered_orthogonal']


In [31]:
feature_importances=pd.DataFrame(perm.feature_importances_, index=X_val.columns.tolist())
top = feature_importances.loc[feature_importances[0]>0.0003]
best_feature = top.index.tolist()

print(top.shape, best_feature)

(27, 1) ['redshift', 'dered_i', 'airmass_u', 'airmass_g', 'airmass_i', 'airmass_min', 'ugriz_z_ugriz_i_diff', 'ugriz_z_dered_i_diff', 'ugriz_z_dered_r_diff', 'ugriz_i_ugriz_u_diff', 'ugriz_i_dered_g_diff', 'ugriz_i_dered_u_diff', 'ugriz_r_ugriz_g_diff', 'ugriz_r_dered_u_diff', 'ugriz_g_dered_u_diff', 'dered_z_dered_i_diff', 'dered_z_dered_g_diff', 'dered_z_airmass_g_diff', 'dered_i_dered_r_diff', 'dered_r_dered_g_diff', 'dered_r_dered_u_diff', 'dered_g_dered_u_diff', 'airmass_z_airmass_i_diff', 'ugriz_V_jester', 'ugriz_icolor', 'ugriz_ucolor', 'dered_orthogonal']


In [32]:
feature_importances=pd.DataFrame(perm.feature_importances_, index=X_val.columns.tolist())
top = feature_importances.loc[feature_importances[0]>0.0004]
best_feature = top.index.tolist()

print(top.shape, best_feature)

(20, 1) ['redshift', 'dered_i', 'airmass_u', 'airmass_i', 'airmass_min', 'ugriz_z_dered_r_diff', 'ugriz_i_dered_g_diff', 'ugriz_r_dered_u_diff', 'ugriz_g_dered_u_diff', 'dered_z_dered_i_diff', 'dered_z_dered_g_diff', 'dered_i_dered_r_diff', 'dered_r_dered_g_diff', 'dered_r_dered_u_diff', 'dered_g_dered_u_diff', 'airmass_z_airmass_i_diff', 'ugriz_V_jester', 'ugriz_icolor', 'ugriz_ucolor', 'dered_orthogonal']


# 차원 축소까지 적용

permutation importance 중요도가 0보다 큰 것들 가지고 표준편차 값 구하고
차원 축소 적용 해보기

In [67]:
feature_importances=pd.DataFrame(perm.feature_importances_, index=X_val.columns.tolist())
top = feature_importances.loc[feature_importances[0]>0.0001]
best_feature = top.index.tolist()

In [69]:
intersect_good_feature = list(set(diff_feature).intersection(set(best_feature)))
all_data['diff_feature_std'] = all_data[intersect_good_feature].std(axis=1)

In [70]:
df_train = all_data.loc[all_data['class'] != -1 , :]
df_test = all_data.loc[all_data['class'] == -1, :]
df_test.drop(columns='class', inplace=True,axis=1)

In [74]:
df_train.describe()

Unnamed: 0,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class,ugriz_u,ugriz_g,ugriz_r,ugriz_i,ugriz_z,ugriz_max,ugriz_min,ugriz_diff,ugriz_std,ugriz_sum,dered_max,dered_min,dered_diff,dered_std,dered_sum,airmass_max,airmass_min,airmass_diff,airmass_std,airmass_sum,ugriz_z_ugriz_i_diff,ugriz_z_ugriz_r_diff,ugriz_z_ugriz_g_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_i_diff,ugriz_z_dered_r_diff,ugriz_z_dered_g_diff,ugriz_z_dered_u_diff,ugriz_z_airmass_z_diff,ugriz_z_airmass_i_diff,ugriz_z_airmass_r_diff,ugriz_z_airmass_g_diff,ugriz_z_airmass_u_diff,ugriz_i_ugriz_r_diff,ugriz_i_ugriz_g_diff,...,dered_g_airmass_g_diff,dered_g_airmass_u_diff,dered_u_airmass_z_diff,dered_u_airmass_i_diff,dered_u_airmass_r_diff,dered_u_airmass_g_diff,dered_u_airmass_u_diff,airmass_z_airmass_i_diff,airmass_z_airmass_r_diff,airmass_z_airmass_g_diff,airmass_z_airmass_u_diff,airmass_i_airmass_r_diff,airmass_i_airmass_g_diff,airmass_i_airmass_u_diff,airmass_r_airmass_g_diff,airmass_r_airmass_u_diff,airmass_g_airmass_u_diff,ugriz_dered_max_diff,ugriz_dered_min_diff,ugriz_dered_sum_diff,ugriz_airmass_max_diff,ugriz_airmass_min_diff,ugriz_airmass_sum_diff,dered_airmass_max_diff,dered_airmass_min_diff,dered_airmass_sum_diff,ugriz_UB_jester,ugriz_BV_jester,ugriz_VR_jester,ugriz_RcIc_jester,ugriz_B_jester,ugriz_V_jester,ugriz_icolor,ugriz_ucolor,ugriz_p1,ugriz_r_std_div,dered_orthogonal,dered_parallel,ugriz_u_asinh,ugriz_g_asinh,ugriz_r_asinh,ugriz_i_asinh,ugriz_z_asinh,d_obs_det,diff_feature_std,tsvd5_0,tsvd5_1,tsvd5_2,tsvd5_3,tsvd5_4
count,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319920.0,319920.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,...,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0,319923.0
mean,0.060481,19.6298,18.2591,17.6158,17.2151,16.9944,1.4862,6.1396,1.1758,1.1765,1.1751,1.1754,1.1761,1.1165,19.8525,18.4318,17.7352,17.3037,17.0603,19.8847,17.0145,2.8701,1.1633,90.3835,19.6647,16.9415,2.7232,1.1041,89.7141,1.1792,1.1725,0.0067146,0.0026544,5.8791,-0.2434,-0.6749,-1.3715,-2.7921,0.0659,-0.1547,-0.5554,-1.1988,-2.5694,15.8842,15.8849,15.8852,15.8838,15.8846,-0.4315,-1.1281,...,17.0826,17.0833,18.4536,18.4543,18.4546,18.4532,18.454,0.0007,0.001,-0.0004,0.0004,0.0003,-0.0011,-0.0003,-0.0014,-0.0006,0.0008,0.22,0.0731,0.6694,18.7055,15.8421,84.5044,18.4855,15.769,83.8351,0.2555,0.5819,0.434,0.5807,18.7833,18.0396,0.3094,0.0826,0.3019,12.1259,0.0599,0.7152,1.2671,1.8034,1.9489,1.9851,1.6964,-4.6534,6.1946,57.0962,-0.0354,0.0136,-0.0022,0.0043
std,0.2911,1.9114,1.6594,1.4672,1.325,1.3232,0.9061,8.5522,0.1163,0.1181,0.1147,0.1155,0.1171,0.9234,1.9392,1.6598,1.4586,1.3141,1.331,1.9642,1.2381,1.5576,0.6366,7.1184,1.9254,1.2544,1.4738,0.5997,7.1531,0.1183,0.1143,0.0058399,0.0023081,0.5815,0.5068,0.8029,1.1437,1.59,0.2909,0.5392,0.823,1.153,1.571,1.327,1.327,1.327,1.327,1.327,0.5494,0.9157,...,1.6537,1.6536,1.9046,1.9046,1.9046,1.9046,1.9046,0.0044,0.0066,0.0022,0.0022,0.0022,0.0066,0.0022,0.0088,0.0044,0.0044,0.6246,0.2656,1.7878,1.9591,1.2335,7.0936,1.9189,1.249,7.1231,0.5908,0.3572,0.2088,0.3956,1.6847,1.5316,0.5612,0.313,0.8226,0.9972,0.5009,0.8528,0.5376,0.7641,0.4618,0.4308,0.8514,7.6978,0.6253,4.5018,1.6683,0.6606,0.6052,0.3957
min,-25.915,-30.6337,-18.656,-8.756,-3.6973,0.2159,0.6931,1.0,1.0001,1.0001,1.0002,1.0002,1.0002,0.0,2.2651,-12.4441,7.7314,7.7115,-9.2548,11.0908,-12.4441,0.0844,0.0342,45.8488,0.2159,-30.6337,0.0779,0.0365,-61.5271,1.0002,1.0001,1.9e-05,8.7006e-06,5.0008,-28.3223,-27.294,-36.3828,-33.975,-33.894,-31.0682,-30.872,-38.7162,-26.7982,-10.5755,-10.566,-10.5614,-10.5804,-10.5707,-28.8183,-30.9932,...,-19.8419,-19.8418,-31.8195,-31.8195,-31.8196,-31.8196,-31.8195,-0.011,-0.0166,-0.0149,-0.0055,-0.0056,-0.0439,-0.0144,-0.0581,-0.0286,-0.0109,-11.1856,-32.3721,-72.1912,9.8281,-13.4702,40.609,-0.97,-31.8195,-67.4564,-13.4578,-21.5338,-5.6792,-11.0021,-7.0215,5.7123,-39.077,-26.8336,-15.3031,5.2861,-17.4669,-16.2214,-3.9765,-3.527,-3.4464,-1.7236,-4.806,-38.1933,3.3732,12.73,-19.6899,-17.564,-21.7591,-22.3074
25%,3.5724e-05,18.5639,17.3486,16.7874,16.4532,16.2343,0.6931,1.0,1.0883,1.0885,1.0878,1.0881,1.0883,0.0,18.7244,17.475,16.8773,16.524,16.289,18.7403,16.2816,1.9019,0.7716,86.111,18.582,16.224,1.7669,0.7189,85.6105,1.0906,1.086,0.001878,0.00074273,5.4415,-0.3149,-0.7559,-1.6103,-3.223,0.0207,-0.2541,-0.6698,-1.477,-3.0409,15.1144,15.1151,15.1154,15.1138,15.1148,-0.4501,-1.3007,...,16.1749,16.1759,17.3909,17.3918,17.3921,17.3907,17.3914,-0.0022,-0.0033,-0.0015,-0.0011,-0.0011,-0.0045,-0.0015,-0.006,-0.0029,-0.0021,0.0684,0.0227,0.23,17.5646,15.1107,80.2482,17.4036,15.0517,79.7333,-0.0064,0.415,0.3621,0.4445,17.8087,17.1433,0.1091,0.002,-0.1303,11.5394,-0.0839,0.3154,1.0757,1.583,1.7095,1.7394,1.4557,-3.2082,5.8397,54.4347,-0.987,-0.1213,-0.1567,-0.0422
50%,0.047153,19.2647,18.0225,17.4434,17.0874,16.8694,1.0986,2.0,1.1794,1.1792,1.1794,1.1794,1.1793,1.0,19.4195,18.1405,17.5259,17.1498,16.9177,19.4335,16.9106,2.5007,1.0064,89.295,19.2795,16.8603,2.3725,0.9554,88.8465,1.1806,1.1782,0.005264,0.0020807,5.8969,-0.2195,-0.5751,-1.2016,-2.4892,0.0405,-0.1542,-0.4858,-1.0675,-2.3099,15.7516,15.752,15.752,15.7514,15.7519,-0.3602,-0.9832,...,16.855,16.8557,18.0995,18.0999,18.1001,18.0992,18.0999,0.0006,0.0009,-0.0004,0.0003,0.0003,-0.001,-0.0003,-0.0013,-0.0006,0.0007,0.1323,0.0426,0.4004,18.2657,15.7474,83.4663,18.1119,15.6975,83.0003,0.1455,0.5377,0.4069,0.5294,18.4719,17.7974,0.2499,0.0651,0.1372,11.9828,0.0093,0.5943,1.3682,1.8715,1.9984,2.0203,1.7253,-0.9014,6.073,56.4649,-0.2823,0.0061,-0.0721,0.0141
75%,0.094606,20.1976,18.8883,18.2908,17.907,17.6555,1.7918,5.0,1.2275,1.226,1.2292,1.2286,1.2268,2.0,20.432,19.0728,18.4279,18.0074,17.7288,20.4767,17.6943,3.2427,1.3044,93.986,20.2453,17.6149,3.1128,1.2521,93.2392,1.2323,1.223,0.010346,0.0040897,6.1379,-0.1067,-0.3555,-0.7902,-1.8881,0.0699,-0.0259,-0.2491,-0.6406,-1.695,16.5339,16.5351,16.5357,16.5334,16.5345,-0.2423,-0.6743,...,17.6967,17.6974,19.0142,19.015,19.0154,19.0137,19.0146,0.003,0.0044,0.0011,0.0015,0.0015,0.0032,0.0011,0.0043,0.0022,0.003,0.2328,0.0733,0.6742,19.283,16.5046,87.9772,19.0567,16.4277,87.2366,0.3883,0.6812,0.4411,0.5941,19.42,18.7172,0.3752,0.1324,0.522,12.5996,0.0568,0.8593,1.5781,2.0867,2.2155,2.2343,1.9433,-0.3069,6.4105,59.3492,0.5391,0.1675,0.0338,0.0681
max,44.62,30.7779,30.6132,31.294,30.5509,28.571,3.8067,42.0,2.0491,2.0786,2.0205,2.0347,2.0637,2.0,49.1436,46.3383,45.1299,32.8634,52.6127,52.6127,23.8072,37.4356,16.8871,164.4997,31.294,23.1122,30.8496,12.4473,136.5719,2.0786,2.0205,0.058064,0.02295,10.2467,35.1829,31.1184,34.3737,29.8174,30.5841,35.4233,31.1726,35.0874,47.0651,51.4029,51.3988,51.3967,51.405,51.4009,15.6557,37.4356,...,29.5111,29.5068,29.6683,29.6671,29.6664,29.6689,29.6677,0.029,0.0432,0.0054,0.0146,0.0142,0.0164,0.0055,0.022,0.0111,0.0295,26.2366,47.0651,164.5976,51.3967,22.661,157.2684,30.2152,21.8963,131.021,25.7255,19.718,11.2209,21.0192,46.6178,35.1872,22.8994,17.0388,30.901,30.8562,16.9173,17.6126,13.7369,235.7573,6.7095,6.6213,210.3448,2.7377,19.9178,87.9913,20.9261,29.168,53.5206,25.2676


In [73]:
def get_decomposition_feature(train, test, feature, param, decompose_func, prefix):
    n_components = param['n_components']
    de = decompose_func(**param)
    de_train = de.fit_transform(train[feature])
    de_test = de.transform(test[feature])
    train = pd.concat([train, pd.DataFrame(de_train, columns=[f'{prefix}_{c}' for c in range(n_components)])], axis=1)
    test = pd.concat([test, pd.DataFrame(de_test, columns=[f'{prefix}_{c}' for c in range(n_components)])], axis=1)
    return train, test

org_feature = ugriz_col+dered_col+airmass_col
# decompostion 해서 다시 feature로 추가, 원래 orignal feature만 사용하고 5개로 축소
decom_common_param = {'n_components': 5, 'random_state': 2020}
df_train, df_test = get_decomposition_feature(df_train, df_test, org_feature, decom_common_param, TruncatedSVD, 'tsvd5')
df_train, df_test = get_decomposition_feature(df_train, df_test, org_feature, decom_common_param, FastICA, 'ica5')

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
df_train