In [1]:
# Load (or reload) the autoreload extension and set mode 2 so that imported
# modules are re-imported before each cell is executed.
%reload_ext autoreload
%autoreload 2
# Draw matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import xgboost as xgb


import eli5
from eli5.sklearn import PermutationImportance
from eli5.permutation_importance import get_score_importances



In [3]:
import pandas as pd # 데이터 분석
import numpy as np # 행렬 연산, version: 1.6.1

import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgbm
import sklearn
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD, PCA, FastICA, FactorAnalysis, KernelPCA, DictionaryLearning
from sklearn.decomposition import IncrementalPCA, LatentDirichletAllocation,MiniBatchSparsePCA, SparsePCA

import itertools

In [4]:
# Notebook-wide display configuration.
# Use the fully qualified option name: the bare pattern 'max_columns' also
# matches 'styler.render.max_columns' on pandas >= 1.4 and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment and deprecation warnings.
warnings.filterwarnings('ignore')

# 데이터 로드

In [5]:
# Input / output locations, relative to the notebook's working directory.
data_dir = Path('..', 'input', 'dankook')
sub_dir = Path('..', 'output')

# Competition files read from data_dir and the submission written to sub_dir.
train_file = data_dir.joinpath('train.csv')
test_file = data_dir.joinpath('test.csv')
sample_file = data_dir.joinpath('sample_submission.csv')
sub_file = sub_dir.joinpath('submission.csv')

# Name of the label column and the global random seed.
target_column = 'class'
SEED = 2020

In [6]:
# Read the train and test CSVs; the first column of each file becomes the index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_file, test_file)
)

In [7]:
# Copy the raw u/g/r/i/z columns to 'ugriz_<band>' (appended at the end of each
# frame) and then remove the single-letter originals.
raw_bands = ['u', 'g', 'r', 'i', 'z']
for frame in (df_train, df_test):
    for band in raw_bands:
        frame[f'ugriz_{band}'] = frame[band]
    frame.drop(columns=raw_bands, inplace=True)

# 변수 생성 

In [8]:
# Group the training columns by family; a column belongs to a family when the
# family name occurs anywhere in the column name.
ugriz_col, dered_col, airmass_col = (
    [name for name in df_train.columns if family in name]
    for family in ('ugriz', 'dered', 'airmass')
)

In [9]:
# For every row, summarise each column family (ugriz / dered / airmass) with
# its max, min, range (max - min), standard deviation and sum.
column_groups = {'ugriz': ugriz_col, 'dered': dered_col, 'airmass': airmass_col}
for prefix, cols in column_groups.items():
    for frame in (df_train, df_test):
        family_values = frame[cols]
        frame[f'{prefix}_max'] = family_values.max(axis=1)
        frame[f'{prefix}_min'] = family_values.min(axis=1)
        frame[f'{prefix}_diff'] = frame[f'{prefix}_max'] - frame[f'{prefix}_min']
        frame[f'{prefix}_std'] = family_values.std(axis=1)
        frame[f'{prefix}_sum'] = family_values.sum(axis=1)

In [10]:
# Pairwise difference features, e.g. z - i.
# itertools.combinations over all 15 columns (each family in reversed order)
# yields 105 pairs. Weak ones are meant to be pruned later with permutation
# importance.
ordered_columns = [*reversed(ugriz_col), *reversed(dered_col), *reversed(airmass_col)]
diff_feature = []
for left, right in itertools.combinations(ordered_columns, 2):
    diff_name = f'{left}_{right}_diff'
    for frame in (df_train, df_test):
        frame[diff_name] = frame[left] - frame[right]
    diff_feature.append(diff_name)
    

In [11]:
# For each pair of column families, take the difference of their row-wise
# aggregates: max - max, min - min and sum - sum.
for first, second in itertools.combinations(['ugriz','dered','airmass'], 2):
    for stat in ('max', 'min', 'sum'):
        for frame in (df_train, df_test):
            frame[f'{first}_{second}_{stat}_diff'] = (
                frame[f'{first}_{stat}'] - frame[f'{second}_{stat}']
            )

In [12]:
# Implements the transformation formulas listed at
# http://classic.sdss.org/dr4/algorithms/sdssUBVRITransform.html

def make_2flux_feature(train, test, c1, c2, func, mag_list=None):
    """Add a two-band feature column to ``train`` and ``test`` in place.

    For every prefix ``p`` in ``mag_list`` the columns ``f'{p}_{c1}'`` and
    ``f'{p}_{c2}'`` are read as NumPy arrays, passed to ``func`` and the result
    is stored in a new column named ``f'{p}_{func.__name__}'``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that receive the new column; both are modified in place.
    c1, c2 : str
        Band suffixes selecting the two input columns.
    func : callable
        ``func(x, y)`` taking the two value arrays and returning the feature;
        its ``__name__`` becomes the suffix of the new column.
    mag_list : iterable of str, optional
        Column prefixes to process. Defaults to ``('ugriz',)``; previously the
        ``None`` default was iterated directly and raised ``TypeError``.

    Returns
    -------
    None
    """
    if mag_list is None:
        mag_list = ('ugriz',)

    for prefix in mag_list:
        out_col = f'{prefix}_{func.__name__}'
        for frame in (train, test):
            x = frame[f'{prefix}_{c1}'].values
            y = frame[f'{prefix}_{c2}'].values
            frame[out_col] = func(x, y)
        
def UB_jester(x1,x2):
    """Return 0.75*(x1 - x2) - 0.81 (called in this notebook with the u and g columns)."""
    colour = x1 - x2
    return 0.75*colour - 0.81

# Adds column 'ugriz_UB_jester' to df_train and df_test from ugriz_u / ugriz_g.
make_2flux_feature(df_train,df_test, 'u','g',UB_jester,['ugriz'])

def BV_jester(x1,x2):
    """Return 0.62*(x1 - x2) + 0.15 (called in this notebook with the g and r columns)."""
    colour = x1 - x2
    return 0.62*colour + 0.15

# Adds column 'ugriz_BV_jester' to df_train and df_test from ugriz_g / ugriz_r.
make_2flux_feature(df_train,df_test, 'g','r',BV_jester,['ugriz'])

def VR_jester(x1,x2):
    """Return 0.38*(x1 - x2) + 0.27 (called in this notebook with the r and i columns)."""
    colour = x1 - x2
    return 0.38*colour + 0.27

# Adds column 'ugriz_VR_jester' to df_train and df_test from ugriz_r / ugriz_i.
make_2flux_feature(df_train, df_test, 'r','i',VR_jester,['ugriz'])

def RcIc_jester(x1,x2):
    """Return 0.72*(x1 - x2) + 0.27 (called in this notebook with the r and i columns)."""
    colour = x1 - x2
    return 0.72*colour + 0.27

# Adds column 'ugriz_RcIc_jester' to df_train and df_test from ugriz_r / ugriz_i.
make_2flux_feature(df_train, df_test, 'r','i', RcIc_jester, ['ugriz'])

def B_jester(x1,x2):
    """Return x2 + 0.17*(x1 - x2) + 0.11 (called in this notebook with the u and g columns)."""
    colour = x1 - x2
    return x2 + 0.17*colour + 0.11

# Adds column 'ugriz_B_jester' to df_train and df_test from ugriz_u / ugriz_g.
make_2flux_feature(df_train,df_test, 'u','g', B_jester, ['ugriz'])

def V_jester(x1,x2):
    """Return x1 - 0.52*(x1 - x2) - 0.03 (called in this notebook with the g and r columns)."""
    colour = x1 - x2
    return x1 - 0.52*colour - 0.03

# Adds column 'ugriz_V_jester' to df_train and df_test from ugriz_g / ugriz_r.
make_2flux_feature(df_train,df_test, 'g','r',V_jester, ['ugriz'])

In [13]:
# Reference: https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy
#
# According to that page, all ugriz magnitudes it lists are dereddened PSF
# magnitudes unless stated otherwise, so the dered_X columns are probably the
# right inputs. Not being sure, the plan is to compare a ugriz_X version with a
# dered_X version; this cell builds the ugriz_X version.
# NOTE(review): the linear combinations below appear to be the l-colour,
# s-colour and P1(s) from that page, despite the 'icolor' / 'ucolor' names - confirm.

# Stack train and test into one frame. Every NaN becomes -1; judging by the
# later split on class == -1 this is how test rows get their label sentinel.
# NOTE(review): the fill also replaces any missing feature value with -1.
all_data = pd.concat([df_train, df_test], axis=0).fillna(-1)

mag_u, mag_g, mag_r, mag_i = (all_data[f'ugriz_{band}'] for band in 'ugri')

all_data['ugriz_icolor'] = mag_u*(-0.436) + mag_g*(1.129) + mag_r*(-0.119) + mag_i*(-0.574) + 0.1984

all_data['ugriz_ucolor'] = mag_u*(-0.249) + mag_g*(0.794) + mag_r*(-0.555) + 0.234

all_data['ugriz_p1'] = (mag_u - mag_g)*(0.91) + (mag_g - mag_r)*(0.415) - 1.280

# r magnitude scaled by its standard deviation over train and test together.
all_data['ugriz_r_std_div'] = mag_r / mag_r.std()

In [14]:
# Reference: https://www.sdss.org/dr16/algorithms/legacy_target_selection/
# These quantities are said to be related to extinction, and elsewhere dered_X
# appeared as the extinction-related data. Since ugriz_X was used above for the
# reddening-related colours, dered_X is used here for the extinction-related ones.
# (The original note is ambiguous; "extinction" is the presumed meaning - confirm.)
dered_g_r = all_data['dered_g'] - all_data['dered_r']
dered_r_i = all_data['dered_r'] - all_data['dered_i']

all_data['dered_orthogonal'] = dered_r_i - dered_g_r/4 - 0.18

all_data['dered_parallel'] = 0.7*dered_g_r + 1.2*(dered_r_i - 0.18)

In [15]:
# Reference: https://www.sdss.org/dr12/algorithms/magnitudes/
# asinh magnitude: m = -(2.5 / ln 10) * [asinh((f/f0) / (2b)) + ln(b)]
# The page defines this for psfMag; this notebook feeds the ugriz_X columns instead.

color_list = ['u', 'g', 'r', 'i', 'z']
# Softening parameter b per band. The former spelling '1.4*10e-10' evaluated to
# 1.4e-9 (10e-10 == 1e-9), i.e. every value was ten times too large.
b_list = [1.4e-10, 0.9e-10, 1.2e-10, 1.8e-10, 7.4e-10]
# NOTE(review): these numbers look like the zero-flux magnitudes from the same
# SDSS table rather than a reference flux f0, and ugriz_X is presumably already
# a magnitude rather than a flux - confirm what this ratio is meant to be.
f0_list = [24.63, 25.11, 24.80, 24.36, 22.83]
for c, b, f0 in zip(color_list, b_list, f0_list):
    # The prefactor is -2.5 / ln(10); it was previously multiplied by ln(10).
    all_data[f'ugriz_{c}_asinh'] = -(2.5/np.log(10))*(np.arcsinh((all_data[f'ugriz_{c}']/f0)/(2*b))+np.log(b))

# feature 선택

In [16]:
# Columns kept for modelling, in order; the label column 'class' comes last.
selected_columns = [
    # raw columns and row-wise aggregates
    'redshift', 'dered_g', 'dered_r', 'dered_i', 'airmass_g',
    'ugriz_r', 'ugriz_i', 'airmass_diff',
    # pairwise differences starting from a ugriz column
    'ugriz_z_ugriz_g_diff', 'ugriz_z_ugriz_u_diff',
    'ugriz_z_dered_z_diff', 'ugriz_z_dered_g_diff',
    'ugriz_i_ugriz_r_diff', 'ugriz_i_dered_g_diff',
    'ugriz_r_ugriz_g_diff', 'ugriz_r_ugriz_u_diff',
    'ugriz_r_dered_z_diff', 'ugriz_r_dered_r_diff',
    'ugriz_r_dered_g_diff', 'ugriz_r_dered_u_diff',
    'ugriz_g_ugriz_u_diff', 'ugriz_g_dered_g_diff', 'ugriz_g_dered_u_diff',
    'ugriz_u_dered_r_diff', 'ugriz_u_dered_g_diff',
    # pairwise differences starting from a dered or airmass column
    'dered_z_dered_i_diff', 'dered_z_airmass_g_diff',
    'dered_r_dered_g_diff', 'dered_r_dered_u_diff',
    'dered_g_dered_u_diff', 'dered_g_airmass_i_diff', 'dered_g_airmass_g_diff',
    'dered_u_airmass_i_diff', 'airmass_z_airmass_g_diff',
    # differences between family-level aggregates
    'ugriz_dered_min_diff', 'dered_airmass_sum_diff',
    # colour combinations
    'ugriz_icolor', 'ugriz_ucolor', 'ugriz_p1',
    'dered_orthogonal', 'dered_parallel',
    # label
    'class',
]

In [17]:
# Keep only the selected columns, as an independent copy of the data.
all_data = all_data.loc[:, selected_columns].copy()

In [18]:
# Sanity check: rows = train + test rows, columns = len(selected_columns).
all_data.shape

(400000, 42)

In [19]:
# Summary statistics of the combined frame; the minima expose sentinel-like
# values (e.g. -9999 in dered_g and dered_i).
all_data.describe()

Unnamed: 0,redshift,dered_g,dered_r,dered_i,airmass_g,ugriz_r,ugriz_i,airmass_diff,ugriz_z_ugriz_g_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_g_diff,ugriz_i_ugriz_r_diff,ugriz_i_dered_g_diff,ugriz_r_ugriz_g_diff,ugriz_r_ugriz_u_diff,ugriz_r_dered_z_diff,ugriz_r_dered_r_diff,ugriz_r_dered_g_diff,ugriz_r_dered_u_diff,ugriz_g_ugriz_u_diff,ugriz_g_dered_g_diff,ugriz_g_dered_u_diff,ugriz_u_dered_r_diff,ugriz_u_dered_g_diff,dered_z_dered_i_diff,dered_z_airmass_g_diff,dered_r_dered_g_diff,dered_r_dered_u_diff,dered_g_dered_u_diff,dered_g_airmass_i_diff,dered_g_airmass_g_diff,dered_u_airmass_i_diff,airmass_z_airmass_g_diff,ugriz_dered_min_diff,dered_airmass_sum_diff,ugriz_icolor,ugriz_ucolor,ugriz_p1,dered_orthogonal,dered_parallel,class
count,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0,400000.0
mean,0.058663,18.2344,17.6162,17.1654,1.1766,17.7366,17.2007,0.0067176,-1.459,-2.924,0.0605,-1.3045,-0.5359,-1.0337,-0.6523,-2.1174,0.8671,0.1203,-0.4979,-1.8935,-1.465,0.1545,-1.2412,2.2377,1.6195,-0.296,15.6929,-0.6182,-2.0139,-1.3957,17.059,17.0579,18.4546,-0.0004,0.0499,83.6365,0.3193,0.0474,0.3239,0.1162,0.7577,0.6931
std,0.53757,15.9262,1.4736,22.4364,0.1181,1.4687,47.0161,0.0058444,44.2913,52.2959,36.879,45.8747,47.0015,39.7539,27.7938,1.1322,35.4185,0.37,15.8538,1.147,27.799,11.9595,27.8021,1.2171,15.8691,27.4294,35.4369,15.8484,1.0552,15.8557,15.9258,15.9258,1.9162,0.0022,29.8367,63.7611,26.676,22.0656,13.7802,25.3575,20.6056,1.1827
min,-166.05,-9999.0,-16.3802,-9999.0,1.0,-5.4387,-23955.8178,1.9e-05,-23974.0966,-23975.1951,-13956.8178,-23973.9447,-23973.7582,-23973.9447,-79.9156,-43.2285,-27.9347,-30.1964,-30.1264,-27.6043,-17575.3489,-7556.3656,-17575.1375,-31.5399,-42.7465,-10016.5565,-10000.4312,-15.2991,-16.4937,-10018.7719,-10000.3031,-10000.3121,-44.0978,-0.0149,-13956.8178,-29965.944,-9753.8734,-13953.6418,-30.0839,-17.4669,-19.2967,-1.0
25%,3.5184e-05,17.3492,16.7879,16.4537,1.0884,16.8782,16.5241,0.001878,-1.611,-3.2235,0.0208,-1.4778,-0.4503,-1.1684,-0.8573,-2.4471,0.4068,0.044,-0.7263,-2.2692,-1.5987,0.0633,-1.4307,1.5841,1.179,-0.2965,15.0579,-0.8122,-2.3608,-1.5538,16.1765,16.175,17.3926,-0.0015,0.0227,79.7345,0.1092,0.0021,-0.1298,-0.0837,0.3158,0.0
50%,0.047127,18.0229,17.4444,17.0881,1.1792,17.5267,17.151,0.005269,-1.2027,-2.4909,0.0406,-1.068,-0.3604,-0.8504,-0.6261,-1.9049,0.6233,0.0702,-0.4922,-1.7264,-1.2743,0.1015,-1.1142,1.9981,1.404,-0.2012,15.7038,-0.5821,-1.8172,-1.2325,16.8567,16.856,18.1003,-0.0004,0.0426,83.007,0.25,0.0652,0.1375,0.0094,0.5948,1.0
75%,0.094629,18.8902,18.2922,17.9075,1.226,18.4289,18.0074,0.010346,-0.7908,-1.8891,0.07,-0.641,-0.2425,-0.5281,-0.4276,-1.5031,0.8101,0.1137,-0.2854,-1.318,-1.0717,0.1649,-0.8944,2.5575,1.7576,-0.0845,16.461,-0.382,-1.4179,-1.0372,17.6997,17.6983,19.0152,0.0011,0.0733,87.244,0.3752,0.1324,0.5229,0.0569,0.8598,2.0
max,62.323,30.9529,31.9572,32.0162,2.0803,72.0097,50.1641,0.058153,6962.9626,6961.6056,16975.3922,6963.0596,33.473,48.2109,17573.2749,55.0733,10016.9404,46.9576,10016.9093,62.2889,62.9862,69.3194,63.7963,43.2932,10018.9833,16.6637,29.6599,10016.8146,26.8203,14.5549,29.7876,29.796,29.6671,0.0054,10011.3486,131.021,13760.8912,59.3271,8699.3784,12520.8383,12020.1869,2.0


# 데이터셋 분리

In [20]:
# Rows whose label equals the -1 sentinel are the unlabelled test rows; all
# other rows are labelled training rows.
is_labelled = all_data[target_column] != -1

# train set
# .drop() returns a new frame, so nothing is mutated through a .loc slice.
X = all_data.loc[is_labelled].drop(columns=target_column)
# astype returns a new Series; the result must be assigned for y to become int.
y = all_data.loc[is_labelled, target_column].astype(int)

# test set
test = all_data.loc[~is_labelled].drop(columns=target_column)

# train set split: 70 % train / 30 % validation, seeded with the notebook-wide SEED.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)

In [21]:
# Shapes of the raw feature frames and of the model inputs; X and test should
# have the same number of columns.
df_train.shape, df_test.shape, X.shape, y.shape, test.shape

((320000, 154), (80000, 153), (320000, 41), (320000,), (80000, 41))

In [22]:
# Summary statistics of the labelled (training) features only.
X.describe()

Unnamed: 0,redshift,dered_g,dered_r,dered_i,airmass_g,ugriz_r,ugriz_i,airmass_diff,ugriz_z_ugriz_g_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_g_diff,ugriz_i_ugriz_r_diff,ugriz_i_dered_g_diff,ugriz_r_ugriz_g_diff,ugriz_r_ugriz_u_diff,ugriz_r_dered_z_diff,ugriz_r_dered_r_diff,ugriz_r_dered_g_diff,ugriz_r_dered_u_diff,ugriz_g_ugriz_u_diff,ugriz_g_dered_g_diff,ugriz_g_dered_u_diff,ugriz_u_dered_r_diff,ugriz_u_dered_g_diff,dered_z_dered_i_diff,dered_z_airmass_g_diff,dered_r_dered_g_diff,dered_r_dered_u_diff,dered_g_dered_u_diff,dered_g_airmass_i_diff,dered_g_airmass_g_diff,dered_u_airmass_i_diff,airmass_z_airmass_g_diff,ugriz_dered_min_diff,dered_airmass_sum_diff,ugriz_icolor,ugriz_ucolor,ugriz_p1,dered_orthogonal,dered_parallel
count,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0,320000.0
mean,0.058308,18.2271,17.6154,17.1523,1.1765,17.7358,17.1742,0.0067141,-1.5017,-2.9772,0.0059,-1.3519,-0.5615,-1.0529,-0.6411,-2.1167,0.8665,0.1203,-0.4913,-1.8931,-1.4756,0.1498,-1.252,2.237,1.6253,-0.2831,15.6927,-0.6117,-2.0134,-1.4018,17.0517,17.0506,18.4534,-0.0004,0.0129,83.614,0.3216,0.0386,0.3288,0.1302,0.7679
std,0.57546,17.7866,1.473,25.0758,0.1181,1.4662,52.5615,0.0058405,47.9608,57.1518,28.2757,49.7862,52.5485,44.4437,31.0724,1.1258,35.422,0.363,17.7215,1.1425,31.077,13.3675,31.0799,1.2117,17.7352,25.0444,35.4368,17.7168,1.052,17.7234,17.7863,17.7863,1.9168,0.0022,28.2761,68.9513,29.8226,24.6691,15.4006,28.3494,23.0338
min,-166.05,-9999.0,-16.3802,-9999.0,1.0,7.7314,-23955.8178,1.9e-05,-23974.0966,-23975.1951,-13956.8178,-23973.9447,-23973.7582,-23973.9447,-31.5613,-43.2285,-17.6227,-16.9807,-17.2542,-16.7362,-17575.3489,-7556.3656,-17575.1375,-31.5399,-42.7465,-10016.5565,-10000.3121,-15.2991,-14.9929,-10018.7719,-10000.3031,-10000.3121,-44.0978,-0.0149,-13956.8178,-29965.944,-9753.8734,-13953.6418,-30.0839,-17.4669,-19.2967
25%,3.558e-05,17.3484,16.7871,16.453,1.0884,16.8772,16.5239,0.001877,-1.6103,-3.2231,0.0207,-1.477,-0.4502,-1.1677,-0.8568,-2.4466,0.4064,0.0439,-0.7256,-2.2686,-1.5978,0.0632,-1.4301,1.5834,1.1789,-0.2965,15.0578,-0.8119,-2.3602,-1.553,16.176,16.1746,17.3916,-0.0015,0.0227,79.7323,0.1091,0.002,-0.1303,-0.084,0.3153
50%,0.047129,18.0224,17.4434,17.0874,1.1792,17.5258,17.1497,0.005263,-1.2016,-2.4891,0.0405,-1.0674,-0.3602,-0.8494,-0.6254,-1.9042,0.6229,0.0701,-0.4914,-1.7258,-1.2741,0.1014,-1.1139,1.9973,1.4035,-0.2009,15.703,-0.5814,-1.8168,-1.2325,16.8558,16.8549,18.0998,-0.0004,0.0426,83.0002,0.2499,0.0651,0.1372,0.0093,0.5942
75%,0.094597,18.8884,18.2911,17.907,1.226,18.4281,18.0074,0.010346,-0.7901,-1.888,0.07,-0.6404,-0.2423,-0.5276,-0.4273,-1.5025,0.81,0.1136,-0.2852,-1.3174,-1.0715,0.1649,-0.8943,2.5575,1.7569,-0.0844,16.4604,-0.3818,-1.4171,-1.0371,17.6979,17.6968,19.015,0.0011,0.0733,87.2378,0.3752,0.1324,0.522,0.0568,0.8593
max,62.323,30.6132,31.9572,32.0162,2.0803,72.0097,50.1641,0.058153,34.3737,29.8174,1968.6898,47.384,33.473,48.2109,17573.2749,55.0733,10016.9404,46.9576,10016.9093,62.2889,32.1057,51.2796,63.7963,43.2932,10018.9833,16.6637,29.6599,10016.8146,26.8203,12.5168,29.5046,29.5111,29.6671,0.0054,1968.6898,131.021,13760.8912,17.0388,8699.3784,12520.8383,12020.1869


In [23]:
# Summary statistics of the unlabelled (test) features, for comparison with X.
test.describe()

Unnamed: 0,redshift,dered_g,dered_r,dered_i,airmass_g,ugriz_r,ugriz_i,airmass_diff,ugriz_z_ugriz_g_diff,ugriz_z_ugriz_u_diff,ugriz_z_dered_z_diff,ugriz_z_dered_g_diff,ugriz_i_ugriz_r_diff,ugriz_i_dered_g_diff,ugriz_r_ugriz_g_diff,ugriz_r_ugriz_u_diff,ugriz_r_dered_z_diff,ugriz_r_dered_r_diff,ugriz_r_dered_g_diff,ugriz_r_dered_u_diff,ugriz_g_ugriz_u_diff,ugriz_g_dered_g_diff,ugriz_g_dered_u_diff,ugriz_u_dered_r_diff,ugriz_u_dered_g_diff,dered_z_dered_i_diff,dered_z_airmass_g_diff,dered_r_dered_g_diff,dered_r_dered_u_diff,dered_g_dered_u_diff,dered_g_airmass_i_diff,dered_g_airmass_g_diff,dered_u_airmass_i_diff,airmass_z_airmass_g_diff,ugriz_dered_min_diff,dered_airmass_sum_diff,ugriz_icolor,ugriz_ucolor,ugriz_p1,dered_orthogonal,dered_parallel
count,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0,80000.0
mean,0.060083,18.2638,17.6193,17.2178,1.1767,17.7397,17.3065,0.0067318,-1.2882,-2.7109,0.2787,-1.1148,-0.4333,-0.9573,-0.6974,-2.1201,0.8695,0.1204,-0.524,-1.8952,-1.4227,0.1733,-1.1978,2.2405,1.5961,-0.3476,15.6935,-0.6444,-2.0156,-1.3712,17.0882,17.087,18.4593,-0.0004,0.1975,83.7262,0.31,0.0824,0.3041,0.0604,0.717
std,0.34684,1.6657,1.476,1.3324,0.1182,1.4786,1.3207,0.00586,24.6502,24.6773,60.0184,24.6522,0.582,0.9475,0.7116,1.1574,35.4045,0.3967,0.7107,1.1647,0.8966,0.6235,0.9802,1.2383,0.9968,35.3975,35.4374,0.5555,1.0679,0.7405,1.6596,1.6596,1.9139,0.0022,35.3974,36.1981,0.7041,0.4422,0.8741,0.5114,0.8599
min,-30.149,-18.656,-8.756,-3.6973,1.0001,-5.4387,5.3267,2.1e-05,-24.4105,-54.5181,-62.2878,-64.4691,-30.6336,-13.3576,-79.9156,-40.3838,-27.9347,-30.1964,-30.1264,-27.6043,-66.1663,-76.1173,-72.2208,-15.7468,-21.2096,-10011.3,-10000.4312,-13.0229,-16.4937,-11.2314,-19.8418,-19.8419,-31.8195,-0.0149,-62.6742,-9953.1315,-72.1962,-50.8115,-25.4325,-14.3826,-13.2165
25%,3.3645e-05,17.3522,16.7906,16.4562,1.0883,16.882,16.5255,0.001881,-1.6134,-3.2249,0.021,-1.4804,-0.4508,-1.1714,-0.8593,-2.4496,0.4082,0.0441,-0.729,-2.2711,-1.6022,0.0634,-1.433,1.5871,1.1794,-0.2968,15.0581,-0.8136,-2.3636,-1.5568,16.1781,16.1774,17.3975,-0.0015,0.0229,79.7463,0.1099,0.0023,-0.1276,-0.0828,0.3175
50%,0.047115,18.0247,17.4487,17.0912,1.1793,17.5299,17.156,0.005295,-1.2076,-2.4986,0.0406,-1.0703,-0.3613,-0.854,-0.6289,-1.9081,0.6252,0.0704,-0.4951,-1.7289,-1.275,0.1017,-1.1151,2.0014,1.4058,-0.2023,15.7068,-0.5848,-1.819,-1.2327,16.8606,16.8597,18.1019,-0.0004,0.0427,83.0353,0.2504,0.0655,0.1386,0.0097,0.5977
75%,0.094769,18.8948,18.2952,17.9109,1.2262,18.4345,18.008,0.010345,-0.7938,-1.893,0.07,-0.6429,-0.2431,-0.5301,-0.4286,-1.506,0.8105,0.1138,-0.2859,-1.3199,-1.0723,0.165,-0.8945,2.5575,1.7602,-0.0849,16.4633,-0.3828,-1.4208,-1.0377,17.7054,17.7039,19.0157,0.001,0.0733,87.2768,0.3752,0.1323,0.5261,0.0571,0.8616
max,46.39,30.9529,31.6536,30.9478,2.0797,46.6913,33.0259,0.058093,6962.9626,6961.6056,16975.3922,6963.0596,24.6637,36.7597,63.2106,28.3387,10011.7241,29.3532,39.2532,51.2309,62.9862,69.3194,63.4692,40.449,42.8546,12.417,27.3017,14.8045,21.8777,14.5549,29.7876,29.796,29.6429,0.0054,10011.3486,123.8048,83.3487,59.3271,32.6989,17.7349,15.2929


# feature 선택

In [24]:
# XGBoost hyper-parameters, passed as keyword arguments to xgb.XGBClassifier.
xgb_params = dict(
    n_jobs=-1,
    n_estimators=100,
    eval_metric='mlogloss',
    eta=0.3,  # learning_rate
    booster='gbtree',
    tree_method='auto',
    objective='multi:softmax',
    num_class=3,
    random_state=2020,
)

In [25]:
# Fit the classifier on the training split and score it on the validation split.
xgb_clf = xgb.XGBClassifier(**xgb_params)
xgb_clf.fit(X_train, y_train)
p = xgb_clf.predict(X_val)
# accuracy_score's signature is (y_true, y_pred); the arguments were previously
# swapped. Accuracy is symmetric, so the printed value is unchanged.
print(accuracy_score(y_val, p))

0.9334895833333333
