In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

In [3]:
pd.set_option('max_columns', 100)
pd.set_option('display.precision', 4)

rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

warnings.filterwarnings('ignore')

# 데이터 로드

In [4]:
data_dir = Path('../input/dankook')
sub_dir = Path('../output/')

train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'
sample_file = data_dir / 'sample_submission.csv'
sub_file = sub_dir / 'submission.csv'

target_column = 'class'
SEED = 2020

In [5]:
df_train = pd.read_csv(train_file, index_col=0)
df_test = pd.read_csv(test_file, index_col=0)

In [6]:
# test의 MinMax 범위 넘는 행은 train에서 제거
train_shape = df_train.shape[0]

for col in df_train.columns[:18]:
    df_train = df_train.loc[np.logical_and(df_train[col]>df_test[col].min(),df_train[col]<df_test[col].max())]
    
print('제거된 행 개수 :', train_shape - df_train.shape[0])

제거된 행 개수 : 106590


# 학습/테스트 데이터 결합

In [7]:
dataset = pd.concat([df_train,df_test], axis=0)

dataset.fillna(-1, inplace=True)

In [8]:
dataset['nObserve'] = dataset['nObserve'].apply(np.log1p)

In [9]:
dataset['d_dered_u'] = dataset['dered_u'] - dataset['u']
dataset['d_dered_g'] = dataset['dered_g'] - dataset['g']
dataset['d_dered_r'] = dataset['dered_r'] - dataset['r']
dataset['d_dered_i'] = dataset['dered_i'] - dataset['i']
dataset['d_dered_z'] = dataset['dered_z'] - dataset['z']
dataset['d_dered_rg'] = dataset['dered_r'] - dataset['dered_g']
dataset['d_dered_ig'] = dataset['dered_i'] - dataset['dered_g']
dataset['d_dered_zg'] = dataset['dered_z'] - dataset['dered_g']
dataset['d_dered_ri'] = dataset['dered_r'] - dataset['dered_i']
dataset['d_dered_rz'] = dataset['dered_r'] - dataset['dered_z']
dataset['d_dered_iz'] = dataset['dered_i'] - dataset['dered_z']
dataset['d_obs_det'] = dataset['nObserve'] - dataset['nDetect']

dataset['i-color'] = -0.436*dataset['u']+1.129*dataset['g']- 0.119*dataset['r'] - 0.574*dataset['i'] + 0.1984
dataset['s-color'] = -0.249*dataset['u'] + 0.794*dataset['g'] - 0.555*dataset['r'] + 0.234
dataset['p1'] = 0.91*(dataset['u']-dataset['g']) + 0.415*(dataset['g']-dataset['r']) -1.280

print(dataset.shape)
dataset.head()

(293410, 34)


Unnamed: 0_level_0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,dered_z,nObserve,nDetect,airmass_u,airmass_g,airmass_r,airmass_i,airmass_z,class,d_dered_u,d_dered_g,d_dered_r,d_dered_i,d_dered_z,d_dered_rg,d_dered_ig,d_dered_zg,d_dered_ri,d_dered_rz,d_dered_iz,d_obs_det,i-color,s-color,p1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1
0,23.264,20.3368,19.0095,17.6724,16.9396,-8.1086e-05,23.1243,20.2578,18.9551,17.6321,16.9089,2.9444,18,1.1898,1.1907,1.189,1.1894,1.1902,0.0,-0.1397,-0.079,-0.0544,-0.0403,-0.0307,-1.3027,-2.6257,-3.3488,1.323,2.0462,0.7232,-15.0556,0.6094,0.0384,1.9346
2,16.7864,15.8254,15.5363,15.3935,15.35,0.00047198,16.6076,15.6866,15.44,15.3217,15.2961,1.0986,2,1.0225,1.0241,1.021,1.0217,1.0233,0.0,-0.1787,-0.1388,-0.0963,-0.0718,-0.054,-0.2466,-0.3649,-0.3905,0.1183,0.144,0.0257,-0.9014,0.0617,-0.0031,-0.2855
3,25.6606,21.1887,20.2212,19.8949,19.6346,5.8143e-06,25.3536,20.9947,20.0873,19.7947,19.5552,1.6094,3,1.2054,1.2061,1.2049,1.2051,1.2057,0.0,-0.307,-0.1941,-0.1339,-0.1003,-0.0795,-0.9074,-1.2,-1.4395,0.2926,0.5321,0.2395,-1.3906,-0.8936,-0.5544,3.191
4,24.4534,20.6992,19.0424,18.3242,17.9826,-3.3247e-05,23.7714,20.4338,18.863,18.1903,17.8759,2.6391,12,1.1939,1.1943,1.1937,1.1938,1.1941,0.0,-0.682,-0.2653,-0.1794,-0.1339,-0.1067,-1.5709,-2.2436,-2.5579,0.6727,0.9871,0.3144,-9.3609,0.122,0.0117,2.8239
6,23.9063,20.8472,19.9784,19.4952,19.0526,-4.2489e-05,23.8458,20.7,19.8766,19.4205,19.004,1.0986,2,1.2019,1.202,1.202,1.2019,1.202,0.0,-0.0604,-0.1472,-0.1018,-0.0747,-0.0486,-0.8234,-1.2795,-1.696,0.4561,0.8726,0.4165,-0.9014,-0.2559,-0.254,1.8643


In [10]:
dataset.drop(['airmass_z', 'airmass_i', 'airmass_r', 'airmass_g', 'u', 'g', 'r', 'i', 'nDetect'], 
        axis=1, inplace=True)

# 데이터셋 분리

In [11]:
from sklearn.model_selection import train_test_split 

# train set
X = dataset.loc[ dataset['class'] != -1 , :]
X.drop(columns='class',inplace=True,axis=1)
y = dataset.loc[ dataset['class'] != -1, 'class']
y.astype(int)

# test set
test = dataset.loc[ dataset['class'] == -1, :]
test.drop(columns='class', inplace=True,axis=1)

# train set split
SEED = 2020
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = SEED)

# 모델 학습

In [12]:
from sklearn.metrics import accuracy_score 
from sklearn.metrics import mean_squared_error 


from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
df_parmas = {
    'max_depth': 5,
    'min_samples_leaf': 10,
    'max_features': 'sqrt',
    'random_state': 2020
}

clf = DecisionTreeClassifier(**df_parmas)
clf.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=5, max_features='sqrt', min_samples_leaf=10,
                       random_state=2020)

In [16]:
print(f'{accuracy_score(y_test, clf.predict(X_test)) * 100:.4f}%')

82.1424%


# 시험 데이터 예측

In [19]:
submission = pd.read_csv(sample_file, index_col=0)
submission[target_column] = clf.predict(test)

In [21]:
submission[target_column].value_counts()

2.0    47993
0.0    30640
1.0     1367
Name: class, dtype: int64

# 제출파일 저장

In [23]:
submission.to_csv(sub_file)