In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd

from tqdm import tqdm
import itertools

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import warnings
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Display options: show up to 100 columns and 4 digits of precision.
# The fully qualified key is used because option names are matched as
# patterns: the short form 'max_columns' is ambiguous on pandas versions that
# also define 'styler.render.max_columns' and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)

# Plot defaults for the whole notebook.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')
sns.set(font_scale=2.5)

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# 데이터 로드

In [None]:
# Local version (kept commented out; paths are relative to the working directory)

# data_dir = Path('../input/dankook')
# feature_dir = Path('../output/feature')


# train_file = data_dir / 'train.csv'
# test_file = data_dir / 'test.csv'

In [None]:
# Colab: mount Google Drive and work under the notebook folder stored there.
# Outside Colab the google.colab package is not installed, so instead of
# failing on the import the cell falls back to the relative local layout.
try:
    from google.colab import drive
except ImportError:
    data_dir = Path('../input/dankook')
    feature_dir = Path('../output/feature')
else:
    drive.mount('/content/drive')
    data_dir = Path('/content/drive/My Drive/Colab Notebooks/input/dankook')
    feature_dir = Path('/content/drive/My Drive/Colab Notebooks/output/feature')

# Input files live directly inside data_dir.
train_file = data_dir / 'train.csv'
test_file = data_dir / 'test.csv'

In [None]:
# Base name shared by every file written into feature_dir.
feature_name = 'feature'

# Feature tables, one per version.
feature_Ver1_file = feature_dir.joinpath(f'{feature_name}_Ver1.csv')
feature_Ver2_file = feature_dir.joinpath(f'{feature_name}_Ver2.csv')
feature_Ver3_file = feature_dir.joinpath(f'{feature_name}_Ver3.csv')

# Polynomial expansions of the three feature tables.
polynomial_feature_Ver1_file = feature_dir.joinpath(f'polynomial_{feature_name}_Ver1.csv')
polynomial_feature_Ver2_file = feature_dir.joinpath(f'polynomial_{feature_name}_Ver2.csv')
polynomial_feature_Ver3_file = feature_dir.joinpath(f'polynomial_{feature_name}_Ver3.csv')

# Target values of the training rows.
feature_target_file = feature_dir.joinpath(f'{feature_name}_target.csv')

In [None]:
# Seed constant for the notebook.
SEED = 2020
# Name of the column holding the target values.
target_column = 'class'

In [None]:
# Read both CSV files, using their first column as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col=0) for csv_path in (train_file, test_file)
)

# 이상치 제거

In [None]:
# Drop every training row that falls outside the [min, max] range observed in
# the test set for any of the first 18 columns.
train_shape = df_train.shape[0]

# Accumulate one row mask over all columns, then filter once.
inside_test_range = pd.Series(True, index=df_train.index)
for col in df_train.columns[:18]:
    inside_test_range &= df_train[col].between(df_test[col].min(), df_test[col].max())
df_train = df_train.loc[inside_test_range]

print('제거된 행 개수 :', train_shape - df_train.shape[0])

In [None]:
# Summary statistics of the training frame after the range filter above.
df_train.describe()

In [None]:
# Summary statistics of the test frame, for comparison with the training frame.
df_test.describe()

## 실제 타겟 값 생성

In [None]:
# Target values of the filtered training rows, written next to the feature files.
y = df_train[target_column]
y.to_csv(feature_target_file)

# 변수 생성

Decision Tree류 알고리즘을 위해 2가지 버전으로 변수를 생성

- Ver1

- Ver2

기본 데이터셋을 그대로 Feature로 사용. 이유: 기본 데이터셋만으로도 충분히 높은 정확도가 나왔기 때문.

- Ver3

logistic regression을 위해서 위의 Ver1, Ver2를 가지고 두 가지 버전의 Polynomial Feature를 생성

- Ver1

- Ver2

기본 데이터셋을 가지고 Polynomial Feature를 생성

- Ver3

## Ver1

In [None]:
# Work on copies so that df_train / df_test stay untouched for the later versions.
df_train2, df_test2 = df_train.copy(), df_test.copy()

In [None]:
# Every training column except the counts, the target and the redshift.
non_wave_columns = ['nObserve', 'nDetect', target_column, 'redshift']
wave_columns = df_train.columns.drop(non_wave_columns)

In [None]:
# Difference between each column of wave_columns and the column right before
# it. The pairs are derived from wave_columns itself instead of a hard-coded
# count of 14, so the loop cannot run past the end or silently skip columns
# if the column set changes.
for j, (previous_col, current_col) in enumerate(zip(wave_columns[:-1], wave_columns[1:])):
    name = f'diff_{current_col}_{previous_col}'
    df_train2[name] = df_train2[current_col] - df_train2[previous_col]
    df_test2[name] = df_test2[current_col] - df_test2[previous_col]
    # Progress trace: which pair produced which feature index.
    print(current_col, ' - ', previous_col, j)

In [None]:
# 15-point ranking: rank the wave_columns values within each row.

# Output column names: the source column name with a '_rank' suffix.
rank_col = [col + '_rank' for col in df_train2[wave_columns].columns]

mag_rank_tr = df_train2[wave_columns].rank(axis=1)
mag_rank_tr.columns = rank_col

mag_rank_tt = df_test2[wave_columns].rank(axis=1)
mag_rank_tt.columns = rank_col

# Append the rank columns to the right of the existing features.
df_train2 = pd.concat((df_train2, mag_rank_tr), axis=1)
df_test2 = pd.concat((df_test2, mag_rank_tt), axis=1)

In [None]:
# Per-band differences between a magnitude and the same band measured the
# other two ways.
# Assumes wave_columns is laid out as three consecutive groups of the five
# bands u, g, r, i, z, with the raw magnitudes first -- TODO confirm against
# the CSV header.
bands = ['u', 'g', 'r', 'i', 'z']

# Labels '<band>_<group>' are generated in the same order in which the columns
# are filled below: all five bands against group 0, then all five against
# group 1. The previous band-major order (u_0, u_1, g_0, ...) did not match the
# fill order, so e.g. 'g_0' actually held r - wave_columns[7].
# NOTE(review): the later drop list removes 'g_0'; with correct labels that is
# now the g-band difference, so the selection should be re-validated.
diff_col = [f'{band}_{group}' for group in range(2) for band in bands]

# Integer-labelled placeholders; they receive the names in diff_col afterwards.
mag_wave_diff_tr = pd.DataFrame(np.zeros((df_train2.shape[0], 10)), index=df_train2.index)
mag_wave_diff_tt = pd.DataFrame(np.zeros((df_test2.shape[0], 10)), index=df_test2.index)

for i in range(0, 10, 5):          # i = 0 -> second group, i = 5 -> third group
    for j in range(5):             # j = position of the band inside a group
        mag_wave_diff_tr.loc[:, j+i] = df_train2[wave_columns[j]] - df_train2[wave_columns[5+j+i]]
        mag_wave_diff_tt.loc[:, j+i] = df_test2[wave_columns[j]] - df_test2[wave_columns[5+j+i]]
        # Progress trace: which pair went into which placeholder column.
        print(wave_columns[j], ' - ', wave_columns[5+j+i], i+j)

In [None]:
# Give the difference columns their names, then append them to the features.
for wave_diff in (mag_wave_diff_tr, mag_wave_diff_tt):
    wave_diff.columns = diff_col

df_train2 = pd.concat((df_train2, mag_wave_diff_tr), axis=1)
df_test2 = pd.concat((df_test2, mag_wave_diff_tt), axis=1)

In [None]:
# log1p-transform nObserve. The values are read from the untouched
# df_train / df_test (df_train2 / df_test2 share their index), so re-running
# this cell does not apply log1p a second time.
df_train2['nObserve'] = np.log1p(df_train['nObserve'])
df_test2['nObserve'] = np.log1p(df_test['nObserve'])

# NOTE(review): this subtracts the untransformed nDetect from the log-scaled
# nObserve -- confirm that mixing the two scales is intended. The column is
# listed in drop_columns, so it does not reach the saved Ver1 features.
df_train2['d_obs_det'] = df_train2['nObserve'] - df_train2['nDetect']
df_test2['d_obs_det'] = df_test2['nObserve'] - df_test2['nDetect']

In [None]:
# Columns removed from the Ver1 feature set, in the original order.
drop_columns = [
    'd_obs_det',
    'g_0',
    'diff_airmass_z_airmass_i',
    'u',
    'airmass_g',
    'airmass_z',
    'nDetect',
    'dered_i_rank',
    'diff_airmass_r_airmass_g',
    'dered_r_rank',
    'dered_g_rank',
    'g_rank',
    'airmass_i_rank',
    'airmass_r_rank',
    'airmass_g_rank',
    'airmass_z_rank',
    'dered_u_rank',
    'r_rank',
    'diff_airmass_u_dered_z',
    'u_rank',
    'z_rank',
    'dered_z_rank',
    'airmass_u_rank',
    'diff_airmass_i_airmass_r',
    'i_rank',
    'airmass_r',
    'z',
]

In [None]:
# Remove the listed columns from both frames.
df_train2 = df_train2.drop(columns=drop_columns).copy()
df_test2 = df_test2.drop(columns=drop_columns).copy()

In [None]:
# Stack train on top of test; every missing value becomes -1.
dataset = pd.concat([df_train2, df_test2], axis=0).fillna(-1)

In [None]:
# Write the Ver1 feature table with 18 decimal places.
feature = pd.DataFrame(data=dataset)
feature.to_csv(path_or_buf=feature_Ver1_file, float_format='%.18f')

## Ver2

In [None]:
# Start Ver2 from fresh copies of the filtered train frame and the test frame.
df_train2, df_test2 = df_train.copy(), df_test.copy()

In [None]:
# Re-add the five raw band columns under a 'ugriz_' prefix (appended at the
# end of the frame) and remove the originals, for both frames.
raw_bands = ['u', 'g', 'r', 'i', 'z']
for frame in (df_train2, df_test2):
    for i in raw_bands:
        frame[f'ugriz_{i}'] = frame[i]
    frame.drop(columns=raw_bands, inplace=True)

In [None]:
# Group the columns by the tag contained in their name.
all_columns = list(df_train2.columns)
ugriz_col, dered_col, airmass_col = (
    [name for name in all_columns if tag in name]
    for tag in ('ugriz', 'dered', 'airmass')
)

In [None]:
# For every row and every magnitude group, compute max, min, max - min, std
# and sum over the columns of that group.
for prefix, g in (('ugriz', ugriz_col), ('dered', dered_col), ('airmass', airmass_col)):
    for frame in (df_train2, df_test2):
        group_values = frame[g]
        frame[f'{prefix}_max'] = group_values.max(axis=1)
        frame[f'{prefix}_min'] = group_values.min(axis=1)
        frame[f'{prefix}_diff'] = frame[f'{prefix}_max'] - frame[f'{prefix}_min']
        frame[f'{prefix}_std'] = group_values.std(axis=1)
        frame[f'{prefix}_sum'] = group_values.sum(axis=1)

In [None]:
# Add difference features, e.g. z - i.
# itertools.combinations over all magnitude columns yields 105 pairs in total;
# the weak ones are meant to be removed later using permutation importance.
magnitude_cols = ugriz_col[::-1] + dered_col[::-1] + airmass_col[::-1]
column_pairs = list(itertools.combinations(magnitude_cols, 2))

# Names of the generated columns, in creation order.
diff_feature = [f'{c1}_{c2}_diff' for c1, c2 in column_pairs]

for new_c, (c1, c2) in zip(diff_feature, column_pairs):
    for frame in (df_train2, df_test2):
        frame[new_c] = frame[c1] - frame[c2]
    

In [None]:
# Between every pair of magnitude groups, compute max - max, min - min and
# sum - sum.
for first, second in itertools.combinations(['ugriz', 'dered', 'airmass'], 2):
    for stat in ('max', 'min', 'sum'):
        for frame in (df_train2, df_test2):
            frame[f'{first}_{second}_{stat}_diff'] = frame[f'{first}_{stat}'] - frame[f'{second}_{stat}']

In [None]:
# Implements the formulas listed at
# http://classic.sdss.org/dr4/algorithms/sdssUBVRITransform.html

def make_2flux_feature(train, test, c1, c2, func, mag_list=None):
    """Add a two-column feature to ``train`` and ``test`` in place.

    For every prefix in ``mag_list`` the columns ``<prefix>_<c1>`` and
    ``<prefix>_<c2>`` are passed to ``func`` as NumPy arrays, and the result is
    stored in the column ``<prefix>_<func.__name__>`` of the same frame.

    Parameters
    ----------
    train, test : DataFrame
        Frames that are modified in place.
    c1, c2 : str
        Column suffixes of the first and second argument of ``func``.
    func : callable
        Function of two arrays; its ``__name__`` becomes the column suffix.
    mag_list : iterable of str, str or None
        Column prefixes to process. ``None`` means no prefixes (nothing is
        added) instead of failing with a TypeError; a single string is treated
        as one prefix instead of being iterated character by character.

    Raises
    ------
    KeyError
        If a required input column is missing from a frame.
    """
    if mag_list is None:
        mag_list = ()
    elif isinstance(mag_list, str):
        mag_list = (mag_list,)

    for prefix in mag_list:
        first_col = f'{prefix}_{c1}'
        second_col = f'{prefix}_{c2}'
        result_col = f'{prefix}_{func.__name__}'
        # Train is processed before test, as before.
        for frame in (train, test):
            frame[result_col] = func(frame[first_col].values, frame[second_col].values)
        
def UB_jester(x1, x2):
    """Return 0.75*(x1 - x2) - 0.81."""
    color = x1 - x2
    return 0.75*color - 0.81


def BV_jester(x1, x2):
    """Return 0.62*(x1 - x2) + 0.15."""
    color = x1 - x2
    return 0.62*color + 0.15


def VR_jester(x1, x2):
    """Return 0.38*(x1 - x2) + 0.27."""
    color = x1 - x2
    return 0.38*color + 0.27


def RcIc_jester(x1, x2):
    """Return 0.72*(x1 - x2) + 0.27."""
    color = x1 - x2
    return 0.72*color + 0.27


def B_jester(x1, x2):
    """Return x2 + 0.17*(x1 - x2) + 0.11."""
    color = x1 - x2
    return x2 + 0.17*color + 0.11


def V_jester(x1, x2):
    """Return x1 - 0.52*(x1 - x2) - 0.03."""
    color = x1 - x2
    return x1 - 0.52*color - 0.03


# Apply each transform to the 'ugriz' columns, in the original call order:
# (suffix of the first column, suffix of the second column, transform).
for band_a, band_b, transform in (
    ('u', 'g', UB_jester),
    ('g', 'r', BV_jester),
    ('r', 'i', VR_jester),
    ('r', 'i', RcIc_jester),
    ('u', 'g', B_jester),
    ('g', 'r', V_jester),
):
    make_2flux_feature(df_train2, df_test2, band_a, band_b, transform, ['ugriz'])

In [None]:
# https://www.sdss.org/dr16/algorithms/segue_target_selection/#Legacy

# All ugriz magnitudes listed there are dereddened PSF magnitudes unless
# stated otherwise, so dered_X is probably the right input. Not sure yet, so
# the ugriz_X and dered_X versions should be compared.

# ugriz_X version

# Stack train on top of test; every missing value becomes -1.
dataset = pd.concat([df_train2, df_test2], axis=0).fillna(-1)

mag_u = dataset['ugriz_u']
mag_g = dataset['ugriz_g']
mag_r = dataset['ugriz_r']
mag_i = dataset['ugriz_i']

# Linear colour combinations with the coefficients taken from the page above.
dataset['ugriz_icolor'] = mag_u*(-0.436) + mag_g*(1.129) + mag_r*(-0.119) + mag_i*(-0.574) + 0.1984

dataset['ugriz_ucolor'] = mag_u*(-0.249) + mag_g*(0.794) + mag_r*(-0.555) + 0.234

dataset['ugriz_p1'] = (mag_u-mag_g)*(0.91) + (mag_g-mag_r)*(0.415)- 1.280

# r magnitude divided by its standard deviation over the stacked train+test rows.
dataset['ugriz_r_std_div'] = mag_r/mag_r.std()

In [None]:
# https://www.sdss.org/dr16/algorithms/legacy_target_selection/
# Said to be related to extinction.
# In some sources dered_X is described as extinction-related data.
# If ugriz_X is used for the reddening-related formulas, dered_X can be used
# here for the extinction-related ones.

dered_g_minus_r = dataset['dered_g']-dataset['dered_r']
dered_r_minus_i = dataset['dered_r']-dataset['dered_i']

dataset['dered_orthogonal'] = dered_r_minus_i-dered_g_minus_r/4-0.18

dataset['dered_parallel'] = 0.7*dered_g_minus_r + 1.2*(dered_r_minus_i-0.18)

In [None]:
# https://www.sdss.org/dr12/algorithms/magnitudes/
# The document describes psfMag data; the ugriz_X columns are used here in
# its place.
#
# Formula from that page:
#     mag = -2.5/ln(10) * [asinh((f/f0)/(2b)) + ln(b)]

color_list = ['u', 'g', 'r', 'i', 'z']
# Softening parameter b per band (1.4e-10, 0.9e-10, ...). The earlier literals
# were written as `1.4*10e-10`; `10e-10` is 1e-9, so every b was ten times
# too large.
b_list = [1.4e-10, 0.9e-10, 1.2e-10, 1.8e-10, 7.4e-10]
# NOTE(review): these values match the "zero-flux magnitude" column of the
# table on that page rather than a flux zero point f0, and ugriz_X presumably
# holds magnitudes rather than fluxes, so the ratio below is not a true f/f0.
# Confirm the intended input before relying on these features.
f0_list = [24.63, 25.11, 24.80, 24.36, 22.83]
for c, b, f0 in zip(color_list, b_list, f0_list):
    # The prefactor is -2.5 / ln(10); the earlier code multiplied by ln(10).
    dataset[f'ugriz_{c}_asinh'] = -2.5/np.log(10)*(np.arcsinh((dataset[f'ugriz_{c}']/f0)/(2*b))+np.log(b))

In [None]:
# Columns kept in the Ver2 feature table; the target column comes last.
selected_columns = [
    'redshift',
    'dered_g',
    'dered_r',
    'dered_i',
    'airmass_g',
    'ugriz_r',
    'ugriz_i',
    'airmass_diff',
    'ugriz_z_ugriz_g_diff',
    'ugriz_z_ugriz_u_diff',
    'ugriz_z_dered_z_diff',
    'ugriz_z_dered_g_diff',
    'ugriz_i_ugriz_r_diff',
    'ugriz_i_dered_g_diff',
    'ugriz_r_ugriz_g_diff',
    'ugriz_r_ugriz_u_diff',
    'ugriz_r_dered_z_diff',
    'ugriz_r_dered_r_diff',
    'ugriz_r_dered_g_diff',
    'ugriz_r_dered_u_diff',
    'ugriz_g_ugriz_u_diff',
    'ugriz_g_dered_g_diff',
    'ugriz_g_dered_u_diff',
    'ugriz_u_dered_r_diff',
    'ugriz_u_dered_g_diff',
    'dered_z_dered_i_diff',
    'dered_z_airmass_g_diff',
    'dered_r_dered_g_diff',
    'dered_r_dered_u_diff',
    'dered_g_dered_u_diff',
    'dered_g_airmass_i_diff',
    'dered_g_airmass_g_diff',
    'dered_u_airmass_i_diff',
    'airmass_z_airmass_g_diff',
    'ugriz_dered_min_diff',
    'dered_airmass_sum_diff',
    'ugriz_icolor',
    'ugriz_ucolor',
    'ugriz_p1',
    'dered_orthogonal',
    'dered_parallel',
    target_column,
]

In [None]:
# Keep only the selected columns, in the listed order.
dataset = dataset.loc[:, selected_columns].copy()

In [None]:
# Write the Ver2 feature table with 18 decimal places.
feature = pd.DataFrame(data=dataset)
feature.to_csv(path_or_buf=feature_Ver2_file, float_format='%.18f')

## Ver3

In [None]:
# Ver3 uses the loaded columns as they are: stack fresh copies of train and
# test, and turn every missing value into -1.
df_train2, df_test2 = df_train.copy(), df_test.copy()
dataset = pd.concat([df_train2, df_test2], axis=0).fillna(-1)

In [None]:
# Write the Ver3 feature table, then show its shape and first rows.
dataset.to_csv(path_or_buf=feature_Ver3_file, float_format='%.18f')
(dataset.shape, dataset.head())

## Polynomial Feature Ver1

In [None]:
# Reload the Ver1 feature table written earlier; the first column is the row index.
dataset = pd.read_csv(filepath_or_buffer=feature_Ver1_file, index_col=0)

In [None]:
# Standardise every non-target column, then expand to all degree-2 terms.
# The names must be built from exactly the columns that were transformed.
# Passing dataset.columns (which still contains the target) shifts every name
# after the target by one position and labels one real feature with the
# target's name.
input_columns = dataset.columns.drop(target_column)

scaler = StandardScaler()
poly = PolynomialFeatures(2)
X = poly.fit_transform(scaler.fit_transform(dataset[input_columns]))

# get_feature_names was removed from recent scikit-learn releases in favour of
# get_feature_names_out; keep the old call as a fallback for older installs.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = list(poly.get_feature_names_out(list(input_columns)))
else:
    feature_names = poly.get_feature_names(list(input_columns))
feature_names

In [None]:
# Wrap the expanded matrix in a frame, attach the target column, save it and
# show its shape and first rows.
dataset_poly = pd.DataFrame(X, index=dataset.index, columns=feature_names).assign(
    **{target_column: dataset[target_column]}
)
dataset_poly.to_csv(path_or_buf=polynomial_feature_Ver1_file, float_format='%.18f')
(dataset_poly.shape, dataset_poly.head())

## Polynomial Feature Ver2

In [None]:
# Reload the Ver2 feature table written earlier; print its shape and show the
# first rows.
dataset = pd.read_csv(filepath_or_buffer=feature_Ver2_file, index_col=0)
n_rows_cols = dataset.shape
print(n_rows_cols)
dataset.head()

In [None]:
# Standardise every non-target column, then expand to all degree-2 terms.
# The names are built from exactly the columns that were transformed, so they
# stay aligned wherever the target column sits in the frame.
input_columns = dataset.columns.drop(target_column)

scaler = StandardScaler()
poly = PolynomialFeatures(2)
X = poly.fit_transform(scaler.fit_transform(dataset[input_columns]))

# get_feature_names was removed from recent scikit-learn releases in favour of
# get_feature_names_out; keep the old call as a fallback for older installs.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = list(poly.get_feature_names_out(list(input_columns)))
else:
    feature_names = poly.get_feature_names(list(input_columns))
feature_names

In [None]:
# Wrap the expanded matrix in a frame, attach the target column, save it and
# show its shape and first rows.
dataset_poly = pd.DataFrame(X, index=dataset.index, columns=feature_names).assign(
    **{target_column: dataset[target_column]}
)
dataset_poly.to_csv(path_or_buf=polynomial_feature_Ver2_file, float_format='%.18f')
(dataset_poly.shape, dataset_poly.head())

## Polynomial Feature Ver3

In [None]:
# Rebuild the Ver3 table in memory: stack fresh copies of train and test, and
# turn every missing value into -1.
df_train2, df_test2 = df_train.copy(), df_test.copy()
dataset = pd.concat([df_train2, df_test2], axis=0).fillna(-1)

In [None]:
# Standardise every non-target column, then expand to all degree-2 terms.
# The names are built from exactly the columns that were transformed, so they
# stay aligned wherever the target column sits in the frame.
input_columns = dataset.columns.drop(target_column)

scaler = StandardScaler()
poly = PolynomialFeatures(2)
X = poly.fit_transform(scaler.fit_transform(dataset[input_columns]))

# get_feature_names was removed from recent scikit-learn releases in favour of
# get_feature_names_out; keep the old call as a fallback for older installs.
if hasattr(poly, 'get_feature_names_out'):
    feature_names = list(poly.get_feature_names_out(list(input_columns)))
else:
    feature_names = poly.get_feature_names(list(input_columns))
feature_names

In [None]:
# Wrap the expanded matrix in a frame, attach the target column, save it and
# show its shape and first rows.
dataset_poly = pd.DataFrame(X, index=dataset.index, columns=feature_names).assign(
    **{target_column: dataset[target_column]}
)
dataset_poly.to_csv(path_or_buf=polynomial_feature_Ver3_file, float_format='%.18f')
(dataset_poly.shape, dataset_poly.head())