In [40]:
import numpy as np
import pandas as pd
from pycaret.classification import *

%matplotlib inline

In [41]:
# Load the competition files from the working directory.
# train keeps the first CSV column as its index; test also reads the first column as
# the index but then discards it (reset_index(drop=True)), leaving a 0..n-1 RangeIndex.
train = pd.read_csv("./train.csv", index_col=0)
test = pd.read_csv("./test.csv", index_col=0).reset_index(drop=True)
submission = pd.read_csv("./sample_submission.csv")

In [42]:
# Colour indices between adjacent bands (u-g, g-r, r-i, i-z), built first from the raw
# magnitudes and then from the dered_* magnitudes, for both train and test.
for frame in (train, test):
    for prefix in ('', 'dered_'):
        for left, right in zip('ugri', 'griz'):
            frame[f'{prefix}{left}-{right}'] = frame[f'{prefix}{left}'] - frame[f'{prefix}{right}']

In [43]:
# Interaction features: each colour index multiplied by the redshift, stored as r_<colour>.
for frame in (train, test):
    for color in ('u-g', 'g-r', 'r-i', 'i-z',
                  'dered_u-g', 'dered_g-r', 'dered_r-i', 'dered_i-z'):
        frame[f'r_{color}'] = frame['redshift'] * frame[color]

In [44]:
# Row-wise max, min, range (max - min) and sum over the five magnitudes u, g, r, i, z.
# Every statistic is taken across ALL five bands, so its value does not depend on the band
# in the column name: u_*, g_*, r_*, i_* and z_* hold identical numbers. The statistics are
# therefore computed once per frame and stored under each band's name, which keeps the set
# and order of created columns unchanged.
ugriz = ['u', 'g', 'r', 'i', 'z']

for frame in (train, test):
    band_max = frame[ugriz].max(axis=1)
    band_min = frame[ugriz].min(axis=1)
    band_sum = frame[ugriz].sum(axis=1)
    for a in ugriz:
        frame[f'{a}_max'] = band_max
        frame[f'{a}_min'] = band_min
        frame[f'{a}_diff'] = band_max - band_min
        frame[f'{a}_sum'] = band_sum

In [45]:
# Row-wise max, min, range (max - min) and sum over the five dered_* magnitudes.
# Each statistic spans ALL five columns, so dered_u_*, dered_g_*, ... dered_z_* hold
# identical numbers. They are computed once per frame and stored under each band's name,
# which keeps the set and order of created columns unchanged.
dered_ugriz = ['dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z']

for frame in (train, test):
    band_max = frame[dered_ugriz].max(axis=1)
    band_min = frame[dered_ugriz].min(axis=1)
    band_sum = frame[dered_ugriz].sum(axis=1)
    for a in dered_ugriz:
        frame[f'{a}_max'] = band_max
        frame[f'{a}_min'] = band_min
        frame[f'{a}_diff'] = band_max - band_min
        frame[f'{a}_sum'] = band_sum

In [46]:
# https://classic.sdss.org/education/kron_ARCS.pdf
# Distance-like feature derived from the redshift, computed for the whole column at once:
#   redshift > 0  ->  rs / (1 + rs) * 13.5e9
#   otherwise     ->  |rs - 1| / |rs| * 13.5e9
# NOTE(review): 13.5e9 presumably comes from the worksheet linked above (a scale in years /
# light-years?) - confirm, and confirm the formula used for non-positive redshifts.
# A redshift of exactly 0 produces inf here rather than raising ZeroDivisionError.
redshift_train = train['redshift'].values
with np.errstate(divide='ignore', invalid='ignore'):
    # np.where evaluates both branches for every row, so the unused branch may divide by zero.
    distance_train = np.where(
        redshift_train > 0,
        (redshift_train / (1 + redshift_train)) * 13.5 * 10**9,
        (np.abs(redshift_train - 1) / np.abs(redshift_train)) * 13.5 * 10**9,
    )

train['distance'] = distance_train

In [47]:
# 'highest' is the numerically largest of the five dered_* magnitudes in each row.
# A row-wise max replaces the per-row if/elif chain, whose branches all appended that same
# maximum; unlike the chain it cannot leave a row out when a magnitude is NaN.
highest_train = train[['dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z']].max(axis=1)
train['highest'] = highest_train
# NOTE(review): this has the shape of the distance modulus M = m - 5*(log10(d) - 1), but
# np.log is the natural logarithm - confirm which is intended before changing it, and
# keep train and test identical.
train['M'] = train['highest'] - 5*(np.log(train['distance']) - 1)

In [48]:
# https://classic.sdss.org/education/kron_ARCS.pdf
# Distance-like feature derived from the redshift, computed for the whole column at once:
#   redshift > 0  ->  rs / (1 + rs) * 13.5e9
#   otherwise     ->  |rs - 1| / |rs| * 13.5e9
# NOTE(review): 13.5e9 presumably comes from the worksheet linked above (a scale in years /
# light-years?) - confirm, and confirm the formula used for non-positive redshifts.
# A redshift of exactly 0 produces inf here rather than raising ZeroDivisionError.
redshift_test = test['redshift'].values
with np.errstate(divide='ignore', invalid='ignore'):
    # np.where evaluates both branches for every row, so the unused branch may divide by zero.
    distance_test = np.where(
        redshift_test > 0,
        (redshift_test / (1 + redshift_test)) * 13.5 * 10**9,
        (np.abs(redshift_test - 1) / np.abs(redshift_test)) * 13.5 * 10**9,
    )

test['distance'] = distance_test

In [49]:
# 'highest' is the numerically largest of the five dered_* magnitudes in each row.
# A row-wise max replaces the per-row if/elif chain, whose branches all appended that same
# maximum; unlike the chain it cannot leave a row out when a magnitude is NaN.
highest_test = test[['dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z']].max(axis=1)
test['highest'] = highest_test
# NOTE(review): this has the shape of the distance modulus M = m - 5*(log10(d) - 1), but
# np.log is the natural logarithm - confirm which is intended before changing it, and
# keep train and test identical.
test['M'] = test['highest'] - 5*(np.log(test['distance']) - 1)

In [50]:
# Count features: difference between the nObserve and nDetect columns.
for frame in (train, test):
    frame['nO-nD'] = frame['nObserve'] - frame['nDetect']

In [51]:
# Simple ratio features: raw magnitude divided by its dered_* counterpart, minus 1.
for frame in (train, test):
    for band in ('u', 'g', 'r', 'i', 'z'):
        frame[f'{band}/dered_{band}'] = frame[band] / frame[f'dered_{band}'] - 1

In [52]:
# M_<band> = magnitude - 5*(ln(distance) - 1) for every raw and dered_* magnitude.
# The distance term is the same for all ten columns of a frame, so it is computed once
# per frame instead of once per column; the resulting values are unchanged.
# NOTE(review): the expression resembles the distance modulus M = m - 5*(log10(d) - 1),
# yet np.log is the natural logarithm - confirm which one is intended.
for frame in (train, test):
    distance_term = 5*(np.log(frame['distance']) - 1)
    for mag in ('u', 'g', 'r', 'i', 'z',
                'dered_u', 'dered_g', 'dered_r', 'dered_i', 'dered_z'):
        frame[f'M_{mag}'] = frame[mag] - distance_term

In [59]:
# Columns to exclude from modelling: the max/min/diff/sum statistics of every band except
# the first of each family (the u_* and dered_u_* columns are not listed and so are kept).
bad_feature = [
    f'{band}_{stat}'
    for band in ('g', 'r', 'i', 'z', 'dered_g', 'dered_r', 'dered_i', 'dered_z')
    for stat in ('max', 'min', 'diff', 'sum')
]

In [60]:
# Keep every train column, in its original order, that is not listed in bad_feature.
# The 'class' target is not in bad_feature, so it stays in this list.
useful_columns = [
    column
    for column in train.columns
    if column not in bad_feature
]

In [61]:
# Build the frame handed to PyCaret.
# NOTE: despite its name, X is not a features-only matrix - useful_columns is derived from
# train.columns, so the 'class' target column is still part of it (setup() is later called
# with data=X, target='class').
X = train[useful_columns]
# The target column on its own; not referenced by any later cell shown here.
Y = train['class']

In [62]:
# Initialise the PyCaret classification experiment on X (features plus the 'class' target).
# session_id fixes the seed PyCaret uses throughout the experiment, so a re-run reproduces
# the same results; 8575 is the seed PyCaret drew at random for the run recorded in the
# output below (see the session_id row of the setup grid).
clf = setup(data=X,
            target='class',
            session_id=8575)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,8575
1,Target Type,Multiclass
2,Label Encoded,"0: 0, 1: 1, 2: 2"
3,Original Data,"(320000, 62)"
4,Missing Values,False
5,Numeric Features,61
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


In [63]:
# Evaluate PyCaret's library of classifiers and display the ranked score grid.
# NOTE: slow on this dataset - see the TT (Sec) column of the recorded table.
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.9345,0.0,0.8797,0.9323,0.9327,0.8896,0.8902,188.5158
1,Light Gradient Boosting Machine,0.9328,0.0,0.8777,0.9306,0.931,0.8868,0.8874,18.3562
2,Extra Trees Classifier,0.9307,0.0,0.8672,0.928,0.9279,0.8826,0.8838,29.6431
3,Gradient Boosting Classifier,0.9283,0.0,0.8649,0.9255,0.9257,0.8788,0.8798,1420.7541
4,Random Forest Classifier,0.9278,0.0,0.8801,0.9266,0.9271,0.8791,0.8792,6.2398
5,Extreme Gradient Boosting,0.9267,0.0,0.8586,0.9237,0.9235,0.8758,0.8772,404.4713
6,Decision Tree Classifier,0.9052,0.0,0.8537,0.9061,0.9056,0.8424,0.8424,61.3917
7,Ada Boost Classifier,0.8971,0.0,0.8049,0.8907,0.8907,0.8247,0.8273,129.9216
8,K Neighbors Classifier,0.843,0.0,0.679,0.8029,0.8153,0.7252,0.735,14.3254
9,Quadratic Discriminant Analysis,0.7153,0.0,0.6644,0.7846,0.7325,0.5552,0.5728,0.8467


<catboost.core.CatBoostClassifier at 0x25f3bd010c8>

In [64]:
# Train a CatBoost classifier (the top row of the comparison table) and display its
# per-fold scores, then persist it under the name 'cb_model_1'.
cb = create_model('catboost')
save_model(cb, 'cb_model_1')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9371,0.0,0.886,0.9352,0.9355,0.8941,0.8945
1,0.9341,0.0,0.879,0.9319,0.9322,0.8889,0.8895
2,0.9361,0.0,0.88,0.934,0.934,0.8921,0.8929
3,0.9328,0.0,0.8754,0.9304,0.9307,0.8866,0.8873
4,0.9343,0.0,0.8821,0.9323,0.9328,0.8895,0.8899
5,0.9354,0.0,0.8805,0.9331,0.9335,0.891,0.8917
6,0.9329,0.0,0.8765,0.9306,0.931,0.887,0.8876
7,0.9346,0.0,0.8802,0.9324,0.9328,0.8898,0.8904
8,0.9332,0.0,0.8789,0.9311,0.9315,0.8875,0.8881
9,0.9344,0.0,0.8788,0.9322,0.9325,0.8893,0.89


In [None]:
# Reload the model that was saved under the name 'cb_model_1'.
# NOTE(review): the name saved_lr suggests a logistic regression, but 'cb_model_1' is the
# CatBoost model; the variable is not used by any later cell shown here, and this cell has
# no execution count (In [None]).
saved_lr = load_model('cb_model_1')

In [67]:
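# Hyper-parameter tuning is left disabled (commented out); the score grid recorded below
# presumably comes from an earlier run of this call.
# tuned_cb = tune_model(cb)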

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.9356,0.0,0.8812,0.9335,0.9337,0.8914,0.892
1,0.9323,0.0,0.8748,0.9299,0.9302,0.8857,0.8865
2,0.9345,0.0,0.8758,0.9323,0.9322,0.8893,0.8903
3,0.9322,0.0,0.873,0.9297,0.9299,0.8855,0.8863
4,0.9322,0.0,0.8767,0.9299,0.9304,0.8858,0.8864
5,0.9345,0.0,0.8776,0.9322,0.9324,0.8895,0.8902
6,0.9339,0.0,0.8771,0.9316,0.9318,0.8885,0.8892
7,0.9333,0.0,0.8774,0.931,0.9313,0.8875,0.8881
8,0.9321,0.0,0.8751,0.9298,0.9301,0.8855,0.8862
9,0.9329,0.0,0.8753,0.9305,0.9308,0.8868,0.8875


In [73]:
# Produce the final CatBoost model used for test-set predictions and print its repr.
# Per PyCaret's documentation, finalize_model refits the estimator on the complete dataset
# passed to setup(), hold-out included.
cb_final = finalize_model(cb)
print(cb_final)

KeyError: "['class'] not in index"

In [79]:
# Workaround: useful_columns is built from train.columns and therefore contains 'class'.
# Without a dummy 'class' column, test[useful_columns] fails with
# KeyError: "['class'] not in index". The value 0 is a placeholder, not a real label.
test['class'] = 0

In [80]:
# Score the test set with the finalized model.
# useful_columns is derived from train.columns and still contains the 'class' target, which
# the test set does not have; selecting it is what raises KeyError: "['class'] not in index".
# The target is filtered out here so the prediction input is features-only and works whether
# or not a placeholder 'class' column has been added to test.
feature_columns = [column for column in useful_columns if column != 'class']
predictions = predict_model(cb_final, data=test[feature_columns])
predictions.head()

Unnamed: 0,u,g,r,i,z,redshift,dered_u,dered_g,dered_r,dered_i,...,M_r,M_i,M_z,M_dered_u,M_dered_g,M_dered_r,M_dered_i,M_dered_z,Label,Score
0,19.14305,18.0172,17.473255,17.114411,16.926293,0.121303,19.07495,17.95495,17.43085,17.08381,...,-83.036762,-83.395606,-83.583724,-81.435067,-82.555067,-83.079167,-83.426207,-83.601297,2,0.8524
1,18.74091,17.170473,16.369084,15.986452,15.730708,-6.7e-05,15.07033,14.30976,14.39011,14.516,...,-143.322053,-143.704686,-143.960429,-144.620807,-145.381377,-145.301027,-145.175137,-145.053977,0,0.9992
2,19.298048,17.880346,17.222135,16.809561,16.635239,0.11001,19.10036,17.77263,17.14601,16.74968,...,-82.849891,-83.262466,-83.436788,-80.971667,-82.299397,-82.926017,-83.322347,-83.503147,2,0.92
3,18.077467,17.156943,16.904111,16.8118,16.79552,3e-05,17.97294,17.07353,16.84696,16.76995,...,-42.60316,-42.695472,-42.711751,-41.534332,-42.433742,-42.660312,-42.737322,-42.740502,0,0.9972
4,17.832467,16.764587,16.213526,15.872181,15.662084,0.068471,17.75447,16.67943,16.15628,15.83145,...,-81.678392,-82.019737,-82.229834,-80.137448,-81.212488,-81.735638,-82.060468,-82.248568,2,0.9414


In [84]:
# Copy the predicted labels into the sample-submission frame and write it to CSV.
# The Series assignment aligns on index: submission has the default RangeIndex and test was
# reset to a RangeIndex when loaded; predictions presumably keeps test's index - confirm.
submission['class'] = predictions['Label']
# index=False keeps the row index out of the file; utf-8-sig writes a UTF-8 BOM.
submission.to_csv("submission_autoML1.csv",index=False, encoding='utf-8-sig')
submission.head()

Unnamed: 0,id,class
0,320000,2
1,320001,0
2,320002,2
3,320003,0
4,320004,2
