In [1]:
import numpy as np
import math
import pandas as pd
import random as rd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from datetime import datetime

from keras.utils.np_utils import to_categorical

from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras.callbacks import EarlyStopping

%matplotlib inline

In [2]:
# load data
train = pd.read_csv("./train.csv", index_col=0)
test = pd.read_csv("./test.csv", index_col=0).reset_index(drop=True)

In [3]:
train = train.dropna()
train = train.drop(train[train['dered_g']==-9999].index)
train = train.drop(train[train['dered_i']==-9999].index)
train = train.drop(train[train['dered_z']==-9999].index)

In [4]:
train['u-g'] = train['u'] - train['g']
train['g-r'] = train['g'] - train['r']
train['r-i'] = train['r'] - train['i']
train['i-z'] = train['i'] - train['z']
train['dered_u-g'] = train['dered_u'] - train['dered_g']
train['dered_g-r'] = train['dered_g'] - train['dered_r']
train['dered_r-i'] = train['dered_r'] - train['dered_i']
train['dered_i-z'] = train['dered_i'] - train['dered_z']

test['u-g'] = test['u'] - test['g']
test['g-r'] = test['g'] - test['r']
test['r-i'] = test['r'] - test['i']
test['i-z'] = test['i'] - test['z']
test['dered_u-g'] = test['dered_u'] - test['dered_g']
test['dered_g-r'] = test['dered_g'] - test['dered_r']
test['dered_r-i'] = test['dered_r'] - test['dered_i']
test['dered_i-z'] = test['dered_i'] - test['dered_z']

In [5]:
X = train[['u', 'dered_u', 'airmass_u', 'g', 'dered_g', 'airmass_g', 'r', 'dered_r', 'airmass_r',
           'i', 'dered_i', 'airmass_i', 'z', 'dered_z', 'airmass_z',
           'u-g', 'dered_u-g', 'g-r', 'dered_g-r', 'r-i', 'dered_r-i', 'i-z', 'dered_i-z']]

T = test[['u', 'dered_u', 'airmass_u', 'g', 'dered_g', 'airmass_g', 'r', 'dered_r', 'airmass_r',
           'i', 'dered_i', 'airmass_i', 'z', 'dered_z', 'airmass_z',
           'u-g', 'dered_u-g', 'g-r', 'dered_g-r', 'r-i', 'dered_r-i', 'i-z', 'dered_i-z']]

In [6]:
# scale features
scaler = RobustScaler()
scaled_train = scaler.fit_transform(X)
scaled_train = pd.DataFrame(data=scaled_train, columns=X.columns)
scaled_test = scaler.transform(T)
scaled_test = pd.DataFrame(data=scaled_test, columns=T.columns)

In [7]:
ugriz_train = np.array(scaled_train[['u', 'dered_u', 'airmass_u', 'g', 'dered_g', 'airmass_g',
                                     'r', 'dered_r', 'airmass_r', 'i', 'dered_i', 'airmass_i',
                                     'z', 'dered_z', 'airmass_z']]).reshape(scaled_train.shape[0], 15, 1)
color_train = np.array(scaled_train[['u-g', 'dered_u-g', 'g-r', 'dered_g-r', 'r-i', 'dered_r-i',
                                     'i-z', 'dered_i-z']]).reshape(scaled_train.shape[0], 8, 1)

ugriz_test = np.array(scaled_test[['u', 'dered_u', 'airmass_u', 'g', 'dered_g', 'airmass_g',
                                   'r', 'dered_r', 'airmass_r', 'i', 'dered_i', 'airmass_i',
                                   'z', 'dered_z', 'airmass_z']]).reshape(scaled_test.shape[0], 15, 1)
color_test = np.array(scaled_test[['u-g', 'dered_u-g', 'g-r', 'dered_g-r', 'r-i', 'dered_r-i',
                                   'i-z', 'dered_i-z']]).reshape(scaled_test.shape[0], 8, 1)

labels = to_categorical(train['class'], num_classes=3)

In [8]:
def rnn(X, inputshape, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=7)
    
    model = Sequential()
    model.add(LSTM(16, input_shape=(inputshape, 1), dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.05))
    model.add(Dense(8, activation='softmax'))
    model.add(Dropout(0.05))
    model.add(Dense(3, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 5
    batch_size = 32

    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
    
    score,acc = model.evaluate(X_val, y_val, verbose = 2, batch_size = batch_size)
    print("score: %.3f" % (score))
    print("acc: %.3f" % (acc))
    
    return model

In [9]:
ugriz_model = rnn(ugriz_train, 15, labels)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
2000/2000 - 5s - loss: 0.5612 - accuracy: 0.7790
score: 0.561
acc: 0.779


In [10]:
pred = ugriz_model.predict(ugriz_train)
ugriz_recovery = np.argmax(pred, axis=1).reshape(-1, 1)
submission['ugriz_train'] = ugriz_recovery

ValueError: Length of values (319996) does not match length of index (80000)

In [None]:
pred = ugriz_model.predict(ugriz_test)
ugriz_recovery = np.argmax(pred, axis=1).reshape(-1, 1)
submission['ugriz_test'] = ugriz_recovery

In [None]:
color_model = rnn(color_train, 8, labels)

In [None]:
pred = color_model.predict(color_train)
color_recovery = np.argmax(pred, axis=1).reshape(-1, 1)
submission['color_train'] = color_recovery

In [None]:
pred = color_model.predict(color_test)
color_recovery = np.argmax(pred, axis=1).reshape(-1, 1)
submission['color_test'] = color_recovery

In [None]:
rnn_result = submission.drop(['id', 'class'], axis=1)
rnn_result.to_csv("rnn_result.csv",index=False, encoding='utf-8-sig')