In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
train_df = pd.read_csv('./drive/MyDrive/LG_Aimer/train.csv')

train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

In [5]:
def nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,13): # ignore 'ID'
        rmse = mean_squared_error(np.array(gt)[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(np.array(gt)[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [44]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.3, random_state=10)


In [43]:
x_train

array([[[ 67.485   , 103.32    ,  69.47    , ..., 128.808712,
         134.3037  , 125.692806],
        [ 67.485   , 103.32    ,  66.37    , ..., 123.985599,
         133.586453, 127.820431],
        [ 64.425   , 103.32    ,  79.87    , ..., 131.546726,
         149.012471, 127.745869],
        ...,
        [ 69.524   , 103.321   ,  64.27    , ..., 132.877315,
         144.031117, 130.507931],
        [ 67.485   , 103.32    ,  64.87    , ..., 125.412929,
         135.170941, 128.05698 ],
        [ 67.485   , 103.32    ,  61.97    , ..., 136.309299,
         131.845906, 126.341496]]])

In [48]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(27724, 56, 1)
(11883, 56, 1)
(27724, 14)
(11883, 14)


In [45]:
x_train = x_train.to_numpy().reshape(x_train.shape[0], x_train.shape[1],1)
print(x_train.shape)
x_test = x_test.to_numpy().reshape(x_test.shape[0],x_test.shape[1],1)
print(x_test.shape)

(27724, 56, 1)
(11883, 56, 1)


In [58]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Bidirectional,TimeDistributed
from numpy import array
from numpy.random import uniform
from numpy import hstack
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.callbacks import EarlyStopping
from sklearn.model_selection import RepeatedKFold

input_layer = Input(shape=(56,1))

lstm = Bidirectional(
    LSTM(250),
    merge_mode='concat'
)(input_layer)

pred = Dense(14)(lstm)
model = Model(inputs=input_layer, outputs=pred)
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
model.summary()

# model.fit(X_train, Y_train, epochs = 20, batch_size = 32)

early_stopping = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 10, verbose = 1)
hist = model.fit(x_train, y_train, epochs = 1000, batch_size = 20, validation_data = (x_test, y_test),callbacks = [early_stopping])

Model: "model_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_19 (InputLayer)       [(None, 56, 1)]           0         
                                                                 
 bidirectional_17 (Bidirecti  (None, 500)              504000    
 onal)                                                           
                                                                 
 dense_18 (Dense)            (None, 14)                7014      
                                                                 
Total params: 511,014
Trainable params: 511,014
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 1

In [60]:
preds = model.predict(x_test)

lg_score = nrmse(y_test, preds)
print('NRMSE :', lg_score)

NRMSE : 2.0577477779854334


In [11]:
test_x = pd.read_csv('./drive/MyDrive/LG_Aimer/test.csv').drop(columns=['ID'])


In [15]:
preds = model.predict(test_x)

In [16]:
submit = pd.read_csv('./drive/MyDrive/LG_Aimer/sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')
submit.to_csv('./submit.csv', index=False)

Done.
