In [2]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
train_df = pd.read_csv('./drive/MyDrive/LG_Aimer/train.csv')

train_x = train_df.filter(regex='X') # Input : X Featrue
train_y = train_df.filter(regex='Y') # Output : Y Feature

In [5]:
def nrmse(gt, preds):
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    all_nrmse = []
    for idx in range(0,13): # ignore 'ID'
        rmse = mean_squared_error(np.array(gt)[:,idx], preds[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(np.array(gt)[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [10]:
test_x = pd.read_csv('./drive/MyDrive/LG_Aimer/test.csv').drop(columns=['ID'])


drop_sensors = ['X_04','X_23','X_47','X_48','X_13','X_15','X_14','X_17','X_18','X_45']
train_x = train_x.drop(labels = drop_sensors, axis=1, inplace=False)
test_x = test_x.drop(labels = drop_sensors, axis=1, inplace=False)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.fit_transform(test_x)
train_x = pd.DataFrame(train_x_scaled, index=train_x.index,columns = train_x.columns)
test_x = pd.DataFrame(test_x_scaled, index=test_x.index, columns= train_x.columns)

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.3, random_state=10)


In [None]:
x_train

array([[[ 67.485   , 103.32    ,  69.47    , ..., 128.808712,
         134.3037  , 125.692806],
        [ 67.485   , 103.32    ,  66.37    , ..., 123.985599,
         133.586453, 127.820431],
        [ 64.425   , 103.32    ,  79.87    , ..., 131.546726,
         149.012471, 127.745869],
        ...,
        [ 69.524   , 103.321   ,  64.27    , ..., 132.877315,
         144.031117, 130.507931],
        [ 67.485   , 103.32    ,  64.87    , ..., 125.412929,
         135.170941, 128.05698 ],
        [ 67.485   , 103.32    ,  61.97    , ..., 136.309299,
         131.845906, 126.341496]]])

In [13]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(27724, 46)
(11883, 46)
(27724, 14)
(11883, 14)


In [14]:
x_train = x_train.to_numpy().reshape(x_train.shape[0], x_train.shape[1],1)
print(x_train.shape)
x_test = x_test.to_numpy().reshape(x_test.shape[0],x_test.shape[1],1)
print(x_test.shape)

(27724, 46, 1)
(11883, 46, 1)


In [20]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input, Bidirectional,TimeDistributed
from numpy import array
from numpy.random import uniform
from numpy import hstack
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Model
from keras.callbacks import EarlyStopping
from sklearn.model_selection import RepeatedKFold

input_layer = Input(shape=(46,1))
lstm_1 = LSTM(64, return_sequences=True)(input_layer)
lstm_2 = Bidirectional(
    LSTM(64),
    merge_mode='concat'
)(lstm_1)
pred = Dense(14)(lstm_2)
model = Model(inputs=input_layer, outputs=pred)
model.compile(optimizer = 'adam', loss = 'mean_squared_error')
model.summary()

# model.fit(X_train, Y_train, epochs = 20, batch_size = 32)

early_stopping = EarlyStopping(monitor = 'val_loss', mode = 'min', patience = 10, verbose = 1)
hist = model.fit(x_train, y_train, epochs = 1000, batch_size = 20, validation_data = (x_test, y_test),callbacks = [early_stopping])

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 46, 1)]           0         
                                                                 
 lstm_6 (LSTM)               (None, 46, 64)            16896     
                                                                 
 bidirectional_4 (Bidirectio  (None, 128)              66048     
 nal)                                                            
                                                                 
 dense_3 (Dense)             (None, 14)                1806      
                                                                 
Total params: 84,750
Trainable params: 84,750
Non-trainable params: 0
_________________________________________________________________
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoc

In [21]:
preds = model.predict(x_test)

lg_score = nrmse(y_test, preds)
print('NRMSE :', lg_score)

NRMSE : 2.025684297792766


In [None]:
test_x = pd.read_csv('./drive/MyDrive/LG_Aimer/test.csv').drop(columns=['ID'])


In [None]:
preds = model.predict(test_x)

In [None]:
submit = pd.read_csv('./drive/MyDrive/LG_Aimer/sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
print('Done.')
submit.to_csv('./submit.csv', index=False)

Done.
