---
layout: post
title: "시계열 데이터 - LSTM"
author: "Chanjun Kim"
categories: Data분석
tags: [Data, TimeSeries, ARIMA, LSTM, BOOSTING, REGRESSION, 시계열데이터, 시계열분석]
image: 05_timeseries.png
---

## **학습목적**
시계열 데이터를 다루는 법과 시계열 예측을 하기 위한 여러 가지 모델을 사용해보고 특성을 이해한다.<br>
이 포스팅에선 시계열 데이터의 대표적인 딥러닝 기법인 LSTM에 대해서 설명한다.

> 이 글은 LSTM에 대한 글이므로 EDA에 대한 글은 따로 포스팅하겠습니다.

In [32]:
import os
import sys
import warnings
from tqdm import tqdm

import itertools
import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
import plotnine as p9
import seaborn as sns

import scipy
# `stats` is a SciPy submodule, not a top-level package.
from scipy import stats
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_absolute_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, LSTM, GRU, RNN, Reshape

In [3]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

In [4]:
# Global plot defaults: do not use the Unicode minus glyph on axes, use the
# NanumMyeongjo font family (presumably chosen so Korean labels render), and
# make figures 10 x 10 inches.
mpl.rcParams.update({
    "axes.unicode_minus": False,
    "font.family": "NanumMyeongjo",
    "figure.figsize": (10, 10),
})

In [8]:
# Load the training table; the CSV is decoded as CP949 because its column
# names contain Korean text.
train = pd.read_csv("data/dacon/energy/train.csv", encoding = "cp949")
# Preview the first five rows.
train.head()

Unnamed: 0,num,date_time,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유
0,1,2020-06-01 00,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0
1,1,2020-06-01 01,8135.64,17.7,2.9,91.0,0.3,0.0,0.0,0.0
2,1,2020-06-01 02,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0
3,1,2020-06-01 03,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0
4,1,2020-06-01 04,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122400 entries, 0 to 122399
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   num         122400 non-null  int64  
 1   date_time   122400 non-null  object 
 2   전력사용량(kWh)  122400 non-null  float64
 3   기온(°C)      122400 non-null  float64
 4   풍속(m/s)     122400 non-null  float64
 5   습도(%)       122400 non-null  float64
 6   강수량(mm)     122400 non-null  float64
 7   일조(hr)      122400 non-null  float64
 8   비전기냉방설비운영   122400 non-null  float64
 9   태양광보유       122400 non-null  float64
dtypes: float64(8), int64(1), object(1)
memory usage: 9.3+ MB


In [10]:
# Load the test table with the same CP949 decoding as the training file.
test = pd.read_csv("data/dacon/energy/test.csv", encoding = "cp949")
# Preview the first five rows.
test.head()

Unnamed: 0,num,date_time,기온(°C),풍속(m/s),습도(%),"강수량(mm, 6시간)","일조(hr, 3시간)",비전기냉방설비운영,태양광보유
0,1,2020-08-25 00,27.8,1.5,74.0,0.0,0.0,,
1,1,2020-08-25 01,,,,,,,
2,1,2020-08-25 02,,,,,,,
3,1,2020-08-25 03,27.3,1.1,78.0,,0.0,,
4,1,2020-08-25 04,,,,,,,


In [11]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10080 entries, 0 to 10079
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   num           10080 non-null  int64  
 1   date_time     10080 non-null  object 
 2   기온(°C)        3360 non-null   float64
 3   풍속(m/s)       3360 non-null   float64
 4   습도(%)         3360 non-null   float64
 5   강수량(mm, 6시간)  1680 non-null   float64
 6   일조(hr, 3시간)   3360 non-null   float64
 7   비전기냉방설비운영     2296 non-null   float64
 8   태양광보유         1624 non-null   float64
dtypes: float64(7), int64(1), object(1)
memory usage: 708.9+ KB


In [12]:
# How many distinct `num` ids appear in each split.
print(train["num"].nunique())
print(test["num"].nunique())

# Rows per `num` id, train and test side by side (concat aligns on the id).
rows_per_num = pd.concat(
    [train["num"].value_counts().sort_index(), test["num"].value_counts()],
    axis=1,
)
print(rows_per_num.head())

60
60
    num  num
1  2040  168
2  2040  168
3  2040  168
4  2040  168
5  2040  168


In [14]:
# Windowing and training hyper-parameters shared by the cells below.
input_window = 996   # length of each input sequence (arbitrary choice)
output_window = 24   # steps predicted per model call; 168 = 7 days * 24 hours is the full horizon
window = 12          # step between the starts of consecutive training windows
num_features = 1     # the baseline uses a single feature
num_power = 60       # number of series stacked along the first axis of train_x
end_ = 168           # total number of steps to forecast
lstm_units = 32
dropout = 0.2
EPOCH = 30
BATCH_SIZE = 128

In [69]:
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

In [215]:
# Import from tensorflow.keras, like every other Keras import in this file,
# instead of the standalone `keras` package.
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np

# Toy series 0..49 as a single-feature column; the targets are the same values.
data = np.array([[i] for i in range(50)])
targets = np.array([[i] for i in range(50)])

# Each sample is 10 consecutive rows, sample starts are 5 rows apart, and the
# target is the row immediately after the window (see the printed batches).
data_gen = TimeseriesGenerator(
    data,
    targets,
    length=10,
    sampling_rate=1,
    stride=5,
    batch_size=2,
)

In [216]:
data_gen.to_json()

'{"class_name": "TimeseriesGenerator", "config": {"data": "[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49]]", "targets": "[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12], [13], [14], [15], [16], [17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27], [28], [29], [30], [31], [32], [33], [34], [35], [36], [37], [38], [39], [40], [41], [42], [43], [44], [45], [46], [47], [48], [49]]", "length": 10, "sampling_rate": 1, "stride": 5, "start_index": 10, "end_index": 49, "shuffle": false, "reverse": false, "batch_size": 2}}'

In [187]:
print(data_gen[0][0])
print(data_gen[0][1])

[[[ 0]
  [ 1]
  [ 2]
  [ 3]
  [ 4]
  [ 5]
  [ 6]
  [ 7]
  [ 8]
  [ 9]]

 [[ 5]
  [ 6]
  [ 7]
  [ 8]
  [ 9]
  [10]
  [11]
  [12]
  [13]
  [14]]]
[[10]
 [15]]


In [188]:
print(data_gen[1][0])
print(data_gen[1][1])

[[[10]
  [11]
  [12]
  [13]
  [14]
  [15]
  [16]
  [17]
  [18]
  [19]]

 [[15]
  [16]
  [17]
  [18]
  [19]
  [20]
  [21]
  [22]
  [23]
  [24]]]
[[20]
 [25]]


In [307]:
# Import from tensorflow.keras, like every other Keras import in this file,
# instead of the standalone `keras` package.
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np

# Rows belonging to num == 1 only. The row index does not survive the
# conversion to an array, so no reset_index is needed.
subset = train.loc[train.num == 1]
# Inputs: columns 0 and 2 (`num` and power usage); target: column 2 only.
data = np.array(subset.iloc[:, [0, 2]])
targets = np.array(subset.iloc[:, [2]])
# Windows of 10 consecutive rows, two samples per batch.
data_gen = TimeseriesGenerator(data, targets, length=10, batch_size=2)

In [361]:
# `num` id of the series to window.
x = 1
selected = train[train.num == x].reset_index(drop=True)
# Inputs: columns 0 and 2 (`num` and power usage); target: column 2 only.
data = np.array(selected.iloc[:, [0, 2]])
targets = np.array(selected.iloc[:, [2]])
# batch_size=2030 presumably puts every length-10 window of the 2040-row
# series into one batch.
data_gen = TimeseriesGenerator(data, targets, batch_size=2030, length=10)
    

In [366]:
data_gen[0]

(array([[[1.000000e+00, 8.179056e+03],
         [1.000000e+00, 8.135640e+03],
         [1.000000e+00, 8.107128e+03],
         ...,
         [1.000000e+00, 8.019000e+03],
         [1.000000e+00, 8.020944e+03],
         [1.000000e+00, 8.083152e+03]],
 
        [[1.000000e+00, 8.135640e+03],
         [1.000000e+00, 8.107128e+03],
         [1.000000e+00, 8.048808e+03],
         ...,
         [1.000000e+00, 8.020944e+03],
         [1.000000e+00, 8.083152e+03],
         [1.000000e+00, 8.116200e+03]],
 
        [[1.000000e+00, 8.107128e+03],
         [1.000000e+00, 8.048808e+03],
         [1.000000e+00, 8.043624e+03],
         ...,
         [1.000000e+00, 8.083152e+03],
         [1.000000e+00, 8.116200e+03],
         [1.000000e+00, 8.104536e+03]],
 
        ...,
 
        [[1.000000e+00, 8.714952e+03],
         [1.000000e+00, 8.717544e+03],
         [1.000000e+00, 8.727912e+03],
         ...,
         [1.000000e+00, 8.747352e+03],
         [1.000000e+00, 8.714952e+03],
         [1.000000e+00,

In [346]:
len(data_gen)

1015

In [322]:
data_gen[0][0]

array([[[1.000000e+00, 8.179056e+03],
        [1.000000e+00, 8.135640e+03],
        [1.000000e+00, 8.107128e+03],
        [1.000000e+00, 8.048808e+03],
        [1.000000e+00, 8.043624e+03],
        [1.000000e+00, 8.010576e+03],
        [1.000000e+00, 7.978176e+03],
        [1.000000e+00, 8.019000e+03],
        [1.000000e+00, 8.020944e+03],
        [1.000000e+00, 8.083152e+03]],

       [[1.000000e+00, 8.135640e+03],
        [1.000000e+00, 8.107128e+03],
        [1.000000e+00, 8.048808e+03],
        [1.000000e+00, 8.043624e+03],
        [1.000000e+00, 8.010576e+03],
        [1.000000e+00, 7.978176e+03],
        [1.000000e+00, 8.019000e+03],
        [1.000000e+00, 8.020944e+03],
        [1.000000e+00, 8.083152e+03],
        [1.000000e+00, 8.116200e+03]]])

In [324]:
data_gen[0][1]

array([[8116.2  ],
       [8104.536]])

In [296]:
import json

In [337]:
# to_json() stores the data array as a JSON string inside the config, so it is
# decoded twice: once for the outer document and once for the embedded array.
np.array(json.loads(json.loads(data_gen.to_json())["config"]["data"])).shape

(2040, 2)

In [344]:
# Walk the generator batch by batch and print each (inputs, targets) pair.
# The loop variable is not called `x`, so the global `x` set earlier is kept.
for batch in data_gen:
    print(batch)

(array([[[1.000000e+00, 8.179056e+03],
        [1.000000e+00, 8.135640e+03],
        [1.000000e+00, 8.107128e+03],
        [1.000000e+00, 8.048808e+03],
        [1.000000e+00, 8.043624e+03],
        [1.000000e+00, 8.010576e+03],
        [1.000000e+00, 7.978176e+03],
        [1.000000e+00, 8.019000e+03],
        [1.000000e+00, 8.020944e+03],
        [1.000000e+00, 8.083152e+03]],

       [[1.000000e+00, 8.135640e+03],
        [1.000000e+00, 8.107128e+03],
        [1.000000e+00, 8.048808e+03],
        [1.000000e+00, 8.043624e+03],
        [1.000000e+00, 8.010576e+03],
        [1.000000e+00, 7.978176e+03],
        [1.000000e+00, 8.019000e+03],
        [1.000000e+00, 8.020944e+03],
        [1.000000e+00, 8.083152e+03],
        [1.000000e+00, 8.116200e+03]]]), array([[8116.2  ],
       [8104.536]]))
(array([[[1.000000e+00, 8.107128e+03],
        [1.000000e+00, 8.048808e+03],
        [1.000000e+00, 8.043624e+03],
        [1.000000e+00, 8.010576e+03],
        [1.000000e+00, 7.978176e+03],
   

In [327]:
# Decode the stored data array and cut it into consecutive, non-overlapping
# chunks of 10 rows x 2 columns, then show the resulting shape.
np.reshape(np.array(json.loads(json.loads(data_gen.to_json())["config"]["data"])), (-1, 10, 2)).shape

(204, 10, 2)

In [320]:
np.array(json.loads(json.loads(data_gen.to_json())["config"]["targets"])).shape

(2040, 1)

In [202]:
# NOTE(review): the output shown below contains rows with num == 1 and
# num == 2 inside the same window, so it presumably comes from an earlier
# session in which data_gen was built over the whole `train` frame rather
# than a single `num` - confirm before relying on it.
data_gen[1019][0]

array([[[1.000000e+00, 8.725968e+03],
        [1.000000e+00, 8.705232e+03],
        [2.000000e+00, 9.771840e+02],
        [2.000000e+00, 9.661680e+02],
        [2.000000e+00, 9.729720e+02],
        [2.000000e+00, 9.658440e+02],
        [2.000000e+00, 9.716760e+02],
        [2.000000e+00, 9.726480e+02],
        [2.000000e+00, 1.041336e+03],
        [2.000000e+00, 1.187136e+03]],

       [[1.000000e+00, 8.705232e+03],
        [2.000000e+00, 9.771840e+02],
        [2.000000e+00, 9.661680e+02],
        [2.000000e+00, 9.729720e+02],
        [2.000000e+00, 9.658440e+02],
        [2.000000e+00, 9.716760e+02],
        [2.000000e+00, 9.726480e+02],
        [2.000000e+00, 1.041336e+03],
        [2.000000e+00, 1.187136e+03],
        [2.000000e+00, 1.394820e+03]]])

In [158]:
# NOTE(review): the 1-D targets shown below differ from the (2, 1) targets
# printed for the num == 1 generator, so this output presumably belongs to an
# earlier generator configuration - confirm.
data_gen[0][1]

array([8116.2  , 1500.444])

In [15]:
# Take column 2 (power usage) and fold it into
# (num_power, 24 * 85, num_features).
# Assumes the rows are ordered by `num` and then by time - TODO confirm.
power_usage = train.iloc[:, 2].values
train_x = tf.reshape(power_usage, [num_power, 24 * 85, num_features])

In [18]:
train_x.shape

TensorShape([60, 2040, 1])

In [17]:
# Pre-allocate zero-filled buffers with one slot per sliding window: inputs of
# input_window steps and targets of output_window steps.
n_series = train_x.shape[0]
n_windows = (train_x.shape[1] - (input_window + output_window)) // window
train_window_x = np.zeros((n_series, n_windows, input_window, num_features))
train_window_y = np.zeros((n_series, n_windows, output_window, num_features))
print(f'train_window_x.shape:{train_window_x.shape}')
print(f'train_window_y.shape:{train_window_y.shape}')

train_window_x.shape:(60, 85, 996, 1)
train_window_y.shape:(60, 85, 24, 1)


In [20]:
# Slide over every series in steps of `window`: the input is input_window
# steps starting at `start`, the target is the output_window steps right after.
last_start = train_x.shape[1] - (input_window + output_window)
for example in range(train_x.shape[0]):
    for slot, start in enumerate(range(0, last_start, window)):
        end = start + input_window
        train_window_x[example, slot, :] = train_x[example, start:end, :]
        train_window_y[example, slot, :] = train_x[example, end:end + output_window, :]

In [24]:
85*996

84660

In [22]:
train_x

<tf.Tensor: shape=(60, 2040, 1), dtype=float64, numpy=
array([[[8179.056   ],
        [8135.64    ],
        [8107.128   ],
        ...,
        [8730.504   ],
        [8725.968   ],
        [8705.232   ]],

       [[ 977.184   ],
        [ 966.168   ],
        [ 972.972   ],
        ...,
        [1214.028   ],
        [1126.224   ],
        [1107.432   ]],

       [[3183.624   ],
        [3171.636   ],
        [3175.74    ],
        ...,
        [3569.301794],
        [3549.66049 ],
        [3465.782444]],

       ...,

       [[ 580.608   ],
        [ 538.164   ],
        [ 529.254   ],
        ...,
        [ 915.3     ],
        [ 691.578   ],
        [ 690.444   ]],

       [[ 572.184   ],
        [ 574.128   ],
        [ 910.44    ],
        ...,
        [1003.104   ],
        [1110.672   ],
        [ 985.284   ]],

       [[2528.496   ],
        [2243.808   ],
        [2284.848   ],
        ...,
        [3572.208   ],
        [3299.184   ],
        [3204.576   ]]])>

In [23]:
print(train_window_x.shape)
train_window_x

(60, 85, 996, 1)


array([[[[8179.056    ],
         [8135.64     ],
         [8107.128    ],
         ...,
         [8438.904    ],
         [8430.48     ],
         [8449.272    ]],

        [[8088.984    ],
         [8102.592    ],
         [8088.336    ],
         ...,
         [8458.992    ],
         [8468.712    ],
         [8451.216    ]],

        [[7920.504    ],
         [7890.048    ],
         [7868.016    ],
         ...,
         [8479.08     ],
         [8493.984    ],
         [8487.504    ]],

        ...,

        [[8494.632    ],
         [8482.32     ],
         [8477.784    ],
         ...,
         [8598.96     ],
         [8635.896    ],
         [8685.144    ]],

        [[8457.696    ],
         [8462.88     ],
         [8467.416    ],
         ...,
         [8568.504    ],
         [8565.912    ],
         [8566.56     ]],

        [[8447.976    ],
         [8436.96     ],
         [8415.576    ],
         ...,
         [8538.696    ],
         [8532.864    ],
         [8563.96

In [26]:
print(train_window_y.shape)
train_window_y

(60, 85, 24, 1)


array([[[[8457.696    ],
         [8462.88     ],
         [8467.416    ],
         ...,
         [8479.08     ],
         [8493.984    ],
         [8487.504    ]],

        [[8447.976    ],
         [8436.96     ],
         [8415.576    ],
         ...,
         [8481.024    ],
         [8468.712    ],
         [8466.768    ]],

        [[8485.56     ],
         [8474.544    ],
         [8495.928    ],
         ...,
         [8425.944    ],
         [8475.84     ],
         [8481.024    ]],

        ...,

        [[8705.232    ],
         [8652.096    ],
         [8593.776    ],
         ...,
         [8538.696    ],
         [8532.864    ],
         [8563.968    ]],

        [[8551.008    ],
         [8519.256    ],
         [8521.848    ],
         ...,
         [8595.072    ],
         [8583.408    ],
         [8566.56     ]],

        [[8576.28     ],
         [8563.968    ],
         [8582.76     ],
         ...,
         [8624.88     ],
         [8670.24     ],
         [8714.95

In [42]:
# Merge the leading (series, window) axes into one sample axis so that the
# arrays have the (samples, timesteps, features) layout fed to the LSTM.
new_train_x = tf.reshape(train_window_x, (-1, input_window, num_features))
new_train_y = tf.reshape(train_window_y, (-1, output_window, num_features))
print(f'new_train_x.shape:{new_train_x.shape}')
print(f'new_train_y.shape:{new_train_y.shape}')

new_train_x.shape:(5100, 996, 1)
new_train_y.shape:(5100, 24, 1)


In [33]:
# One LSTM layer that returns only its final state, a zero-initialised dense
# projection to output_window * num_features values, and a reshape of that
# vector into (output_window, num_features).
model = Sequential()
model.add(LSTM(lstm_units, return_sequences=False, recurrent_dropout=dropout))
model.add(Dense(output_window * num_features, kernel_initializer=tf.initializers.zeros()))
model.add(Reshape([output_window, num_features]))

In [37]:
# Optimise mean absolute error with Adam and also report it as a metric.
model.compile(optimizer='adam', loss='mae', metrics=['mae'])
# NOTE(review): the original comment here said a dot is printed at the end of
# every epoch to show training progress, but no such callback is defined in
# the cells shown.

In [39]:
# Checkpoint to lstm_model.h5, keeping only the model with the best val_loss.
save_best_only=tf.keras.callbacks.ModelCheckpoint(filepath="lstm_model.h5", monitor='val_loss', save_best_only=True)

In [40]:
# Stop training once val_loss has failed to improve for 20 epochs.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=20)

In [46]:
# `reduceLR` is passed to fit() but is not defined in any cell shown in this
# post, so a fresh run fails with NameError. Define it here.
# NOTE(review): factor/patience are the Keras defaults; the settings used for
# the recorded run are not shown - adjust if the original values are known.
reduceLR = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10)

# Hold out 20% of the samples (validation_split) to produce the val_loss that
# all three callbacks monitor.
model.fit(new_train_x, new_train_y, epochs=EPOCH, batch_size=BATCH_SIZE, validation_split = 0.2, verbose=1,
          callbacks=[early_stop, save_best_only , reduceLR])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x263aba800a0>

In [47]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 32)                4352      
_________________________________________________________________
dense_1 (Dense)              (None, 24)                792       
_________________________________________________________________
reshape (Reshape)            (None, 24, 1)             0         
Total params: 5,144
Trainable params: 5,144
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Recursive multi-step forecast: predict output_window steps at a time and
# feed each forecast back in as input until end_ steps are covered.
# Any remainder of end_ / output_window would be left as zeros.
prediction = np.zeros((num_power, end_, num_features))
new_test_x = train_x

for i in range(end_ // output_window):
    start_ = i * output_window
    # Forecast the next output_window steps from the latest input_window steps.
    next_ = model.predict(new_test_x[:, -input_window:, :])
    # Append the forecast so the next iteration's input window slides over it.
    new_test_x = tf.concat([new_test_x, next_], axis=1)
    print(new_test_x.shape)
    prediction[:, start_:start_ + output_window, :] = next_

(60, 2064, 1)
(60, 2088, 1)
(60, 2112, 1)
(60, 2136, 1)
(60, 2160, 1)
(60, 2184, 1)
(60, 2208, 1)


In [51]:
prediction.shape

(60, 168, 1)

In [62]:
test.shape

(10080, 9)

---

참고 자료 : 
- https://dacon.io/competitions/official/235736/codeshare/2628?page=1&dtype=recent
- https://byeongkijeong.github.io/ARIMA-with-Python/
- https://otexts.com/fppkr/arima-estimation.html