## 利用LSTM进行负荷预测

----------

程序架构：
1. 文件读取
2. 数据预处理
    - 转化为df
    - 归一化
    - 转化为监督学习df
    - 数据集分割(6:2:2)
3. 模型

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import requests
import csv
import os
from datetime import datetime
import time

import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout

INFO:tensorflow:Enabling eager execution
INFO:tensorflow:Enabling v2 tensorshape
INFO:tensorflow:Enabling resource variables
INFO:tensorflow:Enabling tensor equality
INFO:tensorflow:Enabling control flow v2


# Load and prepare data

In [2]:
# 读取数据

file_path = r'C:\Users\KAI\Source\kais_lstm_load_forecasting\真空泵空压机(A_10000151_1).csv'

# 第一行做列名(header)\第一列做索引(id)\解析第二列为日期
# 参考https://www.cnblogs.com/traditional/p/12514914.html
data_raw = pd.read_csv(file_path, header = 0)

values = data_raw.values # 转化为array
values[:,1].astype('float32') #调整数据格式

# 调整时间戳
data_raw['ts'] = pd.to_datetime(data_raw['ts'], unit='ms')
data_raw.index=data_raw['ts']
del data_raw['ts']

# normalize
scaler = MinMaxScaler(feature_range=(0, 1))
dt_scaled = scaler.fit_transform(data_raw)  # dt_scaled is now a numpy array

In [3]:
dt_scaled

array([[0.83414313],
       [0.83257415],
       [0.83206802],
       ...,
       [0.86683875],
       [0.92018423],
       [0.83373823]])

In [4]:
# convertseries to supervised learning

def series_to_supervised(data, n_in=1, n_out=1, drop_nan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    colums, names = [],[]
    
    # 输入序列 (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        colums.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]

    # 预测序列 (t, t+1, ... t+n)
    for i in range(0, n_out):
        colums.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else: 
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(colums, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if drop_nan:
        agg.dropna(inplace=True)

    return agg


In [5]:
# 用前十步预测当前
df_dt_reframed = series_to_supervised(dt_scaled,10,1) 

In [6]:
df_dt_reframed.head

<bound method NDFrame.head of        var1(t-10)  var1(t-9)  var1(t-8)  var1(t-7)  var1(t-6)  var1(t-5)  \
10       0.834143   0.832574   0.832068   0.832878   0.787985   0.787681   
11       0.832574   0.832068   0.832878   0.787985   0.787681   0.789149   
12       0.832068   0.832878   0.787985   0.787681   0.789149   0.790161   
13       0.832878   0.787985   0.787681   0.789149   0.790161   0.787428   
14       0.787985   0.787681   0.789149   0.790161   0.787428   0.791426   
...           ...        ...        ...        ...        ...        ...   
40756    0.795020   0.791173   0.795425   0.792337   0.791679   0.793754   
40757    0.791173   0.795425   0.792337   0.791679   0.793754   0.790768   
40758    0.795425   0.792337   0.791679   0.793754   0.790768   0.792236   
40759    0.792337   0.791679   0.793754   0.790768   0.792236   0.791578   
40760    0.791679   0.793754   0.790768   0.792236   0.791578   0.793906   

       var1(t-4)  var1(t-3)  var1(t-2)  var1(t-1)   var1(

In [7]:
# 划分数据集
split_idx_1 = int(df_dt_reframed.values.shape[0] * 0.6)
split_idx_2 = int(df_dt_reframed.values.shape[0] * 0.8)
train_set, valid_set, test_set = df_dt_reframed.values[:split_idx_1, :], df_dt_reframed.values[split_idx_1:split_idx_2, :], df_dt_reframed.values[split_idx_2:, :]
'''训练集：train_set | 验证集：valid_set | 测试集：test_set'''

'训练集：train_set | 验证集：valid_set | 测试集：test_set'

In [8]:
# 划分输入输出（最后一列为输出/待预测值）
train_X, train_Y = train_set[:, :-1], train_set[:, -1]
valid_X, valid_Y = valid_set[:, :-1], valid_set[:, -1]
test_X, test_Y = test_set[:, :-1], test_set[:, -1]
train_X.shape

(24450, 10)

In [9]:
# reshape input to be 3D [samples, timesteps, dim]
train_X = train_X.reshape((-1, train_X.shape[1], 1))
valid_X = valid_X.reshape((-1, valid_X.shape[1], 1))
test_X = test_X.reshape((-1, test_X.shape[1], 1))
print(train_X)

[[[0.83414313]
  [0.83257415]
  [0.83206802]
  ...
  [0.79016095]
  [0.78742788]
  [0.79142626]]

 [[0.83257415]
  [0.83206802]
  [0.83287782]
  ...
  [0.78742788]
  [0.79142626]
  [0.78904747]]

 [[0.83206802]
  [0.83287782]
  [0.78798461]
  ...
  [0.79142626]
  [0.78904747]
  [0.79264096]]

 ...

 [[0.79947363]
  [0.79613321]
  [0.794463  ]
  ...
  [0.79137565]
  [0.79188177]
  [0.79152748]]

 [[0.79613321]
  [0.794463  ]
  [0.79081891]
  ...
  [0.79188177]
  [0.79152748]
  [0.79056585]]

 [[0.794463  ]
  [0.79081891]
  [0.78935115]
  ...
  [0.79152748]
  [0.79056585]
  [0.7907683 ]]]


In [None]:
# 设计网络
model = Sequential()

model.add(LSTM(units = 50, return_sequences = True, input_shape = (train_X.shape[1], 1)))
model.add(Dropout(0.2))

model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(units = 50, return_sequences = True))
model.add(Dropout(0.2))

model.add(LSTM(units = 50))
model.add(Dropout(0.2))

model.add(Dense(units = 1))
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

model.fit(train_X, train_Y, epochs = 20, batch_size = 100)
# # fitnetwork
# history =model.fit(train_X, train_Y, epochs=50, batch_size=100, validation_data=(valid_X,valid_Y), verbose=2, shuffle=False)

# # evaluatethe model
# scores =model.evaluate(test_X, test_Y)
# #print scores
# #lcd print("\n\n\t%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
model.summary()

In [None]:
#预测
predicted = model.predict(test_X)
#对预测数据还原。
predicted = scaler.inverse_transform(predicted)

real_value = scaler.inverse_transform(test_Y)

In [None]:
plt.plot(real_value, color = 'red' , label = 'real_value')
plt.plot(predicted, color = 'blue', label = 'predicted')
plt.title('load predict')
plt.xlabel('Time')
plt.ylabel('Load Value')
plt.legend()
plt.show()

In [None]:
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# invert scaling for forecast
#inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat,0)
inv_yhat = inv_yhat[:,0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
#pyplot.plot(history.history['yhat'], label='actual')
#pyplot.plot(history.history['inv_yhat'], label='forecast')
#pyplot.legend()
#pyplot.show()