## 利用LSTM进行负荷预测

----------

程序架构：
1. 文件读取
2. 数据预处理
    - 转化为df
    - 归一化
    - 转化为监督学习df
    - 数据集分割(6:2:2)
3. 模型

In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import requests
import csv
import os
from datetime import datetime
import time

import tensorflow as tf
from keras.models import Sequential

from keras.layers import Dense
from keras.layers import LSTM

# Load and prepare data

In [82]:
# 读取数据

file_path = r'C:\Users\KAI\Source\kais_lstm_load_forecasting\真空泵空压机(A_10000151_1).csv'

# 第一行做列名(header)\第一列做索引(id)\解析第二列为日期
# 参考https://www.cnblogs.com/traditional/p/12514914.html
data_raw = pd.read_csv(file_path, header = 0)

values = data_raw.values # 转化为array
values[:,1].astype('float32') #调整数据格式

# 调整时间戳
data_raw['ts'] = pd.to_datetime(data_raw['ts'], unit='ms')
data_raw.index=data_raw['ts']
del data_raw['ts']

# normalize
scaler = MinMaxScaler(feature_range=(0, 1))
dt_scaled = scaler.fit_transform(data_raw)  # dt_scaled is now a numpy array

In [83]:
dt_scaled

array([[0.83414313],
       [0.83257415],
       [0.83206802],
       ...,
       [0.86683875],
       [0.92018423],
       [0.83373823]])

In [84]:
# convertseries to supervised learning

def series_to_supervised(data, n_in=1, n_out=1, drop_nan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    colums, names = [],[]
    
    # 输入序列 (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        colums.append(df.shift(i))
        names += [('var%d(t-%d)' % (j + 1, i)) for j in range(n_vars)]

    # 预测序列 (t, t+1, ... t+n)
    for i in range(0, n_out):
        colums.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j + 1)) for j in range(n_vars)]
        else: 
            names += [('var%d(t+%d)' % (j + 1, i)) for j in range(n_vars)]

    # put it all together
    agg = pd.concat(colums, axis=1)
    agg.columns = names

    # drop rows with NaN values
    if drop_nan:
        agg.dropna(inplace=True)

    return agg


In [85]:
df_dt_reframed = series_to_supervised(dt_scaled,2,2)

In [86]:
df_dt_reframed.values

array([[0.83414313, 0.83257415, 0.83206802, 0.83287782],
       [0.83257415, 0.83206802, 0.83287782, 0.78798461],
       [0.83206802, 0.83287782, 0.78798461, 0.78768094],
       ...,
       [0.79390627, 0.7887438 , 0.78813645, 0.86683875],
       [0.7887438 , 0.78813645, 0.86683875, 0.92018423],
       [0.78813645, 0.86683875, 0.92018423, 0.83373823]])

In [87]:
# 划分数据集
split_idx_1 = int(dt_reframed.shape[0] * 0.6)
split_idx_2 = int(dt_reframed.shape[0] * 0.8)
train_set, valid_set, test_set = df_dt_reframed.values[:split_idx_1, :], df_dt_reframed.values[split_idx_1:split_idx_2, :], df_dt_reframed.values[split_idx_2:, :]
'''训练集：train_set | 验证集：valid_set | 测试集：test_set'''

'训练集：train_set | 验证集：valid_set | 测试集：test_set'

In [97]:
# 划分输入输出（最后一列为输出）
train_X, train_Y = train_set[:, :-1], train_set[:, -1]
valid_X, valid_Y = valid_set[:, :-1], valid_set[:, -1]
test_X, test_Y = test_set[:, :-1], test_set[:, -1]
train_X

array([[0.83414313, 0.83257415, 0.83206802],
       [0.83257415, 0.83206802, 0.83287782],
       [0.83206802, 0.83287782, 0.78798461],
       ...,
       [0.78935115, 0.78884502, 0.79431117],
       [0.78884502, 0.79431117, 0.79137565],
       [0.79431117, 0.79137565, 0.79188177]])

In [100]:
# reshape input to be 3D [samples, timesteps, features]
train_X_3d = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X_3d = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
print(train_X_3d.shape, train_Y.shape, test_X_3d.shape, test_Y.shape)

(24454, 1, 3) (24454,) (8152, 1, 3) (8152,)


In [95]:
# 设计网络
model =Sequential()
model.add(LSTM(5, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')

# fitnetwork
history =model.fit(train_X, train_Y, epochs=50, batch_size=72, validation_data=(test_X,test_y), verbose=2, shuffle=False)

# evaluatethe model
scores =model.evaluate(test_X, test_Y)

IndexError: tuple index out of range