# LSTM prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras

from keras.models import Sequential
from keras.layers import LSTM, Dense

from sklearn.metrics import mean_squared_error

## Loading data

In [None]:
# Training split: keep the full frame for inspection and pull out the target.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_train = pd.read_pickle(r"../input/train.pkl")
y_train = df_train.loc[:, "count"]
df_train.head()

In [None]:
# Keep only the features whose absolute correlation with the target ("count")
# exceeds the threshold `x`. "Days from epoch" is excluded from the candidates.
x = 0.20
corr = df_train.drop("Days from epoch", axis=1).corr()[["count"]]
# Remove the target by name rather than by position: slicing off the first
# entry is only correct when "count" happens to be the first column.
cols = corr.index[corr["count"].abs() > x].drop("count", errors="ignore").values
cols

In [None]:
# Take an explicit copy: a later cell assigns columns into X_train, and doing
# that on a bare selection of df_train triggers SettingWithCopyWarning.
X_train = df_train[cols].copy()

In [None]:
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_test = pd.read_pickle(r"../input/test.pkl")
# Explicit copy: X_test is modified column-wise in a later cell, which would
# otherwise trigger SettingWithCopyWarning on a selection of df_test.
X_test = df_test[cols].copy()

y_test = df_test['count']
df_test.head()

## Data preparation

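Convert the boolean columns to integers. Which columns end up in `intcols` depends on the correlation threshold chosen above, because only features that passed the filter are present in `X_train`.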

In [None]:
# Boolean indicator columns that survived the correlation filter.
bool_candidates = ['Friday', 'Weekend', 'School holiday', 'National holiday']
intcols = [name for name in bool_candidates if name in X_train.columns]

# Cast through astype(mapping), which returns new frames. This avoids assigning
# into a selection of df_train/df_test (SettingWithCopyWarning) and keeps the
# cell safe to re-run.
int_dtypes = {name: int for name in intcols}
X_train = X_train.astype(int_dtypes)
X_test = X_test.astype(int_dtypes)
intcols

Set timesteps for LSTM.

In [None]:
# Window length: number of consecutive rows fed to the LSTM per sample.
# Set to 7 to match the periodicity of the data.
time_steps = 7

The testing data follow directly from the training data, meaning that the testing data can be seeded with the training data.

In [None]:
def seed_test(X_test, y_test, X_train, n_steps=None):
    """Build one sliding window per test row, seeded with the end of the training data.

    Window ``i`` ends on test row ``i`` and is paired with ``y_test[i]``. The
    first ``n_steps - 1`` windows borrow their leading rows from the tail of
    ``X_train``.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Test features, one row per time step.
    y_test : pandas.Series
        Targets aligned row-by-row with ``X_test``.
    X_train : pandas.DataFrame
        Training features with the same columns as ``X_test``; only its last
        ``n_steps - 1`` rows are used.
    n_steps : int, optional
        Window length. Defaults to the notebook-level ``time_steps``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Windows of shape ``(len(X_test), n_steps, n_features)`` and the
        targets of shape ``(len(X_test),)``.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1 or ``X_train`` has fewer than
        ``n_steps - 1`` rows.
    """
    if n_steps is None:
        n_steps = time_steps  # notebook-level default
    if n_steps < 1:
        raise ValueError("n_steps must be at least 1")

    n_seed = n_steps - 1
    if len(X_train) < n_seed:
        raise ValueError("X_train needs at least n_steps - 1 rows to seed the test windows")

    # Slice with an explicit start index: `values[-0:]` would return *every*
    # training row when n_seed is 0 (window length 1).
    seed_rows = X_train.values[len(X_train) - n_seed:].tolist()

    # Going through Python lists lets NumPy infer one common numeric dtype even
    # if the two frames differ (e.g. bool columns in one, int in the other).
    rows = seed_rows + X_test.values.tolist()
    windows = [rows[start:start + n_steps] for start in range(len(X_test))]
    return np.array(windows), np.array(y_test.values)

In [None]:
# Replace the test frame/series with windowed NumPy arrays.
# Not re-runnable: seed_test reads `.values` from its arguments, so this cell
# must run once, while X_test, y_test and X_train are still pandas objects.
X_test, y_test = seed_test(X_test, y_test, X_train)
X_test.shape, y_test.shape

Build sliding windows over the training data.

In [None]:
def create_dataset(X, y, time_steps=1):
    """Slice ``X`` into overlapping windows and pair each with the next target.

    Window ``k`` holds rows ``k .. k + time_steps - 1`` of ``X``; its target is
    ``y`` at position ``k + time_steps``, i.e. the row right after the window.

    NOTE(review): ``seed_test`` pairs each window with the target of the
    window's *last* row instead - confirm the one-step difference is intended.

    Returns a tuple ``(windows, targets)`` of NumPy arrays with
    ``len(X) - time_steps`` entries each.
    """
    n_windows = len(X) - time_steps
    windows = [X.iloc[start:start + time_steps].values for start in range(n_windows)]
    targets = [y.iloc[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
# Turn the training frame into overlapping windows of `time_steps` rows.
# This rebinds X_train / y_train to NumPy arrays; create_dataset uses `.iloc`
# on its inputs, so the cell cannot be re-run without rebuilding them first.
X_train, y_train = create_dataset(X_train, y_train, time_steps=time_steps)

X_train.shape, y_train.shape

## Modeling

In [None]:
# Seed Python, NumPy and the backend RNGs so that weight initialisation (and
# therefore the scores reported below) can be reproduced on a fresh kernel.
keras.utils.set_random_seed(42)

# One LSTM layer over (time_steps, n_features) windows, followed by a dense
# head that regresses a single value per window.
# NOTE(review): the Dense layers specify no activation, so they use the Keras
# default - confirm that stacking them this way is intended.
model = keras.Sequential()
model.add(keras.layers.LSTM(
    units=256,
    activation='relu',
    input_shape=(X_train.shape[1], X_train.shape[2])
))
model.add(keras.layers.Dense(units=128))
model.add(keras.layers.Dense(units=64))
model.add(keras.layers.Dense(units=1))
model.compile(
  loss='mean_squared_error',
  optimizer=keras.optimizers.Adam(0.001)
)

In [None]:
# Alternative architecture (disabled, kept for reference only): two stacked
# 128-unit LSTM layers feeding a single-unit Dense output.
# model = Sequential()
# model.add(LSTM(128, activation='relu', return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
# model.add(LSTM(128, activation='relu'))
# model.add(Dense(units=1))
# model.compile(optimizer=keras.optimizers.Adam(0.001), loss='mse')

### Training

In [None]:
# Training options collected in one place. Shuffling is disabled so the
# windows are presented in their original order.
fit_options = dict(
    epochs=600,
    batch_size=16,
    validation_split=0.1,
    verbose=0,
    shuffle=False,
)
model.fit(X_train, y_train, **fit_options)

In [None]:
# Score the test windows: root of the mean squared error.
y_pred = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
test_mse ** 0.5

In [None]:
# Overlay the true test targets and the model output on a single axes.
fig, ax = plt.subplots()
ax.plot(y_test, label='real value')
ax.plot(y_pred, label='prediction')

ax.legend()
plt.show()

### Validating LSTM
Predict the values for the dates in `validation.pkl`; these predictions are what gets submitted to the Kaggle competition.

In [None]:
# Frame with the dates to predict for the submission.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_validation = pd.read_pickle(r"../input/validation.pkl")
df_validation.head()

Seeding validation data (see test data)

In [None]:
# Cast the indicator columns first, then build windows with the same helper
# used for the test data.
df_validation[intcols] = df_validation[intcols].astype(int)
X_validate = df_validation[cols]
# The existing "Predicted" column is passed only because seed_test requires a
# target argument; the targets it returns are discarded below.
y_validate = df_validation["Predicted"]
# NOTE(review): the first windows are seeded with the tail of the *training*
# data - confirm that the validation period directly follows the training period.
X_validate, _ = seed_test(X_validate, y_validate, df_train[cols])
X_validate.shape

In [None]:
# The network ends in Dense(units=1), so predict() yields shape (n_samples, 1).
# Flatten it so a plain 1-D vector is stored in the column, instead of relying
# on pandas to coerce a 2-D array.
y_validate = model.predict(X_validate)
df_validation["Predicted"] = y_validate.ravel()

In [None]:
# Draw the validation predictions, then the real test counts on the same axes.
ax = df_validation["Predicted"].plot(figsize=(14,7))
df_test["count"].plot(ax=ax, label="Real")
ax.legend()
plt.show()

### Writing validation data to .csv file

In [None]:
# Prepare the submission id once. The guard keeps this cell re-runnable: after
# the first run "date" is already renamed to "id" and holds strings, so calling
# `.dt` on it a second time would raise.
if "date" in df_validation.columns:
    df_validation = df_validation.rename(columns={"date": "id"})
    df_validation["id"] = df_validation["id"].dt.strftime("%Y%m%d")
df_validation[["id", "Predicted"]].to_csv("../output/LSTM.csv", index=False)

### Offsetting by 100
We noticed that the model seemed to be off by 100 from most peaks. In order to test this hypothesis, we test and validate again with 100 added to the predicted values.

In [None]:
# RMSE on the test windows after shifting every prediction up by 100.
y_pred_shifted = y_pred + 100
mean_squared_error(y_test, y_pred_shifted) ** 0.5

The RMSE on the test data is about 20 points lower than before.

In [None]:
# Recompute the shifted column from the raw model output instead of using
# `+= 100`, which would add another 100 every time the cell is re-run.
df_validation["Predicted"] = np.ravel(y_validate) + 100
df_validation["Predicted"].plot(figsize=(14,7))
df_test["count"].plot(label="Real")
plt.legend()
plt.show()

This already looks significantly better.
### Writing validation data to .csv file

In [None]:
# Export the shifted predictions in the same two-column layout as before.
submission_columns = ["id", "Predicted"]
df_validation[submission_columns].to_csv("../output/LSTMOffset.csv", index=False)