# LSTM Modelling 

In [56]:
import numpy as np
import pandas as pd
import random
from datetime import datetime
import matplotlib.pyplot as plt
from pprint import pprint

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM

import warnings
warnings.filterwarnings("ignore")

2023-04-26 15:36:06.479557: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-26 15:36:06.730658: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-26 15:36:06.730690: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-26 15:36:07.515607: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

## Data Refactor

In [36]:
training_data = pd.read_csv("./RegressionDatasets/training.csv").drop(columns=["Unnamed: 0"])
testing_data = pd.read_csv("./RegressionDatasets/testing.csv").drop(columns=["Unnamed: 0"])

training_data.head()

Unnamed: 0,Hour,StationEnd,Count,Day,Year,Count1,Count1week,Count2week,Count3week,Count4week
0,12,5,2,60,2011,1,5,0,6,2
1,12,33,4,60,2011,5,1,2,4,0
2,12,37,10,60,2011,4,2,1,7,1
3,12,32,5,60,2011,2,4,0,3,0
4,12,27,4,60,2011,1,7,0,0,1


In [14]:
trips = pd.read_csv("./CSVFiles/datatrips.csv").drop(columns=["Unnamed: 0"])

day = []
year = []
for trip in trips.values:
    date = datetime.strptime(trip[0], "%Y-%m-%d %H:%M:%S").timetuple()
    day.append(date.tm_yday)
    year.append(date.tm_year)

trips = trips.drop(columns=["DateEnd"])
trips["Day"] = day
trips["Year"] = year

trips.head()

Unnamed: 0,StationEnd,Count,Season,Year,Month,Hour,Holiday,Weekday,Workingday,WeatherSituation,Temp,ATemp,Humidity,Windspeed,Day
0,26,1,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,1
1,17,1,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,1
2,38,2,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,1
3,27,2,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,1
4,2,1,1,2011,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,1


In [28]:
new_data = pd.DataFrame({"Holiday": [], "Weekday": [], "Workingday": [], "WeatherSituation": [], "Temp": [], "ATemp": [], "Humidity": [], "Windspeed": []})

for data in training_data.values:
    hour = data[0]
    station = data[1]
    day = data[3]
    year = data[4]

    df = trips.loc[(trips["StationEnd"] == station) & (trips["Day"] == day) & (trips["Year"] == year) & (trips["Hour"] == hour)].values[0][6:-1]
    new_data.loc[len(new_data)] = df

training_data = training_data.join(new_data)
training_data.head()

Unnamed: 0,Hour,StationEnd,Count,Day,Year,Count1,Count1week,Count2week,Count3week,Count4week
0,12,5,2,60,2011,1,5,0,6,2
1,12,33,4,60,2011,5,1,2,4,0
2,12,37,10,60,2011,4,2,1,7,1
3,12,32,5,60,2011,2,4,0,3,0
4,12,27,4,60,2011,1,7,0,0,1


In [None]:
new_data = pd.DataFrame({"Holiday": [], "Weekday": [], "Workingday": [], "WeatherSituation": [], "Temp": [], "ATemp": [], "Humidity": [], "Windspeed": []})

for data in testing_data.values:
    hour = data[0]
    station = data[1]
    day = data[3]
    year = data[4]

    df = trips.loc[(trips["StationEnd"] == station) & (trips["Day"] == day) & (trips["Year"] == year) & (trips["Hour"] == hour)].values[0][6:-1]
    new_data.loc[len(new_data)] = df

testing_data = testing_data.join(new_data)
testing_data.head()

## Correlation Features

In [55]:
def featureSelect_dataframe(X, y, criteria, k):

    # initialize our function/method
    reg = SelectKBest(criteria, k=k).fit(X,y)
    
    # transform after creating the reg (so we can use getsupport)
    X_transformed = reg.transform(X)

    # filter down X based on kept columns
    X = X[[val for i,val in enumerate(X.columns) if reg.get_support()[i]]]

    # return that dataframe
    return X, reg.scores_

X = training_data[[col for col in training_data.columns if "Count" not in col]]
y = training_data['Count']

new_x_data = []

# F-value between label/feature for regression tasks.
New_X, scoresX = featureSelect_dataframe(X, y, f_regression, 11)
new_x_data_1 = {list(X.columns)[x]:scoresX[x] for x in range(0,len(list(X.columns)))}
pprint(sorted(new_x_data_1.items(), key=lambda x:x[1], reverse=True))

[('Temp', 12975.06306104232),
 ('ATemp', 12715.149689877073),
 ('Humidity', 10761.481927771849),
 ('Hour', 8611.457569253347),
 ('Year', 5528.5326672106385),
 ('WeatherSituation', 2292.2803907753714),
 ('Windspeed', 747.8772754672241),
 ('Workingday', 188.68129873442368),
 ('Day', 179.86179242347123),
 ('Holiday', 89.32473601511859),
 ('Weekday', 60.19156868486718),
 ('StationEnd', 7.898503319834889)]


## LSTM

### Method 1
Input: c(t), StationEnd, Temp

Output: c(t+1)

In [96]:
print("""
--------------------------------------------------------------------------------------------

METHOD 1:
Input: c(t), StationEnd, Temp
Output: c(t+1)

""")

cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp"]

training_data_met1 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met1 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met1.drop(columns=["Count"])
trainY = training_data_met1["Count"]

testX = testing_data_met1.drop(columns=["Count"])
testY = testing_data_met1["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met1, testing_data_met1]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))

Epoch 1/5


KeyboardInterrupt: 

### Method 2
Input: c(t), StationEnd, Temp, ATemp

Output: c(t+1)

In [None]:
print("""
--------------------------------------------------------------------------------------------

METHOD 2:
Input: c(t), StationEnd, Temp, ATemp
Output: c(t+1)

""")

cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp", "ATemp"]

training_data_met2 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met2 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met2.drop(columns=["Count"])
trainY = training_data_met2["Count"]

testX = testing_data_met2.drop(columns=["Count"])
testY = testing_data_met2["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met2, testing_data_met2]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))

Epoch 1/5


ValueError: in user code:

    File "/home/filipino/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/home/filipino/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/filipino/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/home/filipino/anaconda3/lib/python3.9/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/home/filipino/anaconda3/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/home/filipino/anaconda3/lib/python3.9/site-packages/keras/engine/input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_24" is incompatible with the layer: expected shape=(None, 1, 7), found shape=(1, 1, 8)


### Method 3
Input: c(t), StationEnd, Temp, ATemp, Humidity

Output: c(t+1)

In [None]:
print("""
--------------------------------------------------------------------------------------------

METHOD 3:
Input: c(t), StationEnd, Temp, ATemp, Humidity
Output: c(t+1)

""")

cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp", "ATemp", "Humidity"]

training_data_met3 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met3 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met3.drop(columns=["Count"])
trainY = training_data_met3["Count"]

testX = testing_data_met3.drop(columns=["Count"])
testY = testing_data_met3["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met3, testing_data_met3]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))

### Method 4
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour

Output: c(t+1)

In [None]:
print("""
--------------------------------------------------------------------------------------------

METHOD 4:
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour
Output: c(t+1)

""")

cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp", "ATemp", "Humidity", "Hour"]

training_data_met4 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met4 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met4.drop(columns=["Count"])
trainY = training_data_met4["Count"]

testX = testing_data_met4.drop(columns=["Count"])
testY = testing_data_met4["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met4, testing_data_met4]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))

### Method 5
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour, Year

Output: c(t+1)

In [None]:
print("""
--------------------------------------------------------------------------------------------

METHOD 5:
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour, Year
Output: c(t+1)

""")

cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp", "ATemp", "Humidity", "Hour", "Year"]

training_data_met5 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met5 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met5.drop(columns=["Count"])
trainY = training_data_met5["Count"]

testX = testing_data_met5.drop(columns=["Count"])
testY = testing_data_met5["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met5, testing_data_met5]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))

### Method 6
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour, Year, WeatherSituation

Output: c(t+1)

In [None]:
print("""
--------------------------------------------------------------------------------------------

METHOD 6:
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour, Year, WeatherSituation
Output: c(t+1)

""")


cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp", "ATemp", "Humidity", "Hour", "Year", "WeatherSituation"]

training_data_met6 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met6 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met6.drop(columns=["Count"])
trainY = training_data_met6["Count"]

testX = testing_data_met6.drop(columns=["Count"])
testY = testing_data_met6["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met6, testing_data_met6]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))

### Method 7
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour, Year, WeatherSituation, Windspeed

Output: c(t+1)

In [None]:
print("""
--------------------------------------------------------------------------------------------

METHOD 7:
Input: c(t), StationEnd, Temp, ATemp, Humidity, Hour, Year, WeatherSituation, Windspeed
Output: c(t+1)

""")


cols = ["StationEnd", "Count", "Count1", "Count1week", "Count2week", "Count3week", "Count4week", "Temp", "ATemp", "Humidity", "Hour", "Year", "WeatherSituation", "Windspeed"]

training_data_met7 = training_data[[col for col in training_data.columns if col in cols]]
testing_data_met7 = testing_data[[col for col in testing_data.columns if col in cols]]

trainX = training_data_met7.drop(columns=["Count"])
trainY = training_data_met7["Count"]

testX = testing_data_met7.drop(columns=["Count"])
testY = testing_data_met7["Count"]

trainX = np.reshape(trainX.values, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX.values, (testX.shape[0], 1, testX.shape[1]))

model_met1 = Sequential()
model_met1.add(LSTM(4, input_shape=(1,training_data_met1.shape[1]-1)))
model_met1.add(Dense(1))
model_met1.compile(loss='mean_squared_error', optimizer='adam')
model_met1.fit(trainX, trainY, epochs=5, batch_size=1, verbose=2)

scaler = MinMaxScaler(feature_range=(0, 1))
dataset = scaler.fit_transform(pd.concat([training_data_met7, testing_data_met7]))

# make predictions
trainPredict = model_met1.predict(trainX)
testPredict = model_met1.predict(testX)

# calculate root mean squared error
trainScore = np.sqrt(mean_squared_error(trainY[:], trainPredict[:]))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = np.sqrt(mean_squared_error(testY[:], testPredict[:]))
print('Test Score: %.2f RMSE' % (testScore))