LSTM for Time Series Forecasting of Pond Water Conditions

In [None]:
# Import libraries
# !pip3 install keras
import pandas as pd
import keras
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# Setting seed for reproducibility
# Seed NumPy's legacy global RNG so NumPy-based randomness repeats across runs.
np.random.seed(1234)
# Assigning a Python variable called PYTHONHASHSEED has no effect on hashing;
# the value has to live in the process environment. The variable is kept so any
# code that reads it still works.
# NOTE(review): an environment variable set here is inherited by child processes
# only; to affect this kernel's own hash randomisation it must be set before the
# interpreter starts. The deep-learning backend's RNG is not seeded by this cell.
PYTHONHASHSEED = 0
os.environ['PYTHONHASHSEED'] = str(PYTHONHASHSEED)

from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.model_selection import train_test_split
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, LSTM
from keras.layers.core import Activation
from keras.utils import pad_sequences

In [12]:
# Load the cleaned Pond 2 readings; the first CSV column is used as the row index
df = pd.read_csv('cleaned_Pond2.csv', sep=',', index_col=0)

In [13]:
# Parse the 'Datetime' column into real timestamps and promote it to the index,
# which is what the time-based resampling further down operates on.
df = df.assign(Datetime=pd.to_datetime(df['Datetime'])).set_index('Datetime')

In [14]:
# Resample to a regular 1-minute grid, averaging all readings inside each minute.
# Only numeric columns are averaged: the frame also carries 'Date' and 'Time'
# columns (presumably strings, since they come straight from the CSV), and
# pandas >= 2.0 raises TypeError on them instead of silently dropping them.
df_resampled = df.select_dtypes(include='number').resample('1min').mean()

# Minutes with no reading at all come out as NaN; report them per column.
# Note: the "Percentage" line is a fraction in [0, 1], not a value out of 100.
missing_per_column = df_resampled.isnull().sum()
print("Before Fill")
print("Missing Values Count: ", missing_per_column)
print("Missing Values Percentage: ", missing_per_column/len(df_resampled))


Before Fill
Missing Values Count:  Temperature         111905
Turbidity           111905
Dissolved Oxygen    111905
PH                  111905
Ammonia             111905
Nitrate             111905
Population          111905
Fish_Length         111905
Fish_Weight         111905
Weight_diff         111905
dtype: int64
Missing Values Percentage:  Temperature         0.579939
Turbidity           0.579939
Dissolved Oxygen    0.579939
PH                  0.579939
Ammonia             0.579939
Nitrate             0.579939
Population          0.579939
Fish_Length         0.579939
Fish_Weight         0.579939
Weight_diff         0.579939
dtype: float64


In [15]:
# Fill missing values with the previous value (forward fill), so every empty
# 1-minute slot repeats the last observed reading.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated in recent pandas releases.
df_resampled = df_resampled.ffill()

# Re-run the missing-value report to confirm the gaps are gone
missing_per_column = df_resampled.isnull().sum()
print("After Fill")
print("Missing Values Count: ", missing_per_column)
print("Missing Values Percentage: ", missing_per_column/len(df_resampled))

# Save the gap-free 1-minute series to CSV
df_resampled.to_csv('resampled_Pond2.csv')

After Fill
Missing Values Count:  Temperature         0
Turbidity           0
Dissolved Oxygen    0
PH                  0
Ammonia             0
Nitrate             0
Population          0
Fish_Length         0
Fish_Weight         0
Weight_diff         0
dtype: int64
Missing Values Percentage:  Temperature         0.0
Turbidity           0.0
Dissolved Oxygen    0.0
PH                  0.0
Ammonia             0.0
Nitrate             0.0
Population          0.0
Fish_Length         0.0
Fish_Weight         0.0
Weight_diff         0.0
dtype: float64


In [16]:
# Preview the first five rows of the raw (pre-resampling) frame
df.head(n=5)

Unnamed: 0_level_0,Temperature,Turbidity,Dissolved Oxygen,PH,Ammonia,Nitrate,Population,Fish_Length,Fish_Weight,Date,Time,Weight_diff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-06-19 00:00:05,24.875,100,4.505,8.43365,0.38,193,50,6.96,3.36,2021-06-19,00:00:05,0.7
2021-06-19 00:01:02,24.9375,100,6.601,8.43818,0.38,194,50,6.96,3.36,2021-06-19,00:01:02,0.0
2021-06-19 00:01:22,24.875,100,15.797,8.42457,0.38,192,50,6.96,3.36,2021-06-19,00:01:22,0.0
2021-06-19 00:01:44,24.9375,100,5.046,8.43365,0.38,193,50,6.96,3.36,2021-06-19,00:01:44,0.0
2021-06-19 00:02:07,24.9375,100,38.407,8.40641,0.38,192,50,6.96,3.36,2021-06-19,00:02:07,0.0


In [17]:
# Preview the first five rows of the 1-minute resampled frame
df_resampled.head(n=5)

Unnamed: 0_level_0,Temperature,Turbidity,Dissolved Oxygen,PH,Ammonia,Nitrate,Population,Fish_Length,Fish_Weight,Weight_diff
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2021-06-19 00:00:00,24.875,100.0,4.505,8.43365,0.38,193.0,50.0,6.96,3.36,0.7
2021-06-19 00:01:00,24.916667,100.0,9.148,8.432133,0.38,193.0,50.0,6.96,3.36,0.0
2021-06-19 00:02:00,24.916667,100.0,15.033333,8.42154,0.38,193.0,50.0,6.96,3.36,0.0
2021-06-19 00:03:00,24.916667,100.0,8.311,8.430623,0.38,192.333333,50.0,6.96,3.36,0.0
2021-06-19 00:04:00,24.875,100.0,6.964,8.48358,0.38,191.0,50.0,6.96,3.36,0.0


In [18]:
# Show the earliest and latest timestamps in the resampled index, one per line
print(df_resampled.index.min(), df_resampled.index.max(), sep='\n')

2021-06-19 00:00:00
2021-10-30 23:59:00


In [19]:
# Chronological 80/20 split: the first 80% of rows are used for training and the
# remaining rows for testing (no shuffling, so the test rows come last in time).
split_point = round(0.8 * len(df_resampled))
train_df, test_df = df_resampled.iloc[:split_point], df_resampled.iloc[split_point:]

In [20]:
# Reset Index for train and test sets
# Move 'Datetime' from the index into a regular column so both frames are
# indexed 0..n-1, which the positional windowing code below relies on.
# Both frames are rebuilt from df_resampled rather than mutated in place, so
# this cell can be re-run safely: an in-place reset_index would insert an extra
# 'index' column on every additional run.
train_df = df_resampled.iloc[:split_point].reset_index()
test_df = df_resampled.iloc[split_point:].reset_index()

In [21]:
#  Create Sequence Data for LSTM
seq_length = 20 # switch to longer sequence length?
ph = 1 # switch to 5 min prediction horizon?
feat_cols = ["Temperature"]

# Pull the needed columns out of the DataFrame once, already as float32 (the
# dtype Keras/TensorFlow expects), instead of slicing the frame on every step.
train_features = train_df[feat_cols].to_numpy(dtype=np.float32)
train_targets = train_df['Temperature'].to_numpy(dtype=np.float32)

# One sample per start position `start`:
#   input  = rows start .. start+seq_length-1
#   label  = 'Temperature' at row start+seq_length+ph
# The last usable start is the one whose label is the final training row.
n_samples = max(len(train_df) - seq_length - ph, 0)

if n_samples > 0:
    # sliding_window_view yields (n_windows, n_features, seq_length); swap the
    # last two axes to get the (samples, time steps, features) layout.
    windows = np.lib.stride_tricks.sliding_window_view(train_features, seq_length, axis=0)
    seq_arrays = np.ascontiguousarray(windows[:n_samples].transpose(0, 2, 1))
else:
    # Not enough rows for a single window: keep a well-formed empty tensor
    seq_arrays = np.empty((0, seq_length, len(feat_cols)), dtype=np.float32)

first_label = seq_length + ph
seq_labels = train_targets[first_label:first_label + n_samples].copy()

    

: 

: 

In [None]:
# Sanity-check the training tensors: one window per valid start position, each
# seq_length steps long with one value per feature, and one label per window.
_expected_windows = len(train_df) - seq_length - ph
assert seq_arrays.shape == (_expected_windows, seq_length, len(feat_cols))
assert seq_labels.shape == (_expected_windows,)

In [None]:
# Display the (samples, time steps, features) shape of the training windows
np.shape(seq_arrays)

In [None]:
#  check for null values
# Only the last expression of a cell is displayed, so the overall yes/no answer
# is printed explicitly instead of being left as a bare expression.
nan_mask = np.isnan(seq_arrays)
print("Any NaN in seq_arrays:", nan_mask.any())

# Where the NaNs are: one index array per axis of seq_arrays
# (sample, time step, feature); empty arrays mean there are none.
np.where(nan_mask)

In [None]:
#  Define path to save model
model_path = 'LSTM_model.h5'

# Build the Network
nb_features = len(feat_cols)   # input values per time step
nb_out = 1                     # a single regression output

# Define the model: two stacked LSTM layers, each followed by dropout
model = Sequential()

# First LSTM layer returns the full sequence so the second LSTM can consume it
model.add(LSTM(
          input_shape=(seq_length, nb_features),
          units=5,
          return_sequences=True))
model.add(Dropout(0.2))

# Second LSTM layer returns only its final state, feeding the dense output
model.add(LSTM(
          units=3,
          return_sequences=False))
model.add(Dropout(0.2))

# Output layer: linear activation for a regression target
model.add(Dense(units=nb_out))
model.add(Activation('linear'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=0.01)

# Compile the model (MSE is both the loss and the reported metric)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mse'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

# Stop once validation loss has not improved for 10 epochs, and keep the
# best-scoring model (by validation loss) on disk at model_path.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='min')
checkpoint = keras.callbacks.ModelCheckpoint(
    model_path, monitor='val_loss', save_best_only=True, mode='min', verbose=0)

# Train the model; 5% of the training windows are held out for validation
history = model.fit(
    seq_arrays,
    seq_labels,
    epochs=100,
    batch_size=500,
    validation_split=0.05,
    verbose=2,
    callbacks=[early_stopping, checkpoint])

# List all data in history
print(history.history.keys())


In [None]:
def _plot_history_curve(train_values, val_values, metric_name, out_path):
    """Plot a per-epoch training curve against its validation curve and save it.

    Parameters
    ----------
    train_values, val_values : sequence of float
        Metric value per epoch on the training and validation data.
    metric_name : str
        Used for the y-axis label and, prefixed with 'model ', for the title.
    out_path : str
        File the figure is written to.

    Returns
    -------
    matplotlib.figure.Figure
        The figure that was drawn and saved.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.plot(train_values)
    ax.plot(val_values)
    ax.set_title('model ' + metric_name)
    ax.set_ylabel(metric_name)
    ax.set_xlabel('epoch')
    # The second curve comes from fit()'s validation_split hold-out, not the test set
    ax.legend(['train', 'validation'], loc='upper left')
    plt.show()
    fig.savefig(out_path)
    return fig


# summarize history for RMSE
# Keras recorded MSE; take the square root so the curve really is the RMSE
# that the title, axis label and file name promise.
fig_acc = _plot_history_curve(
    np.sqrt(history.history['mse']),
    np.sqrt(history.history['val_mse']),
    'RMSE',
    "LSTM_rmse1.png")

# summarize history for Loss
fig_acc = _plot_history_curve(
    history.history['loss'],
    history.history['val_loss'],
    'loss',
    "LSTM_loss1.png")

In [None]:
#  Model Validation

val_arrays = []
val_labs = []

# Extract the columns once instead of slicing the DataFrame on every iteration
test_features = test_df[feat_cols].to_numpy()
test_targets = test_df['Temperature'].to_numpy()

# Create a list of sequences and labels. `end` starts at seq_length+ph, so every
# window already has the full seq_length rows; the former "short sequence"
# branch (end < seq_length) could never run and has been removed.
#   input = rows end-seq_length .. end-1
#   label = 'Temperature' at row end+ph
for end in range(seq_length + ph, len(test_df) - ph):
    val_arrays.append(test_features[end - seq_length:end])
    val_labs.append(test_targets[end + ph])

# pad_sequences stacks the windows into one (samples, seq_length, features)
# float32 array; with full-length windows it performs no actual padding.
val_arrays = pad_sequences(val_arrays, maxlen=seq_length, dtype=np.float32)

# Convert labels to a float32 numpy array to appease keras/tensorflow
val_labs = np.array(val_labs, dtype=np.float32)

In [None]:
# Score the model on the held-out windows. The model was compiled with an MSE
# loss plus an 'mse' metric, so entry 1 of the result is the MSE metric.
scores_test = model.evaluate(val_arrays, val_labs, verbose=2)
print('\nMSE: {}'.format(scores_test[1]))

# Predictions and ground truth for the same windows
y_pred_test = model.predict(val_arrays)
y_true_test = val_labs

# Persist the raw predictions (no index column)
test_set = pd.DataFrame(y_pred_test)
test_set.to_csv('test_set.csv', index = None)

# Plot the predicted data vs. the actual data over the full set of test windows
fig_verify, ax_verify = plt.subplots(figsize=(10, 5))
ax_verify.plot(y_pred_test, label = 'Predicted Value')
ax_verify.plot(y_true_test, label = 'Actual Value')
ax_verify.set_title('Temperature', fontsize=22, fontweight='bold')
ax_verify.set_ylabel('value')
ax_verify.set_xlabel('row')
ax_verify.legend()
plt.show()
fig_verify.savefig("model_regression_verify.png")
