##### Authors: Alexander Mo & Tommaso Lucarelli

# Training pipeline for LSTM stock market price prediction model
This pipeline reads data from the feature store and reshapes it according to a 'feature view' so that it matches the input format expected by the LSTM model. Once training is complete, the model is uploaded to Hugging Face, where the web application can use it for inference.

Code is written to run on Google Colab.

In [None]:
# Mount Google Drive at /gdrive so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/gdrive')

# Change into the project's feature-store folder; relative paths used by later
# cells (e.g. the CSV that is loaded for training) are resolved from here.
%cd /gdrive/MyDrive/Scalable/Project/feature_store

# Feature view processing


In [None]:
import numpy as np
import pandas as pd

def generate_sequences(data, sequence_length, prediction_length,
                       group_col=7,
                       feature_cols=(0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11),
                       target_col=3):
    """Cut a stacked table of several series into sliding (input, target) windows.

    Rows are expected to be ordered so that every series occupies one
    contiguous run of rows, identified by a constant value in ``group_col``.
    A window is emitted for every start row whose
    ``sequence_length + prediction_length`` consecutive rows all belong to the
    same run, so no window ever mixes rows of two different series and no
    valid window at the end of a run is dropped.

    Args:
        data: pandas DataFrame holding the rows; columns are addressed by
            position.
        sequence_length: number of consecutive rows in each input window.
        prediction_length: number of rows following the input window whose
            target value must be predicted.
        group_col: position of the column that identifies the series a row
            belongs to.
        feature_cols: positions of the columns used as model inputs.
        target_col: position of the column used as prediction target.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n_windows, sequence_length, len(feature_cols))`` and
        ``(n_windows, prediction_length)``. When no window fits, both arrays
        are empty but keep these trailing dimensions.

    Raises:
        ValueError: if ``sequence_length`` or ``prediction_length`` is not
            positive.
    """
    if sequence_length < 1 or prediction_length < 1:
        raise ValueError("sequence_length and prediction_length must be positive")

    window = sequence_length + prediction_length
    feature_cols = list(feature_cols)

    # Convert once up front: slicing NumPy arrays per window is much cheaper
    # than issuing a DataFrame ``.iloc`` slice for every window.
    features = data.iloc[:, feature_cols].to_numpy()
    targets = data.iloc[:, target_col].to_numpy()
    groups = data.iloc[:, group_col].to_numpy()

    # Row positions at which the group value differs from the previous row;
    # together with 0 and len(data) they delimit the contiguous runs.
    change_points = np.flatnonzero(groups[1:] != groups[:-1]) + 1
    bounds = [0, *change_points.tolist(), len(groups)]

    X, y = [], []
    for seg_start, seg_end in zip(bounds[:-1], bounds[1:]):
        # The last admissible start is the one whose window ends exactly on
        # the final row of the run (seg_end - window), hence the "+ 1".
        for start in range(seg_start, seg_end - window + 1):
            split = start + sequence_length
            X.append(features[start:split])
            y.append(targets[split:start + window])

    if not X:
        # Keep the trailing dimensions so callers can still read .shape[1:].
        return (np.empty((0, sequence_length, len(feature_cols))),
                np.empty((0, prediction_length)))
    return np.array(X), np.array(y)


In [None]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load the data
data = pd.read_csv("small_test.csv")

# Columns that are min-max scaled and later used as model inputs
feature_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Dividends', 'Stock Splits', 'Diff', 'EPS Estimate', 'Reported EPS', 'Offset']

# Unscaled range of the column at position 3 (the column generate_sequences
# uses as prediction target); kept so predictions can be mapped back later.
# The column is selected *before* reducing: this avoids taking max/min over
# every column (including non-numeric ones) and avoids integer-position
# lookup on a label-indexed Series, which is deprecated in pandas.
max_close = data.iloc[:, 3].max()
min_close = data.iloc[:, 3].min()

# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split below, so test-set statistics influence the scaling — confirm this is
# acceptable for the reported metrics.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(data[feature_columns])

# put the scaled data back into the dataframe
data[feature_columns] = scaled_data

sequence_length = 240  # Number of timesteps in each input sequence -> 10 days
prediction_length = 24  # Number of timesteps to predict ->  1 day

X, y = generate_sequences(data, sequence_length, prediction_length)

# Split the data into training and test sets
# NOTE(review): consecutive windows overlap (the start row advances by one),
# and this split is random, so overlapping windows can end up on both sides
# of the split — consider a chronological split if that matters.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Training hyper-parameters.
batch_size, epochs = 32, 200

# Per-sample shapes: everything after the leading (sample) axis of the
# training arrays. These are handed to the model builder.
input_shape, output_shape = X_train.shape[1:], y_train.shape[1:]

# Show both shapes, one per line, as a quick sanity check.
print(input_shape, output_shape, sep="\n")

# Model training


In [None]:
# Short aliases for the Keras API and its layers module, used by the model builder below.
tfk = tf.keras
tfkl = tf.keras.layers

def build_CONV_LSTM_model(input_shape, output_shape):
    """Assemble and compile the bidirectional-LSTM / Conv1D regression network.

    Args:
        input_shape: shape of one input sample (without the batch axis),
            passed to the Keras ``Input`` layer.
        output_shape: shape of one target sample; its last entry is used as
            the number of units of the final ``Dense`` layer.

    Returns:
        A compiled Keras functional model named ``'model'`` that uses mean
        squared error as loss, Adam with learning rate 1e-4 as optimizer and
        reports mean absolute error as metric.
    """
    inputs = tfkl.Input(shape=input_shape, name='Input')

    # Hidden stack, listed in the order in which the layers are applied.
    hidden_layers = [
        tfkl.Bidirectional(tfkl.LSTM(128, return_sequences=True)),
        tfkl.Conv1D(256, 3, padding='same', activation='relu'),
        tfkl.MaxPool1D(),
        tfkl.Bidirectional(tfkl.LSTM(256, return_sequences=True)),
        tfkl.Conv1D(512, 3, padding='same', activation='relu'),
        tfkl.GlobalAveragePooling1D(),
        tfkl.Dropout(.5),
    ]

    # Thread the tensor through the stack one layer at a time.
    tensor = inputs
    for layer in hidden_layers:
        tensor = layer(tensor)

    # One output unit per predicted timestep.
    outputs = tfkl.Dense(output_shape[-1], activation='relu')(tensor)

    network = tfk.Model(inputs=inputs, outputs=outputs, name='model')
    network.compile(
        loss=tfk.losses.MeanSquaredError(),
        optimizer=tfk.optimizers.Adam(learning_rate=1e-4),
        metrics=['mae'],
    )
    return network

In [None]:
model = build_CONV_LSTM_model(input_shape, output_shape)

# Callbacks, both driven by the validation loss:
#  - stop training after 10 epochs without improvement and restore the best
#    weights seen so far (guards against overfitting);
#  - halve the learning rate after 5 epochs without improvement, never going
#    below 1e-5 (fine tuning in the last epochs).
early_stopping = tfk.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=10, restore_best_weights=True
)
reduce_lr = tfk.callbacks.ReduceLROnPlateau(
    monitor='val_loss', mode='min', patience=5, factor=0.5, min_lr=1e-5
)

# Train the model, holding out 10% of the training data for validation.
training_run = model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=.1,
    callbacks=[early_stopping, reduce_lr],
)

# Keep only the dictionary of per-epoch metrics.
history = training_run.history

In [None]:
# Save the trained model in HDF5 (.h5) format. The path is relative, so the
# file is written to the current working directory (the Google Drive folder
# selected with %cd at the top of the notebook).
model.save("model_definitive.h5")

In [None]:
#save the model to huggingface

In [None]:
# Install the Hugging Face Hub client, which is needed to upload the model to Hugging Face
! pip install huggingface_hub

In [None]:
# Flush pending writes to Google Drive and unmount it; files under /gdrive are
# no longer reachable from the notebook after this call.
drive.flush_and_unmount()

In [None]:
from huggingface_hub import notebook_login, push_to_hub_keras

# Authenticate this notebook session with Hugging Face before uploading the model.
notebook_login()

In [None]:
import os

# SECURITY: a Hugging Face access token used to be hard-coded in this call.
# Secrets must never live in source; the token that was committed should be
# revoked and replaced. The token is now taken from the HF_TOKEN environment
# variable; when that is unset, None is passed and huggingface_hub falls back
# to the credentials stored by notebook_login().
push_to_hub_keras(model, "stock_market_model", token=os.environ.get("HF_TOKEN"))

# Testing the model

In [None]:
# Load a previously saved model from an HDF5 file for evaluation.
# NOTE(review): this loads "model_small.h5", while the training section saves
# "model_definitive.h5" — confirm that this is the intended checkpoint.
model = tfk.models.load_model("model_small.h5")

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Make predictions on the test data using your model
predictions = model.predict(X_test)

# Inverse transform: undo the min-max scaling of the target column so that
# predictions, ground truth and the input history are all in original units.
close_range = max_close - min_close
predictions = predictions * close_range + min_close
actuals = y_test * close_range + min_close
X_test_2 = X_test[:, :, 3] * close_range + min_close

# Metrics: compare predictions with the ground truth in the SAME units.
# (Comparing the still-scaled y_test with inverse-transformed predictions
# would mix [0, 1] values with original-unit values.)
mse = mean_squared_error(actuals, predictions)
mae = mean_absolute_error(actuals, predictions)
print("mse", mse)
print("mae", mae)

# Prepend the input history to both curves so each plot shows the known
# past followed by the predicted / actual continuation.
predictions = np.concatenate((X_test_2, predictions), axis=1)
actuals = np.concatenate((X_test_2, actuals), axis=1)

# Show the last 40 history steps plus the whole prediction horizon
# ([200, 264] for a 240-step history and a 24-step horizon).
history_len = X_test_2.shape[1]
x_limits = [max(history_len - 40, 0), predictions.shape[1]]

# One figure per test sample
for actual_curve, predicted_curve in zip(actuals, predictions):
    # Line plot of the actual values
    plt.plot(actual_curve, c='b', label='Actual')

    # Line plot of the predicted values
    plt.plot(predicted_curve, c='r', label='Predicted')

    # Set the x-axis range
    plt.xlim(x_limits)

    # Add a legend to the plot
    plt.legend(loc='lower right')

    # Show the plot
    plt.show()
