# Stock Price Forecasting with LSTM

Authors: Matthew Franci, Ben Shealy

In this notebook we will demonstrate how to perform time series forecasting with an LSTM. We'll use historical stock price data for a few major tickers, which can be downloaded as CSV files from Yahoo Finance:

- [AAPL](https://finance.yahoo.com/quote/AAPL/history)
- [GDX](https://finance.yahoo.com/quote/GDX/history)
- [QQQ](https://finance.yahoo.com/quote/QQQ/history)

## Getting Started

In [None]:
# pandas imports
import pandas as pd
from pandas.plotting import register_matplotlib_converters

# numpy import
import numpy as np

# matplotlib import
import matplotlib.pyplot as plt

# sklearn imports
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score

# keras import
from tensorflow import keras

## Stock Class

We will create a `Stock` class which will provide all of the necessary functionality for any stock price dataset.

In [None]:
class Stock:
    """Price history for one ticker plus a two-branch LSTM forecasting pipeline.

    The methods are meant to be called in this order, because each step reads
    attributes written by the earlier ones:

        set_globals() -> prepare_samples() -> prepare_indicators()
        -> split_data() -> split_indicator_data() -> fit_LSTM()
        -> evaluate_forecasts()

    Every attribute that is produced by a pipeline step starts out as None.
    """

    # CONSTRUCTOR
    def __init__(self, stockName, filePath):
        """Load the price history and initialise the pipeline state.

        stockName -- label used in plot titles and in the names of saved images
        filePath  -- path (or file-like object) handed to pandas.read_csv; the
                     first row is the header and the first column holds the
                     dates, which become the index
        """
        # NOTE: the `squeeze` keyword was removed from read_csv in pandas 2.0;
        # the rest of the class needs a DataFrame anyway, so it is not used.
        self.data = pd.read_csv(filePath, header=0, parse_dates=[0], index_col=0)
        # drop every row that contains a missing value
        self.data = self.data.dropna(axis=0)

        self.name = stockName

        # Member Variables relating to the Core LSTM Branch (set by split_data())
        self.train_samplesX = None
        self.test_samplesX = None
        self.train_samplesY = None
        self.test_samplesY = None
        # scaler used to map predictions back to prices (set by prepare_samples())
        self.scaler = None

        # Normalized data for the Core LSTM Branch (set by prepare_samples())
        self.normalizedData = None
        self.normDataHistory = None
        self.nextDayVals = None
        self.nextDayValsNorm = None
        # un-normalized test targets (set by split_data())
        self.nextDayVals_test = None
        # trained keras model (set by fit_LSTM())
        self.model = None

        # Member Variables for the Tech Indicators LSTM Branch
        self.indicators = None
        self.indicatorsNorm = None
        self.indicators_train = None
        self.indicators_test = None
        self.indicatorScaler = None

        # Default Values for "Global" Parameters
        # testProp: test vs. train proportion - .2 means 20% will be used for testing, 80% for training
        self.testProp = .2
        # numLag: where the "expected" value is (not read by any method of this class)
        self.numLag = 1
        # numSeq: number of consecutive rows in each LSTM sample
        self.numSeq = 1
        # numEpochs: number of epochs on which to train the model
        self.numEpochs = 1
        # numNeurons: number of units in the LSTM layer
        self.numNeurons = 1
        # batchSize: model batch size
        self.batchSize = 32

    # FUNCTION show_head() - displays the head of the Stock's dataframe to the console
    def show_head(self):
        """Print the first rows of the loaded DataFrame."""
        print(self.data.head())

    # FUNCTION plot() - basic plot of Stock's longitudinal open price and volume
    def plot(self):
        """Plot every 30th 'Open' and 'Volume' value, save the figure as
        '<name>_plot.png' and display it."""
        register_matplotlib_converters()
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(21, 3), dpi=300)

        ax1.set_title(self.name + " Longitudinal Open Prices")
        ax1.plot(self.data['Open'][::30])
        ax1.set_xlabel('Date')
        ax1.set_ylabel('Stock Price USD [$]')

        ax2.set_title(self.name + " Longitudinal Volume")
        ax2.plot(self.data['Volume'][::30])
        ax2.set_xlabel('Date')
        ax2.set_ylabel('Stock Volume')

        # save before show(): some backends release the figure once it is shown
        fig.savefig(self.name + "_plot.png")
        plt.show()

    # FUNCTION prepare_samples() - scales Stock data to (0,1) range, creates core LSTM samples
    def prepare_samples(self):
        """Normalize the data and cut it into overlapping windows of numSeq rows.

        Sets normalizedData, normDataHistory (one window per sample),
        nextDayValsNorm / nextDayVals (first data column of the row that follows
        each window, normalized and raw) and scaler.

        Raises ValueError when the data has no more than numSeq rows, because
        no (window, next row) pair can be built.
        """
        numSamples = len(self.data) - self.numSeq
        if numSamples < 1:
            raise ValueError("prepare_samples() needs more than numSeq rows of data")

        # scale every column independently to the (0,1) range
        featureScaler = MinMaxScaler(feature_range=(0, 1))
        self.normalizedData = featureScaler.fit_transform(self.data)

        # window i covers rows i .. i+numSeq-1; its target is column 0 of row i+numSeq
        self.normDataHistory = np.array([self.normalizedData[i:i + self.numSeq].copy()
                                         for i in range(numSamples)])
        self.nextDayValsNorm = np.expand_dims(self.normalizedData[self.numSeq:, 0].copy(), -1)
        self.nextDayVals = np.expand_dims(self.data.values[self.numSeq:, 0].copy(), -1)

        # The targets above were normalized with the min/max of the WHOLE first
        # column, so the scaler used later to invert predictions must be fitted
        # on that same whole column. Fitting it only on the rows from numSeq
        # onward would give a different mapping whenever the column's minimum or
        # maximum lies in the first numSeq rows.
        yNormalizer = MinMaxScaler()
        yNormalizer.fit(self.data.values[:, [0]])
        self.scaler = yNormalizer

        assert self.normDataHistory.shape[0] == self.nextDayValsNorm.shape[0]

    # FUNCTION prepare_indicators() - creates normalized indicator data
    # precondition - prepare_samples() has been run successfully
    def prepare_indicators(self):
        """Build one indicator per sample: the mean of column index 4 over the
        sample's window (a simple moving average), then scale it with its own
        MinMaxScaler.

        Sets indicators, indicatorScaler and indicatorsNorm.
        """
        # NOTE(review): column 4 is presumably 'Adj Close' in a Yahoo Finance
        # export - confirm against the CSV layout actually used.
        self.indicators = self.normDataHistory[:, :, 4].mean(axis=1, keepdims=True)

        self.indicatorScaler = MinMaxScaler()
        self.indicatorsNorm = self.indicatorScaler.fit_transform(self.indicators)

    # FUNCTION split_data() - splits the Core LSTM Layer data into its testing and training sets
    # precondition - prepare_samples() has been run successfully
    def split_data(self):
        """Split samples and targets chronologically: the last testProp share of
        the samples becomes the test set, everything before it the training set."""
        numSamples = self.normDataHistory.shape[0]
        split = numSamples - int(numSamples * self.testProp)

        self.train_samplesX, self.test_samplesX = self.normDataHistory[:split], self.normDataHistory[split:]
        self.train_samplesY, self.test_samplesY = self.nextDayValsNorm[:split], self.nextDayValsNorm[split:]
        self.nextDayVals_test = self.nextDayVals[split:]

    # FUNCTION split_indicator_data() - splits the indicator layer data into its testing and training sets
    # precondition - prepare_indicators() has been run successfully
    def split_indicator_data(self):
        """Split the normalized indicators with the same rule as split_data()."""
        numSamples = self.indicatorsNorm.shape[0]
        split = numSamples - int(numSamples * self.testProp)

        self.indicators_train, self.indicators_test = self.indicatorsNorm[:split], self.indicatorsNorm[split:]

    # FUNCTION set_globals() - lets the user modify the test proportion, batch size, sequence size, number of neurons,
    #                           and number of training epochs
    # POTENTIAL TO-DO: add functions that modify each of these parameters individually
    def set_globals(self, testProp, batchSize, numSeq, numNeurons, numEpochs):
        """Overwrite the "global" parameters; call this before prepare_samples()
        so that numSeq is in effect when the windows are built."""
        self.testProp = testProp
        self.batchSize = batchSize
        self.numSeq = numSeq
        self.numNeurons = numNeurons
        self.numEpochs = numEpochs

    # FUNCTION fit_LSTM() - creates the full keras model, taking into account the "global" parameters
    # precondition - split_data() and split_indicator_data() have been run successfully
    def fit_LSTM(self):
        """Build, compile and train the two-input model and store it in self.model.

        Branch 1 feeds the windows through an LSTM and dropout; branch 2 feeds
        the indicators through two dense layers. Both outputs are concatenated
        and reduced to a single linear output. Training uses Adam with an MSE
        loss and holds out 10% of the training data for validation.
        """
        lstm_input = keras.layers.Input(shape=(self.numSeq, self.data.shape[1]), name='lstm_input')
        dense_input = keras.layers.Input(shape=(self.indicators.shape[1],), name='ind_input')

        # core branch: LSTM over the price windows
        x = keras.layers.LSTM(self.numNeurons, name='LSTM_0')(lstm_input)
        x = keras.layers.Dropout(0.2, name='lstm_dropout_0')(x)
        lstm_branch = keras.Model(inputs=lstm_input, outputs=x)

        # indicator branch: small dense network
        y = keras.layers.Dense(64, name='dense_0')(dense_input)
        y = keras.layers.Activation('sigmoid', name='sigmoid_0')(y)
        y = keras.layers.Dense(1, name='dense_1')(y)
        ind_branch = keras.Model(inputs=dense_input, outputs=y)

        # merge both branches and regress a single value
        complete = keras.layers.concatenate([lstm_branch.output, ind_branch.output], name='complete')
        z = keras.layers.Dense(64, activation="sigmoid", name='dense_pooling')(complete)
        z = keras.layers.Dense(1, activation='linear', name='dense_out')(z)

        self.model = keras.Model(inputs=[lstm_branch.input, ind_branch.input], outputs=z)
        self.model.compile(optimizer='adam', loss='mse')

        # keras.utils.plot_model(self.model, to_file='model.png') can be used here
        # to render a diagram of the model - not needed for every run

        self.model.fit(x=[self.train_samplesX, self.indicators_train], y=self.train_samplesY,
                       batch_size=self.batchSize, epochs=self.numEpochs, shuffle=True, validation_split=.1)

        # the built-in keras evaluation is replaced with MSE and R^2 in evaluate_forecasts()

    # FUNCTION evaluate_forecasts() - generates MSE and R^2, and plots predicted vs. actual curve
    # precondition - fit_LSTM() has been run successfully
    # POTENTIAL TO-DO: split the evaluation metrics and evaluation plots into two functions
    def evaluate_forecasts(self):
        """Predict the test set, print the error metrics and plot predicted vs.
        actual values; the plot is saved as '<name>forecastplot<numEpochs>.png'."""
        # predictions are produced in the normalized range; map them back to prices
        predictedPrices = self.model.predict([self.test_samplesX, self.indicators_test])
        predictedPrices = self.scaler.inverse_transform(predictedPrices)

        # mean squared error, expressed as a percentage of the test targets' range
        MSE = np.mean(np.square(self.nextDayVals_test - predictedPrices))
        MSE_scaled = MSE / (np.max(self.nextDayVals_test) - np.min(self.nextDayVals_test)) * 100
        print(f"MSE Score: {MSE_scaled}")

        rSquared = r2_score(self.nextDayVals_test, predictedPrices)
        print(f"R^2 Score: {rSquared}")

        fig = plt.figure(figsize=(20, 10), dpi=300)
        plt.plot(self.nextDayVals_test, label='actual')
        plt.plot(predictedPrices, label='predicted')
        plt.legend(['Actual', 'Predicted'])
        # save before show(): some backends release the figure once it is shown
        fig.savefig(f"{self.name}forecastplot{self.numEpochs}.png")
        plt.show()

## Forecast Stock Prices

Now we will apply our forecasting model to several major stocks.

In [None]:
# AAPL: 15% test split, batch size 32, 5-row windows, 50 LSTM units, 5 epochs
AAPL = Stock("AAPL", "AAPL.csv")
AAPL.set_globals(testProp=.15, batchSize=32, numSeq=5, numNeurons=50, numEpochs=5)

# inspect the raw data
AAPL.show_head()
AAPL.plot()

# build the samples and indicators, then split both into train/test sets
AAPL.prepare_samples()
AAPL.prepare_indicators()
AAPL.split_data()
AAPL.split_indicator_data()

# train the model and report how well it forecasts the test set
AAPL.fit_LSTM()
AAPL.evaluate_forecasts()

In [None]:
# GDX: 15% test split, batch size 32, 10-row windows, 50 LSTM units, 5 epochs
GDX = Stock("GDX", "GDX.csv")
GDX.set_globals(testProp=.15, batchSize=32, numSeq=10, numNeurons=50, numEpochs=5)

# inspect the raw data
GDX.show_head()
GDX.plot()

# build the samples and indicators, then split both into train/test sets
GDX.prepare_samples()
GDX.prepare_indicators()
GDX.split_data()
GDX.split_indicator_data()

# train the model and report how well it forecasts the test set
GDX.fit_LSTM()
GDX.evaluate_forecasts()

In [None]:
# QQQ: 15% test split, batch size 32, 5-row windows, 50 LSTM units, 50 epochs
QQQ = Stock("QQQ", "QQQ.csv")
QQQ.set_globals(testProp=.15, batchSize=32, numSeq=5, numNeurons=50, numEpochs=50)

# inspect the raw data
QQQ.show_head()
QQQ.plot()

# build the samples and indicators, then split both into train/test sets
QQQ.prepare_samples()
QQQ.prepare_indicators()
QQQ.split_data()
QQQ.split_indicator_data()

# train the model and report how well it forecasts the test set
QQQ.fit_LSTM()
QQQ.evaluate_forecasts()