#  Stock Market Dataset

ref: https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs

This is the data we will use to train the models that will be used by our Dash web app.

In [17]:
import os
import random

import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
import matplotlib.animation as animation

from sklearn.metrics import mean_absolute_error

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.losses import categorical_crossentropy

## Pre-Processing

Filter the company's data to use.

In [2]:
os.chdir('../input/Data/Stocks/')

In [3]:
# This function will be used to determine if any value is missing
def has_nan(df):
    return df.isnull().values.any()

In [4]:
# This function will be used to determine if the data for a given company is too sparse
def has_few_data(df):
    if(df.shape[0] < 2000):
        return True
    
    return False

In [5]:
# This function will be used to determine if there is sufficient and complete data about the company stocks.
def use_company_data(df):
    if(has_nan(df)):
        return False
    elif(has_few_data(df)):
        return False
    else:
        return True

In [6]:
files = []
for file in os.listdir():
    try:
        df = pd.read_csv(file, sep=',')
        if(use_company_data( df )):
            files.append(file)
    except:
        continue

Load the data.

In [7]:
df = pd.read_csv(files[0], sep=',')["Close"]

Split the data into train-validation-testing sets (70-10-20).

In [8]:
length = df.shape[0]

In [9]:
df_train = df[:int(length*0.7)]
df_valid = df[int(length*0.7)+1:int(length*0.8)]
df_test  = df[int(length*0.8)+1:]

In [10]:
window_len = 10

def split_data_variations_in_windows(df):
    """
    Create series of 10 Closing prices variation and its coresponding 11th price variation.
    """
    LSTM_inputs = []
    for i in range(len(df) - window_len):
        tmp_df = df[i:(i+window_len)].copy()
        tmp_df = tmp_df/tmp_df.iloc[0] - 1

        LSTM_inputs.append(tmp_df)
        
    LSTM_outputs = (df[window_len:].values/df[:-window_len].values)-1
    LSTM_inputs = [np.array(LSTM_input) for LSTM_input in LSTM_inputs]
    LSTM_inputs = np.array(LSTM_inputs)

    return LSTM_inputs, LSTM_outputs


def split_data_prices_in_windows(df):
    """
    Create series of 10 Closing prices and its coresponding 11th price.
    """
    LSTM_inputs = []
    for i in range(len(df) - window_len):
        LSTM_inputs.append(df[i:(i+window_len)])
        
    LSTM_outputs = (df['Close'][window_len:].values/df['Close'][:-window_len].values)-1
    LSTM_inputs = [np.array(LSTM_input) for LSTM_input in LSTM_inputs]
    LSTM_inputs = np.array(LSTM_inputs)
    
    return LSTM_inputs, LSTM_outputs

In [11]:
LSTM_train_inputs, LSTM_train_outputs = split_data_variations_in_windows(df_train)
LSTM_valid_inputs, LSTM_valid_outputs = split_data_variations_in_windows(df_valid)
LSTM_test_inputs, LSTM_test_outputs = split_data_variations_in_windows(df_test)

In [12]:
LSTM_train_inputs = np.reshape(LSTM_train_inputs, (LSTM_train_inputs.shape[0], 1, LSTM_train_inputs.shape[1]))
LSTM_valid_inputs = np.reshape(LSTM_valid_inputs, (LSTM_valid_inputs.shape[0], 1, LSTM_valid_inputs.shape[1]))
LSTM_test_inputs = np.reshape(LSTM_test_inputs, (LSTM_test_inputs.shape[0], 1, LSTM_test_inputs.shape[1]))

In [13]:
x = Input(shape=(1, 10))
hidden = LSTM(32, return_sequences=True)(x)
hidden = Dropout(0.1)(hidden)
hidden = LSTM(16, return_sequences=False)(hidden)
hidden = Dropout(0.1)(hidden)
y = Dense(1, activation='linear')(hidden)

model = Model(inputs=x, outputs=y)

In [14]:
optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-8)

learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', patience=2, verbose=1, factor=0.4, min_lr=0.000001)
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=1, mode='auto')

model.compile(loss='mae', optimizer=optimizer, metrics=['mae'])

In [19]:
model.evaluate(LSTM_valid_inputs, LSTM_valid_outputs)



[0.05072909870855389, 0.05072909870855389]

In [15]:
history = model.fit(LSTM_train_inputs, LSTM_train_outputs, 
                    validation_data = (LSTM_valid_inputs, LSTM_valid_outputs),
                    epochs=5, batch_size=1, verbose=2, shuffle=True)

Train on 2214 samples, validate on 307 samples
Epoch 1/5
 - 10s - loss: 0.0252 - mean_absolute_error: 0.0252 - val_loss: 0.0514 - val_mean_absolute_error: 0.0514
Epoch 2/5
 - 7s - loss: 0.0212 - mean_absolute_error: 0.0212 - val_loss: 0.0469 - val_mean_absolute_error: 0.0469
Epoch 3/5
 - 7s - loss: 0.0214 - mean_absolute_error: 0.0214 - val_loss: 0.0513 - val_mean_absolute_error: 0.0513
Epoch 4/5
 - 7s - loss: 0.0210 - mean_absolute_error: 0.0210 - val_loss: 0.0468 - val_mean_absolute_error: 0.0468
Epoch 5/5
 - 7s - loss: 0.0204 - mean_absolute_error: 0.0204 - val_loss: 0.0507 - val_mean_absolute_error: 0.0507


In [None]:
x = np.linspace(0, len(LSTM_test_outputs), len(LSTM_test_outputs))

fig, ax = plt.subplots()
prediction, = ax.plot(x, model.predict(LSTM_test_inputs), color='m')
ground_truth, = ax.plot(x, LSTM_test_outputs, color='k')

def update(num, x, y1, line1, y2, line2):
    line1.set_data(x[:num], y1[:num])
    line2.set_data(x[:num], y2[:num])
    return line1, line2,

ani = animation.FuncAnimation(fig, update, len(x), fargs=[x, y1, line1, y2, line2],
                              interval=25, blit=True)

ani.save('test.gif',writer="imagemagick")
plt.show()

In [None]:
mae = mean_absolute_error(LSTM_test_outputs, model.predict(LSTM_test_inputs))
print('The Mean Absolute Error is: {}'.format(MAE))