#  Stock Market Dataset


In this kernel, we will design and train the model we will use in our Dash web app. We will develop a Long Short-Term Memory (LSTM) Neural Network and harness its capability to solve problems in time series.

In [None]:
import os
import random

import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
import matplotlib.animation as animation

from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.losses import categorical_crossentropy
import tensorflow.keras.backend as K

## Pre-Processing

Filter the companies whose data we will use: files with missing values or with too few rows are discarded.

In [None]:
# Make the stocks data directory the working directory, so that paths given
# relative to the current directory resolve inside it.
os.chdir('../input/Data/Stocks/')

In [None]:
def has_nan(df):
    """Tell whether at least one cell of ``df`` holds a missing value."""
    missing_mask = df.isnull().values
    return missing_mask.any()

In [None]:
def has_few_data(df, min_rows=2000):
    """Tell whether ``df`` has too few rows to be used.

    Args:
        df: Any object exposing a pandas-style ``shape`` whose first entry is
            the number of rows.
        min_rows: Minimum number of rows required. Defaults to 2000.

    Returns:
        ``True`` when ``df`` has strictly fewer than ``min_rows`` rows,
        ``False`` otherwise.
    """
    return df.shape[0] < min_rows

In [None]:
def use_company_data(df):
    """Decide whether a company's table is complete and long enough to use.

    The table is rejected as soon as ``has_nan`` reports a missing value;
    otherwise it is rejected when ``has_few_data`` reports too few rows.
    """
    rejected = has_nan(df) or has_few_data(df)
    return not rejected

In [None]:
# Collect the names of the files whose content passes `use_company_data`.
# The listing is sorted because os.listdir() returns entries in arbitrary
# order, which would make any later selection by index irreproducible.
files = []
for file in sorted(os.listdir()):
    try:
        df = pd.read_csv(file, sep=',')
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError, OSError):
        # Entries that cannot be read as CSV (empty files, malformed content,
        # sub-directories, ...) are skipped on purpose. Any other exception,
        # including KeyboardInterrupt, is no longer silently swallowed.
        continue
    if use_company_data(df):
        files.append(file)

## Processing

We will explore two kinds of data representations. The first one is the variation of a stock's closing price. This format is effective if we want our model to be accurate for companies with high or low stock prices alike, since it does not depend on the absolute price level. Since we have a large database of stock prices, we will also attempt to use the stock's raw closing price.

In [None]:
window_len = 10


def split_data_variations_in_windows(df):
    """Build sliding windows of relative price changes and their targets.

    Every window holds ``window_len`` consecutive values, each expressed as
    its relative change with respect to the first value of that window
    (``value / first - 1``). The target paired with a window is the value
    that immediately follows it, expressed relative to that same first value.

    The computation is positional and vectorised with NumPy, so the result
    does not depend on the index labels carried by ``df``.

    Args:
        df: Sequence of prices (the notebook passes the ``Close`` column as a
            pandas Series). It must be convertible to a float array.

    Returns:
        A ``(LSTM_inputs, LSTM_outputs)`` tuple. For one-dimensional input,
        ``LSTM_inputs`` has shape ``(n, window_len)`` and ``LSTM_outputs`` has
        shape ``(n,)`` with ``n = max(len(df) - window_len, 0)``. When the
        input is not longer than one window, both arrays are empty but
        ``LSTM_inputs`` keeps its second axis, so code reading
        ``LSTM_inputs.shape[1]`` still works.
    """
    prices = np.asarray(df, dtype=float)
    n_windows = max(len(prices) - window_len, 0)

    # Row i of `window_idx` is [i, i + 1, ..., i + window_len - 1].
    window_idx = np.arange(n_windows)[:, None] + np.arange(window_len)
    windows = prices[window_idx]

    # Normalise every window by its own first value.
    LSTM_inputs = windows / windows[:, :1] - 1

    # Value right after each window, relative to that window's first value.
    LSTM_outputs = prices[window_len:] / prices[:n_windows] - 1

    return LSTM_inputs, LSTM_outputs


def split_data_prices_in_windows(df):
    """Build sliding windows of raw rows and their targets.

    Each input is a block of ``window_len`` consecutive rows of ``df``, left
    unscaled. The target paired with a window is the ``Close`` value that
    follows it, expressed as a relative change from the window's first
    ``Close`` value (``next / first - 1``).

    NOTE(review): the target is a relative change, not a raw price, even
    though the inputs are raw values; confirm this is the intended pairing.
    """
    n_windows = len(df) - window_len
    LSTM_inputs = np.array(
        [np.array(df[start:start + window_len]) for start in range(n_windows)]
    )

    close = df['Close']
    LSTM_outputs = close[window_len:].values / close[:-window_len].values - 1

    return LSTM_inputs, LSTM_outputs

## Model Design

We will compile our model with an Adam optimizer to accelerate learning. Since this is a regression problem, we will use the Mean Squared Error (MSE) as the loss function.

In [None]:
class SGDRScheduler(Callback):
    '''Cosine annealing learning rate scheduler with periodic restarts.

    The schedule is reset at the beginning of every training run, so one
    instance can safely be passed to several successive ``fit`` calls: each
    run starts again from the ``max_lr`` and ``cycle_length`` given to the
    constructor and never restores weights saved during a previous run.

    # Usage
        ```python
            schedule = SGDRScheduler(min_lr=1e-5,
                                     max_lr=1e-2,
                                     steps_per_epoch=np.ceil(epoch_size/batch_size),
                                     lr_decay=0.9,
                                     cycle_length=5,
                                     mult_factor=1.5)
            model.fit(X_train, Y_train, epochs=100, callbacks=[schedule])
        ```

    # Arguments
        min_lr: The lower bound of the learning rate range for the experiment.
        max_lr: The upper bound of the learning rate range for the experiment.
        steps_per_epoch: Number of mini-batches in the dataset. Calculated as `np.ceil(epoch_size/batch_size)`.
        lr_decay: Reduce the max_lr after the completion of each cycle.
                  Ex. To reduce the max_lr by 20% after each cycle, set this value to 0.8.
        cycle_length: Initial number of epochs in a cycle.
        mult_factor: Scale epochs_to_restart after each full cycle completion.

    # References
        Blog post: jeremyjordan.me/nn-learning-rate
        Original paper: http://arxiv.org/abs/1608.03983
    '''

    def __init__(self,
                 min_lr,
                 max_lr,
                 steps_per_epoch,
                 lr_decay=1,
                 cycle_length=10,
                 mult_factor=2):
        super().__init__()

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.lr_decay = lr_decay

        self.batch_since_restart = 0
        self.next_restart = cycle_length

        self.steps_per_epoch = steps_per_epoch

        self.cycle_length = cycle_length
        self.mult_factor = mult_factor

        # Constructor values, restored at the start of every training run
        # because `max_lr` and `cycle_length` are modified at each restart.
        self._initial_max_lr = max_lr
        self._initial_cycle_length = cycle_length

        # Weights captured at the end of the most recent completed cycle.
        self.best_weights = None

        self.history = {}

    def clr(self):
        '''Calculate the learning rate for the current position in the cycle.'''
        fraction_to_restart = self.batch_since_restart / (self.steps_per_epoch * self.cycle_length)
        lr = self.min_lr + 0.5 * (self.max_lr - self.min_lr) * (1 + np.cos(fraction_to_restart * np.pi))
        return lr

    def on_train_begin(self, logs=None):
        '''Reset the schedule and start training at the maximum learning rate.'''
        # Epoch numbers start again from zero on every fit() call, so the
        # restart bookkeeping has to start again as well.
        self.batch_since_restart = 0
        self.cycle_length = self._initial_cycle_length
        self.next_restart = self._initial_cycle_length
        self.max_lr = self._initial_max_lr
        self.best_weights = None

        K.set_value(self.model.optimizer.lr, self.max_lr)

    def on_batch_end(self, batch, logs=None):
        '''Record previous batch statistics and update the learning rate.'''
        logs = logs or {}
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        self.batch_since_restart += 1
        K.set_value(self.model.optimizer.lr, self.clr())

    def on_epoch_end(self, epoch, logs=None):
        '''Check for end of current cycle, apply restarts when necessary.'''
        if epoch + 1 == self.next_restart:
            self.batch_since_restart = 0
            self.cycle_length = np.ceil(self.cycle_length * self.mult_factor)
            self.next_restart += self.cycle_length
            self.max_lr *= self.lr_decay
            self.best_weights = self.model.get_weights()

    def on_train_end(self, logs=None):
        '''Restore the weights saved at the end of the most recent completed cycle.

        When training stopped before the first cycle completed, no weights
        were saved and the model is left untouched.
        '''
        if self.best_weights is not None:
            self.model.set_weights(self.best_weights)

In [None]:
class LRFinder(Callback):
    '''
    A simple callback for finding the optimal learning rate range for your model + dataset.

    The learning rate is increased linearly from `min_lr` to `max_lr` over
    `steps_per_epoch * epochs` batches and then held at `max_lr`.

    # Usage
        ```python
            lr_finder = LRFinder(min_lr=1e-5,
                                 max_lr=1e-2,
                                 steps_per_epoch=np.ceil(epoch_size/batch_size),
                                 epochs=3)
            model.fit(X_train, Y_train, epochs=3, batch_size=batch_size, callbacks=[lr_finder])

            lr_finder.plot_loss()
        ```

    # Arguments
        min_lr: The lower bound of the learning rate range for the experiment.
        max_lr: The upper bound of the learning rate range for the experiment.
        steps_per_epoch: Number of mini-batches in the dataset. Calculated as `np.ceil(epoch_size/batch_size)`.
        epochs: Number of epochs to run experiment. Usually between 2 and 4 epochs is sufficient.

    # Raises
        TypeError: if `steps_per_epoch` or `epochs` is not provided.

    # References
        Blog post: jeremyjordan.me/nn-learning-rate
        Original paper: https://arxiv.org/abs/1506.01186
    '''

    def __init__(self, min_lr=1e-5, max_lr=1e-2, steps_per_epoch=None, epochs=None):
        super().__init__()

        # Both values are needed to size the sweep. Fail with an explicit
        # message instead of the opaque error raised by `None * None`.
        if steps_per_epoch is None or epochs is None:
            raise TypeError('LRFinder requires both steps_per_epoch and epochs.')

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_iterations = steps_per_epoch * epochs
        self.iteration = 0
        self.history = {}

    def clr(self):
        '''Calculate the learning rate.'''
        # Capped at 1 so the rate never exceeds max_lr when training runs for
        # more batches than the sweep was sized for.
        x = min(self.iteration / self.total_iterations, 1.0)
        return self.min_lr + (self.max_lr-self.min_lr) * x

    def on_train_begin(self, logs=None):
        '''Initialize the learning rate to the minimum value at the start of training.'''
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.min_lr)

    def on_batch_end(self, epoch, logs=None):
        '''Record previous batch statistics and update the learning rate.

        The first positional argument is named `epoch` for backward
        compatibility; this hook runs after every batch.
        '''
        logs = logs or {}
        self.iteration += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.iteration)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

    def plot_lr(self):
        '''Helper function to quickly inspect the learning rate schedule.'''
        plt.plot(self.history['iterations'], self.history['lr'])
        plt.yscale('log')
        plt.xlabel('Iteration')
        plt.ylabel('Learning rate')

    def plot_loss(self):
        '''Helper function to quickly observe the learning rate experiment results.'''
        plt.plot(self.history['lr'], self.history['loss'])
        plt.xscale('log')
        plt.xlabel('Learning rate')
        plt.ylabel('Loss')

In [None]:
def LSTM_model(input_shape, output_shape, neurons, dropout):
    """Build a two-layer stacked LSTM regressor.

    Args:
        input_shape: Shape of one sample, without the batch axis.
        output_shape: Number of units of the final linear ``Dense`` layer.
        neurons: Units of the second LSTM layer; the first one uses twice
            as many.
        dropout: Rate given to the ``Dropout`` layer placed after each LSTM
            layer.

    Returns:
        The (uncompiled) Keras ``Model``.
    """
    x = Input(shape=input_shape)
    hidden = LSTM(2 * neurons, return_sequences=True)(x)
    hidden = Dropout(dropout)(hidden)
    hidden = LSTM(neurons, return_sequences=False)(hidden)
    hidden = Dropout(dropout)(hidden)
    y = Dense(output_shape, activation='linear')(hidden)
    return Model(inputs=x, outputs=y)


# The sample shape follows `window_len` instead of repeating the literal 10.
# NOTE(review): (1, window_len) presumably means one timestep carrying the
# whole window as features; confirm this is intended rather than
# (window_len, 1).
model = LSTM_model((1, window_len), 1, 32, 0.1)

In [None]:
class LossHistory(Callback):
    """Record the training loss after every batch and the validation loss after every epoch.

    Attributes are plain lists that keep growing across successive training
    runs when the same instance is reused.
    """

    def __init__(self):
        super().__init__()
        self.loss = []      # `loss` entry of the logs, one item per batch
        self.val_loss = []  # `val_loss` entry of the logs, one item per epoch

    def on_batch_end(self, epoch, logs=None):
        """Store the `loss` value reported for the batch that just ended."""
        logs = logs or {}
        self.loss.append(logs.get('loss'))

    def on_epoch_end(self, epoch, logs=None):
        """Store the `val_loss` value reported for the epoch that just ended.

        `None` is stored when the logs carry no `val_loss` entry.
        """
        logs = logs or {}
        # Read 'val_loss', not 'loss': this list is meant to hold the
        # validation loss.
        self.val_loss.append(logs.get('val_loss'))


cb = LossHistory()

In [None]:
# Adam optimizer with an explicit starting learning rate and moment settings.
optimizer = Adam(
    lr=0.0001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-8,
)

# Mean squared error is both the training loss and the reported metric.
model.compile(
    optimizer=optimizer,
    loss='mse',
    metrics=['mse'],
)

### Fine-tuning

Let us first look at how the untrained model performs, to have a baseline upon which to improve.

In [None]:
# Randomly select a file (that was approved beforehand) and load its closing prices.
# randrange() draws an index in 0..len(files)-1. randint(1, len(files)) could
# never pick the first file and could return len(files), which is out of range.
file_nb = random.randrange(len(files))
df = pd.read_csv(files[file_nb], sep=',')["Close"]

# Process the data
LSTM_inputs, LSTM_outputs = split_data_variations_in_windows(df)
# Insert a length-1 axis so every sample has shape (1, window length).
LSTM_inputs = np.reshape(LSTM_inputs, (LSTM_inputs.shape[0], 1, LSTM_inputs.shape[1]))

# Evaluate
model.evaluate(LSTM_inputs, LSTM_outputs)

In [None]:
# find learning rate first
# `epoch_size` is the number of samples seen in one epoch, so that
# ceil(epoch_size / batch_size) is the real number of batches per epoch.
epoch_size = len(LSTM_inputs)
batch_size = 256
lr_finder_epochs = 3

lr_finder = LRFinder(min_lr=1e-4,
                     max_lr=1e-2,
                     steps_per_epoch=np.ceil(epoch_size/batch_size),
                     epochs=lr_finder_epochs)
# fit() must use the same epochs and batch size the finder was sized with;
# otherwise the sweep does not span min_lr..max_lr over the run.
model.fit(LSTM_inputs, LSTM_outputs,
          epochs=lr_finder_epochs, batch_size=batch_size,
          callbacks=[lr_finder])

lr_finder.plot_loss()

In [None]:
# Cosine-annealing schedule with warm restarts, used as a training callback.
# NOTE(review): steps_per_epoch is computed from the `epoch_size` and
# `batch_size` names defined for the learning-rate search; verify that it
# matches the number of batches per epoch of the fit() call this schedule is
# used with.
schedule = SGDRScheduler(min_lr=1e-3,
                         max_lr=1e-2,
                         steps_per_epoch=np.ceil(epoch_size/batch_size),
                         lr_decay=0.9,
                         cycle_length= 3,
                         mult_factor=1.5)

# Training

Since every company is unique in that it operates in a specific field and geographic zone, and has a different size and different clients, we need to capture as much of that variance as possible in our model. We will thus train our model using numerous companies rather than a single one.

In [None]:
# Batch size used for every fit() call of the training loop.
train_batch_size = 16

# Indices (into `files`) of the companies used for training: up to 10 distinct
# files drawn without replacement. Drawing with randint(1, len(files)) and
# skipping duplicates could train on fewer companies, never picked the first
# file, and could produce an out-of-range index.
file_dups = random.sample(range(len(files)), k=min(10, len(files)))

for file_nb in file_dups:
    df = pd.read_csv(files[file_nb], sep=',')["Close"]

    # Split the data into train-validation-testing sets (70-10-20).
    # Slices exclude their end position, so consecutive sets share the same
    # boundary value and no row is dropped between them.
    length = df.shape[0]
    train_end = int(length*0.7)
    valid_end = int(length*0.8)
    df_train = df[:train_end]
    df_valid = df[train_end:valid_end]
    df_test  = df[valid_end:]

    # Process the data
    LSTM_train_inputs, LSTM_train_outputs = split_data_variations_in_windows(df_train)
    LSTM_valid_inputs, LSTM_valid_outputs = split_data_variations_in_windows(df_valid)
    LSTM_test_inputs, LSTM_test_outputs = split_data_variations_in_windows(df_test)

    LSTM_train_inputs = np.reshape(LSTM_train_inputs, (LSTM_train_inputs.shape[0], 1, LSTM_train_inputs.shape[1]))
    LSTM_valid_inputs = np.reshape(LSTM_valid_inputs, (LSTM_valid_inputs.shape[0], 1, LSTM_valid_inputs.shape[1]))
    LSTM_test_inputs = np.reshape(LSTM_test_inputs, (LSTM_test_inputs.shape[0], 1, LSTM_test_inputs.shape[1]))

    # A fresh scheduler per company, using `schedule` only as a template for
    # the hyper-parameters: the scheduler keeps restart counters and saved
    # weights, so reusing one instance would restore weights from an earlier
    # company when a later fit() ends. steps_per_epoch is the real number of
    # batches of this company's training set.
    company_schedule = SGDRScheduler(min_lr=schedule.min_lr,
                                     max_lr=schedule.max_lr,
                                     steps_per_epoch=np.ceil(len(LSTM_train_inputs)/train_batch_size),
                                     lr_decay=schedule.lr_decay,
                                     cycle_length=schedule.cycle_length,
                                     mult_factor=schedule.mult_factor)

    # Since we want our model to be trained on multiple company stocks, we will only train them for a few epochs.
    history = model.fit(LSTM_train_inputs, LSTM_train_outputs,
                        validation_data = (LSTM_valid_inputs, LSTM_valid_outputs),
                        epochs=5, batch_size=train_batch_size, verbose=2,
                        callbacks=[cb, company_schedule])
# Save the model.
# model.save('lstm.h5')

In [None]:
# Plot the per-batch loss values collected in `cb.loss`.
fig, loss_ax = plt.subplots()
loss_ax.plot(cb.loss)
loss_ax.set_title('Model Train Loss')
loss_ax.set_xlabel('Batches')
loss_ax.set_ylabel('Loss')
loss_ax.legend(['loss'], loc='upper left')
plt.show()

In [None]:
fig = plt.figure()
# summarize history for epochs
plt.plot(cb.val_loss)
plt.title('Model Validation Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
# Only one curve is drawn, so the legend carries a single matching label
# (the former ['acc', 'loss'] labelled this curve 'acc').
plt.legend(['val_loss'], loc='upper left')
plt.show()

## Evaluation

Let us now look at how our model performs.

In [None]:
# Mean squared error between the targets and the model's predictions.
mse = mean_squared_error(LSTM_outputs, model.predict(LSTM_inputs))
# The printed label now names the metric that is actually computed.
print('The Mean Squared Error is: {}'.format(mse))

In [None]:
# Animated comparison of the predictions (blue) with the targets (green).
# Predictions are flattened to one value per sample, and the x axis is the
# sample index itself (0..n-1).
y1 = model.predict(LSTM_inputs).ravel()
y2 = LSTM_outputs
x = np.arange(len(y2))

fig, ax = plt.subplots()
prediction, = ax.plot(x, y1, color='b', label='prediction')
ground_truth, = ax.plot(x, y2, color='g', label='ground truth')
ax.legend(loc='upper left')

def update(num, x, y1, prediction, y2, ground_truth):
    """Reveal the first `num` points of both curves for animation frame `num`."""
    prediction.set_data(x[:num], y1[:num])
    ground_truth.set_data(x[:num], y2[:num])
    return prediction, ground_truth,

ani = animation.FuncAnimation(fig, update, len(x), fargs=[x, y1, prediction, y2, ground_truth],
                              interval=25, blit=True)

# Saving with this writer requires ImageMagick to be installed.
ani.save('test.gif', writer="imagemagick")
plt.show()