In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras as keras
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


# Define Functions

In [2]:
def read_data(file):
    """
    Load csv data into a DataFrame, using the 'Date' column as the index.

    Parameters
    ----------
    file : str, path object or file-like object
        Location of the csv data; handed directly to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Contents of the csv with 'Date' as the index.
    """
    return pd.read_csv(file, index_col='Date')

In [3]:
def clip_recent_days(df, n_days):
    """
    Optional step: remove the most recent rows from a data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data ordered oldest-to-newest; the last rows are the ones dropped.
    n_days : int
        Number of trailing rows to remove. ``0`` returns all rows.

    Returns
    -------
    pandas.DataFrame
        ``df`` without its last ``n_days`` rows (empty if ``n_days`` is at
        least the number of rows).

    Raises
    ------
    ValueError
        If ``n_days`` is negative.
    """
    if n_days < 0:
        raise ValueError("n_days must be non-negative, got {}".format(n_days))

    # An explicit row count avoids the ``df[:-0]`` pitfall: ``-0 == 0``, so the
    # negative-slice form returns an empty frame when nothing should be removed.
    n_keep = max(len(df) - n_days, 0)
    return df.iloc[:n_keep]

In [4]:
def format_predictors_and_targets(df):
    """
    Split a lagged time-series frame into predictor and target arrays.

    Rows containing any missing value are dropped first. The predictors are
    the five lag columns 'back_5' ... 'back_1' (in that order) and the target
    is the 'Adj Close' column.

    Returns
    -------
    predictors : np.ndarray
        2-D array with one column per lag feature.
    targets : np.ndarray
        2-D array with a single column.
    n_cols : int
        Number of predictor columns.
    """
    lag_columns = ['back_5', 'back_4', 'back_3', 'back_2', 'back_1']
    complete_rows = df.dropna()

    predictors = complete_rows[lag_columns].values
    targets = complete_rows[['Adj Close']].values

    # sanity checks: downstream code expects plain numpy arrays
    assert type(predictors) is np.ndarray
    assert type(targets) is np.ndarray

    return predictors, targets, predictors.shape[1]

# Prepare Data

In [5]:
# DEFINE DATA CONSTANTS
# Fraction of the samples held out as the test set
# (passed as test_size to train_test_split).
TEST_SIZE = 0.15

In [6]:
# specify location of the time series data (csv file, path relative to the working directory)
file_path = '../data/interim/time_series.csv'

In [7]:
# read time series data; read_data uses the 'Date' column as the index
time_series_df = read_data(file_path)
# bare expression so the notebook renders the frame as the cell output
time_series_df

Unnamed: 0_level_0,back_5,back_4,back_3,back_2,back_1,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,,,,,,0.014743
1962-01-03,,,,,0.014743,0.002435
1962-01-04,,,,0.014743,0.002435,-0.022141
1962-01-05,,,0.014743,0.002435,-0.022141,-0.002490
1962-01-08,,0.014743,0.002435,-0.022141,-0.002490,-0.002498
...,...,...,...,...,...,...
2020-04-03,0.014775,0.012455,-0.011656,0.073689,-0.029898,0.031629
2020-04-06,0.012455,-0.011656,0.073689,-0.029898,0.031629,0.018848
2020-04-07,-0.011656,0.073689,-0.029898,0.031629,0.018848,0.061366
2020-04-08,0.073689,-0.029898,0.031629,0.018848,0.061366,-0.016556


In [8]:
# print column dtypes and non-null counts to see how many rows have missing values
time_series_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14669 entries, 1962-01-02 to 2020-04-09
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   back_5     14664 non-null  float64
 1   back_4     14665 non-null  float64
 2   back_3     14666 non-null  float64
 3   back_2     14667 non-null  float64
 4   back_1     14668 non-null  float64
 5   Adj Close  14668 non-null  float64
dtypes: float64(6)
memory usage: 802.2+ KB


In [9]:
# call function to format predictors and targets:
# rows with missing values are dropped, the lag columns become `predictors`,
# 'Adj Close' becomes `targets`, and n_cols is the number of predictor columns
predictors, targets, n_cols = format_predictors_and_targets(time_series_df)

In [10]:
# scale data to range [0,1]
# Predictors and targets each get their own scaler object.
# NOTE(review): both scalers are fitted on the full data set here, before the
# train/test split performed later in the notebook, so the min/max of the test
# period influence the scaled training data (look-ahead leakage). Consider
# fitting on the training rows only and just transforming the test rows.
X_scaler = MinMaxScaler(feature_range=(0,1))
y_scaler = MinMaxScaler(feature_range=(0,1))

predictors = X_scaler.fit_transform(predictors)
targets = y_scaler.fit_transform(targets)

In [11]:
# test for correct scaling
# The ndarray reductions are vectorised (the builtin min/max iterate element by
# element in Python), and np.isclose tolerates floating-point rounding in the
# scaled extremes instead of requiring bit-exact 0 and 1.
assert np.isclose(predictors.min(), 0)
assert np.isclose(predictors.max(), 1)
assert np.isclose(targets.min(), 0)
assert np.isclose(targets.max(), 1)

**Important for Time Series Analysis:**

The training set must only contain stock price data for dates prior to all of the test data.

The model's evaluation will be overly optimistic if it is trained on future data and evaluated on earlier data.


RESULT: A random split will not work;
the split needs to be sequential.


[____________ALL_____SEQUENTIAL_______DATA______]


[____________TRAINING    DATA_____],[___________TEST_DATA____]

In [12]:
# split data into training set and testing set
# SHUFFLE = FALSE keeps the original row order: the leading rows become the
# training set and the trailing TEST_SIZE share becomes the test set.
# (No stratify / random_state arguments: they have no effect without shuffling.)
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    targets,
    test_size=TEST_SIZE,
    shuffle=False,
)

In [13]:
# test for sequential split
# The training arrays must be exactly the leading rows of the full arrays and
# the test arrays exactly the remaining trailing rows. Comparing whole slices
# checks the targets against `targets` (not `predictors`) and is not fooled by
# values that occur more than once in the data.
n_train = len(X_train)
assert len(y_train) == n_train
assert len(X_test) == len(y_test)
assert n_train + len(X_test) == len(predictors)
assert np.array_equal(X_train, predictors[:n_train])
assert np.array_equal(X_test, predictors[n_train:])
assert np.array_equal(y_train, targets[:n_train])
assert np.array_equal(y_test, targets[n_train:])

# Build Model

In [14]:
# (number of samples, size of the second axis) of the training predictors
X_train.shape[:2]

(12463, 5)

In [15]:

# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features),
# the 3-D layout passed to the LSTM layers below.
# The feature count is read from the LAST axis so the cell is idempotent:
# re-running it on the already reshaped arrays leaves them unchanged instead of
# raising a reshape error.
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[-1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[-1]))

X_train.shape

(12463, 1, 5)

In [16]:
# Disabled: reshaping the targets to 3-D.
# NOTE(review): the model built below ends with an LSTM using
# return_sequences=False followed by Dense(1), so 2-D targets look sufficient —
# presumably why this was switched off; delete these lines if no longer needed.
#y_train = np.reshape(y_train, (y_train.shape[0], 1, y_train.shape[1]))
#y_test = np.reshape(y_test, (y_test.shape[0], 1, y_test.shape[1]))

In [17]:
def build_sequential_LSTM(n_nodes, n_layers, add_dense, input_shape=None):
    """
    Build and compile a stacked-LSTM regression model.

    Parameters
    ----------
    n_nodes : int
        Number of units in every LSTM layer (and in the optional hidden
        Dense layer).
    n_layers : int
        Number of stacked LSTM layers; must be at least 1.
    add_dense : bool
        If true, add a fully connected ReLU layer with ``n_nodes`` units
        between the last LSTM layer and the output layer.
    input_shape : tuple, optional
        Shape of one input sample, without the batch axis. Defaults to
        ``X_train.shape[1:]`` taken from the notebook's global ``X_train``,
        which is what the function always used before this parameter existed.

    Returns
    -------
    keras.models.Sequential
        Model ending in a single-unit Dense layer, compiled with the Adam
        optimizer and mean squared error loss.

    Raises
    ------
    ValueError
        If ``n_layers`` is smaller than 1.
    """
    if n_layers < 1:
        raise ValueError("n_layers must be at least 1, got {}".format(n_layers))

    if input_shape is None:
        # backward-compatible default: infer the shape from the global training data
        input_shape = X_train.shape[1:]

    model = Sequential()

    last_idx = n_layers - 1
    for layer_idx in range(n_layers):
        # only the first layer declares the input shape
        layer_kwargs = {'input_shape': input_shape} if layer_idx == 0 else {}
        # return_sequences = False if next layer is not LSTM, i.e. on the last
        # LSTM layer; building the stack in one loop also means n_layers=1
        # yields exactly one LSTM layer rather than two
        model.add(LSTM(n_nodes,
                       return_sequences=(layer_idx != last_idx),
                       **layer_kwargs))

    if add_dense:
        # add Fully Connected Layer
        model.add(Dense(n_nodes, activation='relu'))

    # single-unit output layer
    model.add(Dense(1))

    model.compile(optimizer='adam', loss='mean_squared_error')

    return model

In [22]:
# DEFINE MODEL CONSTANTS
# N_NODES:   units per LSTM layer (also used for the optional hidden Dense layer)
# N_LAYERS:  number of LSTM layers requested from build_sequential_LSTM
# ADD_DENSE: whether to add a ReLU Dense layer before the single-unit output
# NOTE(review): the saved model.summary() output in this notebook lists six LSTM
# layers (lstm_5 .. lstm_10) and the execution counter jumps from In [17] to
# In [22], so that output appears to come from an earlier run with different
# settings — restart the kernel and run all cells to refresh it.
N_NODES = 50
N_LAYERS = 5
ADD_DENSE = True

In [23]:
# build Sequential Model from the constants defined above
model = build_sequential_LSTM(N_NODES, N_LAYERS, ADD_DENSE)
# print the layer-by-layer overview (output shapes and parameter counts)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 1, 50)             11200     
_________________________________________________________________
lstm_6 (LSTM)                (None, 1, 50)             20200     
_________________________________________________________________
lstm_7 (LSTM)                (None, 1, 50)             20200     
_________________________________________________________________
lstm_8 (LSTM)                (None, 1, 50)             20200     
_________________________________________________________________
lstm_9 (LSTM)                (None, 1, 50)             20200     
_________________________________________________________________
lstm_10 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_3 (Dense)              (None, 50)               

# Fit The Model

In [24]:
# DEFINE TRAINING CONSTANTS
# Number of epochs passed to model.fit.
# NOTE(review): the trailing "#50" looks like the intended full-run value and 2
# a quick-test setting — confirm before relying on the trained model.
EPOCHS = 2 #50

In [25]:
# Train the model on the training split for EPOCHS epochs; all other fit
# options are left at their Keras defaults.
# NOTE(review): the History object returned by fit is not assigned, so the
# per-epoch loss values are only reachable through the cell output, and no
# validation data is passed — consider `history = model.fit(...)`.
model.fit(X_train, y_train, epochs=EPOCHS)

Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x781cdeebf150>