In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

%matplotlib inline

In [2]:
# from numpy.random import seed
# seed(1)

# from tensorflow import random
# random.set_seed(2)

In [3]:
# load csv into dataframe
df = pd.read_csv(
    Path('../data/stocks_history.csv'),
    index_col='Unnamed: 0',
    infer_datetime_format=True,
    parse_dates=True
)
df.head()

Unnamed: 0,MSFT_open,MSFT_high,MSFT_low,MSFT_close,AMD_open,AMD_high,AMD_low,AMD_close,TSLA_open,TSLA_high,...,JNJ_low,JNJ_close,REGN_open,REGN_high,REGN_low,REGN_close,GILD_open,GILD_high,GILD_low,GILD_close
2010-06-29,24.13,24.2,23.11,23.31,7.93,7.93,7.41,7.48,19.0,25.0,...,58.68,59.24,23.67,23.95,22.86,22.98,35.4,35.61,34.74,34.97
2010-06-30,23.3,23.68,22.95,23.01,7.58,7.65,7.3,7.32,25.79,30.4192,...,58.94,59.06,23.05,23.47,22.32,22.32,34.83,35.13,34.26,34.28
2010-07-01,23.09,23.32,22.73,23.16,7.35,7.53,7.1,7.39,25.0,25.92,...,58.65,59.07,22.31,22.37,20.45,20.79,34.24,34.27,33.3,34.14
2010-07-02,23.36,23.48,23.05,23.27,7.45,7.48,7.02,7.17,23.0,23.1,...,58.85,59.08,21.06,21.88,20.75,21.61,34.38,35.16,34.18,34.87
2010-07-06,23.7,24.09,23.584,23.82,7.4,7.42,6.96,7.04,20.0,20.0,...,58.669,59.08,22.03,22.03,21.16,21.36,35.11,35.42,34.415,34.77


In [4]:
# drop all columns except closing prices
dropped_columns = [
    'MSFT_open',
    'MSFT_high',
    'MSFT_low',
    'AMD_open',
    'AMD_high',
    'AMD_low',
    'TSLA_open',
    'TSLA_high',
    'TSLA_low',
    'JNJ_open',
    'JNJ_high',
    'JNJ_low',
    'REGN_open',
    'REGN_high',
    'REGN_low',
    'GILD_open',
    'GILD_high',
    'GILD_low'
]
df.drop(columns=dropped_columns, inplace=True)
df.head()

Unnamed: 0,MSFT_close,AMD_close,TSLA_close,JNJ_close,REGN_close,GILD_close
2010-06-29,23.31,7.48,23.89,59.24,22.98,34.97
2010-06-30,23.01,7.32,23.83,59.06,22.32,34.28
2010-07-01,23.16,7.39,21.96,59.07,20.79,34.14
2010-07-02,23.27,7.17,19.2,59.08,21.61,34.87
2010-07-06,23.82,7.04,16.11,59.08,21.36,34.77


In [5]:
def window_data(df, window, feature_col_number, target_col_number):
    """
    This function accepts the column number for the features (X) and the target (y).
    It chunks the data up with a rolling window of Xt - window to predict Xt.
    It returns two numpy arrays of X and y.
    """
    X = []
    y = []
    for i in range(len(df) - window - 1):
        features = df.iloc[i : (i + window), feature_col_number]
        target = df.iloc[(i + window), target_col_number]
        X.append(features)
        y.append(target)
    return np.array(X), np.array(y).reshape(-1, 1)

In [6]:
# create dataframe to store model metrics
train_test_columns = [
    'stock',
    'window size',
    'dropout fraction',
    'epochs',
    'batch size',
    'mse',
    'rmse'
]
train_test_eval = pd.DataFrame(columns=train_test_columns)

# create dictionary to (temporarily) store model metrics during loop
dict_train_test = {
    'stock':[],
    'window size':[],
    'dropout fraction':[],
    'epochs':[],
    'batch size':[],
    'mse':[],
    'rmse':[]
}

In [7]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

from datetime import datetime

# if time permitted, we would've iterated on epochs for all parameters as well (30,50,100,150),
# as completing the 10 epochs with the below iterations took 14.5 hours
epochs = [10]
batch_sizes = [1,10,50,150,500]
window_sizes = [1,5,10,15,20,25,30,50]
stock_list = [0,1,2,3,4,5]
dropout_fractions = [0.1,0.15,0.2,0.25,0.3,0.35]


print(f'started at: {datetime.now()}')

for epoch in epochs:
    for batch_size in batch_sizes:
        for stock in stock_list:
            for window_size in window_sizes:
                for dropout_fraction in dropout_fractions:

                    feature_column = stock # iterate over this to predict each stock in dataframe
                    target_column = stock # iterate over this to predict each stock in dataframe
                    X, y = window_data(df, window_size, feature_column, target_column)

                    # Use 70% of the data for training and the remainder for testing
                    split = int(0.7 * len(X))
                    X_train = X[: split - 1]
                    X_test = X[split:]
                    y_train = y[: split - 1]
                    y_test = y[split:]

                    # Use the MinMaxScaler to scale data between 0 and 1.
                    scaler = MinMaxScaler()
                    scaler.fit(X)
                    X_train = scaler.transform(X_train)
                    X_test = scaler.transform(X_test)
                    scaler.fit(y)
                    y_train = scaler.transform(y_train)
                    y_test = scaler.transform(y_test)

                    # Reshape the features for the model
                    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
                    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

                    # Define the LSTM RNN model.
                    model = Sequential()
                    # Layer 1
                    model.add(LSTM(
                        units=window_size,
                        input_shape=(X_train.shape[1], 1))
                        )
                    model.add(Dropout(dropout_fraction))
                    # Output layer
                    model.add(Dense(1))

                    # Compile the model
                    model.compile(optimizer="adam", loss="mean_squared_error")

                    # Train the model
                    model.fit(X_train, y_train, epochs=epoch, shuffle=False, batch_size=batch_size, verbose=0)

                    # Evaluate the model
                    loss = model.evaluate(X_test, y_test, verbose=0)

                    # Make some predictions
                    predicted = model.predict(X_test)

                    # Recover the original prices instead of the scaled version
                    predicted_prices = scaler.inverse_transform(predicted)
                    real_prices = scaler.inverse_transform(y_test.reshape(-1, 1))

                    # Create a DataFrame of Real and Predicted values
                    stocks = pd.DataFrame({
                        "Real": real_prices.ravel(),
                        "Predicted": predicted_prices.ravel()
                    })

                    # append model performance to train_test_eval dataframe
                    dict_train_test['stock'].append(df.columns[stock])
                    dict_train_test['window size'].append(window_size)
                    dict_train_test['dropout fraction'].append(dropout_fraction)
                    dict_train_test['epochs'].append(epoch)
                    dict_train_test['batch size'].append(batch_size)
                    dict_train_test['mse'].append(mean_squared_error(stocks.iloc[:,0],stocks.iloc[:,1], squared=True))
                    dict_train_test['rmse'].append(mean_squared_error(stocks.iloc[:,0],stocks.iloc[:,1], squared=False))

                    #print metrics (to track progress while iterating)
            #             print(f'stock: {df.columns[stock]}')
            #             print(f'window size: {window_size}')
            #             print(f'dropout fraction: {dropout_fraction}')
            #             print(f'mse: {mean_squared_error(stocks.iloc[:,0],stocks.iloc[:,1], squared=True):.3f}')
            #             print(f'rmse: {mean_squared_error(stocks.iloc[:,0],stocks.iloc[:,1], squared=False):.3f}')
            #             print(datetime.now())

print(f'ended at: {datetime.now()}')

started at: 2020-07-10 02:22:28.504309
ended at: 2020-07-10 16:59:45.188725


KeyboardInterrupt: 

In [8]:
# convert dictionary to dataframe
dict_df = pd.DataFrame(dict_train_test)

# concat dict_df with train_test_eval
train_test_eval = pd.DataFrame(dict_train_test)
train_test_eval.head()

Unnamed: 0,stock,window size,dropout fraction,epochs,batch size,mse,rmse
0,MSFT_close,1,0.1,10,1,2921.012523,54.046392
1,MSFT_close,1,0.15,10,1,3166.597026,56.272525
2,MSFT_close,1,0.2,10,1,2925.410086,54.08706
3,MSFT_close,1,0.25,10,1,3335.027713,57.749699
4,MSFT_close,1,0.3,10,1,3593.514474,59.94593


In [42]:
eval_summary = train_test_eval.groupby('stock').describe().transpose()
eval_summary

Unnamed: 0,stock,AMD_close,GILD_close,JNJ_close,MSFT_close,REGN_close,TSLA_close
window size,count,240.0,240.0,240.0,248.0,240.0,240.0
window size,mean,19.5,19.5,19.5,18.935484,19.5,19.5
window size,std,14.7532,14.7532,14.7532,14.842625,14.7532,14.7532
window size,min,1.0,1.0,1.0,1.0,1.0,1.0
window size,25%,8.75,8.75,8.75,5.0,8.75,8.75
window size,50%,17.5,17.5,17.5,15.0,17.5,17.5
window size,75%,26.25,26.25,26.25,25.0,26.25,26.25
window size,max,50.0,50.0,50.0,50.0,50.0,50.0
dropout fraction,count,240.0,240.0,240.0,248.0,240.0,240.0
dropout fraction,mean,0.225,0.225,0.225,0.224194,0.225,0.225


In [51]:
dict_best_performers = {
    'stock':[],
    'window size':[],
    'dropout fraction':[],
    'epochs':[],
    'batch size':[],
    'mse':[],
    'rmse':[]
}

for i in [1.692983,1.921647,2.550220,2.897495,14.395417,31.582278]:
#     train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i]]

    dict_best_performers['stock'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],0].values[0])
    dict_best_performers['window size'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],1].values[0])
    dict_best_performers['dropout fraction'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],2].values[0])
    dict_best_performers['epochs'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],3].values[0])
    dict_best_performers['batch size'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],4].values[0])
    dict_best_performers['mse'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],5].values[0])
    dict_best_performers['rmse'].append(train_test_eval.iloc[train_test_eval.index[train_test_eval['rmse'].round(6)==i],6].values[0])

df_best_performers = pd.DataFrame(dict_best_performers)
df_best_performers

Unnamed: 0,stock,window size,dropout fraction,epochs,batch size,mse,rmse
0,AMD_close,20,0.1,10,50,2.866193,1.692983
1,GILD_close,1,0.2,10,10,3.692729,1.921647
2,JNJ_close,25,0.1,10,10,6.503621,2.55022
3,MSFT_close,30,0.2,10,10,8.395475,2.897495
4,REGN_close,50,0.35,10,50,207.228028,14.395417
5,TSLA_close,30,0.25,10,10,997.440286,31.582278


In [84]:
epoch_iters = {
    'stock':[],
    'window size':[],
    'dropout fraction':[],
    'epochs':[],
    'batch size':[],
    'mse':[],
    'rmse':[]
}

epochs = [30,50,100,150]

print(f'started at: {datetime.now()}')

for epoch in epochs:
    for index, row in df_best_performers.iterrows():

        feature_column = df.columns.get_loc(row['stock'])
        target_column = df.columns.get_loc(row['stock'])
        X, y = window_data(df, row['window size'], feature_column, target_column)

        # Use 70% of the data for training and the remainder for testing
        split = int(0.7 * len(X))
        X_train = X[: split - 1]
        X_test = X[split:]
        y_train = y[: split - 1]
        y_test = y[split:]

        # Use the MinMaxScaler to scale data between 0 and 1.
        scaler = MinMaxScaler()
        scaler.fit(X)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
        scaler.fit(y)
        y_train = scaler.transform(y_train)
        y_test = scaler.transform(y_test)

        # Reshape the features for the model
        X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

        # Define the LSTM RNN model.
        model = Sequential()
        # Layer 1
        model.add(LSTM(
            units=row['window size'],
            input_shape=(X_train.shape[1], 1))
            )
        model.add(Dropout(dropout_fraction))
        # Output layer
        model.add(Dense(1))

        # Compile the model
        model.compile(optimizer="adam", loss="mean_squared_error")

        # Train the model
        model.fit(X_train, y_train, epochs=epoch, shuffle=False, batch_size=row['batch size'], verbose=0)

        # Evaluate the model
        loss = model.evaluate(X_test, y_test, verbose=0)

        # Make some predictions
        predicted = model.predict(X_test)

        # Recover the original prices instead of the scaled version
        predicted_prices = scaler.inverse_transform(predicted)
        real_prices = scaler.inverse_transform(y_test.reshape(-1, 1))

        # Create a DataFrame of Real and Predicted values
        stocks = pd.DataFrame({
            "Real": real_prices.ravel(),
            "Predicted": predicted_prices.ravel()
        })

        # append model performance to train_test_eval dataframe
        epoch_iters['stock'].append(row['stock'])
        epoch_iters['window size'].append(row['window size'])
        epoch_iters['dropout fraction'].append(row['dropout fraction'])
        epoch_iters['epochs'].append(epoch)
        epoch_iters['batch size'].append(row['batch size'])
        epoch_iters['mse'].append(mean_squared_error(stocks.iloc[:,0],stocks.iloc[:,1], squared=True))
        epoch_iters['rmse'].append(mean_squared_error(stocks.iloc[:,0],stocks.iloc[:,1], squared=False))

print(f'ended at: {datetime.now()}')

df_epoch_iters = pd.DataFrame(epoch_iters)

started at: 2020-07-10 20:20:52.318643
ended at: 2020-07-10 21:14:08.009787


In [85]:
df_epoch_iters.head()

Unnamed: 0,stock,window size,dropout fraction,epochs,batch size,mse,rmse
0,AMD_close,20,0.1,30,50,282.591465,16.810457
1,GILD_close,1,0.2,30,10,3.39865,1.843543
2,JNJ_close,25,0.1,30,10,6.186289,2.487225
3,MSFT_close,30,0.2,30,10,43.679797,6.609069
4,REGN_close,50,0.35,30,50,320.169562,17.893283


In [86]:
df_epoch_iters

Unnamed: 0,stock,window size,dropout fraction,epochs,batch size,mse,rmse
0,AMD_close,20,0.1,30,50,282.591465,16.810457
1,GILD_close,1,0.2,30,10,3.39865,1.843543
2,JNJ_close,25,0.1,30,10,6.186289,2.487225
3,MSFT_close,30,0.2,30,10,43.679797,6.609069
4,REGN_close,50,0.35,30,50,320.169562,17.893283
5,TSLA_close,30,0.25,30,10,1062.923856,32.602513
6,AMD_close,20,0.1,50,50,72.847405,8.535069
7,GILD_close,1,0.2,50,10,4.310303,2.076127
8,JNJ_close,25,0.1,50,10,5.555439,2.356998
9,MSFT_close,30,0.2,50,10,7.613719,2.759297


NameError: name 'df_epoch_iters' is not defined