This notebook explores a machine learning algorithm to predict the stock prices of SPY, the S&P 500 ETF, and is intended to utilize functions that can be easily translated to a python executable.

In [1]:
# installing dependencies
!pip install numpy
!pip install pandas
!pip install torch
!pip install matplotlib
!pip install alpha_vantage
!pip install scikit-learn
!pip install pandas_market_calendars





In [90]:
# import libararies
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from alpha_vantage.timeseries import TimeSeries 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
import pandas_market_calendars as mcal

In [91]:
# config file (placing here for now, some fields will change on later impplementations)

config = {
    "alpha_vantage": {
        "key": "2JMCN347HZ3BU9RC", 
        "symbol": "SPY",
        "outputsize": "full",
        "key_adjusted_close": "5. adjusted close",
    },
    "data": {
        "window_size": 20,
        "train_split_size": 0.80,
    }, 
    "plots": {
        "xticks_interval": 90, # show a date every 90 days
        "color_actual": "#001f3f",
        "color_train": "#3D9970",
        "color_val": "#0074D9",
        "color_pred_train": "#3D9970",
        "color_pred_val": "#0074D9",
        "color_pred_test": "#FF4136",
    },
    "model": {
        "input_size": 1, # since for now we are only using close price
        "num_lstm_layers": 2,
        "lstm_size": 32,
        "dropout": 0.2,
    },
    "training": {
        "device": "cpu",
        "batch_size": 64,
        "num_epoch": 100,
        "epoch_stop": 10,
        "learning_rate": 0.01,
        "scheduler_step_size": 40,
    }
}

In [92]:
# get data from the configuration file
def get_data(config):
    ts = TimeSeries(key=config["alpha_vantage"]["key"]) 
    data, meta_data = ts.get_daily_adjusted(config["alpha_vantage"]["symbol"], outputsize=config["alpha_vantage"]["outputsize"])

    date_data = [date for date in data.keys()]
    date_data.reverse()

    close_price_data = [float(data[date][config["alpha_vantage"]["key_adjusted_close"]]) for date in data.keys()]
    close_price_data.reverse()
    close_price_data = np.array(close_price_data)

    num_data_points = len(date_data)
    display_date_range = "from " + date_data[0] + " to " + date_data[num_data_points-1]
    print("Number data points", num_data_points, display_date_range)

    return date_data, close_price_data, num_data_points, display_date_range


date_data, close_price_data, num_data_points, display_date_range = get_data(config)

close_price_data


Number data points 5887 from 1999-11-01 to 2023-03-24


array([ 88.21718508,  87.58674188,  88.17651326, ..., 392.11      ,
       393.17      , 395.75      ])

In [93]:
split_index = num_data_points-1

next_weeks = mcal.date_range(mcal.get_calendar('NYSE').schedule(start_date=date_data[split_index], end_date=(datetime.strptime(date_data[split_index],'%Y-%m-%d')+relativedelta(months=1)).strftime('%Y-%m-%d')), frequency='1D')
next_weeks = [date.strftime('%Y-%m-%d') for date in next_weeks]
date_data = date_data + next_weeks

num_data_points = len(date_data)
print("Number data points", num_data_points)
date_data

Number data points 5908


['1999-11-01',
 '1999-11-02',
 '1999-11-03',
 '1999-11-04',
 '1999-11-05',
 '1999-11-08',
 '1999-11-09',
 '1999-11-10',
 '1999-11-11',
 '1999-11-12',
 '1999-11-15',
 '1999-11-16',
 '1999-11-17',
 '1999-11-18',
 '1999-11-19',
 '1999-11-22',
 '1999-11-23',
 '1999-11-24',
 '1999-11-26',
 '1999-11-29',
 '1999-11-30',
 '1999-12-01',
 '1999-12-02',
 '1999-12-03',
 '1999-12-06',
 '1999-12-07',
 '1999-12-08',
 '1999-12-09',
 '1999-12-10',
 '1999-12-13',
 '1999-12-14',
 '1999-12-15',
 '1999-12-16',
 '1999-12-17',
 '1999-12-20',
 '1999-12-21',
 '1999-12-22',
 '1999-12-23',
 '1999-12-27',
 '1999-12-28',
 '1999-12-29',
 '1999-12-30',
 '1999-12-31',
 '2000-01-03',
 '2000-01-04',
 '2000-01-05',
 '2000-01-06',
 '2000-01-07',
 '2000-01-10',
 '2000-01-11',
 '2000-01-12',
 '2000-01-13',
 '2000-01-14',
 '2000-01-18',
 '2000-01-19',
 '2000-01-20',
 '2000-01-21',
 '2000-01-24',
 '2000-01-25',
 '2000-01-26',
 '2000-01-27',
 '2000-01-28',
 '2000-01-31',
 '2000-02-01',
 '2000-02-02',
 '2000-02-03',
 '2000-02-

In [94]:
# class with functions to normalize the data for more accurate predictions

class Normalization():
    def __init__(self):
        self.mu = None
        self.sd = None

    def fit_transform(self, x):
        self.mu = np.mean(x, axis=(0), keepdims=True)
        self.sd = np.std(x, axis=(0), keepdims=True)
        normalized_x = (x - self.mu)/self.sd
        return normalized_x

    def inverse_transform(self, x):
        return (x*self.sd) + self.mu

    
scaler = Normalization()
normalized_close_price_data = np.concatenate((scaler.fit_transform(close_price_data),np.full(len(next_weeks),np.nan)))
normalized_close_price_data

array([-0.69745504, -0.70343252, -0.69784066, ...,         nan,
               nan,         nan])

In [115]:
# prep for data training

def prepare_data_x(x, window_size):
    # perform windowing
    n_row = x.shape[0] - window_size + 1
    output = np.lib.stride_tricks.as_strided(x, shape=(n_row, window_size), strides=(x.strides[0], x.strides[0]))
    return output[:-1], output[-1]


def prepare_data_y(x, window_size):
    # use the next day as label
    output = x[window_size:]
    return output

data_x, data_x_unseen = prepare_data_x(normalized_close_price_data, window_size=config["data"]["window_size"])
data_y = prepare_data_y(normalized_close_price_data, window_size=config["data"]["window_size"])

In [116]:
split_index = split_index-config["data"]["window_size"]+2
data_x_train = data_x[:split_index]
data_x_val = data_x[split_index:]
data_y_train = data_y[:split_index]
data_y_val = data_y[split_index:]

In [117]:
# Class to prepare data for training and LSTM model
class TimeSeriesDataset(Dataset):
    def __init__(self, x, y):
        x = np.expand_dims(x, 2) # right now we have only 1 feature, so we need to convert `x` into [batch, sequence, features]
        self.x = x.astype(np.float32)
        self.y = y.astype(np.float32)
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

# prepare and shuffle data

train_dataset = TimeSeriesDataset(data_x_train, data_y_train)
val_dataset = TimeSeriesDataset(data_x_val, data_y_val)

print("Train data shape", train_dataset.x.shape, train_dataset.y.shape)
print("Validation data shape", val_dataset.x.shape, val_dataset.y.shape)

train_dataloader = DataLoader(train_dataset, batch_size=config["training"]["batch_size"], shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=config["training"]["batch_size"], shuffle=True)
train_dataloader

Train data shape (5670, 20, 1) (5670,)
Validation data shape (218, 20, 1) (218,)


<torch.utils.data.dataloader.DataLoader at 0x2176d481ee0>

In [118]:
# neural network model definition

class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=32, num_layers=2, output_size=1, dropout=0.2):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.linear_1 = nn.Linear(input_size, hidden_layer_size)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(hidden_layer_size, hidden_size=self.hidden_layer_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(num_layers*hidden_layer_size, output_size)
        
        self.init_weights()

    def init_weights(self):
        for name, param in self.lstm.named_parameters():
            if 'bias' in name:
                 nn.init.constant_(param, 0.0)
            elif 'weight_ih' in name:
                 nn.init.kaiming_normal_(param)
            elif 'weight_hh' in name:
                 nn.init.orthogonal_(param)

    def forward(self, x):
        batchsize = x.shape[0]

        # layer 1
        x = self.linear_1(x)
        x = self.relu(x)
        
        # LSTM layer
        lstm_out, (h_n, c_n) = self.lstm(x)

        # reshape output from hidden cell into [batch, features] for `linear_2`
        x = h_n.permute(1, 0, 2).reshape(batchsize, -1) 
        
        # layer 2
        x = self.dropout(x)
        predictions = self.linear_2(x)
        return predictions[:,-1]

In [119]:
# function for training LSTM model

def run_epoch(dataloader, is_training=False):
    epoch_loss = 0

    if is_training:
        model.train()
    else:
        model.eval()

    for idx, (x, y) in enumerate(dataloader):
        if is_training:
            optimizer.zero_grad()

        batchsize = x.shape[0]

        x = x.to(config["training"]["device"])
        y = y.to(config["training"]["device"])

        out = model(x)
        loss = criterion(out.contiguous(), y.contiguous())

        if is_training:
            loss.backward()
            optimizer.step()

        epoch_loss += (loss.detach().item() / batchsize)

    lr = scheduler.get_last_lr()[0]

    return epoch_loss, lr


In [120]:
# train the data

train_dataloader = DataLoader(train_dataset, batch_size=config["training"]["batch_size"], shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=config["training"]["batch_size"], shuffle=True)

model = LSTMModel(input_size=config["model"]["input_size"], hidden_layer_size=config["model"]["lstm_size"], num_layers=config["model"]["num_lstm_layers"], output_size=1, dropout=config["model"]["dropout"])
model = model.to(config["training"]["device"])

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=config["training"]["learning_rate"], betas=(0.9, 0.98), eps=1e-9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=config["training"]["scheduler_step_size"], gamma=0.1)

best_loss = np.inf
epochs_no_improve = 0
n_epochs_stop = config["training"]["epoch_stop"]
for epoch in range(config["training"]["num_epoch"]):
    loss_train, lr_train = run_epoch(train_dataloader, is_training=True)
    scheduler.step()
    
    print('Epoch[{}/{}] | loss train:{:.6f}'
              .format(epoch+1, config["training"]["num_epoch"], loss_train))
    if loss_train < best_loss:
        best_loss = loss_train
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
    if epochs_no_improve == n_epochs_stop:
        print("Early stopping.")
        break

Epoch[1/100] | loss train:0.066974
Epoch[2/100] | loss train:0.011827
Epoch[3/100] | loss train:0.012185
Epoch[4/100] | loss train:0.011146
Epoch[5/100] | loss train:0.009589
Epoch[6/100] | loss train:0.009113
Epoch[7/100] | loss train:0.009263
Epoch[8/100] | loss train:0.009772
Epoch[9/100] | loss train:0.008041
Epoch[10/100] | loss train:0.009413
Epoch[11/100] | loss train:0.007792
Epoch[12/100] | loss train:0.008568
Epoch[13/100] | loss train:0.009615
Epoch[14/100] | loss train:0.008568
Epoch[15/100] | loss train:0.008162
Epoch[16/100] | loss train:0.007579
Epoch[17/100] | loss train:0.008121
Epoch[18/100] | loss train:0.007929
Epoch[19/100] | loss train:0.008446
Epoch[20/100] | loss train:0.008514
Epoch[21/100] | loss train:0.008277
Epoch[22/100] | loss train:0.008009
Epoch[23/100] | loss train:0.007618
Epoch[24/100] | loss train:0.007964
Epoch[25/100] | loss train:0.007694
Epoch[26/100] | loss train:0.007580
Early stopping.


In [123]:
# here we re-initialize dataloader so the data doesn't shuffled, so we can plot the values by date

train_dataloader = DataLoader(train_dataset, batch_size=config["training"]["batch_size"], shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=config["training"]["batch_size"], shuffle=False)

model.eval()

# predict on the training data, to see how well the model managed to learn and memorize

predicted_train = np.array([])

for idx, (x, y) in enumerate(train_dataloader):
    x = x.to(config["training"]["device"])
    out = model(x)
    out = out.cpu().detach().numpy()
    predicted_train = np.concatenate((predicted_train, out))

# predict on the validation data, to see how the model does

predicted_val = np.array([])

for idx, (x, y) in enumerate(val_dataloader):
    x = x.to(config["training"]["device"])
    out = model(x)
    out = out.cpu().detach().numpy()
    predicted_val = np.concatenate((predicted_val, out))

data_y_train_pred = np.zeros(num_data_points)
data_y_val_pred = np.zeros(num_data_points)

data_y_train_pred[config["data"]["window_size"]:split_index+config["data"]["window_size"]] = scaler.inverse_transform(predicted_train)
data_y_val_pred[split_index+config["data"]["window_size"]:] = scaler.inverse_transform(predicted_val)

data_y_train_pred = np.where(data_y_train_pred == 0, None, data_y_train_pred)
data_y_val_pred = np.where(data_y_val_pred == 0, None, data_y_val_pred)


In [133]:
data_y_val_pred

array([None, None, None, ..., nan, nan, nan], dtype=object)