This notebook explores a machine learning algorithm to predict the stock prices of SPY, the S&P 500 ETF, and is intended to utilize functions that can be easily translated to a python executable.

In [5]:
# installing dependencies
!pip install numpy
!pip install pandas
!pip install torch
!pip install matplotlib
!pip install alpha_vantage
!pip install scikit-learn
!pip install pandas_market_calendars
!pip install lxml



Collecting lxml
  Downloading lxml-4.9.2-cp39-cp39-win_amd64.whl (3.9 MB)
     ---------------------------------------- 0.0/3.9 MB ? eta -:--:--
     - -------------------------------------- 0.1/3.9 MB 3.3 MB/s eta 0:00:02
     - -------------------------------------- 0.1/3.9 MB 3.3 MB/s eta 0:00:02
     - -------------------------------------- 0.1/3.9 MB 3.3 MB/s eta 0:00:02
     - -------------------------------------- 0.1/3.9 MB 658.7 kB/s eta 0:00:06
     -- ------------------------------------- 0.3/3.9 MB 1.1 MB/s eta 0:00:04
     --- ------------------------------------ 0.4/3.9 MB 1.3 MB/s eta 0:00:03
     ---- ----------------------------------- 0.5/3.9 MB 1.4 MB/s eta 0:00:03
     ----- ---------------------------------- 0.6/3.9 MB 1.6 MB/s eta 0:00:03
     ------ --------------------------------- 0.6/3.9 MB 1.6 MB/s eta 0:00:02
     ------ --------------------------------- 0.6/3.9 MB 1.6 MB/s eta 0:00:02
     ------- -------------------------------- 0.7/3.9 MB 1.5 MB/s eta 0:0

In [1]:
# import libararies
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from alpha_vantage.timeseries import TimeSeries 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from datetime import datetime,timedelta
from dateutil.relativedelta import relativedelta
import pandas_market_calendars as mcal

In [2]:
# config file (placing here for now, some fields will change on later impplementations)

config = {
    "alpha_vantage": {
        "key": "2JMCN347HZ3BU9RC", 
        "symbol": "SPY",
        "outputsize": "full",
        "key_adjusted_close": "5. adjusted close",
    },
    "data": {
        "window_size": 30,
        "train_split_size": 1,
    }, 
    "plots": {
        "xticks_interval": 90, # show a date every 90 days
        "color_actual": "#001f3f",
        "color_train": "#3D9970",
        "color_val": "#0074D9",
        "color_pred_train": "#3D9970",
        "color_pred_val": "#0074D9",
        "color_pred_test": "#FF4136",
    },
    "model": {
        "input_size": 1, # since for now we are only using close price
        "num_lstm_layers": 2,
        "lstm_size": 32,
        "dropout": 0.2,
    },
    "training": {
        "device": "cpu",
        "batch_size": 64,
        "num_epoch": 100,
        "epoch_stop": 10,
        "learning_rate": 0.01,
        "scheduler_step_size": 40,
    }
}

In [3]:
sp500 = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
sp500

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1129)>

In [4]:
# get data from the configuration file
def get_data(config):
    ts = TimeSeries(key=config["alpha_vantage"]["key"]) 
    data, meta_data = ts.get_daily_adjusted(config["alpha_vantage"]["symbol"], outputsize=config["alpha_vantage"]["outputsize"])

    date_data = [date for date in data.keys()]
    date_data.reverse()

    close_price_data = [float(data[date][config["alpha_vantage"]["key_adjusted_close"]]) for date in data.keys()]
    close_price_data.reverse()
    close_price_data = np.array(close_price_data)

    num_data_points = len(date_data)
    display_date_range = "from " + date_data[0] + " to " + date_data[num_data_points-1]
    print("Number data points", num_data_points, display_date_range)

    return date_data, close_price_data, num_data_points, display_date_range


date_data, close_price_data, num_data_points, display_date_range = get_data(config)

close_price_data


Number data points 5887 from 1999-11-01 to 2023-03-24


array([ 88.21718508,  87.58674188,  88.17651326, ..., 392.11      ,
       393.17      , 395.75      ])

In [5]:
# class with functions to normalize the data for more accurate predictions

class Normalization():
    def __init__(self):
        self.mu = None
        self.sd = None

    def fit_transform(self, x):
        self.mu = np.mean(x, axis=(0), keepdims=True)
        self.sd = np.std(x, axis=(0), keepdims=True)
        normalized_x = (x - self.mu)/self.sd
        return normalized_x

    def inverse_transform(self, x):
        return (x*self.sd) + self.mu

    
scaler = Normalization()
normalized_close_price_data = scaler.fit_transform(close_price_data)
normalized_close_price_data

array([-0.69745504, -0.70343252, -0.69784066, ...,  2.18387452,
        2.19392481,  2.21838682])

In [6]:
# prep for data training

def prepare_data_x(x, window_size):
    # perform windowing
    n_row = x.shape[0] - window_size + 1
    output = np.lib.stride_tricks.as_strided(x, shape=(n_row, window_size), strides=(x.strides[0], x.strides[0]))
    return output[:-1], output[-1]


def prepare_data_y(x, window_size):
    # use the next day as label
    output = x[window_size:]
    return output

data_x, data_x_unseen = prepare_data_x(normalized_close_price_data, window_size=config["data"]["window_size"])
data_y = prepare_data_y(normalized_close_price_data, window_size=config["data"]["window_size"])

In [7]:
# split dataset into training and validation sets

split_index = int(data_y.shape[0]*config["data"]["train_split_size"])
data_x_train = data_x[:split_index]
data_x_val = data_x[split_index:]
data_y_train = data_y[:split_index]
data_y_val = data_y[split_index:]

In [8]:
# Class to prepare data for training and LSTM model
class TimeSeriesDataset(Dataset):
    def __init__(self, x, y):
        x = np.expand_dims(x, 2) # right now we have only 1 feature, so we need to convert `x` into [batch, sequence, features]
        self.x = x.astype(np.float32)
        self.y = y.astype(np.float32)
        
    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return (self.x[idx], self.y[idx])

# prepare and shuffle data

train_dataset = TimeSeriesDataset(data_x_train, data_y_train)

print("Train data shape", train_dataset.x.shape, train_dataset.y.shape)

train_dataloader = DataLoader(train_dataset, batch_size=config["training"]["batch_size"], shuffle=True)

Train data shape (5857, 30, 1) (5857,)


In [9]:
# neural network model definition

class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=32, num_layers=2, output_size=1, dropout=0.2):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.linear_1 = nn.Linear(input_size, hidden_layer_size)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(hidden_layer_size, hidden_size=self.hidden_layer_size, num_layers=num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(num_layers*hidden_layer_size, output_size)
        
        self.init_weights()

    def init_weights(self):
        for name, param in self.lstm.named_parameters():
            if 'bias' in name:
                 nn.init.constant_(param, 0.0)
            elif 'weight_ih' in name:
                 nn.init.kaiming_normal_(param)
            elif 'weight_hh' in name:
                 nn.init.orthogonal_(param)

    def forward(self, x):
        batchsize = x.shape[0]

        # layer 1
        x = self.linear_1(x)
        x = self.relu(x)
        
        # LSTM layer
        lstm_out, (h_n, c_n) = self.lstm(x)

        # reshape output from hidden cell into [batch, features] for `linear_2`
        x = h_n.permute(1, 0, 2).reshape(batchsize, -1) 
        
        # layer 2
        x = self.dropout(x)
        predictions = self.linear_2(x)
        return predictions[:,-1]

In [10]:
# function for training LSTM model

def run_epoch(dataloader, is_training=False):
    epoch_loss = 0

    if is_training:
        model.train()
    else:
        model.eval()

    for idx, (x, y) in enumerate(dataloader):
        if is_training:
            optimizer.zero_grad()

        batchsize = x.shape[0]

        x = x.to(config["training"]["device"])
        y = y.to(config["training"]["device"])

        out = model(x)
        loss = criterion(out.contiguous(), y.contiguous())

        if is_training:
            loss.backward()
            optimizer.step()

        epoch_loss += (loss.detach().item() / batchsize)

    lr = scheduler.get_last_lr()[0]

    return epoch_loss, lr


In [11]:
# train the data

train_dataloader = DataLoader(train_dataset, batch_size=config["training"]["batch_size"], shuffle=True)

model = LSTMModel(input_size=config["model"]["input_size"], hidden_layer_size=config["model"]["lstm_size"], num_layers=config["model"]["num_lstm_layers"], output_size=1, dropout=config["model"]["dropout"])
model = model.to(config["training"]["device"])

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=config["training"]["learning_rate"], betas=(0.9, 0.98), eps=1e-9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=config["training"]["scheduler_step_size"], gamma=0.1)

best_loss = np.inf
epochs_no_improve = 0
n_epochs_stop = config["training"]["epoch_stop"]
for epoch in range(config["training"]["num_epoch"]):
    loss_train, lr_train = run_epoch(train_dataloader, is_training=True)
    scheduler.step()
    
    print('Epoch[{}/{}] | loss train:{:.6f}| lr:{:.6f}'
              .format(epoch+1, config["training"]["num_epoch"], loss_train, lr_train))
    if loss_train < best_loss:
        best_loss = loss_train
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
    if epochs_no_improve == n_epochs_stop:
        print("Early stopping.")
        break

Epoch[1/100] | loss train:0.080056| lr:0.010000
Epoch[2/100] | loss train:0.015383| lr:0.010000
Epoch[3/100] | loss train:0.013137| lr:0.010000
Epoch[4/100] | loss train:0.011916| lr:0.010000
Epoch[5/100] | loss train:0.011479| lr:0.010000
Epoch[6/100] | loss train:0.011825| lr:0.010000
Epoch[7/100] | loss train:0.010169| lr:0.010000
Epoch[8/100] | loss train:0.010285| lr:0.010000
Epoch[9/100] | loss train:0.010507| lr:0.010000
Epoch[10/100] | loss train:0.009795| lr:0.010000
Epoch[11/100] | loss train:0.011377| lr:0.010000
Epoch[12/100] | loss train:0.010299| lr:0.010000
Epoch[13/100] | loss train:0.011874| lr:0.010000
Epoch[14/100] | loss train:0.009996| lr:0.010000
Epoch[15/100] | loss train:0.010155| lr:0.010000
Epoch[16/100] | loss train:0.009313| lr:0.010000
Epoch[17/100] | loss train:0.010244| lr:0.010000
Epoch[18/100] | loss train:0.009489| lr:0.010000
Epoch[19/100] | loss train:0.010539| lr:0.010000
Epoch[20/100] | loss train:0.010043| lr:0.010000
Epoch[21/100] | loss train:0.

In [12]:
# here we re-initialize dataloader so the data doesn't shuffled, so we can plot the values by date

train_dataloader = DataLoader(train_dataset, batch_size=config["training"]["batch_size"], shuffle=False)

model.eval()

# predict on the training data, to see how well the model managed to learn and memorize

predicted_train = np.array([])

for idx, (x, y) in enumerate(train_dataloader):
    x = x.to(config["training"]["device"])
    out = model(x)
    out = out.cpu().detach().numpy()
    predicted_train = np.concatenate((predicted_train, out))

# predict on the validation data, to see how the model does  

data_y_train_pred = np.zeros(num_data_points)
data_y_val_pred = np.zeros(num_data_points)

data_y_train_pred[config["data"]["window_size"]:split_index+config["data"]["window_size"]] = scaler.inverse_transform(predicted_train)

mae = mean_absolute_error(close_price_data, data_y_train_pred + data_y_val_pred)
print("Mean absolute error: ", mae)

rmse = np.sqrt(mean_squared_error(close_price_data, data_y_train_pred+data_y_val_pred))
print("Root mean squared error: ", rmse)

data_y_train_pred = np.where(data_y_train_pred == 0, None, data_y_train_pred)

Mean absolute error:  1.9108791966545773
Root mean squared error:  6.94512388482604


In [13]:
model.eval()

n = num_data_points
next_weeks = mcal.date_range(mcal.get_calendar('NYSE').schedule(start_date=date_data[n-1], end_date=(datetime.strptime(date_data[n-1],'%Y-%m-%d')+relativedelta(months=1)).strftime('%Y-%m-%d')), frequency='1D')
next_weeks = [date.strftime('%Y-%m-%d') for date in next_weeks]
next_weeks = next_weeks[1:]
next_day = next_weeks[0]
next_day

'2023-03-27'

In [14]:
torch.tensor(data_x_unseen)
x = torch.tensor(data_x_unseen).float().to(config["training"]["device"]).unsqueeze(0).unsqueeze(2) # this is the data type and shape required, [batch, sequence, feature]
prediction = model(x)
prediction = prediction.cpu().detach().numpy()
prediction[0] = scaler.inverse_transform(prediction[0])
prediction[0]

# print(x)
# prediction = []
# for _ in range(1,len(next_weeks)):
#     yhat = model(x)
#     yhat = yhat.cpu().detach().numpy()
#     xarray = np.concatenate((x[0].numpy(),[yhat]))
#     np.expand_dims(xarray,1)
#     print(xarray)
#     yhat = scaler.inverse_transform(yhat)
#     prediction.append(yhat)
#     x = torch.tensor([xarray])
    
# prediction

397.11868

In [15]:
df = pd.DataFrame([[next_day,prediction[0]]], columns = ['date','close price'])
df.to_csv("csv/TimeSeries/SPY_predict.csv")
df

Unnamed: 0,date,close price
0,2023-03-27,397.118683
