In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import torch
import torch.nn as nn
from sklearn.metrics import r2_score
from torch.utils.data import DataLoader

In [2]:
with open("sp500_list.txt", "r") as f:
    sp500_list = [line.strip() for line in f]

In [3]:
rfdata = pd.read_csv("market_data.csv", index_col=0, low_memory=False)
rfdata = rfdata.drop(rfdata.index[[0,1]])
rfdata.columns = pd.MultiIndex.from_arrays([["Adj Close"]*503 + ["Volume"]*503, sp500_list*2])
rfdata.index = pd.core.indexes.datetimes.DatetimeIndex(rfdata.index)
rfdata = rfdata.astype(float)

data = rfdata["Adj Close"]
data = data.dropna(axis=1)

In [4]:
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [91]:
from copy import deepcopy as dc

def prepare_dataframe_for_lstm(df, n_steps):
    df = dc(df)
    for j in range(1, n_steps+1):
        df[f'Adj Close(t-{j})'] = df['Adj Close'].shift(j)
        df.dropna(inplace=True)
        return df

In [6]:
from torch.utils.data import Dataset

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [7]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_stacked_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_stacked_layers = num_stacked_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, 
                            batch_first=True)
        
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        batch_size = x.size(0)
        h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(device)
        c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size).to(device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [9]:
    class LSTM(nn.Module):
        def __init__(self, input_size, hidden_size, num_stacked_layers):
            super().__init__()
            self.hidden_size = hidden_size
            self.num_stacked_layers = num_stacked_layers

            self.lstm = nn.LSTM(input_size, hidden_size, num_stacked_layers, batch_first=True)
        
            self.fc = nn.Linear(hidden_size, 1)

        def forward(self, x):
            batch_size = 16
            h0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size)
            c0 = torch.zeros(self.num_stacked_layers, batch_size, self.hidden_size)
        
            out, _ = self.lstm(x, (h0, c0))
            out = self.fc(out[:, -1, :])
            return out

In [112]:
%%time

score_train_lstm = []
score_test_lstm = []
lstm_pred = pd.DataFrame(columns=data.columns)

for i in data.columns:
    frame = data[i].pct_change().dropna().to_frame()
    frame = frame.rename(columns={ i : "Adj Close"})

    def prepare_dataframe_for_lstm(df, n_steps):
        df = dc(df)
    
        for j in range(1, n_steps+1):
            df[f'Adj Close(t-{j})'] = df['Adj Close'].shift(j)
        df.dropna(inplace=True)
        return df
    lookback = 16
    shifted_df = prepare_dataframe_for_lstm(frame, lookback)
    
    y = shifted_df["Adj Close"]
    X = shifted_df.drop(columns =["Adj Close"])
    
    X_train = X["2013-01-01":"2023-01-01"].to_numpy()
    X_train = dc(np.flip(X_train, axis=1))
    X_test = X["2023-01-01":].to_numpy()
    X_test = dc(np.flip(X_test, axis=1))
    y_train = y["2013-01-01":"2023-01-01"].to_numpy()
    y_test = y["2023-01-01":].to_numpy()
    
    X_train = X_train.reshape((-1, lookback, 1))
    X_test = X_test.reshape((-1, lookback, 1))
    y_train = y_train_np.reshape((-1, 1))
    y_test = y_test_np.reshape((-1, 1))

    X_train = torch.tensor(X_train).float()
    y_train = torch.tensor(y_train).float()
    X_test = torch.tensor(X_test).float()
    y_test = torch.tensor(y_test).float()
    
    train_dataset = TimeSeriesDataset(X_train, y_train)
    test_dataset = TimeSeriesDataset(X_test, y_test)
    
    batch_size=16
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = LSTM(1, 16, 1)
    model.to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)
    
    for epoch in range(5):
        for seqs, labels in train_loader:
            # batch size, seq length, num features
            seqs = seqs.view(16, len(seqs), 1)
            # Get model outputs
            outputs = model(seqs).squeeze()
            # Compute loss
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    with torch.no_grad():    
        predicted_train = model(X_train.to(device)).to('cpu').numpy()
        predicted_test = model(X_test.to(device)).to('cpu').numpy()
            
    score_test_lstm.append(r2_score(np.array(y_test).reshape(250), predicted_test.reshape(250)))
    score_train_lstm.append(r2_score(np.array(y_train).reshape(2518), predicted_train.reshape(2518)))
                                                  
    lstm_pred[i] = predicted_test.reshape(250)
    

lstm_score = pd.DataFrame({"score_train" : score_train_lstm, "score_test": score_test_lstm}, data.columns)

CPU times: total: 46min 8s
Wall time: 32min 3s


In [116]:
#lstm_score.to_csv("LSTMscore.csv")

In [117]:
#lstm_pred.to_csv("LSTMpred.csv")

In [145]:
lstm_score.sort_values(by='score_train', ascending=False)[:50]

Unnamed: 0,score_train,score_test
TMUS,0.005299,-0.018719
TFC,0.004474,-0.003277
JNJ,0.00369,0.002095
VRSN,0.002824,-0.01202
DVN,0.002766,-0.003264
MDLZ,0.00264,-0.003151
BMY,0.002081,0.000246
AON,0.002056,-0.009574
MMM,0.002035,-0.007552
WTW,0.002027,-0.004613
