# QRT Portfolio Optimisation

In [1]:
import pandas as pd
import lseg.data as ld
from pprint import pprint

ld.open_session()

print("Connected to Workspace")

Connected to Workspace


## Fetch the Data
We import the universe that we wish to operate in as well as the various factors that we would like to use

In [2]:
# The universe of equities we will be dealing with
with open('universe.txt') as f:
    universe = f.read().splitlines()

print("Universe of size", len(universe))
print(universe)

field_names = []
fields = {} # fields[name] = {}

class field:
    name = ""
    lag = 0
    string_rep = ""
    
    def __repr__(self):
        return f"ric = {self.name}, lag = {self.lag} ({self.string_rep})"
    
# The various fields we will be using
with open('technical_fields.txt', 'r') as f:
    for line in f:
        field_name, lag = line.split()
        field_names.append(field_name)
        fields[field_name] = field()
        fields[field_name].name = field_name
        fields[field_name].lag = int(lag)

# with open('fundamental_fields.txt', 'r') as f:
#     for line in f:
#         name = line.strip()
#         fields[name] = 0

print("Fields: ", fields.keys())

Universe of size 25
['AAPL.O', 'AMZN.O', 'AVGO.O', 'BAC', 'BRKb', 'COST.O', 'CRM', 'GOOG.O', 'GOOGL.O', 'HD', 'JNJ', 'JPM', 'LLY', 'MA', 'META.O', 'MSFT.O', 'NFLX.O', 'NVDA.O', 'ORCL.K', 'PG', 'TSLA.O', 'UNH', 'V', 'WMT', 'XOM']
Fields:  dict_keys(['TR.TotalReturn1Wk', 'TR.Volume', 'TR.CompanyMarketCapitalization', 'TR.PriceClose', 'TR.PriceOpen'])


In [3]:
start_date = "2020-06-01"
end_date = "2024-10-01"

start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

print("Start Date: ", start_date)
print("End Date: ", end_date)



Start Date:  2020-06-01 00:00:00
End Date:  2024-10-01 00:00:00


In [4]:
# Get a sample of the data for the universe of equities
import pandas as pd
from datetime import timedelta

def fetch_data_in_chunks(start_date, end_date, chunk_size=182):
    # start and end date are datetime objects
    # chunk_size is the number of days to fetch in one go
    data = []
    current_start = start_date
    while current_start < end_date:
        current_end = min(current_start + timedelta(days=chunk_size), end_date)
        print(f"Fetching data from {current_start} to {current_end}")
        current_start_str = current_start.isoformat() + 'Z'
        current_end_str = current_end.isoformat() + 'Z'
        chunk_data = ld.get_history(universe, list(fields.keys()), start=current_start_str, end=current_end_str, interval='1d')
        data.append(chunk_data)
        current_start = current_end + timedelta(days=1)
    return pd.concat(data)

# data = ld.get_history(universe, list(fields.keys()), start=start_date, end=end_date, interval='1d')
data = fetch_data_in_chunks(start_date, end_date)
print("Data shape: ", data.shape)
data.head()

Fetching data from 2020-06-01 00:00:00 to 2020-11-30 00:00:00




Fetching data from 2020-12-01 00:00:00 to 2021-06-01 00:00:00




Fetching data from 2021-06-02 00:00:00 to 2021-12-01 00:00:00




Fetching data from 2021-12-02 00:00:00 to 2022-06-02 00:00:00




Fetching data from 2022-06-03 00:00:00 to 2022-12-02 00:00:00




Fetching data from 2022-12-03 00:00:00 to 2023-06-03 00:00:00




Fetching data from 2023-06-04 00:00:00 to 2023-12-03 00:00:00




Fetching data from 2023-12-04 00:00:00 to 2024-06-03 00:00:00




Fetching data from 2024-06-04 00:00:00 to 2024-10-01 00:00:00
Data shape:  (1092, 125)




Unnamed: 0_level_0,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AMZN.O,AMZN.O,AMZN.O,AMZN.O,AMZN.O,...,WMT,WMT,WMT,WMT,WMT,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,...,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-06-01,0.92822,81018612,1395005719750,80.4625,79.4375,1.401792,58760639,1232495526113.28,123.552,122.4,...,-0.297595,20567276,351048601331.726,41.319959,41.146626,3.766816,19038401,195681616742.56,46.28,45.32
2020-06-02,2.086951,87642816,1401463878900,80.835,80.18625,2.087239,50598059,1233178849277.12,123.6205,123.35,...,0.064589,19187824,350991959430.772,41.313292,41.196625,3.049445,22318978,200036674332.12,47.31,46.86
2020-06-03,2.20364,104491216,1409178995200,81.28,81.165,2.821535,53420479,1236166517708.8,123.92,123.4005,...,0.808295,20263019,349660946973.179,41.156626,41.299959,6.487889,23418047,208197122048.48,49.24,48.0
2020-06-04,1.278869,87560364,1397042857200,80.58,81.0975,2.478031,58974199,1227288304339.2,123.03,123.8715,...,-1.277387,24014913,345809493113.13293,40.703293,40.933292,9.01421,18731361,207605172473.2,49.1,48.88
2020-06-05,4.264956,137250200,1436832052500,82.875,80.8375,1.663548,66128899,1238460887456.0,124.15,122.2255,...,-2.015154,39337206,344251913051.726,40.519959,40.856626,16.73631,41328943,224433453256.16,53.08,51.65


In [5]:
data.tail()

Unnamed: 0_level_0,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AMZN.O,AMZN.O,AMZN.O,AMZN.O,AMZN.O,...,WMT,WMT,WMT,WMT,WMT,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,...,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-09-25,2.573746,42308715,3441760492690,226.37,224.93,3.272006,26391144,2020711491598.9297,192.53,193.75,...,2.998861,14163785,654313645563.6,81.4,80.89,0.165823,13816041,509903206586.6,114.77,116.485
2024-09-26,-0.589855,36636707,3459245250240,227.52,227.3,0.679412,36334854,2006332564971.96,191.16,194.31,...,2.409021,17061133,642417033826.08,79.92,81.04,-2.758621,16887908,501150838224.0,112.8,111.14
2024-09-27,-0.179667,34025967,3463350367230,227.79,228.46,-1.894572,36002316,1972851706621.57,187.97,190.68,...,0.910701,11899050,641291678661.72,79.78,79.9,0.477141,15963973,514568174495.6,115.82,113.76
2024-09-30,2.883384,54793391,3522211138000,233.0,230.04,-3.894161,41680400,1958514630000.0,186.33,187.14,...,0.522843,19008187,649088782300.5,80.75,79.89,-0.119291,13250657,515192981509.92,117.22,115.46
2024-10-01,-0.510182,63285048,3419568161060,226.21,229.52,-4.552485,36044906,1945901430000.0,185.13,184.9,...,0.743771,16054249,653268672910.98,81.27,80.68,2.460487,23235878,527103687702.48,119.93,116.04


In [7]:
# Export the data to a CSV file
data.to_csv('data.csv')

In [8]:
for i in range(len(field_names)):
    fields[field_names[i]].string_rep = data.columns[i][1]

pprint(fields)

{'TR.CompanyMarketCapitalization': ric = TR.CompanyMarketCapitalization, lag = 1 (Company Market Capitalization),
 'TR.PriceClose': ric = TR.PriceClose, lag = 2 (Price Close),
 'TR.PriceOpen': ric = TR.PriceOpen, lag = 2 (Price Open),
 'TR.TotalReturn1Wk': ric = TR.TotalReturn1Wk, lag = 0 (1 Week Total Return),
 'TR.Volume': ric = TR.Volume, lag = 1 (Volume)}


## Construct the Dataset
Our data is based on time series. We would do well to append columns of lagged data to the data set to allow the model to refer to past data.
We also construct a target variable that represents the percentage change in the close price in the next $q$ days. The idea is to rebalance every $q$ days.

In [9]:
# data = pd.read_csv('data.csv', index_col=0, parse_dates=True)
# data = data[1:]

In [10]:
data = data.loc[~data.index.duplicated(keep='first')]

In [11]:
# Construct the lagged fields

def expand_data(data_in): 
    expanded_data = data_in.copy()
    for ric in universe:
        for field_name in field_names:
            lag = fields[field_name].lag
            string_rep = fields[field_name].string_rep
            for cur_lag in range(1, lag+1):
                expanded_data[(ric, f'{string_rep}_lag_{cur_lag}')] = data_in[(ric, f'{string_rep}')].shift(cur_lag)
    expanded_data = expanded_data.sort_index(axis=1)
    expanded_data.dropna(inplace=True)
    return expanded_data
       
# Sort the lagged fields back into the correct order
expanded_data = expand_data(data)
expanded_data.head()

Unnamed: 0_level_0,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,...,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,1 Week Total Return,Company Market Capitalization,Company Market Capitalization_lag_1,Price Close,Price Close_lag_1,Price Close_lag_2,Price Open,Price Open_lag_1,Price Open_lag_2,Volume,...,Company Market Capitalization,Company Market Capitalization_lag_1,Price Close,Price Close_lag_1,Price Close_lag_2,Price Open,Price Open_lag_1,Price Open_lag_2,Volume,Volume_lag_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-06-03,2.20364,1409178995200,1401463878900,81.28,80.835,80.4625,81.165,80.18625,79.4375,104491216,...,208197122048.48,200036674332.12,49.24,47.31,46.28,48.0,46.86,45.32,23418047,22318978
2020-06-04,1.278869,1397042857200,1409178995200,80.58,81.28,80.835,81.0975,81.165,80.18625,87560364,...,207605172473.2,208197122048.48,49.1,49.24,47.31,48.88,48.0,46.86,18731361,23418047
2020-06-05,4.264956,1436832052500,1397042857200,82.875,80.58,81.28,80.8375,81.0975,81.165,137250200,...,224433453256.16,207605172473.2,53.08,49.1,49.24,51.65,48.88,48.0,41328943,18731361
2020-06-08,3.60727,1445327349100,1436832052500,83.365,82.875,80.58,82.5625,80.8375,81.0975,95654536,...,231452283934.48,224433453256.16,54.74,53.08,49.1,54.72,51.65,48.88,33579420,41328943
2020-06-09,6.386466,1490967896650,1445327349100,85.9975,83.365,82.875,83.035,82.5625,80.8375,147712364,...,226293866207.04,231452283934.48,53.52,54.74,53.08,52.88,54.72,51.65,27276943,33579420


In [12]:
expanded_data.to_csv('expanded_data.csv')

In [13]:
pred_horizon = 2 # we predict price close in 2 days

# Construct the target variable
for ric in universe:
    expanded_data[(ric, 'target')] = 100 * (data[(ric, 'Price Close')].shift(-pred_horizon) - data[(ric, 'Price Close')]) / data[(ric, 'Price Close')]
    
expanded_data = expanded_data.dropna()
targets = expanded_data[[(ric, 'target') for ric in universe]]


In [14]:
X = expanded_data.drop(columns=[(ric, 'target') for ric in universe])
y = targets
y.columns = y.columns.droplevel(1)

In [15]:
X.head()

Unnamed: 0_level_0,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,...,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,1 Week Total Return,Company Market Capitalization,Company Market Capitalization_lag_1,Price Close,Price Close_lag_1,Price Close_lag_2,Price Open,Price Open_lag_1,Price Open_lag_2,Volume,...,Company Market Capitalization,Company Market Capitalization_lag_1,Price Close,Price Close_lag_1,Price Close_lag_2,Price Open,Price Open_lag_1,Price Open_lag_2,Volume,Volume_lag_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2020-06-03,2.20364,1409178995200,1401463878900,81.28,80.835,80.4625,81.165,80.18625,79.4375,104491216,...,208197122048.48,200036674332.12,49.24,47.31,46.28,48.0,46.86,45.32,23418047,22318978
2020-06-04,1.278869,1397042857200,1409178995200,80.58,81.28,80.835,81.0975,81.165,80.18625,87560364,...,207605172473.2,208197122048.48,49.1,49.24,47.31,48.88,48.0,46.86,18731361,23418047
2020-06-05,4.264956,1436832052500,1397042857200,82.875,80.58,81.28,80.8375,81.0975,81.165,137250200,...,224433453256.16,207605172473.2,53.08,49.1,49.24,51.65,48.88,48.0,41328943,18731361
2020-06-08,3.60727,1445327349100,1436832052500,83.365,82.875,80.58,82.5625,80.8375,81.0975,95654536,...,231452283934.48,224433453256.16,54.74,53.08,49.1,54.72,51.65,48.88,33579420,41328943
2020-06-09,6.386466,1490967896650,1445327349100,85.9975,83.365,82.875,83.035,82.5625,80.8375,147712364,...,226293866207.04,231452283934.48,53.52,54.74,53.08,52.88,54.72,51.65,27276943,33579420


In [16]:
y.head()

Unnamed: 0_level_0,AAPL.O,AMZN.O,AVGO.O,BAC,BRKb,COST.O,CRM,GOOG.O,GOOGL.O,HD,...,MSFT.O,NFLX.O,NVDA.O,ORCL.K,PG,TSLA.O,UNH,V,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-06-03,1.962352,0.185604,2.439182,8.156983,5.156692,1.761023,-0.634322,0.139935,0.0535,1.553785,...,0.992663,-0.561651,1.716175,0.916059,-0.168734,0.30579,2.128705,1.391781,-1.546934,7.798538
2020-06-04,3.456193,2.579046,2.793875,6.572069,4.951835,-1.145615,2.980635,2.438074,2.385632,3.141193,...,2.973978,1.245384,0.439172,4.257332,2.585093,9.896111,3.824477,3.077876,-0.712472,11.486762
2020-06-05,3.767722,4.746677,-0.555065,0.39132,-2.287451,-2.079862,0.391074,1.235409,0.837488,0.729698,...,1.388889,3.443756,1.412556,0.370508,0.008451,6.211187,-0.846561,-0.265518,-0.172754,0.828937
2020-06-08,5.811792,4.888553,-0.781053,-6.797477,-4.986108,0.045574,3.143585,1.330006,1.150521,-0.903532,...,4.502017,3.573387,6.379898,-1.796733,0.151197,7.909087,-1.205248,0.440882,-0.065985,-7.471684
2020-06-09,-2.351813,-1.649454,-6.84067,-15.201984,-9.205896,-1.544755,-1.437901,-3.593012,-3.455732,-6.733915,...,-1.859852,-1.955996,-2.760889,-5.297158,-1.757647,3.419903,-8.240354,-5.123568,-1.038319,-13.714499


### Train, Test, Validation Split

Now we have our `X` and we need to predict `y` values. We will split the data into training, testing and validation sets.

In [17]:
from sklearn.model_selection import train_test_split
N = X.shape[0]
test_percent = 0.1
val_percent = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_percent, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_percent/(1-test_percent), random_state=42)

print("Train size: ", X_train.shape[0])
print("Validation size: ", X_val.shape[0])
print("Test size: ", X_test.shape[0])

Train size:  761
Validation size:  218
Test size:  109


## Model

We first scale the features and then reduce the dimensionality of the data using PCA.

In [18]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

pipe = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.90))])
pipe.fit(X_train)

X_train_scaled = pipe.transform(X_train)

In [19]:
X_train_scaled[:5]

array([[ 1.60367590e+01,  1.85773656e+00, -3.34411409e+00,
        -7.33876572e-01, -1.44245067e+00, -1.26031495e+00,
         8.91065649e-01,  3.51396165e+00,  2.56860158e-01,
         2.67728625e+00,  1.97927394e+00,  1.13321191e+00,
         9.12942819e-01, -2.88745258e-01, -1.49151136e+00,
        -1.57952833e+00,  1.53649605e+00, -1.35218801e+00,
         2.00881644e-01, -1.24761310e+00,  6.98063635e-01,
        -8.11397668e-02, -3.91787027e-02,  1.68695349e-01,
         1.78579373e-01],
       [ 4.39388467e+00,  6.16727701e+00,  1.02411092e+01,
         3.41103464e+00, -1.09635282e+00,  2.76475449e+00,
         7.83530306e-01,  1.92759477e+00, -2.48147823e-01,
        -3.00381119e+00,  7.35387794e-01, -5.31576824e-02,
        -6.44571206e-01, -2.77177542e-01, -3.10833060e-01,
         7.47068812e-01,  8.51223740e-01, -1.29686802e-01,
         7.67042072e-01, -3.02445535e-01,  2.82187591e-01,
         3.48617731e-01,  2.33052986e+00, -2.04360080e-01,
         1.00042315e+00],
    

In [20]:
import torch
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [21]:
import torch.nn as nn
import torch.optim as optim

class StockPredictionNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(StockPredictionNN, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        # self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        # x = torch.relu(self.fc3(x))
        x = self.fc4(x)
        return x
    
# Example parameters
input_size = X_train_scaled.shape[1]  # Number of features
hidden_size = 20  # Number of neurons in hidden layers
output_size = y_train.shape[1]  # Number of target variables

model = StockPredictionNN(input_size, hidden_size, output_size)
criterion = nn.MSELoss()  # Mean Squared Error Loss for regression
optimizer = optim.Adam(model.parameters(), lr=0.001)

model = model.to(device)

In [22]:
# Example training loop
num_epochs = 100
batch_size = 32

# Convert X_train and y_train to DataLoader for batching
X_train_tensor = torch.from_numpy(X_train_scaled).float()
y_train_tensor = torch.from_numpy(y_train.to_numpy(float)).float()
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [10/100], Loss: 9.9496
Epoch [20/100], Loss: 9.2247
Epoch [30/100], Loss: 12.5262
Epoch [40/100], Loss: 6.5254
Epoch [50/100], Loss: 8.5229
Epoch [60/100], Loss: 5.5359
Epoch [70/100], Loss: 7.1145
Epoch [80/100], Loss: 6.0203
Epoch [90/100], Loss: 6.7038
Epoch [100/100], Loss: 6.2453


In [23]:
torch.save(model.state_dict(), 'stock_prediction_model.pth')

In [24]:
# Score the model on training data
X_val_scaled = pipe.transform(X_val)
X_val_tensor = torch.from_numpy(X_val_scaled).float()
y_val_tensor = torch.from_numpy(y_val.to_numpy(float)).float()
y_val_tensor = y_val_tensor.to(device)
X_val_tensor = X_val_tensor.to(device)

model.eval()
with torch.no_grad():
    y_pred = model(X_val_tensor)
    val_loss = criterion(y_pred, y_val_tensor)
    print(f'Validation Loss: {val_loss.item():.4f}')
    


Validation Loss: 7.7541


In [25]:
# Testing
X_test_scaled = pipe.transform(X_test)
X_test_tensor = torch.from_numpy(X_test_scaled).float()
y_test_tensor = torch.from_numpy(y_test.to_numpy(float)).float()
y_test_tensor = y_test_tensor.to(device)
X_test_tensor = X_test_tensor.to(device)

model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor)
    test_loss = criterion(y_pred, y_test_tensor)
    print(f'Test Loss: {test_loss.item():.4f}')
    

Test Loss: 9.5886


##  Make Predictions with current data
We need to get data up to the largest lag we have used.

In [26]:
today = pd.to_datetime('2024-10-01')
max_lag = max([fields[field_name].lag for field_name in field_names])
print("Max lag: ", max_lag)
start_date = today - pd.DateOffset(days=max_lag+5) # Add 5 days to be safe across non-trading days
start_date = start_date.isoformat() + 'Z'
end_date = today.isoformat() + 'Z'



Max lag:  2


In [27]:
cur_data = ld.get_history(universe, list(fields.keys()), start=start_date, end=end_date, interval='1d')
cur_data.head()



Unnamed: 0_level_0,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AMZN.O,AMZN.O,AMZN.O,AMZN.O,AMZN.O,...,WMT,WMT,WMT,WMT,WMT,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,...,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open,1 Week Total Return,Volume,Company Market Capitalization,Price Close,Price Open
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-09-24,4.880299,43556068,3456964629690,227.37,228.645,3.788527,43478926,2035720152238.76,193.96,194.27,...,2.633588,14651133,648445722206.58,80.67,80.48,2.513575,11984863,520032851189.0,117.05,117.84
2024-09-25,2.573746,42308715,3441760492690,226.37,224.93,3.272006,26391144,2020711491598.9297,192.53,193.75,...,2.998861,14163785,654313645563.6,81.4,80.89,0.165823,13816041,509903206586.6,114.77,116.485
2024-09-26,-0.589855,36636707,3459245250240,227.52,227.3,0.679412,36334854,2006332564971.96,191.16,194.31,...,2.409021,17061133,642417033826.08,79.92,81.04,-2.758621,16887908,501150838224.0,112.8,111.14
2024-09-27,-0.179667,34025967,3463350367230,227.79,228.46,-1.894572,36002316,1972851706621.57,187.97,190.68,...,0.910701,11899050,641291678661.72,79.78,79.9,0.477141,15963973,514568174495.6,115.82,113.76
2024-09-30,2.883384,54793391,3522211138000,233.0,230.04,-3.894161,41680400,1958514630000.0,186.33,187.14,...,0.522843,19008187,649088782300.5,80.75,79.89,-0.119291,13250657,515192981509.92,117.22,115.46


In [28]:
expanded_cur_data = expand_data(cur_data)
expanded_cur_data = expanded_cur_data.dropna()
expanded_cur_data.head()

Unnamed: 0_level_0,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,AAPL.O,...,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM,XOM
Unnamed: 0_level_1,1 Week Total Return,Company Market Capitalization,Company Market Capitalization_lag_1,Price Close,Price Close_lag_1,Price Close_lag_2,Price Open,Price Open_lag_1,Price Open_lag_2,Volume,...,Company Market Capitalization,Company Market Capitalization_lag_1,Price Close,Price Close_lag_1,Price Close_lag_2,Price Open,Price Open_lag_1,Price Open_lag_2,Volume,Volume_lag_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2024-09-26,-0.589855,3459245250240,3441760492690,227.52,226.37,227.37,227.3,224.93,228.645,36636707,...,501150838224.0,509903206586.6,112.8,114.77,117.05,111.14,116.485,117.84,16887908,13816041
2024-09-27,-0.179667,3463350367230,3459245250240,227.79,227.52,226.37,228.46,227.3,224.93,34025967,...,514568174495.6,501150838224.0,115.82,112.8,114.77,113.76,111.14,116.485,15963973,16887908
2024-09-30,2.883384,3522211138000,3463350367230,233.0,227.79,227.52,230.04,228.46,227.3,54793391,...,515192981509.92,514568174495.6,117.22,115.82,112.8,115.46,113.76,111.14,13250657,15963973
2024-10-01,-0.510182,3419568161060,3522211138000,226.21,233.0,227.79,229.52,230.04,228.46,63285048,...,527103687702.48,515192981509.92,119.93,117.22,115.82,116.04,115.46,113.76,23235878,13250657


In [29]:
# san = X.iloc[120]
# san_scaled = pipe.transform(san.values.reshape(1, -1))
# san_tensor = torch.from_numpy(san_scaled).float()

# print(san_tensor)

In [30]:
# predict based on the last row of the data
X_cur = expanded_cur_data.iloc[-1]
X_cur_scaled = pipe.transform(X_cur.values.reshape(1, -1))
X_cur_tensor = torch.from_numpy(X_cur_scaled).float()
X_cur_tensor = X_cur_tensor.to(device)
print(X_cur_tensor)

model.eval()
with torch.no_grad():
    y_pred = model(X_cur_tensor)

y_pred = y_pred.cpu().numpy()
y_pred

tensor([[ 2.8925e+01,  1.2826e+00, -1.1539e+00, -1.6228e+00, -6.4637e-01,
         -1.1671e+00,  3.2943e+00, -2.9847e+00, -9.6739e-01,  3.4345e-01,
         -1.9192e+00, -2.9478e+00, -3.2437e-02, -7.2055e-01,  1.0028e+00,
         -1.7663e-02, -1.1174e+00, -9.0050e-01,  3.5217e-01, -7.4569e-01,
          7.4870e-01,  1.6317e+00,  4.1689e-01,  4.1589e-01,  1.3559e+00]],
       device='cuda:0')


array([[-1.3168358 , -2.1765437 , -2.0461614 , -0.90592515, -0.27961498,
        -1.165856  , -2.2483003 , -2.1325648 , -2.0943725 , -0.9483896 ,
         0.08496894, -0.9103181 , -0.9190382 , -0.5769508 , -1.550889  ,
        -1.7364078 , -2.651705  , -3.015308  , -1.0000991 , -0.22222903,
        -2.1170733 ,  0.03251641, -0.6044704 , -0.14938112, -0.2757834 ]],
      dtype=float32)