# Neural Network

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [16]:
from pathlib import Path
import cvxpy as cp
import numpy as np
import pandas as pd
from statsmodels.regression.rolling import RollingOLS
from sklearn.metrics import r2_score
import quantstats as qs
import matplotlib.pyplot as plt
from tqdm import tqdm
from tensorflow.keras.layers import Input, Dense, Dot, BatchNormalization
from tensorflow.keras.models import Model
from qs import calc_factor, calc_return
from scipy.stats import spearmanr

%config InlineBackend.figure_format = "retina"

In [13]:
class MultipleTimeSeriesCV:
    """Generates tuples of train_idx, test_idx pairs
    Assumes the MultiIndex contains levels 'symbol' and 'date'
    purges overlapping outcomes"""

    def __init__(
        self,
        n_splits=3,
        train_period_length=126,
        test_period_length=21,
        lookahead=0,
        date_idx="date",
        shuffle=False,
    ):
        self.n_splits = n_splits
        self.lookahead = lookahead
        self.test_length = test_period_length
        self.train_length = train_period_length
        self.shuffle = shuffle
        self.date_idx = date_idx

    def split(self, X):
        unique_dates = X.index.get_level_values(self.date_idx).unique()
        days = sorted(unique_dates, reverse=True)
        split_idx = []
        for i in range(self.n_splits):
            test_end_idx = i * self.test_length
            test_start_idx = test_end_idx + self.test_length
            train_end_idx = test_start_idx + self.lookahead - 1
            train_start_idx = train_end_idx + self.train_length + self.lookahead - 1
            split_idx.append(
                [train_start_idx, train_end_idx, test_start_idx, test_end_idx]
            )

        dates = X.reset_index()[[self.date_idx]]
        for train_start, train_end, test_start, test_end in split_idx:
            train_idx = dates[
                (dates[self.date_idx] > days[train_start])
                & (dates[self.date_idx] <= days[train_end])
            ].index
            test_idx = dates[
                (dates[self.date_idx] > days[test_start])
                & (dates[self.date_idx] <= days[test_end])
            ].index
            if self.shuffle:
                np.random.shuffle(list(train_idx))
            yield train_idx.to_numpy(), test_idx.to_numpy()

    def get_n_splits(self):
        return self.n_splits


In [5]:
crsp = pd.read_parquet("./data/crsp.parquet")
glb = pd.read_parquet("./data/glb.parquet")
mfis = pd.read_parquet("./data/mfis.parquet")
famafrench = pd.read_parquet("./data/famafrench.parquet") 
optionmetrics = pd.read_parquet("./data/optionmetrics.parquet")

In [6]:
factor = calc_factor(crsp, famafrench, mfis, glb, "W-FRI")
ret = calc_return(crsp, famafrench, "W-FRI")
X = factor["2000-01-01":"2019-11-30"].drop(columns=["logcap", "dolvol", "retvol", "mktrf_sq", "smb", "hml", "mfis_182"])
y = ret.groupby("permno").shift(-1).loc["2000-01-01":"2019-11-30"]

In [9]:
X["ret"] = ret

In [10]:
def make_model(hidden_units, n_factors):
    input_factor = Input((n_tickers,), name="input_factor")
    input_beta = Input((n_tickers, n_characteristics), name="input_beta")
    hidden_layer = Dense(units=hidden_units, activation="relu", name="hidden_layer")(
        input_beta
    )
    batch_norm = BatchNormalization(name="batch_norm")(hidden_layer)
    output_beta = Dense(units=n_factors, name="output_beta")(batch_norm)
    output_factor = Dense(units=n_factors, name="output_factor")(input_factor)
    output = Dot(axes=(2, 1), name="output_layer")([output_beta, output_factor])
    model = Model(inputs=[input_beta, input_factor], outputs=output)
    model.compile(loss="mse", optimizer="adam")
    return model

In [11]:
def get_train_valid_data(X, y, train_idx, val_idx):
    X_train = X.iloc[train_idx]
    X_val = X.iloc[val_idx]
    X1_train = X_train.drop(columns="ret").to_numpy().reshape(-1, n_tickers, n_characteristics)
    X1_val = X_val.drop(columns="ret").to_numpy().reshape(-1, n_tickers, n_characteristics)
    X2_train = X_train.loc[:, "ret"].unstack("permno")
    X2_val = X_val.loc[:, "ret"].unstack("permno")
    y_train = y.iloc[train_idx].unstack("permno")
    y_val = y.iloc[val_idx].unstack("permno")
    return X1_train, X2_train, y_train, X1_val, X2_val, y_val

In [22]:
n_characteristics = 19
n_tickers = 50
n_factors = 8
units = 32
batch_size = 32
epoch = 100

In [23]:
cv = MultipleTimeSeriesCV(n_splits=7, 
                          train_period_length=6*52,
                          test_period_length=1*52)

In [24]:
predictions = []
for fold, (train_idx, val_idx) in enumerate(cv.split(X)):
    X1_train, X2_train, y_train, X1_val, X2_val, y_val = get_train_valid_data(X, y, train_idx, val_idx)
    model = make_model(n_factors=n_factors, hidden_units=units)
    model.fit([X1_train, X2_train],
        y_train,
        batch_size=batch_size,
        epochs=epoch,
        verbose=0,
        shuffle=True)
    predictions.append(pd.Series(model.predict([X1_val, X2_val]).reshape(-1), index=y_val.stack().index))

2022-03-30 13:14:28.844509: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 13:14:37.040338: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 13:14:37.398803: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 13:14:45.460285: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 13:14:45.822195: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 13:14:54.016040: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-03-30 13:14:54.375503: I tensorflow/core/grappler/optimizers/cust

## Backtest

In [25]:
y_pred = pd.concat(predictions).sort_index()
y_val = y[y_pred.index]
print(r2_score(y_val, y_pred), spearmanr(y_val, y_pred).correlation)

-1.4298204980272455 -0.015429885174217624
