In [1]:
import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported below, so API drift in those libraries will go unnoticed.
warnings.filterwarnings('ignore')

In [33]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, LSTM, Input, concatenate, Embedding, Reshape, BatchNormalization
import tensorflow.keras.backend as K

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Shorthand for slicing the (ticker, date) MultiIndex with .loc.
idx = pd.IndexSlice
sns.set_style('whitegrid')

# Seed NumPy *and* TensorFlow: weight initialisation and LSTM dropout draw
# from TensorFlow's generator, which np.random.seed alone does not control.
np.random.seed(42)
tf.random.set_seed(42)

In [4]:
# Output directory for the artefacts produced by this notebook.
results_path = Path('results', 'lstm_embeddings')
# exist_ok=True keeps the cell idempotent on re-run and avoids the
# check-then-create race of a separate exists() test.
results_path.mkdir(parents=True, exist_ok=True)

In [5]:
# Read the table stored under the 'returns_weekly' key of the local HDF5 store.
data = pd.read_hdf('data.h5', key='returns_weekly')

In [6]:
# Show the loaded frame: forward return, lag columns 1..52 and the binary label,
# indexed by (ticker, date). pandas truncates the rendering of large frames.
data

Unnamed: 0_level_0,Unnamed: 1_level_0,fwd_returns,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,label
ticker,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
A,2009-01-11,0.140394,0.051133,-0.050983,-0.063830,0.007532,-0.083378,0.089069,-0.163861,-0.049908,-0.015773,...,-0.017641,-0.025470,0.024796,-0.042187,-0.071387,0.038265,-0.013963,-0.065000,0.035375,1
A,2009-01-18,0.053456,0.140394,0.051133,-0.050983,-0.063830,0.007532,-0.083378,0.089069,-0.166667,-0.049908,...,-0.014300,-0.017641,-0.025470,0.024796,-0.042187,-0.071387,0.038265,-0.013963,-0.065000,1
A,2009-01-25,-0.044593,0.053456,0.140394,0.051133,-0.050983,-0.063830,0.007532,-0.083378,0.089069,-0.166747,...,0.000675,-0.014300,-0.017641,-0.025470,0.024796,-0.042187,-0.071387,0.038265,-0.013963,0
A,2009-02-01,-0.030043,-0.044593,0.053456,0.140394,0.051133,-0.050983,-0.063830,0.007532,-0.083378,0.089069,...,-0.002697,0.000675,-0.014300,-0.017641,-0.025470,0.024796,-0.042187,-0.071387,0.038265,0
A,2009-02-08,0.067478,-0.030043,-0.044593,0.053456,0.140394,0.051133,-0.050983,-0.063830,0.007532,-0.083378,...,0.072008,-0.002697,0.000675,-0.014300,-0.017641,-0.025470,0.024796,-0.042187,-0.071387,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZUMZ,2017-12-03,0.047244,0.007937,0.000000,0.058824,-0.005571,0.016997,0.029155,-0.065395,0.013812,0.100304,...,-0.045570,-0.005038,0.000000,-0.091533,0.023419,-0.112266,-0.060547,0.036437,-0.013972,1
ZUMZ,2017-12-10,0.062657,0.047244,0.007937,0.000000,0.058824,-0.005571,0.016997,0.029155,-0.065395,0.013812,...,0.122016,-0.045570,-0.005038,0.000000,-0.091533,0.023419,-0.112266,-0.060547,0.036437,1
ZUMZ,2017-12-17,-0.051887,0.062657,0.047244,0.007937,0.000000,0.058824,-0.005571,0.016997,0.029155,-0.065395,...,-0.009456,0.122016,-0.045570,-0.005038,0.000000,-0.091533,0.023419,-0.112266,-0.060547,0
ZUMZ,2017-12-24,0.067164,-0.051887,0.062657,0.047244,0.007937,0.000000,0.058824,-0.005571,0.016997,0.029155,...,-0.033413,-0.009456,0.122016,-0.045570,-0.005038,0.000000,-0.091533,0.023419,-0.112266,1


In [14]:
# Map every ticker symbol in the index to an integer code so it can be fed
# to an embedding layer; factorize numbers the symbols in order of appearance.
ticker_level = data.index.get_level_values('ticker')
data['ticker'] = pd.factorize(ticker_level)[0]

In [18]:
# Calendar month of each observation, taken from the 'date' index level.
date_level = data.index.get_level_values('date')
data['month'] = date_level.month

In [20]:
# One-hot encode the calendar month into `month_<n>` indicator columns.
# The dtype is pinned explicitly because the default dummy dtype depends on
# the pandas version (uint8 before 2.0, bool from 2.0 on); a fixed numeric
# dtype keeps the model input identical across versions.
data = pd.get_dummies(data,
                      columns=['month'],
                      prefix='month',
                      dtype=np.uint8)

In [21]:
# Summarise the column dtypes and non-null counts after adding the ticker code
# and the month indicator columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1167341 entries, ('A', Timestamp('2009-01-11 00:00:00')) to ('ZUMZ', Timestamp('2017-12-31 00:00:00'))
Data columns (total 67 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   fwd_returns  1167341 non-null  float64
 1   1            1167341 non-null  float64
 2   2            1167341 non-null  float64
 3   3            1167341 non-null  float64
 4   4            1167341 non-null  float64
 5   5            1167341 non-null  float64
 6   6            1167341 non-null  float64
 7   7            1167341 non-null  float64
 8   8            1167341 non-null  float64
 9   9            1167341 non-null  float64
 10  10           1167341 non-null  float64
 11  11           1167341 non-null  float64
 12  12           1167341 non-null  float64
 13  13           1167341 non-null  float64
 14  14           1167341 non-null  float64
 15  15           1167341 non-null  float64
 16  16           1167

In [23]:
# Number of lagged weekly returns that make up one input sequence.
window_size = 52

# Labels of the lag columns: the integers 1 .. window_size.
sequence = [*range(1, window_size + 1)]

# NOTE(review): `ticker` and `months` are not read anywhere in the cells shown
# here, and `months` is rebound to a Keras Input further down — confirm
# whether they are still needed.
ticker = 1
months = 12

# Count of distinct integer ticker codes (used to size the embedding).
n_tickers = data['ticker'].nunique()

In [24]:
# Remove the raw forward return once (the model trains on `label`), then split
# chronologically: everything through 2016 for training, 2017 held out.
model_data = data.drop(columns='fwd_returns')
train_data = model_data.loc[idx[:, :'2016'], :]
test_data = model_data.loc[idx[:, '2017'], :]

In [28]:
# Training inputs, listed in the same order as the model's inputs:
#   1. lagged returns reshaped to (samples, window_size, 1)
#   2. integer ticker codes
#   3. month indicator columns (every column whose name contains 'month')
train_returns = train_data.loc[:, sequence].to_numpy().reshape(-1, window_size, 1)
X_train = [train_returns,
           train_data['ticker'],
           train_data.filter(like='month')]
y_train = train_data['label']

# Sanity check: all inputs and the target must share the same sample count.
[x.shape for x in X_train], y_train.shape

([(1035424, 52, 1), (1035424,), (1035424, 12)], (1035424,))

In [29]:
# Hold-out inputs, built exactly like the training inputs:
#   1. lagged returns reshaped to (samples, window_size, 1)
#   2. integer ticker codes
#   3. month indicator columns (every column whose name contains 'month')
test_returns = test_data.loc[:, sequence].to_numpy().reshape(-1, window_size, 1)
X_test = [test_returns,
          test_data['ticker'],
          test_data.filter(like='month')]
y_test = test_data['label']

# Sanity check: all inputs and the target must share the same sample count.
[x.shape for x in X_test], y_test.shape

([(131917, 52, 1), (131917,), (131917, 12)], (131917,))

In [30]:
# Drop any Keras state left over from earlier model builds in this kernel
# before constructing the network below.
tf.keras.backend.clear_session()

In [31]:
# Size of the last axis of the return-sequence input: one value per time step.
n_features = 1

In [34]:
# Symbolic inputs of the network: the return sequence, a single integer
# ticker code per sample, and the 12 month indicator columns.
returns = Input(name='Returns', shape=(window_size, n_features))
tickers = Input(name='Tickers', shape=(1,))
months = Input(name='Months', shape=(12,))

In [35]:
# Hidden-state sizes of the two stacked LSTM layers.
lstm1_units = 25  # first layer
lstm2_units = 10  # second layer

In [36]:
# First LSTM layer: return_sequences=True emits the full sequence so that it
# can feed the second recurrent layer.
lstm1 = LSTM(lstm1_units,
             name='LSTM1',
             input_shape=(window_size, n_features),
             dropout=.2,
             return_sequences=True)(returns)

# Second LSTM layer: takes the sequence produced by LSTM1.
lstm_model = LSTM(lstm2_units,
                  name='LSTM2',
                  dropout=.2)(lstm1)

In [38]:
# Learn a 5-dimensional vector for each ticker code.
embedded_ticker = Embedding(input_dim=n_tickers,
                            output_dim=5,
                            input_length=1)(tickers)

# Reshape the embedding output to a flat 5-vector per sample so it can be
# concatenated with the other feature vectors.
ticker_embedding = Reshape(target_shape=(5,))(embedded_ticker)

In [39]:
# Join the three feature branches into one vector per sample.
branches = [lstm_model, ticker_embedding, months]
merged = concatenate(branches, name='Merged')

# Normalise the merged features, then pass them through a 10-unit dense layer.
# NOTE(review): FC1 is given no activation argument — confirm that a linear
# hidden layer is intended.
bn = BatchNormalization()(merged)
hidden_dense = Dense(10, name='FC1')(bn)

# Single sigmoid unit: probability of the positive class.
output = Dense(1, activation='sigmoid', name='Output')(hidden_dense)

# Input order must match the order of the arrays in X_train / X_test.
rnn = Model(inputs=[returns, tickers, months],
            outputs=output)

In [40]:
# RMSprop with an explicit step size.
# `learning_rate` replaces the deprecated `lr` alias, and the no-op
# `decay=0.0` is dropped because current Keras optimizers reject it;
# the resulting configuration is unchanged.
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001,
                                        rho=0.9,
                                        epsilon=1e-08)

In [41]:
# Binary classification set-up: cross-entropy loss, tracking accuracy and
# ROC AUC. The metric name 'AUC' is what the callbacks monitor as 'val_AUC'.
tracked_metrics = ['accuracy', tf.keras.metrics.AUC(name='AUC')]
rnn.compile(optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=tracked_metrics)

In [42]:
# File that receives the model whenever the monitored metric improves.
lstm_path = (results_path / 'lstm.classification.h5').as_posix()

# Keep only the best model, judged by the highest validation AUC.
checkpointer = ModelCheckpoint(filepath=lstm_path,
                               monitor='val_AUC',
                               mode='max',
                               save_best_only=True,
                               verbose=1)

In [43]:

# Stop once validation AUC has failed to improve for 5 consecutive epochs and
# roll the model back to the weights of its best epoch.
early_stopping = EarlyStopping(monitor='val_AUC',
                               mode='max',
                               patience=5,
                               restore_best_weights=True)

In [None]:
# Train for up to 50 epochs; early stopping and checkpointing both watch the
# validation AUC.
# NOTE(review): the 2017 hold-out (X_test, y_test) doubles as validation data
# here, so it drives model selection and is not an untouched test set.
training = rnn.fit(x=X_train,
                   y=y_train,
                   batch_size=32,
                   epochs=50,
                   verbose=1,
                   callbacks=[early_stopping, checkpointer],
                   validation_data=(X_test, y_test))

Epoch 1/50

Epoch 00001: val_AUC improved from -inf to 0.62479, saving model to results/lstm_embeddings\lstm.classification.h5
Epoch 2/50

Epoch 00002: val_AUC did not improve from 0.62479
Epoch 3/50

Epoch 00003: val_AUC improved from 0.62479 to 0.63131, saving model to results/lstm_embeddings\lstm.classification.h5
Epoch 4/50

Epoch 00004: val_AUC improved from 0.63131 to 0.67427, saving model to results/lstm_embeddings\lstm.classification.h5
Epoch 5/50

Epoch 00005: val_AUC improved from 0.67427 to 0.68211, saving model to results/lstm_embeddings\lstm.classification.h5
Epoch 6/50

Epoch 00006: val_AUC did not improve from 0.68211
Epoch 7/50

Epoch 00007: val_AUC improved from 0.68211 to 0.68284, saving model to results/lstm_embeddings\lstm.classification.h5
Epoch 8/50

Epoch 00008: val_AUC improved from 0.68284 to 0.68587, saving model to results/lstm_embeddings\lstm.classification.h5
Epoch 9/50

Epoch 00009: val_AUC improved from 0.68587 to 0.68599, saving model to results/lstm_emb