To the 30 monthly industry returns we add their 3-, 6- and 12-month moving averages, giving 120 predictors.

These predictors are passed to a feedforward neural network that outputs a next-month return forecast for each industry.

Walk-forward cross-validation (xval) is used to tune network depth, layer size, regularization and dropout.

After identifying the most promising network structure, we backtest by retraining the network each month on historical data up to that month, predicting the following month, and going long the top 6 industries and short the bottom 6 industries.


In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import time 
import copy
import random
from itertools import product

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #Hide messy TensorFlow warnings
warnings.filterwarnings("ignore") #Hide messy Numpy warnings

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import explained_variance_score, r2_score
from sklearn.linear_model import LinearRegression, Lasso, lasso_path, lars_path, LassoLarsIC
from sklearn.ensemble.forest import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

import keras
from keras.layers.core import Dense, Activation, Dropout
from keras import Input
from keras.regularizers import l1, L1L2
from keras.models import Model, Sequential
from keras.models import load_model

import ffn
%matplotlib inline

import plotly as py
# print (py.__version__) # requires version >= 1.9.0
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.graph_objs import *
import plotly.figure_factory as ff

# Enable plotly's offline notebook mode so iplot() figures render in the notebook.
init_notebook_mode(connected=True)

# Seed Python's and NumPy's global random number generators.
# NOTE(review): nothing in this cell seeds the Keras/TensorFlow backend, so
# network training may still differ from run to run -- confirm.
random.seed(1764)
np.random.seed(1764)


Using TensorFlow backend.


In [2]:
print("Loading data...")
data = pd.read_csv("30_Industry_Portfolios.csv")
data = data.set_index('yyyymm')
industries = list(data.columns)
# map industry names to col nums
ind_reverse_dict = dict((name, i) for i, name in enumerate(industries))

rfdata = pd.read_csv("F-F_Research_Data_Factors.csv")
rfdata = rfdata.set_index('yyyymm')
# Align the risk-free rate on the returns index. It is kept out of `data` so it
# can never be counted as a predictor column.
rf = rfdata['RF'].reindex(data.index)

# subtract risk-free rate
for ind in industries:
    data[ind] = data[ind] - rf

# 3, 6 and 12-month moving averages of each industry's excess return.
# Series.rolling(...).mean() replaces pd.rolling_mean, which newer pandas
# releases no longer provide; both leave NaN until a full window is available.
for window in (3, 6, 12):
    for ind in industries:
        data["%s.%dm" % (ind, window)] = data[ind].rolling(window).mean()

# create a response variable led by 1 period to predict
for ind in industries:
    data[ind+".lead"] = data[ind].shift(-1)

# Predictor columns: the returns plus their three moving averages. Because 'rf'
# is never inserted into `data`, these positions match the columns of X.
allcols = list(data.columns[:4 * len(industries)])
all_reverse_dict = dict((name, i) for i, name in enumerate(allcols))

# drop rows with an incomplete moving-average window or no next-month response
data = data.dropna(axis=0, how='any')

data


Loading data...


Unnamed: 0_level_0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
192708,2.05,-4.26,3.55,5.92,5.18,0.24,1.28,0.00,1.82,0.19,...,6.32,1.94,3.96,3.16,4.10,5.39,7.40,5.73,5.96,0.22
192709,5.83,6.80,4.42,3.97,9.83,2.32,4.44,5.44,5.76,1.68,...,-2.34,4.72,-4.91,-0.11,-4.84,-22.03,-5.28,-2.00,3.85,-3.87
192710,-2.71,-1.05,-0.57,0.01,2.38,-2.09,9.40,4.88,-7.71,-2.86,...,2.62,2.09,9.17,16.66,3.70,-1.51,11.90,2.80,7.79,10.98
192711,6.96,10.08,6.48,3.37,16.41,2.52,1.85,3.47,8.83,5.55,...,1.11,-8.05,1.08,0.82,-0.12,11.23,-0.88,-1.78,10.86,0.85
192712,3.31,12.50,0.81,2.59,3.05,10.09,-0.37,-0.68,-0.45,2.45,...,0.05,0.31,0.93,1.26,-1.48,-1.20,-1.10,-1.83,1.20,-3.63
192801,2.29,0.37,-2.81,-0.62,4.91,3.42,6.82,2.44,-1.33,-1.35,...,-0.63,-0.85,1.69,0.53,-1.68,-8.62,-1.37,-2.92,-3.12,-3.01
192802,-3.29,-5.55,-6.30,-1.22,-2.08,-0.34,-2.78,-1.71,0.40,-5.64,...,1.67,-2.73,2.94,10.79,5.49,7.83,8.40,3.18,9.28,8.56
192803,4.82,14.18,2.05,8.53,4.97,9.70,3.70,10.25,12.87,8.44,...,3.00,2.20,1.38,-0.47,4.15,-6.07,-0.25,0.25,8.65,10.73
192804,2.47,4.65,-6.17,5.18,21.01,3.37,9.74,2.13,-0.48,-4.63,...,6.29,0.46,3.56,5.81,-0.78,7.63,3.12,13.80,1.96,-1.12
192805,1.28,5.16,-0.35,1.53,17.36,-2.84,-1.31,8.95,2.66,-6.68,...,-5.53,-5.56,-3.37,-6.05,-4.15,-7.72,-3.71,-3.12,-10.22,-12.26


In [3]:
# keep only the rows whose yyyymm index is after 195911 (i.e. 1959-12 onward)
data = data.loc[data.index > 195911]
data


Unnamed: 0_level_0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
yyyymm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195912,2.01,0.35,-3.02,1.64,7.29,0.67,1.87,-1.97,3.08,0.74,...,0.62,-6.18,-7.93,-9.41,-4.31,-5.33,-6.09,-10.08,-4.68,-3.98
196001,-4.49,-5.71,-2.05,1.21,-5.47,-7.84,-8.53,-6.68,-10.03,-4.77,...,8.07,9.13,5.09,3.00,-0.94,1.42,4.00,1.81,-0.98,6.32
196002,3.35,-2.14,2.27,4.23,2.39,9.31,1.44,-0.02,-0.74,0.32,...,-0.21,-0.31,3.34,-2.43,-4.99,-1.37,-0.13,-3.88,0.05,-2.43
196003,-1.67,-2.94,-0.18,-0.65,2.18,-0.56,-2.59,1.26,-2.75,-6.79,...,-1.24,7.14,1.77,0.41,-2.13,0.45,-0.53,8.86,-0.64,0.55
196004,1.17,-2.16,1.35,6.46,-1.17,-1.27,0.21,1.49,-5.53,-1.10,...,3.05,-1.75,11.90,2.85,0.90,1.65,3.11,0.80,-0.45,1.02
196005,8.20,-0.52,2.44,7.28,11.67,7.74,1.74,13.50,3.40,2.10,...,-0.58,-8.07,2.39,3.50,2.17,5.96,3.41,1.03,3.72,6.41
196006,5.39,0.47,4.73,2.24,0.02,6.38,-1.59,-0.40,0.45,4.04,...,-0.03,2.84,-2.02,-4.10,-3.11,-6.16,-2.99,-1.25,0.09,-5.95
196007,-2.11,-0.79,4.60,-4.72,0.23,-0.60,-1.10,-3.99,-6.80,-3.14,...,6.94,5.69,2.71,1.18,1.98,4.51,2.85,2.05,3.47,3.48
196008,4.57,3.24,5.20,7.16,3.63,5.09,3.34,2.29,1.17,-0.84,...,-6.07,-3.53,-7.61,-7.37,-7.07,-8.44,-8.57,-1.90,-5.78,-4.21
196009,-3.88,-5.00,-2.09,-2.33,-6.20,-9.18,-4.23,-8.87,-6.70,-5.25,...,-0.08,4.62,-3.40,-1.85,-1.02,-4.22,0.31,-4.54,-0.40,0.38


In [4]:
# summary statistics (count, mean, std, min, quartiles, max) for every column
desc = data.describe()
desc
# min, max line up with Table 1
# NOTE(review): "Table 1" presumably refers to the paper being replicated -- add a citation.

Unnamed: 0,Food,Beer,Smoke,Games,Books,Hshld,Clths,Hlth,Chems,Txtls,...,Telcm.lead,Servs.lead,BusEq.lead,Paper.lead,Trans.lead,Whlsl.lead,Rtail.lead,Meals.lead,Fin.lead,Other.lead
count,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,...,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0,697.0
mean,0.688666,0.72703,0.985079,0.732095,0.532253,0.564333,0.690387,0.665825,0.552367,0.687145,...,0.515968,0.729928,0.62297,0.534806,0.60109,0.631076,0.698235,0.728766,0.637547,0.396628
std,4.30866,5.058992,6.032324,7.12817,5.780362,4.728,6.355251,4.897557,5.482363,6.970961,...,4.607931,6.486956,6.698787,5.021876,5.707154,5.57104,5.334178,6.065564,5.381389,5.771655
min,-18.15,-20.19,-25.32,-33.4,-26.56,-22.24,-31.5,-21.06,-28.6,-33.11,...,-16.44,-28.67,-32.07,-27.74,-28.5,-29.25,-29.74,-31.89,-22.53,-28.09
25%,-1.63,-2.08,-2.74,-3.39,-2.6,-2.03,-2.8,-2.23,-2.75,-3.17,...,-2.11,-3.05,-3.22,-2.4,-2.78,-2.56,-2.38,-2.84,-2.4,-2.93
50%,0.74,0.75,1.27,0.94,0.51,0.75,0.7,0.76,0.72,0.64,...,0.59,1.01,0.67,0.71,0.9,0.94,0.54,1.08,0.87,0.54
75%,3.07,3.69,4.66,5.26,3.64,3.54,4.31,3.55,3.76,4.48,...,3.36,4.26,4.63,3.46,4.04,3.88,3.98,4.3,4.0,4.2
max,19.89,25.51,32.38,34.52,33.13,18.22,31.79,29.01,21.68,59.03,...,21.22,23.38,24.66,21.0,18.5,17.53,26.49,27.38,20.59,19.96


In [5]:
# X = first 120 columns (the predictors)
# Y = last 30 columns (the ".lead" responses)
# The [:-1] slice leaves the final row of `data` out of both arrays.
X = data.values[:-1,:120]
Y = data.values[:-1,-30:]
nrows = X.shape[0]
X.shape


(696, 120)

In [71]:
# build NN model
# input (120 predictors - 30 industry %chs, 3mo, 6mo, 12mo moving averages)
# parameters: n_hidden layers
# each hidden layer: size, reg_penalty, dropout
# output = 30 industry predictions
# minimize mean absolute error

INPUT_DIM = X.shape[1] # 120
OUTPUT_DIM = len(industries) # 30

def build_model(n_hidden_layers = 2,
                hidden_layer_size = 32,
                reg_penalty = 0.0001,
                dropout = 0.333,
                verbose=True):
    """Build and compile a feedforward network with one linear output per industry.

    Args:
        n_hidden_layers: number of hidden Dense layers.
        hidden_layer_size: units in each hidden layer.
        reg_penalty: L1 penalty applied to each hidden layer's kernel.
        dropout: dropout rate applied after each hidden layer; a falsy value
            (0, 0.0, None) means no Dropout layer is added.
        verbose: if true, print each layer's settings and the model summary.

    Returns:
        A compiled Keras Model taking INPUT_DIM features and producing
        OUTPUT_DIM single-unit outputs, each trained on mean absolute error
        with equal loss weight.
    """
    main_input = Input(shape=(INPUT_DIM,),
                       dtype='float32',
                       name='main_input')
    lastlayer = main_input

    for i in range(n_hidden_layers):
        if verbose:
            print("layer %d size %d, reg_penalty %.8f, dropout %.3f" % (i, hidden_layer_size, reg_penalty, dropout))
        lastlayer = Dense(units = hidden_layer_size,
                          activation = 'relu',
                          kernel_initializer = keras.initializers.glorot_uniform(),
                          kernel_regularizer=keras.regularizers.l1(reg_penalty),
                          name = "Dense%02d" % i)(lastlayer)

        if dropout:
            lastlayer = Dropout(dropout, name = "Dropout%02d" % i)(lastlayer)

    # OUTPUT_DIM separate single-unit heads, all fed by the last hidden layer
    outputs = [Dense(1,
                     activation='linear',
                     name='output%02d' % i)(lastlayer)
               for i in range(OUTPUT_DIM)]

    model = Model(inputs=[main_input], outputs=outputs)
    if verbose:
        # summary() prints the table itself and returns None, so it is not
        # wrapped in print() (that would emit a stray "None" line).
        model.summary()
    model.compile(loss="mae", metrics=['mae'], optimizer="rmsprop", loss_weights=[1.]*OUTPUT_DIM)
    return model


In [57]:
# run an experiment with walk-forward cross-validation

EPOCHS = 100
#VAL_SPLIT = 0.2
LOOKBACK = 128
BATCH_SIZE = 64
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

def run_experiment (n_hidden_layers = 2,
                    hidden_layer_size = 8,
                    reg_penalty = 0.0,
                    dropout = 0.5,
                    epochs = EPOCHS
                   ):
    """Score one hyperparameter combination with walk-forward cross-validation.

    Reads the notebook globals X, Y, OUTPUT_DIM and BATCH_SIZE and calls
    build_model(). The rows of X are cut into 5 consecutive folds. For each of
    the last three cut points the model is trained on every row before the cut
    and scored on the following fold, using the mean of the per-industry mean
    squared errors. The same model object is carried from one fold to the
    next, restarting each time from the weights of its best epoch.

    Args:
        n_hidden_layers, hidden_layer_size, reg_penalty, dropout: passed
            through to build_model().
        epochs: maximum number of training epochs per fold (at least 1).

    Returns:
        (avg_loss, model): the mean over folds of each fold's best
        cross-validation score, and the model holding the weights of the last
        fold's best epoch.

    Raises:
        ValueError: if epochs is less than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1, got %r" % (epochs,))

    start = time.time()

    # generate k-folds; only the end of each test fold is used, as a cut point
    n_splits = 5
    kf = KFold(n_splits=n_splits)
    last_indexes = [test_index[-1] + 1 for _, test_index in kf.split(X)]

    print("%s Generate splits %s" % (time.strftime("%H:%M:%S"), str(last_indexes)))

    avg_bests = []

    print("%s Build model" % (time.strftime("%H:%M:%S")))
    model = build_model(n_hidden_layers = n_hidden_layers,
                        hidden_layer_size = hidden_layer_size,
                        reg_penalty = reg_penalty,
                        dropout = dropout)
    print("Compile time : %s" % str(time.time() - start))
    print("Starting to train : %s" % (time.strftime("%H:%M:%S")))

    # skip kfold 0 so you start with train 2x size of eval set
    for fold in range(1, n_splits-1):

        scores = []
        best_weights = None
        count = 0
        last_train_index = last_indexes[fold]
        last_xval_index = last_indexes[fold+1]

        # set up train, xval
        # train from beginning to last_train_index
        print("Training indexes 0 to %d" % (last_train_index-1))
        X_fit = X[:last_train_index]
        Y_fit = Y[:last_train_index]
        # xval from last_train_index to last_xval_index
        print("Cross-validating indexes %d to %d" % (last_train_index, last_xval_index -1 ))
        X_xval = X[last_train_index:last_xval_index]
        Y_xval = Y[last_train_index:last_xval_index]

        # the multi-output model expects one target vector per output head
        responses = [Y_fit[:, col] for col in range(OUTPUT_DIM)]

        # train one epoch at a time so every epoch can be scored on the xval fold
        for epoch in range(epochs):
            model.fit(
                X_fit,
                responses,
                batch_size=BATCH_SIZE,
                #validation_split=VAL_SPLIT,
                epochs=1,
                verbose=0)

            # evaluate ... run prediction, calc MSE by industry, and average
            # predict() returns one (rows, 1) array per head; rearrange to (rows, heads)
            y_xval_pred = np.array(model.predict(X_xval))
            y_xval_pred = y_xval_pred.reshape(Y_xval.T.shape).T
            xval_score = np.mean([mean_squared_error(Y_xval[:, col], y_xval_pred[:, col])
                                  for col in range(OUTPUT_DIM)])
            scores.append(xval_score)

            bestloss_index = int(np.argmin(scores))
            bestloss_value = scores[bestloss_index]
            if bestloss_index == epoch:
                # Snapshot the weights of a new best epoch. A copy.copy() of
                # the model would share the live weights, so it could not be
                # used to go back to this epoch later.
                best_weights = model.get_weights()

            sys.stdout.write('.')
            count += 1
            if count % 80 == 0:
                print("")
                print("%s Still training" % (time.strftime("%H:%M:%S")))
            sys.stdout.flush()

            # stop if loss rises by 20% from best
            if xval_score / bestloss_value > 1.2:
                print("Stopping early" )
                break

        # choose model with lowest xval loss
        print("")
        print ("%s Best Xval loss epoch %d, value %f" % (time.strftime("%H:%M:%S"), bestloss_index, bestloss_value))
        avg_bests.append(bestloss_value)
        model.set_weights(best_weights)

    print ("Last Xval loss %f" % (bestloss_value))
    avg_loss = np.mean(np.array(avg_bests))
    print ("Avg Xval loss %f" % avg_loss)
    print("--------------------------------------------------------------------------------")
    return (avg_loss, model)


In [54]:
# smoke test: a single experiment using run_experiment's default hyperparameters
run_experiment()


08:37:35 Generate splits [140, 279, 418, 557, 696]
08:37:35 Build model
layer 0 size 8, reg_penalty 0.00000000, dropout 0.500
layer 1 size 8, reg_penalty 0.00000000, dropout 0.500
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 120)          0                                            
__________________________________________________________________________________________________
Dense00 (Dense)                 (None, 8)            968         main_input[0][0]                 
__________________________________________________________________________________________________
dropout_285 (Dropout)           (None, 8)            0           Dense00[0][0]                    
__________________________________________________________________________________________________
Dense01 (Dense)             

................................................................................
08:41:10 Still training
....................
08:41:24 Best Xval loss epoch 36, value 41.809016
Last Xval loss 41.809016
Avg Xval loss 38.492434
--------------------------------------------------------------------------------


(38.492433588167565, <keras.engine.training.Model at 0x7f459d2feb90>)

In [50]:
# run a lot of experiments in big xval loop
# make predictions
# pick best hyperparameters

MODELPREFIX = "FFNN"

n_hiddens = [1, 2, 3]
layer_sizes = [8, 16, 32, 64]
reg_penalties = [0.0, 0.0001, 0.001]
dropouts = [0.0, 0.25, 0.5]

hyperparameter_combos = list(product(n_hiddens, layer_sizes, reg_penalties, dropouts))

print("%s Running %d experiments" % (time.strftime("%H:%M:%S"), len(hyperparameter_combos)))

# maps (n_hidden_layers, layer_size, reg_penalty, dropout) -> average xval loss
experiments = {}

for counter, param_list in enumerate(hyperparameter_combos):
    n_hidden_layers, layer_size, reg_penalty, dropout = param_list
    print("%s Running experiment %d of %d" % (time.strftime("%H:%M:%S"), counter+1, len(hyperparameter_combos)))
    key = (n_hidden_layers, layer_size, reg_penalty, dropout)
    avg_loss, model = run_experiment(n_hidden_layers = n_hidden_layers,
                                     hidden_layer_size = layer_size,
                                     reg_penalty = reg_penalty,
                                     dropout = dropout)
    experiments[key] = avg_loss
    # Name the files after the loss this experiment returned. bestloss_value is
    # local to run_experiment and is not defined in this scope.
    modelname = "%s_%.6f_%d_%d_%.6f_%.3f" % (MODELPREFIX, avg_loss, n_hidden_layers, layer_size, reg_penalty, dropout)
    print("%s Saving %s.h5" % (time.strftime("%H:%M:%S"), modelname))
    model.save("%s.h5" % modelname)
    model.save_weights("%s_weights.h5" % modelname)
    

08:28:48 Running 108 experiments
08:28:48 Running experiment 1 of 108
08:28:48 Generate splits [140, 279, 418, 557, 696]
08:28:48 Build model
layer 0 size 8, reg_penalty 0.00000000, dropout 0.000
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 120)          0                                            
__________________________________________________________________________________________________
Dense00 (Dense)                 (None, 8)            968         main_input[0][0]                 
__________________________________________________________________________________________________
dropout_282 (Dropout)           (None, 8)            0           Dense00[0][0]                    
__________________________________________________________________________________________________
output00 (De

KeyboardInterrupt: 

In [21]:
# list and chart experiments
# one row per experiment: the four hyperparameters followed by the loss
flatlist = [list(combo) + [xval_loss] for combo, xval_loss in experiments.items()]

lossframe = pd.DataFrame(
    flatlist,
    columns=["n_hidden_layers", "layer_size", "reg_penalty", "dropout", "loss"],
)
lossframe.sort_values('loss')

Unnamed: 0,n_hidden_layers,layer_size,reg_penalty,dropout,loss
35,2,8,0.0000,0.50,37.683170
29,3,64,0.0000,0.50,37.955471
36,3,8,0.0000,0.50,38.189749
98,3,16,0.0010,0.50,38.247286
37,3,8,0.0010,0.50,38.260189
10,2,32,0.0000,0.50,38.303395
89,3,8,0.0001,0.50,38.317764
96,2,16,0.0001,0.50,38.338702
14,3,32,0.0010,0.50,38.345544
63,3,16,0.0001,0.50,38.357251


In [22]:
# the lowest loss is one way to choose, but check the pattern per hyperparameter first:
# when a more parsimonious model scores nearly the same, prefer the more parsimonious one
lossframe.groupby('n_hidden_layers')['loss'].mean().to_frame()


Unnamed: 0_level_0,loss
n_hidden_layers,Unnamed: 1_level_1
1,43.584655
2,42.038494
3,41.619281


In [23]:
# mean loss for each hidden layer size
lossframe.groupby('layer_size')['loss'].mean().to_frame()


Unnamed: 0_level_0,loss
layer_size,Unnamed: 1_level_1
8,40.527042
16,42.128885
32,43.025512
64,43.975135


In [24]:
# mean loss for each regularization penalty
lossframe.groupby('reg_penalty')['loss'].mean().to_frame()


Unnamed: 0_level_0,loss
reg_penalty,Unnamed: 1_level_1
0.0,42.326733
0.0001,42.615339
0.001,42.300358


In [25]:
# mean loss for each dropout rate
lossframe.groupby('dropout')['loss'].mean().to_frame()


Unnamed: 0_level_0,loss
dropout,Unnamed: 1_level_1
0.0,46.218145
0.25,41.552281
0.5,39.472004


In [46]:
def plot_matrix(lossframe, x_labels, y_labels, x_suffix="", y_suffix=""):
    """Plot a heat map of mean loss for each pair of values of two hyperparameters.

    The frame is pivoted with `x_labels` as the index and `y_labels` as the
    columns, so the values of `y_labels` run along the chart's horizontal axis
    and the values of `x_labels` along its vertical axis.

    Args:
        lossframe: DataFrame with a 'loss' column plus the two named columns.
        x_labels: name of the column used as the pivot index (vertical axis).
        y_labels: name of the column used as the pivot columns (horizontal axis).
        x_suffix: text appended to every tick label on the horizontal axis.
        y_suffix: text appended to every tick label on the vertical axis.

    Returns:
        Whatever iplot() returns for the figure.
    """
    pivot = lossframe.pivot_table(index=[x_labels], columns=[y_labels], values=['loss'])

    def _tick_labels(values, source_column, suffix):
        # specify labels as strings, to force it to use a discrete axis;
        # the number format follows the dtype of the column the values come from
        dtype = lossframe[source_column].dtype
        if dtype == np.float64 or dtype == np.float32:
            template = "%f %s"
        else:
            template = "%d %s"
        return [template % (value, suffix) for value in values]

    # horizontal ticks come from the pivot columns (y_labels values),
    # vertical ticks from the pivot index (x_labels values)
    xaxis = _tick_labels(pivot.columns.levels[1].values, y_labels, x_suffix)
    yaxis = _tick_labels(pivot.index.values, x_labels, y_suffix)

    print(xaxis, yaxis)
    chart_width=640
    chart_height=480

    tickfont = dict(family='Arial, sans-serif', size=10, color='black')
    layout = Layout(
        title="%s v. %s" % (x_labels, y_labels),
        height=chart_height,
        width=chart_width,
        margin=dict(
            l=150,
            r=30,
            b=120,
            t=100,
        ),
        xaxis=dict(title=y_labels, tickfont=dict(tickfont)),
        yaxis=dict(title=x_labels, tickfont=dict(tickfont)),
    )

    # colorscale is a list of [position, color] pairs: blue at 0 through red at 1
    traces = [Heatmap(z=pivot.values,
                      x=xaxis,
                      y=yaxis,
                      colorscale=[[0, 'rgb(0,0,255)'], [1, 'rgb(255,0,0)']],
                     )
             ]

    fig = Figure(data=traces, layout=layout)
    return iplot(fig, link_text="")

plot_matrix(lossframe, "n_hidden_layers", "layer_size", x_suffix=" units", y_suffix=" layers")



(['8  units', '16  units', '32  units', '64  units'], ['1  layers', '2  layers', '3  layers'])


In [48]:
# heat map of loss: dropout values along the horizontal axis (tagged " d"),
# reg_penalty values along the vertical axis (tagged " r")
plot_matrix(lossframe, "reg_penalty", "dropout", x_suffix=" d", y_suffix=" r")


(['0.000000  d', '0.250000  d', '0.500000  d'], ['0.000000  r', '0.000100  r', '0.001000  r'])


In [58]:
# rerun (or load from file)
# Key the result by the hyperparameters actually being run. Reusing whatever
# `key` the sweep loop left behind would overwrite an unrelated experiment.
# Note this replaces any earlier score stored for the same combination.
key = (2, 8, 0.0, 0.5)  # (n_hidden_layers, layer_size, reg_penalty, dropout)
experiments[key], model = run_experiment(n_hidden_layers = key[0],
                                         hidden_layer_size = key[1],
                                         reg_penalty = key[2],
                                         dropout = key[3],
                                         epochs=240)

08:44:24 Generate splits [140, 279, 418, 557, 696]
08:44:24 Build model
layer 0 size 8, reg_penalty 0.00000000, dropout 0.500
layer 1 size 8, reg_penalty 0.00000000, dropout 0.500
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
main_input (InputLayer)         (None, 120)          0                                            
__________________________________________________________________________________________________
Dense00 (Dense)                 (None, 8)            968         main_input[0][0]                 
__________________________________________________________________________________________________
dropout_287 (Dropout)           (None, 8)            0           Dense00[0][0]                    
__________________________________________________________________________________________________
Dense01 (Dense)             

................................................................................
08:47:52 Still training
................................................................................
08:48:38 Still training
................................................................................
08:49:24 Still training

08:49:24 Best Xval loss epoch 6, value 40.935083
Training indexes 0 to 556
Cross-validating indexes 557 to 695
................................................................................
08:50:22 Still training
................................................................................
08:51:21 Still training
................................................................................
08:52:19 Still training

08:52:19 Best Xval loss epoch 12, value 40.436932
Last Xval loss 40.436932
Avg Xval loss 37.982769
--------------------------------------------------------------------------------


In [65]:
EPOCHS=160

def fit_predict(X, Y, model):
    """Train `model` on all rows but the last and forecast from the last row.

    Rows 0..n-2 of X and Y are used for fitting (EPOCHS epochs, BATCH_SIZE
    batches); the final row of X is held back and used only as the input for
    the forecast. Returns a list with one predicted value per output head.
    """
    # held-back final observation, shaped as a single-row batch
    latest_row = X[-1].reshape(1, X.shape[1])

    # everything before it is training history
    history_X, history_Y = X[:-1], Y[:-1]

    # one target vector per output head
    targets = [history_Y[:, col] for col in range(OUTPUT_DIM)]

    model.fit(history_X,
              targets,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              verbose=0)

    return [head[0][0] for head in model.predict(latest_row)]

# rebuild X and Y from every row of `data`
X = data.values[:,:120]
Y = data.values[:, -30:]
# train `model` on all rows but the last, then forecast from the last row
predictions = fit_predict(X, Y, model)
predictions

[0.6711899,
 0.55624336,
 0.81909174,
 0.60417473,
 0.56379104,
 0.7703549,
 0.7968764,
 0.72180486,
 0.38120115,
 0.605703,
 0.36519054,
 0.051413134,
 0.7337813,
 0.45761275,
 0.083381325,
 0.7505549,
 0.47567996,
 0.16907592,
 0.53487134,
 0.57084155,
 0.3908484,
 0.9229705,
 0.65767616,
 0.47478187,
 0.5176616,
 0.7718226,
 0.5405979,
 0.93776643,
 0.5745884,
 0.35901728]

In [60]:
# 197001 = 121
# (row position of 1970-01 when row 0 is 1959-12)
STARTMONTH = 121
# sanity check: show the X row next to the first 30 columns of the same data row
print(X[STARTMONTH])
print(data.iloc[STARTMONTH][:30])

[ -3.34        -1.95        -7.59        -7.76       -12.05
  -7.5         -5.69        -7.71        -7.37        -5.26
  -9.84        -6.31        -7.15        -6.89        -9.35
 -12.49        -2.34        -0.77       -12.16        -4.83
  -3.16       -11.17        -9.73        -8.89        -8.17
  -8.28        -6.31       -13.12        -9.78        -6.2
  -2.28666667  -2.18        -3.36        -7.00333333  -6.82
  -3.37666667  -5.32666667  -1.41        -5.58666667  -5.43333333
  -6.02666667  -4.45        -4.68333333  -4.67666667  -5.93666667
  -9.37        -2.48         2.38666667  -6.98666667  -3.96666667
  -2.88333333  -4.91666667  -4.43        -4.87666667  -8.43
  -6.69666667  -4.78333333  -7.30333333  -6.68333333  -6.49
   0.82833333   1.51833333   1.45833333  -0.77333333  -0.9
  -0.20833333  -1.76333333   2.39833333  -2.68166667  -1.68666667
  -1.015       -1.26        -1.80166667  -0.37        -2.11833333
  -3.77333333  -0.13166667   3.405       -4.49333333  -1.795
  -1.535   

In [69]:
# fit and predict all months starting STARTMONTH using data up to that month
# compute predictions matrix P
# compute returns matrix R using mean(top 6, (-bot 6))

def run_backtest(X, Y, arg_dict, startmonth=0, numstocks=6, start_date='01/01/1970'):
    """Walk-forward backtest of a long/short strategy driven by the network.

    For every month from `startmonth` on, a fresh model is built with
    build_model(**arg_dict) and passed to fit_predict() with the rows before
    that month; the resulting forecast is stored in row `month_index` of the
    global matrix P. The month's strategy return, stored in the global vector
    R, is half the mean of X over the `numstocks` highest-forecast columns
    minus half the mean over the `numstocks` lowest-forecast columns.

    Args:
        X: predictor matrix; its first OUTPUT_DIM columns are read as the
            realized returns of each row's month.
        Y: response matrix, one column per output.
        arg_dict: keyword arguments for build_model().
        startmonth: first row to forecast. Must be at least 2 so that
            fit_predict() has at least one training row besides the row it
            forecasts from.
        numstocks: number of columns held long and number held short.
        start_date: calendar date given to the first backtest month.

    Returns:
        DataFrame with a single 'Value' column holding start, end, cagr,
        yearly_vol, yearly_sharpe, max_drawdown and sortino.

    Raises:
        ValueError: if startmonth < 2 or numstocks is outside 1..OUTPUT_DIM.

    Side effects:
        Rebinds the module-level globals P (forecasts) and R (returns).
    """
    global P
    global R

    if startmonth < 2:
        raise ValueError("startmonth must be >= 2 so there is history to train on, got %r" % (startmonth,))
    if not 1 <= numstocks <= OUTPUT_DIM:
        raise ValueError("numstocks must be between 1 and %d, got %r" % (OUTPUT_DIM, numstocks))

    print("%s Starting backtest" % (time.strftime("%H:%M:%S")))
    print(arg_dict)
    n_months = X.shape[0]
    P = np.zeros((n_months, OUTPUT_DIM))
    count = 0
    # Only months that have a row in P are forecast. A forecast for the month
    # after the last row has nowhere to be stored, so no model is trained for it.
    # NOTE(review): a new model is built every month; if iterations get slower
    # over time, consider releasing backend state between months -- confirm.
    for month_index in range(startmonth, n_months):
        model = build_model(**arg_dict)
        P[month_index] = fit_predict(X[:month_index, :],
                                     Y[:month_index],
                                     model)
        sys.stdout.write('.')
        count += 1
        if count % 80 == 0:
            print("")
            print("%s Still training" % (time.strftime("%H:%M:%S")))
        sys.stdout.flush()

    R = np.zeros(n_months)

    for month_index in range(startmonth, n_months):
        # get indexes of sorted smallest to largest
        ranked = np.argsort(P[month_index])
        # leftmost numstocks = lowest forecasts
        short_indexes = ranked[:numstocks]
        # rightmost numstocks = highest forecasts
        long_indexes = ranked[-numstocks:]
        # compute equal weighted long/short return
        R[month_index] = np.mean(X[month_index, long_indexes])/2 - np.mean(X[month_index, short_indexes])/2

    results = R[startmonth:]

    index = pd.date_range(start_date, periods=results.shape[0], freq='M')
    perfdata = pd.DataFrame(results,index=index,columns=['Returns'])
    # results are divided by 100 before compounding; equity starts from a base of 100
    perfdata['Equity'] = 100 * np.cumprod(1 + results / 100)

    stats = perfdata['Equity'].calc_stats()

    stat_names = ['start', 'end', 'cagr', 'yearly_vol', 'yearly_sharpe', 'max_drawdown']
    values = [stats.stats.loc[name] for name in stat_names]
    # NOTE(review): 564 looks like a hard-coded month count; confirm it is
    # still appropriate for the length of `results`.
    values.append(ffn.core.calc_sortino_ratio(perfdata.Returns, rf=0, nperiods=564, annualize=False))

    retframe = pd.DataFrame(values,
                            index=stat_names + ['sortino'],
                            columns=['Value'])
    return retframe


In [None]:
# settings passed to build_model for every month of the backtest
arg_dict = dict(n_hidden_layers=2,
                hidden_layer_size=8,
                reg_penalty=0.0,
                dropout=0.5,
                verbose=False)

#model = build_model(**arg_dict)
run_backtest(X, Y, arg_dict, startmonth=STARTMONTH)

................................................................................
11:46:34 Still training
................................................................................
15:16:56 Still training
.......................

In [None]:
# double check results
results = R[STARTMONTH:]
# number of backtest months
print(len(results))
#print(results)
# mean monthly return
print(np.mean(results))
# monthly standard deviation scaled by sqrt(12)
print(np.std(results) * np.sqrt(12))
# cumulative growth factor
print(np.prod(1 + results / 100))
# annualized growth rate; the "- 1" belongs inside print(): outside the
# parentheses it would be applied to print()'s return value on Python 3
print(np.prod(1 + results / 100) ** (12.0/results.shape[0]) - 1)

In [None]:
# calc MSE across all preds
# P has one column per industry, so compare it with the matching leading
# columns of X (the realized returns), not with all of X's columns
np.mean((P[STARTMONTH:] - X[STARTMONTH:, :P.shape[1]])**2)

In [None]:
# run chart
perf = 100 * np.cumprod(1 + results / 100)

# one point per month, as a fractional year counted from January 1970
# (stretching the points between two fixed years would not keep 1/12 spacing)
x_coords = 1970 + np.arange(perf.shape[0]) / 12.0

trace1 = Scatter(
    x = x_coords,
    y = perf,
    # the equity curve above is built from a base of 100
    name = 'Growth of $100',
)

# log scale so equal percentage moves look the same size at any level
layout = Layout(
    yaxis=dict(
        type='log',
        autorange=True
    )
)
plotdata = [trace1]

fig = Figure(data=plotdata, layout=layout)

iplot(fig)