In [1]:
import os, random
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.layers import Dense, BatchNormalization
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from keras.callbacks import EarlyStopping
from keras.losses import Huber
import tensorflow_addons as tfa
from keras.regularizers import L1L2, L1, L2

# Project root used for all relative paths below (e.g. 'data/dataset.pkl').
# The ML_PAPER_DIR environment variable overrides it so the notebook can run on
# other machines; when unset, the original author's location is used unchanged.
PROJECT_DIR = os.environ.get('ML_PAPER_DIR', 'd:\\学习\\Paper_under_working\\20230101_machine_learning_paper')
os.chdir(PROJECT_DIR)

In [2]:
# load data
# Read the pickled dataset and use its 'year_month' column as the row index.
X = pd.read_pickle('data/dataset.pkl').set_index('year_month')

# Column groups. The group lists are declared first and `cols` (every column that
# gets scaled) is assembled from them, so each name is written only once.

# firm feature
firm_features = [
    'InvestPPEInv', 'ShareIss1Y', 'ShareRepurchase', 'DelCOA', 'dNoa', 'GrLTNOA',
    'IntMom', 'LRreversal', 'Mom12m', 'Mom6m', 'MRreversal', 'ResidualMomentum',
    'STreversal', 'AM', 'BMdec', 'ChEQ', 'AssetGrowth', 'ChNWC', 'DelEqu', 'NOA',
    'Size', 'SP', 'AbnormalAccruals', 'Accruals', 'PctAcc', 'OPLeverage',
    'BookLeverage', 'CF', 'cfp', 'DelCOL', 'DelFINL', 'IdioRisk', 'IdioVol3F',
    'Leverage', 'Beta', 'BetaFP', 'BidAskSpread', 'DolVol', 'Illiquidity', 'PRC',
    'VolMkt', 'VolSD', 'High52', 'MaxRet', 'CashProd', 'GP', 'roaq', 'RoE', 'DelLTI',
]

# macro features
macro_features = ['CFNAI', 'P_I', 'EU_H', 'C_H', 'SO_I']

# fama-french 5 factors
ff_5factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']

# all columns: firm features, macro features, factors, then the four return columns
cols = firm_features + macro_features + ff_5factors + ['ab_capm', 'ab_ff3', 'ab_ff5', 'ex_return']

In [3]:
# split data into three equally sized parts: train / validation / test
# NOTE(review): rows are shuffled before splitting although X is indexed by
# 'year_month'; if a chronological split is intended this mixes periods — confirm.
from sklearn.model_selection import train_test_split

# first hold out one third of all rows as the test set ...
train, X_test = train_test_split(
    X,
    test_size=1/3,
    shuffle=True,
    random_state=100,
)
# ... then halve the remainder into the training and validation sets
X_train, X_val = train_test_split(
    train,
    test_size=1/2,
    shuffle=True,
    random_state=100,
)

In [4]:
# scale data (standardized)
# Fit the scaler on the training split only, then apply the same transformation
# to the validation and test splits.
from sklearn.preprocessing import StandardScaler
scl = StandardScaler()

# NOTE: `cols` also contains the return columns used as targets later on, so the
# targets are standardized together with the features.
# Each scaled frame keeps the row index of the split it was computed from
# (the training frame previously received X_test's index by mistake).
X_train_scaled = pd.DataFrame(scl.fit_transform(X_train[cols]), index=X_train.index, columns=cols)
X_val_scaled = pd.DataFrame(scl.transform(X_val[cols]), index=X_val.index, columns=cols)
X_test_scaled = pd.DataFrame(scl.transform(X_test[cols]), index=X_test.index, columns=cols)

In [49]:
def model_fit(X, y, X_val, y_val, penalty, learning_rate, decay_rate, momentum, batch_size,
              units=16, epochs=100, patience=20):
  """Build, compile and train a one-hidden-layer regression network.

  The network is Dense(units, relu, he_uniform) -> BatchNormalization -> Dense(1),
  with an L1 penalty on the kernel and bias of both Dense layers. It is trained
  with SGD on mean squared error and stopped early on the validation R-square.

  Parameters
  ----------
  X, y : training features and target, passed to `model.fit`.
  X_val, y_val : validation features and target, passed as `validation_data`.
  penalty : L1 regularization factor for kernels and biases.
  learning_rate, decay_rate, momentum : forwarded to the SGD optimizer
      (`decay_rate` is passed as its `decay` argument).
  batch_size : mini-batch size used by `model.fit`.
  units : width of the hidden layer (default 16, the previously hard-coded value).
  epochs : maximum number of training epochs (default 100).
  patience : epochs without improvement of 'val_r_square' before stopping (default 20).

  Returns
  -------
  (h, model) : the object returned by `model.fit` (its `.history` holds the
      per-epoch metrics) and the trained model, with the weights of the best
      epoch restored when early stopping triggers.
  """
  model = Sequential()
  model.add(Dense(units, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=L1(penalty), bias_regularizer=L1(penalty)))
  model.add(BatchNormalization())
  model.add(Dense(1, kernel_regularizer=L1(penalty), bias_regularizer=L1(penalty)))

  # add early stop: maximize the validation R-square and roll back to the best epoch
  earlystop = EarlyStopping(monitor='val_r_square', mode='max', patience=patience, verbose=1, restore_best_weights=True)

  # Compile model
  sgd = SGD(learning_rate=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
  # The RSquare metric is reported as 'r_square' / 'val_r_square', which is the
  # name the early-stopping monitor above relies on.
  model.compile(loss='mse', optimizer=sgd, metrics=[tfa.metrics.RSquare()])

  # `callbacks` is documented as a list of callbacks, so wrap the single callback.
  h = model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=0, validation_data=(X_val, y_val), callbacks=[earlystop])
  return h, model

In [9]:
def set_seed(SEED):
  """Seed the random number generators used in this notebook.

  Stores `str(SEED)` in the PYTHONHASHSEED environment variable and passes
  `SEED` to the NumPy, TensorFlow and stdlib `random` seeding functions.
  """
  os.environ['PYTHONHASHSEED'] = str(SEED)
  # same order as before: NumPy, TensorFlow, then the stdlib generator
  for seeder in (np.random.seed, tf.random.set_seed, random.seed):
    seeder(SEED)


# fix the seed once for the training cells that follow
set_seed(SEED=1)

In [54]:
# tuning for predicting abnormal return (target column: 'ab_ff5')
h, model = model_fit(
    X=X_train_scaled[firm_features],
    y=X_train_scaled['ab_ff5'],
    X_val=X_val_scaled[firm_features],
    y_val=X_val_scaled['ab_ff5'],
    penalty=0.001,
    learning_rate=0.01,
    decay_rate=0.001,
    momentum=0.99,
    batch_size=5000,
)

Restoring model weights from the end of the best epoch: 35.
Epoch 55: early stopping


In [None]:
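# Tuning notes: hidden-layer width vs. results of three runs each.
# NOTE(review): the pairs look like train / validation R-square in units of 1e-4
# (the last 16-node pair matches 0.0075 / 0.0062 in the history table below) — confirm.
# 32 nodes:
# 72-57, 67-60, 78-64
# 16 nodes:
# 70-59, 94-70, 75-62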

In [55]:
# per-epoch training history, best validation R-square first
(
    pd.DataFrame(h.history)
    .sort_values('val_r_square', ascending=False)
)

Unnamed: 0,loss,r_square,val_loss,val_r_square
34,1.004985,0.007564,1.001679,0.006279
38,1.006776,0.005777,1.001698,0.006169
42,1.003591,0.007742,1.000873,0.006105
45,1.001954,0.009235,1.000594,0.005905
30,1.005164,0.008015,1.002993,0.005578
49,1.001963,0.008974,1.000685,0.005526
24,1.008866,0.006276,1.004949,0.005298
44,1.003347,0.008186,1.001612,0.00525
32,1.006992,0.00603,1.003307,0.005118
19,1.01584,0.003944,1.009872,0.00499


In [8]:
# tuning for predicting excess return (target column: 'ex_return')
h, model = model_fit(
    X=X_train_scaled[firm_features],
    y=X_train_scaled['ex_return'],
    X_val=X_val_scaled[firm_features],
    y_val=X_val_scaled['ex_return'],
    penalty=0.001,
    learning_rate=0.01,
    decay_rate=0.001,
    momentum=0.99,
    batch_size=10000,
)

# per-epoch training history, best validation R-square first
(
    pd.DataFrame(h.history)
    .sort_values('val_r_square', ascending=False)
)

Restoring model weights from the end of the best epoch: 61.
Epoch 81: early stopping


Unnamed: 0,loss,r_square,val_loss,val_r_square
60,1.001434,0.009876,1.006022,0.007030
42,1.011209,0.006999,1.012892,0.006765
73,0.996852,0.012687,1.004718,0.006715
62,1.000407,0.010530,1.005951,0.006712
80,0.999319,0.009898,1.004369,0.006652
...,...,...,...,...
4,1.423932,-0.076136,1.486605,-0.146534
3,1.463586,-0.105802,1.538052,-0.180311
2,1.462662,-0.112708,1.590119,-0.230664
0,1.940850,-0.649954,1.636596,-0.327018


In [9]:
# per-epoch history of the most recent fit, best validation R-square first
(
    pd.DataFrame(h.history)
    .sort_values('val_r_square', ascending=False)
)

Unnamed: 0,loss,r_square,val_loss,val_r_square
60,1.001434,0.009876,1.006022,0.007030
42,1.011209,0.006999,1.012892,0.006765
73,0.996852,0.012687,1.004718,0.006715
62,1.000407,0.010530,1.005951,0.006712
80,0.999319,0.009898,1.004369,0.006652
...,...,...,...,...
4,1.423932,-0.076136,1.486605,-0.146534
3,1.463586,-0.105802,1.538052,-0.180311
2,1.462662,-0.112708,1.590119,-0.230664
0,1.940850,-0.649954,1.636596,-0.327018
