In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import lime
import lime.lime_tabular
import shap

In [27]:
df = pd.read_csv("data.csv")

In [28]:
df = df.drop(columns = ['Unnamed: 0'])

In [29]:
df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

In [30]:
df['Date'] = pd.to_datetime(df['Date'])

In [31]:
df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

In [32]:
df_filled.isna().sum()

Date             0
permno           0
return           0
mom1m            0
mom12m           0
chmom            0
indmom           0
mom36m           0
turn             0
mvel1            0
dolvol           0
ill              0
zerotrade        0
baspread         0
retvol           0
idiovol          0
beta             0
betasq           0
ep               0
sp               0
agr              0
nincr            0
return(t-1)    500
r(t+1)         500
dtype: int64

In [33]:
df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

In [34]:
df.set_index('Date', inplace=True)

# Sorting data into bins

In [35]:
bins = pd.read_pickle('dataframes.pkl')
print(len(bins))
bins

9


[               mom1m    mom12m     chmom    indmom    mom36m      turn  \
 Date                                                                     
 2001-02-28 -1.227217  1.496115  0.464327  2.002047 -0.894952 -0.669719   
 2001-03-31 -0.738248  0.268442  1.028669  0.566989 -0.741515 -0.691479   
 2001-04-30 -0.145482  0.232072  1.139679  0.500932 -0.777376 -0.782179   
 2001-05-31  0.680502  0.196008  1.082926  0.435431 -0.757976 -0.839218   
 2001-06-30 -0.256864  0.793600 -0.192024  1.458838 -0.753031 -0.847905   
 ...              ...       ...       ...       ...       ...       ...   
 2019-08-31 -0.456971 -0.371019  0.000223 -0.594437 -0.279818  0.183965   
 2019-09-30  0.056832 -0.625031  0.057134 -1.055789 -0.268858  0.215138   
 2019-10-31 -0.205827 -0.556172 -0.054997 -0.930723 -0.074809  0.284859   
 2019-11-30 -0.106308 -0.595088  0.232050 -1.001404 -0.065805  0.210304   
 2019-12-31  0.615626 -0.530693  0.744224 -0.884447 -0.188191  0.090279   
 
                mvel1  

# Fitting model

In [36]:
training = df[:'2014-01-01'].dropna()
validation = df['2014-01-02':'2018-01-01']
testing = df[:'2020-01-01'].dropna()

In [37]:
X_train = training.drop(columns = ['permno', 'return', 'r(t+1)'])
y_train = training['r(t+1)']

In [38]:
X_val = validation.drop(columns = ['permno', 'return', 'r(t+1)'])
y_val = validation['r(t+1)']

In [39]:
X_test = testing.drop(columns = ['permno', 'return', 'r(t+1)'])
y_test = testing['r(t+1)']

In [40]:
#Hyperparameters based on the paper
depth = 6 
n_trees = 300         
feature_split = 3 

In [41]:
def R_calc(actual, predicted):

    actual = np.array(actual)
    predicted = np.array(predicted).flatten()
    
    predicted = np.clip(predicted, 0, None)
    
    ss_res = np.sum((actual - predicted) ** 2)
    
    ss_tot = np.sum(actual ** 2)
    
    r_squared = 1 - (ss_res / ss_tot)
    
    return r_squared

In [42]:
rf = RandomForestRegressor(n_estimators = n_trees,
                                   max_depth = depth,
                                   max_features = feature_split,
                                   random_state = 42)
        
rf.fit(X_train, y_train)
        
predictions = rf.predict(X_val)
        
r_val = R_calc(y_val, predictions)
        
print(r_val)

0.008260710363663715


In [43]:
test_pred = rf.predict(X_test)
val = R_calc(y_test, test_pred)
print(val)

0.05764657658981798


# Explain Bins

In [None]:
dtype_specification = {'SICCD': str}
df = pd.read_csv('less_messy_data_keyword_LESS.csv', dtype=dtype_specification)
df = df.drop(columns = ['COMNAM', 'Unnamed: 0.1', 'Unnamed: 0', 'r(t+1)' , 'return'])

In [44]:
explainer = lime.lime_tabular.LimeTabularExplainer(X_train.values, 
                                                   feature_names=X_train.columns, 
                                                   class_names=['r(t+1)'], 
                                                   mode='regression')

In [47]:
#BIN 1
explanations1 = {}
#bins = df['bin1'].unique()

i = 0
for bin in bins:
    
    sample = bin.sample(n = 10)
    temp_exp = []
    
    for index, row in sample.iterrows():
        explanation = explainer.explain_instance(data_row = row[X_train.columns].values, 
                                                 predict_fn = rf.predict, 
                                                 num_features = 4)
        exp_map = dict(explanation.as_list())
        temp_exp.append(exp_map)
        
    explanations1[i] = temp_exp
    i += 1







In [48]:
summaries1 = {}

i = 0

for bin, explanations in explanations1.items():
    
    feat_imp = {}
    
    for exp in explanations:
                
        for feat, imp in exp.items():
                        
            if feat in feat_imp:
                
                feat_imp[feat].append(imp)
                
            else:
                
                feat_imp[feat] = [imp]
                
    all_importances = {feat: np.mean(imps) for feat, imps in feat_imp.items()}
    summaries1[i] = all_importances
    i += 1

bin_summary = pd.DataFrame(summaries1).fillna(0)
bin_summary

Unnamed: 0,0,1,2,3,4,5,6,7,8
dolvol <= 11.29,0.006381,0.006521,0.006446,0.006417,0.00625,0.006417,0.006324,0.006439,0.0064
mvel1 <= 144063.06,0.003602,0.003575,0.003664,0.003621,0.003581,0.003448,0.003563,0.003604,0.00338
indmom <= -0.06,0.002301,0.002609,0.002664,0.002534,0.002044,0.002352,0.002623,0.002658,0.002407
mom1m <= -0.05,0.002567,0.00223,0.002702,0.002449,0.00245,0.0,0.002042,0.002324,0.002313
mom36m <= -0.21,0.003946,0.004095,0.003949,0.003979,0.003959,0.004104,0.003976,0.003852,0.004026
sp > 1.66,0.003185,0.003761,0.0,0.0,0.0,0.003549,0.0,0.0,0.0
ep <= 0.01,0.003959,0.004105,0.00391,0.00394,0.003896,0.004123,0.0,0.003441,0.004056
return(t-1) <= -0.05,0.002513,0.003129,0.0,0.002168,0.002638,0.0,0.002549,0.002181,0.0
idiovol > 0.07,0.003017,0.003273,0.003258,0.003006,0.003125,0.002728,0.002924,0.002964,0.002785
baspread > 0.05,0.002972,0.002644,0.002519,0.0,0.0,0.0,0.002827,0.0027,0.0


In [49]:
#BIN 1
explainer = shap.TreeExplainer(rf)
shap_values1 = {}
i = 0
for bin in bins:
    
    sample = bin.sample(n = 10)
    X_sample = sample[X_train.columns]
    temp_values = explainer.shap_values(X_sample)
    shap_values1[i] = temp_values
    i += 1

summary1 = {}

i = 0
for bin, values in shap_values1.items():
    
    mean_values = np.abs(values).mean(axis=0)
    summary1[i] = dict(zip(X_train.columns, mean_values))
    i += 1

bin_shap_summary = pd.DataFrame(summary1).fillna(0)
bin_shap_summary

Unnamed: 0,0,1,2,3,4,5,6,7,8
mom1m,0.00423,0.011299,0.009756,0.007409,0.010852,0.01204,0.008489,0.013979,0.006586
mom12m,0.011642,0.029468,0.003358,0.005611,0.001725,0.016365,0.001556,0.00819,0.011627
chmom,0.002116,0.002685,0.001693,0.002481,0.001695,0.003383,0.000997,0.001921,0.002658
indmom,0.017241,0.028804,0.015115,0.020061,0.010028,0.016056,0.00949,0.013818,0.020285
mom36m,0.004878,0.016394,0.003794,0.003399,0.008043,0.003301,0.000944,0.010739,0.005411
turn,0.007925,0.007144,0.013321,0.011539,0.014689,0.011962,0.019052,0.013505,0.00888
mvel1,0.075532,0.055739,0.068872,0.072618,0.070625,0.070668,0.072493,0.065638,0.080259
dolvol,0.033015,0.031635,0.029776,0.028895,0.030682,0.029539,0.031002,0.033451,0.034506
ill,0.00091,0.003637,0.00376,0.003355,0.002063,0.001473,0.00526,0.002541,0.004213
zerotrade,0.000329,0.000294,0.000421,0.000502,0.000747,0.000316,0.000398,0.000421,0.000497
