In [None]:
# run the single model file fitted on 9:4:6 split data

# %run -i "C:/Users/LL/Desktop/RPI/24_Spring/AMLF/HW_5_SHAP_LIME/Enet_NeuralNet_v2.ipynb"

In [22]:
import numpy as np
import pandas as pd

df = pd.read_csv('C:/users/LL/Documents/GitHub/AMLF_projects/data.csv')

df = df.drop(columns = ['Unnamed: 0'])

# According to note 30: "Therefore, to predict returns at month t+1, we use most recent monthly characteristics at the end of month t." <br>
# Hence, **shift return t+1 to serve as response: r(t+1)**.

df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

### handle missing data

# According to note 30 (bottom of p 2248): "Another issue is missing characteristics, which we replace with the cross-sectional median at each month for each stock, respectively." <br>
# Hence, calculate monthly cross-sectional median for features: **'mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr'**.

df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

df_filled.isna().sum()

df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

df['Date'] = pd.to_datetime(df['Date'])

# Set the datetime column as index
df.set_index('Date', inplace=True, drop = True)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_scaled = scaler.fit_transform(df)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns)


permno = df['permno'].reset_index(drop = True)

df_scaled['permno'] = permno

df_scaled.index = df.index

df_scaled_2 = df_scaled.drop(columns = [ 'permno', 'return'])


### split data

##split training, validation, and testing datasets

#training : validation : testing = 6 yr : 4yr : 9 yr <br>
#Also drop the first and last month due to the absence of r(t+1) and return(t-1)

training = df_scaled_2[:'2007-01-01'].dropna()
validation = df_scaled_2['2007-01-01':'2011-01-01']
testing = df_scaled_2['2011-01-01':].dropna()

training_combined = df_scaled_2[:'2011-01-01'].dropna()


##separate X and y

X_train = training.drop(columns = ['r(t+1)'])
y_train = training['r(t+1)']

X_val = validation.drop(columns = ['r(t+1)'])
y_val = validation['r(t+1)']

X_test = testing.drop(columns = ['r(t+1)'])
y_test = testing['r(t+1)']

X_train_combined = training_combined.drop(columns = ['r(t+1)'])
y_train_combined = training_combined['r(t+1)']

In [4]:
import keras
from keras import layers
from keras import Sequential

## performance metric

# Out-of-sample R^2. (r2_score_wo_demeaning)

import tensorflow as tf

def r2_score_wo_demeaning_nn(y_true, y_pred):
    ss_res = tf.reduce_sum(tf.square(y_true - y_pred))
    ss_tot = tf.reduce_sum(tf.square(y_true - 0))
    r2 = 1 - (ss_res / ss_tot)
    return r2


## tuning custom function for neural net

def compile_and_tune_model(model, parameter_dicts, x_train, y_train, x_val, y_val):
    results = []
    
    for params in parameter_dicts:

        model.compile(optimizer=keras.optimizers.Adam(learning_rate=params['learning_rate']),
                      loss='mean_squared_error', metrics=r2_score_wo_demeaning_nn)  # Using mean absolute error (mae) as metric
        
        history = model.fit(x_train, y_train, epochs=params['epoch'], batch_size=params['batch_size'],
                            validation_data=(x_val, y_val), verbose=0)
        
        # Get the metric value for the last epoch
        last_epoch_metric = history.history['r2_score_wo_demeaning_nn'][-1]  # Validation MAE for last epoch
        
        # Store results for current parameter set
        results.append({'params': params, 'val_r2_score_wo_demeaning_nn': last_epoch_metric})
    
    return results




In [5]:
## NN

# All activation functions are ReLU function <br>
# optimizer: SGD w/ learning rate shrinkage: adam <br>

# convert dataframes to numpy array

X_train_nn = np.asarray(X_train.values)
y_train_nn = np.asarray(y_train.values)

X_val_nn = np.asarray(X_val.values)
y_val_nn = np.asarray(y_val.values)

X_test_nn = np.asarray(X_test.values)
y_test_nn = np.asarray(y_test.values)

X_train_combined_nn = np.asarray(X_train_combined.values)
y_train_combined_nn = np.asarray(y_train_combined.values)

# set-up grid

nepoch_val = [25, 50, 75, 100]
lr_val = [0.05, 0.01, 0.001]
nbatch_val = [1500, 2500, 3500]

param_nn = [{'epoch': epoch, 'learning_rate': learning_rate, 'batch_size': batch_size} for epoch in nepoch_val for learning_rate in lr_val for batch_size in nbatch_val]

In [7]:
### 3-layer

input_dim = 20
layer1_n = 32
layer2_n = 16
layer3_n = 8


model_3 = Sequential([
            layers.Dense(layer1_n, input_dim = input_dim, activation='relu'),
            layers.Dense(layer2_n, activation='relu'),
            layers.Dense(layer3_n, activation='relu'),
            layers.Dense(1, activation='linear')
        ])


## tuning

result_nn3 = compile_and_tune_model(model_3, param_nn, X_train_nn, y_train_nn, X_val_nn, y_val_nn)

result_nn3


## best model**<br>

# best parameters

result_nn3 = pd.DataFrame(result_nn3).sort_values('val_r2_score_wo_demeaning_nn',ascending=False)
opt_score = result_nn3.iloc[0,1]
opt_para = result_nn3.iloc[0,0]

result_nn3.iloc[0,1]

result_nn3.iloc[0,0]

# best model

model_nn3 = Sequential([
            layers.Dense(layer1_n, activation='relu', input_dim=input_dim),
            layers.Dense(1, activation='linear')
        ])

model_nn3.compile(optimizer=keras.optimizers.Adam(learning_rate=opt_para['learning_rate']),
                      loss='mean_squared_error', metrics=r2_score_wo_demeaning_nn)

history = model_nn3.fit(X_train_combined_nn, y_train_combined_nn, epochs=opt_para['epoch'], batch_size=opt_para['batch_size'],
                            validation_data=(X_test_nn, y_test_nn), verbose=0)
        
last_epoch_metric = history.history['val_r2_score_wo_demeaning_nn'][-1]


last_epoch_metric






-0.04292283207178116

# HW 5

In [23]:
X = df_scaled_2.dropna().drop(columns = ['r(t+1)'])

sort data:<br>
by dolvol<br>

In [24]:
X = X.sort_values(by=["dolvol"]).reset_index(drop = True)

In [25]:
print(min, X['dolvol'].min())
print(max, X['dolvol'].max())

<built-in function min> -4.749755120876968
<built-in function max> 1.9821470349015913


create 5 bins

In [26]:
bins = list()
end = 0
start = 0
size = int(113000/5)

for i in range(5):
    end = end + size
    tmp = X.iloc[start:end,:]
    bins.append(tmp)
    start = start + size

In [27]:
# to make sure bins are correct

for item in bins:
    print(len(item))
    print(item['dolvol'].max())
    print(item['dolvol'].min())
    print('------')

22600
-0.9093693679629887
-4.749755120876968
------
22600
-0.17561548513264147
-0.9093631545240612
------
22600
0.35791880105283025
-0.1756097149360904
------
22600
0.8891905133723533
0.3579209861792339
------
22600
1.9821470349015913
0.8892152394947396
------


## LIME
features of interest: dolvol, baspred, sp
***
unclear instruction -> assumptions: 1) randomly draw a sample first  2) don't perturb data and draw 10 other instances to fit a local model

In [None]:
for df in bins:
    x = df.sample(1).values[0]

## Shapley
features of interest: dolvol(8), baspred(11), sp(17)

In [28]:
from itertools import combinations

In [54]:
# calculate the shaply value for feature j

M = 10  # the number of random samples to calculate marginal contribution from
n_features = len(X.columns)
feature_idxs = [7, 10, 16]


for df in bins:  # for each bin
    x = df.sample(1).values[0]  # instance of interest
    
    for i in range(3):   # for each feature of interest
        x_idx = [7, 10, 16]
        marg_contri_feature = []
    
        feature_of_interest = feature_idxs[i]
        print("Index of feature of interest: ", feature_of_interest)
        x_idx.remove(feature_of_interest)
        
        all_combinations = []  # all coalitions of features minus the feature of interest
        for r in range(1, len(x_idx) + 1):
            combinations_of_length_r = list(combinations(x_idx, r))
            all_combinations.extend(combinations_of_length_r)
            
        for coalition in all_combinations:  # for each coalition
            coalition = list(coalition)
            print('the current coalition: ', coalition)
            
            marg_contri_coalition = []  # marginal contributions of this coalition
            
            for _ in range(M):
                z = df.sample(1).values[0]  # random select sample    

                # construct two new instances
                x_orig = np.array([x[i] if i in coalition + [feature_of_interest] else z[i] for i in range(n_features)])
                x_rando = np.array([x[i] if i in coalition else z[i] for i in range(n_features)])

                print("modified data")
                print(x_orig)
                print(x_orig)
                
                
#                 calculate marginal contribution
#                 marginal_contribution = f.predict_proba(x_plus_j.reshape(1, -1))[0][1] - f.predict_proba(x_minus_j.reshape(1, -1))[0][1]
#                 marginal_contributions.append(marginal_contribution)

#             sigma_j = sum(marginal_contributions) / len(marginal_contributions)  # average across random samples
#             weight = coalition! * (n_features - coalition -1)!/ n_features!
#             shapley_coalition = sigma_j*weight  # for one coaltion examined for feature of interest
#             marg_contri_coalition.append(shapley_coalition)

#           shapley_j = sum(marg_contri_coalition)  # shapley value of feature j
#           print("shapley value of beforehand mentioned feature: ", shapley_j)


Index of feature of interest:  7
the current coalition:  [10]
modified data
[ 0.20157062  0.27216183  0.05872581  0.26564557  0.27244201 -0.65842123
 -0.2796748  -1.60582648 -0.10676761 -0.18796558  1.09080632 -0.48304233
 -0.47832715 -1.19266528 -0.83630283  0.21100585 -0.3968446  -0.42336473
  0.04423431  0.18155159]
[ 0.20157062  0.27216183  0.05872581  0.26564557  0.27244201 -0.65842123
 -0.2796748  -1.60582648 -0.10676761 -0.18796558  1.09080632 -0.48304233
 -0.47832715 -1.19266528 -0.83630283  0.21100585 -0.3968446  -0.42336473
  0.04423431  0.18155159]
modified data
[-0.29847025 -0.28965144 -0.36258038 -0.28803396 -0.65056979 -0.86039598
 -0.28003693 -1.60582648 -0.05449415  1.59632289  1.09080632 -0.29252308
 -0.36691828 -1.45286083 -0.88819614  0.32671981  0.23528648  0.27859215
 -0.76259956 -0.27238498]
[-0.29847025 -0.28965144 -0.36258038 -0.28803396 -0.65056979 -0.86039598
 -0.28003693 -1.60582648 -0.05449415  1.59632289  1.09080632 -0.29252308
 -0.36691828 -1.45286083 -0.8

modified data
[ 1.00941344e+00  6.48098634e-01  8.16533798e-01  5.68190756e-01
 -3.63026116e-03 -4.41959483e-01 -2.67818788e-01 -3.90257552e-01
 -1.09447852e-01 -1.87965586e-01  7.16425373e-01 -1.78510881e-01
  1.47653354e+00  1.00524155e-01 -1.38842412e-01  1.82097663e-01
  1.20403839e+01  1.58283548e-01  4.42343131e-02  8.89957212e-01]
[ 1.00941344e+00  6.48098634e-01  8.16533798e-01  5.68190756e-01
 -3.63026116e-03 -4.41959483e-01 -2.67818788e-01 -3.90257552e-01
 -1.09447852e-01 -1.87965586e-01  7.16425373e-01 -1.78510881e-01
  1.47653354e+00  1.00524155e-01 -1.38842412e-01  1.82097663e-01
  1.20403839e+01  1.58283548e-01  4.42343131e-02  8.89957212e-01]
modified data
[-1.36924713 -0.80095285 -0.68185772 -0.36222946 -0.25759846 -0.46380319
 -0.2767815  -0.39025755 -0.10754767 -0.18796558  0.71642537 -0.47147427
 -0.17003046  0.21591439 -0.04104425  0.157441   12.04038392 -0.01326528
 -0.76259956 -1.38504885]
[-1.36924713 -0.80095285 -0.68185772 -0.36222946 -0.25759846 -0.46380319
 -

modified data
[ 0.41619988  0.12024364  0.59640985 -0.47134784 -0.21178823  1.88781671
 -0.26862092 -0.10488758 -0.11032524 -0.18796561 -0.43048302  0.24026753
  0.02403937  1.91828503  2.07889117  0.41230224 -0.37629231 -0.67780771
  0.85106818  0.42026862]
[ 0.41619988  0.12024364  0.59640985 -0.47134784 -0.21178823  1.88781671
 -0.26862092 -0.10488758 -0.11032524 -0.18796561 -0.43048302  0.24026753
  0.02403937  1.91828503  2.07889117  0.41230224 -0.37629231 -0.67780771
  0.85106818  0.42026862]
modified data
[-1.41794207 -0.06180326 -2.57403862 -0.55977163  1.01668493  1.07474336
 -0.27355796 -0.10488758 -0.109457   -0.1879656  -0.43048302  2.24800216
  1.44720635 -0.11127339 -0.30318699  0.09015087 -0.37629231  0.49269146
  0.04423431 -1.09374532]
[-1.41794207 -0.06180326 -2.57403862 -0.55977163  1.01668493  1.07474336
 -0.27355796 -0.10488758 -0.109457   -0.1879656  -0.43048302  2.24800216
  1.44720635 -0.11127339 -0.30318699  0.09015087 -0.37629231  0.49269146
  0.04423431 -1.09

modified data
[-0.23419499  0.42059101  0.02198885 -0.31898553  0.10561578 -0.45432084
 -0.16666637  0.42731828 -0.1104901  -0.18796559 -0.40845442 -0.75857329
 -0.10015852 -1.78011264 -0.91139869  0.15732977 -0.47820201 -0.33155007
 -0.76259956 -0.23204763]
[-0.23419499  0.42059101  0.02198885 -0.31898553  0.10561578 -0.45432084
 -0.16666637  0.42731828 -0.1104901  -0.18796559 -0.40845442 -0.75857329
 -0.10015852 -1.78011264 -0.91139869  0.15732977 -0.47820201 -0.33155007
 -0.76259956 -0.23204763]
modified data
[-0.39581333  0.06977646 -0.26057763  0.15553932  0.37906707 -0.01668636
 -0.2119996   0.51210706 -0.11054551 -0.1879656  -0.40845442 -0.87695544
 -0.56992036  0.37136503  0.09991916  0.2551098  -0.47820201  0.09513121
 -0.76259956 -0.3906629 ]
[-0.39581333  0.06977646 -0.26057763  0.15553932  0.37906707 -0.01668636
 -0.2119996   0.51210706 -0.11054551 -0.1879656  -0.40845442 -0.87695544
 -0.56992036  0.37136503  0.09991916  0.2551098  -0.47820201  0.09513121
 -0.76259956 -0.39

modified data
[-1.30618271  1.25516018 -0.43005664 -0.28645898  0.33417992  0.60167597
 -0.12695017  1.23983594 -0.11057167 -0.18796561  0.08345199 -0.02867817
 -0.38811476  0.748551    0.4859126   0.1821232  -0.53293446 -0.74862606
 -0.76259956 -1.28647545]
[-1.30618271  1.25516018 -0.43005664 -0.28645898  0.33417992  0.60167597
 -0.12695017  1.23983594 -0.11057167 -0.18796561  0.08345199 -0.02867817
 -0.38811476  0.748551    0.4859126   0.1821232  -0.53293446 -0.74862606
 -0.76259956 -1.28647545]
the current coalition:  [10]
modified data
[ 0.17660163  0.19392334 -0.13587644 -1.09610024  0.4015108   2.98466692
 -0.20360657  1.07315565 -0.11056247 -0.18796561 -0.42776772 -0.43719858
 -0.27838272 -0.84444491 -0.72049649  0.21644969 -0.53293446  0.2944972
  0.04423431  0.16048363]
[ 0.17660163  0.19392334 -0.13587644 -1.09610024  0.4015108   2.98466692
 -0.20360657  1.07315565 -0.11056247 -0.18796561 -0.42776772 -0.43719858
 -0.27838272 -0.84444491 -0.72049649  0.21644969 -0.53293446  0