In [None]:
# run the single model file fitted on 9:4:6 split data

# %run -i "C:/Users/LL/Desktop/RPI/24_Spring/AMLF/HW_5_SHAP_LIME/Enet_NeuralNet_v2.ipynb"

In [1]:
import numpy as np
import pandas as pd

df = pd.read_csv('C:/users/LL/Documents/GitHub/AMLF_projects/data.csv')

df = df.drop(columns = ['Unnamed: 0'])

# According to note 30: "Therefore, to predict returns at month t+1, we use most recent monthly characteristics at the end of month t." <br>
# Hence, **shift return t+1 to serve as response: r(t+1)**.

df['r(t+1)'] = df.groupby('permno')['return'].shift(-1)

### handle missing data

# According to note 30 (bottom of p 2248): "Another issue is missing characteristics, which we replace with the cross-sectional median at each month for each stock, respectively." <br>
# Hence, calculate monthly cross-sectional median for features: **'mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr'**.

df_filled = df.copy()
for feature in ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']:
    df_filled[feature] = df_filled.groupby('Date')[feature].transform(lambda x: x.fillna(x.median()))

df_filled.isna().sum()

df.loc[:, ['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']] = df_filled.loc[:,['mom1m', 'mom12m', 'chmom', 'mom36m', 'turn', 'dolvol', 'idiovol', 'beta', 'betasq', 'ep', 'sp', 'agr', 'nincr']]

df['Date'] = pd.to_datetime(df['Date'])

# Set the datetime column as index
df.set_index('Date', inplace=True, drop = True)

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df_scaled = scaler.fit_transform(df)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns)


permno = df['permno'].reset_index(drop = True)

df_scaled['permno'] = permno

df_scaled.index = df.index

df_scaled_2 = df_scaled.drop(columns = [ 'permno', 'return'])


### split data

##split training, validation, and testing datasets

#training : validation : testing = 6 yr : 4yr : 9 yr <br>
#Also drop the first and last month due to the absence of r(t+1) and return(t-1)

training = df_scaled_2[:'2007-01-01'].dropna()
validation = df_scaled_2['2007-01-01':'2011-01-01']
testing = df_scaled_2['2011-01-01':].dropna()

training_combined = df_scaled_2[:'2011-01-01'].dropna()


##separate X and y

X_train = training.drop(columns = ['r(t+1)'])
y_train = training['r(t+1)']

X_val = validation.drop(columns = ['r(t+1)'])
y_val = validation['r(t+1)']

X_test = testing.drop(columns = ['r(t+1)'])
y_test = testing['r(t+1)']

X_train_combined = training_combined.drop(columns = ['r(t+1)'])
y_train_combined = training_combined['r(t+1)']

In [2]:
## NN

import keras
from keras import layers
from keras import Sequential

# convert dataframes to numpy array

X_train_nn = np.asarray(X_train.values)
y_train_nn = np.asarray(y_train.values)

X_val_nn = np.asarray(X_val.values)
y_val_nn = np.asarray(y_val.values)

X_test_nn = np.asarray(X_test.values)
y_test_nn = np.asarray(y_test.values)

X_train_combined_nn = np.asarray(X_train_combined.values)
y_train_combined_nn = np.asarray(y_train_combined.values)




## model
{'epoch': 100, 'learning_rate': 0.01, 'batch_size': 3000} 

In [12]:
import tensorflow as tf
# import tensorflow_addons as tfa
import random

def r2_score_wo_demeaning_nn(y_true, y_pred):
    ss_res = tf.reduce_sum(tf.square(y_true - y_pred))
    ss_tot = tf.reduce_sum(tf.square(y_true - 0))
    r2 = 1 - (ss_res / ss_tot)
    return r2

In [4]:
y_test_nn.mean()

-0.021094502635160946

In [5]:
from sklearn.metrics import r2_score

In [13]:
random_seed = 77

np.random.seed(random_seed)
tf.random.set_seed(random_seed)
random.seed(random_seed)

# nn3 model

model_nn3 = Sequential([
            layers.Dense(32, activation='relu', input_dim=20),
            layers.Dense(16, activation='relu'),
            layers.Dense(8, activation='relu'),
            layers.Dense(1, activation='linear')
        ])

model_nn3.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                      loss='mean_squared_error', metrics=r2_score_wo_demeaning_nn)

history = model_nn3.fit(X_train_combined_nn, y_train_combined_nn, epochs=100, batch_size=2500,
                            validation_data=(X_test_nn, y_test_nn), verbose=0)





In [16]:
# history.history

In [15]:
last_epoch_metric = history.history['val_r2_score_wo_demeaning_nn'][-1]

last_epoch_metric

-0.0419025793671608

# HW 5

In [17]:
model = model_nn3

In [18]:
X = df_scaled_2.dropna().drop(columns = ['r(t+1)'])

sort data:<br>
by dolvol<br>

In [19]:
X = X.sort_values(by=["dolvol"]).reset_index(drop = True)

In [20]:
print(min, X['dolvol'].min())
print(max, X['dolvol'].max())

<built-in function min> -4.749755120876968
<built-in function max> 1.9821470349015913


create 5 bins

In [21]:
bins = list()
end = 0
start = 0
size = int(113000/5)

for i in range(5):
    end = end + size
    tmp = X.iloc[start:end,:]
    bins.append(tmp)
    start = start + size

In [22]:
# to make sure bins are correct

for item in bins:
    print(len(item))
    print(item['dolvol'].max())
    print(item['dolvol'].min())
    print('------')

22600
-0.9093693679629887
-4.749755120876968
------
22600
-0.17561548513264147
-0.9093631545240612
------
22600
0.35791880105283025
-0.1756097149360904
------
22600
0.8891905133723533
0.3579209861792339
------
22600
1.9821470349015913
0.8892152394947396
------


## LIME
features of interest: dolvol, baspred, sp
***
unclear instruction -> assumptions: 1) randomly draw a sample first  2) don't perturb data and draw 10 other instances to fit a local model

In [None]:
for df in bins:
    x = df.sample(1).values[0]
    
    x_sample = df.sample(10).values[0]

## Shapley
features of interest: dolvol(8), baspred(11), sp(17)

In [23]:
from itertools import combinations
import math

In [58]:

M = 10  # the number of random samples to calculate marginal contribution from
n_features = len(X.columns)
feature_idxs = [7, 10, 16]


for df in bins:  # for each bin
    x = df.sample(1).values[0]  # instance of interest
    shapley_values = []
    
    for i in range(3):   # for each feature of interest
        x_idx = [7, 10, 16]
        marg_contri_feature = []
    
        feature_of_interest = feature_idxs[i]
        print("Index of feature of interest: ", feature_of_interest)
        x_idx.remove(feature_of_interest)
        
        all_combinations = []  # all coalitions of features minus the feature of interest
        for r in range(1, len(x_idx) + 1):
            combinations_of_length_r = list(combinations(x_idx, r))
            all_combinations.extend(combinations_of_length_r)
            
        for coalition in all_combinations:  # for each coalition
            coalition = list(coalition)
            print('the current coalition: ', coalition)
            weight = math.factorial(len(coalition)) * math.factorial(n_features - len(coalition)-1)/ math.factorial(n_features)
            print("weight of this coalition: ", weight)
            
            marg_contri_coalition = []  # marginal contributions of this coalition
            
            for _ in range(M):
                z = df.sample(1).values[0]  # random select sample    

                # construct two new instances
                x_orig = np.array([x[i] if i in coalition + [feature_of_interest] else z[i] for i in range(n_features)])
                x_rando = np.array([x[i] if i in coalition else z[i] for i in range(n_features)])             
                
                # calculate marginal contribution
                marginal_contribution = model.predict(x_orig.reshape(1, -1))[0][0] - model.predict(x_rando.reshape(1, -1))[0][0]
                marg_contri_coalition.append(marginal_contribution)

            sigma_j = sum(marg_contri_coalition) / len(marg_contri_coalition)
            shapley_coalition = sigma_j*weight  # for one coaltion examined for feature of interest
            marg_contri_feature.append(shapley_coalition)

        shapley_j = sum(marg_contri_feature)  # shapley value of feature j
        shapley_values.append(shapley_j)
        
        print("feature of interest: ", feature_of_interest)
        print("shapley value of this feature: ", shapley_j)
        print('--------')
        print('--------')


Index of feature of interest:  7
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [10, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  7
shapley value of this feature:  -0.00025169260282483365
--------
--------
Index of feature of interest:  10
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  10
shapley value of this feature:  -0.00036963351250609807
--------
--------
Index of feature of interest:  16
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [10]
weight of this coalition:  0.002631578947368421


the current coalition:  [7, 10]
weight of this coalition:  0.00029239766081871346
feature of interest:  16
shapley value of this feature:  -0.00048436261931358025
--------
--------
Index of feature of interest:  7
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [10, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  7
shapley value of this feature:  -4.663699532229976e-05
--------
--------
Index of feature of interest:  10
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 16]
weight of this coalition:  0.00029239766081871346


feature of interest:  10
shapley value of this feature:  -0.00013401815598307733
--------
--------
Index of feature of interest:  16
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 10]
weight of this coalition:  0.00029239766081871346
feature of interest:  16
shapley value of this feature:  1.6175380918845444e-05
--------
--------
Index of feature of interest:  7
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [10, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  7
shapley value of this feature:  -3.8926370730569134e-05
--------
--------
Index of feature of interest:  10
the current coalition:  [7]
weight of this coalition:  0.002631578947368421


the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  10
shapley value of this feature:  -0.00016359665300370307
--------
--------
Index of feature of interest:  16
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 10]
weight of this coalition:  0.00029239766081871346
feature of interest:  16
shapley value of this feature:  -9.481523872206086e-05
--------
--------
Index of feature of interest:  7
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421


the current coalition:  [10, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  7
shapley value of this feature:  2.69184989190241e-05
--------
--------
Index of feature of interest:  10
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  10
shapley value of this feature:  -0.00020986255666796574
--------
--------
Index of feature of interest:  16
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 10]
weight of this coalition:  0.00029239766081871346


feature of interest:  16
shapley value of this feature:  -0.0003419389877174246
--------
--------
Index of feature of interest:  7
the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [10, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  7
shapley value of this feature:  4.501315574461256e-05
--------
--------
Index of feature of interest:  10
the current coalition:  [7]
weight of this coalition:  0.002631578947368421
the current coalition:  [16]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 16]
weight of this coalition:  0.00029239766081871346
feature of interest:  10
shapley value of this feature:  0.00018886326648136985
--------
--------
Index of feature of interest:  16
the current coalition:  [7]
weight of this coalition:  0.002631578947368421


the current coalition:  [10]
weight of this coalition:  0.002631578947368421
the current coalition:  [7, 10]
weight of this coalition:  0.00029239766081871346
feature of interest:  16
shapley value of this feature:  0.00013144437077302725
--------
--------


In [60]:
shapley = zip(feature_idxs, shapley_values)

In [63]:
for feature, value in shapley:
    print(feature)
    print(value)
    print('----')

7
4.501315574461256e-05
----
10
0.00018886326648136985
----
16
0.00013144437077302725
----
