In [1]:
## import standard modules for data handling and visualization
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

## import model specific modules
import cplex as cp
import slim_python as slim
import shap
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)

In [2]:
cd ..

C:\Users\danie\Documents\StageDaniel\research


In [4]:
cd research

[WinError 2] Het systeem kan het opgegeven bestand niet vinden: 'research'
C:\Users\danie\Documents\StageDaniel\research


In [4]:
# Filtering switches; they carry the same names as the keyword arguments of
# preprocess_results.
only_correct=True
no_zeros = True

In [5]:
# Observations about the available datasets:
# - haberman and breastcancer give large SLIM values
# - mushroom, bankruptcy and spambase give large EBM values
# - mushroom creates high sparsity for all models except EBM

dataset = 'simulation_test'
def load_models(name):
    """Unpickle and return the object stored in 'results/models/<name>.pkl'.

    The path is relative to the current working directory.
    """
    pickle_path = ''.join(['results/models/', name, '.pkl'])
    with open(pickle_path, 'rb') as stream:
        payload = pickle.load(stream)
    return payload
# Load the pickled models and the matching data for the selected dataset
# (files '<dataset>_models_3600.pkl' and '<dataset>_data_3600.pkl').
models = load_models('{}_models_3600'.format(dataset))
data = load_models('{}_data_3600'.format(dataset))

In [6]:
# data['X_test']

In [7]:
# Feature names of the training frame, preceded by a label for the intercept.
X_names = ['(Intercept)'] + list(data['X_train'].columns.values)

In [8]:
def extract_results(results, X):
    """Unpack the fitted models and pre-compute what the explanations need.

    ``results`` is read positionally: entry 0 holds the SLIM output (its
    element 1 is a mapping with a 'rho' entry), entries 1, 2 and 3 are used
    as the EBM, the XGBoost model and the logit model respectively.

    Returns the tuple
    ``(rho, slim_predictions, ebm, explainer, shap_values, logit)`` where
    ``slim_predictions`` comes from ``pred_slim(X, rho)`` and ``explainer`` /
    ``shap_values`` come from a SHAP TreeExplainer built on the XGBoost model.
    """
    slim_output, ebm, xgb_model, logit = results[0], results[1], results[2], results[3]

    # SLIM: coefficient vector and the predictions it yields on X
    rho = slim_output[1]['rho']
    slim_predictions = pred_slim(X, rho)

    # XGBoost: SHAP values for every row of X
    explainer = shap.TreeExplainer(xgb_model)
    shap_values = explainer.shap_values(X)

    return rho, slim_predictions, ebm, explainer, shap_values, logit


def get_explanations(results, X, y, pred_idx=0, printing=False):
    """Build the per-model explanation table for a single observation.

    Parameters
    ----------
    results : tuple
        ``(rho, slim_predictions, ebm, explainer, shap_values, logit)`` in the
        order returned by ``extract_results``.
    X : pandas.DataFrame
        Feature matrix; its columns label the contributions.
    y : pandas.Series
        Labels aligned with ``X``; the entry at ``pred_idx`` is stored in the
        'prediction' column of the 'X' row and printed as the true y.
    pred_idx : int, default 0
        Positional index of the observation to explain.
    printing : bool, default False
        When true, display the table and print every model's prediction.

    Returns
    -------
    pandas.DataFrame
        Rows 'X', 'slim', 'logit', 'ebm', 'shap'; columns 'Constant', one per
        feature, 'prediction' and 'sum' (row total over all other columns).
    """
    rho, slim_predictions, ebm, explainer, shap_values, logit = results

    row = X.iloc[pred_idx]
    # Label the contributions with the columns of X itself instead of a
    # notebook-level name list, so the function does not depend on kernel state.
    feature_names = list(X.columns)

    def _assemble(constant, contributions, prediction):
        # Series.append was removed in pandas 2.0; pd.concat is the supported
        # way to glue the three pieces together.
        return pd.concat([
            pd.Series(constant, index=['Constant']),
            contributions,
            pd.Series(prediction, index=['prediction']),
        ])

    observation = _assemble(1, row, y.iloc[pred_idx])

    # SLIM: coefficient times feature value, rho[0] is the intercept
    slim_contributions = row * rho[1:]
    slim_threshold = rho[0]
    slim_prediction = slim_predictions.iloc[pred_idx]
    slim_explanation = _assemble(slim_threshold, slim_contributions, slim_prediction)

    # Logit: intercept_ is an array, take its single entry as a scalar so the
    # comparison below yields a plain boolean
    logit_contributions = row * logit.coef_.ravel()
    logit_intercept = float(np.ravel(logit.intercept_)[0])
    logit_prediction = int(sum(logit_contributions) + logit_intercept > 0)
    logit_explanation = _assemble(logit_intercept, logit_contributions, logit_prediction)

    # EBM: local explanation of the one-row slice
    ebm_local = ebm.explain_local(X.iloc[pred_idx:pred_idx+1], y.iloc[pred_idx:pred_idx+1], name='EBM')
    ebm_contributions = pd.Series(ebm_local.data(0)['scores'], index=feature_names)
    ebm_mean = ebm_local.data(0)['extra']['scores'][0]
    ebm_prediction = int(sum(ebm_contributions) + ebm_mean > 0)
    ebm_explanation = _assemble(ebm_mean, ebm_contributions, ebm_prediction)

    # SHAP: expected_value may be a scalar or a 1-element array
    shap_contributions = pd.Series(shap_values[pred_idx, :], index=feature_names)
    shap_mean = float(np.ravel(explainer.expected_value)[0])
    shap_prediction = int(sum(shap_contributions) + shap_mean > 0)
    shap_explanation = _assemble(shap_mean, shap_contributions, shap_prediction)

    explanations = pd.DataFrame(
        [observation, slim_explanation, logit_explanation, ebm_explanation, shap_explanation],
        index=['X', 'slim', 'logit', 'ebm', 'shap'],
    )
    explanations['sum'] = explanations.sum(axis=1)
    if printing:
        display(explanations)
        print("prediction slim: ", slim_prediction)
        print("prediction logit: ", logit_prediction)
        print("prediction ebm: ", ebm_prediction)
        print("prediction shap: ", shap_prediction)
        print("true y: ", y.iloc[pred_idx])

    return explanations

def pred_slim(X, rho):
    """Predictions of a SLIM scoring system.

    ``rho[0]`` is added as the intercept and ``rho[1:]`` are the feature
    weights; an observation gets 1 when its score is >= 0 and 0 otherwise.
    """
    intercept, weights = rho[0], rho[1:]
    scores = X.dot(weights) + intercept
    return (scores >= 0) * 1

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated with numpy."""
    decay = np.exp(-x)
    return 1 / (1 + decay)


def force_plot(explanations, model, X, pred_idx=0, link='identity'):
    """Draw a SHAP force plot for one model's row of an explanation table.

    Parameters
    ----------
    explanations : pandas.DataFrame
        Table whose row ``model`` holds the base value in its first column
        and one contribution per column of ``X``.
    model : str
        Row label to plot.
    X : pandas.DataFrame
        Feature matrix; row ``pred_idx`` supplies the displayed feature values.
    pred_idx : int, default 0
        Positional index of the observation in ``X``.
    link : str, default 'identity'
        Passed through to ``shap.force_plot``.

    Returns whatever ``shap.force_plot`` returns.
    """
    model_row = explanations.loc[model]
    features = X.iloc[pred_idx, :]
    # Pick the contributions by feature label. A positional slice [1:-1] also
    # swallows the trailing 'prediction' column whenever the table carries
    # both 'prediction' and 'sum', which makes the contribution vector one
    # element longer than the feature vector.
    contributions = model_row.loc[features.index].values
    # .iloc keeps the "first column is the base value" contract without the
    # deprecated positional Series[0] lookup.
    base_value = model_row.iloc[0]
    return shap.force_plot(base_value, contributions, features, link=link)

In [9]:
%%time

N = data['X_test'].shape[0]
expl = []
results = extract_results(models, data['X_test'])
for idx in range(N):
    expl.append(get_explanations(results,data['X_test'], data['y_test'], idx, printing=False))


Setting feature_perturbation = "tree_path_dependent" because no background data was given.


Wall time: 10.4 s


In [10]:
def clean_prediction(df):
    """Return ``df`` when its 'prediction' column is unanimous, otherwise None.

    Unanimous means the column mean is exactly 0 or exactly 1.
    """
    share_of_ones = df['prediction'].mean()
    if share_of_ones == 0 or share_of_ones == 1:
        return df
    return None
    
def zero_filter(df):
    """Return ``df`` unless one of its rows consists of zeros only, else None.

    A row counts as empty when every entry is 0 (missing values are treated
    as 0). The check is done per entry rather than on the row sum, because a
    row whose non-zero contributions cancel out (e.g. -5 and +5) also sums to
    zero although it is a perfectly usable explanation.
    """
    has_empty_row = df.fillna(0).eq(0).all(axis=1).any()
    if has_empty_row:
        return None
    return df
def preprocess_results(expl, only_correct=True, no_zeros=True):
    """Stack per-observation explanation tables and keep only the contributions.

    Parameters
    ----------
    expl : list of pandas.DataFrame
        One explanation table per observation, indexed by model name with an
        additional 'X' row and the columns 'Constant', 'sum' and 'prediction'
        next to the feature columns.
    only_correct : bool, default True
        Keep only observations accepted by ``clean_prediction``.
    no_zeros : bool, default True
        Keep only observations accepted by ``zero_filter``.

    Returns
    -------
    pandas.DataFrame
        Contributions indexed by ('obs', 'model'), without the 'X' rows and
        without the 'Constant', 'sum' and 'prediction' columns.
    """
    # Number the observations from the list itself rather than from a
    # notebook-level counter, so the keys always match what was passed in.
    n_obs = len(expl)
    complete_expl = pd.concat(expl, keys=["{:02d}".format(x) for x in range(n_obs)])
    complete_expl.index.names = ['obs', 'model']

    if only_correct:
        complete_expl = complete_expl.groupby(level='obs').apply(clean_prediction)
        # apply() may prepend the group key as an extra index level; strip it when present
        if len(complete_expl.index.names) > 2:
            complete_expl = complete_expl.droplevel(0)

    # Reduce the table to the contributions only
    contrib = complete_expl.drop(['X'], level=1).drop(['Constant', 'sum', 'prediction'], axis=1)
    contrib.index = contrib.index.remove_unused_levels()

    if no_zeros:
        contrib = contrib.groupby(level='obs').apply(zero_filter)
        if len(contrib.index.names) > 2:
            contrib = contrib.droplevel(0)

    # Count the surviving observations directly instead of assuming four models each
    n_kept = contrib.index.get_level_values(0).nunique()
    print("Original number of explanations:  ", n_obs)
    print("Cleaned explanation set size:     ", n_kept)
    return contrib

In [16]:
# Unfiltered contributions: both filters switched off.
preprocess_results(expl, only_correct=False, no_zeros=False)

Original number of explanations:   1000
Cleaned explanation set size:      1000


Unnamed: 0_level_0,Unnamed: 1_level_0,bin_1,bin_2,bin_3,bin_4,cont_1,cont_2,cont_3,cont_4
obs,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
00,slim,-5.000000,-5.000000,0.000000,4.000000,0.000000,0.000000,3.600000,1.280000
00,logit,-18.335640,-18.673511,0.000000,18.479273,0.000000,0.018533,12.940393,5.811333
00,ebm,-8.497525,-8.671282,-8.900065,8.643842,0.394433,0.369318,2.281275,-4.930410
00,shap,-4.193167,-3.241052,-3.311318,2.310616,0.106954,-0.023586,1.532789,-1.491999
01,slim,-0.000000,-0.000000,0.000000,0.000000,0.000000,0.000000,2.650000,1.000000
...,...,...,...,...,...,...,...,...,...
998,shap,-3.806949,-3.008247,2.232424,-3.656013,-0.011013,-0.028196,-0.164323,-3.026967
999,slim,-0.000000,-5.000000,0.000000,4.000000,0.000000,0.000000,4.650000,0.240000
999,logit,-0.000000,-18.673511,0.000000,18.479273,0.000000,0.025225,16.714675,1.089625
999,ebm,8.721371,-8.671282,-8.900065,8.643842,0.504802,0.448461,7.345369,-7.536616


In [12]:
from scipy.stats import pearsonr, spearmanr
def pearson(df):
    """Pearson correlation coefficient between the first two rows of ``df``."""
    first_row, second_row = df.iloc[0], df.iloc[1]
    outcome = pearsonr(first_row, second_row)
    return outcome[0]
def spearman(df):
    """Spearman rank correlation between the first two rows of ``df``."""
    first_row, second_row = df.iloc[0], df.iloc[1]
    outcome = spearmanr(first_row, second_row)
    return outcome[0]
def get_pearson_matrix(df, func=pearson):
    """Mean pairwise agreement between models, as a symmetric table.

    ``df`` is indexed by ('obs', model); for every pair of models the rows of
    all other models are dropped, ``func`` is applied per observation and the
    results are averaged and rounded to three decimals. The diagonal stays 0.
    The extra column 'Average correlation' is the column sum divided by the
    number of other models.
    """
    models = list(df.index.levels[1])
    p = len(models)
    distance_matrix = np.zeros((p, p))

    for i, first in enumerate(models):
        for j in range(i + 1, p):
            second = models[j]
            # keep only the two models being compared
            others = [name for name in models if name not in (first, second)]
            per_obs = df.drop(others, level=1).groupby(level='obs').apply(func)
            mean_score = round(per_obs.mean(), 3)
            distance_matrix[i, j] = mean_score
            distance_matrix[j, i] = mean_score

    distance_df = pd.DataFrame(distance_matrix, index=models, columns=models)
    distance_df['Average correlation'] = distance_df.sum() / (p - 1)
    return distance_df

In [14]:
# names = ['bankruptcy','haberman', 'breastcancer', 'mammo', 'spambase', 'mushroom', 'adult']
# names = ['bankruptcy', 'haberman']
names = ['simulation_test']
corrs = []

# Per dataset: load models and data, explain every test observation and
# collect the pairwise correlation matrix of the unfiltered contributions.
for dataname in names:
    models = load_models('{}_models_3600'.format(dataname))
    data = load_models('{}_data_3600'.format(dataname))
    X_names = ['(Intercept)'] + list(data['X_train'].columns.values)

    N = data['X_test'].shape[0]
    results = extract_results(models, data['X_test'])
    expl = [
        get_explanations(results, data['X_test'], data['y_test'], idx, printing=False)
        for idx in range(N)
    ]

    print("Dataset: ", dataname)
    contrib = preprocess_results(expl, False, False)

    pearson_distance = get_pearson_matrix(contrib)
    corrs.append(pearson_distance)

corrs_df = pd.concat(corrs, keys = names)
corrs_df

Dataset:  simulation_test
Original number of explanations:   1000
Cleaned explanation set size:      1000


Unnamed: 0,Unnamed: 1,slim,logit,ebm,shap,Average correlation
simulation_test,slim,0.0,0.996,0.592,0.59,0.726
simulation_test,logit,0.996,0.0,0.598,0.596,0.73
simulation_test,ebm,0.592,0.598,0.0,0.978,0.722667
simulation_test,shap,0.59,0.596,0.978,0.0,0.721333


In [15]:
# Name the index levels, then take the standard deviation per model across datasets.
corrs_df.index.names = ['data', 'model']
stds = (
    corrs_df
    .groupby(level='model')
    .std()
    .round(3)
    .reindex(['slim', 'logit', 'ebm', 'shap'])
)
stds

Unnamed: 0_level_0,slim,logit,ebm,shap,Average correlation
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
slim,0.0,0.199,0.296,0.279,0.211
logit,0.199,0.0,0.3,0.194,0.177
ebm,0.296,0.3,0.0,0.326,0.278
shap,0.279,0.194,0.326,0.0,0.169


In [16]:
# Mean per model across datasets; needs the 'model' index level to be named already.
avgs = (
    corrs_df
    .groupby(level='model')
    .mean()
    .round(3)
    .reindex(['slim', 'logit', 'ebm', 'shap'])
)
avgs

Unnamed: 0_level_0,slim,logit,ebm,shap,Average correlation
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
slim,0.0,0.733,0.358,0.518,0.536
logit,0.733,0.0,0.462,0.518,0.571
ebm,0.358,0.462,0.0,0.608,0.476
shap,0.518,0.518,0.608,0.0,0.548


In [169]:
# p_values is assigned by the t-test cell further down, so that cell has to
# be run before this one.
p_values.reindex(['slim', 'logit', 'ebm', 'shap'])

Unnamed: 0_level_0,ebm,logit,shap,slim,Average correlation
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
slim,0.0209,0.0001,0.0041,,0.0006
logit,0.0076,,0.0015,0.0001,0.0002
ebm,,0.0076,0.0027,0.0209,0.0044
shap,0.0027,0.0015,,0.0041,0.0002


In [13]:
from scipy.stats import t

def p_value(x, dof=6):
    """Two-sided p-value of the t statistic ``x``.

    Parameters
    ----------
    x : float or array-like
        The t statistic(s).
    dof : int, default 6
        The t distribution is evaluated with ``dof - 1`` degrees of freedom,
        mirroring ``t_stat`` where the same quantity is the number of rows.

    Returns
    -------
    float or numpy.ndarray
        ``2 * P(T >= |x|)``.
    """
    # The statistic itself has to be passed to t.sf (the function object
    # t_stat was passed before); abs() keeps the result two-sided for
    # negative statistics as well.
    return t.sf(np.abs(x), dof - 1) * 2

def t_stat(df):
    """Column-wise two-sided one-sample t-test of the mean against zero.

    Parameters
    ----------
    df : pandas.DataFrame
        One sample per row; every column is tested separately.

    Returns
    -------
    pandas.Series
        The p-value of every column, indexed by the column labels. Columns
        without variation yield NaN.
    """
    n_rows = len(df)
    standard_error = df.std() / np.sqrt(n_rows)
    statistic = df.mean() / standard_error
    # abs() makes the test two-sided for negative means too
    p_vals = t.sf(np.abs(statistic), n_rows - 1) * 2
    # Return a fresh Series: writing the p-values into the first row of df
    # would alter the caller's data.
    return pd.Series(p_vals, index=df.columns)
# p-values per model across datasets; corrs_df must exist and have its
# 'model' index level named before this runs.
p_values = (
    corrs_df
    .groupby(level='model')
    .apply(t_stat)
    .round(4)
    .reindex(['slim', 'logit', 'ebm', 'shap'])
)

NameError: name 'corrs_df' is not defined

In [172]:
# LaTeX table of the standard deviations for the logit, ebm and shap columns.
print(
    stds
    .drop(columns=['Average correlation', 'slim'])
    .loc[:, ['logit', 'ebm', 'shap']]
    .to_latex()
)

\begin{tabular}{lrrr}
\toprule
{} &  logit &    ebm &   shap \\
model &        &        &        \\
\midrule
slim  &  0.207 &  0.304 &  0.298 \\
logit &  0.000 &  0.312 &  0.271 \\
ebm   &  0.312 &  0.000 &  0.329 \\
shap  &  0.271 &  0.329 &  0.000 \\
\bottomrule
\end{tabular}



In [148]:
# Show the first entry of corrs. The recorded output has one row per model
# pair and one column per observation, so it appears to stem from the raw_cor
# loop defined further down rather than from the correlation-matrix loop above.
corrs[0]

obs,00,01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
slim_logit,0.900429,,0.880068,,0.900429,0.900429,0.943621,0.950199,0.880068,0.900429,0.900429,0.900429,1.0,0.943621,,1.0,0.900429,0.935193,0.943621,0.882011,0.900429,0.900429,0.900429,0.900429,0.900429,0.935193,,0.880068,0.900429,0.935193,0.900429,,1.0,0.900429,0.900429,0.900429,1.0,0.900429,1.0,0.943621,,0.935193,,0.900429,0.935193,0.935193,0.900429,0.943621,0.880068,0.882011
slim_ebm,0.798026,,0.733234,,0.798026,0.763243,0.862052,0.745435,0.733234,0.763243,0.858878,0.791882,,0.861731,,0.833136,0.818079,0.699078,0.861731,0.864546,0.796643,0.807949,0.797383,0.84686,0.858878,0.756725,,0.74288,0.858878,0.668478,0.791882,,,0.763243,0.811021,0.847574,0.756958,0.767123,0.756958,0.937858,,0.697271,,0.798026,0.670654,0.757439,0.799897,0.949444,0.769176,0.762767
slim_shap,0.9637,,0.946445,,0.9637,0.966608,0.981016,0.842098,0.946445,0.966608,0.966608,0.9637,,0.981016,,0.96805,0.962784,0.973786,0.981016,0.840764,0.966608,0.973786,0.966608,0.970029,0.966608,0.973786,,0.947046,0.966608,0.971912,0.9637,,,0.966608,0.965867,0.970029,0.971912,0.971912,0.971912,0.971912,,0.971912,,0.9637,0.971912,0.96805,0.9637,0.973786,0.931261,0.981634
logit_ebm,0.975345,0.500436,0.922389,0.767529,0.975345,0.95607,0.922063,0.869306,0.922389,0.95607,0.9506,0.975882,,0.946565,0.543674,0.833136,0.986326,0.796475,0.946565,0.966316,0.94421,0.938582,0.944331,0.952141,0.9506,0.832196,0.543674,0.959168,0.9506,0.754826,0.975882,0.912946,,0.95607,0.955923,0.952117,0.756958,0.863581,0.756958,0.934524,0.648804,0.789367,0.513369,0.975345,0.753903,0.840093,0.933294,0.954386,0.95459,0.961054
logit_shap,0.983761,-0.051173,0.979199,0.838084,0.983761,0.981692,0.983287,0.955582,0.979199,0.981692,0.981692,0.983761,,0.983287,-0.051131,0.96805,0.98412,0.943323,0.983287,0.990069,0.981692,0.97169,0.981692,0.973756,0.981692,0.943323,-0.051131,0.978533,0.981692,0.941508,0.983761,0.476671,,0.981692,0.982202,0.973756,0.971912,0.972816,0.971912,0.978908,0.067738,0.941508,-0.051399,0.983761,0.941508,0.937665,0.983761,0.977891,0.985707,0.952814
ebm_shap,0.926356,0.461317,0.879291,0.910521,0.926356,0.897335,0.926783,0.971547,0.879291,0.897335,0.934812,0.924715,0.607934,0.9223,0.482676,0.920537,0.942223,0.833637,0.9223,0.984896,0.900135,0.874968,0.900536,0.898957,0.934812,0.87408,0.482676,0.911233,0.934812,0.796155,0.924715,0.70101,0.478047,0.897335,0.916515,0.899236,0.840977,0.817758,0.840977,0.95399,0.488164,0.828724,0.381176,0.926356,0.795325,0.890588,0.901075,0.976668,0.903759,0.855924


In [145]:
names = ['bankruptcy','haberman', 'breastcancer', 'mammo', 'spambase', 'mushroom', 'adult']
corrs = []

# Per dataset: explain every test observation and collect the per-observation
# correlations for each pair of models (raw_cor).
for dataname in names:
    models = load_models('{}_models_3600'.format(dataname))
    data = load_models('{}_data_3600'.format(dataname))
    X_names = ['(Intercept)'] + list(data['X_train'].columns.values)

    N = data['X_test'].shape[0]
    results = extract_results(models, data['X_test'])
    expl = [
        get_explanations(results, data['X_test'], data['y_test'], idx, printing=False)
        for idx in range(N)
    ]

    # add observation index
    complete_expl = pd.concat(expl, keys=["{:02d}".format(x) for x in range(N)])
    complete_expl.index.names = ['obs', 'model']
    # clean to only include the contributions
    contrib = (
        complete_expl
        .drop(['X'], level=1)
        .drop(['Constant', 'sum', 'prediction'], axis=1)
    )
    contrib.index = contrib.index.remove_unused_levels()

    corrs.append(raw_cor(contrib))

# full_corrs_df = pd.concat(corrs, keys = names)
# full_corrs_df

In [131]:
def raw_cor(df):
    """Per-observation Pearson correlation for every pair of models.

    Parameters
    ----------
    df : pandas.DataFrame
        Contributions indexed by ('obs', model).

    Returns
    -------
    pandas.DataFrame
        One row per model pair, labelled '<first>_<second>', and one column
        per observation.
    """
    models = list(df.index.levels[1])
    p = len(models)
    correlation_list = []
    comparison = []

    for i in range(p):
        for j in range(i + 1, p):
            pair = (models[i], models[j])
            # keep only the two models being compared
            to_drop = [name for name in models if name not in pair]
            per_obs = df.drop(to_drop, level=1).groupby(level='obs').apply(pearson)
            correlation_list.append(per_obs)
            # Derive the label from the pair itself: a fixed label list only
            # fits four models in one particular order and would silently
            # mislabel the rows for any other index.
            comparison.append('{}_{}'.format(*pair))

    distance_df = pd.DataFrame(correlation_list, comparison)
    return distance_df

In [127]:
# NOTE(review): in the visible notebook correlation_list is only ever a local
# variable of raw_cor, so this cell presumably relies on a kernel variable
# left over from an earlier interactive run.
correlation_list[1]

obs
00    0.942511
01    0.500436
02    0.893712
03    0.687742
04    0.942511
05    0.921803
06    0.921679
07    0.828200
08    0.893712
09    0.921803
10    0.927479
11    0.942309
12         NaN
13    0.950752
14    0.543674
15    0.833136
16    0.954334
17    0.765545
18    0.950752
19    0.929065
20    0.915062
21    0.868050
22    0.915249
23    0.879322
24    0.927479
25    0.811755
26    0.543674
27    0.941932
28    0.927479
29    0.728203
30    0.942309
31    0.916947
32         NaN
33    0.921803
34    0.926906
35    0.879265
36    0.756958
37    0.797580
38    0.756958
39    0.921483
40    0.359248
41    0.760715
42    0.513369
43    0.942511
44    0.728665
45    0.816537
46    0.905848
47    0.942714
48    0.918267
49    0.930704
dtype: float64