In [24]:
## import standard modules for data handling and visualization
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

## import model specific modules
import cplex as cp
import slim_python as slim
import shap
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)

In [25]:
cd ..

C:\Users\danie\Documents\StageDaniel


In [26]:
cd research

C:\Users\danie\Documents\StageDaniel\research


In [27]:
# Filtering switches, named after the `only_correct` / `no_zeros` parameters
# of `preprocess_results` defined further down in this notebook.
only_correct = True
no_zeros = True

In [28]:
# haberman and breastcancer give large slim values
# mushroom, bankruptcy and spambase give large ebm values
# mushroom creates high sparsity for all models except EBM

# Pieces of the pickle file names: '<dataset>_models_<timelimit>' and
# '<dataset>_data_<timelimit>'.
dataset = 'simulation_test'
timelimit = '3600'
def load_models(name):
    """Unpickle and return the object stored at ``results/models/<name>.pkl``.

    The path is relative to the current working directory.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserialising, so this must only be pointed at trusted files.
    """
    pickle_path = 'results/models/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
# Load the pickled fitted models and the matching data for the selected
# dataset / time limit. `models` is later indexed positionally and `data`
# by keys such as 'X_train' and 'X_test'.
models = load_models('{}_models_{}'.format(dataset, timelimit))
data = load_models('{}_data_{}'.format(dataset, timelimit))

In [29]:
# data['X_test']

In [30]:
# Training column names with an '(Intercept)' label placed in front.
X_names = ['(Intercept)'] + list(data['X_train'].columns.values)

In [31]:
def extract_results(results, X):
    """Unpack the fitted models and pre-compute what the explanations need.

    Parameters
    ----------
    results : sequence
        Read as ``results[0][1]['rho']`` (SLIM coefficients), ``results[1]``
        (EBM), ``results[2]`` (model handed to ``shap.TreeExplainer``) and
        ``results[3]`` (logit model).
    X : feature data
        Passed to ``pred_slim`` and to the SHAP explainer.

    Returns
    -------
    tuple
        ``(rho, slim_predictions, ebm, explainer, shap_values, logit)``.
    """
    # SLIM: coefficient vector and the predictions it yields on X.
    rho = results[0][1]['rho']
    slim_predictions = pred_slim(X, rho)

    ebm_model = results[1]

    # SHAP values for the tree model stored in position 2.
    tree_explainer = shap.TreeExplainer(results[2])
    shap_values = tree_explainer.shap_values(X)

    logit_model = results[3]

    return rho, slim_predictions, ebm_model, tree_explainer, shap_values, logit_model


def get_explanations(results, X, y, pred_idx=0, printing=False):
    """Build a per-model table of additive contributions for one observation.

    Parameters
    ----------
    results : tuple
        ``(rho, slim_predictions, ebm, explainer, shap_values, logit)`` in the
        order returned by ``extract_results``.
    X : pandas.DataFrame
        Feature matrix; the row at position ``pred_idx`` is explained. Its
        column names label the EBM and SHAP contributions.
    y : pandas object supporting ``.iloc``
        Labels, aligned positionally with ``X``.
    pred_idx : int, default 0
        Positional index of the observation to explain.
    printing : bool, default False
        When true, display the table and print every model's prediction and
        the true label.

    Returns
    -------
    pandas.DataFrame
        Rows ``'X', 'slim', 'logit', 'ebm', 'shap'``; columns ``'Constant'``,
        one per feature, ``'prediction'`` and ``'sum'``. Note that ``'sum'``
        adds up every other column, including ``'Constant'`` and
        ``'prediction'``.
    """
    rho, slim_predictions, ebm, explainer, shap_values, logit = results

    x_row = X.iloc[pred_idx]
    y_true = y.iloc[pred_idx]
    # Use the columns of the frame actually being explained instead of a
    # notebook-level global, so the function works for any X it is given.
    feature_names = list(X.columns)

    def _as_row(constant, contributions, prediction):
        # Series.append was removed in pandas 2.0; pd.concat is the replacement.
        return pd.concat([
            pd.Series(constant, index=['Constant']),
            contributions,
            pd.Series(prediction, index=['prediction']),
        ])

    def _label(score):
        # `score` may be a scalar or a size-1 array (e.g. when the intercept is
        # an array); .item() converts either to a Python bool explicitly.
        return int(np.asarray(score > 0).item())

    observation = _as_row(1, x_row, y_true)

    # SLIM: rho[0] is the constant, rho[1:] the per-feature weights.
    slim_contributions = x_row * rho[1:]
    slim_prediction = slim_predictions.iloc[pred_idx]
    slim_explanation = _as_row(rho[0], slim_contributions, slim_prediction)

    # Logit: coefficient * feature value, plus the intercept.
    logit_contributions = x_row * logit.coef_.ravel()
    logit_intercept = logit.intercept_
    logit_prediction = _label(sum(logit_contributions) + logit_intercept)
    logit_explanation = _as_row(logit_intercept, logit_contributions, logit_prediction)

    # EBM: local explanation of the single selected row.
    ebm_local = ebm.explain_local(X.iloc[pred_idx:pred_idx+1], y.iloc[pred_idx:pred_idx+1], name='EBM')
    ebm_data = ebm_local.data(0)
    ebm_contributions = pd.Series(ebm_data['scores'], index=feature_names)
    ebm_mean = ebm_data['extra']['scores'][0]
    ebm_prediction = _label(sum(ebm_contributions) + ebm_mean)
    ebm_explanation = _as_row(ebm_mean, ebm_contributions, ebm_prediction)

    # SHAP: pre-computed values for this row plus the explainer's expected value.
    shap_contributions = pd.Series(shap_values[pred_idx, :], index=feature_names)
    shap_mean = explainer.expected_value
    shap_prediction = _label(sum(shap_contributions) + shap_mean)
    shap_explanation = _as_row(shap_mean, shap_contributions, shap_prediction)

    explanations = pd.DataFrame(
        [observation, slim_explanation, logit_explanation, ebm_explanation, shap_explanation],
        index=['X', 'slim', 'logit', 'ebm', 'shap'],
    )
    explanations['sum'] = explanations[list(explanations.columns)].sum(axis=1)

    if printing:
        display(explanations)
        print("prediction slim: ", slim_prediction)
        print("prediction logit: ", logit_prediction)
        print("prediction ebm: ", ebm_prediction)
        print("prediction shap: ", shap_prediction)
        print("true y: ", y_true)

    return explanations

## Predictions of a SLIM scoring system
def pred_slim(X, rho):
    """Return 0/1 predictions for the rows of ``X``.

    ``rho[0]`` is added to the dot product of ``X`` with ``rho[1:]``; a score
    that is ``>= 0`` maps to 1, anything else to 0.
    """
    offset, weights = rho[0], rho[1:]
    scores = X.dot(weights) + offset
    return (scores >= 0) * 1

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated with NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator


def force_plot(explanations, model, X, pred_idx=0, link='identity'):
    """Draw a SHAP force plot from one row of an explanation table.

    Parameters
    ----------
    explanations : pandas.DataFrame
        Table with one row per model; the first column is used as the base
        value and columns ``1:-1`` as the contributions.
    model : label
        Row label selecting the model (looked up with ``.loc``).
    X : pandas.DataFrame
        Feature matrix; row ``pred_idx`` supplies the feature values.
    pred_idx : int, default 0
        Positional index of the observation.
    link : str, default 'identity'
        Forwarded to ``shap.force_plot``.

    Returns
    -------
    Whatever ``shap.force_plot`` returns.
    """
    model_row = explanations.loc[model]
    # Explicit positional access: `series[0]` on a label-indexed Series is
    # deprecated in recent pandas and will become a label lookup.
    base_value = model_row.iloc[0]
    # NOTE(review): if `explanations` is the table built by get_explanations,
    # its last two columns are 'prediction' and 'sum', so 1:-1 still includes
    # 'prediction' - confirm which table callers pass in.
    contributions = model_row.iloc[1:-1].values
    return shap.force_plot(base_value, contributions, X.iloc[pred_idx, :], link=link)

In [32]:
%%time

# Build one explanation table per test-set observation.
# N is also read as a global by preprocess_results, so it must match len(expl).
N = data['X_test'].shape[0]
expl = []
# Unpack the models and compute SLIM predictions / SHAP values once for the whole test set.
results = extract_results(models, data['X_test'])
for idx in range(N):
    expl.append(get_explanations(results,data['X_test'], data['y_test'], idx, printing=False))


Wall time: 9.9 s


In [33]:
def clean_prediction(df):
    """Return ``df`` if the mean of its 'prediction' column is exactly 0 or 1.

    For 0/1 values this means every row of the group carries the same value.
    Any other group yields ``None``.
    """
    mean_prediction = df['prediction'].mean()
    if mean_prediction == 0 or mean_prediction == 1:  # unanimous
        return df
    return None
    
def zero_filter(df):
    """Return ``df`` unless one of its rows is an all-zero vector.

    Intended for ``groupby(...).apply``: returning ``None`` signals that the
    group should be discarded.

    A row is treated as "zero" only when every entry is 0. Testing the row
    *sum* instead would also discard non-zero rows whose positive and negative
    contributions happen to cancel out.
    """
    has_zero_vector = (df == 0).all(axis=1).any()
    if has_zero_vector:
        return None
    return df
def preprocess_results(expl, only_correct=True, no_zeros=True):
    """Stack per-observation explanation tables and keep only contributions.

    Parameters
    ----------
    expl : list of pandas.DataFrame
        One table per observation, with model rows (including an ``'X'`` row)
        and the columns ``'Constant'``, ``'sum'`` and ``'prediction'`` next to
        the feature columns.
    only_correct : bool, default True
        Filter observations through ``clean_prediction`` before the ``'X'``
        row is dropped.
    no_zeros : bool, default True
        Filter observations through ``zero_filter`` after the non-contribution
        columns are dropped.

    Returns
    -------
    pandas.DataFrame
        Contributions indexed by ``('obs', 'model')``, without the ``'X'`` row
        and without the ``'Constant'``, ``'sum'`` and ``'prediction'`` columns.
    """
    # Derive the count from the input rather than from a notebook-level global,
    # so the keys always match the tables that are actually passed in.
    n_expl = len(expl)

    # add observation index
    complete_expl = pd.concat(expl, keys=["{:02d}".format(x) for x in range(n_expl)])
    complete_expl.index.names = ['obs', 'model']

    if only_correct:
        kept = complete_expl.groupby(level='obs').apply(clean_prediction)
        # Depending on the pandas version, apply() prepends the group key as an
        # extra index level; drop it so the index is ('obs', 'model') again.
        if len(kept.index.names) > 2:
            kept = kept.droplevel(0)
    else:
        kept = complete_expl

    # clean to only include the contributions
    contrib = kept.drop(['X'], level=1).drop(['Constant', 'sum', 'prediction'], axis=1)
    contrib.index = contrib.index.remove_unused_levels()

    if no_zeros:
        contrib = contrib.groupby(level='obs').apply(zero_filter)
        if len(contrib.index.names) > 2:
            contrib = contrib.droplevel(0)

    # Count the surviving observations directly instead of assuming that every
    # observation contributes exactly four model rows.
    n_kept = contrib.index.get_level_values('obs').nunique() if len(contrib) else 0

    print("Original number of explanations:  ", n_expl)
    print("Cleaned explanation set size:     ", n_kept)
    return contrib

In [34]:
# preprocess_results(expl, False, False)

In [35]:
from scipy import spatial

def cosine(df):
    """Cosine similarity between the first two rows of ``df``.

    Computed as one minus SciPy's cosine distance.
    """
    first_row, second_row = df.iloc[0], df.iloc[1]
    return 1 - spatial.distance.cosine(first_row, second_row)
def get_cosine_matrix(df, func=cosine):
    """Mean pairwise value of ``func`` between models, as a symmetric table.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by a MultiIndex whose second level holds the model names and
        which has a level named ``'obs'``.
    func : callable, default ``cosine``
        Applied to each per-observation group that contains only the rows of
        the two models being compared.

    Returns
    -------
    pandas.DataFrame
        Model-by-model matrix of per-pair means rounded to 3 decimals (the
        diagonal stays 0), plus an ``'Average cosine similarity'`` column
        holding each model's column sum divided by ``p - 1``.
    """
    model_names = list(df.index.levels[1])
    n_models = len(model_names)
    pairwise = np.zeros((n_models, n_models))

    for first in range(n_models):
        for second in range(first + 1, n_models):
            # Every model except the two under comparison is removed.
            others = [name for pos, name in enumerate(model_names) if pos not in (first, second)]
            pair_only = df.drop(others, level=1)
            pair_mean = pair_only.groupby(level='obs').apply(func).mean()
            pairwise[first, second] = pairwise[second, first] = round(pair_mean, 3)

    result = pd.DataFrame(pairwise, model_names, model_names)
    result['Average cosine similarity'] = result.sum() / (n_models - 1)
    return result

In [39]:
# Pairwise similarity between the models' explanations, computed on the
# training set of every benchmark dataset.
names = ['bankruptcy','haberman', 'breastcancer', 'mammo', 'spambase', 'mushroom', 'adult']
cosines = []

for dataname in names:
    # Reuse the configured time limit instead of repeating the literal.
    models = load_models(dataname + '_models_' + timelimit)
    data = load_models(dataname + '_data_' + timelimit)

    # X_names and N are kept as notebook-level names because helper functions
    # defined in earlier cells may read them as globals.
    X_names = list(data['X_train'].columns.values)
    X_names.insert(0, '(Intercept)')
    N = data['X_train'].shape[0]

    results = extract_results(models, data['X_train'])
    expl = [
        get_explanations(results, data['X_train'], data['y_train'], idx, printing=False)
        for idx in range(N)
    ]

    print("Dataset: ", dataname)
    # Honour the filter flags from the configuration cell.
    contrib = preprocess_results(expl, only_correct, no_zeros)

    cosine_distance = get_cosine_matrix(contrib)
    cosines.append(cosine_distance)

cosines_df = pd.concat(cosines, keys = names)
cosines_df

Dataset:  bankruptcy
Original number of explanations:   200
Cleaned explanation set size:      153
Dataset:  haberman
Original number of explanations:   244
Cleaned explanation set size:      179
Dataset:  breastcancer
Original number of explanations:   546
Cleaned explanation set size:      531
Dataset:  mammo
Original number of explanations:   768
Cleaned explanation set size:      573
Dataset:  spambase
Original number of explanations:   3680
Cleaned explanation set size:      3091
Dataset:  mushroom
Original number of explanations:   6499
Cleaned explanation set size:      4735
Dataset:  adult
Original number of explanations:   26048
Cleaned explanation set size:      16482


Unnamed: 0,Unnamed: 1,ebm,logit,shap,slim,Average cosine similarity
bankruptcy,ebm,0.0,0.935,0.924,0.83,0.896333
bankruptcy,logit,0.935,0.0,0.978,0.922,0.945
bankruptcy,shap,0.924,0.978,0.0,0.961,0.954333
bankruptcy,slim,0.83,0.922,0.961,0.0,0.904333
haberman,ebm,0.0,0.129,0.817,0.142,0.362667
haberman,logit,0.129,0.0,-0.074,0.976,0.343667
haberman,shap,0.817,-0.074,0.0,-0.075,0.222667
haberman,slim,0.142,0.976,-0.075,0.0,0.347667
breastcancer,ebm,0.0,-0.092,0.891,-0.068,0.243667
breastcancer,logit,-0.092,0.0,-0.076,0.679,0.170333


In [40]:
# Standard deviation of every similarity column across datasets, per model.
# Note: this renames the index levels of cosines_df in place.
cosines_df.index.names = ['data', 'model']
stds = (
    cosines_df
    .groupby(level='model')
    .std()
    .round(3)
    .reindex(['slim', 'logit', 'ebm', 'shap'])
)
stds

Unnamed: 0_level_0,ebm,logit,shap,slim,Average cosine similarity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
slim,0.336,0.192,0.377,0.0,0.237
logit,0.388,0.0,0.403,0.192,0.253
ebm,0.0,0.388,0.344,0.336,0.303
shap,0.344,0.403,0.0,0.377,0.246


In [41]:
# Mean of every similarity column across datasets, per model
# (relies on the index level being named 'model').
avgs = (
    cosines_df
    .groupby(level='model')
    .mean()
    .round(3)
    .reindex(['slim', 'logit', 'ebm', 'shap'])
)
avgs

Unnamed: 0_level_0,ebm,logit,shap,slim,Average cosine similarity
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
slim,0.321,0.765,0.457,0.0,0.515
logit,0.385,0.0,0.46,0.765,0.537
ebm,0.0,0.385,0.637,0.321,0.448
shap,0.637,0.46,0.0,0.457,0.518


In [42]:
# Print the model-by-model mean matrix (without the average column) as LaTeX.
print(
    avgs
    .drop(columns=['Average cosine similarity'])[['slim','logit','ebm', 'shap']]
    .to_latex()
)

\begin{tabular}{lrrrr}
\toprule
{} &   slim &  logit &    ebm &   shap \\
model &        &        &        &        \\
\midrule
slim  &  0.000 &  0.765 &  0.321 &  0.457 \\
logit &  0.765 &  0.000 &  0.385 &  0.460 \\
ebm   &  0.321 &  0.385 &  0.000 &  0.637 \\
shap  &  0.457 &  0.460 &  0.637 &  0.000 \\
\bottomrule
\end{tabular}



In [43]:
# Similarity matrix of the first dataset processed in the loop (first entry of `names`).
cosines[0]

Unnamed: 0,ebm,logit,shap,slim,Average cosine similarity
ebm,0.0,0.935,0.924,0.83,0.896333
logit,0.935,0.0,0.978,0.922,0.945
shap,0.924,0.978,0.0,0.961,0.954333
slim,0.83,0.922,0.961,0.0,0.904333
