# Read data

In [None]:
import re
import time

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


# One seed for both the row sub-sample and the train/validation split, so a
# fresh "Restart & Run All" reproduces the same rows and the same metrics.
RANDOM_STATE = 1

# Work on a 10,000-row sub-sample of the pickled training table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle('data/global_train_data.pkl').sample(
    10000, random_state=RANDOM_STATE)
# Drop every character that is not a letter, digit or underscore from the
# column names.
data = data.rename(columns=lambda x: re.sub('[^A-Za-z0-9_]+', '', x))
# Turn +/- infinity into NaN so the median imputer below fills them too.
data = data.replace([np.inf, -np.inf], np.nan)

# Get X and y from data
y = data['TARGET'].values
X = data.drop(['TARGET', 'SK_ID_CURR'], axis=1)

# Split BEFORE fitting any preprocessing: fitting the imputer/scaler on all
# rows would let validation-set statistics leak into the training features.
X_train_raw, X_valid_raw, y_train, y_valid = train_test_split(
    X, y, stratify=y, random_state=RANDOM_STATE)

# preprocessing steps (medians / means / variances learned from train only)
pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
X_train = pd.DataFrame(pipe.fit_transform(X_train_raw), columns=X.columns)
X_valid = pd.DataFrame(pipe.transform(X_valid_raw), columns=X.columns)

# Full preprocessed table, kept under its original name for any code that
# still reads it; it now uses the train-fitted pipeline.
X_prepro = pd.DataFrame(pipe.transform(X), columns=X.columns)

print('Shape of X_train:', X_train.shape)
print('Shape of X_valid:', X_valid.shape)

# Fitting models on the data

In [None]:
%%time

from sklearn.metrics import roc_auc_score
import lightgbm as lgb


# LightGBM hyper-parameters. Each key appears exactly once: the original
# literal listed 'learning_rate' twice, and in a dict literal the later entry
# silently overrides the earlier one.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_class': 1,
    'n_jobs': -1,
    # tree shape
    'max_depth': 18,
    'num_leaves': 30,
    'min_split_gain': 0.5,
    'min_child_weight': 1,
    'min_child_samples': 10,
    # boosting schedule
    'learning_rate': 0.02,
    'n_estimators': 1600,
    # histogram construction
    'max_bin': 512,
    # NOTE(review): 200 rows for building histogram bins looks unusually small
    # (the library default is far larger) — confirm this is intentional.
    'subsample_for_bin': 200,
    # row / column sub-sampling
    'subsample': 0.8,
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    # regularisation
    'reg_alpha': 80,
    'reg_lambda': 20,
    # up-weight the positive class
    'scale_pos_weight': 11.5,
}

LGB_clf = lgb.LGBMClassifier(**params)
LGB_clf.fit(X_train, y_train)

# evaluate predictions: AUC on the validation split, using the probability
# column for class 1
print('AUC: ', roc_auc_score(
    y_valid, LGB_clf.predict_proba(X_valid)[:, 1]))

# Explaining models

## LIME

In [None]:
from lime.lime_tabular import LimeTabularExplainer

def get_lime_explainer(model, data, labels):
    """Build a LIME tabular explainer in classification mode.

    Args:
        model: Unused; kept so existing call sites keep working.
        data: pandas DataFrame of training features. Its column names become
            the explainer's feature names and its values the training data.
        labels: Iterable of class labels; the distinct values, in sorted
            order, become the explainer's class names.

    Returns:
        A ``LimeTabularExplainer`` built from ``data``.
    """
    feat_names = list(data.columns)
    # Positions of columns stored with pandas' categorical dtype.
    cat_feat_ix = [i for i, dtype in enumerate(data.dtypes)
                   if isinstance(dtype, pd.CategoricalDtype)]
    # Sort so that class_names[k] lines up with column k of predict_proba;
    # iterating a bare set gives no ordering guarantee.
    class_names = sorted(set(labels))
    # LIME indexes its training data positionally (array[:, j]), which a
    # DataFrame does not support, so hand it the underlying numpy array.
    lime_explainer = LimeTabularExplainer(data.to_numpy(),
                                          feature_names=feat_names,
                                          class_names=class_names,
                                          categorical_features=cat_feat_ix,
                                          mode="classification"
                                          )
    return lime_explainer

def lime_explain(explainer, data, predict_method, num_features):
    """Explain a single instance with the given LIME explainer.

    Args:
        explainer: Object exposing ``explain_instance``.
        data: The instance to explain, forwarded unchanged.
        predict_method: Prediction callable, forwarded unchanged.
        num_features: Forwarded as the ``num_features`` keyword.

    Returns:
        Whatever ``explainer.explain_instance`` returns.
    """
    return explainer.explain_instance(
        data,
        predict_method,
        num_features=num_features,
    )

lime_data_explainations = []
lime_metrics = []
lime_explanation_time = []
feat_names = list(X.columns)

model_name = "LGB"
predict_method = LGB_clf.predict_proba

# Explain one held-out sample: row 6 of the validation split as a 1-D array.
# The original read `scaled_test_data[6]`, `elapsed_time` and `current_model`,
# none of which exist yet when the notebook is run top to bottom.
# NOTE(review): `top_x` (number of features to report) is not defined in the
# visible cells — confirm where it is set.
instance = X_valid.iloc[6].to_numpy()

# Time explainer construction plus the explanation itself.
start_time = time.time()
lime_explainer = get_lime_explainer(LGB_clf, X_train, y_train)
explanation = lime_explain(lime_explainer, instance, predict_method, top_x)
elapsed_time = time.time() - start_time

# Map each (feature index, weight) pair reported under label 1 to its
# feature name.
ex_holder = {}
for feat_index, ex in explanation.as_map()[1]:
    ex_holder[feat_names[feat_index]] = ex

lime_data_explainations.append(ex_holder)

# Compare LIME's local prediction with the model's class-1 probability for
# the same row.
actual_pred = predict_method(instance.reshape(1, -1))
perc_pred_diff = abs(actual_pred[0][1] - explanation.local_pred[0])
lime_explanation_time.append({"time": elapsed_time, "model": model_name})
lime_metrics.append({"lime class1": explanation.local_pred[0],
                     "actual class1": actual_pred[0][1],
                     "class_diff": round(perc_pred_diff, 3),
                     "model": model_name})

In [None]:
from lime.lime_tabular import LimeTabularExplainer


def get_lime_explainer(model, data, labels):
    """Build a LIME tabular explainer in classification mode.

    Args:
        model: Unused; kept so existing call sites keep working.
        data: pandas DataFrame of training features. Its column names become
            the explainer's feature names and its values the training data.
        labels: Iterable of class labels; the distinct values, in sorted
            order, become the explainer's class names.

    Returns:
        A ``LimeTabularExplainer`` built from ``data``.
    """
    feat_names = list(data.columns)
    # Sort so that class_names[k] lines up with column k of predict_proba;
    # iterating a bare set gives no ordering guarantee.
    class_names = sorted(set(labels))
    # LIME indexes its training data positionally (array[:, j]), which a
    # DataFrame does not support, so hand it the underlying numpy array.
    lime_explainer = LimeTabularExplainer(data.to_numpy(),
                                          feature_names=feat_names,
                                          class_names=class_names,
                                          mode="classification"
                                          )
    return lime_explainer


def lime_explain(explainer, data, predict_method, num_features):
    """Run ``explainer.explain_instance`` for one instance.

    Args:
        explainer: Object exposing ``explain_instance``.
        data: The instance to explain, forwarded unchanged.
        predict_method: Prediction callable, forwarded unchanged.
        num_features: Forwarded as the ``num_features`` keyword.

    Returns:
        The value returned by ``explainer.explain_instance``.
    """
    call_kwargs = {"num_features": num_features}
    return explainer.explain_instance(data, predict_method, **call_kwargs)


# Explain the first sample of the held-out (validation) split.
# The original passed `X_test`, which is never defined in the visible cells,
# handed the whole table to LIME where a single 1-D row is needed, and called
# `.reshape` on it, which a DataFrame does not provide.
# NOTE(review): `top_x` (number of features to report) is not defined in the
# visible cells — confirm where it is set.
instance = X_valid.iloc[0].to_numpy()

lime_explainer = get_lime_explainer(LGB_clf, X_train, y_train)
explanation = lime_explain(lime_explainer, instance,
                           LGB_clf.predict_proba, top_x)

# Map each (feature index, weight) pair reported under label 1 to its
# column name.
ex_holder = {}
for feat_index, ex in explanation.as_map()[1]:
    ex_holder[X.columns[feat_index]] = ex

# Gap between the model's class-1 probability and LIME's local prediction.
actual_pred = LGB_clf.predict_proba(instance.reshape(1, -1))
perc_pred_diff = abs(actual_pred[0][1] - explanation.local_pred[0])

In [None]:
def plot_lime_exp(fig, fig_index, exp_data, title):
    """Draw one horizontal bar chart of feature contributions on ``fig``.

    Args:
        fig: Matplotlib figure that receives the new subplot.
        fig_index: Subplot specifier passed straight to ``fig.add_subplot``.
        exp_data: Mapping of feature name -> contribution. Entries are drawn
            in reverse insertion order. Every key must be a column of the
            module-level ``X``.
        title: Title of the subplot.

    Relies on the module-level ``X`` and ``color_list``.
    NOTE(review): ``color_list`` is not defined in the visible cells; it is
    assumed to hold one colour per column of ``X`` — confirm.
    """
    full_names = list(exp_data.keys())[::-1]
    explanations = list(exp_data.values())[::-1]
    # Labels are shortened to 10 characters for display only. Bars are placed
    # at numeric positions and coloured by FULL name, so features sharing a
    # 10-character prefix no longer collapse onto one bar or pick up each
    # other's colour.
    short_names = [name[:10] for name in full_names]
    positions = list(range(len(full_names)))

    ax = fig.add_subplot(fig_index)
    bars = ax.barh(positions, explanations)
    ax.set_yticks(positions)
    ax.set_yticklabels(short_names)
    ax.set_title(title, fontsize=20)

    all_columns = list(X.columns)  # computed once, not once per bar
    for name, bar in zip(full_names, bars):
        bar.set_color(color_list[all_columns.index(name)])
    # Hide the frame of this subplot.
    ax.set_frame_on(False)
# NOTE(review): `plt`, `trained_models`, `top_x` and `setup_plot` are not
# defined in the visible cells — confirm where they come from.
fig = plt.figure(figsize=(19, 8))

# Plot lime explanations for trained models: one panel per stored
# explanation, addressed with the subplot codes 231, 232, ...
for i, dex in enumerate(lime_data_explainations):
    fig_index = "23" + str(i + 1)
    plot_lime_exp(fig, int(fig_index), dex, trained_models[i]["name"])

plt.suptitle(
    " LIME Explanation for single test data instance.  Top " + str(top_x) + " Features",
    fontsize=20,
    fontweight="normal",
)
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# Plot run time for explanations, ordered by ascending run time
lx_df = pd.DataFrame(lime_explanation_time).sort_values("time")
setup_plot()
lx_ax = lx_df.plot(
    kind="line",
    x="model",
    title="Runtime (seconds) for single test data instance LIME explanation",
    figsize=(22, 6),
)
lx_ax.title.set_size(20)
lx_ax.legend(["Run time"])
plt.box(False)

In [None]:
# Plot LIME's local prediction against the model's actual class-1 prediction,
# one point per entry in `lime_metrics`.
lime_metrics_df = pd.DataFrame(lime_metrics)
lime_metrics_df_ax = (
    lime_metrics_df[["lime class1", "actual class1", "model"]]
    .plot(
        kind="line",
        x="model",
        title="LIME Actual Prediction vs Local Prediction ",
        figsize=(22, 6),
    )
)
lime_metrics_df_ax.title.set_size(20)
lime_metrics_df_ax.legend(["Lime Local Prediction", "Actual Prediction"])
plt.box(False)

## SHAP

In [None]:
# `shap` is first used in this cell, so import it here rather than relying on
# a later cell having been run.
import shap

# NOTE(review): `trained_models` and `X_test` are not defined in the visible
# cells — confirm where they come from.
current_model = trained_models[3]  # original note: "Explain the Random Forest Model" — confirm index
clf = current_model["model"]["clf"]
scaler = current_model["model"]["scaler"]
scaled_train_data = scaler.transform(X_train)
# use 600 samples of train data as background data
sub_sampled_train_data = shap.sample(scaled_train_data, 600, random_state=0)

# Row of the test set to explain. It is set here because this is the first
# cell that reads it; 10 is the value the notebook assigns further down.
test_data_index = 10

scaled_test_data = scaler.transform(X_test)
subsampled_test_data = scaled_test_data[test_data_index].reshape(1, -1)

# Time explainer construction plus the explanation of the selected row.
start_time = time.time()
explainer = shap.KernelExplainer(clf.predict_proba, sub_sampled_train_data)
shap_values = explainer.shap_values(subsampled_test_data, l1_reg="aic")
elapsed_time = time.time() - start_time

print("Kernel Explainer SHAP run time", round(elapsed_time, 3), " seconds. ", current_model["name"])
print("SHAP expected value", explainer.expected_value)
print("Model mean value", clf.predict_proba(scaled_train_data).mean(axis=0))
print("Model prediction for test data", clf.predict_proba(subsampled_test_data))
shap.initjs()
pred_ind = 0
# Force plot for index 1 of the expected values / SHAP values, first row.
shap.force_plot(explainer.expected_value[1], shap_values[1][0], subsampled_test_data[0], feature_names=X_train.columns)



In [None]:
# Initialise SHAP's notebook JavaScript, then draw the summary plot for the
# SHAP values computed above, showing at most 10 features.
shap.initjs()
shap.summary_plot(
    shap_values,
    subsampled_test_data,
    feature_names=X_train.columns,
    max_display=10,
)

In [None]:
# NOTE(review): `trained_models`, `X_test` and `test_data_index` are not
# defined above this cell in the visible source — confirm where they are set.
current_model = trained_models[3]
clf = current_model["model"]["clf"]
scaler = current_model["model"]["scaler"]

# Recomputed here so the "Model mean value" line below does not depend on a
# variable left over from an earlier cell.
scaled_train_data = scaler.transform(X_train)
scaled_test_data = scaler.transform(X_test)
subsampled_test_data = scaled_test_data[test_data_index].reshape(1, -1)

# Time explainer construction plus the explanation of the selected row.
start_time = time.time()
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(subsampled_test_data)
elapsed_time = time.time() - start_time

print("Tree Explainer SHAP run time", round(elapsed_time, 3), " seconds. ", current_model["name"])
print("SHAP expected value", explainer.expected_value)
print("Model mean value", clf.predict_proba(scaled_train_data).mean(axis=0))
print("Model prediction for test data", clf.predict_proba(subsampled_test_data))
shap.initjs()
pred_ind = 0
shap.force_plot(explainer.expected_value, shap_values[0], subsampled_test_data[0], feature_names=X_train.columns)



In [None]:
import shap

def get_kernel_shap_explainer(model, background_data, train_data):
    """Create a Kernel SHAP explainer for ``model``.

    Args:
        model: Object exposing ``predict_proba``; that method is the function
            being explained.
        background_data: Background dataset handed to ``shap.KernelExplainer``.
        train_data: Accepted but not used by this function.

    Returns:
        The constructed ``shap.KernelExplainer``.
    """
    predict_fn = model.predict_proba
    return shap.KernelExplainer(predict_fn, background_data)

def shap_explain(explainer, test_data):
    """Compute SHAP values for ``test_data``.

    Args:
        explainer: Object exposing ``shap_values``.
        test_data: Data to explain, forwarded unchanged.

    Returns:
        Whatever ``explainer.shap_values`` returns when called with
        ``l1_reg="aic"``.
    """
    return explainer.shap_values(test_data, l1_reg="aic")

# Kernel SHAP for every entry of `trained_models`: one explanation and one
# timing record per model.
# NOTE(review): `trained_models`, `X_test`, `test_data_index` and `top_x` are
# not defined above this cell in the visible source — confirm.
shap_data_explainations = []
shape_explanation_time = []
feat_names = list(X.columns)
data_subsample = 500

for current_model in trained_models:
    scaler = current_model["model"]["scaler"]
    scaled_test_data = scaler.transform(X_test)
    scaled_train_data = scaler.transform(X_train)
    # subsample background data to make things faster
    sampled_scaled_train_data = shap.sample(scaled_train_data, data_subsample)

    # The timed region covers explainer construction and the explanation.
    start_time = time.time()
    shap_explainer = get_kernel_shap_explainer(
        current_model["model"]["clf"],
        sampled_scaled_train_data,
        scaled_train_data,
    )

    # explain the selected row of the test data
    sampled_scaled_test_data = scaled_test_data[test_data_index].reshape(1, -1)
    shap_values = shap_explain(shap_explainer, sampled_scaled_test_data)
    elapsed_time = time.time() - start_time

    # Feature positions ordered by descending absolute value of
    # shap_values[1][0]; keep the first `top_x` of them.
    idx = np.argsort(np.abs(shap_values[1][0]))[::-1]
    ex_holder = {}
    for i in range(top_x):
        ex_holder[feat_names[idx[i]]] = shap_values[1][0][idx[i]]

    shap_data_explainations.append(ex_holder)
    shape_explanation_time.append(
        {"time": elapsed_time, "model": current_model["name"]})
  

In [None]:
def plot_shap_exp(fig, fig_index, exp_data, title):
    """Draw one horizontal bar chart of SHAP contributions on ``fig``.

    Args:
        fig: Matplotlib figure that receives the new subplot.
        fig_index: Subplot specifier passed straight to ``fig.add_subplot``.
        exp_data: Mapping of feature name -> SHAP value. Entries are drawn in
            reverse insertion order. Every key must be a column of the
            module-level ``X``.
        title: Title of the subplot.

    Relies on the module-level ``X`` and ``color_list``.
    NOTE(review): ``color_list`` is not defined in the visible cells; it is
    assumed to hold one colour per column of ``X`` — confirm.
    """
    features = list(exp_data.keys())[::-1]
    explanations = list(exp_data.values())[::-1]
    ax = fig.add_subplot(fig_index)
    bars = ax.barh(features, explanations)
    ax.set_title(title, fontsize=20)
    # Colours are looked up in X's columns: the original read an undefined
    # `current_data`, while the SHAP feature names in this notebook are taken
    # from X.columns.
    all_columns = list(X.columns)
    for name, bar in zip(features, bars):
        bar.set_color(color_list[all_columns.index(name)])
    # Hide the frame of this subplot.
    ax.set_frame_on(False)


# Plot SHAP explanations for a given test set item, one panel per stored
# explanation (subplot codes 231, 232, ...). The LIME bar-chart helper is
# reused for the SHAP values.
# NOTE(review): `plt`, `trained_models` and `top_x` are not defined in the
# visible cells — confirm where they come from.
fig = plt.figure(figsize=(19, 8))
for i, dex in enumerate(shap_data_explainations):
    fig_index = "23" + str(i + 1)
    plot_lime_exp(fig, int(fig_index), dex, trained_models[i]["name"])

plt.suptitle("Kernel SHAP Explanation for single test data instance.  Top " + str(top_x) + " Features", fontsize=20, fontweight="normal")
fig.tight_layout(rect=[0, 0.03, 1, 0.95])

# SHAP explanation run times, ordered by ascending run time. The sort returns
# a new frame instead of mutating in place, so re-running the cell is safe.
shapx_df = pd.DataFrame(shape_explanation_time).sort_values("time")

# Plot both LIME and SHAP explanation run times side by side. The merge on
# "model" yields the columns time_SHAP and time_LIME, in that order, which is
# the order the legend labels below rely on.
m_df = shapx_df.merge(lx_df, on="model", suffixes=("_SHAP", "_LIME"))
mx_df_ax = m_df.plot(kind="line", x="model", title="Kernel SHAP vs LIME: Runtime (seconds) for single instance explanation", figsize=(22, 6))
mx_df_ax.title.set_size(20)
mx_df_ax.legend(["Run time for SHAP", "Run time for LIME"])
plt.box(False)

In [None]:
# Measure how Kernel SHAP's run time and expected values change with the
# size of the background sample.
# NOTE(review): `trained_models` and `X_test` are not defined in the visible
# cells — confirm where they come from.
current_model = trained_models[3]
clf = current_model["model"]["clf"]
scaler = current_model["model"]["scaler"]

sample_sizes = [50, 100, 150, 200, 300, 600, 1000, 2000]
metric_holder = []

# Nothing below depends on the background size, so it is computed once
# instead of on every loop iteration. None of it was inside the timed region.
scaled_train_data = scaler.transform(X_train)
scaled_test_data = scaler.transform(X_test)
test_data_index = 10
subsampled_test_data = scaled_test_data[test_data_index].reshape(1, -1)

for sample_size in sample_sizes:
    # use `sample_size` rows of train data as background data
    sub_sampled_train_data = shap.sample(scaled_train_data, sample_size, random_state=0)

    # Time explainer construction plus the explanation of the selected row.
    start_time = time.time()
    explainer = shap.KernelExplainer(clf.predict_proba, sub_sampled_train_data)
    shap_values = explainer.shap_values(subsampled_test_data, l1_reg="aic")
    elapsed_time = time.time() - start_time

    metric_holder.append({"class0 exp": explainer.expected_value[0],
                          "class1 exp": explainer.expected_value[1],
                          "run time": elapsed_time,
                          "sample size": sample_size
                          })

metric_df = pd.DataFrame(metric_holder)


In [None]:
# Mean predicted probability per class over the scaled training data, added
# as constant columns so it can be plotted next to SHAP's expected values.
# NOTE(review): `setup_plot` and `plt` are not defined in the visible cells —
# confirm where they come from.
mean_pred = clf.predict_proba(scaled_train_data).mean(axis=0)
metric_df["mean prediction class0"] = mean_pred[0]
metric_df["mean prediction class1"] = mean_pred[1]
# (A bare `metric_df.head()` used to sit here. Only a cell's last expression
# is displayed, so it showed nothing and has been dropped.)

# Run time as a function of background sample size
setup_plot()
metrix_ax = metric_df[["sample size", "run time"]].plot(
    kind="line", x="sample size", title="Kernel SHAP Run time (seconds) vs Background Data Size", figsize=(22, 4))
metrix_ax.title.set_size(20)
plt.box(False)

# Class-0 expected value versus class-0 mean prediction; the legend labels
# follow the column order selected below.
setup_plot()
metrix_ax = metric_df[["sample size", "mean prediction class0", "class0 exp"]].plot(
    kind="line", x="sample size", title="Expected Value vs Mean Prediction", figsize=(22, 4))
metrix_ax.title.set_size(20)
metrix_ax.legend(["Mean prediction on train set", "SHAP Expected Value"])
plt.box(False)