<a href="https://colab.research.google.com/github/AjeetSingh02/ExplainableAI/blob/master/shap_explainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%%capture
! pip install shap

In [0]:
import numpy as np
import pandas as pd
import shap
import sklearn
import sklearn.svm
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [0]:
# Functions to generate shap values for a model and data
def tree_explainer(model, X_test):
    """Compute SHAP values for ``X_test`` with ``shap.TreeExplainer``.

    Parameters
    ----------
    model
        Model object handed to ``shap.TreeExplainer``.
    X_test
        Rows whose predictions are explained.

    Returns
    -------
    tuple
        ``(explainer, shap_values)`` - the explainer that was built and the
        result of its ``shap_values(X_test)`` call.
    """
    tree_shap = shap.TreeExplainer(model)
    return tree_shap, tree_shap.shap_values(X_test)


def kernel_explainer(model, X_test, link="identity", nsamples = "auto"):
    """Explain ``model.predict_proba`` on ``X_test`` with ``shap.KernelExplainer``.

    ``X_test`` is used twice: as the background data the explainer is built
    from and as the rows that are explained.

    Parameters
    ----------
    model
        Object exposing a ``predict_proba`` method.
    X_test
        Background data and rows to explain.
    link : str
        ``"identity"`` or ``"logit"``.
    nsamples : "auto" or int
        Number of times to re-evaluate the model when explaining each
        prediction. More samples lead to lower-variance estimates of the SHAP
        values. The ``"auto"`` setting uses
        ``nsamples = 2 * X.shape[1] + 2048``.

    Returns
    -------
    tuple
        ``(explainer, shap_values)``.
    """
    # Pass `link` by keyword: the positional slot right after the data is not
    # stable across SHAP releases (newer ones accept `feature_names` there),
    # so a positional link could be silently misread.
    explainer = shap.KernelExplainer(model.predict_proba, X_test, link=link)
    shap_values = explainer.shap_values(X_test, nsamples=nsamples)

    return explainer, shap_values


def generate_shap_values(model, model_type, X_test, link="identity", nsamples = "auto"):
    """Build an explainer of the requested kind and compute SHAP values.

    Parameters
    ----------
    model
        Model to explain.
    model_type : str
        ``"tree"`` routes to ``tree_explainer``; ``"kernel"`` routes to
        ``kernel_explainer``.
    X_test
        Rows to explain.
    link, nsamples
        Forwarded to ``kernel_explainer`` only; the tree path does not use them.

    Returns
    -------
    tuple or None
        ``(explainer, shap_values)`` for a known ``model_type``; otherwise a
        hint is printed and ``None`` is returned.
    """
    if model_type == "kernel":
        return kernel_explainer(model, X_test, link, nsamples)
    if model_type == "tree":
        return tree_explainer(model, X_test)

    # Unknown kind: tell the user which values are accepted.
    print("Model type can be either tree or kernel")
    return None

In [0]:
# Functions for text explanations

# Printing function
def printer(row, class_num, columns, globl):
    """Print which feature has the largest and which the smallest score in ``row``.

    Parameters
    ----------
    row
        One score per feature, in the same order as ``columns``. The feature
        with the largest score is reported as most influential and the one
        with the smallest score as least influential.
    class_num
        Class label to mention in the output, or ``None`` to omit it.
    columns
        Feature names, indexable by position.
    globl
        When truthy the scores are reported as well, labelled
        "mean SHAP score" and rounded to two decimals; otherwise only the
        feature names are printed.

    Returns
    -------
    None
        The two sentences are written to standard output.
    """
    max_index = np.argmax(row)
    min_index = np.argmin(row)

    most_string = f"{columns[max_index]} was the most influential in swaying the model output"
    min_string = f"{columns[min_index]} was the least influential in swaying the model output"

    # Identity check so that class 0 is still labelled. The label is added in
    # local mode too; otherwise the per-class lines of a multi-class
    # explanation cannot be told apart.
    if class_num is not None:
        most_string += f" for class {class_num}"
        min_string += f" for class {class_num}"

    if globl:
        most_string += f" (mean SHAP score = {round(row[max_index],2)})"
        min_string += f" (mean SHAP score = {round(row[min_index],2)})"

    print(most_string)
    print(min_string)


# Calculating which features were most and least effective
def shap_describe(shap_v, columns, globl, row = False, class_num = None):
    """Rank features by absolute SHAP value and hand the scores to ``printer``.

    Parameters
    ----------
    shap_v
        SHAP values used for the global summary; loaded into a DataFrame with
        ``columns`` as column names. Not read in local mode.
    columns
        Feature names.
    globl
        ``True`` for a global summary (mean absolute value per column),
        ``False`` for a local one (absolute values of ``row``). Any other
        value prints a hint and does nothing else.
    row
        SHAP values of the single example for local mode. The default
        ``False`` acts as a "not provided" marker.
    class_num
        Optional class label forwarded to ``printer``.

    Returns
    -------
    None
    """
    if globl == True:
        # Global: one mean-absolute score per feature column.
        frame = pd.DataFrame(shap_v, columns = columns)
        importances = [np.mean(np.abs(frame[name])) for name in columns]
        printer(importances, class_num, columns, globl)
        return

    if globl == False:
        # Local: a boolean `row` means the caller did not supply one.
        if isinstance(row, bool):
            print("Provide row")
            return
        printer(np.abs(row), class_num, columns, globl)
        return

    print("globl can be either True or False")


# Explanation for the whole dataset
# If there are multiple classes, an explanation is printed for each class
def shap_global(shap_values, columns):
    """Print a dataset-wide text explanation of the SHAP values.

    Parameters
    ----------
    shap_values
        Either a single ``numpy.ndarray`` (described once, without a class
        label) or a list/tuple with one array per class (each described with
        its position as the class label).
        NOTE(review): an ndarray is forwarded unchanged, so it presumably has
        to be 2-D (rows x features) for ``shap_describe`` - confirm against
        the SHAP version in use.
    columns
        Feature names.

    Returns
    -------
    None
        Output is printed. An unsupported ``shap_values`` type prints a hint
        instead of being skipped silently.
    """
    if isinstance(shap_values, np.ndarray):
        shap_describe(shap_values, columns, globl=True)
    elif isinstance(shap_values, (list, tuple)):
        for class_num, class_values in enumerate(shap_values):
            shap_describe(class_values, columns, globl=True, class_num=class_num)
    else:
        print("shap_values can be either a numpy array or a list of numpy arrays")


# Explanation for one example
# If there are multiple classes, an explanation is printed for each class
def shap_local(shap_values, columns, indx = None):
    """Print a text explanation for the single example at position ``indx``.

    Parameters
    ----------
    shap_values
        Either a single ``numpy.ndarray`` (described once, without a class
        label) or a list/tuple with one array per class (each described with
        its position as the class label). In both cases the example is taken
        as ``array[indx]``.
    columns
        Feature names.
    indx
        Position of the example to explain. ``None`` (the default) prints a
        reminder and returns.

    Returns
    -------
    None
        Output is printed. An unsupported ``shap_values`` type prints a hint
        instead of being skipped silently.
    """
    # Identity check: `indx != None` would compare element-wise for array-like
    # index values.
    if indx is None:
        print("provide index")
        return

    if isinstance(shap_values, np.ndarray):
        # Single output: one description for the selected row.
        shap_describe(shap_values, columns, globl = False, row = shap_values[indx])
    elif isinstance(shap_values, (list, tuple)):
        # One array per class: describe the selected row for every class.
        for class_num, class_values in enumerate(shap_values):
            shap_describe(class_values, columns, globl = False, row = class_values[indx], class_num = class_num)
    else:
        print("shap_values can be either a numpy array or a list of numpy arrays")

In [0]:
# FIXME(review): `shap_df1` is not assigned in any cell visible in this notebook,
# so this raises NameError on a fresh kernel unless it is created elsewhere -
# confirm, then define it before this cell or drop the cell.
shap_df1.shape

(506, 13)

Case 1: Price Prediction data to xgboost

Case 2: Flower Classification data to xgboost

Case 3: Flower Classification data to SVM

Case 4: Price Prediction data to SVM

In [0]:
# Price Prediction data: feature frame and target values
# NOTE(review): presumably only older SHAP releases ship `shap.datasets.boston` -
# confirm it exists in the installed version.
X_sales, y_sales = shap.datasets.boston() 
# Flower Classification data, split 80/20 with a fixed seed.
# X_class / Y_class are the training part, X_test / Y_test the held-out part.
X_class, X_test, Y_class, Y_test = train_test_split(*shap.datasets.iris(), test_size=0.2, random_state=0) 

In [0]:
%%capture

# Case 1: Price Prediction data to xgboost
model1 = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X_sales, label=y_sales), 100)
explainer1, shap_values1 = generate_shap_values(model1, "tree", X_sales)

# Case 2:  Flower Classification data to xgboost
model2 = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X_class, label=Y_class), 100)
explainer2, shap_values2 = generate_shap_values(model2, "tree", X_class)

# Case 3: Flower Classification data to SVM
svm3 = sklearn.svm.SVC(kernel='rbf', probability=True)
svm3.fit(X_class, Y_class)
explainer3, shap_values3 = generate_shap_values(svm3, "kernel", X_class, link="logit", nsamples=100)

# Case 4: Price Prediction data to SVM
y_sales_int = list(map(int, y_sales))
svm4 = sklearn.svm.SVC(kernel='rbf', probability=True)
svm4.fit(X_sales, y_sales_int)
explainer4, shap_values4 = generate_shap_values(svm4, "kernel", X_sales.iloc[1:100,:], link="logit", nsamples=100)

In [0]:
# Cases 1 and 4 use the price data (X_sales, y_sales); cases 2 and 3 use the flower data (X_class, Y_class)
# Cases 1 and 2 are trained with XGBoost; cases 3 and 4 are trained with an SVM

In [0]:
# Stack the SHAP values of case 4 into one array to inspect its dimensions;
# presumably (classes, explained rows, features) - 99 rows were explained and X_sales has 13 columns.
np.array(shap_values4).shape

(43, 99, 13)

In [0]:
# Choose what the following cells describe: the row position plus the SHAP
# values, feature frame and explainer of case 2.
indx = 0
shap_values = shap_values2
X = X_class
explainer = explainer2

In [0]:
# shap_global(shap_values3, X_class.columns)
# Text explanation for the single row at position `indx` of the selected case
shap_local(shap_values, X.columns, indx)

petal length (cm) was the most influential in swaying the model output
sepal length (cm) was the least influential in swaying the model output


In [0]:
# shap.summary_plot(shap_values3, X_class, plot_type="bar")
# Load the JavaScript that SHAP's interactive plots need in the notebook
shap.initjs()
# NOTE(review): `c` is not read by any cell visible here - confirm before removing
c = 0
# Force plot for the selected row; `shap_values[indx,:]` requires the selected
# SHAP values to be a single 2-D array rather than a per-class list
shap.force_plot(explainer.expected_value, shap_values[indx,:], X.iloc[indx,:], link="logit")

In [0]:
# Rebind the working features to the price data; note that `X` pointed at the
# flower data in an earlier cell.
X = X_sales 
# Truncate the prices to integers so they can serve as class labels
Y = list(map(int, y_sales))

In [0]:
# Display the feature frame (pandas abbreviates the middle rows of a long frame)
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


In [0]:
# Random Forest
# (RandomForestClassifier is imported in the import cell at the top of the notebook.)
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, Y)
# `link` and `nsamples` are only used by the kernel explainer, so they are not
# passed on the tree path.
# NOTE(review): iloc[1:100] selects 99 rows and skips the first one - confirm
# whether iloc[:100] was intended.
explainer5, shap_values5 = generate_shap_values(clf, "tree", X.iloc[1:100,:])

In [0]:
# Logistic Regression
# (LogisticRegression is imported in the import cell at the top of the notebook.)
lr = LogisticRegression(random_state=0)
lr.fit(X_class, Y_class)

In [0]:
# Switch the selection to the Random Forest SHAP values (one array per class,
# so one pair of lines is printed per class)
shap_values = shap_values5
# shap_global(shap_values5, X.columns)
shap_local(shap_values, X.columns, indx)

LSTAT was the most influential in swaying the model output
ZN was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
ZN was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
CHAS was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
CHAS was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
ZN was the least influential in swaying the model output
CRIM was the most influential in swaying the model output
CHAS was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
CHAS was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
CHAS was the least influential in swaying the model output
LSTAT was the most influential in swaying the model output
CHAS