In [2]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Explainers:
# 1. SHAP Tabular Explainer
from interpret.ext.blackbox import TabularExplainer

# OR

# 2. Mimic Explainer
from interpret.ext.blackbox import MimicExplainer
# You can use one of the following four interpretable models as a global surrogate to the black box model
from interpret.ext.glassbox import LGBMExplainableModel
from interpret.ext.glassbox import LinearExplainableModel
from interpret.ext.glassbox import SGDExplainableModel
from interpret.ext.glassbox import DecisionTreeExplainableModel
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)
# fill missing values
data = data.fillna(method="ffill")
data = data.fillna(method="bfill")

In [3]:

from sklearn.model_selection import train_test_split

numeric_features = ['age', 'fare']
categorical_features = ['embarked', 'sex', 'pclass']

y = data['survived'].values
X = data[categorical_features + numeric_features]

# Split data into train and test
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# We add many to one and many to many transformations for illustration purposes.
# The support for raw feature explanations with many to one and many to many transformations are only supported 
# When allow_all_transformations is set to True on explainer creation
from sklearn.preprocessing import FunctionTransformer
many_to_one_transformer = FunctionTransformer(lambda x: x.sum(axis=1).reshape(-1, 1), validate=True)
many_to_many_transformer = FunctionTransformer(lambda x: np.hstack(
    (np.prod(x, axis=1).reshape(-1, 1), (np.prod(x, axis=1)**2).reshape(-1, 1))
), validate=True)

In [5]:

from sklearn.compose import ColumnTransformer

transformations = ColumnTransformer([
    ("age_fare_1", Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ]), ["age", "fare"]),
    ("age_fare_2", many_to_one_transformer, ["age", "fare"]),
    ("age_fare_3", many_to_many_transformer, ["age", "fare"]),
    ("embarked", Pipeline(steps=[
        ("imputer", SimpleImputer(strategy='constant', fill_value='missing')), 
        ("encoder", OneHotEncoder(sparse=False))]), ["embarked"]),
    ("sex_pclass", OneHotEncoder(sparse=False), ["sex", "pclass"])    
])

In [6]:
!pip install sklearn-pandas



In [7]:
from sklearn_pandas import DataFrameMapper

In [8]:
transformations = [
    (["age", "fare"], Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])),
    (["age", "fare"], many_to_one_transformer),
    (["age", "fare"], many_to_many_transformer),
    (["embarked"], Pipeline(steps=[
        ("imputer", SimpleImputer(strategy='constant', fill_value='missing')), 
        ("encoder", OneHotEncoder(sparse=False))])),
    (["sex", "pclass"], OneHotEncoder(sparse=False))    
]

In [12]:
clf = Pipeline(steps=[('preprocessor', DataFrameMapper(transformations)),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

In [13]:
model = clf.fit(x_train, y_train)

In [14]:
# 1. Using SHAP TabularExplainer
# When the last parameter allow_all_transformations is passed, we handle many to one and many to many transformations to 
# generate approximations to raw feature importances. When this flag is passed, for transformations not recognized as one to 
# many, we distribute feature importances evenly to raw features generating them.
# clf.steps[-1][1] returns the trained classification model
explainer = TabularExplainer(clf.steps[-1][1], 
                             initialization_examples=x_train, 
                             features=x_train.columns, 
                             transformations=transformations, 
                             allow_all_transformations=True)

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
The option feature_dependence has been renamed to feature_perturbation!
The option feature_perturbation="independent" is has been renamed to feature_perturbation="interventional"!
The feature_perturbation option is now deprecated in favor of using the appropriate masker (maskers.Independent, or maskers.Impute)


In [15]:
explainer

<interpret_community.tabular_explainer.TabularExplainer at 0x7f30799b7e48>

In [16]:
# 2. Using MimicExplainer
# augment_data is optional and if true, oversamples the initialization examples to improve surrogate model accuracy to fit original model.  Useful for high-dimensional data where the number of rows is less than the number of columns. 
# max_num_of_augmentations is optional and defines max number of times we can increase the input data size.
# LGBMExplainableModel can be replaced with LinearExplainableModel, SGDExplainableModel, or DecisionTreeExplainableModel
explainer2 = MimicExplainer(clf.steps[-1][1], 
                            x_train, 
                            LGBMExplainableModel, 
                            augment_data=True, 
                            max_num_of_augmentations=10, 
                            features=x_train.columns, 
                            transformations=transformations, 
                            allow_all_transformations=True)





# 3. Using PFIExplainer

# Use the parameter "metric" to pass a metric name or function to evaluate the permutation. 
# Note that if a metric function is provided a higher value must be better.
# Otherwise, take the negative of the function or set the parameter "is_error_metric" to True.
# Default metrics: 
# F1 Score for binary classification, F1 Score with micro average for multiclass classification and
# Mean absolute error for regression


# explainer = PFIExplainer(clf.steps[-1][1], 
#                          features=x_train.columns, 
#                          transformations=transformations)

Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


In [18]:
# 3. Using PFIExplainer

# Use the parameter "metric" to pass a metric name or function to evaluate the permutation. 
# Note that if a metric function is provided a higher value must be better.
# Otherwise, take the negative of the function or set the parameter "is_error_metric" to True.
# Default metrics: 
# F1 Score for binary classification, F1 Score with micro average for multiclass classification and
# Mean absolute error for regression


#explainer3 = PFIExplainer(clf.steps[-1][1], 
                          features=x_train.columns, 
                          transformations=transformations)


IndentationError: unexpected indent (<ipython-input-18-1217136fe970>, line 12)

In [19]:
# Passing in test dataset for evaluation examples - note it must be a representative sample of the original data
# x_train can be passed as well, but with more examples explanations will take longer although they may be more accurate

global_explanation = explainer.explain_global(x_test)

# Note: if you used the PFIExplainer in the previous step, use the next line of code instead
# global_explanation = explainer.explain_global(x_test, true_labels=y_test)

Many to one/many maps found in input


Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.
Changing the sparsity structure of a csr_matrix is expensive. lil_matrix is more efficient.


In [20]:
# Sorted SHAP values
print('ranked global importance values: {}'.format(global_explanation.get_ranked_global_values()))
# Corresponding feature names
print('ranked global importance names: {}'.format(global_explanation.get_ranked_global_names()))
# Feature ranks (based on original order of features)
print('global importance rank: {}'.format(global_explanation.global_importance_rank))
# Per class feature names
print('ranked per class feature names: {}'.format(global_explanation.get_ranked_per_class_names()))
# Per class feature importance values
print('ranked per class feature values: {}'.format(global_explanation.get_ranked_per_class_values()))

ranked global importance values: [0.026931014185821292, 0.026931014185816903, 1.1112543378382588e-14, 5.754780545608044e-15, 5.061392803096062e-15]
ranked global importance names: ['fare', 'age', 'sex', 'pclass', 'embarked']
global importance rank: [4, 3, 1, 2, 0]
ranked per class feature names: [['fare', 'age', 'sex', 'pclass', 'embarked'], ['fare', 'age', 'sex', 'pclass', 'embarked']]
ranked per class feature values: [[0.026931014185821292, 0.026931014185816903, 1.1112543378382588e-14, 5.754780545608044e-15, 5.061392803096062e-15], [0.026931014185821292, 0.026931014185816903, 1.1112543378382588e-14, 5.754780545608044e-15, 5.061392803096062e-15]]


In [21]:
# Print out a dictionary that holds the sorted feature importance names and values
print('global importance rank: {}'.format(global_explanation.get_feature_importance_dict()))

global importance rank: {'fare': 0.026931014185821292, 'age': 0.026931014185816903, 'sex': 1.1112543378382588e-14, 'pclass': 5.754780545608044e-15, 'embarked': 5.061392803096062e-15}


In [22]:
# feature shap values for all features and all data points in the training data
print('local importance values: {}'.format(global_explanation.local_importance_values))

local importance values: [[[4.271188043403938e-15, 8.647979302384874e-15, 5.628356667119459e-15, 0.008125449913578246, 0.00812544991358028], [-7.182743129076331e-15, 8.647979302384874e-15, 5.628356667119459e-15, 0.007997430689910195, 0.007997430689913989], [4.271188043403938e-15, 8.647979302384874e-15, 5.628356667119459e-15, 0.008119723538939944, 0.008119723538942473], [4.271188043403938e-15, 8.647979302384874e-15, 5.628356667119459e-15, 0.008322816970244762, 0.008322816970251928], [-5.550700513548274e-15, 8.647979302384874e-15, 5.628356667119459e-15, 0.008216042882279744, 0.008216042882283815], [4.271188043403938e-15, 8.647979302384874e-15, -7.250073239721703e-15, 0.003393122314695679, 0.0033931223146928032], [4.271188043403938e-15, 8.647979302384874e-15, 5.628356667119459e-15, 0.00820494726653635, 0.008204947266540292], [4.271188043403938e-15, 8.647979302384874e-15, -4.284534164796876e-15, 0.007778053736586524, 0.007778053736588498], [-5.550700513548274e-15, 8.647979302384874e-15, 5.

In [23]:
# Note: Do not run this cell if using PFIExplainer, it does not support local explanations
# You can pass a specific data point or a group of data points to the explain_local function

# E.g., Explain the first data point in the test set
instance_num = 1
local_explanation = explainer.explain_local(x_test[:instance_num])

Many to one/many maps found in input


In [24]:
# Get the prediction for the first member of the test set and explain why model made that prediction
prediction_value = clf.predict(x_test)[instance_num]

sorted_local_importance_values = local_explanation.get_ranked_local_values()[prediction_value]
sorted_local_importance_names = local_explanation.get_ranked_local_names()[prediction_value]

print('local importance values: {}'.format(sorted_local_importance_values))
print('local importance names: {}'.format(sorted_local_importance_names))

local importance values: [[-4.271188043403938e-15, -5.628356667119459e-15, -8.647979302384874e-15, -0.008125449913578246, -0.00812544991358028]]
local importance names: [['embarked', 'pclass', 'sex', 'age', 'fare']]
