In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_csv("bank.csv",sep=";")

In [2]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
5,35,management,single,tertiary,no,747,no,no,cellular,23,feb,141,2,176,3,failure,no
6,36,self-employed,married,tertiary,no,307,yes,no,cellular,14,may,341,1,330,2,other,no
7,39,technician,married,secondary,no,147,yes,no,cellular,6,may,151,2,-1,0,unknown,no
8,41,entrepreneur,married,tertiary,no,221,yes,no,unknown,14,may,57,2,-1,0,unknown,no
9,43,services,married,primary,no,-88,yes,yes,cellular,17,apr,313,1,147,2,failure,no


In [3]:
train_cols = df.columns[0:-1]
label = df.columns[-1]
X = df[train_cols]

In [4]:
y = df[label].apply(lambda x: 0 if x == "no" else 1) # 0為沒有簽定期存款(term deposit), 1為有存款 

In [5]:
X_encodings = pd.get_dummies(X, prefix_sep='.')

In [6]:
feature_names = list(X_encodings.columns)

In [7]:
seed = 1

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_encodings, y, test_size=0.20, random_state=seed)

# 訓練模型(Black boxes)

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

#Blackbox system can include preprocessing, not just a classifier!
pca = PCA()
rf = RandomForestClassifier(n_estimators=100, n_jobs=-1)

blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('pca', PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('rf', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
          ..._jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

# 解釋pca+rf的模型

In [10]:
from interpret import show
from interpret.perf import ROC

blackbox_perf = ROC(blackbox_model.predict_proba).explain_perf(X_test, y_test, name='Blackbox')
show(blackbox_perf)

# 用Lime解釋模型

In [11]:
from interpret.blackbox import LimeTabular
from interpret import show

#Blackbox explainers need a predict function, and optionally a dataset
lime = LimeTabular(predict_fn=blackbox_model.predict_proba, data=X_train, random_state=seed)

#Pick the instances to explain, optionally pass in labels if you have them
lime_local = lime.explain_local(X_test[:100], y_test[:100], name='LIME')

show(lime_local)


Data with input dtype int64 was converted to float64 by StandardScaler.


Ill-conditioned matrix (rcond=2.69635e-44): result may not be accurate.


Ill-conditioned matrix (rcond=2.16e-44): result may not be accurate.


Ill-conditioned matrix (rcond=1.22544e-44): result may not be accurate.


Ill-conditioned matrix (rcond=2.99982e-44): result may not be accurate.


Ill-conditioned matrix (rcond=6.99838e-44): result may not be accurate.


Ill-conditioned matrix (rcond=6.11753e-44): result may not be accurate.


Ill-conditioned matrix (rcond=4.53204e-44): result may not be accurate.


Ill-conditioned matrix (rcond=1.02434e-43): result may not be accurate.


Ill-conditioned matrix (rcond=1.24201e-43): result may not be accurate.


Ill-conditioned matrix (rcond=1.08146e-44): result may not be accurate.


Ill-conditioned matrix (rcond=1.23038e-43): result may not be accurate.



# 用Kernel SHAP解釋模型(linear LIME+Shapley)

In [12]:
from interpret.blackbox import ShapKernel
import numpy as np

background_val = np.median(X_train, axis=0).reshape(1, -1)
shap = ShapKernel(predict_fn=blackbox_model.predict_proba, data=background_val, feature_names=feature_names)
shap_local = shap.explain_local(X_test[:50], y_test[:50], name='SHAP')

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to




In [13]:
show(shap_local)

# 用Morris Sensitivity來解釋feature重要程度(global)

In [14]:
from interpret.blackbox import MorrisSensitivity

sensitivity = MorrisSensitivity(predict_fn=blackbox_model.predict_proba, data=X_train)
sensitivity_global = sensitivity.explain_global(name="Global Sensitivity for blackbox")

show(sensitivity_global)

# 畫Partial Dependence plot來解釋feature對target之影響

In [15]:
from interpret.blackbox import PartialDependence

pdp = PartialDependence(predict_fn=blackbox_model.predict_proba, data=X_train)
pdp_global = pdp.explain_global(name='Partial Dependence')

show(pdp_global)

In [16]:
show([blackbox_perf, lime_local, shap_local, sensitivity_global, pdp_global])

# 測試ebm_global與上述global method之不同

# first, fit an EBM model

In [17]:
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()
ebm.fit(X_train, y_train)

ExplainableBoostingClassifier(data_n_episodes=2000,
               early_stopping_run_length=50,
               early_stopping_tolerance=1e-05,
               feature_names=['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous', 'job.admin.', 'job.blue-collar', 'job.entrepreneur', 'job.housemaid', 'job.management', 'job.retired', 'job.self-employed', 'job.services', 'job.student', 'job.technician', 'job.unemployed', 'job.unknown', 'marital....nth.oct', 'month.sep', 'poutcome.failure', 'poutcome.other', 'poutcome.success', 'poutcome.unknown'],
               feature_step_n_inner_bags=0,
               feature_types=['continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'continuous', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'categorical', 'catego...egorical', 'categorical', 'categorical', 'categorical', 'categorical', 'ca

In [18]:
ebm_global = ebm.explain_global()
show(ebm_global)

In [19]:
ebm_local = ebm.explain_local(X_test, y_test)
show(ebm_local)

In [20]:
show([blackbox_perf])

In [21]:
from interpret.perf import RegressionPerf

ebm_perf = RegressionPerf(ebm.predict).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)

In [22]:
ebm_perf_ROC = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')
show([ebm_perf_ROC,blackbox_perf])

## 可以看的出來ebm在fit bank marketing data的預測力確實高 pca+rf一些

# 比較其他可解釋的預測模型
# 訓練一般可解釋regression模型(Logistic Regression、ClassificationTree)

In [23]:
from interpret.glassbox import LogisticRegression, ClassificationTree

feature_names = list(X_encodings.columns)
X_train_enc, X_test_enc, y_train, y_test = train_test_split(X_encodings, y, test_size=0.20, random_state=seed)

lr = LogisticRegression(random_state=seed, feature_names=feature_names, penalty='l1')
lr.fit(X_train_enc, y_train)

clf_tree = ClassificationTree()
clf_tree.fit(X_train_enc, y_train)





<interpret.glassbox.decisiontree.ClassificationTree at 0x1aaac1633c8>

In [24]:
lr_perf = ROC(lr.predict_proba).explain_perf(X_test_enc, y_test, name='Logistic Regression')
clf_tree_perf = ROC(clf_tree.predict_proba).explain_perf(X_test_enc, y_test, name='Classification Tree')

In [25]:
lr_global = lr.explain_global(name='LR')
clf_tree_global = clf_tree.explain_global(name='Tree')

In [26]:
lr_local = lr.explain_local(X_test[:10], y_test[:10], name='lr')

# 秀出目前為止全部的模型 與其之解釋

In [27]:
lr_local = lr.explain_local(X_test[:10], y_test[:10], name='lr')
clf_tree_local = clf_tree.explain_local(X_test[:10], y_test[:10], name='Tree')
show([lr_global, lr_perf,lr_local, clf_tree_global, clf_tree_perf,clf_tree_local, ebm_global, ebm_perf_ROC, ebm_local], share_tables=True)

# 用Morris Sensitivty看看三個可解釋模型

In [28]:
sensitivity_lr = MorrisSensitivity(predict_fn=lr.predict_proba, data=X_train)
sensitivity_lr_global = sensitivity_lr.explain_global(name="Global Sensitivity for logistic r")

sensitivity_clf_tree = MorrisSensitivity(predict_fn=clf_tree.predict_proba, data=X_train)
sensitivity_clf_tree_global = sensitivity_clf_tree.explain_global(name="Global Sensitivity for clf tree")

sensitivity_ebm = MorrisSensitivity(predict_fn=ebm.predict_proba, data=X_train)
sensitivity_ebm_global = sensitivity_ebm.explain_global(name="Global Sensitivity for EBM")

show([sensitivity_lr_global,sensitivity_clf_tree_global,sensitivity_ebm_global])

# 由此看得出來其實logistic regression跟EBM比較make sense
# 我沒有仔細去研究decision tree跟logisitc regression內部是怎麼去學習，但就以預設的hyperparameters來說，Logistic Regression跟EBM學習效果較make sense