In [1]:
import pandas as pd
import numpy as np
import sklearn


In [2]:
pathbevoelkerung = 'shap-master/data/bevoelkerung.xls'
dfbevoelkerung = pd.read_excel(pathbevoelkerung)
dfbevoelkerung.head()
dfbevoelkerung.dropna(axis='columns')

Unnamed: 0,Stadtteil,Bevölkerung Einwohnerinnen und Einwohner 2012,Bevölkerung Durchschnittsalter 2012,Bevölkerung Einwohnerinnen und Einwohner unter 18 Jahren in % 2012,Bevölkerung Einwohnerinnen und Einwohner von 18 bis 64 Jahren in % 2012,Bevölkerung Einwohnerinnen und Einwohner ab 65 Jahren in % 2012,Bevölkerung Ausländerinnen und Ausländer in % 2012,Bevölkerung Deutsche mit Migrationshintergrund in % 2012,Bevölkerung Einwohnerentwicklung (5-Jahresvergleich) 2012,Bevölkerung Einpersonenhaushalte in % 2012,Bevölkerung Familien mit Kindern in % 2012
0,Altstadt,3601,43.4,10.1,73.0,17.0,34.82366,20.2,5.2,66.7,10.4
1,Innenstadt,6334,41.6,9.0,76.5,14.5,43.242817,17.9,4.0,71.3,8.1
2,Bahnhofsviertel,3117,37.5,7.7,85.6,6.6,52.293872,12.0,25.5,71.6,7.3
3,Westend-Süd,17076,40.7,14.4,70.5,15.2,23.735067,17.3,5.8,62.4,13.9
4,Westend-Nord,9083,40.0,15.9,69.0,15.1,25.454145,21.9,12.1,55.3,17.0
5,Nordend-West,28262,41.2,13.0,72.5,14.5,19.627061,15.3,6.3,63.0,13.2
6,Nordend-Ost,25410,40.7,12.5,75.2,12.3,21.090122,15.6,4.0,65.4,13.0
7,Ostend,26540,42.5,11.7,71.9,16.4,27.848531,18.8,4.7,62.6,12.5
8,Bornheim,26105,43.2,12.9,69.0,18.2,22.731278,18.2,4.6,62.0,13.8
9,Gutleutviertel,5952,41.1,11.1,74.0,14.9,37.567204,18.3,3.4,66.1,10.1


In [3]:
#Setup a regression experiment
from sklearn.model_selection import train_test_split
#setting y as the target variabl
y_data = dfbevoelkerung['Bevölkerung Ausländerinnen und Ausländer in %  2012']
#Seperating the target from the features
x_data=dfbevoelkerung[['Bevölkerung Durchschnittsalter  2012','Bevölkerung Einwohnerinnen und Einwohner  2012'
                       ,'Bevölkerung Einwohnerinnen und Einwohner unter 18 Jahren in %  2012'
                       ,'Bevölkerung Einpersonenhaushalte in %  2012'
                       ,'Bevölkerung Familien mit Kindern in %  2012'
                       ,'Bevölkerung Einwohnerentwicklung (5-Jahresvergleich)  2012']]

feature_names = list(x_data.columns)

#Splitting the dataset into train and test
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.20, random_state=1)


In [4]:
from interpret.glassbox import ExplainableBoostingRegressor, LinearRegression, RegressionTree

lr = LinearRegression(random_state=0)
lr.fit(x_train, y_train)

rt = RegressionTree(random_state=0)
rt.fit(x_train, y_train)

ebm = ExplainableBoostingRegressor(random_state=0)
ebm.fit(x_train, y_train)  
# For Classifier, use ebm = ExplainableBoostingClassifier()

ExplainableBoostingRegressor(binning_strategy='quantile', data_n_episodes=2000,
                             early_stopping_run_length=50,
                             early_stopping_tolerance=1e-05,
                             feature_names=['Bevölkerung Durchschnittsalter  '
                                            '2012',
                                            'Bevölkerung Einwohnerinnen und '
                                            'Einwohner  2012',
                                            'Bevölkerung Einwohnerinnen und '
                                            'Einwohner unter 18 Jahren in %  '
                                            '2012',
                                            'Bevölkerung Einpersonenhaushalte '
                                            'in %  2012',
                                            'Bevöl...
                             feature_step_n_inner_bags=0,
                             feature_types=['continuous', 'continuous

In [5]:
#Show blackbox model performance
from interpret import show
from interpret.perf import RegressionPerf

ebm_perf = RegressionPerf(ebm.predict).explain_perf(x_test, y_test, name='EBM')
lr_perf = RegressionPerf(lr.predict).explain_perf(x_test, y_test, name='Linear Regression')
rt_perf = RegressionPerf(rt.predict).explain_perf(x_test, y_test, name='Regression Tree')
show(ebm_perf)
show(lr_perf)
show(rt_perf)

In [6]:
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)

In [7]:
#Understand individual predictions
ebm_local = ebm.explain_local(x_test[:5], y_test[:5], name='EBM')
show(ebm_local)

In [8]:
#Train a blackbox regression system

from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

#Blackbox system can include preprocessing, not just a regressor!
pca = PCA()
rf = RandomForestRegressor(n_estimators=100, n_jobs=-1)

blackbox_model = Pipeline([('pca', pca), ('rf', rf)])
blackbox_model.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('pca',
                 PCA(copy=True, iterated_power='auto', n_components=None,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('rf',
                 RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                       criterion='mse', max_depth=None,
                                       max_features='auto', max_leaf_nodes=None,
                                       max_samples=None,
                                       min_impurity_decrease=0.0,
                                       min_impurity_split=None,
                                       min_samples_leaf=1, min_samples_split=2,
                                       min_weight_fraction_leaf=0.0,
                                       n_estimators=100, n_jobs=-1,
                                       oob_score=False, random_state=None,
                                       verbose=0, 

In [9]:
#How an individual prediction was made

from interpret.blackbox import ShapKernel
import numpy as np

background_val = np.median(x_train, axis=0).reshape(1, -1)
shap = ShapKernel(predict_fn=blackbox_model.predict, data=background_val, feature_names=feature_names)
shap_local = shap.explain_local(x_test[:5], y_test[:5], name='SHAP')
show(shap_local)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!
l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!





l1_reg="auto" is deprecated and in the next version (v0.29) the behavior will change from a conditional use of AIC to simply "num_features(10)"!


In [10]:
#Global Explanations: How the model behaves overall

from interpret.blackbox import MorrisSensitivity

sensitivity = MorrisSensitivity(predict_fn=blackbox_model.predict, data=x_train)
sensitivity_global = sensitivity.explain_global(name="Global Sensitivity")

show(sensitivity_global)

In [11]:
from interpret import show
from interpret.data import ClassHistogramX_t_prep = pd.DataFrame(data=pipeline_ebm[0:3].transform(X_t), columns=feature_names)
hist = ClassHistogram().explain_data(X_t_prep, y_t, name = 'Train Data')
show(hist)

SyntaxError: invalid syntax (<ipython-input-11-79a4c232bfaa>, line 2)