In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw CSV and expand categorical columns into indicator columns.
# drop_first=True drops one level per categorical column.
df = pd.read_csv("./data/adultincome trunc.csv")
data_prep = pd.get_dummies(df, drop_first=True)

# The last encoded column is used as the label; every column before it is a feature.
X, Y = data_prep.iloc[:, :-1], data_prep.iloc[:, -1]

# 70/30 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1234,
    stratify=Y,
)

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

# Fit a random forest on the training split, seeded for repeatability.
rfc = RandomForestClassifier(random_state=1234)
trained_model = rfc.fit(X_train, Y_train)

# Evaluate on the held-out split: predictions, confusion matrix, and the
# classifier's own score() (mean accuracy per the scikit-learn docs).
Y_predict = rfc.predict(X_test)
cm = confusion_matrix(Y_test, Y_predict)
score = rfc.score(X_test, Y_test)

# Bare expression so the score is rendered as the cell output.
score

0.7983333333333333

In [6]:
# Build an explainer for the trained model and compute its global explanation
from interpret.ext.blackbox import TabularExplainer

# Feature names come from the encoded feature frame; class labels are listed
# in class-index order (index 0 first).
features = X.columns.tolist()
classes = ["Not Greater than 50K", "Greater than 50k"]

# The explainer is initialised with the fitted model and the training features.
tab_explainer = TabularExplainer(
    trained_model,
    X_train,
    features=features,
    classes=classes,
)

# Global explanation evaluated over the training features.
global_explanation = tab_explainer.explain_global(X_train)

In [7]:
# Get the feature importance data from the global explanation.
# The result maps each feature name to its global importance value; the
# displayed output lists the entries from most to least important.
global_fi = global_explanation.get_feature_importance_dict()
# Bare expression so the mapping is rendered as the cell output.
global_fi

{'marital status_Married': 0.134457897553098,
 'age': 0.08737999026945,
 'hours per week': 0.054862878214208904,
 'education_ HS-grad': 0.05459228226812601,
 'marital status_ Never-married': 0.04201492036510847,
 'education_ Some-college': 0.03902094175894924,
 'gender_ Male': 0.026954924512638787,
 'education_ Masters': 0.013130510698434426,
 'race_ White': 0.013018742998495862,
 'wc_ Private': 0.008260686989919535,
 'wc_ Local-gov': 0.006514623134033063,
 'education_ Prof-school': 0.006411507490632027,
 'race_ Black': 0.00431141694330449,
 'education_ Doctorate': 0.003320328208831908,
 'race_ Asian-Pac-Islander': 0.0021244991478086886,
 'marital status_ Widowed': 0.0016788032177812158,
 'race_ Other': 0.0003622585777075659,
 'education_ Preschool': 3.30258572723802e-05}

In [11]:
# Get the local feature importances for a few test observations and print,
# for each class, the observations the model assigned to that class.

# Rows to explain: the first five rows of the test split (positional slice).
X_explain = X_test.iloc[0:5]

local_explanation = tab_explainer.explain_local(X_explain)

# Both rankings are nested sequences indexed as [class][observation][feature];
# names and values are aligned position by position.
local_features = local_explanation.get_ranked_local_names()
local_importance = local_explanation.get_ranked_local_values()

# Predict on exactly the rows being explained so that entry j always belongs
# to row j of X_explain. Indexing the test-set-wide Y_predict by j is only
# correct while X_explain happens to start at row 0 of X_test.
explained_predictions = trained_model.predict(X_explain)

for class_idx, (class_names, class_values) in enumerate(zip(local_features, local_importance)):
    print(f"\n Feature support values for: {classes[class_idx]}")
    for obs_idx, (obs_names, obs_values) in enumerate(zip(class_names, class_values)):
        # Report an observation only under the class the model predicted for it.
        if explained_predictions[obs_idx] != class_idx:
            continue
        print(f"\n\tObservation number: {obs_idx+1}")
        print("\t\t", "Feature Name".ljust(30),  " Value")
        print("\t\t", "-"*30, "-"*10)
        for name, value in zip(obs_names, obs_values):
            print("\t\t", name.ljust(30), round(value, 6))


 Feature support values for: Not Greater than 50K

	Observation number: 1
		 Feature Name                    Value
		 ------------------------------ ----------
		 education_ HS-grad             0.157534
		 hours per week                 0.062715
		 wc_ Private                    0.0179
		 education_ Masters             0.012778
		 education_ Prof-school         0.004132
		 race_ Black                    0.003311
		 education_ Doctorate           0.001875
		 education_ Preschool           -1.1e-05
		 race_ Other                    -0.000325
		 wc_ Local-gov                  -0.000783
		 race_ Asian-Pac-Islander       -0.001856
		 marital status_ Widowed        -0.002629
		 gender_ Male                   -0.006227
		 race_ White                    -0.013401
		 education_ Some-college        -0.022706
		 marital status_ Never-married  -0.04377
		 age                            -0.064633
		 marital status_Married         -0.149913

	Observation number: 2
		 Feature Name                   