In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

from interpret import show
from interpret.blackbox import LimeTabular
from interpret.blackbox import ShapKernel

In [2]:
# Load the CEE dataset; the CSV quotes its fields with single quotes.
df = pd.read_csv("./data/CEE_DATA.csv", quotechar="'")

# Predictor columns. Their order matters: later cells refer to features by
# positional index.
feature_columns = [
    "Gender",
    "Caste",
    "coaching",
    "time",
    "Class_ten_education",
    "twelve_education",
    "medium",
    "Class_X_Percentage",
    "Class_XII_Percentage",
    "Father_occupation",
    "Mother_occupation",
]
X = df[feature_columns]

# Target column flattened to a 1-D array.
Y = df["Performance"].values.ravel()

## Data Preprocessing

In [3]:
# Integer-encode every feature column with its own LabelEncoder and remember,
# per column position, the original class labels (index in `classes_` == code).
X_transformed = X.copy()
categorical_names = {}
for position, feature in enumerate(X.columns):
    le = LabelEncoder()
    X_transformed[feature] = le.fit_transform(X[feature])
    categorical_names[position] = le.classes_

In [4]:
# Encode the target labels as integers. The fitted encoder is kept in `le`,
# so `le.classes_` maps each integer code back to its original label.
le = LabelEncoder().fit(Y)
Y = le.transform(Y)
Y

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,

In [5]:
# Hold out 30% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
seed = 1
split = train_test_split(X_transformed, Y, test_size=0.3, random_state=seed)
X_train, X_test, Y_train, Y_test = split

print("Train Size Instances: ", len(X_train))
print("Test Size Instances:", len(X_test))

Train Size Instances:  466
Test Size Instances: 200


In [6]:
# One-hot encode every label-encoded column (all column positions recorded in
# `categorical_names`). `sparse_threshold=0` forces a dense array as output.
# NOTE(review): the encoder is fitted on the full dataset, not only the
# training split — presumably so every category is known at transform time;
# confirm this is intended.
columnTransformer = ColumnTransformer(
    [("ohe", OneHotEncoder(), list(categorical_names))],
    remainder="passthrough",
    sparse_threshold=0,
)
columnTransformer.fit(X_transformed)

ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                  transformers=[('ohe', OneHotEncoder(),
                                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])])

In [7]:
# Apply the fitted one-hot encoding to both splits, then preview the test matrix.
X_train_transformed, X_test_transformed = (
    columnTransformer.transform(split_part) for split_part in (X_train, X_test)
)
X_test_transformed

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 1., 0., 0.]])

## Model Training and Explanations (LIME Tabular / SHAP Kernel)

In [8]:
# Alternative classifiers; uncomment one to swap it in for the random forest.
# clf = svm.SVC(probability=True)
# clf = naive_bayes.MultinomialNB()
# clf = KNeighborsClassifier(n_neighbors=5)

# Seed the forest with the same `seed` used for the train/test split. Without
# a fixed random_state the fitted trees (and every explanation derived from
# them) differ on each run, so the notebook is not reproducible.
clf = RandomForestClassifier(random_state=seed)
clf.fit(X_train_transformed, Y_train)

RandomForestClassifier()

In [9]:
def predict_fn(x):
    """Return class probabilities for rows of label-encoded features.

    The explainers work on the label-encoded feature space, while the
    classifier was trained on the one-hot encoded matrix. This wrapper
    bridges the two by applying the fitted column transformer before
    calling the classifier.

    Parameters
    ----------
    x : array-like
        Rows of label-encoded features, in the same column order as the
        data the column transformer was fitted on.

    Returns
    -------
    Whatever ``clf.predict_proba`` returns for the transformed rows
    (one row of class probabilities per input row).
    """
    return clf.predict_proba(columnTransformer.transform(x))

In [10]:
# Display the integer-encoded labels of the held-out test split.
Y_test

array([2, 2, 1, 2, 0, 2, 0, 1, 2, 3, 3, 0, 3, 2, 2, 1, 1, 2, 2, 0, 2, 3,
       3, 0, 2, 0, 3, 3, 2, 0, 2, 0, 2, 3, 0, 0, 3, 0, 2, 0, 0, 1, 0, 3,
       1, 0, 2, 1, 0, 0, 1, 0, 2, 2, 2, 0, 2, 2, 2, 3, 3, 3, 2, 1, 0, 1,
       2, 2, 0, 0, 1, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 2, 0, 0, 2, 3, 3, 1,
       3, 1, 2, 2, 3, 1, 3, 0, 3, 2, 0, 2, 3, 3, 2, 1, 3, 1, 2, 1, 2, 2,
       3, 2, 3, 3, 0, 2, 2, 0, 2, 3, 3, 1, 2, 3, 2, 2, 1, 2, 0, 3, 2, 0,
       0, 2, 3, 3, 3, 2, 3, 2, 0, 1, 2, 0, 3, 2, 0, 3, 0, 0, 1, 3, 1, 2,
       2, 2, 2, 0, 2, 2, 3, 3, 3, 0, 0, 2, 3, 3, 1, 3, 2, 1, 3, 1, 2, 2,
       3, 2, 2, 3, 0, 2, 2, 2, 0, 1, 3, 1, 2, 2, 0, 1, 3, 2, 0, 0, 3, 1,
       3, 1])

In [11]:
# lime = LimeTabular(
#     predict_fn,
#     data=X_train.values,
#     feature_names=list(X_train.columns),
#     explain_kwargs={"top_labels": 3, "num_features": 5},
#     class_names=clf.classes_.tolist(),
#     categorical_features=range(len(X_train.columns)),
#     categorical_names=categorical_names,
#     kernel_width=3,
#     mode='classification'
# )
# lime_local = lime.explain_local(X_test.values[47], Y_test[47])

# show(lime_local)

In [12]:
# Kernel SHAP explainer over the label-encoded feature space; the training
# rows serve as the background data.
shap = ShapKernel(predict_fn=predict_fn, data=X_train.values)

# Explain the first five test rows alongside their true labels.
rows_to_explain, true_labels = X_test.values[:5], Y_test[:5]
shap_local = shap.explain_local(rows_to_explain, true_labels)

show(shap_local)

Using 466 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


  0%|          | 0/5 [00:00<?, ?it/s]


The dash_html_components package is deprecated. Please replace
`import dash_html_components as html` with `from dash import html`

The dash_core_components package is deprecated. Please replace
`import dash_core_components as dcc` with `from dash import dcc`

The dash_table package is deprecated. Please replace
`import dash_table` with `from dash import dash_table`

Also, if you're using any of the table format helpers (e.g. Group), replace 
`from dash_table.Format import Group` with 
`from dash.dash_table.Format import Group`
