In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn import naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

from interpret import show
from interpret.blackbox import LimeTabular

In [2]:
# import data
df = pd.read_csv("./data/CEE_DATA.csv", quotechar="'")

X = df[
    [
        "Gender",
        "Caste",
        "coaching",
        "time",
        "Class_ten_education",
        "twelve_education",
        "medium",
        "Class_X_Percentage",
        "Class_XII_Percentage",
        "Father_occupation",
        "Mother_occupation",
    ]
]
Y = df["Performance"].values.reshape(-1,)

## Data Preprocessing

In [3]:
categorical_names = {}
X_transformed = X.copy()
for j, column in enumerate(X.columns):
    le = LabelEncoder()
    le.fit(X[column])
    X_transformed[column] = le.transform(X[column])
    categorical_names[j] = le.classes_
# X_transformed

In [4]:
# split data
seed = 1
X_train, X_test, Y_train, Y_test = train_test_split(
    X_transformed, Y, test_size=0.3, random_state=seed
)
print("Train Size Instances: ", X_train.shape[0])
print("Test Size Instances:", X_test.shape[0])

Train Size Instances:  466
Test Size Instances: 200


In [5]:
columnTransformer = ColumnTransformer([("ohe", OneHotEncoder(), list(categorical_names.keys()))], 
                                      remainder = 'passthrough',
                                      sparse_threshold=0)
columnTransformer.fit(X_transformed)

ColumnTransformer(remainder='passthrough', sparse_threshold=0,
                  transformers=[('ohe', OneHotEncoder(),
                                 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])])

In [6]:
X_train_transformed = columnTransformer.transform(X_train)
X_test_transformed = columnTransformer.transform(X_test)
X_test_transformed

array([[1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 1., 0., 0.]])

## LIME Tabular

In [7]:
# clf = svm.SVC(probability=True)
# clf = naive_bayes.MultinomialNB()
# clf = KNeighborsClassifier(n_neighbors=5)
clf = RandomForestClassifier()
clf.fit(X_train_transformed, Y_train)

RandomForestClassifier()

In [8]:
predict_fn = lambda x: clf.predict_proba(columnTransformer.transform(x))

In [9]:
lime = LimeTabular(
    data=X_train.values,
    feature_names=list(X_train.columns),
    class_names=clf.classes_.tolist(),
    categorical_features=range(len(X_train.columns)),
    categorical_names=categorical_names,
    kernel_width=3,
    mode='classification',
    predict_fn=predict_fn
)
lime_local = lime.explain_local(X_test[:1], Y_test[:1])

show(lime_local)

NotImplementedError: LIME does not currently support classifier models without probability scores. If this conflicts with your use case, please let us know: https://github.com/datascienceinc/lime/issues/16