In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics

from interpret import show
from interpret.glassbox import ExplainableBoostingClassifier

# from interpret import set_visualize_provider
# from interpret.provider import InlineProvider
# set_visualize_provider(InlineProvider())

In [2]:
# Read the CEE dataset; quotechar tells pandas that single quotes are the
# quoting character in this CSV.
df = pd.read_csv("./data/CEE_DATA.csv", quotechar="'")

# Columns used as model inputs.
feature_columns = [
    "Gender",
    "Caste",
    "coaching",
    "time",
    "Class_ten_education",
    "twelve_education",
    "medium",
    "Class_X_Percentage",
    "Class_XII_Percentage",
    "Father_occupation",
    "Mother_occupation",
]
X = df[feature_columns]

# Target column the classifiers are trained to predict.
Y = df["Performance"]

In [3]:
# Partition the raw features and labels: 30% of the rows are held out for
# testing, and the fixed seed makes the shuffle repeatable across runs.
seed = 1
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=seed,
    test_size=0.3,
)
print("Train Size Instances: ", len(X_train))
print("Test Size Instances:", len(X_test))

Train Size Instances:  466
Test Size Instances: 200


In [4]:
# One-hot encode every feature column.
#
# The encoder is left at its default (sparse) output and densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` and then
# removed in newer scikit-learn releases, so this form avoids depending on
# either spelling. Likewise `get_feature_names` was removed in favour of
# `get_feature_names_out`.
#
# NOTE: the encoder is fit on the full feature frame before splitting, so every
# category present in the test rows already has a column.
ohe = OneHotEncoder()
encoded_values = ohe.fit_transform(X).toarray()
Xoded = pd.DataFrame(
    encoded_values,
    # Derive the names from X itself instead of repeating the column list.
    columns=ohe.get_feature_names_out(X.columns),
    # Keep X's row labels so the encoded frame stays aligned with Y.
    index=X.index,
)

# Same test_size and seed as the raw split above, applied to the encoded frame.
X_train_enc, X_test_enc, Y_train_enc, Y_test_enc = train_test_split(
    Xoded, Y, test_size=0.3, random_state=seed
)
print("Train Size Instances: ", X_train_enc.shape[0])
print("Test Size Instances:", X_test_enc.shape[0])

Train Size Instances:  466
Test Size Instances: 200


## EBM

In [None]:
# Reference: https://interpret.ml/docs/ebm.html?highlight=multiclass#

# Fit an Explainable Boosting classifier on the one-hot-encoded training split,
# reusing the notebook-wide seed.
ebm = ExplainableBoostingClassifier(random_state=seed)
ebm.fit(X_train_enc, Y_train_enc)

# Global explanations: build the model-level explanation object and hand it to
# interpret's show() for display. ebm_global is reused by a later cell.
ebm_global = ebm.explain_global()
show(ebm_global)

  warn("Multiclass is still experimental. Subject to change per release.")
  warn("Detected multiclass problem: forcing interactions to 0")


In [None]:
# Inspect the raw data behind the global explanation as the cell's output.
# NOTE(review): key -1 presumably selects the overall summary rather than a
# single feature — confirm against the interpret documentation.
ebm_global.data(-1)

In [None]:
# Display the parameter settings of the fitted EBM as the cell's output.
ebm.get_params()

In [None]:
# Local explanations for the first five held-out rows.
# The labels must come from the test split as well (the original passed
# Y_train_enc[:5], which pairs each test row with an unrelated training label).
ebm_local = ebm.explain_local(X_test_enc[:5], Y_test_enc[:5])
show(ebm_local)

In [None]:
# Evaluate the EBM on the held-out encoded rows.
# Score against Y_test_enc, the labels produced by the same train_test_split
# call as X_test_enc, rather than Y_test from the separate raw split (the two
# only line up because both splits share a seed and row count).
Y_pred = ebm.predict(X_test_enc)
print("Accuracy: ", metrics.accuracy_score(Y_test_enc, Y_pred))
# NOTE: for a single-label multiclass target, micro-averaged precision, recall
# and F1 are all equal to accuracy; use average="macro" or "weighted" if
# per-class behaviour should influence the score.
print("Precision: ", metrics.precision_score(Y_test_enc, Y_pred, average="micro"))
print("Recall: ", metrics.recall_score(Y_test_enc, Y_pred, average="micro"))
print("F1 score: ", metrics.f1_score(Y_test_enc, Y_pred, average="micro"))

# Fix the class order once so matrix rows/columns and tick labels agree.
class_labels = ebm.classes_
cm = metrics.confusion_matrix(Y_test_enc, Y_pred, labels=class_labels)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()

## Linear Model

In [None]:
# Reference: https://interpret.ml/docs/lr.html

# interpret's glassbox LogisticRegression (not imported from sklearn directly).
from interpret.glassbox import LogisticRegression

# Fit on the one-hot-encoded training split with the notebook-wide seed.
lr = LogisticRegression(random_state=seed)
lr.fit(X_train_enc, Y_train_enc)

# Build the model-level explanation and hand it to interpret's show().
lr_global = lr.explain_global()
show(lr_global)

In [None]:
# seems this one doesn't support strings for y labels
# lr_local = lr.explain_local(X_test_enc[:5], Y_test_enc[:5])
# show(lr_local)

In [None]:
def show_metrics(model, X_test, Y_test):
    """Print summary scores and plot the confusion matrix for a fitted model.

    Args:
        model: Fitted classifier exposing ``predict`` and a ``_model()``
            accessor whose result carries ``classes_``.
        X_test: Feature rows to predict on.
        Y_test: True labels corresponding to ``X_test``.
    """
    predictions = model.predict(X_test)
    print("Accuracy: ", metrics.accuracy_score(Y_test, predictions))

    # Precision, recall and F1 are all reported with micro averaging.
    micro_scorers = (
        ("Precision: ", metrics.precision_score),
        ("Recall: ", metrics.recall_score),
        ("F1 score: ", metrics.f1_score),
    )
    for heading, scorer in micro_scorers:
        print(heading, scorer(Y_test, predictions, average="micro"))

    # Take the class order from the object returned by model._model() and use
    # it for both the matrix layout and the displayed tick labels.
    class_labels = model._model().classes_
    matrix = metrics.confusion_matrix(Y_test, predictions, labels=class_labels)
    metrics.ConfusionMatrixDisplay(
        confusion_matrix=matrix, display_labels=class_labels
    ).plot()
    plt.show()

In [None]:
# Score the logistic regression model on the held-out encoded split.
show_metrics(lr, X_test_enc, Y_test_enc)

## Decision tree

In [None]:
# interpret's glassbox decision tree classifier.
from interpret.glassbox import ClassificationTree

# Fit on the one-hot-encoded training split with the notebook-wide seed.
dt = ClassificationTree(random_state=seed)
dt.fit(X_train_enc, Y_train_enc)

# Build the model-level explanation and hand it to interpret's show().
dt_global = dt.explain_global()
show(dt_global)

In [None]:
# seems this one doesn't support strings for y labels
# dt_local = dt.explain_local(X_test_enc[:5], Y_test_enc[:5])
# show(dt_local)

In [None]:
# Score the decision tree on the held-out encoded split.
show_metrics(dt, X_test_enc, Y_test_enc)

## Decision Rule

In [None]:
# interpret's glassbox decision list (rule-based) classifier.
from interpret.glassbox import DecisionListClassifier

# Fit on the one-hot-encoded training split with the notebook-wide seed.
dl = DecisionListClassifier(random_state=seed)
dl.fit(X_train_enc, Y_train_enc)

# Build the model-level explanation and hand it to interpret's show().
dl_global = dl.explain_global()
show(dl_global)

In [None]:
# seems this one doesn't support strings for y labels
# dl_local = dl.explain_local(X_test_enc[:5], Y_test_enc[:5])
# show(dl_local)

In [None]:
# Evaluate the decision list on the held-out encoded rows.
# Score against Y_test_enc, the labels produced by the same train_test_split
# call as X_test_enc, rather than Y_test from the separate raw split (the two
# only line up because both splits share a seed and row count).
Y_pred = dl.predict(X_test_enc)
print("Accuracy: ", metrics.accuracy_score(Y_test_enc, Y_pred))
# NOTE: for a single-label multiclass target, micro-averaged precision, recall
# and F1 are all equal to accuracy; use average="macro" or "weighted" if
# per-class behaviour should influence the score.
print("Precision: ", metrics.precision_score(Y_test_enc, Y_pred, average="micro"))
print("Recall: ", metrics.recall_score(Y_test_enc, Y_pred, average="micro"))
print("F1 score: ", metrics.f1_score(Y_test_enc, Y_pred, average="micro"))

# Fix the class order once so matrix rows/columns and tick labels agree.
class_labels = dl.classes_
cm = metrics.confusion_matrix(Y_test_enc, Y_pred, labels=class_labels)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot()
plt.show()