# XAI - EBM

## Loading libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

## Loading dataset

In [2]:
# Read the cleaned user profiles; the first CSV column is used as the row index.
data = pd.read_csv(
    "../dataset/cleaned_user_profiles.csv",
    index_col=0,
)

In [3]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio,hashtags_ratio,reply_count_mean,reply_count_std,favorite_count_mean,favorite_count_std,favorite_count_entropy,retweet_count_mean,retweet_count_std,retweet_count_entropy
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152,0.0,0.272727,0.098485,0.0,0.0,0.037879,0.190903,0.232481,0.037879,0.190903,0.232481
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322,0.0,0.338843,0.024793,0.0,0.0,0.049587,0.21709,0.284639,0.024793,0.155495,0.167568
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,65.340909,14.694444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463,0.022331,0.006281,0.072575,0.0,0.0,0.165387,0.530838,0.669155,0.826239,13.034008,0.39285
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073,0.000825,0.506461,0.118229,0.0,0.0,0.056365,0.243387,0.317182,0.016772,0.142619,0.120737


## Preprocessing of the data

In [4]:
# Split the label off from the features. `pop` removes the "bot" column from
# `data` in place, so this cell raises KeyError if it is run a second time
# without reloading `data`.
target = data.pop("bot")

In [5]:
# Convert the account-creation datetime to whole seconds since the Unix epoch
# so the classifier receives it as a plain numeric feature.
#
# The epoch-difference form gives seconds regardless of the datetime64
# resolution pandas chooses when parsing; casting to int64 and dividing by
# 10**9 is only correct when that resolution is nanoseconds.
#
# The dtype guard keeps the cell safe to re-run: once the column is numeric,
# parsing it again would interpret the seconds as nanoseconds and silently
# collapse every value to ~1.
# NOTE(review): assumes `created_at` has no missing values (the dataset is the
# "cleaned" one) — a NaT would yield NaN here; confirm.
if not pd.api.types.is_numeric_dtype(data["created_at"]):
    created_at_dt = pd.to_datetime(data["created_at"])
    data["created_at"] = (created_at_dt - pd.Timestamp("1970-01-01")) // pd.Timedelta("1s")

In [6]:
# Preview the first five rows again to check the preprocessing so far.
data.head(n=5)

Unnamed: 0,name,lang,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio,hashtags_ratio,reply_count_mean,reply_count_std,favorite_count_mean,favorite_count_std,favorite_count_entropy,retweet_count_mean,retweet_count_std,retweet_count_entropy
2353593986,Lamonica Raborn,en,1550858442,76,62.340909,14.015152,0.0,0.272727,0.098485,0.0,0.0,0.037879,0.190903,0.232481,0.037879,0.190903,0.232481
2358850842,Lourie Botton,en,1551150152,54,69.082645,15.041322,0.0,0.338843,0.024793,0.0,0.0,0.049587,0.21709,0.284639,0.024793,0.155495,0.167568
137959629,Dadan Syarifudin,en,1430377796,53,65.340909,14.694444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
466124818,Carletto Focia,it,1484707758,50,86.944871,18.689463,0.022331,0.006281,0.072575,0.0,0.0,0.165387,0.530838,0.669155,0.826239,13.034008,0.39285
2571493866,MBK Ebook,en,1560886221,7085,72.311246,14.582073,0.000825,0.506461,0.118229,0.0,0.0,0.056365,0.243387,0.317182,0.016772,0.142619,0.120737


In [7]:
# String-valued columns that cannot be passed to the model as numbers.
# ("bot" has already been split off as the target and "created_at" has been
# converted to a numeric timestamp, so neither belongs here.)
categorical_features = ["lang", "name"]

# Every remaining column is treated as a numerical feature. `Index.drop` keeps
# the original column order and raises KeyError if a listed column is missing,
# so a typo in `categorical_features` still fails loudly.
numerical_features = data.columns.drop(categorical_features).tolist()

In [8]:
# Keep all rows, restricted to the numerical feature columns.
numerical_data = data.loc[:, numerical_features]

## Training the model

In [9]:
# Hold out 20% of the rows for evaluation; the seed is fixed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    numerical_data,
    target,
    test_size=0.20,
    random_state=42,
)

# Fit the Explainable Boosting Machine on the training portion only.
ebm = ExplainableBoostingClassifier(random_state=42)
ebm.fit(X_train, y_train)

## Evaluation

In [10]:
# Score the fitted model on the held-out test set and print the text report.
y_pred = ebm.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.98      0.78      0.87      1037
           1       0.85      0.99      0.91      1265

    accuracy                           0.90      2302
   macro avg       0.92      0.89      0.89      2302
weighted avg       0.91      0.90      0.89      2302



## Plot Explanations

The global explanation of feature importance shows that **statuses_count** is the most important feature for the predictions; this result is consistent with the SHAP and LIME results.

In [11]:
# Global explanation of the fitted model.
ebm_global = ebm.explain_global()
show(ebm_global)

# Local explanations for the first 20 test rows, passed together with their
# true labels. `.iloc` makes the positional slice explicit.
ebm_local = ebm.explain_local(X_test.iloc[:20], y_test.iloc[:20])
show(ebm_local)