# Task 4: XAI - TABNET

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import os
import shutil
import tensorflow as tf
import tabnet

2022-12-13 13:02:20.173898: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Utilities

In [2]:
def plot_masks(model):
    """Draw every TabNet feature-selection mask, then the aggregate mask.

    One figure is produced per entry of
    ``model.tabnet.feature_selection_masks``, followed by a final figure for
    ``model.tabnet.aggregate_feature_selection_mask``. Only element ``[0]`` of
    each mask is rendered, and the x axis is labelled with the names in the
    module-level ``numerical_features`` list.

    Args:
        model: object exposing a ``tabnet`` attribute with the two mask
            attributes above (populated after a forward pass — TODO confirm
            against the tabnet package).
    """
    tabnet_core = model.tabnet
    tick_positions = range(len(numerical_features))

    # Per-step masks first, aggregate mask last: same plotting recipe for all.
    masks_to_draw = list(tabnet_core.feature_selection_masks)
    masks_to_draw.append(tabnet_core.aggregate_feature_selection_mask)

    for current_mask in masks_to_draw:
        _, axis = plt.subplots(figsize=(5, 20))
        axis.imshow(current_mask[0])
        axis.set_xticks(tick_positions)
        axis.set_xticklabels(numerical_features, rotation="vertical")
        plt.show()

In [3]:
def plot_averaged_masks(model):
    """Draw each TabNet mask averaged along axis 0 and normalised to sum to 1.

    For every entry of ``model.tabnet.feature_selection_masks`` and then for
    ``model.tabnet.aggregate_feature_selection_mask``, element ``[0]`` is
    averaged along its first axis, divided by its total, and shown as a
    single-row image. The x axis is labelled with the module-level
    ``numerical_features`` list; y ticks are hidden.

    Args:
        model: object exposing a ``tabnet`` attribute with the two mask
            attributes above.
    """
    tabnet_core = model.tabnet
    tick_positions = range(len(numerical_features))

    # Per-step masks first, aggregate mask last: same plotting recipe for all.
    masks_to_draw = list(tabnet_core.feature_selection_masks)
    masks_to_draw.append(tabnet_core.aggregate_feature_selection_mask)

    for current_mask in masks_to_draw:
        _, axis = plt.subplots(figsize=(5, 10))
        axis_mean = np.mean(current_mask[0], axis=0)
        normalised = np.array(axis_mean) / np.sum(axis_mean)
        # Wrap in a list so the vector is rendered as a one-row image.
        axis.imshow([normalised])
        axis.set_xticks(tick_positions)
        axis.set_xticklabels(numerical_features, rotation="vertical")
        axis.set_yticks([])
        plt.show()

## Get Data and Preprocessing

In [4]:
# Load the cleaned per-user profile table; the first CSV column is used as the index.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)

In [5]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,name,lang,bot,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio,hashtags_ratio,reply_count_mean,reply_count_std,favorite_count_mean,favorite_count_std,favorite_count_entropy,retweet_count_mean,retweet_count_std
2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,62.340909,14.015152,0.0,0.272727,0.098485,0.0,0.0,0.037879,0.190903,0.232481,0.037879,0.190903
2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,69.082645,15.041322,0.0,0.338843,0.024793,0.0,0.0,0.049587,0.21709,0.284639,0.024793,0.155495
137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,53,65.340909,14.694444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,86.944871,18.689463,0.022331,0.006281,0.072575,0.0,0.0,0.165387,0.530838,0.669155,0.826239,13.034008
2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,72.311246,14.582073,0.000825,0.506461,0.118229,0.0,0.0,0.056365,0.243387,0.317182,0.016772,0.142619


In [6]:
# Split the label column off the feature table.
# NOTE: pop() removes 'bot' from `data` in place, so re-running this cell
# without reloading `data` raises a KeyError.
target = data.pop('bot')

In [7]:
# Convert the creation datetime to whole seconds since the Unix epoch so it can
# be used as a numeric feature.
# The epoch is subtracted and the difference floor-divided by one second, rather
# than casting the raw datetime64 values to int64 and dividing by 10**9: that
# cast is only correct when the values have nanosecond resolution.
# NOTE: like the original, this overwrites the column, so the cell must not be
# re-run without reloading `data`.
data["created_at"] = (
    (pd.to_datetime(data["created_at"]) - pd.Timestamp("1970-01-01"))
    // pd.Timedelta(seconds=1)
).to_numpy()

In [8]:
# Columns that are not fed to the model as numeric inputs.
categorical_features = ["lang", "name"]

# Start from every column of `data`, then drop the categorical ones by name.
numerical_features = data.columns.tolist()

for categorical_name in categorical_features:
    numerical_features.remove(categorical_name)

In [9]:
# Display the list of feature names used as model inputs.
numerical_features

['created_at',
 'statuses_count',
 'avg_length',
 'avg_special_chars',
 'urls_ratio',
 'mentions_ratio',
 'hashtags_ratio',
 'reply_count_mean',
 'reply_count_std',
 'favorite_count_mean',
 'favorite_count_std',
 'favorite_count_entropy',
 'retweet_count_mean',
 'retweet_count_std']

In [10]:
# Keep only the columns listed in numerical_features.
numerical_data = data[numerical_features]

In [11]:
# Preview the first rows of the numeric feature table.
numerical_data.head()

Unnamed: 0,created_at,statuses_count,avg_length,avg_special_chars,urls_ratio,mentions_ratio,hashtags_ratio,reply_count_mean,reply_count_std,favorite_count_mean,favorite_count_std,favorite_count_entropy,retweet_count_mean,retweet_count_std
2353593986,1550858442,76,62.340909,14.015152,0.0,0.272727,0.098485,0.0,0.0,0.037879,0.190903,0.232481,0.037879,0.190903
2358850842,1551150152,54,69.082645,15.041322,0.0,0.338843,0.024793,0.0,0.0,0.049587,0.21709,0.284639,0.024793,0.155495
137959629,1430377796,53,65.340909,14.694444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
466124818,1484707758,50,86.944871,18.689463,0.022331,0.006281,0.072575,0.0,0.0,0.165387,0.530838,0.669155,0.826239,13.034008
2571493866,1560886221,7085,72.311246,14.582073,0.000825,0.506461,0.118229,0.0,0.0,0.056365,0.243387,0.317182,0.016772,0.142619


In [12]:
# Preview the first labels.
target.head()

2353593986    1
2358850842    0
137959629     1
466124818     1
2571493866    0
Name: bot, dtype: int64

In [13]:
# Build a tf.data dataset from the feature table and the labels; each element
# is one (feature row, label) pair.
numerical_dataset = tf.data.Dataset.from_tensor_slices((numerical_data, target))

2022-12-13 13:02:23.199580: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
def transform(data, target):
    """Turn one (features, label) dataset element into model inputs.

    ``data`` is unstacked along its first axis and the resulting pieces are
    paired, in order, with the names in the module-level
    ``numerical_features`` list. ``target`` is one-hot encoded with depth 2.

    Args:
        data: tensor holding the feature values of one element.
        target: label of the element.

    Returns:
        A ``(features, label)`` tuple: a dict keyed by feature name, and the
        one-hot encoded label.
    """
    columns = tf.unstack(data)
    features = {name: column for name, column in zip(numerical_features, columns)}
    return features, tf.one_hot(target, 2)

In [15]:
# Number of elements grouped into each batch of the train/test datasets.
BATCH_SIZE = 128

# Number of elements in the training split: 70% of the rows, truncated to an int.
train_size = int(data.shape[0] / 100 * 70)

In [16]:
# Shuffle once with a buffer covering the whole dataset and a fixed seed.
# reshuffle_each_iteration=False is essential: with the default (True) the
# order is redrawn every time the dataset is iterated, so the take()/skip()
# split below would select different elements each epoch and training and
# test samples would mix across epochs.
ds_full = numerical_dataset.shuffle(
    data.shape[0], seed=0, reshuffle_each_iteration=False
)

In [17]:
# First `train_size` elements form the training split, the remainder the test
# split; both are mapped to (feature dict, one-hot label) and batched.
ds_train = ds_full.take(train_size).map(transform).batch(BATCH_SIZE)
ds_test = ds_full.skip(train_size).map(transform).batch(BATCH_SIZE)

# One numeric feature column per input feature, in the order of numerical_features.
feature_columns = [
    tf.feature_column.numeric_column(feature_name)
    for feature_name in numerical_features
]

In [18]:
# Display the feature column definitions passed to the model.
feature_columns

[NumericColumn(key='created_at', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='statuses_count', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='avg_length', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='avg_special_chars', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='urls_ratio', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='mentions_ratio', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='hashtags_ratio', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='reply_count_mean', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='reply_count_std', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='favorite_count

## Define and Train the model

In [25]:
# Build the TabNet classifier for the two classes.
# NOTE(review): the previous comment here stated that group norm does better for
# small datasets, yet norm_type is set to "batch" — confirm which one is intended.
model = tabnet.TabNetClassifier(feature_columns, num_classes=2,
                                feature_dim=128, output_dim=64,
                                num_decision_steps=12, relaxation_factor=1.5,
                                sparsity_coefficient=0., batch_momentum=0.8,
                                virtual_batch_size=None, norm_type="batch")


# Adam with a learning rate that decays exponentially from 0.0012
# (rate 0.9 every 100 steps, applied continuously since staircase=False).
lr = tf.keras.optimizers.schedules.ExponentialDecay(0.0012, decay_steps=100, decay_rate=0.9, staircase=False)
optimizer = tf.keras.optimizers.Adam(lr)
# Labels are one-hot encoded by transform(), hence categorical cross-entropy.
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

[TabNet]: 64 features will be used for decision steps.


In [None]:
# Stop when validation accuracy has not improved for 10 epochs and roll back to
# the best weights seen.
# NOTE(review): ds_test is used as validation data for early stopping here and
# is also the set evaluated afterwards, so the reported test accuracy is
# selected on the same data — consider a separate validation split.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

model.fit(ds_train, epochs=100, validation_data=ds_test, callbacks=[early_stopping_callback])

model.summary()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100

## Evaluate the model and plot the explanations

In [None]:
# ds_test is a tf.data.Dataset that is already batched with BATCH_SIZE, so no
# batch_size argument is passed: Keras documents that batch_size must not be
# specified for dataset inputs.
results = model.evaluate(ds_test)
print("test loss, test acc:", results)

In [None]:
# Force eager execution mode to generate the masks
x, y = next(iter(ds_train))

_ = model(x)

### Explanations

Brighter (more yellow) cells indicate features that are more important for a given sample.
The following plots show the masks generated by the TabNet algorithm. The last one is the aggregation of the others.

In [None]:
# Plot each feature-selection mask and then the aggregate mask.
plot_masks(model)

We now report the masks averaged over the different samples.

We can see in the aggregation that the most important feature is **favorite_count_entropy**; this is not consistent with the results obtained from the other XAI methods.

In [None]:
# Plot each mask averaged along its first axis and normalised to sum to 1.
plot_averaged_masks(model)