In [1]:
import sys

sys.path.append('../')

from ucimlrepo import fetch_ucirepo 
from pathlib import Path
import pandas as pd
import Uncertainpy.src.uncertainpy.gradual as grad
from mlp_to_qbaf_converter.argument_attribution_explanation import AAE
from mlp_to_qbaf_converter.relation_attribution_explanation import RAE
from mlp_to_qbaf_converter.utils import plot_qbaf
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from mlp_to_qbaf_converter.mlp_to_qbaf import MLPToQBAF
import random
from tqdm import tqdm
from sparx import sparx


import warnings
warnings.filterwarnings('ignore')

In [2]:
from compas_load_and_preprocess import load_compas

# fetch datasets
print("Fetching datasets...")
print("Breast Cancer...")
breast_cancer = fetch_ucirepo(id=17) 
print("Iris...")
iris = fetch_ucirepo(id=53)
print("COMPAS...")
compas = pd.read_csv('../../data/compas-scores-two-years.csv')

# data (as pandas dataframes) 
X_cancer = breast_cancer.data.features 
y_cancer = breast_cancer.data.targets 

X_iris = iris.data.features
y_iris = iris.data.targets

compas = load_compas(compas)

X_compas = compas.drop(columns=['two_year_recid'])
y_compas = compas[['two_year_recid']]

Fetching datasets...
Breast Cancer...
Iris...
COMPAS...


In [3]:
endcoder_iris = LabelEncoder()
encoder_cancer = LabelEncoder()
encoder_compas = LabelEncoder()

y_cancer = y_cancer.apply(encoder_cancer.fit_transform);
y_iris = y_iris.apply(endcoder_iris.fit_transform);

X_compas = X_compas.apply(encoder_compas.fit_transform);
y_compas = y_compas.apply(encoder_compas.fit_transform);

In [4]:
X_cancer = X_cancer.to_numpy()
y_cancer = y_cancer.to_numpy()

X_iris = X_iris.to_numpy()
y_iris = y_iris.to_numpy()


X_compas = X_compas.to_numpy()
y_compas = y_compas.to_numpy()

# split data
X_train_cancer, X_test_cancer, y_train_cancer, y_test_cancer = train_test_split(X_cancer, y_cancer, test_size=0.2, random_state=42)
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X_iris, y_iris, test_size=0.2, random_state=42)
X_train_compas, X_test_compas, y_train_compas, y_test_compas = train_test_split(X_compas, y_compas, test_size=0.2, random_state=42)

# scale data
scaler_cancer = MinMaxScaler()
scaler_iris = MinMaxScaler()
scaler_compas = MinMaxScaler()

# For AAEs clip value
h = 1e-8

scaler_cancer.fit(X_train_cancer)
X_train_cancer = scaler_cancer.transform(X_train_cancer)
X_test_cancer = np.clip(scaler_cancer.transform(X_test_cancer), h, 1 - h)

scaler_iris.fit(X_train_iris)
X_train_iris = scaler_iris.transform(X_train_iris)
X_test_iris = np.clip(scaler_iris.transform(X_test_iris), h, 1 - h)

scaler_compas.fit(X_train_compas)
X_train_compas = scaler_compas.transform(X_train_compas)
X_test_compas = np.clip(scaler_compas.transform(X_test_compas), h, 1 - h)

y_train_cancer = y_train_cancer.reshape(-1)
y_test_cancer = y_test_cancer.reshape(-1)

y_train_iris = y_train_iris.reshape(-1)
y_test_iris = y_test_iris.reshape(-1)

y_train_compas = y_train_compas.reshape(-1)
y_test_compas = y_test_compas.reshape(-1)


In [5]:
import joblib

if not Path("classifier_cancer.pkl").exists():
    classifier_cancer = MLPClassifier(
        hidden_layer_sizes=(50, 50), 
        activation="logistic", 
        max_iter=100, 
        random_state=2025
        )
    classifier_cancer.fit(X_train_cancer, y_train_cancer)
    joblib.dump(classifier_cancer, "classifier_cancer.pkl")
    
else: 
    classifier_cancer = joblib.load("classifier_cancer.pkl")

if not Path("classifier_iris.pkl").exists():
    classifier_iris = MLPClassifier(
        hidden_layer_sizes=(50, 50), 
        activation="logistic", 
        max_iter=100, 
        random_state=2025,
        solver='lbfgs',
        )
    classifier_iris.fit(X_train_iris, y_train_iris)
    joblib.dump(classifier_iris, "classifier_iris.pkl")
else:
    classifier_iris = joblib.load("classifier_iris.pkl")


if not Path("classifier_compas.pkl").exists():
    classifier_compas = MLPClassifier(
        hidden_layer_sizes=(50, 50),
        activation="logistic",
        max_iter=100,
        random_state=2025,
        learning_rate="adaptive",
        alpha=0.1,
    )
    classifier_compas.fit(X_train_compas, y_train_compas)
    joblib.dump(classifier_compas, "classifier_compas.pkl")
else:
    classifier_compas = joblib.load("classifier_compas.pkl")

In [6]:
from sklearn.metrics import classification_report

print("Cancer dataset")
print("Train accuracy: ", classifier_cancer.score(X_train_cancer, y_train_cancer))
print("Test accuracy: ", classifier_cancer.score(X_test_cancer, y_test_cancer))
print(classification_report(y_test_cancer, classifier_cancer.predict(X_test_cancer)))

print("Iris dataset")
print("Train accuracy: ", classifier_iris.score(X_train_iris, y_train_iris))
print("Test accuracy: ", classifier_iris.score(X_test_iris, y_test_iris))
print(classification_report(y_test_iris, classifier_iris.predict(X_test_iris)))

print("COMPAS dataset")
print("Train accuracy: ", classifier_compas.score(X_train_compas, y_train_compas))
print("Test accuracy: ", classifier_compas.score(X_test_compas, y_test_compas))
print(classification_report(y_test_compas, classifier_compas.predict(X_test_compas)))


Cancer dataset
Train accuracy:  0.9648351648351648
Test accuracy:  0.9736842105263158
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Iris dataset
Train accuracy:  0.9833333333333333
Test accuracy:  1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

COMPAS dataset
Train accuracy:  0.6800359712230216
Test accuracy:  0.6743350107836089
              precision    recall  f1-score   suppo

In [7]:
from mlp_to_qbaf_converter.utils import forward_pass_dataset, logistic


def input_output_unfaithfulness(model, sparse_model, local_dataset, example_weights):
    weights_original, biases_original = model
    weights_sparse, biases_sparse = sparse_model

    original_output = forward_pass_dataset(
            local_dataset[:, :-1],
            weights_original,
            biases_original,
            logistic,
        )[-1]
    
    sparse_output = forward_pass_dataset(
            local_dataset[:, :-1],
            weights_sparse,
            biases_sparse,
            logistic,
        )[-1]
    
    diff = sparse_output - original_output
    diff = np.power(diff, 2)
    total = np.sum(diff, axis=1) 
    total = np.multiply(total, example_weights)
    total = np.sum(diff)

    return total / np.sum(example_weights)

In [8]:
def structural_unfaithfulness(model, sparse_model, X_test, sp, example_num):

    weights_original, biases_original = model
    weights_sparse, biases_sparse = sparse_model
    preserve_percent = sp.preserve_percentage / 100
    original_activations = forward_pass_dataset(
        X_test,
        weights_original,
        biases_original,
        logistic,
    )
    sparse_activations = forward_pass_dataset(
        X_test,
        weights_sparse,
        biases_sparse,
        logistic,
    )

    structural_unfaithfulness = 0

    for i, original_activation in enumerate(original_activations):
        sparse_activation = sparse_activations[i]
        if i != 0:
            for cluster_label in range(int(len(biases_original[i - 1]) * preserve_percent)):
                if cluster_label in sp.cluster_labels[i - 1]:
                    structural_unfaithfulness += np.sum(np.abs(
                        np.mean(original_activation[example_num, sp.cluster_labels[i - 1] == cluster_label]) - 
                        sparse_activation[example_num, cluster_label]))
        else:
            structural_unfaithfulness += np.sum(
                np.abs(sparse_activation[example_num, :] - 
                        original_activation[example_num, :]))

    number_of_nodes = sum(sp.get_sparsified_shape()[1:])
    structural_unfaithfulness /= number_of_nodes 

    return structural_unfaithfulness


In [9]:
sparse_amounts = [20, 40, 60, 80]
row_names = ["20% (Original SpArX)", "20% (AAE SpArX)", "40% (Original SpArX)", "40% (AAE SpArX)", "60% (Original SpArX)", "60% (AAE SpArX)", "80% (Original SpArX)", "80% (AAE SpArX)"]
results_df_input_output = pd.DataFrame(columns=["Method", "Iris", "Cancer", "COMPAS"])
results_df_input_output["Method"] = row_names

results_df_structural = pd.DataFrame(columns=["Method", "Iris", "Cancer", "COMPAS"])
results_df_structural["Method"] = row_names

In [10]:
# Iris dataset results

train_set_full_iris = np.column_stack((X_train_iris, y_train_iris))
input_names_iris = iris.data.features.columns.tolist()
output_names_iris = endcoder_iris.classes_.tolist()
topic_arg_iris = output_names_iris[0]
kernel_size = np.sqrt(X_test_iris.shape[1]) * 0.75
model_iris_original = (classifier_iris.coefs_, classifier_iris.intercepts_)

sparse_amounts = [20, 40, 60, 80]

for sp_percent in tqdm(sparse_amounts):
    results_io_aae = []
    results_struct_aae = []
    results_io_original = []
    results_struct_original = []
    for example_row_num in range(len(X_test_iris)):
        example = X_test_iris[example_row_num]
        example_row = np.append(example, y_test_iris[example_row_num])

        sp_aae = sparx.LocalSpArX(classifier_iris.coefs_, classifier_iris.intercepts_, "logistic", sp_percent, example_row, train_set_full_iris, kernel_size, input_names_iris, output_names_iris, topic_arg_iris, sparx.ClusteringMethod.AAE_GRADIENT)
        s_weights, s_biases = sp_aae.get_sparsified_mlp()
        model_iris_sparse = (s_weights, s_biases)
        local_dataset = sp_aae.local_dataset
        example_weights = sp_aae.example_weights

        sp_original = sparx.LocalSpArX(classifier_iris.coefs_, classifier_iris.intercepts_, "logistic", sp_percent, example_row, train_set_full_iris, kernel_size)
        s_weights_original, s_biases_original = sp_original.get_sparsified_mlp()
        model_iris_sparse_original = (s_weights_original, s_biases_original)
        local_dataset_original = sp_original.local_dataset
        example_weights_original = sp_original.example_weights


        io_unfaithfulness_aae = input_output_unfaithfulness(model_iris_original, model_iris_sparse, local_dataset, example_weights)
        struct_unfaithfulness_aae = structural_unfaithfulness(model_iris_original, model_iris_sparse, X_test_iris, sp_aae, example_row_num)

        io_unfaithfulness_original = input_output_unfaithfulness(model_iris_original, model_iris_sparse_original, local_dataset_original, example_weights_original)
        struct_unfaithfulness_original = structural_unfaithfulness(model_iris_original, model_iris_sparse_original, X_test_iris, sp_original, example_row_num)

        results_io_aae.append(io_unfaithfulness_aae)
        results_struct_aae.append(struct_unfaithfulness_aae)

        results_io_original.append(io_unfaithfulness_original)
        results_struct_original.append(struct_unfaithfulness_original)

    
    sp_result_io = np.mean(results_io_aae)
    sp_result_struct = np.mean(results_struct_aae)
    sp_result_io_original = np.mean(results_io_original)
    sp_result_struct_original = np.mean(results_struct_original)

    results_df_input_output.loc[results_df_input_output["Method"] == f"{sp_percent}% (AAE SpArX)", "Iris"] = sp_result_io
    results_df_structural.loc[results_df_structural["Method"] == f"{sp_percent}% (AAE SpArX)", "Iris"] = sp_result_struct
    results_df_input_output.loc[results_df_input_output["Method"] == f"{sp_percent}% (Original SpArX)", "Iris"] = sp_result_io_original
    results_df_structural.loc[results_df_structural["Method"] == f"{sp_percent}% (Original SpArX)", "Iris"] = sp_result_struct_original


100%|██████████| 4/4 [02:06<00:00, 31.75s/it]


In [11]:
print(results_df_input_output)
print(results_df_structural)

                 Method      Iris Cancer COMPAS
0  20% (Original SpArX)  0.334683    NaN    NaN
1       20% (AAE SpArX)  0.463643    NaN    NaN
2  40% (Original SpArX)  0.704397    NaN    NaN
3       40% (AAE SpArX)   0.45708    NaN    NaN
4  60% (Original SpArX)  0.741685    NaN    NaN
5       60% (AAE SpArX)  0.407295    NaN    NaN
6  80% (Original SpArX)  1.116516    NaN    NaN
7       80% (AAE SpArX)  0.410521    NaN    NaN
                 Method      Iris Cancer COMPAS
0  20% (Original SpArX)  0.048017    NaN    NaN
1       20% (AAE SpArX)  0.054251    NaN    NaN
2  40% (Original SpArX)  0.076527    NaN    NaN
3       40% (AAE SpArX)  0.078695    NaN    NaN
4  60% (Original SpArX)  0.088782    NaN    NaN
5       60% (AAE SpArX)  0.099359    NaN    NaN
6  80% (Original SpArX)  0.075767    NaN    NaN
7       80% (AAE SpArX)  0.088701    NaN    NaN


In [12]:
# Cancer Dataset results

train_set_full_cancer = np.column_stack((X_train_cancer, y_train_cancer))
input_names_cancer = breast_cancer.data.features.columns.tolist()
output_names_cancer = breast_cancer.data.targets.columns.tolist()
topic_arg_cancer = "Diagnosis"
kernel_size = np.sqrt(X_test_cancer.shape[1]) * 0.75
model_cancer_original = (classifier_cancer.coefs_, classifier_cancer.intercepts_)


for sp_percent in tqdm(sparse_amounts):
    results_io_aae = []
    results_struct_aae = []
    results_io_original = []
    results_struct_original = []
    for example_row_num in range(len(X_test_cancer)):
        example = X_test_cancer[example_row_num]
        example_row = np.append(example, y_test_cancer[example_row_num])

        sp_aae = sparx.LocalSpArX(classifier_cancer.coefs_, classifier_cancer.intercepts_, "logistic", sp_percent, example_row, train_set_full_cancer, kernel_size, input_names_cancer, output_names_cancer, topic_arg_cancer, sparx.ClusteringMethod.AAE_GRADIENT)
        s_weights, s_biases = sp_aae.get_sparsified_mlp()
        model_cancer_sparse = (s_weights, s_biases)
        local_dataset = sp_aae.local_dataset
        example_weights = sp_aae.example_weights

        sp_original = sparx.LocalSpArX(classifier_cancer.coefs_, classifier_cancer.intercepts_, "logistic", sp_percent, example_row, train_set_full_cancer, kernel_size)
        s_weights_original, s_biases_original = sp_original.get_sparsified_mlp()
        model_cancer_sparse_original = (s_weights_original, s_biases_original)
        local_dataset_original = sp_original.local_dataset
        example_weights_original = sp_original.example_weights


        io_unfaithfulness_aae = input_output_unfaithfulness(model_cancer_original, model_cancer_sparse, local_dataset, example_weights)
        struct_unfaithfulness_aae = structural_unfaithfulness(model_cancer_original, model_cancer_sparse, X_test_cancer, sp_aae, example_row_num)

        io_unfaithfulness_original = input_output_unfaithfulness(model_cancer_original, model_cancer_sparse_original, local_dataset_original, example_weights_original)
        struct_unfaithfulness_original = structural_unfaithfulness(model_cancer_original, model_cancer_sparse_original, X_test_cancer, sp_original, example_row_num)

        results_io_aae.append(io_unfaithfulness_aae)
        results_struct_aae.append(struct_unfaithfulness_aae)

        results_io_original.append(io_unfaithfulness_original)
        results_struct_original.append(struct_unfaithfulness_original)
    
    sp_result_io = np.mean(results_io_aae)
    sp_result_struct = np.mean(results_struct_aae)

    sp_result_io_original = np.mean(results_io_original)
    sp_result_struct_original = np.mean(results_struct_original)

    results_df_input_output.loc[results_df_input_output["Method"] == f"{sp_percent}% (AAE SpArX)", "Cancer"] = sp_result_io
    results_df_structural.loc[results_df_structural["Method"] == f"{sp_percent}% (AAE SpArX)", "Cancer"] = sp_result_struct
    results_df_input_output.loc[results_df_input_output["Method"] == f"{sp_percent}% (Original SpArX)", "Cancer"] = sp_result_io_original
    results_df_structural.loc[results_df_structural["Method"] == f"{sp_percent}% (Original SpArX)", "Cancer"] = sp_result_struct_original


  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [09:37<00:00, 144.31s/it]


In [13]:
print(results_df_input_output)
print(results_df_structural)

                 Method      Iris    Cancer COMPAS
0  20% (Original SpArX)  0.334683  0.000259    NaN
1       20% (AAE SpArX)  0.463643  0.000002    NaN
2  40% (Original SpArX)  0.704397  0.001187    NaN
3       40% (AAE SpArX)   0.45708  0.000007    NaN
4  60% (Original SpArX)  0.741685  0.003463    NaN
5       60% (AAE SpArX)  0.407295  0.000014    NaN
6  80% (Original SpArX)  1.116516   0.00654    NaN
7       80% (AAE SpArX)  0.410521  0.000024    NaN
                 Method      Iris    Cancer COMPAS
0  20% (Original SpArX)  0.048017  0.001823    NaN
1       20% (AAE SpArX)  0.054251   0.00047    NaN
2  40% (Original SpArX)  0.076527  0.003985    NaN
3       40% (AAE SpArX)  0.078695  0.000887    NaN
4  60% (Original SpArX)  0.088782  0.006866    NaN
5       60% (AAE SpArX)  0.099359  0.001288    NaN
6  80% (Original SpArX)  0.075767  0.009491    NaN
7       80% (AAE SpArX)  0.088701  0.001672    NaN


In [14]:
# COMPAS Dataset results

train_set_full_compas = np.column_stack((X_train_compas, y_train_compas))
input_names_compas = compas.drop(columns=['two_year_recid']).columns.tolist()
output_names_compas = ['two_year_recid']
topic_arg_compas = "two_year_recid"
kernel_size = np.sqrt(X_test_compas.shape[1]) * 0.75
model_compas_original = (classifier_compas.coefs_, classifier_compas.intercepts_)

for sp_percent in tqdm(sparse_amounts):
    results_io_aae = []
    results_struct_aae = []
    results_io_original = []
    results_struct_original = []
    for example_row_num in range(len(X_test_compas)):
        example = X_test_compas[example_row_num]
        example_row = np.append(example, y_test_compas[example_row_num])

        sp_aae = sparx.LocalSpArX(classifier_compas.coefs_, classifier_compas.intercepts_, "logistic", sp_percent, example_row, train_set_full_compas, kernel_size, input_names_compas, output_names_compas, topic_arg_compas, sparx.ClusteringMethod.AAE_GRADIENT)
        s_weights, s_biases = sp_aae.get_sparsified_mlp()
        model_compas_sparse = (s_weights, s_biases)
        local_dataset = sp_aae.local_dataset
        example_weights = sp_aae.example_weights

        sp_original = sparx.LocalSpArX(classifier_compas.coefs_, classifier_compas.intercepts_, "logistic", sp_percent, example_row, train_set_full_compas, kernel_size)
        s_weights_original, s_biases_original = sp_original.get_sparsified_mlp()
        model_compas_sparse_original = (s_weights_original, s_biases_original)
        local_dataset_original = sp_original.local_dataset
        example_weights_original = sp_original.example_weights


        io_unfaithfulness_aae = input_output_unfaithfulness(model_compas_original, model_compas_sparse, local_dataset, example_weights)
        struct_unfaithfulness_aae = structural_unfaithfulness(model_compas_original, model_compas_sparse, X_test_compas, sp_aae, example_row_num)

        io_unfaithfulness_original = input_output_unfaithfulness(model_compas_original, model_compas_sparse_original, local_dataset_original, example_weights_original)
        struct_unfaithfulness_original = structural_unfaithfulness(model_compas_original, model_compas_sparse_original, X_test_compas, sp_original, example_row_num)

        results_io_aae.append(io_unfaithfulness_aae)
        results_struct_aae.append(struct_unfaithfulness_aae)

        results_io_original.append(io_unfaithfulness_original)
        results_struct_original.append(struct_unfaithfulness_original)
    
    sp_result_io = np.mean(results_io_aae)
    sp_result_struct = np.mean(results_struct_aae)

    sp_result_io_original = np.mean(results_io_original)
    sp_result_struct_original = np.mean(results_struct_original)

    results_df_input_output.loc[results_df_input_output["Method"] == f"{sp_percent}% (AAE SpArX)", "COMPAS"] = sp_result_io
    results_df_structural.loc[results_df_structural["Method"] == f"{sp_percent}% (AAE SpArX)", "COMPAS"] = sp_result_struct

    results_df_input_output.loc[results_df_input_output["Method"] == f"{sp_percent}% (Original SpArX)", "COMPAS"] = sp_result_io_original
    results_df_structural.loc[results_df_structural["Method"] == f"{sp_percent}% (Original SpArX)", "COMPAS"] = sp_result_struct_original

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [1:47:12<00:00, 1608.24s/it]


In [15]:
results_df_input_output

Unnamed: 0,Method,Iris,Cancer,COMPAS
0,20% (Original SpArX),0.334683,0.000259,0.001504
1,20% (AAE SpArX),0.463643,2e-06,0.0
2,40% (Original SpArX),0.704397,0.001187,0.005755
3,40% (AAE SpArX),0.45708,7e-06,0.0
4,60% (Original SpArX),0.741685,0.003463,0.011628
5,60% (AAE SpArX),0.407295,1.4e-05,0.0
6,80% (Original SpArX),1.116516,0.00654,0.018203
7,80% (AAE SpArX),0.410521,2.4e-05,1e-06


In [16]:
results_df_structural

Unnamed: 0,Method,Iris,Cancer,COMPAS
0,20% (Original SpArX),0.048017,0.001823,0.000795
1,20% (AAE SpArX),0.054251,0.00047,2.4e-05
2,40% (Original SpArX),0.076527,0.003985,0.001532
3,40% (AAE SpArX),0.078695,0.000887,3.2e-05
4,60% (Original SpArX),0.088782,0.006866,0.002161
5,60% (AAE SpArX),0.099359,0.001288,3.8e-05
6,80% (Original SpArX),0.075767,0.009491,0.002756
7,80% (AAE SpArX),0.088701,0.001672,6.9e-05
