In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [115]:
import matplotlib.pyplot as plt
from abc import ABC, abstractmethod 
import random
from numba import njit
from numba.experimental import jitclass
from numba import int32, float64
from sklearn import datasets
import mpl_toolkits.mplot3d 
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from cxplain.xkm import XkmExplainer
from cxplain.tree import  DecisionTreeExplainer, RandomForestExplainer, ExKMCExplainer
from cxplain.shap import  ShapExplainer
from cxplain.gradient import GradientExplainer  
from cxplain.metrics import EuclideanMetric, Metric, ManhattenMetric
from cxplain.neon import NeonKMeansExplainer
from cxplain.errors import NonExistingRelevanceError
from imputer import NormalCKDEImputer, EmpiricalRandomImputer, get_imputer
from datasets import IrisDataset, WineDataset, WholeSaleDataset, LiveSellersDataset, BuddyMoveDataset, SyntheticDataset

Data sets to be considered:
- https://archive-beta.ics.uci.edu/dataset/292/wholesale+customers
- https://archive.ics.uci.edu/ml/datasets/BuddyMove+Data+Set# --> no target labels --> excluded
- https://archive.ics.uci.edu/ml/datasets/Facebook+Live+Sellers+in+Thailand#
- https://archive.ics.uci.edu/ml/datasets/wine / https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine, included in scikit learn
- https://www.researchgate.net/publication/331616284_A_morphological_database_for_Colombian_anuran_species_from_conservation-priority_ecosystems
- https://archive.ics.uci.edu/ml/datasets/clickstream+data+for+online+shopping#
- https://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html

In [121]:
# Load and clean every benchmark data set once, keyed by a short name that is
# later used as the key of the per-data-set results.
# NOTE(review): this rebinds the name `datasets`, which the import cell bound to
# the `sklearn.datasets` module; that module is unreachable under this name afterwards.
datasets = dict(
    [
        ("iris", IrisDataset.load_and_clean_dataset()),
        ("wine", WineDataset.load_and_clean_dataset()),
        (
            "wholesale",
            WholeSaleDataset.load_and_clean_dataset("../data/Wholesale customers data.csv"),
        ),
        (
            "buddy",
            BuddyMoveDataset.load_and_clean_dataset(3, "../data/buddymove_holidayiq.csv"),
        ),
        (
            "synthetic",
            SyntheticDataset.load_and_clean_dataset(15, "../data/data_s1.txt"),
        ),
        (
            "live_sellers",
            LiveSellersDataset.load_and_clean_dataset("../data/Live_20210128.csv"),
        ),
    ]
)

In [124]:
# --- Experiment configuration -------------------------------------------------
only_global = True          # True: rank features by each explainer's global relevance for every observation
use_imputer = True          # True: fill the not-yet-revealed features via the imputer; False: leave them at 0.0
imputer_name = "empirical"  # key handed to get_imputer() to select the imputer class
n_trials = 20               # how often the whole evaluation is repeated per data set

# Result store: data set name -> {explainer name -> score}.
# Created here so the cell works on a fresh kernel and is idempotent on re-run.
dataset_results = {}

# NOTE(review): no RNG seed is set in this cell; if the selected imputer is
# stochastic the scores will vary between runs -- confirm and seed if needed.


def _relevance_order(relevance_scores):
    """Return the feature indices ordered by descending relevance score.

    A stable sort is used, so features with equal scores keep ascending index
    order. This is the order obtained by repeatedly taking the first maximum,
    but without overwriting used entries with a finite sentinel (which would
    mis-rank scores at or below that sentinel).

    Parameters
    ----------
    relevance_scores : iterable of numbers
        One relevance score per feature.

    Returns
    -------
    list of int
        Feature indices, most relevant first.
    """
    scores = np.asarray(list(relevance_scores), dtype=float)
    return np.argsort(-scores, kind="stable").tolist()


def _reveal_curve(X, index_obs, cluster_index, feature_order, cluster_centers, n_features, imputer=None):
    """Build the 0/1 curve of one observation.

    Features are revealed one at a time in ``feature_order``. After each step
    the partially revealed vector is assigned to its nearest cluster center
    (Euclidean norm). Steps before the first one whose assignment equals
    ``cluster_index`` are set to 0; that step and all later ones stay 1.

    Parameters
    ----------
    X : 2-d array, indexed as ``X[index_obs, feature_index]``
    index_obs : int
        Row of the observation in ``X``; also passed on to ``imputer.predict``.
    cluster_index : int
        Cluster predicted for this observation on the full feature vector.
    feature_order : sequence of int
        Feature indices in the order in which they are revealed.
    cluster_centers : iterable of 1-d arrays
    n_features : int
        Length of the returned curve.
    imputer : object with ``predict(feature_obs, index_obs)`` or None
        If None, unrevealed features stay at 0.0.

    Returns
    -------
    list of int
        Curve of length ``n_features`` containing only 0 and 1.
    """
    curve_obs = [1] * n_features
    # Values must sit at their original column position, otherwise the
    # distances to the cluster centers would be computed on permuted features.
    feature_obs = np.zeros(n_features)

    for step, feature_index in enumerate(feature_order[:n_features]):
        feature_obs.put(feature_index, X[index_obs, feature_index])

        # After the last feature has been revealed nothing is left to impute.
        if imputer is not None and step < (n_features - 1):
            feature_obs_imputed = imputer.predict(feature_obs, index_obs)
        else:
            feature_obs_imputed = feature_obs.copy()

        distances = [np.linalg.norm(feature_obs_imputed - center) for center in cluster_centers]
        nearest_cluster_index = distances.index(min(distances))

        if cluster_index == nearest_cluster_index:
            break  # original assignment recovered: remaining entries stay 1
        curve_obs[step] = 0

    return curve_obs


for dataset_name, dataset in datasets.items():
    print(dataset_name)

    n_clusters = dataset.n_clusters
    X = dataset.features
    y = dataset.targets  # not used below; kept because the name is cell-global
    n_obs = dataset.n_obs
    n_features = dataset.n_features

    # Fit k-means; its predictions are the reference assignment for all explainers.
    kmeans = KMeans(n_clusters=n_clusters, random_state=3).fit(X)
    cluster_centers = kmeans.cluster_centers_
    predictions = kmeans.predict(X)

    # All explainers under comparison, keyed by the name used in the results.
    explainers = {
        "tree": DecisionTreeExplainer(data=X, cluster_predictions=predictions),
        "forest": RandomForestExplainer(data=X, cluster_predictions=predictions),
        "exkmc": ExKMCExplainer(X, kmeans, k=n_clusters, max_leaves=2 * n_clusters),
        "gradient": GradientExplainer(X, cluster_centers, predictions, EuclideanMetric, enable_abs_calculation=False),
        "shap": ShapExplainer(data=X, cluster_predictions=predictions),
        "neon": NeonKMeansExplainer(cluster_centers=cluster_centers, data=X, predictions=predictions),
        "xkm_next_best": XkmExplainer(X, cluster_centers, "next_best", "euclidean", predictions),
        "xkm_all": XkmExplainer(X, cluster_centers, "all", "euclidean", predictions),
    }

    # Fit every explainer once; the explanations are reused in all trials.
    explanations = {explainer_name: explainer.fit_explain() for explainer_name, explainer in explainers.items()}

    # Per explainer: the curves of all observations of all trials.
    result_individual = {explainer_name: [] for explainer_name in explanations}
    imputer = get_imputer(imputer_name)(X).fit() if use_imputer else None

    for trial in range(n_trials):
        print(f"trial: {trial + 1}")
        for explainer_name, explanation in explanations.items():
            # The global ranking is the same for every observation, so compute it once.
            global_order = _relevance_order(explanation.global_relevance) if only_global else None

            curve_list = []
            for index_obs in range(n_obs):
                if only_global:
                    feature_order = global_order
                else:
                    # Explainers without pointwise scores fall back to their global scores.
                    try:
                        feature_order = _relevance_order(explanation.pointwise_relevance.iloc[index_obs, :])
                    except NonExistingRelevanceError:
                        feature_order = _relevance_order(explanation.global_relevance)

                curve_list.append(
                    _reveal_curve(
                        X,
                        index_obs,
                        predictions[index_obs],
                        feature_order,
                        cluster_centers,
                        n_features,
                        imputer=imputer,
                    )
                )

            result_individual[explainer_name].extend(curve_list)

    # Score per explainer: mean of all curve entries over observations, features and trials.
    result_auc = {
        explainer_name: (1 / (n_obs * n_features * n_trials)) * sum(map(sum, curves))
        for explainer_name, curves in result_individual.items()
    }

    dataset_results[dataset_name] = result_auc

iris
trial: 1
trial: 2
trial: 3
trial: 4
trial: 5
trial: 6
trial: 7
trial: 8
trial: 9
trial: 10
trial: 11
trial: 12
trial: 13
trial: 14
trial: 15
trial: 16
trial: 17
trial: 18
trial: 19
trial: 20
wine
trial: 1
trial: 2
trial: 3
trial: 4
trial: 5
trial: 6
trial: 7
trial: 8
trial: 9
trial: 10
trial: 11
trial: 12
trial: 13
trial: 14
trial: 15
trial: 16
trial: 17
trial: 18
trial: 19
trial: 20
wholesale
trial: 1
trial: 2
trial: 3
trial: 4
trial: 5
trial: 6
trial: 7
trial: 8
trial: 9
trial: 10
trial: 11
trial: 12
trial: 13
trial: 14
trial: 15
trial: 16
trial: 17
trial: 18
trial: 19
trial: 20
buddy
trial: 1
trial: 2
trial: 3
trial: 4
trial: 5
trial: 6
trial: 7
trial: 8
trial: 9
trial: 10
trial: 11
trial: 12
trial: 13
trial: 14
trial: 15
trial: 16
trial: 17
trial: 18
trial: 19
trial: 20
synthetic
trial: 1
trial: 2
trial: 3
trial: 4
trial: 5
trial: 6
trial: 7
trial: 8
trial: 9
trial: 10
trial: 11
trial: 12
trial: 13
trial: 14
trial: 15
trial: 16
trial: 17
trial: 18
trial: 19
trial: 20
live_sell

In [123]:
# Display the collected scores: data set name -> {explainer name -> score}.
dataset_results

{'iris': {'tree': 0.8262499999999999,
  'forest': 0.9472499999999999,
  'exkmc': 0.8336666666666667,
  'gradient': 0.8019166666666666,
  'shap': 0.9486666666666667,
  'neon': 0.7834166666666667,
  'xkm_next_best': 0.8650833333333333,
  'xkm_all': 0.8718333333333333},
 'wine': {'tree': 0.9346585998271392,
  'forest': 0.9357173725151253,
  'exkmc': 0.8927830596369922,
  'gradient': 0.9031114952463267,
  'shap': 0.9550561797752809,
  'neon': 0.8845505617977528,
  'xkm_next_best': 0.9285220397579947,
  'xkm_all': 0.9226231633535004},
 'wholesale': {'tree': 0.9460037878787879,
  'forest': 0.9478030303030303,
  'exkmc': 0.9132386363636363,
  'gradient': 0.8964015151515151,
  'shap': 0.9682954545454545,
  'neon': 0.9400946969696969,
  'xkm_next_best': 0.9323484848484849,
  'xkm_all': 0.9004545454545455},
 'live_sellers': {'tree': 0.9459708431836091,
  'forest': 0.9523333333333334,
  'exkmc': 0.9457391646966116,
  'gradient': 0.9308242710795902,
  'shap': 0.9645949566587865,
  'neon': 0.927542