In [1]:
import matplotlib.pyplot as plt
from sklearn import datasets
import mpl_toolkits.mplot3d 
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import pandas as pd
from cxplain.xkm import Xkm
from cxplain.tree import  DecisionTreeExplainer, RandomForestExplainer, ExKMCExplainer
from cxplain.shap import  ShapExplainer
from cxplain.gradient import GradientExplainer  
from cxplain.metrics import EuclideanMetric, Metric, ManhattenMetric
from cxplain.neon import NeonKMeansExplainer
from cxplain.errors import NonExistingRelevanceError

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
class NormalCKDE:
    def __init__(self, data):
        self.epsilon = 0.001
        self.stopping_threshold = 100
        self.data = data
        self.variance = None
        self.n_obs = self.data.shape[0]
        self.n_features = self.data.shape[1]
        
    def fit(self):
        counter = 0
        variance = 1
        variance_list =[variance]
        
        while not self._is_converged(variance_list):
            variance_old = variance_list[-1]
            variance_new = self._update_variance(variance_old)
            variance_list.append(variance_new)
            counter += 1
            print(f"counter: {counter}")
            if counter >= self.stopping_threshold:
                print(f"No convergence after {self.stopping_threshold} steps!")
                break
                
        self.variance = variance_list[-1]
        
    def _update_variance(self, variance_old):
        update_weight = 1 / (self.n_obs * self.n_features)
        observation_list = []
        
        for obs_index in range(self.n_obs):
            base_obs = self.data[obs_index, :]
            nominator = sum([np.exp(np.linalg.norm(base_obs - self.data[i, :]) / (2 * variance_old))
                             * np.linalg.norm(base_obs - self.data[i, :]) 
                             for i in range(self.n_obs)
                             if i != obs_index])
            denominator = sum([np.exp(np.linalg.norm(base_obs - self.data[i, :]) / (2 * variance_old))
                               for i in range(self.n_obs)
                               if i != obs_index])
            observation_list.append(nominator / denominator)
            
        return update_weight * sum(observation_list)
    
    def _is_converged(self, variance_list):
        if len(variance_list) < 5:
            return False
        considered_elements = variance_list[-5:]
        differences = np.diff(considered_elements)
        return np.sum(differences >= self.epsilon) == 0
        
        

In [16]:
imputer = NormalCKDE(X)

In [17]:
imputer.fit()

counter: 1
counter: 2
counter: 3
counter: 4
counter: 5
counter: 6
counter: 7
counter: 8


In [18]:
imputer.variance

0.9290669531414758

In [None]:
dataset_index = {"iris": }
datasets = {"iris": }

First I only use the iris data set for evaluation

In [7]:
n_clusters = 3
iris = datasets.load_iris()
X = iris.data
y = iris.target
n_obs = X.shape[0]
n_features = X.shape[1]
only_global = True

In [122]:
# fit Kmeans
kmeans = KMeans(n_clusters=n_clusters, random_state=3).fit(X)
cluster_centers = kmeans.cluster_centers_
predictions = kmeans.predict(X)

In [123]:
# init and fit explainer
# list allexplainers
explainers = {"tree": DecisionTreeExplainer(data= X, cluster_predictions=predictions),
             "forest": RandomForestExplainer(data= X, cluster_predictions=predictions),
             "exkmc": ExKMCExplainer(X, kmeans, k=n_clusters, max_leaves=2*n_clusters),
             "gradient": GradientExplainer(X, cluster_centers, predictions, EuclideanMetric, enable_abs_calculation=False),
             "shap": ShapExplainer(data= X, cluster_predictions=predictions),
             "neon": NeonKMeansExplainer(cluster_centers=cluster_centers, data=X, predictions=predictions),
             "xkm": Xkm(data,  kmeans.cluster_centers_, "euclidean", predictions)}

# fit and explain all explainers

explanations = {explainer_name:explainer.fit_explain() for explainer_name, explainer  in explainers.items()}


In [124]:
# first calculate all ROC curves for individual observations
result_individual = {}
for explainer_name, explanation in explanations.items():
    # init curve_list
    curve_list = []
    for index_obs in range(n_obs):
        # init list curve_obs_i to all 1 (length = num_features)
        curve_obs = [1 for i in range(n_features)]
        # init array of feature observations, I use an array instead of a list, as it is easier  later on to calculate distances to cluster centers
        feature_obs = np.array([0.0 for i in range(n_features)])
        # get relevance scores for observation, for explainers with only global scores, these will be used for every observation
        if only_global:
            relevance_scores = list(explanations[explainer_name].global_relevance)
        else:
            try:
                relevance_scores = list(explanations[explainer_name].pointwise_relevance.iloc[index_obs, :])
            except NonExistingRelevanceError:
                relevance_scores = list(explanations[explainer_name].global_relevance)
        
        for feature_index in range(n_features):
            # get biggest score and column index (indicate which feature is meant) and pop from list
            index_biggest_score = relevance_scores.index(max(relevance_scores))                
            relevance_scores[index_biggest_score] = -100 # I set to large negative number as popping would ruin the index correspondence from relevance score to feature
            # get observation for this feature
            obs_biggest_score = X[index_obs, index_biggest_score]
            # get corresponding cluster index for this observation
            cluster_index = predictions[index_obs]
            # add observation for feature to feature observations list
            feature_obs.put(index_biggest_score, obs_biggest_score) # has to be at index of feature in training data, as otherwise distance calculation is wrong
            # impute other entries (length = num_features) --> TBD
            # calculate distance to cluster centers for feature observations list
            distances = [np.linalg.norm(feature_obs - center) for center in cluster_centers]
            # get nearest_cluster_index
            nearest_cluster_index = distances.index(min(distances))
            # check whether cluster_index == nearest_cluster_index
            # if yes: return curve_obs_i
            # if no: replace first entry of curve_obs_i ith 0 and repeat
            if cluster_index == nearest_cluster_index:
                break
            else:
                curve_obs[feature_index] = 0
            # if yes: return curve_obs_i
            # if no: replace first entry of curve_obs_i ith 0 and repeat
            
        curve_list.append(curve_obs)
        
    # add explainer entry to dict
    result_individual[explainer_name] = curve_list
      
# Now compute AUC
result_auc = {explainer_name: (1 /(n_obs*n_features)) * sum(map(sum, curves)) for explainer_name, curves in result_individual.items()}

In [125]:
result_auc

{'tree': 0.7366666666666667,
 'forest': 0.8016666666666667,
 'exkmc': 0.7533333333333334,
 'gradient': 0.7533333333333334,
 'shap': 0.8016666666666667,
 'neon': 0.6333333333333334,
 'xkm': 0.8016666666666667}