## Explainable k-Medoids

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
from sklearn import datasets
import mpl_toolkits.mplot3d 
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import pandas as pd
from cxplain.xkm import XkmExplainer

In [17]:
class XkmOld:

    "eXplainable k-medoids"
    
    def __init__(self, data, distance_metric, model):
        self.distance_metric = distance_metric
        self.cluster_centers = model.cluster_centers_  
        self.X = data
        self.model = model
        self.preds = model.predict(X)
        
    ############################################################################################################
    
    def calc_feature_wise_distance_matrix(self):
         
        centers_lst = []

        for i in X:
            centers_lst.append(np.array(self.cluster_centers))
    
        c = np.array(centers_lst)

        # calculate the distance of every feature value of ever obs to every feature value in every cluster.
    
        feature_wise_distance_matrix = []
    
        if self.distance_metric == "manhattan":
            for i,e in enumerate(self.X):
                feature_wise_distance_matrix.append(abs(c[i] - e))
        
    
        if self.distance_metric == "euclidean":
            for i,e in enumerate(self.X):
                feature_wise_distance_matrix.append((c[i] - e)**2)
                
    
        self.feature_wise_distance_matrix = np.array(feature_wise_distance_matrix)

    
    #############################################################################################################
    
    def best_calc(self):
    
        num_features = self.feature_wise_distance_matrix.shape[2]
    
        assinged_cluster_list = []
        fb_distance_to_assinged_cluster_list = []
    
        best_alterantive_list = []
        fb_distance_to_best_alternative_list = []
    
        #for every obs:
        for idx, e in enumerate(self.feature_wise_distance_matrix):
            #index of assinged cluster
            assigned_cluster = self.preds[idx]
            #feature-wise distances of point to assigned cluster
            distances_to_assigned = e[assigned_cluster]
        
            assinged_cluster_list.append(assigned_cluster)
            fb_distance_to_assinged_cluster_list.append(distances_to_assigned)
        
            #find best alternative:
        
            temp_bad = []
            temp_idx = []
        
            #for every feature
            for i in range(num_features):
            
            
                # best alternative: 
                best_alternative_distance = min(e[:,i])
                x = e[:,i].tolist()
                idx_best_alternative = x.index(best_alternative_distance)
            
            
                #if the best alternative is the assigned cluster, we have to find the second best alternative
                if idx_best_alternative == assigned_cluster:
                
                    del x[idx_best_alternative]
                    best_alternative_distance = min(x)
                    idx_best_alternative = x.index(best_alternative_distance)
                    
                temp_bad.append(best_alternative_distance)
                temp_idx.append(idx_best_alternative)

            best_alterantive_list.append(temp_idx)
            fb_distance_to_best_alternative_list.append(temp_bad)     
            
        self.ac ,self.fb_ac ,self.ba, self.fb_ba = np.array(assinged_cluster_list), np.array(fb_distance_to_assinged_cluster_list), np.array(best_alterantive_list), np.array(fb_distance_to_best_alternative_list)
    
    def calc_R_points(self):
        self.R_points = (self.fb_ba - self.fb_ac) / (self.fb_ba + self.fb_ac) 

    def calc_cluster_relevance(self):
        df_c = pd.DataFrame(self.R_points)
        df_c.rename({0:"R1", 1: "R2", 2: "R3", 3: "R4"}, axis=1, inplace = True)
        df_c["assigned_clusters"] = self.preds
        self.R_clusters = df_c.groupby(["assigned_clusters"]).mean()

    def calc_R_global(self):
        self.R_global = {"R_global_" + str(i) : np.sum(self.R_points[:,i]) / len(self.R_points) for i in range(self.R_points.shape[1])} 
        
    def explain(self):
        self.calc_feature_wise_distance_matrix()
        self.best_calc()
        self.calc_R_points()
        self.calc_R_global()
        self.calc_cluster_relevance()

In [3]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [4]:
data = np.array([[1, 2, 1, 2],
               [2, 3, 2, 3],
               [2, 2, 1, 2]])

In [5]:
data.shape

(3, 4)

In [6]:
kmeans = KMeans(n_clusters=2, random_state=3).fit(data)

In [7]:
kmeans.cluster_centers_

array([[1.5, 2. , 1. , 2. ],
       [2. , 3. , 2. , 3. ]])

In [8]:
kmeans.predict(data)

array([0, 1, 0])

In [9]:
kmeans = KMeans(n_clusters=3, random_state=3).fit(X)
cluster_centers = kmeans.cluster_centers_

In [38]:
predictions = kmeans.predict(X)
predictions[3]

1

In [62]:
F_old = XkmOld(data, "euclidean", kmeans)
F_new = XkmExplainer(X,  kmeans.cluster_centers_, "next_best", "euclidean", predictions, ["A", "B", "C", "D"])

In [63]:
F_new.fit()

<cxplain.xkm.XkmExplainer at 0x1f987589fd0>

In [64]:
explained = F_new.explain()

In [66]:
explained.pointwise_relevance

Unnamed: 0,A,B,C,D
0,0.972872,0.944535,0.999142,0.997224
1,0.977848,-0.942429,0.999142,0.997224
2,0.878198,-0.530299,0.994530,0.997224
3,0.822665,-0.987208,0.999655,0.997224
4,0.999911,0.807014,0.999142,0.997224
...,...,...,...,...
145,0.931810,0.842028,0.377537,0.869382
146,0.311760,0.684265,0.199168,-0.762638
147,0.490186,0.842028,0.377537,0.968985
148,-0.651891,-0.985382,0.792853,0.869382


In [27]:
F_old.R_global

AttributeError: 'XkmOld' object has no attribute 'R_global'

In [68]:
sum(explained.cluster_relevance.values != F_old.R_clusters.values)

array([0, 0, 0, 0])

In [42]:
sum(F_old.feature_wise_distance_matrix != F_new._calculate_feature_wise_distance_matrix())

array([[0, 0, 0, 0],
       [0, 0, 0, 0],
       [0, 0, 0, 0]])

In [39]:
F_new._calculate_feature_wise_distance_matrix()

array([[[6.42583247e-01, 5.64921956e-01, 8.96133195e+00, 1.52243757e+00],
        [8.83600000e-03, 5.18400000e-03, 3.84400000e-03, 2.11600000e-03],
        [3.06250000e+00, 1.81745152e-01, 1.88538781e+01, 3.50083795e+00]],

       [[1.00322841e+00, 6.33090531e-02, 8.96133195e+00, 1.52243757e+00],
        [1.12360000e-02, 1.83184000e-01, 3.84400000e-03, 2.11600000e-03],
        [3.80250000e+00, 5.42936288e-03, 1.88538781e+01, 3.50083795e+00]],

       [[1.44387357e+00, 2.03954214e-01, 9.57004162e+00, 1.52243757e+00],
        [9.36360000e-02, 5.19840000e-02, 2.62440000e-02, 2.11600000e-03],
        [4.62250000e+00, 1.59556787e-02, 1.97322992e+01, 3.50083795e+00]],

       ...,

       [[3.58067118e-01, 6.33090531e-02, 6.50364204e-01, 3.20502081e-01],
        [2.23203600e+00, 1.83184000e-01, 1.39726440e+01, 3.07651600e+00],
        [1.22500000e-01, 5.42936288e-03, 2.93878116e-01, 5.04847645e-03]],

       [[8.90348595e-02, 4.24599376e-01, 1.01294485e+00, 7.50179501e-01],
        [1.425636

In [21]:
distance_df = pd.DataFrame(F.feature_wise_distance_matrix[1], columns = cols)
distance_df

Unnamed: 0,1,2,3,4
0,1.003228,0.063309,8.961332,1.522438
1,0.011236,0.183184,0.003844,0.002116
2,3.8025,0.005429,18.853878,3.500838


In [31]:
clusters = [cluster_centers for obs_coordinates in X]

In [None]:
def _calculate_pointwise_relevance(cls, feature_wise_distance_matrix: NDArray[Shape["* num_obs, * num_clusters, * num_features"], Floating],
                                            cluster_predictions:  NDArray[Shape["* num_obs"], Int]) -> pd.DataFrame:
        # sum up distances over cluster
        complete_distances = np.sum(feature_wise_distance_matrix, axis=1)
        # get distance to actual assigned cluster for every observation and feature
        relevant_rows = [feature_wise_distance_matrix[cluster_predictions[i], i, :] 
                         for i in range(feature_wise_distance_matrix.shape[1])]
        actual_distances = np.vstack(relevant_rows) # TODO: make own utility function as also used in shap
        # calculate relevance
        n_clusters = feature_wise_distance_matrix.shape[1]
        pointwise_scores = ((complete_distances - n_clusters * actual_distances) /
                            complete_distances)
        return pd.DataFrame(pointwise_scores)