## Explainable k-Medoids

In [None]:
import matplotlib.pyplot as plt
from sklearn import datasets
import mpl_toolkits.mplot3d 
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import pandas as pd

In [115]:
class Xkm:
    """eXplainable k-medoids: per-feature relevance scores for a fitted clustering.

    For every observation, the feature-wise distance to its assigned cluster
    centre is compared with the feature-wise distance to the best alternative
    centre (the closest centre, for that feature, other than the assigned
    one).  The score ``R = (d_alt - d_assigned) / (d_alt + d_assigned)`` is
    stored per observation (``R_points``), averaged per assigned cluster
    (``R_clusters``) and averaged over all observations (``R_global``).

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to explain.
    distance_metric : {"manhattan", "euclidean"}
        "manhattan" uses the absolute per-feature difference, "euclidean"
        the squared per-feature difference.
    model : object
        Fitted clustering model exposing ``cluster_centers_`` and
        ``predict(data)``.
    """

    def __init__(self, data, distance_metric, model):
        self.distance_metric = distance_metric
        self.cluster_centers = model.cluster_centers_
        self.X = data
        self.model = model
        # Predict on the data handed to the constructor, never on a
        # notebook-level global that happens to be called ``X``.
        self.preds = model.predict(data)

    def calc_feature_wise_distance_matrix(self):
        """Compute the per-feature distance of every observation to every centre.

        Sets ``self.feature_wise_distance_matrix`` with shape
        (n_samples, n_clusters, n_features).

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither "manhattan" nor "euclidean".
        """
        points = np.asarray(self.X)
        centers = np.asarray(self.cluster_centers)

        # Broadcasting pairs every observation with every centre:
        # (1, n_clusters, n_features) - (n_samples, 1, n_features).
        diff = centers[np.newaxis, :, :] - points[:, np.newaxis, :]

        if self.distance_metric == "manhattan":
            self.feature_wise_distance_matrix = np.abs(diff)
        elif self.distance_metric == "euclidean":
            self.feature_wise_distance_matrix = diff ** 2
        else:
            # Fail here with a clear message instead of producing an empty
            # matrix that breaks later steps in a confusing way.
            raise ValueError(
                "distance_metric must be 'manhattan' or 'euclidean', got "
                + repr(self.distance_metric)
            )

    def best_calc(self):
        """Find, per observation and feature, the assigned and best alternative cluster.

        Sets four arrays:

        * ``self.ac``    -- assigned cluster index, shape (n_samples,)
        * ``self.fb_ac`` -- feature-wise distances to the assigned cluster,
          shape (n_samples, n_features)
        * ``self.ba``    -- index of the best alternative cluster per feature,
          shape (n_samples, n_features)
        * ``self.fb_ba`` -- feature-wise distances to that best alternative,
          shape (n_samples, n_features)

        Raises
        ------
        ValueError
            If fewer than two clusters exist, so no alternative is available.
        """
        distances = self.feature_wise_distance_matrix
        n_samples, n_clusters, _ = distances.shape
        if n_clusters < 2:
            raise ValueError(
                "at least two clusters are required to find a best alternative"
            )

        labels = np.array(self.preds)
        rows = np.arange(n_samples)

        distances_to_assigned = distances[rows, labels]

        # Exclude the assigned cluster by masking it with +inf rather than
        # deleting it, so the argmin is an index into the ORIGINAL cluster
        # numbering (deleting shifts every later index by one).
        masked = distances.astype(float)
        masked[rows, labels] = np.inf
        best_alternative = masked.argmin(axis=1)

        # Read the distances from the unmasked matrix to keep its dtype.
        distances_to_best_alternative = np.take_along_axis(
            distances, best_alternative[:, np.newaxis, :], axis=1
        )[:, 0, :]

        self.ac = labels
        self.fb_ac = distances_to_assigned
        self.ba = best_alternative
        self.fb_ba = distances_to_best_alternative

    def calc_R_points(self):
        """Compute the per-observation, per-feature relevance ``self.R_points``.

        ``R = (fb_ba - fb_ac) / (fb_ba + fb_ac)``.  When both distances are
        zero the feature does not favour either cluster, so R is defined as 0
        instead of producing NaN from 0/0.
        """
        numerator = self.fb_ba - self.fb_ac
        denominator = self.fb_ba + self.fb_ac
        self.R_points = np.divide(
            numerator,
            denominator,
            out=np.zeros(denominator.shape, dtype=float),
            where=denominator != 0,
        )

    def calc_cluster_relevance(self):
        """Average ``R_points`` per assigned cluster into ``self.R_clusters``.

        The result is a DataFrame indexed by ``assigned_clusters`` with one
        column per feature, named ``R1`` ... ``Rn``.
        """
        n_features = self.R_points.shape[1]
        df_c = pd.DataFrame(
            self.R_points,
            columns=["R" + str(i + 1) for i in range(n_features)],
        )
        df_c["assigned_clusters"] = self.preds
        self.R_clusters = df_c.groupby(["assigned_clusters"]).mean()

    def calc_R_global(self):
        """Average ``R_points`` over all observations into ``self.R_global``.

        The result is a dict mapping ``"R_global_<feature index>"`` to the
        mean relevance of that feature.
        """
        n_points = len(self.R_points)
        self.R_global = {
            "R_global_" + str(i): np.sum(self.R_points[:, i]) / n_points
            for i in range(self.R_points.shape[1])
        }

    def explain(self):
        """Run the full pipeline: distances, alternatives, then all R scores."""
        self.calc_feature_wise_distance_matrix()
        self.best_calc()
        self.calc_R_points()
        self.calc_R_global()
        self.calc_cluster_relevance()

In [116]:
# Load the Iris dataset; X takes its `data` attribute and y its `target` attribute.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [117]:
# Fit a 3-cluster KMeans model on X; random_state is fixed at 3.
kmeans = KMeans(n_clusters=3, random_state=3).fit(X)

In [118]:
# Build the explainer for the fitted model, using squared per-feature differences.
F = Xkm(data=X, distance_metric="euclidean", model=kmeans)


In [119]:
# Run the whole pipeline; this populates F.R_points, F.R_global and F.R_clusters.
F.explain()

In [120]:
# Display the mean relevance of each feature (columns R1..R4) per assigned cluster (rows).
F.R_clusters

Unnamed: 0_level_0,R1,R2,R3,R4
assigned_clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.328449,0.246249,0.711564,0.578215
1,0.68629,0.221374,0.992866,0.980448
2,0.537812,0.084899,0.754957,0.621068
