## Explainable k-Medoids

In [1]:
import matplotlib.pyplot as plt
from sklearn import datasets
import mpl_toolkits.mplot3d 
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import pandas as pd

In [2]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [3]:
kmeans = KMeans(n_clusters=4, random_state=3).fit(X)

In [4]:
preds = kmeans.predict(X)
print(preds)
print(preds.shape)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 3 0 3 0 3 0 3 3 3 3 0 3 0 3 3 0 3 0 3 0 0
 0 0 0 0 0 3 3 3 3 0 3 0 0 0 3 3 3 0 3 3 3 3 3 0 3 3 2 0 2 0 2 2 3 2 2 2 0
 0 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 0 2 2 0 0 2 2 2 0 2 2 2 0 0
 2 0]
(150,)


In [5]:
centers = kmeans.cluster_centers_
print(centers)
print(centers.shape)

[[6.25714286 2.86190476 4.85       1.63333333]
 [5.006      3.428      1.462      0.246     ]
 [6.95       3.10666667 5.86666667 2.15333333]
 [5.53214286 2.63571429 3.96071429 1.22857143]]
(4, 4)


In [6]:
# one set of clusters for every obs - Idk. how to broadcast this to work with the later distance function
centers_lst = []

for i in preds:
    centers_lst.append(np.array(centers))

In [7]:
c = np.array(centers_lst)

In [8]:
#Check obs
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [9]:
#check cluster
c[0]

array([[6.25714286, 2.86190476, 4.85      , 1.63333333],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.95      , 3.10666667, 5.86666667, 2.15333333],
       [5.53214286, 2.63571429, 3.96071429, 1.22857143]])

In [10]:
#check point-wise Manhatten distance
abs(c[0] - X[0])

array([[1.15714286, 0.63809524, 3.45      , 1.43333333],
       [0.094     , 0.072     , 0.062     , 0.046     ],
       [1.85      , 0.39333333, 4.46666667, 1.95333333],
       [0.43214286, 0.86428571, 2.56071429, 1.02857143]])

In [11]:
#check point-wise euclidean distance
(c[0] - X[0])**2

array([[1.33897959e+00, 4.07165533e-01, 1.19025000e+01, 2.05444444e+00],
       [8.83600000e-03, 5.18400000e-03, 3.84400000e-03, 2.11600000e-03],
       [3.42250000e+00, 1.54711111e-01, 1.99511111e+01, 3.81551111e+00],
       [1.86747449e-01, 7.46989796e-01, 6.55725765e+00, 1.05795918e+00]])

In [12]:
def calc_feature_wise_distance_matrix(distance_metric, points, cluster_centers):
    
    # calculate the distance of every feature value of ever obs to every feature value in every cluster.
    
    feature_wise_distance_matrix = []
    
    if distance_metric == "manhattan":
        for i,e in enumerate(X):
            feature_wise_distance_matrix.append(abs(c[i] - e))
        
    
    if distance_metric == "euclidean":
        for i,e in enumerate(X):
            feature_wise_distance_matrix.append((c[i] - e)**2)
    
    return np.array(feature_wise_distance_matrix)

In [13]:
A =calc_feature_wise_distance_matrix("manhattan", X, c)

In [18]:
A[0]

array([[1.15714286, 0.63809524, 3.45      , 1.43333333],
       [0.094     , 0.072     , 0.062     , 0.046     ],
       [1.85      , 0.39333333, 4.46666667, 1.95333333],
       [0.43214286, 0.86428571, 2.56071429, 1.02857143]])

In [17]:
A[0][:,1]

array([0.63809524, 0.072     , 0.39333333, 0.86428571])

In [37]:
# Falsch!
def best_second_calc(feature_based_distanes, preds):
    
    num_features = feature_based_distanes.shape[2]
    
    assinged_cluster_list = []
    fb_distance_to_assinged_cluster_list = []
    
    best_alterantive_list = []
    fb_distance_to_best_alternative_list = []
    
    #for every obs:
    for idx, e in enumerate(feature_based_distanes):
        #index of assinged cluster
        assigned_cluster = preds[idx]
        #feature-wise distances of point to assigned cluster
        distances_to_assigned = e[:,assigned_cluster]
        
        assinged_cluster_list.append(assigned_cluster)
        fb_distance_to_assinged_cluster_list.append(distances_to_assigned)
        
        #find best alternative:
        
        temp_bad = []
        temp_idx = []
        
        #for every feature
        for i in range(num_features):
            
            
            # best alternative: 
            best_alternative_distance = min(e[:,i])
            x = e[:,i].tolist()
            idx_best_alternative = x.index(best_alternative_distance)
            
            
            #if the best alternative is the assigned cluster, we have to find the second best alternative
            if idx_best_alternative == assigned_cluster:
                
                del x[idx_best_alternative]
                best_alternative_distance = min(x)
                idx_best_alternative = x.index(best_alternative_distance)
                
            temp_bad.append(best_alternative_distance)
            temp_idx.append(idx_best_alternative)
            
        best_alterantive_list.append(temp_idx)
        fb_distance_to_best_alternative_list.append(temp_bad)     
            
    return np.array(assinged_cluster_list), np.array(fb_distance_to_assinged_cluster_list), np.array(best_alterantive_list), np.array(fb_distance_to_best_alternative_list)

In [38]:
ac ,fb_ac ,ba, fb_ba = best_second_calc(A, preds)

## Relevance of feature j for point i:

$$R_{ji} := R_j(x_{ij},c_{kj},c_{k'_{ij}}) := \frac{d_j(x_{ij},c_{k'_{ij}}) - d_j(x_{ij},c_{kj})}{d_j(x_{ij},c_{k'_{ij}}) + d_j(x_{ij},c_{kj})}$$

$d_j(x_{ij},c_{k'_{ij}}$ is the distance to the best alternative.

$d_j(x_{ij},c_{kj})$ is the distance to the associated cluster.

In [41]:
# broadcasted: for every i, for every j
R = (fb_ba - fb_ac) / (fb_ba + fb_ac)

In [42]:
R

array([[-0.19243604,  0.69054441,  0.73369872,  0.08679245],
       [ 0.64142195, -0.60099751,  0.92002142,  0.47692308],
       [ 0.42217701, -0.41908714,  0.93222097,  0.29147982],
       [ 0.5930824 , -0.96015936,  0.99459616,  0.37799043],
       [-0.16213683,  0.48296593,  0.67693147,  0.03225806],
       [-0.77416073,  0.25395153,  0.48047088, -0.20819113],
       [ 0.26801619,  0.82572614,  0.79444398,  0.09704641],
       [-0.00556174,  0.82572614,  0.7869802 ,  0.14741036],
       [ 0.93489318, -0.8654105 ,  0.85064097,  0.59116022],
       [ 0.45280438, -0.96015936,  0.99459616,  0.41704036],
       [-0.72760736,  0.37134052,  0.61144461, -0.01706485],
       [ 0.15276476,  0.82572614,  0.77895398,  0.14741036],
       [ 0.68262654, -0.60099751,  0.92002142,  0.51196172],
       [ 0.79843614, -0.60099751,  0.9281072 ,  0.51196172],
       [-0.61896698,  0.21929026,  0.5110445 , -0.14029851],
       [-0.80321005,  0.14184815,  0.31096594, -0.36088154],
       [-0.77416073,  0.