## Explainable k-Medoids

In [1]:
import matplotlib.pyplot as plt
from sklearn import datasets
import mpl_toolkits.mplot3d 
from sklearn.cluster import KMeans
from sklearn import metrics
import numpy as np
import pandas as pd

In [2]:
iris = datasets.load_iris()
X = iris.data
y = iris.target

In [3]:
kmeans = KMeans(n_clusters=3, random_state=3).fit(X)

In [4]:
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [5]:
preds = kmeans.predict(X)
print(preds)
print(preds.shape)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 2 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
(150,)


In [6]:
# prediction is "false" but only because the "label" is not correct, the class itself is correctly predicted. Maybe this needs to be solved later on differently.

lst = []

for i in preds:
    if i == 1: 
        lst.append(0)
    elif i == 0:
        lst.append(1)
    else:
        lst.append(i)
        
preds = np.array(lst)

In [7]:
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 1])

In [8]:
metrics.accuracy_score(y,preds)

0.8933333333333333

In [9]:
centers = kmeans.cluster_centers_
print(centers)
print(centers.shape)

[[5.9016129  2.7483871  4.39354839 1.43387097]
 [5.006      3.428      1.462      0.246     ]
 [6.85       3.07368421 5.74210526 2.07105263]]
(3, 4)


In [10]:
# one set of clusters for every obs - Idk. how to broadcast this to work with the later distance function
centers_lst = []

for i in preds:
    centers_lst.append(np.array(centers))

In [11]:
c = np.array(centers_lst)

In [12]:
#Check obs
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [13]:
#check cluster
c[0]

array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])

In [14]:
#check point-wise Manhatten distance
abs(c[0] - X[0])

array([[0.8016129 , 0.7516129 , 2.99354839, 1.23387097],
       [0.094     , 0.072     , 0.062     , 0.046     ],
       [1.75      , 0.42631579, 4.34210526, 1.87105263]])

In [15]:
#check point-wise euclidean distance
(c[0] - X[0])**2

array([[6.42583247e-01, 5.64921956e-01, 8.96133195e+00, 1.52243757e+00],
       [8.83600000e-03, 5.18400000e-03, 3.84400000e-03, 2.11600000e-03],
       [3.06250000e+00, 1.81745152e-01, 1.88538781e+01, 3.50083795e+00]])

In [16]:
def calc_feature_wise_distance_matrix(distance_metric, points, cluster_centers):
    
    # calculate the distance of every feature value of ever obs to every feature value in every cluster.
    
    feature_wise_distance_matrix = []
    
    if distance_metric == "manhattan":
        for i,e in enumerate(X):
            feature_wise_distance_matrix.append(abs(c[i] - e))
        
    
    if distance_metric == "euclidean":
        for i,e in enumerate(X):
            feature_wise_distance_matrix.append((c[i] - e)**2)
    
    return np.array(feature_wise_distance_matrix)

In [17]:
A =calc_feature_wise_distance_matrix("manhattan", X, c)

In [18]:
A[0]

array([[0.8016129 , 0.7516129 , 2.99354839, 1.23387097],
       [0.094     , 0.072     , 0.062     , 0.046     ],
       [1.75      , 0.42631579, 4.34210526, 1.87105263]])

In [19]:
A[0][:,1]

array([0.7516129 , 0.072     , 0.42631579])

In [45]:
# Falsch!
def best_second_calc(feature_based_distanes, preds):
    
    num_features = feature_based_distanes.shape[1]
    
    assinged_cluster_list = []
    fb_distance_to_assinged_cluster_list = []
    
    best_alterantive_list = []
    fb_distance_to_best_alternative_list = []
    
    #for every obs:
    for idx, e in enumerate(feature_based_distanes):
        #index of assinged cluster
        assigned_cluster = preds[idx]
        #feature-wise distances of point to assigned cluster
        distances_to_assigned = e[:,assigned_cluster]
        
        assinged_cluster_list.append(assigned_cluster)
        fb_distance_to_assinged_cluster_list.append(distances_to_assigned)
        
        #find best alternative:
        
        temp_bad = []
        temp_idx = []
        
        #for every feature
        for i in range(num_features):
            
            
            # best alternative: 
            best_alternative_distance = min(e[:,i])
            x = e[:,i].tolist()
            idx_best_alternative = x.index(best_alternative_distance)
            
            
            #if the best alternative is the assigned cluster, we have to find the second best alternative
            if idx_best_alternative == assigned_cluster:
                
                del x[idx_best_alternative]
                best_alternative_distance = min(x)
                idx_best_alternative = x.index(best_alternative_distance)
                
            temp_bad.append(best_alternative_distance)
            temp_idx.append(idx_best_alternative)

        best_alterantive_list.append(temp_idx)
        fb_distance_to_best_alternative_list.append(temp_bad)     
            
    return np.array(assinged_cluster_list), np.array(fb_distance_to_assinged_cluster_list), np.array(best_alterantive_list), np.array(fb_distance_to_best_alternative_list)

In [46]:
ac ,fb_ac ,ba, fb_ba = best_second_calc(A, preds)

## Relevance of feature j for point i:

$$R_{ji} := R_j(x_{ij},c_{kj},c_{k'_{ij}}) := \frac{d_j(x_{ij},c_{k'_{ij}}) - d_j(x_{ij},c_{kj})}{d_j(x_{ij},c_{k'_{ij}}) + d_j(x_{ij},c_{kj})}$$

$d_j(x_{ij},c_{k'_{ij}}$ is the distance to the best alternative.

$d_j(x_{ij},c_{kj})$ is the distance to the associated cluster.

In [53]:
fb_ba[0]

array([0.094, 0.072, 0.062])

In [54]:
fb_ac[0]

array([0.8016129, 0.094    , 1.75     ])

In [55]:
ac[0]

0

In [56]:
ba[0]

array([1, 1, 1])

In [57]:
X[0]

array([5.1, 3.5, 1.4, 0.2])

In [60]:
centers

array([[5.9016129 , 2.7483871 , 4.39354839, 1.43387097],
       [5.006     , 3.428     , 1.462     , 0.246     ],
       [6.85      , 3.07368421, 5.74210526, 2.07105263]])

In [62]:
preds[0]

0

In [61]:
y[0]

0

In [51]:
# broadcasted: for every i, for every j
R = (fb_ba - fb_ac) / (fb_ba + fb_ac)

In [52]:
R

array([[-0.79008788, -0.13253012, -0.93156733],
       [-0.80859739, -0.17984769, -0.93836978],
       [-0.59406025, -0.41563185, -0.85986159],
       [-0.52448239, -0.87825664, -0.96678322],
       [-0.9867785 ,  0.93258427, -0.93514644],
       [-0.1201556 ,  0.09006928, -0.71800948],
       [-0.52448239, -0.87096774, -0.94636678],
       [-0.9867785 ,  0.64705882, -0.95974576],
       [-0.42494184, -0.5544755 , -0.95063694],
       [-0.80859739, -0.60222753, -0.96177062],
       [-0.1201556 , -0.18318318, -0.94892473],
       [-0.68492204, -0.76068376, -0.8738574 ],
       [-0.68492204, -0.47308995, -0.94128788],
       [-0.38811228, -0.8109896 , -0.75137363],
       [ 0.77308745, -0.1625183 , -0.60060976],
       [ 0.54977669,  0.16686675, -0.93602694],
       [-0.1201556 ,  0.09006928, -0.79900744],
       [-0.79008788, -0.13253012, -0.93156733],
       [ 0.54977669, -0.30206379, -0.65706052],
       [-0.79008788,  0.59656652, -0.95749441],
       [-0.1201556 , -0.86729858, -0.718

In [63]:
#df = pd.DataFrame(R)
#df.rename({0:"R1", 1: "R2", 2: "R3", 3: "R4"}, axis=1, inplace = True)

In [64]:
#df.head()

In [65]:
#df["associated clusters"] = preds

In [66]:
#df["y"] = y

In [67]:
#df_X = pd.DataFrame(X)
#df.rename({0:"X1", 1: "X2", 2: "X3", 3: "X4"}, axis=1, inplace = True)

In [68]:
#df = pd.concat([df, df_X], axis=1)

In [69]:
#df