In [1]:
import pandas as pd
import numpy as np

from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
df = pd.read_csv('Datasets/mushrooms.csv')

In [5]:
print(df.shape)
df.isna().sum()

(8124, 23)


class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-type                   0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [15]:
cols = ['cap-shape', 'cap-surface', 'cap-color', 'bruises', 'odor',
       'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
       'stalk-shape', 'stalk-root', 'stalk-surface-above-ring',
       'stalk-surface-below-ring', 'stalk-color-above-ring',
       'stalk-color-below-ring', 'veil-type', 'veil-color', 'ring-number',
       'ring-type', 'spore-print-color', 'population', 'habitat']
X = df[cols].values

In [20]:
# dissimilarity matrix is where the value in each coulmn of X1 is not equal to value in X2
def dissimilarity(point1, point2):
    return (point1!=point2).sum(axis = 1)

In [23]:
num_of_centers = 4
centers = X[np.random.choice(X.shape[0], num_of_centers)]
centers

array([['f', 'y', 'n', 'f', 'y', 'f', 'c', 'n', 'b', 't', '?', 'k', 'k',
        'p', 'w', 'p', 'w', 'o', 'e', 'w', 'v', 'l'],
       ['x', 'f', 'y', 'f', 'f', 'f', 'c', 'b', 'g', 'e', 'b', 'k', 'k',
        'p', 'n', 'p', 'w', 'o', 'l', 'h', 'y', 'g'],
       ['x', 'y', 'n', 'f', 'n', 'f', 'w', 'n', 'w', 'e', 'b', 's', 'f',
        'w', 'n', 'p', 'w', 'o', 'e', 'w', 'v', 'l'],
       ['f', 's', 'n', 'f', 's', 'f', 'c', 'n', 'b', 't', '?', 's', 's',
        'w', 'w', 'p', 'w', 'o', 'e', 'w', 'v', 'l']], dtype=object)

In [40]:
distance_matrix = np.zeros((X.shape[0],num_of_centers))

In [55]:
for i,center in enumerate(centers):
    center_temp = np.repeat([center],X.shape[0], axis =0)
    distance_matrix[:,i] = dissimilarity(X,center_temp)

In [56]:
distance_matrix

array([[14., 15., 12., 10.],
       [16., 12., 14., 12.],
       [16., 15., 15., 12.],
       ...,
       [14., 16., 14., 11.],
       [ 3., 15.,  8.,  4.],
       [15., 15., 13., 12.]])

In [57]:
clusters = np.argmin(distance_matrix, axis=1)

In [58]:
clusters

array([3, 1, 3, ..., 3, 0, 3])

In [59]:
old_clusters = clusters

In [79]:
np.where(clusters == 0)

(array([1803, 1805, 1838, ..., 8117, 8118, 8122]),)

##### For K-Modes Clustering we need to use the mode of the values

In [124]:
# checking the mode using scipy
from scipy import stats

In [86]:
stats.mode(X[np.where(clusters == 0)])[0]

array([['f', 'y', 'n', 'f', 'f', 'f', 'c', 'n', 'b', 't', '?', 'k', 'k',
        'p', 'w', 'p', 'w', 'o', 'e', 'w', 'v', 'd']], dtype=object)

In [125]:
# Checking how numpy.unique works
np.unique([1,2,3,4,5,6,1,2,3,9,8,7,6,5], return_index = True, return_counts=True)

(array([1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([ 0,  1,  2,  3,  4,  5, 11, 10,  9]),
 array([2, 2, 2, 1, 2, 2, 1, 1, 1]))

In [127]:
# it returns the unique values, the index of the first occurance of the value and the number of instances of each value

# Code from https://stackoverflow.com/questions/16330831/most-efficient-way-to-find-mode-in-numpy-array
(_, idx, counts) = np.unique(X[np.where(clusters == 0)], return_index=True, return_counts=True)
index = idx[np.argmax(counts)]
mode = X[np.where(clusters == 0)][index]
mode

array(['f', 'f', 'w', 'f', 'n', 'f', 'w', 'b', 'k', 't', 'e', 's', 's',
       'w', 'w', 'p', 'w', 'o', 'e', 'n', 'a', 'g'], dtype=object)

In [128]:
def np_mode(A):
    (_, idx, counts) = np.unique(A, return_index=True, return_counts=True)
    index = idx[np.argmax(counts)]
    mode = A[index]
    return mode

In [129]:
# checking if the function returns the same value as expected
np_mode(X[np.where(clusters == 0)])

array(['f', 'f', 'w', 'f', 'n', 'f', 'w', 'b', 'k', 't', 'e', 's', 's',
       'w', 'w', 'p', 'w', 'o', 'e', 'n', 'a', 'g'], dtype=object)

In [130]:
def create_distance_matrix(X, centers):
    distance_matrix = np.zeros((X.shape[0],centers.shape[0]))
    
    for i,center in enumerate(centers):
        center_temp = np.repeat([center],X.shape[0], axis =0)
        distance_matrix[:,i] = dissimilarity(X,center_temp)
    
    return distance_matrix

In [131]:
def Kmodes(X, num_of_centers, epochs = 1500):
    
    epoch = 1
    old_clusters = None
    clusters = np.zeros(X.shape[0])
    
    # Selecting centers randomly
    centers = X[np.random.choice(X.shape[0], num_of_centers)]
    
    while((epoch<=epochs)):
        # Creating a distance matrix wrt all centers and data points
        distance_matrix = create_distance_matrix(X, centers)
        
        # Assigning cluster to the center where the distance is minimum
        clusters = np.argmin(distance_matrix, axis=1)
        
        # Updating old cluster
        for i in range(num_of_centers):
            centers[i] = np_mode(X[np.where(clusters == i)])
            
        # if the old clusters and new clusters are same then stop
        if np.array_equal(old_clusters,clusters):
            break
        old_clusters = clusters
        
#         print(epoch)
        epoch += 1
            
    return centers, clusters


In [132]:
centers, clusters = Kmodes(X,2)

In [133]:
clusters

array([1, 1, 1, ..., 1, 0, 1])