# K-Means

In [3]:
import pandas as pd
import numpy as np
# from sklearn.cluster import KMeans
import os
os.chdir("..") if "notebook" in os.getcwd() else None
import config
from tqdm import tqdm
np.random.seed(42)

# Load data
X = pd.read_csv(os.path.join(config.DATA_FOLDER, 'iris.csv'))
X = X.sample(frac=1).reset_index(drop=True)
y = X.pop('target')
X = (X - X.mean()) / X.std()

In [17]:
import numpy as np
import numpy_indexed as npi
from scipy.spatial.distance import cdist

class KMeans:

    def __init__(self, k, distance='euclidean', max_iter=1000, tol=1e-4, random_state=None):
        self.k = k
        self.distance = distance # 'euclidean', 'jensenshannon', 'cosine', 'p'
        self.max_iter = max_iter
        self.tol = tol
        self.random_state = random_state
        self.centroids = None
        self.labels_ = None

    def initialize_centroids(self, X):
        idxs = np.random.choice(X.shape[0], self.k)
        return X[idxs, :]
    
    @staticmethod
    def update_centroids(X, labels):
        return npi.group_by(labels).mean(X)[1]

    def fit(self, X):
        if self.random_state is not None:
            np.random.seed(self.random_state)

        centroids = self.initialize_centroids(X)
        it = 0
        while True:
            d_matrix = cdist(X, centroids, metric=self.distance)
            labels = d_matrix.argmin(axis=1)
            centroids_old = centroids
            centroids = self.update_centroids(X, labels)
            it += 1

            if ((np.linalg.norm(centroids - centroids_old, 1) < self.tol) | (it > self.max_iter)):
                break
        
        self.centroids = centroids
        self.labels = labels


In [18]:
model = KMeans(4, random_state=10)
model.fit(X.values)
model.labels

array([0, 1, 2, 1, 2, 1, 1, 2, 0, 0, 2, 1, 1, 1, 1, 2, 2, 0, 3, 2, 1, 2,
       1, 2, 2, 2, 0, 2, 1, 1, 1, 2, 2, 1, 1, 3, 2, 1, 1, 1, 3, 2, 2, 1,
       1, 0, 0, 2, 2, 2, 1, 2, 1, 1, 2, 0, 1, 1, 1, 3, 3, 1, 1, 1, 3, 1,
       1, 2, 1, 0, 3, 1, 2, 3, 0, 1, 2, 3, 1, 3, 3, 1, 1, 3, 2, 1, 3, 1,
       1, 2, 3, 2, 3, 3, 2, 3, 1, 1, 3, 2, 1, 1, 1, 0, 2, 1, 2, 2, 1, 1,
       0, 3, 2, 2, 1, 2, 1, 2, 0, 3, 1, 0, 2, 3, 1, 1, 2, 2, 1, 3, 2, 2,
       1, 2, 1, 2, 2, 2, 3, 2, 0, 0, 3, 2, 1, 0, 3, 1, 0, 2], dtype=int64)

In [16]:
model.inertia

460.1359737991966

In [10]:
from sklearn.cluster import KMeans as sk_KMeans
from sklearn.metrics.cluster import rand_score
model_sk = sk_KMeans(4, random_state=10)
model_sk.fit(X)
rand_score(model_sk.labels_, model.labels)

  super()._check_params_vs_input(X, default_n_init=10)


0.7857718120805369

In [11]:
model.centroids

array([[ 0.1757777 , -1.0855001 ,  0.23123741,  0.03086579],
       [-0.85603136,  0.66278386, -1.02116825, -0.99682228],
       [ 1.07150731,  0.18610456,  0.92214282,  0.95107504],
       [-0.24072152, -1.12086341,  0.39521949,  0.42629726]])

In [12]:
model = KMeans(4)
cents = model.initialize_centroids(X.values)
cents

array([[ 6.72249049e-01,  3.36720285e-01,  8.73563532e-01,
         1.44312105e+00],
       [-4.14620671e-01, -1.04706171e+00,  3.63481020e-01,
         1.74711992e-03],
       [ 6.72249049e-01,  3.36720285e-01,  8.73563532e-01,
         1.44312105e+00],
       [ 4.30722444e-01, -1.96958304e+00,  4.20156854e-01,
         3.94849102e-01]])

In [13]:
import numpy_indexed as npi
arr = np.array([[1, 1, 1], [0, 1, 2], [1, 2, 3]])
npi.group_by([0, 1, 1]).sum(arr[:, 1:])

(array([0, 1]),
 array([[1, 1],
        [3, 5]]))

In [48]:
model.update_centroids(X, d_matrix.argmin(axis=1))

(array([0, 1, 2, 3], dtype=int64),
 array([[-1.08430807, -0.51591306, -0.85590815, -0.87181284],
        [-0.38644257,  0.50200536, -0.36952644, -0.29526327],
        [ 1.96039094,  0.83641934,  1.42142993,  1.39944306],
        [ 0.92561519, -0.35517071,  0.82133286,  0.74684238]]))

In [28]:
cdist([[1, 0]], [[0, 1]])

array([[1.41421356]])

In [34]:
from scipy.spatial.distance import cdist
arr = np.zeros(X.shape[0])
arr[0] = 1
d_matrix = cdist(X, cents)
d_matrix.argmin(axis=1)

array([3, 1, 3, 1, 3, 1, 1, 3, 3, 1, 3, 0, 1, 0, 1, 3, 3, 0, 1, 3, 0, 1,
       1, 3, 2, 3, 3, 3, 0, 0, 1, 1, 3, 0, 0, 3, 3, 1, 1, 1, 1, 1, 3, 1,
       1, 0, 3, 3, 3, 2, 1, 2, 1, 1, 3, 1, 1, 1, 1, 0, 3, 1, 0, 1, 1, 1,
       1, 2, 0, 3, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 3, 0, 0, 0, 3, 0, 1, 1,
       1, 1, 0, 3, 1, 3, 2, 0, 0, 0, 3, 3, 0, 1, 0, 3, 1, 0, 3, 1, 0, 3,
       3, 3, 3, 3, 1, 3, 1, 3, 0, 1, 1, 0, 3, 1, 1, 1, 3, 3, 1, 0, 2, 3,
       0, 3, 0, 3, 3, 3, 1, 3, 3, 3, 1, 1, 1, 3, 0, 1, 1, 3], dtype=int64)