# Code Written by:
**Shweta Tiwari**
*20 Oct 2023*

## Algorithm:  K-Means

In [1]:
import time

In [2]:
import numpy as np
from bokeh.plotting import figure, gridplot, show, output_notebook

In [3]:
!pip install --upgrade bokeh==2.4.3



# Algorithm

In [4]:
%%time
def kmeans(points, n_clusters, max_iter=300):
    """Cluster ``points`` into ``n_clusters`` groups with Lloyd's k-means.

    Initial centroids are ``n_clusters`` distinct rows of ``points`` drawn with
    ``np.random.choice`` (so results depend on NumPy's global random state).
    The assignment/update steps are repeated until the assignments stop
    changing or ``max_iter`` iterations have been performed.

    Parameters
    ----------
    points : array-like, shape (n_points, n_features)
        Observations to cluster. Converted to a float array internally, so
        integer input does not truncate the centroid means.
    n_clusters : int
        Number of clusters; must satisfy ``1 <= n_clusters <= n_points``.
    max_iter : int, optional
        Upper bound on the number of assignment steps (default 300). Guarantees
        termination even if the assignments keep oscillating.

    Returns
    -------
    numpy.ndarray, shape (n_points,)
        Index (``0 .. n_clusters - 1``) of the nearest centroid for every point.

    Raises
    ------
    ValueError
        If ``points`` is not 2-D, ``n_clusters`` is out of range, or
        ``max_iter`` is smaller than 1.
    """
    points = np.asarray(points, dtype=float)
    if points.ndim != 2:
        raise ValueError("points must be a 2-D array of shape (n_points, n_features)")
    if not 1 <= n_clusters <= len(points):
        raise ValueError("n_clusters must be between 1 and the number of points")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # sample initial centroids (fancy indexing returns a copy, so updating
    # `centroid` below never writes into `points`)
    sample = np.random.choice(len(points), n_clusters, replace=False)
    centroid = points[sample]

    cluster = None
    for _ in range(max_iter):
        # distance[k, p] = Euclidean distance from centroid k to point p
        distance = np.linalg.norm(
            points[np.newaxis, :, :] - centroid[:, np.newaxis, :], axis=2
        )
        # assign every point to its nearest centroid
        new_cluster = np.argmin(distance, axis=0)
        # converged: another update would reproduce the same centroids
        if cluster is not None and np.array_equal(new_cluster, cluster):
            break
        cluster = new_cluster

        # update centroids by new cluster means
        for i in range(n_clusters):
            members = points[cluster == i]
            # An empty cluster has no mean (it would be NaN and poison every
            # later distance), so its centroid is left where it was.
            if len(members):
                centroid[i] = members.mean(axis=0)

    return cluster

CPU times: user 7 µs, sys: 0 ns, total: 7 µs
Wall time: 22.6 µs


# Run

*generate clusters*

In [5]:
%%time
# Number of points drawn per synthetic cluster.
n = 100

# Fix NumPy's global random state so the sampled data (and anything else that
# draws from np.random afterwards) is reproducible on Restart & Run All.
np.random.seed(42)

# A covariance matrix must be symmetric positive-semidefinite; the previous
# matrices (e.g. [[1, .1], [-4, 1]]) were not symmetric, which makes
# multivariate_normal warn and leaves the sampling behaviour undefined.
# NOTE(review): +/-0.4 correlation is a guess at the original intent - adjust as needed.
cov_pos = [[1, .4], [.4, 1]]
cov_neg = [[1, -.4], [-.4, 1]]

A = np.random.multivariate_normal([2, 0], cov_pos, n)
B = np.random.multivariate_normal([-2, 0], cov_neg, n)
C = np.random.multivariate_normal([2, -2], cov_pos, n)
# One color per cluster; a cluster index is used to look up its color.
D = ['red', 'green', 'blue']

# Stack the three clusters row-wise: rows 0..n-1 come from A, then B, then C.
points = np.r_[A, B, C]
# Ground-truth color of every row of `points`, in the same order.
original_color = np.repeat(D[:3], n)

CPU times: user 1.86 ms, sys: 988 µs, total: 2.85 ms
Wall time: 2.86 ms




*detect k-means clusters*

In [6]:
%%time
# Label every point with one of three clusters, then translate each label
# into the color stored at that index of D.
cluster = kmeans(points, n_clusters=3)
new_color = list(map(D.__getitem__, cluster))

CPU times: user 5.22 ms, sys: 0 ns, total: 5.22 ms
Wall time: 5.24 ms


*plot original and new clusters*

In [7]:
%%time
# Render Bokeh output inline in the notebook.
output_notebook()

# Both panels show the same coordinates; only the coloring differs.
x_coords = points[:, 0]
y_coords = points[:, 1]

# Top panel: points colored by the distribution they were sampled from.
plot1 = figure(title='original clusters', plot_height=300)
plot1.scatter(x=x_coords, y=y_coords, color=original_color)

# Bottom panel: points colored by the label k-means assigned to them.
plot2 = figure(title='k-means clusters', plot_height=300)
plot2.scatter(x=x_coords, y=y_coords, color=new_color)

# Stack the two figures vertically (one figure per grid row).
stacked = gridplot([[plot1], [plot2]])
show(stacked)

CPU times: user 116 ms, sys: 2.96 ms, total: 119 ms
Wall time: 121 ms


# The End