# K-means clustering.

The aim of this interactive demo is to show how sensitive the K-means clustering algorithm is to the initial centroid locations. We start with unlabelled data which, by visual inspection, can be grouped into three clusters. Setting K=3, we then compute the clusters. Adjust the sliders below to choose the initial centroids and see how the clustering evolves. You can run one epoch at a time to follow the algorithm step by step, or run 10 epochs at once.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import Markdown
import functools

In [2]:
from sklearn import datasets
# Synthetic, unlabelled 2-D data for the demo. The fixed random_state makes
# the same points appear on every run.
blobs = datasets.make_blobs(
    n_samples=300,
    cluster_std=1.5,
    random_state=12345,
)
# make_blobs returns the samples first; the generated labels are deliberately
# ignored because the demo treats the data as unlabelled.
X = blobs[0]

In [10]:
def assign(mu1, mu2, mu3, data=None):
    """Label every sample with the number of its nearest centroid.

    Parameters
    ----------
    mu1, mu2, mu3 : array-like
        Coordinates of the three centroids. Each must have as many entries
        as the samples have features.
    data : array-like of shape (n_samples, n_features), optional
        Samples to label. When omitted, the module-level ``X`` is used,
        which is what existing callers rely on.

    Returns
    -------
    numpy.ndarray
        Float array of length ``n_samples`` whose entries are 1, 2 or 3.
        When a sample is equally close to several centroids, the
        lowest-numbered one wins.
    """
    points = np.asarray(X if data is None else data, dtype=float)
    if points.size == 0:
        # Nothing to label; mirror the empty float array a zero-length
        # dataset has always produced.
        return np.zeros(0)

    centres = np.asarray([mu1, mu2, mu3], dtype=float)

    # Squared Euclidean distances are enough for a nearest-centroid decision,
    # so the square root is skipped. Result shape: (3, n_samples).
    offsets = points[np.newaxis, :, :] - centres[:, np.newaxis, :]
    squared_distances = (offsets ** 2).sum(axis=2)

    # argmin returns the first minimum, which gives ties to the
    # lowest-numbered centroid. Labels are 1-based and kept as floats so
    # that callers doing ``Y.astype(int)`` keep working unchanged.
    return (np.argmin(squared_distances, axis=0) + 1).astype(float)

In [15]:
def centroids(Y, data=None, previous=None):
    """Recompute the three centroids as the mean of their assigned samples.

    Parameters
    ----------
    Y : array-like of shape (n_samples,)
        Cluster label of each sample. Label 1 selects the first cluster,
        label 2 the second, and every other value the third.
    data : array-like of shape (n_samples, n_features), optional
        The samples. When omitted, the module-level ``X`` is used.
    previous : sequence of three array-likes, optional
        Centroids to fall back on for a cluster that received no samples.
        When omitted, the module-level ``mu1``, ``mu2`` and ``mu3`` are used.

    Returns
    -------
    tuple
        ``(mu1, mu2, mu3)``: the updated centroid of each cluster, or the
        previous centroid for a cluster that is empty.
    """
    labels = np.asarray(Y)
    points = np.asarray(X if data is None else data, dtype=float)
    # Only the samples that actually have a label take part in the update.
    points = points[:len(labels)]

    in_first = labels == 1
    in_second = labels == 2
    # Any label other than 1 or 2 counts as the third cluster.
    in_third = ~(in_first | in_second)

    updated = []
    for position, members in enumerate((in_first, in_second, in_third)):
        if members.any():
            # Exact arithmetic mean: no epsilon in the denominator, because
            # the cluster is known to be non-empty here.
            updated.append(points[members].mean(axis=0))
        elif previous is not None:
            updated.append(previous[position])
        else:
            # The globals are looked up only when a cluster is empty, so the
            # function does not require them otherwise.
            updated.append((mu1, mu2, mu3)[position])

    return tuple(updated)

In [12]:
# Lookup table from an integer label to a single-letter matplotlib colour.
# The base palette is repeated so that larger label values still map to a
# colour.
colors = np.array(list('bgrcmykbgrcmykbgrcmykbgrcmyk'))
colors = np.tile(colors, 20)

In [16]:
# Use the seaborn look when the installed Matplotlib provides it. The style
# was renamed from 'seaborn' to 'seaborn-v0_8' in Matplotlib 3.6, so the new
# name is tried first and the old one is kept for older installations. If
# neither exists, the default style is left in place.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break


def _initial_slider(value, description, **options):
    """Build one slider on [-10, 10] that only reports a value on release."""
    return widgets.FloatSlider(
        value,
        min=-10,
        max=10.0,
        description=description,
        continuous_update=False,
        **options,
    )


# Horizontal sliders hold the x coordinate of each initial centroid.
initial_mu1_x = _initial_slider(2, r'Initial $\mu_1 x$:')
initial_mu2_x = _initial_slider(3, r'Initial $\mu_2 x$:')
initial_mu3_x = _initial_slider(-3, r'Initial $\mu_3 x$:')

# Vertical sliders hold the y coordinate of each initial centroid.
initial_mu1_y = _initial_slider(2, r'Initial $\mu_1 y$:', orientation='vertical')
initial_mu2_y = _initial_slider(3, r'Initial $\mu_2 y$:', orientation='vertical')
initial_mu3_y = _initial_slider(-3, r'Initial $\mu_3 y$:', orientation='vertical')

# Layout: x sliders stacked on the left, y sliders side by side on the right.
left_box = widgets.VBox([initial_mu1_x, initial_mu2_x, initial_mu3_x])
right_box = widgets.HBox([initial_mu1_y, initial_mu2_y, initial_mu3_y])
sliders = widgets.HBox([left_box, right_box])

# Buttons that advance the algorithm, and the area the plots are drawn into.
sub1 = widgets.Button(description='1 epoch')
sub10 = widgets.Button(description='10 epochs')
out = widgets.Output()

display(sliders)
display(sub1, sub10, out)

# Label 0 means "not assigned to any cluster yet".
Y = np.zeros(len(X), dtype=int)

# Current centroids, seeded from the slider positions.
mu1 = np.array([initial_mu1_x.value, initial_mu1_y.value])
mu2 = np.array([initial_mu2_x.value, initial_mu2_y.value])
mu3 = np.array([initial_mu3_x.value, initial_mu3_y.value])

def train(epochs):
    """Run K-means iterations from the current centroids and redraw the plot.

    Each epoch assigns every sample to its nearest centroid and then moves
    the centroids to the mean of their samples. The global ``mu1``, ``mu2``
    and ``mu3`` are updated in place, so successive calls continue from where
    the previous one stopped.

    Parameters
    ----------
    epochs : int
        Number of iterations to run. Zero or a negative number runs none and
        simply redraws the samples, unlabelled, with the current centroids.
    """
    global mu1, mu2, mu3

    out.clear_output(wait=True)
    with out:
        # Start from "unlabelled" so that running zero epochs still has
        # labels to draw instead of failing on an unbound variable.
        Y = np.zeros(len(X), dtype=int)
        for _ in range(int(epochs)):
            Y = assign(mu1, mu2, mu3)
            mu1, mu2, mu3 = centroids(Y)

        # Samples, coloured by the label from the last assignment step.
        plt.scatter(X[:, 0], X[:, 1], color=colors[Y.astype(int)])

        # Centroids, each drawn in the colour of its own cluster label.
        for label, centre in enumerate((mu1, mu2, mu3), start=1):
            plt.scatter(centre[0], centre[1], edgecolor='k',
                        color=colors[label], s=200, marker='X')

        plt.xlabel(r'$x_1$')
        plt.ylabel(r'$x_2$')

        plt.show()



def reset_Y(b=None):
    """Restart the demo from the slider positions and redraw the plot.

    The global centroids are overwritten with the current slider values and
    all samples are drawn with label 0, i.e. not yet assigned to a cluster.

    Parameters
    ----------
    b : object, optional
        Argument supplied by the widget callback machinery; it is not used.
    """
    global mu1, mu2, mu3

    out.clear_output(wait=True)
    with out:
        mu1 = np.array([initial_mu1_x.value, initial_mu1_y.value])
        mu2 = np.array([initial_mu2_x.value, initial_mu2_y.value])
        mu3 = np.array([initial_mu3_x.value, initial_mu3_y.value])

        # Every sample gets label 0, so all points share one colour.
        Y = np.zeros(len(X), dtype=int)
        plt.scatter(X[:, 0], X[:, 1], color=colors[Y.astype(int)])

        # Centroids, each in the colour its cluster will get once trained.
        for label, centre in enumerate((mu1, mu2, mu3), start=1):
            plt.scatter(centre[0], centre[1], edgecolor='k',
                        color=colors[label], s=200, marker='X')

        plt.xlabel(r'$x_1$')
        plt.ylabel(r'$x_2$')

        plt.show()

def train1(b):
    """Button callback: advance the clustering by a single epoch.

    ``b`` is the argument passed by the click handler and is not used.
    """
    train(epochs=1)

def train10(b):
    """Button callback: advance the clustering by ten epochs.

    ``b`` is the argument passed by the click handler and is not used.
    """
    train(epochs=10)

# Draw the initial view before any interaction happens.
reset_Y()

# A change on any slider restarts the demo from the new initial centroids.
for _slider in (initial_mu1_x, initial_mu2_x, initial_mu3_x,
                initial_mu1_y, initial_mu2_y, initial_mu3_y):
    _slider.observe(reset_Y)

# The buttons advance the algorithm by one or ten epochs.
sub1.on_click(train1)
sub10.on_click(train10)

HBox(children=(VBox(children=(FloatSlider(value=2.0, continuous_update=False, description='Initial $\\mu_1 x$:…

Button(description='1 epoch', style=ButtonStyle())

Button(description='10 epochs', style=ButtonStyle())

Output()