# K-Means

Algorithm

1. Choose k random starting points; these are the initial centroids
2. Assign each point in the dataset to the closest centroid
3. Re-position the centroids such that they are in the middle (average) of their assigned data points
4. Repeat from step 2 until the centroids stop moving (i.e. the cluster assignments no longer change)

## Setup

In [None]:
import math
from functools import partial

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs

In [None]:
# Notebook-wide matplotlib defaults: figure size and base font size.
plt.rcParams['figure.figsize'] = (16, 9)
plt.rcParams['font.size'] = 13

In [None]:
def distance(p, q):
    '''
    Return the Euclidean distance between the 2d points p and q.

    Assumes p and q both have numeric properties x1 and x2 (for example
    a pandas Series with those labels, or any object exposing them as
    attributes).

    math.hypot is used instead of squaring and summing by hand so that
    very large coordinate differences do not overflow in the
    intermediate squares.
    '''
    return math.hypot(p.x1 - q.x1, p.x2 - q.x2)


def find_cluster(centroids: pd.DataFrame, row: pd.Series):
    '''
    Return the index label of the centroid closest to row.

    Each row of centroids is compared against row with distance(); the
    label of the row with the smallest distance is returned.
    '''
    def distance_to_row(center):
        return distance(center, row)

    per_centroid = centroids.apply(distance_to_row, axis=1)
    return per_centroid.idxmin()

## Demo

In [None]:
# Build a random 2d dataset to cluster.
# Seeding NumPy's global RNG first keeps the random draws in this notebook
# repeatable across runs.
np.random.seed(73)
X, _ = make_blobs(cluster_std=1.5)  # second return value is not needed here

df = pd.DataFrame(data=X, columns=['x1', 'x2'])

# Scatter the unlabelled points.
sns.relplot(x='x1', y='x2', data=df, aspect=1.5)
plt.title(label='Our initial data')

In [None]:
# Pick 3 random starting centroids: a 3x2 table of uniform draws from
# [0, 1), one row per centroid, with the same column names as the data.
centroids = pd.DataFrame(data=np.random.random_sample((3, 2)), columns=['x1', 'x2'])

In [None]:
# Plot the raw data and overlay the starting centroids as large red crosses.
# The data is passed as data=df with column names, matching every other
# relplot call in this notebook.
sns.relplot(data=df, x='x1', y='x2', aspect=1.5)
plt.title('3 Random Points are chosen for centroids')
plt.scatter(centroids.x1, centroids.x2, marker='x', s=600, c='red')

In [None]:
# Label every data point with the index of its nearest centroid; that label
# is the point's cluster. Stored as a categorical column.
df['cluster'] = df.apply(lambda row: find_cluster(centroids, row), axis=1).astype('category')

In [None]:
# Colour each point by its cluster label, then overlay the centroids as
# large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Each point has been assigned to the closest centroid')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

In [None]:
# Update centroids: each centroid becomes the mean of the points currently
# assigned to it.
# 'cluster' is categorical, so observed=True is passed explicitly rather than
# relying on the implicit default, which newer pandas versions warn about.
centroids = df.groupby('cluster', observed=True).mean()

In [None]:
# Same coloured scatter as before, now with the repositioned centroids
# overlaid as large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Centroids have been updated such that\nthey are in the middle of their clusters')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

Repeat...

In [None]:
# Re-assign every point to its nearest centroid, using the updated centroids.
df['cluster'] = df.apply(lambda row: find_cluster(centroids, row), axis=1).astype('category')

In [None]:
# Colour each point by its new cluster label and overlay the centroids as
# large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Each point assigned to the closest centroid')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

In [None]:
# Update centroids: each centroid becomes the mean of the points currently
# assigned to it.
# 'cluster' is categorical, so observed=True is passed explicitly rather than
# relying on the implicit default, which newer pandas versions warn about.
centroids = df.groupby('cluster', observed=True).mean()

In [None]:
# Coloured scatter of the current assignment with the repositioned centroids
# overlaid as large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Centroids repositioned at the middle of their clusters')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

Repeat...

In [None]:
# Re-assign every point to its nearest centroid, using the updated centroids.
df['cluster'] = df.apply(lambda row: find_cluster(centroids, row), axis=1).astype('category')

In [None]:
# Colour each point by its new cluster label and overlay the centroids as
# large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Each point assigned to the closest centroid')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

In [None]:
# Update centroids: each centroid becomes the mean of the points currently
# assigned to it.
# 'cluster' is categorical, so observed=True is passed explicitly rather than
# relying on the implicit default, which newer pandas versions warn about.
centroids = df.groupby('cluster', observed=True).mean()

In [None]:
# Coloured scatter of the current assignment with the repositioned centroids
# overlaid as large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Centroids repositioned at the middle of their clusters')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

Repeat...

In [None]:
# Re-assign every point to its nearest centroid, using the updated centroids.
df['cluster'] = df.apply(lambda row: find_cluster(centroids, row), axis=1).astype('category')

In [None]:
# Colour each point by its new cluster label and overlay the centroids as
# large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Each point assigned to the closest centroid')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')

In [None]:
# Update centroids: each centroid becomes the mean of the points currently
# assigned to it.
# 'cluster' is categorical, so observed=True is passed explicitly rather than
# relying on the implicit default, which newer pandas versions warn about.
centroids = df.groupby('cluster', observed=True).mean()

In [None]:
# Coloured scatter of the current assignment with the repositioned centroids
# overlaid as large red crosses.
sns.relplot(x='x1', y='x2', hue='cluster', data=df, aspect=1.5)
plt.title(label='Centroids repositioned at the middle of their clusters')
plt.scatter(x=centroids['x1'], y=centroids['x2'], s=600, c='red', marker='x')