In [None]:
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from numpy.linalg import norm
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans

![](../img/k_means_clustering.png)

K-means optimization objective function

$x^{(1)}, x^{(2)},\dotsc,x^{(m)}$ are data points, $\mu_1,\dotsc,\mu_k$ are cluster centers. 

Let 
* $c^{(i)}$ denote the index of the centroid closest to $x^{(i)}$.
* $\mu_{c^{(i)}}$ denote the centroid assigned to the $i^{\text{th}}$ example.

$$
J(c^{(1)},\dotsc,c^{(m)},\mu_1,\dotsc,\mu_k) = \frac1m\sum_{i=1}^m \left\vert\left\vert x^{(i)} - \mu_{c^{(i)}}\right\vert\right\vert ^2 \rightarrow \text{min}!
$$
is the distortion cost function, which k-means minimizes over both the assignments $c^{(i)}$ and the centroids $\mu_j$.

In [None]:
image_path = 'images/sloth.jpeg'

# Decode inside a context manager so the file handle has an explicit lifetime,
# and force RGB so the array always has exactly three colour channels (the
# clustering cells below reshape the pixels to (n_pixels, 3)).
with Image.open(image_path) as image_file:
    image = np.array(image_file.convert('RGB'))
# Rescale the 8-bit intensities to floats in [0, 1].
image = image / 255

In [None]:
# Draw the loaded image on a large canvas.
fig = plt.figure(figsize=(15, 10))
fig.gca().imshow(image)
plt.show()

In [None]:
def get_initial_centroids(data_points, number_of_clusters):
    """Pick the first ``number_of_clusters`` distinct rows of ``data_points`` as seeds.

    Taking the leading rows verbatim seeds identical centroids whenever those
    rows repeat (e.g. neighbouring pixels of the same colour); the duplicates
    then own no points and their clusters end up empty. Skipping repeated rows
    avoids that. When the leading rows are already distinct, the result equals
    ``data_points[:number_of_clusters, :]``.

    Parameters
    ----------
    data_points : array-like, indexed as a 2-D array (one row per point)
    number_of_clusters : int

    Returns
    -------
    numpy.ndarray
        A new array (not a view) holding the selected rows in their original order.

    Raises
    ------
    ValueError
        If ``data_points`` contains fewer than ``number_of_clusters`` distinct rows.
    """
    data_points = np.asarray(data_points)
    # For every distinct row np.unique reports the index of its first
    # occurrence; sorting those indices restores the original row order.
    _, first_occurrences = np.unique(data_points, axis=0, return_index=True)
    seed_rows = np.sort(first_occurrences)[:number_of_clusters]
    if len(seed_rows) < number_of_clusters:
        raise ValueError(
            "need at least {} distinct data points to seed the clusters, found {}".format(
                number_of_clusters, len(seed_rows)
            )
        )
    return data_points[seed_rows, :]


def assign_data_points_to_centroids(data_points, centroids):
    """Return, for every data point, the row index of its nearest centroid.

    Distances are the pairwise distances computed by ``cdist`` with its
    default metric; the result has one entry per row of ``data_points``.
    """
    pairwise_distances = cdist(data_points, centroids)
    nearest_centroid = pairwise_distances.argmin(axis=1)
    return nearest_centroid


def calculate_centroids(data_points, cluster_assignment, number_of_clusters, dimension):
    """Compute each cluster's centroid as the mean of the points assigned to it.

    Parameters
    ----------
    data_points : numpy.ndarray
        2-D array with one row per point.
    cluster_assignment : numpy.ndarray
        Cluster index of every row of ``data_points``.
    number_of_clusters : int
        Number of centroid rows to produce.
    dimension : int
        Number of columns of the returned array; should match the columns of
        ``data_points``.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number_of_clusters, dimension)``. The row of a
        cluster that received no points is filled with NaN.
    """
    # The `np.float` alias was removed from NumPy; the builtin `float` selects
    # the same float64 dtype. Rows start as NaN so an empty cluster is marked
    # explicitly instead of via the warning-emitting mean of an empty slice.
    centroids = np.full((number_of_clusters, dimension), np.nan, dtype=float)
    for k in range(number_of_clusters):
        points_in_cluster_k = data_points[cluster_assignment == k, :]
        if len(points_in_cluster_k) > 0:
            centroids[k, :] = np.mean(points_in_cluster_k, axis=0)
    return centroids

In [None]:
def centroids_remained_the_same(new_centroids, centroids):
    """Tell whether no centroid moved by more than 1e-12 between two iterations.

    Every row of ``centroids`` is compared with the row of ``new_centroids`` at
    the same index; the check stops at the first centroid whose displacement
    norm exceeds the tolerance.
    """
    return all(
        not (norm(previous - new_centroids[position, :]) > 1e-12)
        for position, previous in enumerate(centroids)
    )


def lloyd_algoritm_for_k_means_clustering(data_points, number_of_clusters, dimension, max_iterations=None):
    """Run Lloyd's algorithm on ``data_points`` and return the final centroids.

    Alternates between assigning every point to its nearest centroid and
    recomputing each centroid as the mean of its points, until no centroid
    moves any more (as judged by ``centroids_remained_the_same``).

    Parameters
    ----------
    data_points : numpy.ndarray
        2-D array with one row per point.
    number_of_clusters : int
        Number of centroids to fit.
    dimension : int
        Number of columns of ``data_points``.
    max_iterations : int or None, optional
        Upper bound on the number of centroid updates. ``None`` (the default)
        iterates until convergence, however long that takes.

    Returns
    -------
    numpy.ndarray
        The centroids, one per row.
    """
    centroids = get_initial_centroids(data_points, number_of_clusters)
    completed_iterations = 0
    while max_iterations is None or completed_iterations < max_iterations:
        cluster_assignment = assign_data_points_to_centroids(data_points, centroids)
        new_centroids = calculate_centroids(data_points, cluster_assignment, number_of_clusters, dimension)
        # A cluster without points has no mean: its recomputed centroid is NaN,
        # every distance to it is NaN as well, and the next assignment step is
        # corrupted. Keep the previous centroid for such clusters instead.
        for k in range(min(number_of_clusters, len(centroids))):
            if not np.any(cluster_assignment == k):
                new_centroids[k, :] = centroids[k, :]
        if centroids_remained_the_same(new_centroids, centroids):
            break
        centroids = new_centroids
        completed_iterations += 1
    return centroids

In [None]:
# Flatten the image into one row per pixel. The channel count is read from the
# image itself rather than being hard-coded in two places.
nr_rows, nr_cols, nr_channels = image.shape
X = image.reshape(nr_rows * nr_cols, nr_channels)

nr_clusters = 10

# Cluster the pixel colours; each centroid is one colour of the reduced palette.
centroids = lloyd_algoritm_for_k_means_clustering(X, nr_clusters, nr_channels)

In [None]:
# Label every pixel with the index of its nearest final centroid.
cluster_assignments = assign_data_points_to_centroids(data_points=X, centroids=centroids)

In [None]:
# Swap every pixel for its centroid colour, then fold the rows back into image shape.
X_compressed = centroids[cluster_assignments].reshape((nr_rows, nr_cols, 3))

In [None]:
# Draw the colour-quantised image on a large canvas.
fig = plt.figure(figsize=(15, 10))
fig.gca().imshow(X_compressed)
plt.show()

### Remarks:


farthest-first traversal:
1. initial_centroid = a random data point, centroids = [initial_centroid]
2. while len(centroids) < nr_clusters:
       data_point = the data point maximizing d(data_point, centroids), i.e. its distance to the nearest centroid chosen so far
       centroids.append(data_point)

k-means++
1. Choose one center at random among the data points.
2. For each data point x, compute D(x), the distance between x and the nearest center that has already been chosen.
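3. Choose one new data point at random as a new center, using a weighted probability distribution where a point x is chosen with probability proportional to $D(x)^2$.
4. Repeat steps 2 and 3 until $k$ centers have been chosen, then run standard k-means starting from these centers.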

In [None]:
# Decode inside a context manager so the file handle has an explicit lifetime,
# and force RGB because the later cells reshape the pixels to three channels.
with Image.open("images/Castle_hill.jpg") as castle_file:
    image = np.array(castle_file.convert("RGB"))

In [None]:
# Scale to [0, 1] only while the pixels are still integers, so that re-running
# this cell cannot divide an already-normalised image a second time.
if np.issubdtype(image.dtype, np.integer):
    image = image / 255
row, col, _ = image.shape
print("pixels in one channel: {} * {}".format(row, col))

In [None]:
# Draw the original photo on a large, titled canvas.
fig = plt.figure(figsize=(15, 10))
fig.gca().imshow(image)
fig.gca().set_title("Castle Hill, Budapest")
plt.show()

In [None]:
# One row per pixel, one column per colour channel.
X = image.reshape((row * col, 3))

In [None]:
nr_clusters = 4

# KMeans draws its initial centres at random; pinning the seed makes the
# resulting palette reproducible across kernel restarts.
kmeans = KMeans(n_clusters=nr_clusters, n_init=10, max_iter=30, random_state=0)
cluster_assignments = kmeans.fit_predict(X)

In [None]:
# Swap every pixel for its cluster centre, then fold the rows back into image shape.
X_compressed = kmeans.cluster_centers_[cluster_assignments].reshape((row, col, 3))

In [None]:
# Draw the colour-quantised photo on a large, titled canvas.
fig = plt.figure(figsize=(15, 10))
fig.gca().imshow(X_compressed)
fig.gca().set_title("Castle Hill, Budapest")
plt.show()