In [None]:
import numpy as np
import matplotlib.pyplot as plt
import PIL 
import time

def kmeans(img, k_clusters, max_iter, init_centroids='random'):
    """Cluster pixel colours with K-Means using Manhattan (L1) distance.

    Parameters
    ----------
    img : array-like of shape (n_pixels, n_channels)
        Flattened image, one row per pixel.
    k_clusters : int
        Number of clusters; must be at least 1.
    max_iter : int
        Maximum number of centroid updates. With a value of 0 or less the
        pixels are only assigned to the initial centroids.
    init_centroids : str, default 'random'
        'random' draws every centroid channel uniformly from 0..255.
        Any other value picks ``k_clusters`` distinct colours that occur
        in ``img``.

    Returns
    -------
    labels : ndarray of shape (n_pixels,)
        Index of the nearest returned centroid for every pixel.
    centroids : ndarray of shape (k_clusters, n_channels), dtype float
        Final cluster centres.

    Raises
    ------
    ValueError
        If ``img`` is not 2-D, ``k_clusters`` is below 1, or the
        unique-colour initialisation is requested with fewer distinct
        colours than clusters.
    """
    pixels = np.asarray(img)
    if pixels.ndim != 2:
        raise ValueError("img must be a 2-D array of shape (n_pixels, n_channels)")
    if k_clusters < 1:
        raise ValueError("k_clusters must be at least 1")

    # centroids init
    n_channels = pixels.shape[1]
    if init_centroids == 'random':
        # Upper bound is exclusive, so 256 makes 255 reachable; the channel
        # count follows the data instead of assuming RGB.
        centroids = np.random.randint(0, 256, size=(k_clusters, n_channels))
    else:
        unique_colors = np.unique(pixels, axis=0)
        if unique_colors.shape[0] < k_clusters:
            raise ValueError(
                "img has only %d unique colors, cannot pick %d distinct centroids"
                % (unique_colors.shape[0], k_clusters)
            )
        picks = np.random.choice(unique_colors.shape[0], size=k_clusters, replace=False)
        centroids = unique_colors[picks]
    centroids = centroids.astype(float)

    # (n_pixels, 1, n_channels) view so it broadcasts against
    # (k_clusters, n_channels) centroids.
    cache = pixels[:, np.newaxis]

    def assign(current):
        # using Manhattan distance instead of Euclidean
        return np.argmin(np.sum(np.abs(cache - current), axis=2), axis=1)

    # Assign once up front so `labels` exists even when max_iter <= 0.
    labels = assign(centroids)

    for _ in range(max_iter):
        # recalculate the centroids
        new_centroids = centroids.copy()
        for i in range(k_clusters):
            members = pixels[labels == i]
            # An empty cluster keeps its previous centre. Averaging zero rows
            # would give NaN, and argmin treats NaN as the minimum, which
            # would pull every pixel into that cluster.
            if members.shape[0]:
                new_centroids[i] = members.mean(axis=0)

        if np.array_equal(new_centroids, centroids):
            break

        centroids = new_centroids
        # Re-assign right after the update so the returned labels always
        # correspond to the returned centroids.
        labels = assign(centroids)

    return labels, centroids

**Input file path and K-Means parameters**

In [None]:
# Collect the run parameters interactively.
# Whitespace around the path is stripped so a pasted path with a trailing
# space or newline still resolves.
filepath = input("Enter image path: ").strip()
k = int(input("Enter k_clusters: "))
it = int(input("Enter max_iter: "))
# kmeans() compares this value against the exact string 'random'; normalise
# case and whitespace so "Random " is not silently treated as the
# unique-colour initialisation.
init = input("Choose init method: ").strip().lower()

image = PIL.Image.open(filepath)
# Single-band images (greyscale, palette, ...) convert to a 2-D array, which
# would break the (h, w, c) unpacking below; expand them to RGB first.
if len(image.getbands()) == 1:
    image = image.convert("RGB")
# Signed integers avoid uint8 wrap-around when pixel values are subtracted.
img = np.array(image, dtype=int)

# Flatten to one row per pixel, the layout kmeans() expects.
h, w, c = img.shape
image_reshape = img.reshape(h * w, c)

In [None]:
# Cluster the flattened pixels using the parameters entered above.
labels, centroids = kmeans(image_reshape, k, max_iter=it, init_centroids=init)

**Reconstruct image**

In [None]:
# Reconstruct the image array by replacing every pixel with its centroid.
# Centroids computed as cluster means are floats: round to the nearest
# integer and clamp to 0..255 before the uint8 cast, because astype() alone
# truncates toward zero and would bias every channel darker.
quantized = np.clip(np.rint(centroids[labels]), 0, 255)
compressed_img = quantized.reshape((h, w, c)).astype(np.uint8)
# Construct an image from the image array.
compressed_image = PIL.Image.fromarray(compressed_img)

# Show the original and the colour-reduced image side by side.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))

for ax, picture, title in zip(axes, (image, compressed_image), ('Before', 'After')):
    ax.imshow(picture)
    ax.axis('off')
    ax.set_title(title)

plt.tight_layout()
plt.show()

**Runs K-Means multiple times for more accurate runtime results**

In [None]:
N_RUNS = 10  # number of timed repetitions
execution_times = []

for _ in range(N_RUNS):
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring intervals; time.time() follows the wall clock and can jump
    # when the system time is adjusted.
    start_time = time.perf_counter()
    labels, centroids = kmeans(img=image_reshape, k_clusters=k, max_iter=it, init_centroids=init)
    end_time = time.perf_counter()
    execution_time = end_time - start_time
    execution_times.append(execution_time)

# One point per timed run, in run order.
plt.plot(execution_times)
plt.xlabel('Iteration')
plt.ylabel('Execution Time (seconds)')
plt.title('Execution Time of My Code Snippet')
plt.show()

In [None]:
# tests done with donut image
#1: 1m3.3s, k=7 (with unique colors)
#2: 42.4s, k=7 (without unique colors)
#3: 1m20s, k=7 (euclidean, without unique colors)
#4: ~5s, k=7 (using scikit-learn KMeans)

# some conclusions:
# for images that don't have many colors, we can use unique colors to speed up the process,
# but generally this is a bad idea
# Euclidean distance is much slower than Manhattan distance

In [None]:
from sklearn.cluster import KMeans

# Flatten the PIL image to one row per pixel; the channel count is taken
# from the image's band list.
h, w, c = image.height, image.width, len(image.getbands())
data = np.reshape(image, (h*w, c))

# Create a KMeans object.
# It is bound to `sk_model` rather than `kmeans` so the notebook's own
# kmeans() function is not overwritten; otherwise re-running the earlier
# clustering or timing cells after this one would fail.
sk_model = KMeans(n_clusters=12, random_state=0, max_iter=1000, init='random')

# Fit the data to the KMeans model
sk_model.fit(data)

# Get the cluster labels assigned to each data point
labels = sk_model.labels_

# Get the cluster centers
centroids = sk_model.cluster_centers_

# Cluster centres are floats: round to nearest and clamp to 0..255 before
# the uint8 cast, since astype() alone truncates toward zero.
compressed_img = np.clip(np.rint(centroids[labels]), 0, 255).reshape((h, w, c)).astype(np.uint8)

# Convert numpy array back to PIL Image
compressed_image = PIL.Image.fromarray(compressed_img)

plt.axis('off')
plt.imshow(compressed_image);