In [1]:
import numpy as np

from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.preprocessing import scale

# Seed NumPy's global generator before any stochastic step runs.
np.random.seed(42)

# Load the digits dataset and standardise its feature matrix.
digits = load_digits()
data = scale(digits.data)
labels = digits.target

n_samples, n_features = data.shape
# One cluster per distinct target value.
n_digits = np.unique(labels).size

# Reference clustering from scikit-learn with randomly chosen initial centers.
kmeans = KMeans(n_clusters=n_digits, init='random', n_init=10)
out = kmeans.fit_predict(data)
sklearn_centers = kmeans.cluster_centers_

In [2]:
# Small 3x3 matrix used to see how np.sort behaves along axis 0.
test = np.array([
    [1, 3, 2],
    [1, 1, 0],
    [3, 2, 4],
])
print(test)
# Sorting along axis 0 orders each column independently.
print(np.sort(test, axis=0))

[[1 3 2]
 [1 1 0]
 [3 2 4]]
[[1 1 0]
 [1 2 2]
 [3 3 4]]


In [3]:
def distance(point1, point2):
    """Return the 2-norm of ``point1 - point2``.

    For 1-D inputs of equal length this is the Euclidean distance between
    the two points.
    """
    offset = point1 - point2
    return np.linalg.norm(offset, ord=2)

def find_closest(point, centers):
    """Return the index of the entry in ``centers`` nearest to ``point``.

    Distances are measured with ``distance``. The comparison is strict, so
    when several centers are equally close the lowest index wins.
    """
    best_index = 0
    best_dist = distance(point, centers[best_index])
    for candidate in range(1, len(centers)):
        candidate_dist = distance(point, centers[candidate])
        if candidate_dist < best_dist:
            best_index, best_dist = candidate, candidate_dist
    return best_index

def my_kmeans(data, n_clusters, init, tol=0.0001):
    """Cluster ``data`` with Lloyd's k-means iteration, starting from ``init``.

    Each iteration assigns every sample to its nearest center (Euclidean
    distance, ties going to the lowest center index) and then moves every
    center to the mean of the samples assigned to it. Iteration stops once
    no coordinate of any center moved by ``tol`` or more.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Samples to cluster.
    n_clusters : int
        Number of clusters; must equal the number of rows of ``init``.
    init : array-like of shape (n_clusters, n_features)
        Initial centers. The array is copied and never modified.
    tol : float, default 0.0001
        Per-coordinate threshold on the center movement between two
        consecutive iterations.

    Returns
    -------
    centers : ndarray of shape (n_clusters, n_features)
        Final cluster centers.
    cluster_indices : ndarray of int, shape (n_samples,)
        Index of the center each sample was assigned to in the last
        iteration.

    Raises
    ------
    ValueError
        If ``init`` does not have shape ``(n_clusters, n_features)``.
    """
    data = np.asarray(data, dtype=float)
    n_samples, n_features = data.shape

    # Work on a float copy so the caller's ``init`` is never aliased.
    centers = np.array(init, dtype=float)
    if centers.shape != (n_clusters, n_features):
        raise ValueError(
            "init must have shape (n_clusters, n_features) = (%d, %d), got %r"
            % (n_clusters, n_features, centers.shape)
        )

    # Always run at least one assignment/update pass, whatever ``tol`` is.
    while True:
        old_centers = centers

        # Assign points: (n_samples, n_clusters) distance table, row-wise argmin.
        offsets = data[:, np.newaxis, :] - old_centers[np.newaxis, :, :]
        cluster_indices = np.linalg.norm(offsets, axis=2).argmin(axis=1)

        # Update centers: per-cluster coordinate sums and member counts.
        cluster_counts = np.bincount(cluster_indices, minlength=n_clusters)
        sums = np.zeros((n_clusters, n_features))
        np.add.at(sums, cluster_indices, data)

        # A cluster that lost all its members keeps its previous center
        # instead of becoming 0/0 = NaN, which would never satisfy the
        # convergence test and loop forever.
        centers = old_centers.copy()
        occupied = cluster_counts > 0
        centers[occupied] = sums[occupied] / cluster_counts[occupied, np.newaxis]

        shift = np.abs(centers - old_centers)
        # The second condition lets ``tol=0`` terminate once the centers are
        # exactly stationary (a strict ``< 0`` can never hold).
        if np.all(shift < tol) or not shift.any():
            break

    return centers, cluster_indices

In [4]:
from sklearn.cluster import KMeans

# Problem size for a small synthetic comparison.
n_samples = 30
n_clusters = 3
n_features = 5

# Random data set; the initial centers are distinct rows sampled from it so
# that both implementations start from the same configuration.
dt = np.random.random_sample(size=(n_samples, n_features))
center_indices = np.random.choice(n_samples, size=n_clusters, replace=False)
init = dt[center_indices].copy()

# Reference run: scikit-learn started from the shared initial centers.
kmeans = KMeans(n_clusters=n_clusters, init=init, n_init=1, tol=0.0001)
sklearn_indices = kmeans.fit_predict(dt)
print('sklearn finished computing the clusters.')

# Hand-written implementation with the same data, initial centers and tolerance.
centers, cluster_indices = my_kmeans(dt, n_clusters, init, tol=0.0001)
print('my implementation finished computing the clusters')

# Show both label vectors, then whether they agree element-wise.
print(sklearn_indices)
print(cluster_indices)
print(np.all(sklearn_indices == cluster_indices))

sklearn finished computing the clusters.
my implementation finished computing the clusters
[2 2 1 1 0 0 1 2 2 2 2 1 0 1 1 1 1 0 0 1 1 2 1 2 2 1 1 1 0 0]
[2 2 1 1 0 0 1 2 2 2 2 1 0 1 1 1 1 0 0 1 1 2 1 2 2 1 1 1 0 0]
True
