In [57]:
import numpy as np
from mnist import MNIST
from tqdm import tqdm
from sklearn.metrics.pairwise import euclidean_distances

In [2]:
mnist_loader = MNIST("../data/mnist/")
mnist_loader.gz = True

In [3]:
images, labels = mnist_loader.load_training()
timages, tlabels = mnist_loader.load_testing()

In [4]:
images = np.array(images)
labels = np.array(labels)
timages = np.array(timages)
tlabels = np.array(tlabels)

In [5]:
images.shape, timages.shape

((60000, 784), (10000, 784))

In [6]:
n_rows = images.shape[0]
t_n_rows = timages.shape[0]

# kmeans
## init

In [305]:
def init(k):
    _mu = np.empty((k, images.shape[1]))
    for rand in range(k):
        img_id = np.random.choice(len(images))
        _mu[rand] = images[img_id]
    return _mu

## E step
Assign memberships from current mu's. <br>
Also evaluate loss

In [306]:
def e_step(mu):
    distances = np.matrix(euclidean_distances(X=images, Y=mu))
    loss = np.sum(distances.min(axis=1))
    return np.array(distances.argmin(1).T).ravel(), loss

## M step
Calculate new mu's from current memberships

In [315]:
def m_step(pi, k):
    mu = np.empty((k, images.shape[1]))
    for cluster_id in range(k):
        cluster_images = images[np.where(_pi == cluster_id)[0]]
        mu[cluster_id] = np.mean(cluster_images, axis=0)
    return mu

## k means

In [313]:
k = 10
np.random.seed(666)
max_epochs = 100

In [314]:
_mu = init(k)
for _epoch in range(max_epochs):
    _pi, loss = e_step(_mu)
    mu = m_step(_pi, k)
    if _epoch % 5 == 0:
        print('Epoch:', _epoch, 'Loss:', loss)
    if np.allclose(_mu, mu):
        print('Convergence at epoch {0}, loss {1}'.format(_epoch, loss))
        break
    _mu = mu

Epoch: 0 Loss: 128768701.369
Epoch: 5 Loss: 95821331.0201
Epoch: 10 Loss: 95368733.6418
Epoch: 15 Loss: 95316648.4302
Epoch: 20 Loss: 95169685.9731
Epoch: 25 Loss: 94823078.0926
Epoch: 30 Loss: 94775526.8056
Epoch: 35 Loss: 94773679.5606
Epoch: 40 Loss: 94773467.4025
Epoch: 45 Loss: 94773676.6578
Convergence at epoch 46, loss 94773678.2702097
