In [1]:
import numpy as np
from mnist import MNIST
from sklearn.cluster import AgglomerativeClustering

In [2]:
# Create the MNIST loader, pointed at the dataset directory relative to this notebook.
mnist_loader = MNIST("../data/mnist/")
# NOTE(review): presumably makes the loader read the gzip-compressed files — confirm against the python-mnist docs.
mnist_loader.gz = True

In [3]:
# Load the training split: the images and their corresponding labels.
images, labels = mnist_loader.load_training()

In [4]:
# Convert to ndarrays so the sampling step can use boolean masks (labels == k)
# and integer-array indexing (images[row_indices]).
images = np.array(images)
labels = np.array(labels)

In [5]:
# images is 2-D: one row per image, `dims` values per row.
n_rows, dims = images.shape

Sampling 20,000 data points (2,000 from each of the 10 labels)

In [6]:
# Number of images drawn for each label.
sample_size = 2000
# Number of distinct labels; labels are addressed as 0 .. total_labels - 1 below.
total_labels = 10

In [7]:
# Seed the global NumPy RNG so the subsample (and its shuffle) is reproducible.
np.random.seed(42)

# For each label, draw `sample_size` distinct row indices among the rows carrying that label.
label_idx = {
    idx: np.random.choice((labels == idx).nonzero()[0], sample_size, replace=False)
    for idx in range(total_labels)
}

# Pre-allocate the balanced subset: pixel rows plus a column vector of labels.
n_sub = sample_size * total_labels
mnist_sub = np.empty((n_sub, dims), dtype='int16')
labels_sub = np.empty((n_sub, 1), dtype='int8')

# Copy each label's sampled rows into its own contiguous slice.
for idx, rows in label_idx.items():
    block = slice(idx * sample_size, (idx + 1) * sample_size)
    mnist_sub[block] = images[rows]
    labels_sub[block] = idx

# Shuffle so the rows are no longer grouped by label; the same permutation
# is applied to both arrays to keep images and labels aligned.
s = np.arange(n_sub)
np.random.shuffle(s)
mnist_sub, labels_sub = mnist_sub[s], labels_sub[s]

# The full training set is no longer needed; drop the references to free memory.
del images
del labels

In [8]:
# Ask for 10 clusters, the same as the number of labels (total_labels);
# all other estimator settings are left at their defaults.
model = AgglomerativeClustering(n_clusters=10)

In [9]:
%%time
# Cluster the 20,000-row subsample; slow — the recorded run took roughly 2.5 minutes of wall time.
model.fit(mnist_sub)

CPU times: user 2min 24s, sys: 1.98 s, total: 2min 26s
Wall time: 2min 27s


AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=10,
            pooling_func=<function mean at 0x7f5684124c80>)

In [10]:
# Cluster id assigned by the fitted model to each row of mnist_sub.
yhat = model.labels_

In [11]:
# Cluster-vs-label contingency table:
#   conf_mat[c, d] = number of sampled images in cluster c whose true label is d.
# Rows are sized from the model's cluster count and columns from the label count,
# so the table stays correct if either number is changed independently.
n_clusters = model.n_clusters
true_labels_flat = labels_sub.ravel()  # labels_sub is a column vector; bincount needs 1-D
conf_mat = np.zeros((n_clusters, total_labels), dtype='int32')
for cluster in range(n_clusters):
    in_cluster = yhat == cluster
    conf_mat[cluster] = np.bincount(true_labels_flat[in_cluster], minlength=total_labels)

In [15]:
def gini(confusion_matrix):
    """Size-weighted Gini index of a clustering.

    For each cluster j with M_j members and label fractions p_ij, the
    per-cluster index is G_j = 1 - sum_i p_ij**2; the result is
    sum_j(G_j * M_j) / sum_j(M_j). 0 means every cluster contains a
    single label; larger values mean more mixed clusters.

    Parameters
    ----------
    confusion_matrix : array-like, shape (n_clusters, n_labels)
        confusion_matrix[j, i] is the number of points in cluster j
        with true label i.

    Returns
    -------
    float
        The weighted Gini index, or NaN if the matrix contains no points.
    """
    counts = np.asarray(confusion_matrix, dtype=float)
    Mj = counts.sum(axis=1)
    total = Mj.sum()
    if total == 0:
        return np.nan

    # Empty clusters carry zero weight; skipping them avoids a 0/0 that
    # would otherwise turn the whole index into NaN.
    nonempty = Mj > 0
    fractions = counts[nonempty] / Mj[nonempty, np.newaxis]
    gj = 1.0 - (fractions ** 2).sum(axis=1)
    return (gj * Mj[nonempty]).sum() / total

# Report the size-weighted Gini index of the clustering (0 would mean every cluster holds a single label).
print("Gini Index: {0}".format(gini(conf_mat)))

Gini Index: 0.4010186269784047


In [14]:
def purity(confusion_matrix):
    """Purity of a clustering.

    Each cluster is credited with the count of its most frequent label;
    purity is the sum of those counts divided by the total number of
    points. 1.0 means every cluster contains a single label.

    Parameters
    ----------
    confusion_matrix : array-like, shape (n_clusters, n_labels)
        confusion_matrix[j, i] is the number of points in cluster j
        with true label i.

    Returns
    -------
    float
        Purity in [0, 1], or NaN if the matrix contains no points.
    """
    counts = np.asarray(confusion_matrix)
    total = counts.sum()
    if total == 0:
        return np.nan
    # Vectorised reductions instead of the builtin sum(), which iterates
    # over NumPy scalars one at a time.
    return counts.max(axis=1).sum() / total

# Report the fraction of points that carry the majority label of their cluster.
print("Purity: {0}".format(purity(conf_mat)))

Purity: 0.69955
