#### import libraries

In [78]:
import matplotlib.pyplot as plt
import numpy as np
from keras.datasets import mnist
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans


#### download dataset

In [79]:
# mnist.load_data() yields an (images, labels) pair for the training split
# followed by an (images, labels) pair for the test split
(x_train, y_train), (x_test, y_test) = mnist.load_data()

#### print size of train and test

In [80]:
# Report the array shape of every split: test first, then training.
for split_name, split_array in (
    ('test Data:', x_test),
    ('test Labels:', y_test),
    ('Training Data:', x_train),
    ('Training Labels:', y_train),
):
    print(split_name, split_array.shape)

test Data: (10000, 28, 28)
test Labels: (10000,)
Training Data: (60000, 28, 28)
Training Labels: (60000,)


#### convert 28*28 image data to a vector

In [81]:
# Flatten each training image into a single row vector and rescale the pixel
# values to the 0 - 1 range in one pass.
X = x_train.reshape(x_train.shape[0], -1).astype(float) / 255.
Y = y_train

# Shape of the full design matrix, then of one flattened image
print(X.shape)
print(X[0].shape)

(60000, 784)
(784,)


#### initialize and fit the train data

In [82]:
# Number of target classes, taken from the training labels so the test labels
# are not consulted while the model is being set up.
n_digits = len(np.unique(y_train))
print(n_digits)

# Initialize the mini-batch k-means model with one cluster per class.
# random_state is fixed so the cluster assignments (and everything derived
# from them in later cells) are reproducible after Restart Kernel -> Run All.
kmeans = MiniBatchKMeans(n_clusters=n_digits, random_state=42)

# Fit the model on the flattened, normalized training images
kmeans.fit(X)

10


MiniBatchKMeans(n_clusters=10)

In [93]:
# Cluster index that the fitted model assigned to each training sample.
# NOTE(review): the saved output contains indices above 9, so this cell appears
# to have been executed after the 20-cluster refit further down -- confirm the
# execution order and re-run from a fresh kernel.
kmeans.labels_

array([13,  8, 11, ..., 13,  1,  9], dtype=int32)

In [94]:
# Number of cluster assignments held by the model; the saved output matches
# the number of training samples.
len(kmeans.labels_)

60000

#### Assigning Cluster Labels
K-means clustering is an unsupervised machine learning method; consequently, the labels assigned by our KMeans algorithm only identify the cluster each array was assigned to. The cluster numbering is arbitrary, so these labels are different from the actual target integers. We therefore need to map each cluster to the digit it most likely represents.

In [84]:
def infer_cluster_labels(kmeans, actual_labels):
    """
    Associates most probable label with each cluster in KMeans model.

    kmeans: fitted clustering model exposing ``n_clusters`` and ``labels_``
        (one cluster index per fitted sample).
    actual_labels: array-like of non-negative integer labels aligned with
        ``kmeans.labels_``.
    returns: dictionary mapping each label to the list of clusters for which
        it is the most common label. Clusters that received no samples are
        skipped, so they do not appear under any label.
    """
    actual_labels = np.asarray(actual_labels)
    assignments = np.asarray(kmeans.labels_)

    inferred_labels = {}
    for cluster in range(kmeans.n_clusters):
        # true labels of the samples that landed in this cluster
        members = actual_labels[assignments == cluster]
        if members.size == 0:
            # an empty cluster has no majority label; np.argmax would raise on
            # the empty count vector, so leave this cluster unmapped
            continue
        # squeeze tolerates column-vector labels such as shape (n, 1);
        # atleast_1d keeps a single-member cluster one-dimensional for bincount
        counts = np.bincount(np.atleast_1d(np.squeeze(members)))
        # most common label (ties resolve to the smallest label value)
        majority = np.argmax(counts)
        inferred_labels.setdefault(majority, []).append(cluster)
    return inferred_labels

In [85]:
def infer_data_labels(X_labels, cluster_labels):
    """
    Determines label for each array, depending on the cluster it has been assigned to.

    X_labels: 1-D array-like of cluster indices, one per sample.
    cluster_labels: dictionary mapping a label to the list of clusters that
        represent it (the structure produced by infer_cluster_labels).
    returns: uint8 array of predicted labels, one per entry of X_labels.
        Samples whose cluster is not listed under any label keep the default 0.
    """
    X_labels = np.asarray(X_labels)

    # every prediction starts at 0 and is overwritten when its cluster is mapped
    predicted_labels = np.zeros(len(X_labels), dtype=np.uint8)

    # One vectorized membership test per label replaces the per-sample scan of
    # the whole dictionary; iterating in dictionary order keeps the original
    # "last matching label wins" behavior.
    for label, clusters in cluster_labels.items():
        predicted_labels[np.isin(X_labels, clusters)] = label

    return predicted_labels

In [98]:
# Number of training labels available for the cluster-to-digit mapping
print(len(Y))

# Map every cluster of the fitted model to its majority digit, then translate
# the cluster assigned to each training image into a predicted digit.
cluster_labels = infer_cluster_labels(kmeans, Y)
X_clusters = kmeans.predict(X)
predicted_labels = infer_data_labels(X_clusters, cluster_labels)

# First ten predictions, followed by the first ten true labels
print(predicted_labels[:10])
print(Y[:10])

60000
[5 0 4 1 9 2 1 8 1 4]
[5 0 4 1 9 2 1 3 1 4]


In [99]:
# NOTE(review): n_clusters is not read anywhere in this cell; the model below
# is built with 20 clusters -- confirm which value is intended.
n_clusters=10


# Flatten the test images and normalize the data to 0 - 1, exactly as was done
# for the training images
X_test = x_test.reshape(len(x_test), -1).astype(float) / 255.

# Initialize and fit KMeans algorithm on training data. Using more clusters
# than digits lets several clusters map to the same digit. random_state is
# fixed so the metrics printed below are reproducible across runs.
kmeans = MiniBatchKMeans(n_clusters=20, random_state=42)
kmeans.fit(X)
cluster_labels = infer_cluster_labels(kmeans, Y)

# Predict the cluster of every test image once, then translate it to a digit
test_clusters = kmeans.predict(X_test)
predicted_labels = infer_data_labels(test_clusters, cluster_labels)

# calculate metrics (average=None reports one score per digit class)
print('Accuracy: {}\n'.format(metrics.accuracy_score(y_test, predicted_labels)))
print('Precision: {}\n'.format(metrics.precision_score(y_test, predicted_labels,average=None)))
print('Recall: {}\n'.format(metrics.recall_score(y_test, predicted_labels,average=None)))
print('Jaccard: {}\n'.format(metrics.jaccard_score(y_test, predicted_labels,average=None)))


Accuracy: 0.6568

Precision: [0.94965035 0.84182909 0.76746507 0.6129613  0.49563319 0.44615385
 0.88422247 0.5157832  0.5460076  0.50664137]

Recall: [0.69285714 0.98942731 0.74515504 0.67425743 0.46232179 0.26008969
 0.8131524  0.84241245 0.73716632 0.26461843]

Jaccard: [0.66830709 0.83432392 0.60790514 0.47291667 0.31440443 0.19661017
 0.73490566 0.47039652 0.45703374 0.21040189]

