# Unsupervised Classification of MNIST

In this notebook we will be applying an unsupervised classification protocol, for the classification of MNIST images. For the step of feature extraction we will be using both an autoencoder and the VGG-16 pre-trained model. When running this code the user will have to choose one of those to run at a time.

---


In [None]:
import os

import sys
sys.path.append('..')

from pathlib import Path
import pickle

%load_ext autoreload
%autoreload 2
from helper import visualize as vis

import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

from keras import models
from keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, BatchNormalization, Reshape, UpSampling2D, Dropout, Conv2DTranspose, Activation, Concatenate
from keras.models import Model, load_model
from keras.datasets import mnist
from keras.utils import to_categorical
from keras.regularizers import l2
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import preprocess_input

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import matplotlib.style
import matplotlib as mpl

from scipy.stats import mode, stats

import numpy as np
import math
import random

import idx2numpy

from tqdm.auto import tqdm

---

## Step 1: Importing Datasets & Re-shaping Images.

---

In [None]:
### Downloading the MNIST Dataset ###

# `mnist.load_data()` hands back the training pair and the test pair (images, labels).
(train_images, train_labels), (test_images_, test_labels_) = mnist.load_data()

# Normalisation: divide each split by its own largest pixel value,
# so the brightest pixel of each split becomes 1.
train_images = train_images / train_images.max()
test_images_ = test_images_ / test_images_.max()


In [None]:

# Only the test images from index 5000 onwards are classified; per the original
# note these have not been seen by the autoencoder up to now.
held_out = slice(5000, None)
test_images_1 = test_images_[held_out]
test_labels   = test_labels_[held_out]

# ----------------------- Re shaping ----------------------- #
def resize_and_convert( images, shape ):
    """Convert single-channel images to resized 3-channel images.

    A trailing channel axis is appended, the channel is replicated with
    ``tf.image.grayscale_to_rgb`` and the result is resized with
    ``tf.image.resize``.

    Parameters
    ----------
    images : grayscale images without a channel axis
        (presumably shaped (n_images, height, width) -- confirm against caller).
    shape : target size handed to ``tf.image.resize`` (e.g. ``[64, 64]``).

    Returns
    -------
    The tensor produced by ``tf.image.resize``.
    """
    with_channel_axis = tf.expand_dims( images, axis=-1 )
    return tf.image.resize( tf.image.grayscale_to_rgb( with_channel_axis ), shape )
# ---------------------------------------------------------- #

# Target sizes for the two image sets (autoencoder and VGG-16)
autoencoder_shape, vgg_shape = [64,64], [224,224]

# Same held-out images, resized once per target size
test_images_auto = resize_and_convert( images=test_images_1, shape=autoencoder_shape )
test_images_vgg  = resize_and_convert( images=test_images_1, shape=vgg_shape )


---

## Step 2: Loading Model

---

In [None]:
### ====================== Loading Autoencoder Model ====================== ###

# Autoencoder trained for this purpose; path is relative to the working directory
model_path = './autoencoder_model_mnist'
entire_model = load_model( model_path )  # loads the whole saved model

### ====================== Loading VGG-16 Model =========================== ###

# Reuse the local copy of VGG-16 when it exists; otherwise build it with the
# ImageNet weights and save it so the next run can load it from disk.
vgg16_path = Path('..','models','VGG16.h5')
if vgg16_path.is_file():
    vgg16 = keras.models.load_model(vgg16_path)
else:
    vgg16 = keras.applications.VGG16( include_top=True, weights='imagenet' )
    vgg16.save(vgg16_path)


In [None]:
### =========================== GETTING FEATURE EXTRACTOR =========================== ###

def layer_extractor( layer, model ):
    """Build a sub-model that outputs the activations of one named layer.

    Parameters
    ----------
    layer : name of the layer whose output is wanted.
    model : Keras model containing that layer.

    Returns
    -------
    A ``keras.Model`` sharing ``model``'s input and emitting the chosen layer's output.

    Raises
    ------
    AssertionError if no layer of ``model`` carries the name ``layer``.
    """
    layer_names = [candidate.name for candidate in model.layers]
    assert layer in layer_names  # make sure the layer exists

    target_output = model.get_layer( layer ).output
    return keras.Model(inputs = model.input, outputs=[ target_output ])

### ===================== Creating Feature Extractor and feature map ================ ###

# Getting the feature extractors: the autoencoder's 'flatten' layer and VGG-16's 'fc1' layer
feature_extractor_auto = layer_extractor('flatten', entire_model)
feature_extractor_vgg  = layer_extractor( 'fc1' , vgg16 )

# Computing feature map (autoencoder): fed the max-normalised images directly
feature_map_auto = feature_extractor_auto.predict( test_images_auto , verbose=True)

# Computing feature map (VGG-16).
# The ImageNet weights expect inputs that went through VGG-16's own `preprocess_input`,
# which operates on 0-255 RGB values. The images were divided by their maximum earlier
# (255 for MNIST), so they are scaled back to 0-255 before preprocessing.
# NOTE: this creates a second full-size copy of the 224x224 image set in memory.
vgg_inputs = preprocess_input( test_images_vgg * 255.0 )
feature_map_vgg = feature_extractor_vgg.predict( vgg_inputs , verbose=True)


---

## Step 3: PCA Components.

- We move forward by choosing one of the two feature maps we have created.

---

In [None]:
### Calculating all PCA components

# No `n_components` is given, so PCA keeps every component it can compute
pca_n = PCA(svd_solver='full')
x_pca_ = pca_n.fit_transform( feature_map_auto )

# Cumulative explained-variance ratio: entry j is the share kept by the first j+1 components
var_ = np.cumsum( pca_n.explained_variance_ratio_ )


In [None]:
### PLOTTING GRAPH: How much variance is kept for a PCA component ###

# Component counts start at 1: var_[0] is the variance kept by the first component alone.
# Both panels share this axis so a given x value means the same thing in each.
n_components_axis = np.arange(1, len(var_) + 1)

fig1, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# ---------------- Left panel: logarithmic x-axis ---------------- #
ax1.plot( n_components_axis, var_ , marker='o')
ax1.set_xscale('log')
ax1.set_xlabel('Number of Components $Log_{10}x$')
ax1.set_ylabel('Cumulative Variance')
ax1.grid(True)

# ---------------- Right panel: linear x-axis -------------------- #
ax2.plot( n_components_axis, var_ , marker='o')
ax2.set_xlabel('Number of Components')
ax2.set_ylabel('Cumulative Variance')
ax2.grid(True)

fig1.suptitle('Cumulative Variance by PCA Components');


In [None]:
### Keeping 50 Components ### Applying whitening in one case and not in the other ###

# Projection WITH whitening
pca_w = PCA(
    n_components=50,
    svd_solver='full',
    whiten=True,
    random_state=123414,
)
x_w = pca_w.fit_transform( feature_map_auto )

# Projection WITHOUT whitening (otherwise the same settings)
pca_nw = PCA(
    n_components=50,
    svd_solver='full',
    whiten=False,
    random_state=123414,
)
x_nw = pca_nw.fit_transform( feature_map_auto )


---

### Visualising Clustering feature map through t-SNE.

---

In [None]:
### Reducing dimensionality to 2D with tSNE ###

# Non-whitened PCA features -> 2D embedding
tsne_nw = TSNE( n_components=2, random_state=12214 )
x_nw_tsne = tsne_nw.fit_transform( x_nw )

# Whitened PCA features -> 2D embedding (separate estimator with its own seed)
tsne_w = TSNE( n_components=2, random_state=654753 )
x_w_tsne = tsne_w.fit_transform( x_w )


In [None]:
### Plotting on a scatter graph ###

# One row, two panels: non-whitened embedding on the left, whitened on the right
fig1, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,5))

# Transposing the (n_samples, 2) embedding yields the x row and the y row
ax1.scatter( *x_nw_tsne.T, marker='o' )
ax1.set_title('Without Whitening')

ax2.scatter( *x_w_tsne.T, marker='o' )
ax2.set_title('With Whitening')

fig1.suptitle('t-SNE Visualization of Image Features');


---

## Step 4: K Means Clustering.

---

In [None]:
### Applying k-means to the Data with whitening applied ###

# 10 clusters, 500 k-means++ restarts of up to 500 iterations each; `fit` returns the estimator
kmeans_w = KMeans(
    n_clusters=10,
    init='k-means++',
    max_iter=500,
    n_init=500,
    random_state=213460,
).fit( x_w )

# Raw cluster ids (not yet matched to digits)
labels_unmatched_w = kmeans_w.labels_

print(f'inertia: {kmeans_w.inertia_:.2f}')


In [None]:
### Applying k-means to the Data with no whitening applied ###

# 10 clusters, 500 k-means++ restarts of up to 500 iterations each; `fit` returns the estimator
kmeans_nw = KMeans(
    n_clusters=10,
    init='k-means++',
    max_iter=500,
    n_init=500,
    random_state=218460,
).fit( x_nw )

# Raw cluster ids (not yet matched to digits)
labels_unmatched_nw = kmeans_nw.labels_

print(f'inertia: {kmeans_nw.inertia_:.2f}')


---

## Step 5: Matching Clusters & Accuracy

---

In [None]:
### Checking accuracy Based on MNIST Labels ###

def matching_clustering( cluster_labels ):
    """Match k-means cluster ids to digits by majority vote and report accuracy.

    For every cluster id 0-9, the most frequent entry of the notebook-level
    ``test_labels`` among that cluster's members becomes the cluster's digit
    (ties go to the smallest digit). The overall accuracy and the per-cluster
    share of the winning digit are printed, and a bar chart of that share per
    assigned digit is shown.

    Parameters
    ----------
    cluster_labels : 1-D integer array of cluster ids (0-9), aligned with ``test_labels``.

    Returns
    -------
    labels_matched : array in which each sample's cluster id is replaced by its cluster's digit.
    index : int array of length 10; ``index[c]`` is the digit assigned to cluster ``c``
        (0 for a cluster that received no samples).
    """
    n_clusters = 10
    cluster_labels = np.asarray( cluster_labels )
    true_labels = np.asarray( test_labels )

    # --------------- majority vote per cluster (computed once) --------------- #
    index = np.zeros( n_clusters, dtype='int' )        # digit assigned to each cluster
    top_counts = np.zeros( n_clusters, dtype='int' )   # how often that digit occurs in the cluster
    sizes = np.zeros( n_clusters, dtype='int' )        # number of samples in the cluster

    for i in range(n_clusters):
        numbers_in_cluster = true_labels[ cluster_labels == i ]
        sizes[i] = len(numbers_in_cluster)
        if sizes[i] == 0:
            # Nothing to vote on; avoids a division by zero further down.
            continue

        # np.unique returns sorted values, so argmax picks the smallest digit on ties.
        digits, counts = np.unique( numbers_in_cluster, return_counts=True )
        winner = counts.argmax()
        index[i] = digits[winner]
        top_counts[i] = counts[winner]

    populated = sizes > 0
    purity = np.zeros( n_clusters )
    purity[populated] = np.round( top_counts[populated] / sizes[populated] * 100, 2 )

    # --------------- Applying the mapping & computing the accuracy --------------- #
    labels_matched = index[ cluster_labels ]
    accuracy = accuracy_score( true_labels, labels_matched )

    print(f"\nTotal Accuracy of K Means clustering: { round(accuracy*100, 2) }%")

    print('\n-----------------------------------------------------------\n')

    # --------------- Per-cluster report --------------- #
    for i in range(n_clusters):
        print(f'Cluster { i }:')
        if not populated[i]:
            print('\nNo samples were assigned to this cluster.\n')
            continue
        print(f'\nThe number {index[i]} appears most often...')
        print(f'It appears { purity[i] } % of the time ({top_counts[i]} times).\n')

    # --------------- Bar chart: share of the winning digit per assigned digit --------------- #
    # Column 0: assigned digit, column 1: share in %. Empty clusters are left out so
    # they do not add a zero-height bar at digit 0.
    cluster_accuracies = np.column_stack(( index, purity ))[ populated ]
    sorted_array = cluster_accuracies[ cluster_accuracies[:, 0].argsort() ]

    plt.figure()

    plt.bar( sorted_array[ :, 0 ] , sorted_array[ :, 1 ] )
    plt.title('Accuracy per Cluster')
    plt.xlabel('Label')
    plt.ylabel('Accuracy / %')

    plt.grid()

    plt.show()

    return labels_matched, index


In [None]:
### Computing Accuracies for whitening and no whitening ###

# Whitened run first, then the non-whitened run; each call prints its report and shows its chart
labels_w, index_w = matching_clustering( cluster_labels=labels_unmatched_w )
labels_nw, index_nw = matching_clustering( cluster_labels=labels_unmatched_nw )


In [None]:
### =========================== Comparing to REAL Labels =========================== ###

# All three panels are drawn on the same (non-whitened) t-SNE embedding so the
# colourings can be compared point for point; only the colouring differs.
panels = [
    ('k Means With No Whitening', labels_nw),
    ('k Means With Whitening',    labels_w),
    ('Real Labels',               test_labels),
]

fig1, axes = plt.subplots(1, 3, figsize=(30,7.5))

for ax, (title, colours) in zip(axes, panels):
    # vmin/vmax pin the colour scale to digits 0-9 in every panel. Without them the
    # scale follows the min/max of each label array, so a digit missing from the
    # predictions would shift the colours of one panel relative to the others.
    scatter = ax.scatter(x_nw_tsne[:, 0], x_nw_tsne[:, 1], c=colours,
                         cmap='tab10', vmin=0, vmax=9, marker='o')
    ax.set_title(title)
    cbar = plt.colorbar(scatter, ax=ax, ticks=range(10))
    cbar.set_label('Label')

fig1.suptitle('Comparing classification results');



---

### Further Visualisations

---

In [None]:
### Looking at the images belonging to each cluster, as classified by k-means clustering ###

n_images = 49   # maximum number of images to show
cluster_n = 7   # raw k-means cluster id (whitened run) to inspect

# Smallest square grid that can hold n_images panels
sides = math.ceil( np.sqrt( n_images ) )

# Sample positions that k-means put into the chosen cluster
positions = np.where( labels_unmatched_w == cluster_n )[0]

# The cluster may hold fewer than n_images samples; never index past its end
n_shown = min( n_images, len(positions) )

fig1 = plt.figure( figsize=(25,25) )

for i in range( n_shown ):

    ax = fig1.add_subplot( sides, sides, i+1 )
    ax.imshow( test_images_auto[ positions[i] ] )
    ax.set_title(f'Number { test_labels[ positions[i] ] }')

fig1.suptitle(f'CLUSTER {cluster_n}, Predicted to be {index_w[cluster_n]}.',fontsize=30);

plt.show()


In [None]:
### Creating and designing confusion matrix ###

# Confusion matrix of the real labels against the matched k-means labels (no whitening)
CM = confusion_matrix( y_true=test_labels, y_pred=labels_nw )

# Tick labels "0" .. "9", handed to the helper function for visualisation
labels_ordered = [str(digit) for digit in range(10)]
vis.pretty_cm(CM, labels_ordered)


In [None]:

# Using the project helper `vis.pano_plot` to visualise the feature map with images.
# It receives the two t-SNE coordinates (non-whitened run), the 64x64 image set and a patch shape;
# presumably it draws each image as a patch at its t-SNE position -- confirm against helper/visualize.
vis.pano_plot(x_nw_tsne[:,0], x_nw_tsne[:,1], test_images_auto, patch_shape=(2, 2))


---

## PCA research

---

In [None]:

def PCA_components_function( N, k, data, whitening_bool ):
    '''
    Function which carries out classification protocol for varying values of principal components.

    The data are projected onto N principal components, clustered with k-means, and every
    cluster is assigned the most frequent entry of the notebook-level `test_labels` among
    its members. The accuracy of that assignment against `test_labels` is returned.

    INPUT:
    - N              => number of principal components
    - k              => k value for k-means clustering
    - data           => feature map following feature extraction (rows aligned with `test_labels`)
    - whitening_bool => boolean setting whether whitening is or isnt applied
    OUTPUT:
    - Percentage accuracy of classification, rounded to one decimal place
    '''

    # PCA #
    pca = PCA( n_components = N, svd_solver='full', whiten=whitening_bool )
    pca_data = pca.fit_transform( data )

    # K Means clustering (seed derived from k) #
    kmeans_ = KMeans( n_clusters = k, init='k-means++', max_iter=500, n_init=500, random_state= k*50 )
    labels = kmeans_.fit( pca_data ).labels_

    # map the cluster labels to the real Label #
    labels_map = np.zeros_like( labels )

    # Iterate over all k clusters (not a fixed 10) so every cluster gets a majority label
    for i in range(k):
        mask = (labels == i)
        if not mask.any():
            # empty cluster: no members to vote and nothing to assign
            continue
        labels_map[ mask ] = mode( test_labels[ mask ] )[0]

    # Calculate the accuracy #
    accuracy = accuracy_score(test_labels, labels_map)

    return round(accuracy*100, 1)


In [None]:
### Calculating Accuracies where whitening is applied ###

# Principal Component values
# NOTE(review): PCA requires n_components <= min(n_samples, n_features); confirm the
# feature map is large enough for the biggest value here.
pca_values = np.array([ 1, 5, 10, 20, 50, 100, 150, 250, 500, 750, 1000, 1800, 3600])
accuracies_per_PCA_whitening = np.zeros( len( pca_values ) )

whitening = True

# tqdm wraps the array itself (not the enumerate generator) so it knows the total
# number of steps and can display real progress.
for i, i_value in enumerate(tqdm(pca_values, desc='PCA sweep (whitening)')):

    accuracies_per_PCA_whitening[i] = PCA_components_function( i_value, 10, feature_map_auto , whitening )


In [None]:
### Calculating Accuracies where whitening is not applied ###

# Principal Component values
# NOTE(review): PCA requires n_components <= min(n_samples, n_features); confirm the
# feature map is large enough for the biggest value here.
pca_values = np.array([ 1, 5, 10, 20, 50, 100, 150, 250, 500, 750, 1000, 1800, 3600])
accuracies_per_PCA_no_whitening = np.zeros( len( pca_values ) )

whitening = False

# tqdm wraps the array itself (not the enumerate generator) so it knows the total
# number of steps and can display real progress.
for i, i_value in enumerate(tqdm(pca_values, desc='PCA sweep (no whitening)')):

    accuracies_per_PCA_no_whitening[i] = PCA_components_function( i_value, 10, feature_map_auto , whitening )


In [None]:
### Plotting Results ###

# Single axes holding both accuracy curves against the number of PCA components
fig, ax = plt.subplots(figsize=(15,10))

ax.plot( pca_values, accuracies_per_PCA_no_whitening, label='No Whitening' )
ax.plot( pca_values, accuracies_per_PCA_whitening,    label='Whitening' )

# Component counts span several orders of magnitude, hence the log axis
ax.set_xscale('log')

ax.set_title('Accuracy of clustering')
ax.set_xlabel('PCA components')
ax.set_ylabel('Accuracy / %')
ax.grid()

ax.legend();



---