## Using PCA and NCM for MNIST feature extraction and classification
### Summary
MNIST 데이터셋을 분류하기 위해 PCA로 feature 추출 후 NCM으로 분류.

기존 CNN Test Accuracy : 약 98.5% 

CNN + NCM : 약 93.7% 

In [21]:
import time
from tqdm import tqdm
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from PIL import Image

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import NearestCentroid
from sklearn.decomposition import PCA

from torch.utils.data import DataLoader
import torchvision
from torchvision.datasets import MNIST
from torchvision import transforms

In [13]:
class BaseConfig:
    def __init__(self):
        pass

    def info(self):
        config_list = {
            0 : ['Dataset', self.dataset, 
                 "Train size: " + str(int((1-self.test_size)*100))+"%",
                 'Feature size: ' + str(self.feature_size), 1],
            1 : ['Method', self.method, 
                 "k = " + str(self.k), self.distance, 2],
            2 : ['Dimension reduction', 
                 'Method: ' + str(self.reduction_method[0]),
                 'Component size: ' + str(self.reduction_method[1]),
                 'Feature Reduction Ratio: ' + \
                    str(round(self.reduction_method[1]/self.feature_size, 3)*100 if self.reduction_method[1] is not None else None)+"%", 3],
            3 : ['Iteration', str(self.iter), -1]
        }

        print("Device " + "─" * 2 + " " + self.device)
        print("│")

        parent = 1
        for child in range(len(config_list)):
            for idx, contents in enumerate(config_list[child][:-1]):
                if idx == 0 and child == len(config_list)-1:
                    print("└" + "─" * 2 + contents)
                elif idx == 0:
                    print("├" + "─" * 2 + contents)
                elif child == len(config_list)-1:
                    print(" " * 4 + "└" + "─" * 4 + contents)
                else:
                    print("│" + " " * 4 + "└" + "─" * 4 + contents)
            parent = config_list[child][-1]
            if parent == -1: break
            print("│")

class Config(BaseConfig):
    def __init__(self):
        super(BaseConfig, self).__init__()
        self.device = 'PC'
        self.dataset = 'Mnist'
        self.test_size = 0.2
        self.feature_size = 784
        self.method = 'NCM'
        self.k = None
        self.distance = 'Euclidean'
        self.reduction_method = [None, None] # method, n_components
        self.iter = 10

In [3]:
train_mnist = MNIST(root='../../datasets', train=True, download=False)
test_mnist = MNIST(root='../../datasets', train=False, download=False)

  return torch.from_numpy(parsed.astype(m[2], copy=False)).view(*s)


In [4]:
train_mnist_X, train_mnist_y = train_mnist.data.numpy(), train_mnist.targets.numpy()
test_mnist_X, test_mnist_y = test_mnist.data.numpy(), test_mnist.targets.numpy()

print(f'Train data X shape : {train_mnist_X.shape}, y shape : {train_mnist_y.shape}')
print(f'Test data X shape : {test_mnist_X.shape}, y shape : {test_mnist_y.shape}')

Train data X shape : (60000, 28, 28), y shape : (60000,)
Test data X shape : (10000, 28, 28), y shape : (10000,)


In [11]:
print(f'Mnist Data range [{train_mnist_X.min()}, {train_mnist_X.max()}] ')

train_mnist_X, test_mnist_X = train_mnist_X / 255., test_mnist_X / 255.

print(f'--> Mnist Data Normalized range [{train_mnist_X.min()}, {train_mnist_X.max()}] ')

Mnist Data range [0, 255] 
--> Mnist Data Normalized range [0.0, 1.0] 


In [12]:
print(f'Mnist Data shape train : {train_mnist_X.shape}, test : {test_mnist_X.shape}')

train_mnist_X = train_mnist_X.reshape(-1, 784)
test_mnist_X = test_mnist_X.reshape(-1, 784)

print(f'--> Mnist Data shape train : {train_mnist_X.shape}, test : {test_mnist_X.shape}')

Mnist Data shape train : (60000, 28, 28), test : (10000, 28, 28)
--> Mnist Data shape train : (60000, 784), test : (10000, 784)


In [14]:
config = Config()
config.info()

Device ── PC
│
├──Dataset
│    └────Mnist
│    └────Train size: 80%
│    └────Feature size: 784
│
├──Method
│    └────NCM
│    └────k = None
│    └────Euclidean
│
├──Dimension reduction
│    └────Method: None
│    └────Component size: None
│    └────Feature Reduction Ratio: None%
│
└──Iteration
    └────10


In [19]:
def NCM_run(train_data_X = train_mnist_X,
            train_data_y = train_mnist_y,
            test_data_X = test_mnist_X,
            test_data_y = test_mnist_y,
            config = None,
            weights = 'distance'):

    assert config is not None
    config.info()
    max_seed = config.iter

    avg_test_acc = []
    avg_ncm_fit_time = []
    avg_pred_time = []
    preds = []

    for seed in tqdm(range(max_seed)):
        x_train, x_test, y_train, y_test = train_data_X, test_data_X, train_data_y, test_data_y

        ncm = NearestCentroid()

        start_time = time.perf_counter()
        ncm.fit(x_train, y_train)
        ncm_fit_time = time.perf_counter() - start_time
        avg_ncm_fit_time.append(ncm_fit_time)

        start_time = time.perf_counter()
        pred = ncm.predict(x_test)
        pred_time = time.perf_counter() - start_time
        preds.append(pred)

        test_score = accuracy_score(pred, y_test)
        avg_test_acc.append(test_score)
        avg_pred_time.append(pred_time)

    print("Train size : ", len(x_train), " / Test size : ", len(x_test))
    print("-----" * 8)
    print("Test set score: %f" % np.array(avg_test_acc).mean())
    print("NCM fitting Time: %.4f ± %.5f" % (np.array(avg_ncm_fit_time).mean(), np.array(avg_ncm_fit_time).std()), "sec")
    print("All Test dataset Prediction Time at once : %.4f ± %.5f" % (np.array(avg_pred_time).mean(), np.array(avg_pred_time).std()), "sec")
    print("Divide the Prediction Time by Test size : %.8f ± %.8f" % (np.array(avg_pred_time).mean()/len(x_test)*1e6, np.array(avg_pred_time).std()/len(x_test)*1e6), "microsec")

    return avg_test_acc, avg_ncm_fit_time, avg_pred_time

In [16]:
def pca_run(train_data_X = train_mnist_X,
            test_data_X = test_mnist_X,
            config=None):

    assert config is not None
    config.info()

    n_components = config.reduction_method[1]

    pca_dims = PCA(n_components)
    print(f"The number of components : {n_components}")

    start_time = time.perf_counter()
    pca_dims.fit(train_data_X)
    pca_fit_time = time.perf_counter () - start_time
    print()
    print(f"Calculating SVD Matrix Time on Train Data-{train_data_X.shape} : {pca_fit_time:4f} sec")
    
    start_time = time.perf_counter()
    train_features = pca_dims.transform(train_data_X)
    train_features_extract_time = time.perf_counter () - start_time
    print(f"Transform train X-{train_data_X.shape} to {n_components}-PCA Time: {train_features_extract_time:4f} sec")

    start_time = time.perf_counter()
    test_features = pca_dims.transform(test_data_X)
    test_features_extract_time = time.perf_counter () - start_time
    print(f"Transform test X-{train_data_X.shape} to {n_components}-PCA Time: {test_features_extract_time:4f} sec")

    return pca_dims, train_features, test_features, pca_fit_time, train_features_extract_time, test_features_extract_time

In [17]:
n_components_list = []
pca_fit_time_list = []
train_features_extract_time_list = []
test_features_extract_time_list = []

In [25]:
config.reduction_method = ['PCA', 64]

pca_dims, train_features, test_features, pca_fit_time, train_features_extract_time, test_features_extract_time = \
    pca_run(train_data_X = train_mnist_X,
            test_data_X = test_mnist_X,
            config=config)

n_components_list.append(config.reduction_method[1])
pca_fit_time_list.append(pca_fit_time)
train_features_extract_time_list.append(train_features_extract_time)
test_features_extract_time_list.append(test_features_extract_time)

Device ── PC
│
├──Dataset
│    └────Mnist
│    └────Train size: 80%
│    └────Feature size: 784
│
├──Method
│    └────NCM
│    └────k = None
│    └────Euclidean
│
├──Dimension reduction
│    └────Method: PCA
│    └────Component size: 64
│    └────Feature Reduction Ratio: 8.200000000000001%
│
└──Iteration
    └────10
The number of components : 64

Calculating SVD Matrix Time on Train Data-(60000, 784) : 1.922037 sec
Transform train X-(60000, 784) to 64-PCA Time: 0.252412 sec
Transform test X-(60000, 784) to 64-PCA Time: 0.062013 sec


In [26]:
avg_test_acc, avg_knn_fit_time, avg_pred_time = NCM_run(train_data_X = train_features,
                                                        train_data_y = train_mnist_y,
                                                        test_data_X = test_features,
                                                        test_data_y = test_mnist_y,
                                                        config = config,
                                                        weights = 'distance')

Device ── PC
│
├──Dataset
│    └────Mnist
│    └────Train size: 80%
│    └────Feature size: 784
│
├──Method
│    └────NCM
│    └────k = None
│    └────Euclidean
│
├──Dimension reduction
│    └────Method: PCA
│    └────Component size: 64
│    └────Feature Reduction Ratio: 8.200000000000001%
│
└──Iteration
    └────10


100%|██████████| 10/10 [00:00<00:00, 29.83it/s]

Train size :  60000  / Test size :  10000
----------------------------------------
Test set score: 0.818100
NCM fitting Time: 0.0286 ± 0.00193 sec
All Test dataset Prediction Time at once : 0.0041 ± 0.00161 sec
Divide the Prediction Time by Test size : 0.40941700 ± 0.16129807 microsec



