# Prototype selection for nearest neighbor

One way to speed up nearest neighbor classification is to replace the training set by a carefully chosen
subset of "prototypes". Think of a good strategy for choosing prototypes from the training set, bearing in mind that the ultimate goal is good classification performance. Assume that 1-NN will be used. Then implement and test it on the MNIST dataset, available at: http://yann.lecun.com/exdb/mnist/index.html

## Brief description

The K-Means algorithm was used: within each digit class, the training samples are clustered and every
cluster is replaced by its centroid. Each class contributes $\dfrac{M}{10}$ centroids, so the ten classes together give $M$ prototypes.

## Implementation & Test
The performance is compared to that of uniform-random selection.
### Import Libraries & MNIST

In [2]:
from mnist import MNIST
from random import sample
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import ClusterCentroids
from collections import Counter

# Load the MNIST training and test splits from the local data directory.
print("Reading data...")
mndata = MNIST('./mnist_data_files')
images_train, labels_train = mndata.load_training()
# Materialise the (image, label) pairs as a list: later cells pass this to
# random.sample() and index it, which a bare Python 3 zip iterator cannot do.
data_train = list(zip(images_train, labels_train))
images_test, labels_test = mndata.load_testing()
print("done")

Reading data...
done


### Uniform-random

In [5]:
# Prototype-set sizes (number of training samples kept) used in the experiments.
M1, M2, M3 = 1000, 5000, 10000

def random_select(data_train, size, images_test, labels_test):
    """Score 1-NN on a uniformly random subset of the training pairs.

    Draws ``size`` (image, label) pairs from ``data_train`` with
    ``random.sample``, fits a 1-nearest-neighbour classifier on them and
    returns the fraction of ``images_test`` whose predicted label equals the
    corresponding entry of ``labels_test``.
    """
    picked_pairs = sample(data_train, size)
    proto_images, proto_labels = zip(*picked_pairs)

    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(proto_images, proto_labels)

    # Element-wise comparison; summing it counts the correct predictions.
    hits = sum(model.predict(images_test) == labels_test)
    return hits * 1.0 / len(labels_test)

# Report the random-selection accuracy for each prototype budget (M1, M2, M3 in order).
print("".join([
    "acc_random_M1 = ", str(random_select(data_train, M1, images_test, labels_test)),
    ";acc_random_M2 = ", str(random_select(data_train, M2, images_test, labels_test)),
    ";acc_random_M3 = ", str(random_select(data_train, M3, images_test, labels_test)),
]))

acc_random_M1 = 0.8898;acc_random_M2 = 0.9357;acc_random_M3 = 0.9484


### Condensed Nearest Neighbour (CNN)

In [None]:
def cnn(labeled_train, M):
    """Pick prototypes with a condensed-nearest-neighbour procedure.

    ``labeled_train`` is an indexable sequence of ``(image, label)`` pairs.
    The store ``S`` is seeded with the first pair.  A first pass over the
    data moves every sample that the current 1-NN model (``p=1``, i.e.
    Manhattan distance) gets wrong into ``S`` and every other sample into
    the grabbag ``G``.  After that, ``G`` is rescanned from its start: the
    first sample the current model misclassifies is moved from ``G`` to
    ``S`` and the scan restarts.  This repeats until a complete scan moves
    nothing or ``G`` is empty.

    Returns the list ``S``; the function returns early as soon as ``S``
    holds exactly ``M`` pairs.  Progress is printed to stdout.
    """

    def fit_store(store):
        # Unzip the pairs into an image tuple and a label tuple. Unpacking
        # (rather than subscripting zip) works on both Python 2 and 3.
        images, labels = zip(*store)
        model = KNeighborsClassifier(n_neighbors=1, p=1)
        model.fit(images, labels)
        return model

    S, G = [labeled_train[0]], []
    Count = 0  # number of scans over G started in the second phase
    count = 0  # number of samples added to S beyond the seed

    # Invariant: `neigh` is always fitted on the current contents of S. It is
    # refitted only after S grows, since refitting on an unchanged store
    # would give the same model.
    neigh = fit_store(S)

    for d in labeled_train:
        if neigh.predict([d[0]]) == d[1]:
            G.append(d)
            continue
        S.append(d)
        count += 1
        print("count = " + str(count))
        if len(S) == M:
            return S
        neigh = fit_store(S)

    moved = True
    while G and moved:
        moved = False
        for pos, i in enumerate(G):
            if neigh.predict([i[0]]) != i[1]:
                S.append(i)
                count += 1  # keep the progress counter in step with S
                print("round = " + str(Count) + ";count = " + str(count))
                if len(S) == M:
                    return S
                # Delete by position: this is the element just examined, and
                # it avoids an equality search over whole image vectors.
                del G[pos]
                neigh = fit_store(S)
                moved = True
                # Restart the scan: the new prototype may change how
                # earlier grabbag samples are classified.
                break
        Count += 1
    return S

# NOTE: slow for large prototype-set sizes.
def cnn_select(data_train, size, images_test, labels_test):
    """Score 1-NN on the prototypes chosen by ``cnn``.

    Calls ``cnn(data_train, size)`` to obtain (image, label) prototypes,
    fits a 1-nearest-neighbour classifier with ``p=1`` on them and returns
    the fraction of ``images_test`` whose predicted label equals the
    corresponding entry of ``labels_test``.
    """
    prototypes = cnn(data_train, size)
    proto_images, proto_labels = zip(*prototypes)

    model = KNeighborsClassifier(n_neighbors=1, p=1)
    model.fit(proto_images, proto_labels)

    # Element-wise comparison; summing it counts the correct predictions.
    hits = sum(model.predict(images_test) == labels_test)
    return hits * 1.0 / len(labels_test)

# Report the CNN-selection accuracy for each prototype budget (M1, M2, M3 in order).
print("".join([
    "acc_cnn_M1 = ", str(cnn_select(data_train, M1, images_test, labels_test)),
    ";acc_cnn_M2 = ", str(cnn_select(data_train, M2, images_test, labels_test)),
    ";acc_cnn_M3 = ", str(cnn_select(data_train, M3, images_test, labels_test)),
]))

### Clustering

In [3]:
# One ClusterCentroids sampler per prototype budget. Every digit label is
# mapped to the same per-class count, so the ten classes sum to 1000, 5000
# and 10000 samples respectively.
# NOTE(review): newer imbalanced-learn releases appear to rename `ratio` to
# `sampling_strategy` - confirm against the installed version.
rus_M1 = ClusterCentroids(ratio=dict.fromkeys([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], 100))
rus_M2 = ClusterCentroids(ratio=dict.fromkeys([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], 500))
rus_M3 = ClusterCentroids(ratio=dict.fromkeys([1, 2, 3, 4, 5, 6, 7, 8, 9, 0], 1000))

def cluster_select(rus, images_train, labels_train, images_test, labels_test):
    """Score 1-NN on the samples produced by the sampler ``rus``.

    Resamples the training set with ``rus.fit_sample``, fits a
    1-nearest-neighbour classifier on the result and returns the fraction
    of ``images_test`` whose predicted label equals the corresponding entry
    of ``labels_test``.
    """
    # NOTE(review): `fit_sample` is the older imbalanced-learn spelling;
    # confirm it exists in the installed version.
    proto_images, proto_labels = rus.fit_sample(images_train, labels_train)

    model = KNeighborsClassifier(n_neighbors=1)
    model.fit(proto_images, proto_labels)

    # Element-wise comparison; summing it counts the correct predictions.
    hits = sum(model.predict(images_test) == labels_test)
    return hits * 1.0 / len(labels_test)

# Report the clustering-based accuracy for each prototype budget (M1, M2, M3 in order).
print("".join([
    "acc_cluster_M1 = ", str(cluster_select(rus_M1, images_train, labels_train, images_test, labels_test)),
    "；acc_cluster_M2 = ", str(cluster_select(rus_M2, images_train, labels_train, images_test, labels_test)),
    "；acc_cluster_M3 = ", str(cluster_select(rus_M3, images_train, labels_train, images_test, labels_test)),
]))

acc_cluster_M1 = 0.9576；acc_cluster_M2 = 0.9695；acc_cluster_M3 = 0.9692


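The improvement shows up explicitly in classification accuracy (for example, 0.9576 for clustering versus
0.8898 for uniform-random selection at $M = 1000$) and implicitly in storage requirements, since only the $M$ prototypes need to be kept.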