# In [None]:
import h5py
import numpy as np
from numpy import linalg as LA
from scipy.spatial.distance import cdist
import numpy.random as Random
import matplotlib.pyplot as plt
import time
from mpl_toolkits.mplot3d import Axes3D

# This script implements PCA, KNN, and K-fold cross-validation, and uses them to produce the results.
# The dataset files must be placed in ./Dataset before running.

# Wall-clock start time; the elapsed time is printed at the end of the script.
start = time.time()

class PCA:
    """Principal component analysis on standardized (z-scored) features."""

    def __init__(self,n_components):
        """
        Store the target dimensionality.

        :param n_components: number of principal components to keep
        """
        self.n_components=n_components

        # Per-feature statistics; overwritten with arrays by fit().
        self._mean=0
        self._std=0

    def fit(self,X):
        """
        Standardize X, compute its principal axes and project X onto them.

        :param X: array of shape (n, m): n samples, m features
        :return: array of shape (n, min(n_components, m)) holding the
                 projection of the standardized data onto the leading
                 principal axes, ordered by decreasing eigenvalue. The sign
                 of each column is arbitrary.
        """
        X = np.asarray(X).astype(float)
        n = X.shape[0]

        # Standardization. A constant feature has zero standard deviation;
        # dividing by it would yield NaN/inf, so such features are only
        # centred (they become all zeros and carry no variance).
        self._mean = np.mean(X,axis=0)
        self._std = np.std(X,axis=0)
        scale = np.where(self._std == 0, 1.0, self._std)
        X = (X - self._mean)/scale

        # Covariance matrix of the standardized data (m x m, symmetric).
        Sigma = np.matmul(X.transpose(),X)/n

        # eigh is the solver for symmetric matrices: it always returns real
        # eigenvalues and orthonormal eigenvectors, whereas the general
        # solver may return complex values for a numerically symmetric input.
        w,v = LA.eigh(Sigma)

        # Indices of the n_components largest eigenvalues, largest first.
        top = np.argsort(w)[::-1][:self.n_components]

        return np.matmul(X,v[:,top])

class KNN:
    """k-nearest-neighbour classifier (Euclidean distance, majority vote)."""

    def __init__(self,K):
        """
        :param K: number of nearest neighbours that vote on each prediction
        """
        self.K = K

    def fit(self,train_data,train_label):
        """
        Store the training set; no computation happens until predict().

        :param train_data: array of training samples, one per row
        :param train_label: non-negative integer class label per training row
        """
        self.train_data = train_data
        self.train_label = train_label

    def predict(self,test_data):
        """
        Predict a label for every row of test_data.

        :param test_data: array of query samples, one per row
        :return: list with one predicted label per query row; vote ties go
                 to the smallest label (behaviour of np.argmax)
        """
        # The Euclidean metric takes no extra parameters; passing p=2 is
        # rejected by recent SciPy releases.
        M = cdist(test_data, self.train_data, metric='euclidean')
        ind = M.argsort(axis=1)[:,:self.K]
        # Convert the labels once instead of once per query row.
        labels = np.asarray(self.train_label).astype(int)
        return [np.argmax(np.bincount(labels[x])) for x in ind]

    def score(self,test_data,test_label):
        """
        Classification accuracy on a labelled set.

        :param test_data: array of query samples, one per row
        :param test_label: 1-D array with the true label of each query row
        :return: fraction of rows whose prediction equals the true label
        """
        predicted = np.asarray(self.predict(test_data))
        return np.sum(predicted == test_label)/test_label.shape[0]

def cross_validate(d,k,data,label,k_fold=5,random_state=None):
    """
    Grid-search the PCA dimension and the KNN neighbour count with
    stratified k-fold cross-validation.

    The data is projected once with PCA(max(d)) before the folds are split,
    and each smaller dimension is obtained by keeping the leading columns of
    that projection. Per-fold accuracies are printed, the mean accuracy per
    (dimension, K) pair is plotted, saved to 'Parameters.jpg' and shown.

    :param d: sequence of PCA dimensions to evaluate
    :param k: sequence of neighbour counts to evaluate
    :param data: array of samples, one per row
    :param label: 1-D integer array of class labels in 0..max(label)
    :param k_fold: number of folds
    :param random_state: optional seed for numpy's global random generator
    :return: tuple (best_d, best_k) with the highest mean accuracy; on ties
             the first pair in grid order is returned
    """
    if random_state is not None:
        Random.seed(random_state)

    n_classes = np.max(label)+1

    # IND[c][f]: shuffled sample indices of class c assigned to fold f.
    IND = []
    for i in range(n_classes):
        ind_i = np.argwhere(label == i)
        ind_i = ind_i.squeeze(axis = 1)
        Random.shuffle(ind_i)
        batch_size = (ind_i.shape[0])//k_fold
        I = [ind_i[f*batch_size:(f+1)*batch_size] for f in range(k_fold-1)]
        # The last fold also receives the remainder of the integer division.
        I.append(ind_i[(k_fold-1)*batch_size:ind_i.shape[0]])

        IND.append(I)

    # Sample indices of each fold, classes concatenated in label order.
    fold_idx = [np.concatenate([IND[c][f] for c in range(n_classes)],axis=0)
                for f in range(k_fold)]

    # NOTE: PCA is fitted on all samples, including those later used for
    # validation in each fold.
    pca = PCA(n_components=max(d))
    data_d = pca.fit(data)

    acc = np.zeros((len(d),len(k),k_fold))
    for kk in range(k_fold):
        test_idx = fold_idx[kk]
        train_idx = np.concatenate([fold_idx[f] for f in range(k_fold) if f != kk],axis=0)

        data_test0 = data_d[test_idx,:]
        label_test = label[test_idx]
        data_train0 = data_d[train_idx,:]
        label_train = label[train_idx]

        for i,d_ in enumerate(d):
            # Columns are ordered by decreasing eigenvalue, so the first d_
            # columns are the d_-dimensional projection.
            data_test = data_test0[:,:d_]
            data_train = data_train0[:,:d_]
            for j,k_ in enumerate(k):
                knn = KNN(K=k_)
                knn.fit(data_train,label_train)
                acc[i,j,kk] = knn.score(data_test,label_test)
                print('{}_fold:\tfold{}:d={},k={},acc={}%'.format(k_fold, kk + 1,d_,k_,acc[i,j,kk]*100))

    # Mean accuracy over folds: rows follow d, columns follow k.
    ACC = np.mean(acc,axis=2)
    IJ = np.argwhere(ACC == np.max(ACC))
    plt.plot(ACC,'o-')
    plt.xlabel('d')
    plt.ylabel('Acc')
    plt.legend(['k='+str(i) for i in k])
    plt.xticks(range(len(d)), d)
    plt.savefig('Parameters.jpg')
    plt.show()

    return d[IJ[0][0]],k[IJ[0][1]]









def load_data(data_dir='Dataset'):
    """
    Load the training/testing images and labels from HDF5 files.

    Images are divided by 255 and reshaped to rows of 28*28 values.

    :param data_dir: directory containing 'images_training.h5',
                     'labels_training.h5', 'images_testing.h5' and
                     'labels_testing_2000.h5'
    :return: tuple (train_data, train_label, test_data, test_2000_label)
    """
    import os

    def _read(file_name, key):
        # Copy the dataset into memory so it stays valid after the file closes.
        with h5py.File(os.path.join(data_dir, file_name), 'r') as H:
            return np.copy(H[key])

    train_data = np.reshape(_read('images_training.h5', 'data')/255., (-1, 28 * 28))
    train_label = _read('labels_training.h5', 'label')
    test_data = np.reshape(_read('images_testing.h5', 'data')/255., (-1, 28 * 28))
    test_2000_label = _read('labels_testing_2000.h5', 'label')

    return (train_data,train_label,test_data,test_2000_label)



# Load the datasets; `data` and `label` are the typed copies used for
# cross-validation and for the final model.
train_data,train_label,test_data,test_2000_label=load_data()
data = train_data.astype(float)
label = train_label.astype(int)

# Uncomment to re-run the grid search that selects best_d and best_k.
#best_d,best_k = cross_validate([1,5,10,50,100,200,28*28],[1,5,10,50,100,200],data,label,k_fold=5,random_state=0)

# Values chosen after running the K-fold cross validation.
best_d=200
best_k=5

print('d={},k={}'.format(best_d,best_k))

n_train = train_data.shape[0]
# Only the leading test samples have ground-truth labels.
n_labeled = test_2000_label.shape[0]

# PCA is fitted on the training and test samples together so both share the
# same projection. (Named `combined` to avoid shadowing the builtin `all`.)
combined = np.concatenate((data,test_data),axis=0)
pca = PCA(best_d)
combined = pca.fit(combined)
knn = KNN(best_k)
knn.fit(combined[:n_train,:],label)
print(knn.score(combined[n_train:n_train+n_labeled,:],test_2000_label))

test_results = knn.predict(combined[n_train:,:])
# The context manager closes the file, so the predictions are flushed to disk.
with h5py.File("predicted_labels.h5","w") as f:
    f.create_dataset("label",data=test_results)
end = time.time()
print(end - start)