In [1]:
%matplotlib inline

import h5py
import numpy as np
from matplotlib import pyplot as plt
import matplotlib as mpl
import scipy
from collections import Counter
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from IPython.display import display
from numpy.random import shuffle
from scipy.spatial.distance import pdist, squareform
from scipy.spatial import KDTree

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old names, so fall back to the new name when 'seaborn' is gone.
mpl.style.use('seaborn' if 'seaborn' in mpl.style.available else 'seaborn-v0_8')

  from ._conv import register_converters as _register_converters


In [2]:
def get_accuracy(y_true, y_pred):
    """
    Calculate the accuracy score: the fraction of predictions equal to the true labels.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must have exactly the same shape as ``y_true``.

    Returns
    -------
    float
        Fraction of matching entries, in [0, 1]. NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` differ in shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    # Refuse mismatched shapes explicitly: shapes such as (N, 1) vs (N,) would
    # otherwise broadcast to an (N, N) comparison and yield a meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                y_true.shape, y_pred.shape))
    if y_true.size == 0:
        return np.nan  # no samples: accuracy is undefined
    return np.mean(y_true == y_pred)

def flat(data):
    """
    Flatten images in the data array.

    Parameters
    ----------
    data : array-like, shape = (n_samples, ...)
        Stack of samples; every axis after the first is collapsed, so images
        need not be square.

    Returns
    -------
    array, shape = (n_samples, n_features)
        One row per sample, where n_features is the product of the trailing
        dimensions of ``data``.
    """
    data = np.asarray(data)
    # Compute the feature count explicitly (rather than reshape(..., -1)) so
    # that an empty stack of images still reshapes unambiguously.
    n_features = int(np.prod(data.shape[1:], dtype=np.int64))
    return data.reshape(data.shape[0], n_features)

In [3]:
def _read_h5(path, key):
    """Read one whole dataset from an HDF5 file into an in-memory NumPy array."""
    with h5py.File(path, 'r') as H:
        return np.copy(H[key])


def _normalize_images(images):
    """Divide each image by its own maximum value; all-zero images stay all zero."""
    # Per-image maximum over every axis except the leading (sample) axis.
    peak = np.amax(images, axis=tuple(range(1, images.ndim)), keepdims=True)
    peak[peak == 0] = 1  # a blank image would otherwise give 0/0 -> NaN
    return images / peak


# load training data and labels
data = _normalize_images(_read_h5('images_training.h5', 'data'))
label = _read_h5('labels_training.h5', 'label')

# load testing data and labels
# NOTE(review): the label file name suggests only 2000 test labels, while the
# PCA cell output reports 5000 test images -- confirm how label_test aligns
# with data_test before scoring.
data_test = _normalize_images(_read_h5('images_testing.h5', 'data'))
label_test = _read_h5('labels_testing_2000.h5', 'label')

## Singular Value Decomposition 

In [4]:
k_svd = 10 # number of leading singular values kept when reconstructing each image from its SVD

In [5]:
# Singular Value Decomposition on training images

# np.linalg.svd treats the leading axis as a batch: each image is factorised on its own.
u, s, vh = np.linalg.svd(data)
# Rank-k_svd reconstruction of all images at once. Scaling the first k_svd
# columns of u by the singular values equals multiplying by diag(s), and the
# batched @ replaces the per-image Python loop.
data_svd = (u[:, :, :k_svd] * s[:, np.newaxis, :k_svd]) @ vh[:, :k_svd, :]

In [6]:
# Singular Value Decomposition on testing images

# np.linalg.svd treats the leading axis as a batch: each image is factorised on its own.
u, s, vh = np.linalg.svd(data_test)
# Rank-k_svd reconstruction of all images at once. Scaling the first k_svd
# columns of u by the singular values equals multiplying by diag(s), and the
# batched @ replaces the per-image Python loop.
data_test_svd = (u[:, :, :k_svd] * s[:, np.newaxis, :k_svd]) @ vh[:, :k_svd, :]

## Principal Component Analysis

Note: PCA is fit once on the entire dataset (training and test images stacked together, each flattened to a vector), whereas the SVD above is computed separately for each image.

In [7]:
def PCA(n_components, data):
    """
    Principal Component Analysis

    Parameters
    ----------
    n_components : int
        Number of principal components to use

    data : array-like, shape = (n_samples, n_features)
        Data to perform PCA on

    Returns
    -------
    X_PCA : array, shape = (n_samples, n_components)
        Mean-centred data projected onto the leading principal components.

    V : array, shape = (n_features, n_components)
        Principal directions (unit-norm eigenvectors of the covariance matrix)
        as columns, ordered by descending eigenvalue. The sign of each column
        is arbitrary.
    """
    data = np.asarray(data)
    X = data - np.mean(data, axis=0)  # centre every feature on zero
    # atleast_2d: np.cov returns a 0-d array when there is a single feature.
    S = np.atleast_2d(np.cov(X, rowvar=False))
    # The covariance matrix is symmetric, so use eigh: it guarantees real
    # eigenvalues/eigenvectors, whereas eig can return complex results from
    # round-off alone (notably for rank-deficient data).
    L, V = np.linalg.eigh(S)
    # eigh returns eigenvalues in ascending order; keep the largest first.
    order = np.argsort(L)[::-1][:n_components]
    V = V[:, order]
    X_PCA = X @ V
    return X_PCA, V

In [8]:
n_components = 20 # number of principal components to use
n_train = data.shape[0]
# Fit PCA on the flattened training and test images stacked in that order.
X_PCA, V = PCA(n_components=n_components, data=np.concatenate((flat(data), flat(data_test)), axis=0))
# Split the projected rows back by position. Slicing from n_train rather than
# with a negative offset stays correct when data_test is empty, where
# X_PCA[-0:] would return every row.
data_pca = X_PCA[:n_train]
data_test_pca = X_PCA[n_train:]
print(data_pca.shape)
print(data_test_pca.shape)

(30000, 20)
(5000, 20)


## K-fold Cross Validation

In [9]:
def kfold(N, k=10, seed=None):
    """
    Generate lists of indices for data divided into k (nearly) equal parts for cross validation.

    Parameters
    ----------
    N: int
        number of samples in the data.

    k: int
        number of parts the data will be split into.

    seed: int or None
        Seed for a private random generator, for reproducible folds. When
        None (default) the global NumPy random state is used.

    Returns
    -------
    indices: array
        The k folds of shuffled sample indices. When N is divisible by k this
        is a 2-D integer array of shape (k, N // k). Otherwise the folds differ
        in length by one and an object array of shape (k,) holding one index
        array per fold is returned.

    Raises
    ------
    ValueError
        If k is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {}".format(k))

    arr = np.arange(N)
    if seed is None:
        np.random.shuffle(arr)
    else:
        np.random.RandomState(seed).shuffle(arr)

    folds = [arr[i::k] for i in range(k)]
    if len({len(fold) for fold in folds}) == 1:
        return np.array(folds)

    # Ragged folds: np.array() on unequal-length rows raises in recent NumPy,
    # so fill an object array element by element instead.
    indices = np.empty(k, dtype=object)
    for i, fold in enumerate(folds):
        indices[i] = fold
    return indices