In [1]:
%run ./Utils.ipynb

In [2]:
# Load dataset function
def load_dataset():
    train_X = np.load(f"train_{COLLECTION_FILE_NAME}_x.npy")
    train_Y = np.load(f"train_{COLLECTION_FILE_NAME}_y.npy")
    test_X = np.load(f"test_{COLLECTION_FILE_NAME}_x.npy")
    test_Y = np.load(f"test_{COLLECTION_FILE_NAME}_y.npy")
    return train_X, train_Y, test_X, test_Y
    
X_train, Y_train, X_test, Y_test = load_dataset() 
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(320, 10304) (320,) (80, 10304) (80,)


In [3]:
# Extra shuffle for samples
def shuffle_samples(x, y):
    data = np.concatenate((x, y.reshape(-1,1)), axis=1)
    np.random.shuffle(data)
    return data[:,:-1] , data[:,-1]

X_train, Y_train = shuffle_samples(X_train, Y_train)
print(X_train.shape, Y_train.shape)

(320, 10304) (320,)


In [4]:
# Default scaling
def minmaxscaler(samples):
    return samples/255.

# Scale images A
def scale_by_mean(samples, axis=0):
    mean = np.mean(samples, axis=axis)
    mean = mean.reshape(samples.shape[0], -1) if axis == 1 else mean
    return samples - mean

# Scale images B
def scale_by_mean_std(samples, axis=0):
    std = np.std(samples, axis=axis)
    std = std.reshape(samples.shape[0], -1) if axis == 1 else std
    return scale_by_mean(samples, axis)/std

X_train = minmaxscaler(X_train)
X_test  = minmaxscaler(X_test)
print(X_test.shape, X_test.mean())
print(X_train.shape, X_train.mean())

(80, 10304) 0.44243154666758017
(320, 10304) 0.4415062551854979


In [5]:
# PCA realization
def get_data(A):
    # calculate the mean of each column
    M = np.mean(A, axis=0)
    # center columns by subtracting column means
    return A - M

def get_eigendecomposition(C):
    # eigendecomposition of covariance matrix
    vectors, values,_ = np.linalg.svd(C.T)
    
    explained_variance = (values ** 2) / (C.shape[0] - 1)
    
    total_var = explained_variance.sum()
    explained_variance_ratio = explained_variance / total_var
    
    return values, vectors, explained_variance, explained_variance_ratio

def explained_variance(values):
    container = []
    accumulator = 0. 

    for i in values:
        if accumulator <= .95:
            accumulator += i
            container.append(i)
    
    return np.asarray(container)

def reduce_dimensions(matrix_W, n_comp, C):
    return C.dot(matrix_W.T[:n_comp.shape[0]].T)

In [6]:
# Fit X_train
train_matrix_C = get_data(X_train)
eigen_values, eigen_vectors,exp_var, exp_ratio = get_eigendecomposition(train_matrix_C)

exp_ratio_n_comp = explained_variance(exp_ratio)

In [7]:
# Transform X_train
X_train_PCA = reduce_dimensions(eigen_vectors, exp_ratio_n_comp, train_matrix_C)
print(X_train_PCA.shape)

(320, 161)


In [8]:
# Transform X_test
test_matrix_C = get_data(X_test)

X_test_PCA = reduce_dimensions(eigen_vectors, exp_ratio_n_comp, test_matrix_C)
print(X_test_PCA.shape)

(80, 161)


In [9]:
# Model realization
from scipy import stats

def euclidean_distance(q, p):
    return np.sqrt(np.sum((q - p)**2, axis=1))
    
def model(X_train, y_train, X_test, y_test, K):
    predict = np.zeros(X_test.shape[0])
    for i in range(X_test.shape[0]):
        euclid_dist = euclidean_distance(X_train, X_test[i])
        indices = np.argsort(euclid_dist)[:K]
        
        predict[i] = stats.mode(y_train[indices])[0] == y_test[i]
    return predict.mean()

In [10]:
d = model(X_train_PCA, Y_train, X_test_PCA, Y_test.reshape(-1, 1), 1)
d

0.9375