In [1]:
%run ./Utils.ipynb

In [2]:
# Load dataset function
def load_dataset():
    """Load dataset from files
    Returns
    -------
    train_X : array-like, shape (n_samples_train, n_features)  
              with train samples
    train_Y : array-like, shape (n_samples_train, 1) 
              with train labels
    train_X : array-like, shape (n_samples_test, n_features) 
              with test samples
    train_Y : array-like, shape (n_samples_test, 1) 
              with test labels
    """
    train_X = np.load(f"train_{COLLECTION_FILE_NAME}_x.npy")
    train_Y = np.load(f"train_{COLLECTION_FILE_NAME}_y.npy")
    test_X = np.load(f"test_{COLLECTION_FILE_NAME}_x.npy")
    test_Y = np.load(f"test_{COLLECTION_FILE_NAME}_y.npy")
    return train_X, train_Y, test_X, test_Y

In [3]:
# Load dataset to work with
X_train, Y_train, X_test, Y_test = load_dataset() 
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(320, 10304) (320,) (80, 10304) (80,)


In [4]:
# Shuffle function
def shuffle_samples(x, y):
    """Shuffle data of labels and samples
    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Samples, where n_samples is the number of samples
        and n_features is the number of features.
    y : array-like, shape (n_samples, 1)
        Labels, where n_samples is the number of samples.
    Returns
    -------
    data[:,:-1] : array-like, shape (n_samples, n_features) 
    data[:,-1]  : array-like, shape (n_samples, label)
    """
    data = np.concatenate((x, y.reshape(-1,1)), axis=1)
    np.random.shuffle(data)
    return data[:,:-1] , data[:,-1]

In [5]:
# Extra shuffle for samples
X_train, Y_train = shuffle_samples(X_train, Y_train)
print(X_train.shape, Y_train.shape)

(320, 10304) (320,)


In [6]:
# Default scaling
def minmaxscaler(samples):
    """Scale features by maximum of data
    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
              Samples, where n_samples is the number of samples
              and n_features is the number of features.
    Returns
    -------
    samples : array-like, same shape as samples
    """
    return samples/255.

# Scale images A
def scale_by_mean(samples, axis=0):
    """Scale features by mean of data depending on the axis value
    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
              Samples, where n_samples is the number of samples
              and n_features is the number of features.
    axis    : number type of int(0, 1)
    Returns
    -------
    samples : array-like, same shape as samples
    """
    mean = np.mean(samples, axis=axis)
    mean = mean.reshape(samples.shape[0], -1) if axis == 1 else mean
    return samples - mean

# Scale images B
def scale_by_mean_std(samples, axis=0):
    """Scale features by mean diveded by standart deviation
       of data depending on the axis value
    Parameters
    ----------
    samples : array-like, shape (n_samples, n_features)
              Samples, where n_samples is the number of samples
              and n_features is the number of features.
    axis    : number type of int(0, 1)
    Returns
    -------
    samples : array-like, same shape as samples
    """
    std = np.std(samples, axis=axis)
    std = std.reshape(samples.shape[0], -1) if axis == 1 else std
    return scale_by_mean(samples, axis)/std

In [7]:
# Scaling of samples
X_train = minmaxscaler(X_train)
X_test  = minmaxscaler(X_test)
print(X_test.shape, X_test.mean())
print(X_train.shape, X_train.mean())

(80, 10304) 0.44243154666758017
(320, 10304) 0.4415062551854981


In [8]:
# PCA realization
def get_data(A):
    """Count centered matrix C
    Parameters
    ----------
    A : array-like, shape (n_samples, n_features)
        Samples, where n_samples is the number of samples
        and n_features is the number of features.
    Returns
    -------
    C : array-like, same shape as samples
    """
    # calculate the mean of each column
    M = np.mean(A, axis=0)
    # center columns by subtracting column means
    C =  A - M
    return C

def get_svd(C):
    """Count singular value decomposition of matrix C
    Method count explained variance and explained variance ratio
    Parameters
    ----------
    C : array-like, shape (n_samples, n_features)
        Samples, where n_samples is the number of samples
        and n_features is the number of features.
    Returns
    -------
    values  : array-like, shape (n_features,) diagonal matrix of singular values
    vectors : array-like, shape (n_features, n_features)
    explained_variance       : shape (n_features,) where n_features is 
                               a number of components
    explained_variance_ratio : shape (n_features,) where n_features is 
                               a number of components
    """
    # svd method
    vectors, values,_ = np.linalg.svd(C.T)
    
    # count diagonal matrix of eigenvector
    explained_variance = (values ** 2) / (C.shape[0] - 1)
    
    # count explained variance ratio     
    total_var = explained_variance.sum()
    explained_variance_ratio = explained_variance / total_var
    
    return values, vectors, explained_variance, explained_variance_ratio

def explained_variance(values, ratio):
    """ Count decision how many n components will be used
    Parameters
    ----------
    values : array-like, shape (n_components,)
             Explained variance ratio where n_components
             is the number of components.
    ratio  : float number, number in range (0, 1)
    Returns
    -------
    container : array-like, shape(n_components_reduced,)
                where n_components_reduced < n_components
    """
    container = []
    accumulator = 0. 

    for i in values:
        if accumulator <= ratio:
            accumulator += i
            container.append(i)
    
    return np.asarray(container)

def reduce_dimensions(matrix_W, n_comp, C):
    """ Apply dimension reduction for matrix C
    Parameters
    ----------
    matrix_W : array-like, shape (n_features, n_features)
               Matrix W, where n_features is the number of features.
    n_comp   : array-like, shape (n_components_reduced,) 
               Selected principal components, where n_components_reduced
               is a number of components to use.
    С        : array-like, shape (n_samples, n_features)
               Samples, where n_samples is the number of samples
               and n_features is the number of features.
    Returns
    -------
    C : array-like, shape(n_samples, n_components_reduced)
    """
    return C.dot(matrix_W.T[:n_comp.shape[0]].T)

In [9]:
# Fit X_train
train_matrix_C = get_data(X_train)
eigen_values, eigen_vectors,exp_var, exp_ratio = get_svd(train_matrix_C)

exp_ratio_n_comp = explained_variance(exp_ratio, .95)

In [10]:
# Transform X_train
X_train_PCA = reduce_dimensions(eigen_vectors, exp_ratio_n_comp, train_matrix_C)
print(X_train_PCA.shape)

(320, 161)


In [11]:
# Transform X_test
test_matrix_C = get_data(X_test)

X_test_PCA = reduce_dimensions(eigen_vectors, exp_ratio_n_comp, test_matrix_C)
print(X_test_PCA.shape)

(80, 161)


In [12]:
# Model realization
from scipy import stats

def euclidean_distance(q, p):
    """Count Euclidean Distance
    Parameters
    ----------
    q : array-like, shape (n_samples, n_features)
        Samples, where n_samples is the number of samples
        and n_features is the number of features.
    p : array-like, shape (n_features,)
        Sample, where n_features is the number of features
    Returns
    -------
    array-like, shape (n_samples,)
    """
    return np.sqrt(np.sum((q - p)**2, axis=1))
    
def model(X_train, y_train, X_test, y_test, K):
    """Build classificator
    Return accuracy of predictions
    Parameters
    ----------
    X_train : array-like, shape (n_samples, n_features)
              Samples, where n_samples is the number of samples
              and n_features is the number of features.
    y_train : array-like, shape (n_samples,)
              Labels, where n_samples is the number of samples
    X_test  : array-like, shape (n_samples, n_features)
              Samples, where n_samples is the number of samples
              and n_features is the number of features.
    y_train : array-like, shape (n_samples,)
              Labels, where n_samples is the number of samples
    K       : int number, number for knn algorithm 
    Returns
    -------
    preidct : float number, precision of classification
    """
    predict = np.zeros(X_test.shape[0])
    for i in range(X_test.shape[0]):
        euclid_dist = euclidean_distance(X_train, X_test[i])
        indices = np.argsort(euclid_dist)[:K]
        
        predict[i] = stats.mode(y_train[indices])[0] == y_test[i]
    return predict.mean()

In [13]:
d = model(X_train_PCA, Y_train, X_test_PCA, Y_test.reshape(-1, 1), 1)
d

0.9375