In [33]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from skimage.io import imread
from sklearn.decomposition import PCA

In [16]:
TEST_FOLDER = os.path.join('..', '..', 'test_imgs')

In [45]:
def load_images_from_main_folder(main_folder_path):
    images_per_folder, test_images_per_folder = {}, {}
    folder_names = os.listdir(main_folder_path)
    for folder_name in folder_names:
        folder_path = os.path.join(main_folder_path, folder_name)
        image_names = os.listdir(folder_path)
        # Load and store "train" images
        images_per_image_name = {}
        for image_name in image_names[:-1]:
            image_path = os.path.join(folder_path, image_name)
            image = imread(image_path)
            images_per_image_name[image_name] = image
        images_per_folder[folder_name] = images_per_image_name
        # Load and store "test" images
        test_images_per_image_name = {}
        for image_name in image_names[-1:]:
            image_path = os.path.join(folder_path, image_name)
            image = imread(image_path)
            test_images_per_image_name[image_name] = image
        test_images_per_folder[folder_name] = test_images_per_image_name
    return images_per_folder, test_images_per_folder

In [46]:
images, test_images = load_images_from_main_folder(TEST_FOLDER)

In [40]:
print(images.keys())

dict_keys(['banana', 'parking_meter', 'pizza', 'traffic_light', 'zebra'])


In [52]:
test_images['banana']['2413024__banana__1.0.jpg']

array([[[242, 225, 107],
        [243, 226, 112],
        [243, 225, 113],
        ...,
        [243, 226, 120],
        [238, 224, 117],
        [242, 228, 123]],

       [[244, 225, 107],
        [244, 224, 109],
        [245, 225, 113],
        ...,
        [239, 225, 120],
        [242, 228, 123],
        [244, 231, 126]],

       [[244, 223, 106],
        [243, 222, 107],
        [245, 223, 111],
        ...,
        [243, 230, 125],
        [242, 229, 124],
        [241, 228, 123]],

       ...,

       [[  1,   0,   2],
        [  2,   0,   3],
        [  3,   1,   6],
        ...,
        [ 19,  39,  12],
        [  9,  31,   8],
        [  3,  27,   5]],

       [[  0,   5,   1],
        [  0,   1,   0],
        [  0,   0,   4],
        ...,
        [ 18,  33,  14],
        [  8,  25,   9],
        [  0,  17,   3]],

       [[ 67, 102,  38],
        [ 52,  81,  27],
        [ 37,  58,  15],
        ...,
        [ 12,  24,  10],
        [  3,  13,   4],
        [  0,   2,   0]]

In [93]:
def train_project_pca_on_images(images):
    pca_per_class = {}
    for image_class, image_name_pairs  in images.items():
        # Get image data array from class
        images_data = []
        for _, image in image_name_pairs.items():
            images_data.append(np.reshape(image, newshape=-1))
        images_data = np.array(images_data)
        # Apply PCA and store results
        pca = PCA(n_components=5) # THIS WILL FAIL IF NO. OF SAMPLES IS < 5
        new_images_data = pca.fit_transform(images_data)
        pca_per_class[image_class] = {
            'pca_object' : pca,
            'projections' : new_images_data
        }
    return pca_per_class

In [98]:
pca_train_results = train_project_pca_on_images(images, TEST_FOLDER)

In [99]:
pca_train_results

{'banana': {'pca_object': PCA(n_components=5),
  'projections': array([[-1.38475268e+04, -1.24464900e+04, -9.19093270e+03,
          -1.25638553e+04,  3.76650372e-11],
         [-1.05322201e+04,  8.31149716e+03, -1.21511565e+04,
           1.44094562e+04,  3.76650372e-11],
         [-3.65586985e+03,  1.91405291e+04,  1.27966844e+04,
          -7.87458729e+03,  3.76650372e-11],
         [ 1.16430955e+03, -1.51079125e+04,  1.70418005e+04,
           8.28606962e+03,  3.76650372e-11],
         [ 2.68713072e+04,  1.02376229e+02, -8.49639575e+03,
          -2.25708319e+03,  3.76650372e-11]])},
 'parking_meter': {'pca_object': PCA(n_components=5),
  'projections': array([[ -2925.23294467,  -8755.57710443,   -547.45758435,
           -1267.01691658,  -9794.37014389],
         [  4319.01074382,   8973.95158026,  12514.87215514,
           -5566.32161797,  13036.00465274],
         [ -2593.69221816,  12441.26191552,  -7149.21207723,
          -11735.65461262,  -1123.99087294],
         [ -9028.5

In [124]:
def get_distance_to_space_prj(image, image_prj):
    reshaped_image = np.reshape(image, newshape=-1)
    return np.linalg.norm(reshaped_image - image_prj)

def project_image_to_space(image, pca_object):
    reshaped_image = np.reshape(image, newshape=(1, -1))
    image_prj = pca_object.transform(reshaped_image)
    no_pad_positions = reshaped_image.shape[1] - image_prj.shape[1]
    image_full_prj = np.append(image_prj, np.zeros(no_pad_positions))
    return image_full_prj

def project_test_images(test_images, pca_train_results):
    # For each class...
    test_results_per_class = {}
    for test_image_class, test_images_pairs in test_images.items():
        # and for each image of each class...
        test_results_per_image = {}
        for test_image_name, test_image in test_images_pairs.items():
            # ...calculate the distances to its different projections...
            test_image_distances = []
            for pca_image_class, pca_data in pca_train_results.items():
                test_image_prj = project_image_to_space(test_image, pca_data['pca_object'])
                dist_to_space = get_distance_to_space_prj(test_image, test_image_prj)
                test_image_distances.append(dist_to_space)
            # ...then determine the nearest class
            test_image_distances = np.array(test_image_distances)
            idx_min = np.argmin(test_image_distances)
            class_min = list(pca_train_results.keys())[idx_min]
            # ...and store the results
            test_results_per_image[test_image_name] = {
                'distances': np.round(test_image_distances, 2),
                'nearest_class': class_min,
                'is_expected_class': class_min == test_image_class
            }
        test_results_per_class[test_image_class] = test_results_per_image
    return test_results_per_class

In [125]:
projection_results = project_test_images(test_images, pca_train_results)

In [126]:
projection_results

{'banana': {'2413024__banana__1.0.jpg': {'distances': array([59999.79, 59311.  , 60166.96, 59213.26, 59292.31]),
   'nearest_class': 'traffic_light',
   'is_expected_class': False}},
 'parking_meter': {'2417421__parking_meter__0.9999999.jpg': {'distances': array([46356.89, 46325.07, 46348.4 , 46635.42, 46555.66]),
   'nearest_class': 'parking_meter',
   'is_expected_class': True}},
 'pizza': {'2414892__pizza__0.99999857.jpg': {'distances': array([66529.28, 65888.78, 65781.89, 65085.7 , 66011.86]),
   'nearest_class': 'traffic_light',
   'is_expected_class': False}},
 'traffic_light': {'2413495__traffic_light__0.9999925.jpg': {'distances': array([65361.8 , 65688.83, 65042.9 , 65184.86, 65099.19]),
   'nearest_class': 'pizza',
   'is_expected_class': False}},
 'zebra': {'2414277__zebra__0.9999908.jpg': {'distances': array([50472.35, 50920.68, 50515.81, 50181.42, 50943.51]),
   'nearest_class': 'traffic_light',
   'is_expected_class': False}}}

#### Padding projected vectors with zeroes yield an "accuracy" of 1/5 with respect to expected nearest classes. Should the full PCA matrix be calculated to get real projections?

#### Dr. Orozco says that it's possible to get projections with as many features as their original representations, no matter the number of components detected during PCA.

Someone kill me, please... * sigh *