In [50]:
# data manipulation modules
import pandas as pd
import numpy as np
from numpy import linalg as LA
from scipy import misc
from sklearn.cluster import KMeans
from scipy.spatial import distance

from skimage import color
from skimage import io


# plot modules
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()


# metrics modules
from sklearn.metrics import f1_score, accuracy_score

# csv modules
import csv

# image manipulation modules
from PIL import Image

# path modules
from pathlib import Path

# system modules
import os


""" GLOBALS """
# Root folder of the image dataset. This is a relative path, so it is resolved
# against the kernel's current working directory. load_data_to_df reads the
# subfolders found under it and uses each subfolder name as the row's 'id'.
image_dir_path = 'Desktop/pca_script/'




[0]


In [51]:
class PCA():
    """Principal component analysis via eigendecomposition of the covariance matrix.

    Attributes:
        n_components: number of leading principal directions to keep.
        principal_components: after ``fit``, an array of shape
            (n_components, n_features); row ``i`` is the eigenvector belonging
            to the i-th largest eigenvalue of the covariance matrix.
        m: after ``fit``, the per-feature mean of the training data.
    """

    def __init__(self, n_components):
        self.n_components = n_components
        # Placeholders only; both are overwritten by fit().
        self.principal_components = np.empty([n_components])
        self.m = 0

    def fit(self, X):
        """Fit the model with X (rows are samples, columns are features)."""
        X = np.asarray(X, dtype=float)
        self.m = np.mean(X, axis=0)

        # np.cov centres the data itself; atleast_2d covers the single-feature
        # case, where np.cov returns a 0-d array.
        cov_matrix = np.atleast_2d(np.cov(X.T))

        # The covariance matrix is symmetric, so use eigh: it guarantees real
        # eigenvalues/eigenvectors, whereas eig may return complex output for
        # rank-deficient matrices (e.g. more features than samples).
        eigenvalues, eigenvectors = LA.eigh(cov_matrix)

        # Indices of the n_components largest eigenvalues, largest first.
        order = np.argsort(eigenvalues)[-self.n_components:][::-1]
        self.principal_components = eigenvectors[:, order].T

    def transform(self, X):
        """Apply dimensionality reduction to X.

        Each sample is centred with the mean learned in ``fit`` and projected
        onto the principal components.

        Returns:
            A list with one 1-D array of length n_components per sample.
        """
        centered = np.asarray(X, dtype=float) - self.m
        projected = centered @ self.principal_components.T
        return list(projected)

    
    

In [57]:

def grayscale_and_convert_to_nparray(img_path: str, debug_save_path=None):
    """Load an image, convert it to grayscale and return it as a flat vector.

    Args:
        img_path: path of the image file to read.
        debug_save_path: optional path; when given, the grayscale image is
            also written there for visual inspection. Nothing is written by
            default.

    Returns:
        1-D numpy array of the grayscale pixel values divided by 255.
    """
    # The context manager closes the underlying file handle; convert() returns
    # a new, fully loaded image, so it stays usable after the file is closed.
    with Image.open(img_path) as img:
        img_gray = img.convert('L')

    if debug_save_path is not None:
        img_gray.save(debug_save_path)

    return (np.asarray(img_gray) / 255).flatten()


def load_data_to_df(image_dir_path, max_images_per_dir=50):
    """Load images from the subfolders of ``image_dir_path`` into a DataFrame.

    Only the immediate subfolders of ``image_dir_path`` are read. Folder and
    file names are processed in sorted order so that the selected subset and
    the row order are reproducible between runs.

    Args:
        image_dir_path: root folder containing one subfolder per group of
            images (with or without a trailing path separator).
        max_images_per_dir: maximum number of images taken from each
            subfolder; ``None`` takes all of them.

    Returns:
        DataFrame with columns ``'id'`` (the subfolder name) and ``'img'``
        (the flattened grayscale image), one row per image.
    """
    records = []

    for directory in sorted(os.listdir(image_dir_path)):
        class_dir = os.path.join(image_dir_path, directory)
        if not os.path.isdir(class_dir):
            continue

        # Skip anything that is not a regular file (e.g. nested folders).
        img_names = sorted(
            name for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )

        for img_name in img_names[:max_images_per_dir]:
            img_path = os.path.join(class_dir, img_name)
            records.append((directory, grayscale_and_convert_to_nparray(img_path)))

    return pd.DataFrame.from_records(records, columns=['id', 'img'])

       

# Read the dataset: one row per image, holding its folder name and pixel vector.
df = load_data_to_df(image_dir_path)

# Stack the per-row pixel vectors into a single array, one row per image.
X = np.stack(df['img'].to_numpy(), axis=0)

# Fit PCA on the whole dataset, then project every image onto 100 components.
pca = PCA(n_components=100)
pca.fit(X)
trans = pca.transform(X)



"""
# only using euclidean distance (before pca and k-means): 38.5 %
for ind in range(450, 500):
    item = df.loc[ind]['img']
    distances = []
    for idx in df['id'].unique():
        sum = 0
        for i in df.loc[(df['id'] == idx)]['img']: 
            sum += np.linalg.norm(item - i)
        distances.append(sum)
    print(np.argmin(distances))
"""







[891.8235725763407, 945.7931377832997, 830.960681198491, 982.558309599667, 915.7406997238895, 885.8842928261652, 863.1240169770853, 892.3053140956005, 866.8868948636477, 909.3453804202587]


"\n# only using euclidean distance (before pca and k-means): 38.5 %\nfor ind in range(450, 500):\n    item = df.loc[ind]['img']\n    distances = []\n    for idx in df['id'].unique():\n        sum = 0\n        for i in df.loc[(df['id'] == idx)]['img']: \n            sum += np.linalg.norm(item - i)\n        distances.append(sum)\n    print(np.argmin(distances))\n"