# Data Loading

In [16]:
import numpy as np
import cv2
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Root folder of the face database; images are read from <file_path>/s<k>/<j>.pgm.
# Adjust to your environment.
file_path = '../input/att-database-of-faces' #modify
# Folder holding the non-face images, read as <folder>/(<i>).pgm.
# Adjust to your environment.
non_faces_folder_path = '../input/nonfaces' #modify

In [24]:
# Sanity check: read the first image of subject s1 with its stored bit depth
# and channel layout, report its shape, and display it in grayscale.
test_image = cv2.imread(file_path + '/s1/1.pgm', cv2.IMREAD_UNCHANGED)
print(test_image.shape)
plt.imshow(test_image, cmap='gray')
plt.show()

In [25]:
def load_faces(num_labels = 40, images_pre_label = 10):
    """Load face images as flattened float64 row vectors.

    Reads ``{file_path}/s{k}/{j}.pgm`` for every subject folder
    ``k = 1..num_labels`` and every image ``j = 1..images_pre_label``.

    Parameters
    ----------
    num_labels : int
        Number of subject folders to read (s1, s2, ...).
    images_pre_label : int
        Number of images read from each subject folder.

    Returns
    -------
    dataset : np.ndarray
        One flattened image per row.
    labels : np.ndarray
        Column vector (N, 1) holding the subject folder index of each row.

    Raises
    ------
    FileNotFoundError
        If an image is missing or cannot be decoded.
    """
    dataset = []
    labels = []
    for folder_index in range(1, num_labels + 1):
        for file_index in range(1, images_pre_label + 1):
            file_name = f'{file_path}/s{folder_index}/{file_index}.pgm'
            image = cv2.imread(file_name, -1)
            # cv2.imread signals failure by returning None instead of raising.
            if image is None:
                raise FileNotFoundError(f'could not read image: {file_name}')
            dataset.append(image.astype(np.float64).reshape(-1))
            labels.append(folder_index)

    dataset = np.array(dataset)
    labels = np.array(labels).reshape(-1, 1)

    return dataset, labels

In [26]:
# Load the face images using load_faces' default arguments.
dataset, labels = load_faces()

In [27]:
def my_train_test_split(X, y):
    """Split rows alternately: even-indexed rows train, odd-indexed rows test.

    Both ``X`` and ``y`` must be 2-D; they are sliced along the first axis
    with the same stride so samples and labels stay aligned.

    Returns
    -------
    tuple
        ``(trainX, testX, trainY, testY)``.
    """
    even_rows = slice(None, None, 2)
    odd_rows = slice(1, None, 2)
    return X[even_rows, :], X[odd_rows, :], y[even_rows, :], y[odd_rows, :]

# Alternate rows of the loaded dataset between the training and test splits.
trainX, testX, trainY, testY = my_train_test_split(dataset, labels)
print(trainX.shape, testX.shape)

In [None]:
# Put the training samples and *their own* labels side by side (the label
# becomes the last column). The original paired trainX with testY.
df = pd.DataFrame(data=np.concatenate((trainX, trainY), axis=1))
# NOTE(review): columns 0, 1 and 2 are the first three feature columns, so the
# colour encodes a feature value, not the label -- confirm this is intended.
df.plot.scatter(x=0, y=1, c=2, colormap='viridis')
print(df.shape)
print(trainX.shape, trainY.shape)

In [None]:
class MyPCA:
    """PCA keeping the fewest components that explain a fraction ``alpha`` of the variance.

    After ``fit`` the instance exposes:
      * ``U``  -- (d, r) matrix whose columns are the retained eigenvectors,
                  ordered from largest to smallest eigenvalue;
      * ``mu`` -- (d,) mean of the training data.
    """

    def __init__(self, alpha):
        # Target fraction of total variance to retain, e.g. 0.85.
        self.alpha = alpha

    def fit(self, X):
        """Learn the projection matrix and the mean from ``X`` (rows are samples)."""
        self.U, self.mu = self._implementation(X, self.alpha)

    def fit_transform(self, X):
        """Fit on ``X`` and return ``X`` projected onto the retained components."""
        self.fit(X)
        return self.predict(X)

    def predict(self, X):
        """Centre ``X`` with the training mean and project it onto ``U``."""
        centered = X - self.mu
        return centered @ self.U

    def _implementation(self, X, alpha):
        """Return ``(U, mu)`` for data ``X`` and variance fraction ``alpha``."""
        mu = np.mean(X, axis=0)
        Z = X - mu
        cov_mat = Z.T @ Z / Z.shape[0]
        # eigh returns eigenvalues in ascending order, so reverse before accumulating.
        eig_vals, eig_vecs = np.linalg.eigh(cov_mat)

        explained = np.cumsum(eig_vals[::-1]) / np.sum(eig_vals)
        reached = np.flatnonzero(explained >= alpha)
        # Rounding can leave the last cumulative ratio marginally below alpha
        # (typically for alpha == 1.0); keep every component in that case
        # instead of failing with an IndexError.
        rank = reached[0] + 1 if reached.size else eig_vals.size
        # Last `rank` columns, reversed: largest-eigenvalue direction first.
        return eig_vecs[:, :-(rank + 1):-1], mu

In [None]:
# Fit PCA on the training split, retaining 85% of the variance, and report
# the shape of the resulting projection matrix.
pca = MyPCA(alpha=0.85)
pca.fit(trainX)
print(pca.U.shape)

In [None]:
# Project both splits with the components learned from the training split.
reduced_train_X, reduced_test_X = pca.predict(trainX), pca.predict(testX)

print(reduced_train_X.shape, reduced_test_X.shape)
print(trainY.shape, testY.shape)

# Testing

In [None]:
# acc = 0

# for i in range(len(testX)):
#     norm_array = np.linalg.norm(reduced_train_X - reduced_test_X[i, :], axis=1)
#     nearest_neighbour = np.argmin(norm_array)
#     acc += trainY[nearest_neighbour] == testY[i]
# print(f'accuracy = {acc / len(testX) * 100}%')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def plotKNN(train_X, train_Y, test_X, test_Y):
    """Score k-NN classifiers for k = 1, 3, ..., 13 and plot accuracy against k.

    Parameters
    ----------
    train_X, test_X : array-like
        Feature matrices, one sample per row.
    train_Y, test_Y : array-like
        Labels, either 1-D or as an (n, 1) column vector.

    Prints the list of ``[k, accuracy]`` pairs and draws them as a line plot.
    """
    # scikit-learn expects 1-D label vectors; a column vector makes fit()
    # emit a DataConversionWarning on every call.
    train_labels = np.ravel(train_Y)
    test_labels = np.ravel(test_Y)

    scores = []
    for nei in range(1, 15, 2):
        knn = KNeighborsClassifier(n_neighbors=nei)
        knn.fit(train_X, train_labels)
        scores.append([nei, knn.score(test_X, test_labels)])
    print(scores)
    pd.DataFrame(data=scores).plot(x=0, y=1)

# Evaluate k-NN on the PCA-reduced features.
plotKNN(reduced_train_X, trainY, reduced_test_X, testY)

# LDA

In [21]:
def split_data_classes(X, y):
    """Group the rows of ``X`` by class label.

    Parameters
    ----------
    X : array-like
        Samples, one per row.
    y : array-like
        Labels aligned with ``X``; either 1-D or an (n, 1) column vector.
        Any label values are accepted (they need not be 1..K).

    Returns
    -------
    list of list
        ``D[k]`` holds the rows belonging to the k-th smallest label.
    """
    # Map arbitrary label values to contiguous indices 0..K-1. Indexing with
    # `label - 1` only works for labels 1..K and silently wraps for label 0.
    # ravel first so the inverse index is 1-D regardless of y's shape.
    unique_labels, class_index = np.unique(np.ravel(y), return_inverse=True)
    D = [[] for _ in range(len(unique_labels))]
    for i, instance in enumerate(X):
        D[class_index[i]].append(instance)
    return D


def calculate_class_means(D):
    """Return an array whose k-th row is the mean sample of class ``D[k]``."""
    return np.array([np.mean(members, axis=0) for members in D])

def center_class_matrices(D, mu):
    """Subtract each class mean ``mu[k]`` from the samples of class ``D[k]``.

    Returns a list with one centred matrix per class.
    """
    return [members - mu[k] for k, members in enumerate(D)]

def calculate_Sb(D, mu, d, overall_mean): # between class scatter matrix
    """Between-class scatter: sum over classes of n_k * (mu_k - m)(mu_k - m)^T.

    ``D`` supplies the class sizes, ``mu`` the class means, ``d`` the feature
    dimension and ``overall_mean`` the reference mean ``m``.
    """
    scatter = np.zeros((d, d))
    for k, members in enumerate(D):
        offset = np.subtract(mu[k], overall_mean)
        scatter += len(members) * np.outer(offset, offset)
    return scatter


def calculate_S_total(D, mu, d): # within-class scatter matrix
    """Sum of the per-class scatter matrices Z_k^T Z_k, with Z_k = D[k] - mu[k].

    Despite the name, the result is the within-class scatter (each class is
    centred on its own mean), returned as a (d, d) array.
    """
    scatter = np.zeros((d, d))
    for k, members in enumerate(D):
        centered = members - mu[k]
        scatter += centered.T @ centered
    return scatter

In [30]:
class MyLDA:
    """Multi-class linear discriminant analysis built on the module-level helpers.

    Uses ``split_data_classes``, ``calculate_class_means``, ``calculate_Sb``
    and ``calculate_S_total`` to build the scatter matrices, then keeps the
    leading eigenvectors of ``pinv(S_w) @ S_b`` in ``self.U``.

    Parameters
    ----------
    n_components : int or None
        Number of discriminant directions to keep. ``None`` (default) keeps
        ``number_of_classes - 1``, i.e. 39 for the 40-subject face data.
    verbose : bool
        Print intermediate matrices while fitting. Defaults to True so
        existing notebooks keep their diagnostic output.
    """

    def __init__(self, n_components=None, verbose=True):
        self.n_components = n_components
        self.verbose = verbose

    def fit(self, X, y):
        """Learn the projection matrix from samples ``X`` and labels ``y``; return it."""
        self.U = self._implementation(X, y)
        return self.U

    def project(self, X):
        """Project ``X`` (one sample or a matrix of samples) onto the learned directions."""
        return X @ self.U

    def _implementation(self, X, y):
        """Compute the (d, k) matrix of discriminant directions."""
        d = X.shape[1]
        overall_mean = np.mean(X, axis=0)
        D = split_data_classes(X, y)
        mu = calculate_class_means(D)

        if self.verbose:
            print("X.shape = ", X.shape)
            print("Overall mean = ")
            print(overall_mean.shape)
            print(overall_mean)
            print("----------------------")
            print("mu.shape = ", mu.shape)
            print("mu1 = ")
            print(mu[0])
            print("----------------------")

        Sb = calculate_Sb(D, mu, d, overall_mean)
        if self.verbose:
            print("Sb = ")
            print(Sb)
            print("----------------------")

        S_total = calculate_S_total(D, mu, d)
        if self.verbose:
            print("S_total = ")
            print(S_total)
            print("----------------------")

        # The within-class scatter is singular whenever there are fewer
        # samples than features, so use the pseudo-inverse rather than inv.
        eig_vals, eig_vecs = np.linalg.eig(np.linalg.pinv(S_total) @ Sb)

        # np.linalg.eig does not order its output: rank the eigenvectors by
        # eigenvalue explicitly instead of slicing columns by position.
        n_components = self.n_components
        if n_components is None:
            n_components = max(len(D) - 1, 1)
        order = np.argsort(eig_vals.real)[::-1][:n_components]
        # Round-off can introduce tiny imaginary parts; keep the real part.
        eig_vecs = eig_vecs[:, order].real

        if self.verbose:
            print("Eigenvectors")
            print(eig_vecs)

        return eig_vecs

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return ``X`` projected onto the learned directions."""
        self.fit(X, y)
        return self.project(X)

    

In [28]:
# Fit LDA on the training split; fit() also returns the projection matrix.
lda = MyLDA()
eig = lda.fit(trainX, trainY)

# LDA: More efficient and compact implementation 

In [None]:
class LDA_efficient:
    """Vectorised multi-class linear discriminant analysis.

    Parameters
    ----------
    n_components : int or None
        Number of discriminant directions to keep. ``None`` (default) keeps
        ``number_of_classes - 1``, i.e. 39 for the 40-subject face data.
    verbose : bool
        Print intermediate matrices while fitting. Defaults to True so
        existing notebooks keep their diagnostic output.

    After ``fit`` the projection matrix is available as ``self.U``.
    """

    def __init__(self, n_components=None, verbose=True):
        self.n_components = n_components
        self.verbose = verbose

    def _implementation(self, X, y):
        """Compute the (d, k) matrix of discriminant directions."""
        X = np.asarray(X, dtype=np.float64)
        # ravel first so the inverse index is 1-D regardless of y's shape.
        labels, class_index, counts = np.unique(
            np.ravel(y), return_inverse=True, return_counts=True
        )

        # Per-class means computed class by class, so classes may differ in
        # size (stacking the classes into one 3-D array requires equal sizes).
        class_mu = np.stack(
            [X[class_index == k].mean(axis=0) for k in range(labels.size)]
        )
        dataset_mu = X.mean(axis=0)
        mu_centered = class_mu - dataset_mu

        if self.verbose:
            print("Class_mu = ")
            print(class_mu.shape)
            print(class_mu[0])
            print("----------------------")

        # Between-class scatter: sum_k n_k (mu_k - m)(mu_k - m)^T.
        Sb = (mu_centered.T * counts) @ mu_centered
        # Within-class scatter: every sample centred on its own class mean.
        Z = X - class_mu[class_index]
        S_total = Z.T @ Z

        # pinv(S_w) @ S_b is not symmetric, so eigh (which reads only one
        # triangle) gives wrong vectors; use the general solver. pinv also
        # copes with a singular S_w (fewer samples than features).
        eig_vals, eig_vecs = np.linalg.eig(np.linalg.pinv(S_total) @ Sb)

        n_components = self.n_components
        if n_components is None:
            n_components = max(labels.size - 1, 1)
        # eig returns unordered eigenvalues: pick the largest explicitly.
        order = np.argsort(eig_vals.real)[::-1][:n_components]
        # Round-off can introduce tiny imaginary parts; keep the real part.
        U = eig_vecs[:, order].real

        if self.verbose:
            print(Sb)
            print(S_total)
            print(U)

        return U

    def fit(self, X, y):
        """Learn the projection matrix from samples ``X`` and labels ``y``."""
        self.U = self._implementation(X, y)

    def predict(self, X):
        """Project ``X`` (one sample or a matrix of samples) onto the learned directions."""
        return X @ self.U

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return ``X`` projected onto the learned directions."""
        self.fit(X, y)
        return self.predict(X)

In [None]:
# Fit the vectorised LDA on the same training split for comparison with MyLDA.
lda_ef = LDA_efficient()
lda_ef.fit(trainX, trainY)

In [None]:
# Project the first training sample with both LDA implementations and print
# the two results one after the other for a visual comparison.
reduced = lda.project(trainX[0])
reduced_ef = lda_ef.predict(trainX[0])

print(reduced)
print("-----")
print(reduced_ef)

In [None]:
# Replace the PCA features with LDA projections of both splits.
reduced_train_X = lda.project(trainX)
reduced_test_X = lda.project(testX)
print(reduced_train_X.shape, trainY.shape, reduced_test_X.shape, testY.shape)
# Alternative: use the vectorised implementation instead.
# reduced_train_X = lda_ef.predict(trainX)
# reduced_test_X = lda_ef.predict(testX)

In [None]:
# 1-nearest-neighbour accuracy on the reduced features.
acc = 0

for i in range(len(testX)):
    # Euclidean distance from test sample i to every training sample.
    norm_array = np.linalg.norm(reduced_train_X - reduced_test_X[i, :], axis=1)
    nearest_neighbour = np.argmin(norm_array)
    # Compare scalar labels: adding the (1,)-shaped comparison of two label
    # rows turns `acc` into an array and the report prints as "[..]%".
    acc += int(trainY[nearest_neighbour, 0] == testY[i, 0])
print(f'accuracy = {acc / len(testX) * 100}%')


In [None]:
# Evaluate k-NN on the LDA-reduced features.
plotKNN(reduced_train_X, trainY, reduced_test_X, testY)

# KNN

In [None]:
# from scipy.stats import mode
# class MyKNN:
#     def __init__(self, X, y, n_neighbors):
#         self.X = X
#         self.y = y
#         self.n_neighbors = n_neighbors
    
#     def predict(self, X_test):
#         ans = []

#         for datapoint in X_test:
#             norms = np.linalg.norm(self.X - datapoint, axis=1)
#             indices = np.argpartition(norms, self.n_neighbors)[:self.n_neighbors]
#             predictions = self.y[indices]
#             ans.append(mode(predictions)[0][0])
        
#         return np.array(ans).reshape(-1, 1)

In [None]:
# for k in range(1, 9+1, 2):
#     clf = MyKNN(reduced_train_X, trainY, k)
#     predictions = clf.predict(reduced_test_X)
#     print(f'accuracy for KNN with {k} neighbors = {(predictions.flatten() == testY.flatten()).sum() / len(testY) * 100.0}%')

# Faces vs non-faces

In [None]:
# Preview non-face image number 90: read it unchanged, report its shape and
# display it in grayscale.
img = cv2.imread(non_faces_folder_path + '/(90).pgm', cv2.IMREAD_UNCHANGED)
print(img.shape)
plt.imshow(img, cmap='gray')

In [None]:
def load_images(num_images):
    """Load the first ``num_images`` non-face images as flattened float32 rows.

    Reads ``{non_faces_folder_path}/({i}).pgm`` for ``i = 1..num_images``.

    Returns
    -------
    dataset : np.ndarray
        One flattened image per row.
    labels : np.ndarray
        Column vector (num_images, 1) of zeros (the non-face class).

    Raises
    ------
    FileNotFoundError
        If an image is missing or cannot be decoded.
    """
    dataset = []
    labels = []
    for image_index in range(1, num_images + 1):
        file_name = f'{non_faces_folder_path}/({image_index}).pgm'
        image = cv2.imread(file_name, -1)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f'could not read image: {file_name}')
        dataset.append(image.astype(np.float32).reshape(-1))
        labels.append(0)
    dataset = np.array(dataset)
    labels = np.array(labels).reshape(-1, 1)
    return dataset, labels

# Load the first 100 non-face images; their labels are all 0.
non_faces_dataset, non_faces_labels = load_images(100)
print(non_faces_dataset.shape)

In [None]:
# Build the binary face / non-face dataset: the subject identities are
# discarded and every face gets label 1 (non-faces already carry label 0).
faces_dataset, _ = load_faces()
faces_labels = np.ones((len(faces_dataset), 1), dtype=int)

# Stack faces first, then non-faces, keeping samples and labels aligned.
all_labels = np.vstack((faces_labels, non_faces_labels))
all_dataset = np.vstack((faces_dataset, non_faces_dataset))

print(all_labels.shape, all_dataset.shape)

In [None]:
# Random 67/33 train/test split with a fixed seed so the run is reproducible.
X_train, X_test, y_train, y_test = train_test_split(all_dataset, all_labels, test_size=0.33, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# PCA retaining 90% of the variance, fitted on the training split only and
# then applied to both splits.
pca = MyPCA(alpha=0.9)
pca.fit(X_train)
reduced_X_train, reduced_X_test = pca.predict(X_train), pca.predict(X_test)
print(reduced_X_train.shape, reduced_X_test.shape)

In [None]:
# k-NN accuracy on the PCA-reduced face / non-face features.
plotKNN(reduced_X_train, y_train, reduced_X_test, y_test)

## LDA

In [None]:
# Shapes of the training samples and labels that the LDA cell will be fitted on.
print(X_train.shape, y_train.shape)


In [None]:
# LDA on the face / non-face data. The projections are stored under the names
# that are printed and passed to plotKNN; the original wrote them to
# reduced_train_X / reduced_test_X and then evaluated the stale PCA features.
lda_ef = MyLDA()
lda_ef.fit(X_train, y_train)
reduced_X_train = lda_ef.project(X_train)
reduced_X_test = lda_ef.project(X_test)
print(reduced_X_train.shape, reduced_X_test.shape)
plotKNN(reduced_X_train, y_train, reduced_X_test, y_test)

# plot non_faces count vs accuracy 

### PCA

In [None]:
### PCA
# Sweep the number of face subjects and the number of non-face images, and
# record the 1-NN accuracy on PCA features for each combination.
scores = []

for num_labels in range(10, 40 + 1, 5):
    # The faces depend only on num_labels, so load them once per outer step
    # instead of re-reading them for every non-face count.
    faces_dataset, _ = load_faces(num_labels=num_labels)
    faces_labels = np.ones((faces_dataset.shape[0], 1), dtype=int)
    num_faces = faces_dataset.shape[0]

    for num_non_faces in range(50, 401, 50):
        non_faces_dataset, non_faces_labels = load_images(num_non_faces)

        print(faces_dataset.shape, non_faces_dataset.shape)
        # combine faces and non_faces
        all_labels = np.concatenate((faces_labels, non_faces_labels), axis=0)
        all_dataset = np.concatenate((faces_dataset, non_faces_dataset), axis=0)

        print(all_dataset.shape, all_labels.shape)

        X_train, X_test, y_train, y_test = train_test_split(all_dataset, all_labels, test_size=0.33, random_state=42)

        # train
        pca = MyPCA(0.9)
        pca.fit(X_train)
        reduced_X_train = pca.predict(X_train)
        reduced_X_test = pca.predict(X_test)
        print(reduced_X_train.shape, reduced_X_test.shape)

        KNN = KNeighborsClassifier(n_neighbors=1)
        # scikit-learn expects 1-D labels; ravel avoids a DataConversionWarning.
        KNN.fit(reduced_X_train, y_train.ravel())

        # test
        score = KNN.score(reduced_X_test, y_test.ravel())

        scores.append([num_faces, num_non_faces, score])
        print("new score", num_faces, num_non_faces, score)

# Scatter plot: x = number of faces, y = number of non-faces, colour = accuracy.
pd.DataFrame(data=scores).plot.scatter(x=0, y=1, c=2, colormap='viridis')

## LDA

In [None]:
# Sweep the number of non-face images for a fixed set of 20 face subjects and
# record the 1-NN accuracy on LDA features.
scores = []

num_labels = 20
# The face set does not change inside the loop, so load it once.
faces_dataset, _ = load_faces(num_labels=num_labels)
faces_labels = np.ones((faces_dataset.shape[0], 1), dtype=int)

for num_non_faces in range(50, 401, 50):
    non_faces_dataset, non_faces_labels = load_images(num_non_faces)

    print(faces_dataset.shape, non_faces_dataset.shape)
    # combine faces and non_faces
    all_labels = np.concatenate((faces_labels, non_faces_labels), axis=0)
    all_dataset = np.concatenate((faces_dataset, non_faces_dataset), axis=0)

    print(all_dataset.shape, all_labels.shape)

    X_train, X_test, y_train, y_test = train_test_split(all_dataset, all_labels, test_size=0.33, random_state=42)

    # train
    lda = MyLDA()
    lda.fit(X_train, y_train)
    reduced_X_train = lda.project(X_train)
    reduced_X_test = lda.project(X_test)
    print(reduced_X_train.shape, reduced_X_test.shape)

    KNN = KNeighborsClassifier(n_neighbors=1)
    # scikit-learn expects 1-D labels; ravel avoids a DataConversionWarning.
    KNN.fit(reduced_X_train, y_train.ravel())

    # test
    score = KNN.score(reduced_X_test, y_test.ravel())

    scores.append([num_non_faces, score])
    print("new score", num_non_faces, score)

# Line plot: accuracy against the number of non-face images.
pd.DataFrame(data=scores).plot(x=0, y=1)