Kevin Lin

Data 602, Homework 3

# Task
Implement 1NN with 5-fold cross validation and use PCA to reduce the dimensionality to 60.

In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
from pathlib import Path
from PIL import Image
from sklearn.model_selection import StratifiedKFold

# Prepare and Preprocess the Data
First, import the images, flatten each one into a 1-dimensional vector, and build a list of the corresponding labels from the file names.

In [None]:
# Extract the archive holding the face images
file_path = '/content/Face Data for Homework.zip'
with zipfile.ZipFile(file_path, 'r') as archive:
    archive.extractall('/content/')

# The extracted folder ships with a README text file; drop it if it is there
readme_path = '/content/ATT/README'
if os.path.exists(readme_path):
    os.remove(readme_path)

# Build two parallel lists, one entry per PNG file:
#   vector_labels[i] - the integer before the first '_' in the file name (ie. '10_2.png' -> 10)
#   vector_images[i] - the image's pixel array flattened into a single 1D vector
vector_images = []
vector_labels = []
for png_path in Path('/content/ATT').glob('*.png'):
    label_text = png_path.stem.partition('_')[0]
    vector_labels.append(int(label_text))
    with Image.open(png_path) as img:
        pixels = np.array(img)
        vector_images.append(pixels.flatten())

# Summarise what was loaded and show the first sample
image_count = len(vector_images)
label_count = len(vector_labels)
vector_length = len(vector_images[0])
print("Images:", image_count, "\nImage Vector Length:", vector_length, "\nLabels:", label_count)
print("Label:", vector_labels[0], "| Image:", vector_images[0])

Images: 400 
Image Vector Length: 10304 
Labels: 400
Label: 25 | Image: [123 122 121 ...  84  92  94]


# Implement 1NN with 5-Fold Cross-Validation (Stratified)
We need to shuffle the images within their corresponding labels so that there is the same number of training and testing images for each of the 40 subjects. This gives 8 training photos and 2 testing photos per person, for a total of 320 training photos and 80 testing photos per fold.

---

Then for each fold implement PCA using `numpy.linalg.eig` on the dataset to be used for kNN where k = 1.

---

Test and display results for each fold. Then aggregate and average the results at the end (we are asked for average prediction accuracy which is TP/(TP+FP)).

In [None]:
# Make our own kNN classifier
def ONEnn_predict(X_train, y_train, X_test):
    """Predict the label of one test point with a 1-nearest-neighbour rule.

    Args:
        X_train: 2D array-like of training samples, one sample per row.
        y_train: Sequence of labels; y_train[i] is the label of X_train[i].
        X_test: 1D array-like holding a single test point with the same
            number of features as a row of X_train.

    Returns:
        The element of y_train belonging to the training row with the
        smallest Euclidean distance to X_test. When several rows are equally
        close, the one with the lowest index is used.
    """
    # Work in floating point: subtracting unsigned-integer arrays (e.g. raw
    # uint8 pixels) would wrap around and produce wrong distances.
    train = np.asarray(X_train, dtype=float)
    point = np.asarray(X_test, dtype=float)

    # Distance is Euclidean norm between test point and all of training set samples
    distances = np.linalg.norm(train - point, axis=1)

    # Only the single closest point is needed, so take the minimum directly
    # instead of sorting every distance.
    nearest = np.argmin(distances)

    # Get and return the label of the closest point
    return y_train[nearest]

In [None]:
# Implement our own SVD to get the Principal Components
def compute_svd(matrix, n_components=60):
    """Return the leading principal directions of ``matrix`` as unit columns.

    The decomposition is done on the small (samples x samples) matrix
    ``matrix @ matrix.T`` rather than the (features x features) one, and the
    resulting eigenvectors are mapped back into feature space.

    Args:
        matrix: 2D array-like of shape (samples, features). The function does
            not centre it; pass already-centred data for PCA.
        n_components: Maximum number of components to return (default 60).

    Returns:
        Array of shape (features, min(n_components, samples)). Columns are
        ordered by decreasing eigenvalue and have unit length. A direction
        whose mapped-back norm is negligible is returned as an all-zero
        column instead of being divided by (almost) zero.

    Raises:
        ValueError: If n_components is smaller than 1.
    """
    if n_components < 1:
        raise ValueError("n_components must be at least 1")

    # Float copy so integer inputs cannot overflow in the matrix products
    matrix = np.asarray(matrix, dtype=float)

    # Use the shorter side and not the image size side
    gram = matrix @ matrix.T

    # The Gram matrix is symmetric, so eigh applies: it returns real
    # eigenvalues in ascending order. Reverse them explicitly so that the
    # "first" components really are the ones with the largest eigenvalues.
    eigenvalues, U = np.linalg.eigh(gram)
    top = np.argsort(eigenvalues)[::-1][:n_components]

    # Map the selected eigenvectors back into feature space
    # (features x samples) @ (samples x k) -> (features x k)
    eigenvectors = matrix.T @ U[:, top]

    # Normalize each column, skipping columns that are numerically zero
    norms = np.linalg.norm(eigenvectors, axis=0)
    tolerance = norms.max(initial=0.0) * max(matrix.shape, default=0) * np.finfo(float).eps
    usable = norms > tolerance
    eigenfaces = np.zeros_like(eigenvectors)
    eigenfaces[:, usable] = eigenvectors[:, usable] / norms[usable]

    return eigenfaces

In [None]:
# Use the builtin StratifiedKFold to divide the folds.
# The shuffle is seeded so the same split, and therefore the same
# accuracies, come out every time this cell is run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Convert vector_images and vector_labels into numpy arrays so they can be indexed by the fold index arrays
vector_images = np.array(vector_images)
vector_labels = np.array(vector_labels)

# For each fold implement the Eigenfaces (PCA) Method, and then test on the testing dataset
results = []
for fold, (train_index, test_index) in enumerate(skf.split(vector_images, vector_labels), 1):
    # Split the data into training and testing sets
    X_train, X_test = vector_images[train_index], vector_images[test_index]
    y_train, y_test = vector_labels[train_index], vector_labels[test_index]

    # Center both sets on the mean of the training sample only,
    # so nothing about the test images leaks into the model
    X_train_mean = np.mean(X_train, axis=0)
    X_train_centered = X_train - X_train_mean
    X_test_centered = X_test - X_train_mean

    # Perform PCA on the training data; compute_svd returns the principal
    # components as columns (60 of them by default)
    pca_60 = compute_svd(X_train_centered)

    # Project the centered training and test features onto the components
    X_train_pca = X_train_centered @ pca_60
    X_test_pca = X_test_centered @ pca_60

    # Predict a label for every projected test point and count the matches
    predictions = np.array([
        ONEnn_predict(X_train_pca, y_train, test_point)
        for test_point in X_test_pca
    ])
    correct_predictions = int(np.count_nonzero(predictions == y_test))

    # Accuracy for this fold: correct predictions over the number of test samples
    accuracy = correct_predictions / len(y_test)
    results.append(accuracy)

    # Display the results for each of the folds
    print(f"Fold {fold}: Accuracy = {accuracy}")


# Aggregate the results at the end
average_accuracy = np.mean(results)
print(f"Average Accuracy: {average_accuracy}")

(320, 60) (80, 60)
Fold 1: Accuracy = 0.975
(320, 60) (80, 60)
Fold 2: Accuracy = 0.9625
(320, 60) (80, 60)
Fold 3: Accuracy = 1.0
(320, 60) (80, 60)
Fold 4: Accuracy = 0.9625
(320, 60) (80, 60)
Fold 5: Accuracy = 0.975
Average Accuracy: 0.975
