__Principal Component Analysis__  
Basic example: rotating the coordinate axes to align with the principal components



In [None]:
import random
import math
import matplotlib.pyplot as plt

# Step 1: Generate correlated 2D data
def generate_data(n=100):
    """Draw ``n`` noisy 2D samples scattered around the line y = 2x.

    Each sample takes x uniformly from [0, 10] and sets
    y = 2 * x + Gaussian noise with mean 0 and standard deviation 2.

    Args:
        n: Number of points to generate.

    Returns:
        A list of ``n`` two-element ``[x, y]`` lists.
    """
    def sample_point():
        # x is drawn before the noise term so the random stream is
        # consumed in a fixed order for every point.
        x_val = random.uniform(0, 10)
        noise = random.gauss(0, 2)
        return [x_val, 2 * x_val + noise]

    return [sample_point() for _ in range(n)]

# Step 2: Mean center the data
def mean_center(data):
    """Translate 2D points so that their centroid lies at the origin.

    Args:
        data: Sized, non-empty collection of points indexable as
            ``p[0]`` (x) and ``p[1]`` (y).

    Returns:
        A tuple ``(centered, (mean_x, mean_y))`` where ``centered`` is a
        list of ``[x - mean_x, y - mean_y]`` lists and the second item is
        the centroid that was subtracted.
    """
    count = len(data)
    xs = [point[0] for point in data]
    ys = [point[1] for point in data]
    mean_x = sum(xs) / count
    mean_y = sum(ys) / count

    centered = []
    for x_val, y_val in zip(xs, ys):
        centered.append([x_val - mean_x, y_val - mean_y])
    return centered, (mean_x, mean_y)

# Step 3: Covariance matrix
def covariance_matrix(data):
    """Build the 2x2 covariance matrix of 2D points.

    The means are not subtracted here, so the result is a covariance
    matrix only when ``data`` has already been mean-centered. Sums are
    divided by ``n`` (not ``n - 1``).

    Args:
        data: Sized, non-empty collection of points indexable as
            ``p[0]`` and ``p[1]``.

    Returns:
        ``[[cov_xx, cov_xy], [cov_xy, cov_yy]]`` as nested lists.
    """
    count = len(data)

    def mean_product(i, j):
        # Average of the products of coordinates i and j over all points.
        return sum(point[i] * point[j] for point in data) / count

    var_x = mean_product(0, 0)
    var_y = mean_product(1, 1)
    cross = mean_product(0, 1)
    return [[var_x, cross], [cross, var_y]]

# Step 4: Eigen decomposition (2x2)
def eigen_decomposition_2x2(matrix):
    """Eigen-decompose a symmetric 2x2 matrix in closed form.

    Args:
        matrix: Nested sequence ``[[a, b], [_, d]]``. Only ``a``, ``b`` and
            ``d`` are read; the lower-left entry is assumed to equal ``b``
            (symmetric matrix).

    Returns:
        ``([eig1, eig2], [v1, v2])`` with ``eig1 >= eig2``. ``v1`` and
        ``v2`` are the matching eigenvectors; they are NOT normalized.
    """
    a, b = matrix[0]
    _, d = matrix[1]
    trace = a + d
    # For a symmetric matrix, trace**2 - 4*det == (a - d)**2 + (2*b)**2.
    # Evaluating it as a hypotenuse can never produce a negative radicand
    # through rounding, which would make math.sqrt raise.
    spread = math.hypot(a - d, 2 * b)
    eig1 = (trace + spread) / 2
    eig2 = (trace - spread) / 2

    if b != 0:
        # (A - eig*I) v = 0 is satisfied by v = [eig - d, b].
        v1 = [eig1 - d, b]
        v2 = [eig2 - d, b]
    elif a >= d:
        # Already diagonal: the larger eigenvalue (a) belongs to the x-axis,
        # and the second eigenvector must be the other axis.
        v1, v2 = [1, 0], [0, 1]
    else:
        v1, v2 = [0, 1], [1, 0]
    return [eig1, eig2], [v1, v2]

# Step 5: Project onto both principal components
def project_full(data, eigenvectors):
    """Express each point in the basis formed by the first two eigenvectors.

    Args:
        data: Iterable of points (sequences of numbers).
        eigenvectors: Sequence whose first two entries are non-zero
            vectors; they are scaled to unit length before use.

    Returns:
        ``(projected, pc1, pc2)`` where ``projected`` holds one
        ``[coordinate along pc1, coordinate along pc2]`` list per point and
        ``pc1`` / ``pc2`` are the unit-length basis vectors.
    """
    def unit(vec):
        length = math.sqrt(sum(component**2 for component in vec))
        return [component / length for component in vec]

    def inner(u, v):
        return sum(a * b for a, b in zip(u, v))

    pc1 = unit(eigenvectors[0])
    pc2 = unit(eigenvectors[1])

    projected = []
    for point in data:
        projected.append([inner(point, pc1), inner(point, pc2)])
    return projected, pc1, pc2

# Step 6: Plot original and transformed data
def plot_full(original_data, projected_data, eigenvectors, mean, eigenvalues):
    """Show the raw data with its principal axes next to the rotated data.

    Args:
        original_data: Sequence of ``[x, y]`` points in the original axes.
        projected_data: Sequence of ``[pc1, pc2]`` coordinates.
        eigenvectors: Two direction vectors drawn as arrows from ``mean``.
        mean: ``(mean_x, mean_y)`` anchor point for the arrows.
        eigenvalues: Accepted for interface compatibility; not used here.
    """
    orig_x, orig_y = zip(*original_data)
    proj_x, proj_y = zip(*projected_data)

    plt.figure(figsize=(12, 5))

    # Left panel: original cloud with the two principal directions.
    plt.subplot(1, 2, 1)
    plt.scatter(orig_x, orig_y, alpha=0.6)
    # A scale below 1 lengthens the arrows so they stay visible
    # against the spread of the data.
    arrow_style = dict(angles='xy', scale_units='xy', scale=0.125)
    for position, colour, tag in ((0, 'r', 'PC1'), (1, 'g', 'PC2')):
        direction = eigenvectors[position]
        plt.quiver(mean[0], mean[1], direction[0], direction[1],
                   color=colour, label=tag, **arrow_style)
    plt.title("Original Data with Principal Components")
    plt.axis('equal')
    plt.legend()

    # Right panel: the same points expressed in the PCA basis.
    plt.subplot(1, 2, 2)
    plt.scatter(proj_x, proj_y, alpha=0.6, color='orange')
    plt.title("Data in PCA Basis (PC1 vs PC2)")
    plt.axis('equal')

    plt.tight_layout()
    plt.show()


# Run PCA
# Pipeline: synthetic data -> centering -> covariance -> eigen-decomposition
# -> projection onto the (normalized) eigenvectors.
data = generate_data()
centered_data, mean = mean_center(data)
cov = covariance_matrix(centered_data)
eigenvalues, eigenvectors = eigen_decomposition_2x2(cov)
# project_full returns the unit-length versions of the eigenvectors as pc1, pc2.
projected_data, pc1, pc2 = project_full(centered_data, eigenvectors)

# Output eigen info
# Note: these are the eigenvectors as returned by eigen_decomposition_2x2,
# i.e. before normalization.
print("Eigenvalues:", eigenvalues)
print("Eigenvectors:")
for vec in eigenvectors:
    print(vec)

# Plot
# The normalized directions (pc1, pc2) are what gets drawn as arrows.
plot_full(data, projected_data, [pc1, pc2], mean,eigenvalues)

PCA applied to face data (ORL Dataset)

In [None]:
import os
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Path to ORL face dataset (expects sub-folders named s1, s2, ...)
dataset_path = 'orl_faces'
num_classes = 10
images_per_class = 5

# Set up the plot grid: one row per class, one column per image
fig, axes = plt.subplots(num_classes, images_per_class, figsize=(10, 10))
fig.suptitle(
    f"ORL Face Dataset: {images_per_class} Faces from Each of {num_classes} Classes",
    fontsize=16,
)

for class_idx in range(1, num_classes + 1):
    class_folder = os.path.join(dataset_path, f's{class_idx}')
    # Lexicographic order of file names; only the first few are shown.
    image_files = sorted(os.listdir(class_folder))[:images_per_class]

    for img_idx, img_file in enumerate(image_files):
        img_path = os.path.join(class_folder, img_file)
        img = mpimg.imread(img_path)
        ax = axes[class_idx - 1, img_idx]
        ax.imshow(img, cmap='gray')
        # ax.axis('off') would also hide the y-label set below, so remove
        # only the ticks and the frame and keep the label visible.
        ax.set_xticks([])
        ax.set_yticks([])
        for spine in ax.spines.values():
            spine.set_visible(False)
        if img_idx == 0:
            ax.set_ylabel(f'Class {class_idx}', fontsize=10)

# Leave room at the top for the figure title.
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from PIL import Image

# Step 1: Load ORL face images into a matrix
def load_orl_faces(dataset_path='orl_faces', num_classes=40, images_per_class=10):
    """Read ORL face images into a matrix with one flattened image per row.

    Images are taken from ``<dataset_path>/s1`` ... ``<dataset_path>/s<num_classes>``;
    within each folder the file names are sorted and the first
    ``images_per_class`` entries are loaded as grayscale float32 vectors.

    Args:
        dataset_path: Root directory that contains the ``s<k>`` folders.
        num_classes: Number of ``s<k>`` folders to read, starting at ``s1``.
        images_per_class: Maximum number of files taken from each folder.

    Returns:
        ``(matrix, image_shape)`` where ``matrix`` is ``np.array`` of the
        flattened images and ``image_shape`` is ``(height, width)`` of the
        first image loaded, or ``None`` if no image was read.
    """
    rows = []
    image_shape = None
    subject_dirs = [
        os.path.join(dataset_path, f's{subject}')
        for subject in range(1, num_classes + 1)
    ]
    for subject_dir in subject_dirs:
        for filename in sorted(os.listdir(subject_dir))[:images_per_class]:
            grey = Image.open(os.path.join(subject_dir, filename)).convert('L')  # grayscale
            rows.append(np.asarray(grey, dtype=np.float32).flatten())
            if image_shape is None:
                # PIL reports size as (width, height); store (height, width).
                width, height = grey.size
                image_shape = (height, width)
    return np.array(rows), image_shape

# Step 2: Perform PCA
def perform_pca(data, n_components=100):
    """Fit a PCA model on ``data`` and project the same samples onto it.

    The model uses the randomized SVD solver without whitening; no
    ``random_state`` is passed to it.

    Args:
        data: Sample matrix with one observation per row.
        n_components: Number of principal components to keep.

    Returns:
        ``(pca, transformed)``: the fitted PCA object and the projection
        of ``data`` onto its components.
    """
    model = PCA(
        n_components=n_components,
        svd_solver='randomized',
        whiten=False,
    )
    scores = model.fit_transform(data)
    return model, scores

# Step 3: Plot mean face and eigenfaces
import matplotlib.pyplot as plt

def plot_mean_and_top50_eigenfaces(pca, image_shape):
    """
    Plots the mean face followed by the first 50 eigenfaces from a PCA model
    (fewer if the model holds fewer than 50 components).

    Parameters:
        pca (PCA): Trained PCA object with `mean_` and `components_`.
        image_shape (tuple): Shape of the original images (height, width).
    """
    # Never request more eigenfaces than the model actually contains.
    num_eigenfaces = min(50, len(pca.components_))
    cols = 10
    total_plots = num_eigenfaces + 1  # +1 for the mean face
    # Ceiling division: the mean face plus 50 eigenfaces needs 51 cells,
    # which does not fit in a 5 x 10 grid.
    rows = -(-total_plots // cols)

    # Roughly 2 inches per panel in each direction.
    plt.figure(figsize=(2 * cols, 2 * rows))

    # Plot mean face in the first cell
    plt.subplot(rows, cols, 1)
    plt.imshow(pca.mean_.reshape(image_shape), cmap='gray')
    plt.title("Mean Face")
    plt.axis('off')

    # Plot eigenfaces in the following cells (subplot indices are 1-based)
    for i in range(num_eigenfaces):
        plt.subplot(rows, cols, i + 2)
        plt.imshow(pca.components_[i].reshape(image_shape), cmap='gray')
        plt.title(f"PC {i+1}")
        plt.axis('off')

    plt.tight_layout()
    plt.show()

# Step 4: Reconstruct faces using increasing number of eigenfaces
def plot_reconstructions(pca, transformed, original_data, image_shape, face_indices=(0, 1), steps=(5, 15, 25, 50, 100)):
    """Show how selected faces are rebuilt from a growing number of eigenfaces.

    For every index in ``face_indices`` one figure is drawn: the original
    image followed by one reconstruction per entry of ``steps``. A
    reconstruction is the mean face plus the first ``n`` components weighted
    by that face's PCA coefficients.

    Parameters:
        pca (PCA): Fitted PCA object with `mean_` and `components_`.
        transformed: PCA coefficients, one row per face.
        original_data: Flattened original images, one row per face.
        image_shape (tuple): Shape of the original images (height, width).
        face_indices: Row indices of the faces to reconstruct.
        steps: Numbers of leading components to use. Values larger than
            the number of available components are clamped to it.
    """
    available = len(pca.components_)
    for idx in face_indices:
        plt.figure(figsize=(12, 2))
        original = original_data[idx].reshape(image_shape)
        plt.subplot(1, len(steps) + 1, 1)
        plt.imshow(original, cmap='gray')
        plt.title("Original")
        plt.axis('off')

        for i, n in enumerate(steps):
            # Clamp so a step beyond the fitted components does not raise.
            used = min(n, available)
            # Weighted sum of the first `used` components in one matrix product.
            approx = pca.mean_ + transformed[idx][:used] @ pca.components_[:used]
            plt.subplot(1, len(steps) + 1, i + 2)
            plt.imshow(approx.reshape(image_shape), cmap='gray')
            plt.title(f"{used} PCs")
            plt.axis('off')

        plt.suptitle(f"Reconstruction of Face {idx}")
        plt.tight_layout()
        plt.show()

# Run the pipeline
# Loads with the defaults of load_orl_faces (up to 40 folders x 10 images).
data, image_shape = load_orl_faces()
# NOTE(review): n_components=400 presumably equals the number of loaded images
# (40 x 10); confirm the dataset is complete, otherwise PCA may reject the value.
pca, transformed = perform_pca(data, n_components=400)

plot_mean_and_top50_eigenfaces(pca, image_shape)
# Faces 0 and 10 are the first images of the first two folders when every
# folder contributes 10 images.
plot_reconstructions(pca, transformed, data, image_shape, face_indices=[0, 10], steps=[25, 50, 100, 200, 400])


**Face Recognition**


In [None]:
# !pip install seaborn
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

def evaluate_knn_face_recognition(data, image_shape, pca, transformed, num_components=50, test_size=0.3, k=3, num_classes=40):
    """Evaluate k-NN face recognition on PCA features and show the errors.

    Labels are generated on the assumption that ``data`` holds the same
    number of images for each class, stored class after class. The samples
    are split (stratified, fixed seed), a k-NN classifier is trained on the
    leading PCA coefficients, and accuracy, a confusion-matrix heatmap and
    up to 10 misclassified examples are displayed.

    Parameters:
        data: Flattened images, one row per image (array with ``.shape``).
        image_shape (tuple): Shape of the original images (height, width).
        pca: Fitted PCA object; accepted for interface compatibility and
            not used by this function.
        transformed: PCA coefficients of ``data``, one row per image.
        num_components: Number of leading PCA coefficients used as features.
        test_size: Fraction of samples held out for validation.
        k: Number of neighbours for the classifier.
        num_classes: Number of classes contained in ``data``.

    Raises:
        ValueError: If ``num_classes`` is not positive or does not evenly
            divide the number of samples.
    """
    n_samples = data.shape[0]
    if num_classes <= 0 or n_samples % num_classes != 0:
        # A silent mismatch here would attach wrong labels to the images.
        raise ValueError(
            f"Cannot assign {n_samples} samples to {num_classes} equally sized classes"
        )
    labels = np.repeat(np.arange(num_classes), n_samples // num_classes)

    # Use only the top `num_components` PCA features
    # NOTE(review): `transformed` comes from the caller; if the PCA was fitted
    # on all of `data`, the validation samples influenced the projection.
    reduced_data = transformed[:, :num_components]

    # Split data; the sample indices are split alongside so that
    # misclassified rows can be mapped back to the original images.
    X_train, X_val, y_train, y_val, idx_train, idx_val = train_test_split(
        reduced_data, labels, np.arange(len(data)), test_size=test_size, stratify=labels, random_state=42
    )

    # Train k-NN
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_val)

    # Accuracy
    acc = accuracy_score(y_val, y_pred)
    print(f"Validation Accuracy using {num_components} PCA components: {acc:.2%}")

    # Confusion matrix
    cm = confusion_matrix(y_val, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

    # Visualize misclassifications
    misclassified = np.where(y_val != y_pred)[0]
    print(f"Number of misclassified samples: {len(misclassified)}")

    for i in misclassified[:10]:  # Show up to 10 errors
        true_idx = idx_val[i]
        pred_class = y_pred[i]
        true_class = y_val[i]

        # Find a representative image from the predicted class
        # (the first image of that class in `data`).
        pred_idx = np.where(labels == pred_class)[0][0]

        fig, axes = plt.subplots(1, 2, figsize=(6, 3))
        axes[0].imshow(data[true_idx].reshape(image_shape), cmap='gray')
        # Labels are 0-based while the dataset folders are s1, s2, ...
        axes[0].set_title(f"True: s{true_class+1}")
        axes[0].axis('off')

        axes[1].imshow(data[pred_idx].reshape(image_shape), cmap='gray')
        axes[1].set_title(f"Predicted: s{pred_class+1}")
        axes[1].axis('off')

        plt.suptitle("Misclassified Example")
        plt.tight_layout()
        plt.show()

# Evaluate 3-NN recognition on the first 200 PCA coefficients with a 30% hold-out.
evaluate_knn_face_recognition(data, image_shape, pca, transformed, num_components=200, test_size=0.3, k=3)
