# PCA (Principal Component Analysis) from Scratch
**Objective:** Implement PCA using eigendecomposition of the covariance matrix to understand dimensionality reduction, variance-maximizing directions, and reconstruction error.

## Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random number generator so the np.random draws used
# to build the datasets in this notebook are the same on every run.
np.random.seed(42)

## Problem Setup (Minimal)
**Dimensionality Reduction** simplifies models and visualization by reducing feature count while keeping important information.

**PCA Idea:** Find new orthogonal axes (Principal Components) that point in the directions of **maximum variance** in the data.

**Requirement:** Data must be **centered** (mean = 0) so that the covariance matrix correctly represents the spread of data around the origin.

## Data

In [None]:
# A) 2D Highly Correlated Data
# x_1 is uniform on [0, 10); x_2 is a linear function of x_1 plus Gaussian
# noise (std 1.5), so almost all of the spread lies along a single line.
n_samples = 200
x_1 = np.random.rand(n_samples) * 10
x_2 = 2 * x_1 + 3 + np.random.randn(n_samples) * 1.5 # Strong linear correlation
X_2d = np.vstack([x_1, x_2]).T  # Shape (n_samples, 2)

# B) 3D Structured Data (2D Plane + Noise in 3rd dim)
x = np.random.randn(n_samples)
y = np.random.randn(n_samples)
z = 0.1 * np.random.randn(n_samples) # Little variance here
# Mix x into the second feature (a shear: y + 2*x) so that the first two
# features are correlated; the third feature stays low-variance noise.
X_3d = np.vstack([x, y + 2*x, z]).T  # Shape (n_samples, 3)

# Visualize 2D Data
plt.figure(figsize=(6, 4))
plt.scatter(X_2d[:, 0], X_2d[:, 1], alpha=0.7)
plt.title("2D Correlated Data")
plt.xlabel("Feature 1")
plt.ylabel("Feature 2")
plt.grid(True)
plt.show()

## Implementation (NumPy)
We will compute PCA via **Eigendecomposition of the Covariance Matrix**: $C = \frac{1}{n-1} X^T X$, where $X$ is the mean-centered data matrix with $n$ samples (rows).

In [None]:
def standardize_or_center(X, use_std=False):
    """Shift each column of X to zero mean, optionally scaling to unit std.

    Args:
        X: Data array with samples along axis 0.
        use_std: When True, also divide every column by its standard
            deviation (NumPy's default population std, ddof=0).

    Returns:
        A tuple ``(X_out, mean)`` where ``X_out`` is the centered (and
        optionally scaled) data and ``mean`` is the per-column mean that
        was subtracted.
    """
    mean = np.mean(X, axis=0)
    shifted = X - mean
    if not use_std:
        return shifted, mean

    scale = np.std(shifted, axis=0)
    # Constant columns have zero std; scale them by 1 so they are left
    # untouched instead of producing a division by zero.
    scale[scale == 0] = 1
    return shifted / scale, mean

def covariance_matrix(X_centered):
    """Return the sample covariance of already-centered data.

    Args:
        X_centered: Data with samples along axis 0 whose column means have
            already been subtracted.

    Returns:
        ``X_centered.T @ X_centered`` divided by ``n - 1``, where ``n`` is
        the number of samples (Bessel's correction).
    """
    degrees_of_freedom = X_centered.shape[0] - 1
    scatter = X_centered.T @ X_centered
    return scatter / degrees_of_freedom

def pca_fit_eig(X, n_components):
    """Fit PCA via eigendecomposition of the sample covariance matrix.

    Args:
        X: 2-D data array of shape (n_samples, n_features).
        n_components: Number of leading principal components to keep.
            Must satisfy ``0 <= n_components <= n_features``.

    Returns:
        A tuple ``(components, explained_variance,
        explained_variance_ratio, mean)``:

        * ``components``: array of shape (n_components, n_features) whose
          rows are the principal axes, ordered by decreasing variance.
          Each row's sign is fixed so that its largest-magnitude entry is
          positive.
        * ``explained_variance``: the ``n_components`` largest eigenvalues
          of the covariance matrix.
        * ``explained_variance_ratio``: ``explained_variance`` divided by
          the sum of all eigenvalues.
        * ``mean``: per-feature mean of ``X`` that was subtracted before
          fitting.

    Raises:
        ValueError: If ``X`` is not 2-D or ``n_components`` is outside
            ``[0, n_features]``.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(
            f"X must be 2-D (n_samples, n_features); got {X.ndim} dimension(s)"
        )
    n_features = X.shape[1]
    # A negative value would silently slice off trailing components and a
    # value above n_features would silently return fewer than requested.
    if not 0 <= n_components <= n_features:
        raise ValueError(
            f"n_components={n_components!r} must be between 0 and "
            f"n_features={n_features}"
        )

    # 1. Center Data
    X_centered, mean = standardize_or_center(X)

    # 2. Compute Covariance Matrix
    cov_mat = covariance_matrix(X_centered)

    # 3. Eigendecomposition
    # eigh is meant for symmetric matrices (like covariance) and returns
    # real eigenvalues in ascending order.
    eigenvalues, eigenvectors = np.linalg.eigh(cov_mat)

    # 4. Sort eigenpairs from largest to smallest eigenvalue
    order = np.argsort(eigenvalues)[::-1]
    # A covariance matrix is positive semi-definite, so any negative
    # eigenvalue is floating-point round-off; clamp it to zero.
    sorted_eigenvalues = np.clip(eigenvalues[order], 0.0, None)
    sorted_eigenvectors = eigenvectors[:, order]

    # 5. Select Top-K Components
    components = sorted_eigenvectors[:, :n_components].T  # (n_components, n_features)

    # An eigenvector is only defined up to sign. Fix the sign so the
    # largest-magnitude entry of every component is positive, which makes
    # components and projections reproducible across platforms.
    if components.size:
        pivot = np.argmax(np.abs(components), axis=1)
        signs = np.sign(components[np.arange(components.shape[0]), pivot])
        components = components * signs[:, np.newaxis]

    # Explained Variance Stats
    explained_variance = sorted_eigenvalues[:n_components]
    explained_variance_ratio = explained_variance / np.sum(sorted_eigenvalues)

    return components, explained_variance, explained_variance_ratio, mean

def pca_transform(X, components, mean):
    """Map X into principal-component coordinates.

    Subtracts ``mean`` from ``X`` and multiplies by the transpose of
    ``components``, giving one score per component for every sample.
    """
    centered = X - mean
    return centered @ components.T

def pca_inverse_transform(Z, components, mean):
    """Map component scores Z back into the original feature space.

    Multiplies ``Z`` by ``components`` and adds ``mean`` back, producing an
    approximation of the data that ``Z`` was computed from.
    """
    uncentered = Z @ components
    return uncentered + mean

def reconstruction_mse(X, X_recon):
    """Return the mean of the squared element-wise differences X - X_recon."""
    residual = X - X_recon
    return np.mean(residual ** 2)

## Experiments

In [None]:
# Experiment 1: 2D Data -> 1 Component
components_2d, _, ratio_2d, mean_2d = pca_fit_eig(X_2d, n_components=1)

# Project onto PC1, then map the 1-D scores back into the original 2-D space
Z_2d = pca_transform(X_2d, components_2d, mean_2d)
X_recon_2d = pca_inverse_transform(Z_2d, components_2d, mean_2d)
mse_2d = reconstruction_mse(X_2d, X_recon_2d)

print(f"2D Data Explained Variance Ratio (1st comp): {ratio_2d[0]:.4f}")
print(f"2D Data Reconstruction MSE: {mse_2d:.4f}")

# Visualization: Principal Component Direction
plt.figure(figsize=(6, 4))
plt.scatter(X_2d[:, 0], X_2d[:, 1], alpha=0.3, label="Original Data")
plt.scatter(X_recon_2d[:, 0], X_recon_2d[:, 1], color='red', alpha=0.5, label="Reconstructed (1D)")

# Draw Arrow for PC1, anchored at the data mean.
# components_2d[0] is unit-length, so scale it by 5 for visibility.
direction = components_2d[0] * 5
start = mean_2d
end = start + direction
# plt.arrow takes the displacement (dx, dy), so pass the direction vector
# directly rather than recomputing it as end - start.
plt.arrow(start[0], start[1], direction[0], direction[1],
          color='black', width=0.2, head_width=0.8, label="PC1 Direction")

plt.title("PCA (1 Component) on 2D Data")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Experiment 2: 3D Data -> K Components
n_dims_total = X_3d.shape[1]
ks = range(1, n_dims_total + 1)

# The eigendecomposition does not depend on K, so fit once with every
# component; the top-K model is simply the first K rows of the result.
all_comps, _, ratio, m = pca_fit_eig(X_3d, n_components=n_dims_total)

mses = []
for k in ks:
    comps = all_comps[:k]
    Z = pca_transform(X_3d, comps, m)
    X_rec = pca_inverse_transform(Z, comps, m)
    mse = reconstruction_mse(X_3d, X_rec)
    mses.append(mse)

# Entry k-1 is the fraction of total variance captured by the top k components
cumulative_variance = list(np.cumsum(ratio))

# Plotting Stats: MSE on the left axis, cumulative variance on the right axis
fig, ax1 = plt.subplots(figsize=(8, 5))

color = 'tab:blue'
ax1.set_xlabel('Number of Components K')
ax1.set_ylabel('Reconstruction MSE', color=color)
ax1.plot(ks, mses, color=color, marker='o')
ax1.tick_params(axis='y', labelcolor=color)

ax2 = ax1.twinx()  # Second y-axis sharing the same x-axis
color = 'tab:green'
ax2.set_ylabel('Cumulative Explained Variance (%)', color=color)
ax2.plot(ks, [frac * 100 for frac in cumulative_variance], color=color, marker='s', linestyle='--')
ax2.tick_params(axis='y', labelcolor=color)

plt.title("Tradeoff: Dimensionality vs Information Loss (3D Data)")
plt.show()

## Comparison (Optional)

In [None]:
from sklearn.decomposition import PCA

# Fit scikit-learn's reference PCA on the same 2D data; fit() returns the
# estimator itself, so construction and fitting can be chained.
sklearn_pca = PCA(n_components=1).fit(X_2d)

# Print the from-scratch ratio next to scikit-learn's for a direct check
print("Comparing 2D Data Results:")
print(f"My Explained Var Ratio:      {ratio_2d[0]:.6f}")
print(f"Sklearn Explained Var Ratio: {sklearn_pca.explained_variance_ratio_[0]:.6f}")
print("\nNote: Values should be identical (within float precision).")

## Results & Takeaways
*   **Variance as Information:** PCA assumes that the most informative directions are the ones with the largest variance. The 1st component is the direction of widest spread.
*   **Centering is Key:** Without subtracting the mean, the covariance computation would be incorrect, as it would capture the spread relative to origin (0,0) rather than the data's center.
*   **Correlation = Redundancy:** In the 2D example, features were highly correlated. PCA found that ~99% of the information could be compressed into a single dimension (linear combination of the two).
*   **Reconstruction Error:** As we increase $K$, we capture more variance and the Mean Squared Error (MSE) of reconstruction decreases, reaching zero (up to floating-point error) once $K$ equals the number of original features.

## Next Steps
*   This concludes **Module 1: Machine Learning**.
*   Proceed to **Module 2: Deep Learning**.
*   [Start Neural Networks](../nonlinear-models-neural-networks/README.md) (or verify next module path)