# PCA
## Principal Components

In [1]:
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

X, y = make_moons(n_samples=1000, noise=0.15)

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [3]:
import numpy as np

X_centered = X - X.mean(axis=0)
U, s, Vt = np.linalg.svd(X_centered)
c1 = Vt.T[:, 0]
c2 = Vt.T[:, 1]
print(c1)
print(c2)

[-0.94598919  0.32419816]
[0.32419816 0.94598919]


In [4]:
s2 = s**2
s2n = s2 / np.sum(s2)
# Note: "normalized squared singular value" is the "explained variance ratio"
print(s)
print(s2)
print(s2n)

[29.09371596 14.13204119]
[846.4443084  199.71458829]
[0.80909727 0.19090273]


In [5]:
W2 = Vt.T[:, :2]
X2d = X_centered @ W2

In [6]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [7]:
pca.components_.T[:, 0]

array([-0.94598919,  0.32419816])

In [8]:
# Explained Variance Ratio = normalized squared singular value
pca.explained_variance_ratio_

array([0.80909727, 0.19090273])

## Choosing the Right Number of Dimensions

In [9]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [10]:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

## PCA for Compression

In [11]:
# from sklearn.datasets import fetch_openml

# mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# X, y = mnist['data'], mnist['target']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
# # Store the dataset as a binary file using memmap (memory map)
# filename = "./mnist_data_X_train.dat"
# print(X_train.shape)
# fp = np.memmap(filename, dtype='float32', mode='w+', shape=X_train.shape)

# # Write the data in
# fp[:] = X_train[:]
# fp.flush()

In [13]:
# pca = PCA(n_components=154)
# X_reduced = pca.fit_transform(X)
# X_recovered = pca.inverse_transform(X_reduced)

## Randomized PCA

In [14]:
# rnd_pca = PCA(n_components=154, svd_solver="randomized")
# X_reduced = rnd_pca.fit_transform(X_train)

## Incremental PCA

In [15]:
# from sklearn.decomposition import IncrementalPCA

# n_batches = 100
# inc_pca = IncrementalPCA(n_components=154)
# for X_batch in np.array_split(X_train, n_batches):
#     inc_pca.partial_fit(X_batch)
    
# X_reduced = inc_pca.transform(X_train)

In [16]:
# from sklearn.decomposition import IncrementalPCA

# filename = "./mnist_data_X_train.dat"
# X_mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(56000, 784))

# n_batches = 100
# batch_size = X_mm.shape[0] // n_batches
# inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
# inc_pca.fit(X_mm)

# Kernel PCA

In [18]:
from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

clf = Pipeline([
        ("kpca", KernelPCA(n_components=2)),
        ("log_reg", LogisticRegression())
    ])
param_grid = [{
        "kpca__gamma": np.linspace(0.03, 0.05, 10),
        "kpca__kernel": ["rbf", "sigmoid"]
    }]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('kpca', KernelPCA(n_components=2)),
                                       ('log_reg', LogisticRegression())]),
             param_grid=[{'kpca__gamma': array([0.03      , 0.03222222, 0.03444444, 0.03666667, 0.03888889,
       0.04111111, 0.04333333, 0.04555556, 0.04777778, 0.05      ]),
                          'kpca__kernel': ['rbf', 'sigmoid']}])

In [21]:
grid_search.best_params_

{'kpca__gamma': 0.05, 'kpca__kernel': 'rbf'}

In [23]:
# Fit the inverse transform
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0433, fit_inverse_transform=True)
x_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [24]:
from sklearn.metrics import mean_squared_error

mean_squared_error(X, X_preimage)

0.03434644559768076

# LLE

In [26]:
# Locally Linear Embedding
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)