# Dimensionality Reduction

In [2]:
import numpy as np

# Synthetic 3D dataset: noisy points that lie close to the plane
# x2 = w1 * x0 + w2 * x1.
np.random.seed(4)  # fixed seed so the generated points are reproducible

m = 60             # number of samples
w1, w2 = 0.1, 0.3  # weights tying the third coordinate to the first two
noise = 0.1        # scale applied to the Gaussian noise terms

# Angles are drawn uniformly from [-0.5, 3*pi/2 - 0.5).
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# The noise draws must happen in this order (x0, x1, x2) so the seeded
# random stream produces the same values for each column.
x0 = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

# Assemble the (m, 3) data matrix, one coordinate per column.
X = np.column_stack((x0, x1, x2))

In [3]:
from sklearn.decomposition import PCA

# Fit PCA on the 3D points and project them onto the first two principal components.
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

In [4]:
# Share of the dataset's variance that lies along each of the two retained axes.
pca.explained_variance_ratio_

array([0.84248607, 0.14631839])

This tells you that 84.2% of the dataset’s variance lies along the first axis, and 14.6% lies along the
second axis. This leaves less than 1.2% for the third axis, so it is reasonable to assume that it probably
carries little information.

In [7]:
from sklearn.datasets import fetch_openml

# Download the 'mnist_784' dataset from OpenML.
# version=1 pins a specific dataset version; without it the call resolves to
# whichever version OpenML currently marks as "active", so reruns could load
# different data (and scikit-learn may warn when several versions are active).
mnist = fetch_openml('mnist_784', version=1)

In [8]:
from sklearn.model_selection import train_test_split

# Pull the feature matrix and the labels out of the fetched dataset,
# then hold out part of it as a test set using train_test_split's defaults.
# NOTE(review): no random_state is passed, so the split is not pinned by this
# cell itself - consider setting one if exact reproducibility matters.
X, y = mnist["data"], mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y)

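You could compute d, the smallest number of principal components whose cumulative explained variance
reaches the ratio you want to preserve (e.g., 95%), then set n_components=d and run PCA again. However,
there is a much better option: instead of specifying the number of principal components you want to
preserve, you can set n_components to be a float between 0.0 and 1.0, indicating the ratio of variance
you wish to preserve: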

In [9]:
# A float n_components is read as the ratio of variance to preserve (here 95%),
# rather than as a fixed number of principal components.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [10]:
# Per-component share of the training set's variance for the components kept by the 95% fit.
pca.explained_variance_ratio_

array([0.09719832, 0.07155316, 0.06170876, 0.05401742, 0.04905855,
       0.0430278 , 0.03278245, 0.02884629, 0.02748578, 0.02356632,
       0.02108144, 0.02040221, 0.01706009, 0.01686189, 0.01576744,
       0.01493166, 0.0132792 , 0.01284905, 0.01186795, 0.01144615,
       0.01066611, 0.01009644, 0.00958792, 0.00902655, 0.00879315,
       0.00835302, 0.0080838 , 0.00787544, 0.00741299, 0.00688476,
       0.00655737, 0.00647011, 0.00598237, 0.00585278, 0.00567335,
       0.0054527 , 0.00505394, 0.00489152, 0.00480259, 0.00465999,
       0.00455198, 0.00445293, 0.00416951, 0.00397401, 0.00384236,
       0.00375304, 0.0036188 , 0.00348855, 0.00337779, 0.00321521,
       0.00318462, 0.00308914, 0.00296798, 0.00286653, 0.00282645,
       0.0026906 , 0.00267981, 0.00257117, 0.00254425, 0.00246185,
       0.00239921, 0.00236976, 0.00228849, 0.00220956, 0.00212458,
       0.00205084, 0.00202087, 0.00195857, 0.00192135, 0.00187604,
       0.00186431, 0.00179534, 0.00176207, 0.00173921, 0.00165

In [11]:
# Number of principal components PCA kept to preserve the requested 95% of the variance.
pca.n_components_

154