In [1]:
import pandas as pd
import numpy as np
# from numpy.linalg import svd

In [2]:
X = np.random.rand(100,3)
X.shape # 3d Data

(100, 3)

In [3]:
from sklearn.decomposition import PCA

In [4]:
pca = PCA(n_components=2)
X2D = pca.fit_transform(X)

### Explained Variance Ratio

In [5]:
pca.explained_variance_ratio_

array([0.40783159, 0.29978538])

In [6]:
pca.explained_variance_

array([0.10038079, 0.07378706])

# Choosing Right Number of Dimensions

In [18]:
from sklearn.decomposition import PCA

In [19]:
# Build 3D data set 

np.random.seed(4)
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5
X = np.empty((m, 3))
X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2
X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)


In [20]:
pca = PCA()

In [21]:
X.shape

(60, 3)

In [22]:
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [23]:
pca.explained_variance_

array([0.77830975, 0.1351726 , 0.01034272])

In [24]:
cumsum = np.cumsum(pca.explained_variance_)
cumsum

array([0.77830975, 0.91348235, 0.92382507])

In [35]:
np.argmax(cumsum >= 0.77)

0

In [31]:
# get dimension 
d = np.argmax(cumsum >= 0.95)  + 1
d

1

#### MNIST Data Set

In [36]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1)
mnist.target = mnist.target.astype(np.uint8)

In [37]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [49]:
X_train.shape

(52500, 784)

In [50]:
pca.explained_variance_ratio_.shape

(784,)

In [51]:
pca.explained_variance_ratio_[1:10]

array([0.07155316, 0.06170876, 0.05401742, 0.04905855, 0.0430278 ,
       0.03278245, 0.02884629, 0.02748578, 0.02356632])

In [52]:
np.cumsum(pca.explained_variance_ratio_)

array([0.09719832, 0.16875148, 0.23046024, 0.28447767, 0.33353621,
       0.37656401, 0.40934646, 0.43819275, 0.46567853, 0.48924485,
       0.51032629, 0.5307285 , 0.54778858, 0.56465048, 0.58041792,
       0.59534958, 0.60862878, 0.62147783, 0.63334578, 0.64479193,
       0.65545804, 0.66555448, 0.67514241, 0.68416896, 0.69296211,
       0.70131513, 0.70939893, 0.71727437, 0.72468736, 0.73157212,
       0.73812949, 0.74459959, 0.75058197, 0.75643475, 0.76210811,
       0.7675608 , 0.77261474, 0.77750626, 0.78230885, 0.78696883,
       0.79152081, 0.79597374, 0.80014325, 0.80411726, 0.80795962,
       0.81171266, 0.81533146, 0.81882001, 0.8221978 , 0.82541301,
       0.82859763, 0.83168676, 0.83465474, 0.83752128, 0.84034773,
       0.84303834, 0.84571815, 0.84828932, 0.85083357, 0.85329543,
       0.85569464, 0.8580644 , 0.86035289, 0.86256245, 0.86468703,
       0.86673787, 0.86875875, 0.87071732, 0.87263867, 0.87451472,
       0.87637902, 0.87817437, 0.87993644, 0.88167564, 0.88332

In [54]:
cumsum >= 0.95

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [56]:
np.argmax([False,False,False, False, True, True])

4

In [59]:
np.argmax([False,False,False, True, True, True])

3

In [61]:
np.argmax([False,False,True, True, True, True])

2

In [63]:
np.argmax([True,True,True, True, True, True])

0

In [66]:
np.argmax([False,False,False, False, False, False])

0

In [68]:
np.argmax([False,True,False, True, False, True])

# For getting dim we add 1 

1

In [69]:
np.argmax(cumsum >= 0.95)

153

In [70]:
pca = PCA()
pca.fit(X_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1

In [71]:
cumsum.shape

(784,)

In [72]:
d

154

# Direct Way

In [74]:
pca  = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [75]:
X_reduced.shape

(52500, 154)

In [76]:
# Great 