# Chapter 8 Dimensionality Reduction

## Section 3 PCA (Principal Component Analysis)

### 3.2 Principal Components

In [1]:
# Build a small, noisy 3D dataset whose points lie close to a 2D plane

# numpy supplies the random draws and the array arithmetic
import numpy as np

# fix the seed so the generated points are reproducible
np.random.seed(4)

# number of samples, plane weights for the third feature, and noise scale
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

# random angles covering three quarters of a circle, shifted by -0.5 rad
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# the noise terms are drawn in x, y, z order so the seeded stream is consumed
# exactly as before
x_coord = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
y_coord = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
# the third feature is a weighted mix of the first two plus noise
z_coord = x_coord * w1 + y_coord * w2 + noise * np.random.randn(m)

# assemble the (m, 3) data matrix, one feature per column
X = np.column_stack((x_coord, y_coord, z_coord))

In [2]:
# display the generated data matrix (one row per sample, one column per feature)
print(X)

[[-1.01570027 -0.55091331 -0.26132626]
 [-0.00771675  0.59958572  0.03507755]
 [-0.95317135 -0.46453691 -0.24920288]
 [-0.92012304  0.21009593  0.02182381]
 [-0.76309739  0.158261    0.19152496]
 [ 1.11816122  0.32508721  0.31710572]
 [-1.02258878 -0.64384064 -0.13368695]
 [ 0.67351984 -0.27342519 -0.00787835]
 [ 1.01619558  0.51546608  0.46783297]
 [ 0.54957723  0.67728016  0.2340159 ]
 [-0.98960443  0.00886617 -0.12152034]
 [ 1.13248106  0.28229967  0.06972972]
 [-0.99337712 -0.26832824 -0.15761101]
 [-0.94763679 -0.4995849  -0.13927911]
 [ 1.10095709  0.1706481   0.09790432]
 [-0.34604591  0.45334414  0.06433843]
 [ 0.69102621 -0.27337761 -0.05926516]
 [ 0.68302902  0.70421846  0.24642318]
 [ 0.8774031  -0.16775101 -0.02162333]
 [-1.06090127 -0.48213721 -0.38573526]
 [ 0.52336644  0.66585845  0.39019099]
 [-0.94419403 -0.61502157 -0.47610118]
 [-0.9716288   0.00742468 -0.18764369]
 [-1.10560661 -0.31903307 -0.17189644]
 [ 1.1748694   0.15718214  0.26579776]
 [ 0.91337123 -0.1608149 

In [2]:
# center the data by removing the mean of every feature (column)
X_centered = X - np.mean(X, axis=0)

# singular value decomposition of the centered matrix
U, s, Vt = np.linalg.svd(X_centered)

# each row of Vt is one principal component direction
print(Vt)

[[ 0.93636116  0.29854881  0.18465208]
 [-0.34027485  0.90119108  0.2684542 ]
 [-0.08626012 -0.31420255  0.94542898]]


In [3]:
# check the shape of the per-feature mean vector (the quantity subtracted when centering X)
print(X.mean(axis=0).shape)

(3,)


### 3.3 Projecting Down to d-Dimensions

In [4]:
# weight matrix whose columns are the first two principal components
W2 = Vt[:2].T

# project the centered data onto the plane spanned by those two components
X2D = np.dot(X_centered, W2)

# show the projected 2D coordinates
print(X2D)

[[-1.26203346 -0.42067648]
 [ 0.08001485  0.35272239]
 [-1.17545763 -0.36085729]
 [-0.89305601  0.30862856]
 [-0.73016287  0.25404049]
 [ 1.10436914 -0.20204953]
 [-1.27265808 -0.46781247]
 [ 0.44933007 -0.67736663]
 [ 1.09356195  0.04467792]
 [ 0.66177325  0.28651264]
 [-1.04466138  0.11244353]
 [ 1.05932502 -0.31189109]
 [-1.13761426 -0.14576655]
 [-1.16044117 -0.36481599]
 [ 1.00167625 -0.39422008]
 [-0.2750406   0.34391089]
 [ 0.45624787 -0.69707573]
 [ 0.79706574  0.26870969]
 [ 0.66924929 -0.65520024]
 [-1.30679728 -0.37671343]
 [ 0.6626586   0.32706423]
 [-1.25387588 -0.56043928]
 [-1.04046987  0.08727672]
 [-1.26047729 -0.1571074 ]
 [ 1.09786649 -0.38643428]
 [ 0.7130973  -0.64941523]
 [-0.17786909  0.43609071]
 [ 1.02975735 -0.33747452]
 [-0.94552283  0.22833268]
 [ 0.80994916  0.33810729]
 [ 0.20189175  0.3514758 ]
 [-1.34219411 -0.42415687]
 [ 0.13599883  0.37258632]
 [ 0.8206931  -0.55120835]
 [ 0.90818634 -0.31869127]
 [ 0.06703671  0.42486148]
 [ 0.13936893  0.41906961]
 

### 3.4 Using Scikit-Learn

In [1]:
# Rebuild the noisy 3D dataset used as input for the Scikit-Learn PCA

# numpy supplies the random draws and the array arithmetic
import numpy as np

# same seed as before, so the same points are generated
np.random.seed(4)

# number of samples, plane weights for the third feature, and noise scale
m = 60
w1, w2 = 0.1, 0.3
noise = 0.1

# random angles covering three quarters of a circle, shifted by -0.5 rad
angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# noise is drawn for x, then y, then z, keeping the seeded stream order intact
x_coord = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
y_coord = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2
# the third feature is a weighted mix of the first two plus noise
z_coord = x_coord * w1 + y_coord * w2 + noise * np.random.randn(m)

# assemble the (m, 3) data matrix, one feature per column
X = np.column_stack((x_coord, y_coord, z_coord))

In [2]:
# import Scikit-Learn's PCA estimator
from sklearn.decomposition import PCA

# create a PCA object that keeps the first two principal components
pca = PCA(n_components=2)

# fit the PCA on X and project X onto the kept components in one call
# (X is passed as generated; no manual centering is done in this cell)
X2D = pca.fit_transform(X)


In [4]:
# show every principal component kept by the fitted PCA (one component per row)
print(pca.components_)

# the first row of components_ is the first principal component axis
print("The first PC axis is :{}".format(pca.components_[0]))

[[-0.93636116 -0.29854881 -0.18465208]
 [ 0.34027485 -0.90119108 -0.2684542 ]]
The first PC axis is :[-0.93636116 -0.29854881 -0.18465208]


### 3.5 Explained Variance Ratio

In [5]:
# show the explained variance ratio (EVR) of each component kept by the previously fitted PCA
print(pca.explained_variance_ratio_)

[0.84248607 0.14631839]


### 3.6 Choosing the Right Number of Dimensions

In [3]:
# import the libs
from sklearn.decomposition import PCA
import numpy as np

# fit a PCA without a component limit so the full variance spectrum is available
pca = PCA()
pca.fit(X)

# cumulative explained variance: entry k-1 is the share kept by the first k components
cumsum = np.cumsum(pca.explained_variance_ratio_)

# np.argmax gives the 0-based index of the first entry that reaches 0.95;
# add 1 to convert that index into a number of components
# (without the +1 the PCA below would keep one component too few and fall short of 95%)
d = np.argmax(cumsum >= 0.95) + 1

# create the PCA object that keeps only the first d components
pca = PCA(n_components=d)

# obtain the dimensionality-reduced training instances
X_reduced = pca.fit_transform(X)


### 3.7 PCA for Compression - MNIST

#### MNIST dataset loading

In [1]:
# import the libs for loading the MNIST data
from sklearn.datasets import fetch_openml
import numpy as np

# fetch the 'mnist_784' dataset (version 1) from OpenML; as_frame=False asks for
# array data instead of a pandas DataFrame
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# cast the labels to unsigned 8-bit integers
mnist.target = mnist.target.astype(np.uint8)

#### training data and test data splitting

In [2]:
# import the lib
from sklearn.model_selection import train_test_split

# data instances and their labels
X, y = mnist["data"], mnist["target"]

# split into training and test sets; random_state pins the shuffle so the split
# (and every result derived from X_train) is the same on each run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### X_reduced by PCA 

In [5]:
# import the libs
from sklearn.decomposition import PCA
import numpy as np

# fit a PCA without a component limit on the training instances only,
# so the test set does not influence the choice of d
pca = PCA()
pca.fit(X_train)

# cumulative explained variance: entry k-1 is the share kept by the first k components
cumsum = np.cumsum(pca.explained_variance_ratio_)

# index of the first entry reaching 0.95, plus 1 to turn the index into a component count
d = np.argmax(cumsum >= 0.95) + 1

# create the PCA object that keeps only the first d components
pca = PCA(n_components=d)

# obtain the dimensionality-reduced training instances
X_reduced = pca.fit_transform(X_train)

In [6]:
# display the number of components needed to reach the 0.95 cumulative variance threshold
d

154

#### X_recovered by inverse_transform()

In [8]:
# create a new PCA object that keeps 154 components
pca = PCA(n_components=154)

# compress: fit on the training set and project it onto the kept components
X_reduced = pca.fit_transform(X_train)

# decompress: map the reduced data back to the original feature space
X_recovered = pca.inverse_transform(X_reduced)

### 3.8 Incremental PCA 

#### MNIST dataset load

In [1]:
# import the libs for loading the MNIST data
from sklearn.datasets import fetch_openml
import numpy as np

# fetch the 'mnist_784' dataset (version 1) from OpenML; as_frame=False asks for
# array data instead of a pandas DataFrame
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# cast the labels to unsigned 8-bit integers
mnist.target = mnist.target.astype(np.uint8)

#### training data and test data splitting

In [2]:
# import the lib
from sklearn.model_selection import train_test_split

# data instances and their labels
X = mnist["data"]
y = mnist["target"]

# split into training and test sets; random_state pins the shuffle so the split
# (and every result derived from X_train) is the same on each run
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

#### IPCA implementation

In [4]:
# import the incremental variant of PCA
from sklearn.decomposition import IncrementalPCA

# IPCA object that keeps 154 components
inc_pca = IncrementalPCA(n_components=154)

# number of mini-batches the training set is cut into
n_batches = 100

# feed the training set to the IPCA one mini-batch at a time
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

# project the training set with the incrementally fitted model
X_reduced = inc_pca.transform(X_train)
    