<a href="https://colab.research.google.com/github/AashiDutt/Hands-on-Machine-Learning-with-sklearn-keras-and-tensorflow/blob/main/Dimensionlity_reduction_Chapter_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dimensionality reduction techniques

1. PCA - Principal Component Analysis
   PCA first identifies the hyperplane that lies closest to the data, then projects the data onto it.

   In short, PCA picks the axis that minimizes the mean squared distance between the original dataset and its projection onto that axis (equivalently, the axis that preserves the most variance).

In [2]:
# Finding the principal components
# Singular Value Decomposition (SVD) factorizes the matrix X into the product
# of three matrices (U, Sigma, V^T); V holds the unit vectors that define
# all the principal components.

# Build a small 3D dataset to experiment with
import numpy as np

np.random.seed(4)

m = 60              # number of samples
w1, w2 = 0.1, 0.3   # weights tying the third feature to the first two
noise = 0.1         # noise scale

angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5

# The random draws happen in column order (0, 1, 2), so the seeded
# sequence is consumed in a fixed, reproducible order.
x0 = np.cos(angles) + np.sin(angles) / 2 + noise * np.random.randn(m) / 2
x1 = np.sin(angles) + 0.7 + noise * np.random.randn(m) / 2
x2 = x0 * w1 + x1 * w2 + noise * np.random.randn(m)

X = np.column_stack((x0, x1, x2))

In [3]:
# Center the data on the origin by subtracting the per-feature mean,
# then decompose the centered matrix with SVD.
feature_means = X.mean(axis=0)
X_centered = X - feature_means
U, s, Vt = np.linalg.svd(X_centered)

# Row i of Vt (i.e. column i of Vt.T) is the i-th principal component.
c1, c2 = Vt[0], Vt[1]

In [7]:
# Project the centered training set onto the plane spanned by the first two
# principal components. Keeping the first d columns of Vt.T reduces the data
# to d dimensions while preserving as much variance as possible.
W2 = Vt[:2].T
X2D = X_centered @ W2
X2D[:5]


array([[-1.50202733, -0.44169998],
       [ 0.24659144,  0.48321221],
       [-1.40746895, -0.40506286],
       [-0.81513127,  0.47807754],
       [-0.65530869,  0.40920943]])

# PCA using Scikit-Learn

In [5]:
# Same reduction with scikit-learn: fit a two-component PCA on X and
# project X onto those components in a single call.

from sklearn.decomposition import PCA

pca = PCA(n_components=2)  # keep the first two principal components (c1, c2)
X2D = pca.fit_transform(X)

In [6]:
# First five projected samples. Compared with the manual SVD projection shown
# earlier, every sign is flipped: a principal axis is only defined up to its
# direction, so both results describe the same projection plane.
X2D[:5]

array([[ 1.50202733,  0.44169998],
       [-0.24659144, -0.48321221],
       [ 1.40746895,  0.40506286],
       [ 0.81513127, -0.47807754],
       [ 0.65530869, -0.40920943]])

In [9]:
# explained_variance_ratio_ gives the proportion of the dataset's variance
# that lies along each of the fitted principal components

pca.explained_variance_ratio_ 

array([0.77778267, 0.21304902])

This shows that about 77.8% of the dataset's variance lies along the 1st principal component (PC) and about 21.3% along the 2nd PC. The remaining ~0.9% lies along the 3rd PC, so that component carries the least information.




# Choosing the right number of dimensions

In [10]:
from sklearn.datasets import fetch_openml

# Download MNIST from OpenML as plain arrays (as_frame=False) and cast the
# labels to unsigned 8-bit integers.
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
mnist["target"] = mnist["target"].astype(np.uint8)


In [11]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

# Pin the random state so the split, and everything derived from it (such as
# the number of components needed for 95% variance), is reproducible no matter
# which cells ran before this one. Without it the shuffle draws from NumPy's
# global RNG, whose state depends on the notebook's execution history.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)



In [12]:
# Fit PCA with every component kept, then find the smallest number of
# dimensions whose cumulative explained variance reaches 95%.
pca = PCA().fit(X_train)
cumsum = pca.explained_variance_ratio_.cumsum()
d = 1 + np.argmax(cumsum >= 0.95)  # argmax is a 0-based index, hence the +1

In [13]:
# Number of dimensions needed to preserve at least 95% of the training set's variance
d

154

In [14]:
# Passing a float between 0 and 1 asks PCA to keep as many components as are
# needed to explain that fraction of the variance (95% here), instead of
# specifying the number of components directly.
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X_train)

In [15]:
# Invert the dimensionality reduction: compress the training set to 154
# dimensions, then map it back to the original feature space.

pca = PCA(n_components=154)
X_reduced = pca.fit_transform(X_train)
X_recovered = pca.inverse_transform(X_reduced)

In [16]:
# Note on the reconstruction in the previous cell: it is lossy, because the
# variance dropped by the projection cannot be recovered.

# Randomized PCA: the "randomized" solver uses a stochastic algorithm to
# approximate the first n_components principal components. random_state is
# pinned so that repeated runs give the same result.
rnd_pca = PCA(n_components=154, svd_solver="randomized", random_state=42)
X_reduced = rnd_pca.fit_transform(X_train)

In [17]:
# Incremental PCA: split a large training set into mini-batches and feed them
# to the model one at a time, so PCA can be applied online.

from sklearn.decomposition import IncrementalPCA

n_batches = 100
inc_pca = IncrementalPCA(n_components=154)

# partial_fit updates the model with each mini-batch in turn
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

X_reduced = inc_pca.transform(X_train)

In [None]:
# Kernel PCA: applies the kernel trick to PCA so that nonlinear projections
# become possible.
# NOTE(review): if X is still the full MNIST matrix assigned earlier
# (X = mnist["data"]), the n_samples x n_samples kernel matrix will probably
# not fit in memory; confirm which dataset this cell is meant to run on.

from sklearn.decomposition import KernelPCA

rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)

In [2]:
# Use grid search to find the kernel and gamma value that give the best
# cross-validated classification accuracy for the kPCA + logistic regression
# pipeline.
# This cell imports every library name it uses (including numpy and KernelPCA)
# so it no longer fails with a NameError when the cells that first imported
# them have not been run. X and y must still be defined by an earlier cell.

import numpy as np
from sklearn.decomposition import KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Two-step pipeline: reduce to 2 dimensions with kernel PCA, then classify
clf = Pipeline([
    ("kpca", KernelPCA(n_components=2)),
    ("log_reg", LogisticRegression()),
])

# "<step>__<param>" addresses a parameter of a named pipeline step
param_grid = [{
    "kpca__gamma": np.linspace(0.03, 0.05, 10),
    "kpca__kernel": ["rbf", "sigmoid"],
}]

grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)

print(grid_search.best_params_)


NameError: ignored

In [None]:
# Refit RBF kernel PCA with fit_inverse_transform=True so that points in the
# reduced space can be mapped back to the original space (pre-images).
rbf_pca = KernelPCA(n_components=2, kernel="rbf", gamma=0.0433, fit_inverse_transform=True)
# Fixed: the model is named rbf_pca; the previous code called the undefined name `rbf`.
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)

In [None]:
# Reconstruction pre-image error: compare the original points with their
# pre-images (which have the same shape as X), not with the 2-D projection
# X_reduced, whose shape does not match X.
from sklearn.metrics import mean_squared_error

mean_squared_error(X, X_preimage)

In [None]:
# LLE (Locally Linear Embedding): reduce X to 2 dimensions using each
# sample's 10 nearest neighbors.

from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_neighbors=10, n_components=2)
X_reduced = lle.fit_transform(X)