# Dimensionality Reduction

In [1]:
import numpy as np


# Create a 3D dataset with 100 samples
# Each sample has 3 features (x, y, z)
X = np.random.rand(100, 3)  # Random values between 0 and 1

print(X[:5])  # Display the first 5 samples

[[0.05246491 0.25529052 0.16923047]
 [0.13353044 0.95626236 0.65876455]
 [0.30544921 0.72159398 0.69822434]
 [0.87279322 0.73824913 0.99376109]
 [0.43095919 0.57000067 0.03808895]]


In [2]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
x_2D = pca.fit_transform(X)
x_2D

array([[-0.4407131 , -0.37520217],
       [-0.24898622,  0.43655864],
       [-0.12267151,  0.3026938 ],
       [ 0.43066388,  0.48754529],
       [-0.00895368, -0.31301718],
       [ 0.34145367, -0.16146949],
       [ 0.40736643, -0.08820988],
       [ 0.12588442, -0.12692892],
       [ 0.08770057, -0.06959405],
       [ 0.18406471,  0.05497621],
       [ 0.13225008, -0.32216954],
       [ 0.24538149,  0.26704695],
       [ 0.46159005, -0.19077682],
       [ 0.13056031, -0.19604653],
       [-0.3365981 ,  0.20582878],
       [ 0.39678788, -0.05154614],
       [ 0.16527847,  0.0087418 ],
       [ 0.08210423,  0.44217019],
       [-0.20940139, -0.02933025],
       [-0.29218922, -0.18224127],
       [-0.12715374,  0.18583519],
       [ 0.12444929,  0.14776459],
       [-0.48609931, -0.23590516],
       [-0.18179958, -0.37363287],
       [-0.00377939, -0.14280171],
       [ 0.14034586, -0.05891119],
       [-0.02321209, -0.20399925],
       [-0.37386799,  0.45193934],
       [ 0.26604882,

In [3]:
pca.explained_variance_ratio_

array([0.35831813, 0.33823647])

In [4]:
from sklearn.datasets import fetch_openml


mnist = fetch_openml('mnist_784', as_frame=False)
X_train, y_train = mnist.data[:60_000], mnist.target[:60_000]
X_test, y_test = mnist.data[60_000:], mnist.target[60_000:]

pca = PCA()
pca.fit(X_train)

In [5]:
cumsum  = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum>=0.95)+1


In [6]:
pca = PCA(n_components=0.95)
x_reduced = pca.fit_transform(X_train)
pca.n_components_

np.int64(154)

In [7]:
x_reduced

array([[ 123.93258866,  312.67426202,   24.51405176, ...,   55.01899792,
          20.08327427,   39.58995229],
       [1011.71837587,  294.85703827, -596.33956104, ...,    7.24129874,
          12.45780869,  -12.7432306 ],
       [ -51.84960805, -392.17315286,  188.50974943, ...,  -54.19582221,
         -48.47979747,  -73.27826256],
       ...,
       [-178.0534496 , -160.07821109,  257.61308227, ...,   55.54485537,
         -87.99883556,   -5.78979735],
       [ 130.60607208,    5.59193642, -513.85867395, ...,   23.30835402,
          -5.06237836,  -65.26525587],
       [-173.43595244,   24.71880226, -556.01889393, ...,   52.4956069 ,
         -12.63192292,  -45.74001227]], shape=(60000, 154))

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline

clf = make_pipeline(
    PCA(random_state=42),
    RandomForestClassifier(random_state=42)
)

param_dist = {
    "pca__n_components": np.arange(10, 80),
    "randomforestclassifier__n_estimators": np.arange(50, 500)
}

rnd_search = RandomizedSearchCV(clf, param_dist, n_iter=10, cv=3, random_state=42, n_jobs=-1)
rnd_search

In [9]:
rnd_search.fit(X_train[:1000], y_train[:1000])

In [10]:
rnd_search.best_params_

{'randomforestclassifier__n_estimators': np.int64(475),
 'pca__n_components': np.int64(57)}

In [11]:
pca.inverse_transform(x_reduced)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(60000, 784))

In [13]:
rnd_pcs = PCA(svd_solver = 'randomized', random_state=42, n_components=154)
X_reduced = rnd_pcs.fit_transform(X_train)
X_reduced

array([[ 123.93258864,  312.67426198,   24.51405174, ...,   62.00213296,
          -8.8147422 ,  -66.93993166],
       [1011.71837586,  294.85703831, -596.33956108, ...,   24.52514836,
          26.58534428,   16.99077095],
       [ -51.84960804, -392.17315289,  188.50974941, ...,    8.99144972,
          -2.99473092,   56.93622984],
       ...,
       [-178.0534496 , -160.0782111 ,  257.61308227, ...,  -35.30439525,
          -2.75142691,   23.97581712],
       [ 130.60607212,    5.59193632, -513.85867376, ...,   15.84132904,
         -18.38612585,   39.40742042],
       [-173.43595246,   24.71880228, -556.01889398, ...,  -29.62816702,
         -52.61652274,   27.99524134]], shape=(60000, 154))

In [15]:
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_train, n_batches):
    inc_pca.partial_fit(X_batch)

X_reduced = inc_pca.transform(X_train)

In [16]:
X_reduced

array([[ 123.93240015,  312.67412765,   24.51396855, ...,   55.02414967,
         -18.82319306,   57.12605157],
       [1011.71883902,  294.85791533, -596.3396284 , ...,   40.79115354,
         -28.52753525,  -32.93944347],
       [ -51.84977972, -392.17395257,  188.50798593, ...,   18.5109603 ,
         -75.96611653,   -7.67736302],
       ...,
       [-178.0534095 , -160.07838721,  257.61233558, ...,  -57.3811145 ,
           6.70673288,  -54.26797595],
       [ 130.60654125,    5.59174593, -513.85834969, ...,  -22.43044205,
          12.51568244,  -36.3004746 ],
       [-173.43566358,   24.71937319, -556.01892138, ...,  -48.33215133,
          19.2936437 ,  -30.58306681]], shape=(60000, 154))

In [19]:
from sklearn.random_projection import johnson_lindenstrauss_min_dim
m=5000
e=0.1
d = johnson_lindenstrauss_min_dim(n_samples=m, eps = e)
d

np.int64(7300)

In [20]:
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

X_swiss, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10, random_state=42)
X_unrolled = lle.fit_transform(X_swiss)
X_unrolled

array([[ 0.01083493, -0.04071144],
       [-0.05152258,  0.04027771],
       [-0.02360686,  0.0147381 ],
       ...,
       [ 0.03497162, -0.0179331 ],
       [-0.05313366,  0.02821268],
       [ 0.00434517, -0.04424245]], shape=(1000, 2))