Example:

If you perform PCA and get the following explained variances for 5 components:

PC1: 60% variance explained

PC2: 25% variance explained

PC3: 10% variance explained

PC4: 4% variance explained

PC5: 1% variance explained

The cumulative variance will be:

After PC1: 60%

After PC1 + PC2: 60% + 25% = 85%

After PC1 + PC2 + PC3: 85% + 10% = 95%

After PC1 + PC2 + PC3 + PC4: 95% + 4% = 99%

After PC1 + PC2 + PC3 + PC4 + PC5: 99% + 1% = 100%

In [1]:
from sklearn.decomposition import PCA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the normal filesystem. This cell relies on the google.colab package,
# so it is expected to work only inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Loading the data


In [13]:
# Directory on the mounted Drive that holds the extracted-feature archives
output_dir = '/content/drive/My Drive/Fabric Detection Project/Extracted Features'
data_path = os.path.join(output_dir, 'X.npz')
labels_path = os.path.join(output_dir, 'y.npz')
groups_path = os.path.join(output_dir, 'groups.npz')


def _load_npz_array(path, key):
    """Return the array stored under `key` in the .npz archive at `path`.

    np.load on a .npz file returns a lazy NpzFile that keeps the underlying
    file handle open. Reading the array inside a `with` block materialises it
    in memory and then closes the archive, so no handles are left dangling
    for the rest of the (long-running, memory-hungry) session.

    Raises:
        FileNotFoundError: if `path` does not exist.
        KeyError: if the archive has no entry named `key`.
    """
    with np.load(path) as archive:
        return archive[key]


# Each archive stores a single named array; pull them out as plain ndarrays.
X = _load_npz_array(data_path, 'data')
y = _load_npz_array(labels_path, 'labels')
groups = _load_npz_array(groups_path, 'groups')

n_components=1000

In [6]:
# Fit a PCA that keeps the first 1000 principal components of X.
# Per the scikit-learn docs, n_components may not exceed
# min(n_samples, n_features) - not just the number of features.
pca = PCA(n_components=1000)  # must be <= min(n_samples, n_features)
pca.fit(X)


In [7]:
# Fraction of the total variance explained by each kept component, in
# component order. Only the kept components are listed, so the values need
# not sum to 1.
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)


[0.07601781 0.00710605 0.0065163  0.00258245 0.00223257 0.00211498
 0.00200888 0.00196381 0.00187754 0.00178936 0.00167458 0.00165027
 0.00162976 0.00160925 0.00150117 0.00146715 0.00144702 0.00142698
 0.0013728  0.00134292 0.00131881 0.00129086 0.00127755 0.00125408
 0.00123941 0.00123763 0.00119497 0.00119246 0.00117103 0.00116289
 0.00115585 0.0011414  0.00112318 0.00111589 0.00111009 0.00109496
 0.00108581 0.00106784 0.00106441 0.0010538  0.00104224 0.00103674
 0.00102237 0.00101723 0.00100909 0.00099997 0.00098666 0.00098128
 0.00097248 0.00095481 0.00095137 0.00094397 0.00093658 0.00093282
 0.00092587 0.00091911 0.00091641 0.00090642 0.0008971  0.0008898
 0.000883   0.00087879 0.00087699 0.00087049 0.00085812 0.00084671
 0.00084387 0.00083923 0.00083333 0.00082715 0.00081911 0.00081688
 0.00081097 0.00080225 0.00079632 0.00079389 0.00078647 0.00078409
 0.00078204 0.0007752  0.00076842 0.00076311 0.00075597 0.00075569
 0.00074542 0.00074359 0.00073764 0.00073184 0.00072896 0.00072

In [8]:
# Running total of the per-component ratios: entry k is the share of variance
# explained by the first k+1 components taken together.
cumulative_variance = explained_variance_ratio.cumsum()
print(cumulative_variance)


[0.07601781 0.08312386 0.08964016 0.09222261 0.09445518 0.09657016
 0.09857904 0.10054286 0.10242039 0.10420975 0.10588434 0.1075346
 0.10916437 0.11077362 0.11227479 0.11374194 0.11518896 0.11661594
 0.11798874 0.11933166 0.12065047 0.12194133 0.12321888 0.12447295
 0.12571236 0.12695    0.12814496 0.12933742 0.13050846 0.13167135
 0.13282719 0.13396859 0.13509177 0.13620766 0.13731775 0.1384127
 0.13949851 0.14056636 0.14163076 0.14268456 0.1437268  0.14476354
 0.14578592 0.14680314 0.14781223 0.1488122  0.14979886 0.15078014
 0.15175262 0.15270742 0.1536588  0.15460277 0.15553935 0.15647217
 0.15739805 0.15831715 0.15923356 0.16013998 0.16103708 0.16192688
 0.16280988 0.16368867 0.16456566 0.16543615 0.16629427 0.16714098
 0.16798485 0.16882408 0.16965741 0.17048455 0.17130366 0.17212055
 0.17293152 0.17373377 0.17453009 0.17532398 0.17611045 0.17689454
 0.17767658 0.17845177 0.17922019 0.1799833  0.18073927 0.18149496
 0.18224038 0.18298397 0.18372161 0.18445345 0.18518241 0.185907

n_components=2000

In [4]:
# Fit a PCA that keeps the first 2000 principal components of X.
# Per the scikit-learn docs, n_components may not exceed
# min(n_samples, n_features) - not just the number of features.
pca = PCA(n_components=2000)  # must be <= min(n_samples, n_features)
pca.fit(X)

In [5]:
# Fraction of the total variance explained by each kept component, in
# component order. Only the kept components are listed, so the values need
# not sum to 1.
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

[0.07601781 0.00710605 0.0065163  ... 0.00010882 0.00010872 0.00010841]


In [6]:
# Running total of the per-component ratios; the last entry is the share of
# variance retained by all kept components together.
cumulative_variance = explained_variance_ratio.cumsum()
print(cumulative_variance)

[0.07601781 0.08312386 0.08964016 ... 0.59814471 0.59825343 0.59836184]


In [7]:
# Unbind the fitted PCA object before the next, larger fit - presumably so
# its memory can be reclaimed (the name `pca` is undefined until refit).
del pca

Took almost 18 GB

n_components=3000

In [8]:
# Fit a PCA that keeps the first 3000 principal components of X.
# Per the scikit-learn docs, n_components may not exceed
# min(n_samples, n_features) - not just the number of features.
pca = PCA(n_components=3000)  # must be <= min(n_samples, n_features)
pca.fit(X)

In [9]:
# Fraction of the total variance explained by each kept component, in
# component order. Only the kept components are listed, so the values need
# not sum to 1.
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

[7.60178053e-02 7.10605150e-03 6.51630052e-03 ... 7.89645288e-05
 7.88972141e-05 7.88542823e-05]


In [10]:
# Running total of the per-component ratios; the last entry is the share of
# variance retained by all kept components together.
cumulative_variance = explained_variance_ratio.cumsum()
print(cumulative_variance)

[0.07601781 0.08312386 0.08964016 ... 0.70717133 0.70725023 0.70732908]


In [14]:
# Unbind the fitted PCA object before the next fit - presumably so its
# memory can be reclaimed (the name `pca` is undefined until refit).
del pca

19.7 GB - 6 GB = 13.7 GB

In [15]:
# Fit a 3000-component PCA while explicitly requesting the randomized SVD
# solver; random_state=42 fixes the solver's seed so reruns are repeatable.
pca = PCA(n_components=3000, svd_solver='randomized', random_state=42)
pca.fit(X)

In [16]:
# Fraction of the total variance explained by each kept component, in
# component order. Only the kept components are listed, so the values need
# not sum to 1.
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

[7.60178053e-02 7.10605150e-03 6.51630052e-03 ... 7.90710079e-05
 7.89937924e-05 7.88421790e-05]


In [17]:
# Running total of the per-component ratios; the last entry is the share of
# variance retained by all kept components together.
cumulative_variance = explained_variance_ratio.cumsum()
print(cumulative_variance)

[0.07601781 0.08312386 0.08964016 ... 0.70716624 0.70724524 0.70732408]


1 or 2 GB more

In [18]:
# Fit a PCA that keeps the first 4000 principal components of X.
# Per the scikit-learn docs, n_components may not exceed
# min(n_samples, n_features) - not just the number of features.
pca = PCA(n_components=4000)  # must be <= min(n_samples, n_features)
pca.fit(X)

In [19]:
# Fraction of the total variance explained by each kept component, in
# component order. Only the kept components are listed, so the values need
# not sum to 1.
explained_variance_ratio = pca.explained_variance_ratio_
print(explained_variance_ratio)

[7.60178053e-02 7.10605150e-03 6.51630052e-03 ... 5.86426635e-05
 5.85754638e-05 5.84729513e-05]


In [20]:
# Running total of the per-component ratios; the last entry is the share of
# variance retained by all kept components together.
cumulative_variance = explained_variance_ratio.cumsum()
print(cumulative_variance)

[0.07601781 0.08312386 0.08964016 ... 0.78759016 0.78764874 0.78770721]


2GB more