# PCA Demo
This demo illustrates how SIMBSIG can be used for Principal Component Analysis (PCA), and how the use compares to scikit-learn. 

In [1]:
!pip install simbsig



In [2]:
from simbsig.decomposition import PCA
from sklearn.decomposition import PCA as PCA_sk
import h5py as h5py
import numpy as np
import os

## Set Parameters

In [3]:
np.random.seed(98)
n_samples = 20
n_dim = 3

## Create Toy Data
### numpy arrays

In [4]:
# numpy arrays
X = np.random.uniform(low=-5, high=5, size=(n_samples, n_dim))

### hdf5 files
#### Google Colab
If you work from google colab, you can execute the statement below: Of course you can also store the data in a directory more suitable to you.

In [5]:
# when working on colab, google drive can be used to save and read data
from google.colab import drive
drive.mount('/content/drive')

# depending on the structure of your google drive, you might want to choose a different dataset path
dataset_path = '/content/drive/MyDrive/'

ModuleNotFoundError: No module named 'google'

#### Your Computer
Alternatively, if you work from your computer, you can execute the statement below:
Of course you can also store the data in a directory more suitable to you.

In [6]:
# when working from your computer, your disk can be used to save and read data
import os
dataset_path = os.path.dirname(os.path.realpath("__file__"))

In [7]:
# hdf5 files using h5py
X_file = f'train.hdf5'

with h5py.File(os.path.join(dataset_path, f"{X_file}"), 'w') as f:
    f.create_dataset("X", data=X)

## Scikit-learn


In [8]:
pca_sk = PCA_sk(n_components=2)

pca_sk.fit(X)
pca_sk.transform(X)

array([[-1.58473104,  0.79893721],
       [-0.92822381,  0.28984871],
       [ 1.80687758,  0.74262492],
       [ 4.08544446,  1.71724538],
       [-3.95675763, -3.44066341],
       [-4.43290221, -2.36003492],
       [-4.27286936, -2.0032996 ],
       [-1.62268984,  5.66375729],
       [ 6.15953168,  0.132802  ],
       [-5.54501202,  2.07147302],
       [-3.02132808, -0.12196357],
       [-1.30904908, -3.21932466],
       [ 2.49030777,  2.06508366],
       [-5.72565927,  3.44920117],
       [ 4.07681933, -1.86069684],
       [ 5.19637105,  3.06962547],
       [ 3.17831714, -2.71930587],
       [-0.36333476,  0.62327764],
       [ 1.06274721, -3.83762988],
       [ 4.70614087, -1.0609577 ]])

## SIMBSIG
### Using numpy arrays and CPU only
SIMBSIG can be used very similar to scikit-learn. In an existing workflow using scikit-learn, which may be on the verge of exceeding runtime or memory requirements, this allows a seamless transition to SIMBSIG. Notice that when using different methods for PCA, the principal components may have a different sign.


In [9]:
pca = PCA(n_components=2)

pca.fit(X)
pca.transform(X)

                                                             

array([[-1.584731  , -0.79894   ],
       [-0.9282242 , -0.28985322],
       [ 1.8068779 , -0.74262524],
       [ 4.0854454 , -1.7172432 ],
       [-3.956759  ,  3.4406645 ],
       [-4.4329033 ,  2.3600342 ],
       [-4.27287   ,  2.0032988 ],
       [-1.6226879 , -5.6637545 ],
       [ 6.159532  , -0.13280044],
       [-5.545012  , -2.0714743 ],
       [-3.0213284 ,  0.1219622 ],
       [-1.3090497 ,  3.2193284 ],
       [ 2.490309  , -2.0650814 ],
       [-5.7256584 , -3.4492006 ],
       [ 4.076819  ,  1.8606944 ],
       [ 5.196372  , -3.069625  ],
       [ 3.1783173 ,  2.719311  ],
       [-0.36333475, -0.62327933],
       [ 1.0627458 ,  3.837627  ],
       [ 4.706141  ,  1.0609591 ]], dtype=float32)

### Using hdf5 files and CPU only
If saving the entire data at once in the computer memory using numpy arrays is not reasonable anymore, the hdf5 file format can help. SIMBSIG can use data in hdf5 files, by setting the `mode` argument to `cpu`

In [10]:
pca_hdf5 = PCA(n_components=2, mode='hdf5')

# open the hdf5 file for use
X_data = h5py.File(os.path.join(dataset_path, X_file), 'r')

pca_hdf5.fit(X_data)
trafo = pca_hdf5.transform(X_data)

# close hdf5 file
X_data.close()

trafo

                                                             

array([[-1.584731  , -0.798935  ],
       [-0.92822397, -0.28984517],
       [ 1.8068776 , -0.7426246 ],
       [ 4.0854445 , -1.7172467 ],
       [-3.9567575 ,  3.440663  ],
       [-4.4329023 ,  2.3600357 ],
       [-4.272869  ,  2.0033002 ],
       [-1.6226906 , -5.6637588 ],
       [ 6.159532  , -0.13280292],
       [-5.5450125 , -2.0714724 ],
       [-3.0213282 ,  0.12196495],
       [-1.3090488 ,  3.2193215 ],
       [ 2.490308  , -2.0650852 ],
       [-5.7256603 , -3.4492004 ],
       [ 4.07682   ,  1.8606987 ],
       [ 5.1963706 , -3.0696254 ],
       [ 3.1783178 ,  2.719302  ],
       [-0.36333486, -0.6232762 ],
       [ 1.0627477 ,  3.8376315 ],
       [ 4.7061415 ,  1.0609567 ]], dtype=float32)

### Using GPU acceleration
If data gets big, the execution time becomes an issue. SIMBSIG features GPU acceleration, by setting the `device` argument to `gpu`. This works with both inputs, numpy arrays and hdf5 files.

In [16]:
pca_hdf5 = PCA(n_components=2, mode='hdf5', device='gpu')

# open the hdf5 file for use
X_data = h5py.File(os.path.join(dataset_path, X_file), 'r')

pca_hdf5.fit(X_data)
trafo = pca_hdf5.transform(X_data)

# close hdf5 file
X_data.close()

trafo

AssertionError: Torch not compiled with CUDA enabled