In [None]:
import numpy as np
import pandas as pd

import plotly.express as px

# PCA, step by step

## 0. Original data



In [None]:
def data_generating_process(n, t, locs, scales, seed=None):
    """Sample a 2-D Gaussian cloud with diagonal covariance and rotate it by ``t``.

    Parameters
    ----------
    n : int
        Number of points to draw.
    t : float
        Rotation angle, handed directly to ``np.cos`` / ``np.sin`` (radians).
    locs : sequence of 2 floats
        Mean of the Gaussian before rotation.
    scales : sequence of 2 floats
        Diagonal of the covariance matrix (per-axis variances) before rotation.
    seed : int, optional
        When given, seeds NumPy's global random state before sampling.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2, n)``; each column is one rotated sample.
    """
    if seed is not None:
        np.random.seed(seed)

    # Counter-clockwise 2x2 rotation matrix for angle t.
    cos_t, sin_t = np.cos(t), np.sin(t)
    rotation = np.array([[cos_t, -sin_t], [sin_t, cos_t]])

    # Samples come back as (n, 2); transpose so the rotation acts on columns.
    samples = np.random.multivariate_normal(mean=locs, cov=np.diag(scales), size=n)
    return rotation @ samples.T

# Generating-process settings: sample count, rotation angle, and the
# pre-rotation Gaussian mean / covariance diagonal.
n, t = 1000, 251
locs, scales = (2, 5), (11, 3)
X = data_generating_process(n, t, locs, scales, seed=0)

# X[0] and X[1] are the two coordinate rows returned by the generator.
fig = px.scatter(x=X[0], y=X[1], opacity=0.5)
# Anchor the y scale to x so both axes use the same units on screen.
fig.update_yaxes(scaleanchor="x")



## 1. Center the data

In [None]:
# Subtract each coordinate row's mean; keepdims leaves a (rows, 1) column
# that broadcasts across all samples.
row_means = X.mean(axis=1, keepdims=True)
X_center = X - row_means
fig = px.scatter(x=X_center[0], y=X_center[1], opacity=0.5)
fig.update_yaxes(scaleanchor="x")

## 2. Rotate the data

In [None]:
t = 251
# Rotating by -t undoes the rotation by t applied when the data was generated.
cos_back, sin_back = np.cos(-t), np.sin(-t)
undo_rotation = np.array([
    [cos_back, -sin_back],
    [sin_back, cos_back],
])
X_center_rotate = undo_rotation @ X_center
fig = px.scatter(x=X_center_rotate[0], y=X_center_rotate[1], opacity=0.5)
fig.update_yaxes(scaleanchor="x")

## 3. Scale the data

In [None]:
# `scales` was used as the covariance diagonal when generating the data, so
# its square roots are the per-axis standard deviations. Shaping them as a
# column makes the division act row by row.
V = np.sqrt(np.asarray(scales)).reshape(-1, 1)
X_center_rotate_scale = X_center_rotate / V
fig = px.scatter(
    x=X_center_rotate_scale[0],
    y=X_center_rotate_scale[1],
    opacity=0.5,
)
fig.update_yaxes(scaleanchor="x")

## 4. _ta-da!_ we did PCA

In [None]:
from sklearn.decomposition import PCA

# whiten=True selects the scaling convention: per the scikit-learn PCA docs,
# the projected components are rescaled to unit variance.
pca = PCA(n_components=2, whiten=True)
# X stores one sample per column, so transpose to one sample per row before fitting.
X_pca = pca.fit_transform(X.T)
pc_first, pc_second = X_pca[:, 0], X_pca[:, 1]
fig = px.scatter(x=pc_first, y=pc_second, opacity=0.5)
fig.update_traces(marker={"color": "teal"})
fig.update_yaxes(scaleanchor="x")

# PCA example: radial data, again

In [None]:
np.random.seed(0)

# Three radial bands with n_cat points each; the bands differ only in radius.
n_cat = 150
r0 = np.random.uniform(0, 1, n_cat)
r1 = np.random.uniform(3, 5, n_cat)
r2 = np.random.uniform(8, 10, n_cat)

r = np.concatenate((r0, r1, r2))
t = np.random.uniform(0, 2*np.pi, n_cat*3)

# Polar -> cartesian, plus a z coordinate of standard-normal noise drawn
# independently of r and t.
x = r * np.cos(t)
y = r * np.sin(t)
z = np.random.normal(size=3*n_cat)

# Labels follow the concatenation order of r0, r1, r2.
label = (["A"]*n_cat) + (["B"]*n_cat) + (["C"]*n_cat)

data = pd.DataFrame({"x": x, "y": y, "label": label, "r": r, "t": t, "z": z})
fig = px.scatter_3d(data, x="x", y="y", z="z")
# update_yaxes(scaleanchor=...) only configures 2-D cartesian axes and does
# nothing for a 3-D scene; scene.aspectmode="data" is the 3-D way to draw all
# three axes in the same units.
fig.update_layout(scene_aspectmode="data")

In [None]:
# Project the cartesian (x, y, z) columns onto their first two principal components.
cartesian_features = data[["x", "y", "z"]]
X_pca_cartesian = PCA(n_components=2).fit_transform(cartesian_features)
fig = px.scatter(
    x=X_pca_cartesian[:, 0],
    y=X_pca_cartesian[:, 1],
)
fig.update_yaxes(scaleanchor="x")
fig