# 03 - PCA Feature Extraction

This notebook builds a per-region feature matrix from XRF elemental maps,
standardizes it, and applies PCA to reduce dimensionality.

1. Build a feature matrix (mean intensity per element per region)
2. Standardize features (z-score)
3. Fit PCA and choose number of components
4. Visualize principal components and loadings

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

%matplotlib inline

## Synthetic data

We simulate a feature matrix representing 50 segmented regions
across 10 elemental channels, mimicking the output of the
segmentation pipeline.

In [None]:
# Fixed seed so the simulated matrix is identical on every run.
rng = np.random.default_rng(7)

element_names = ["Fe", "Cu", "Zn", "Ca", "K", "P", "S", "Cl", "Mn", "Ti"]
n_elements = len(element_names)
n_regions = 50
n_groups = 3

# Each latent group gets its own elemental profile; every region is then
# assigned to one group and perturbed with Gaussian noise.
group_centers = rng.uniform(1, 10, size=(n_groups, n_elements))
labels_true = rng.choice(n_groups, size=n_regions)
noise = rng.normal(0, 0.8, size=(n_regions, n_elements))

# Clamp at zero so no simulated value is negative.
X_raw = (group_centers[labels_true] + noise).clip(min=0)

print(f"Feature matrix shape: {X_raw.shape}  (regions x elements)")

## 1. Build the feature matrix

In a real workflow the feature matrix is assembled by computing
`regionprops` mean intensities for each elemental map. Here we
already have `X_raw` from the synthetic step above.

In [None]:
# Summarize each element column of X_raw by its mean and standard deviation
# taken over all regions.
print("Per-element means across all regions:")
for col_idx, name in enumerate(element_names):
    column = X_raw[:, col_idx]
    print(f"  {name}: {column.mean():.3f} +/- {column.std():.3f}")

## 2. Standardize (z-score)

In [None]:
# Z-score each element column: learn the per-column statistics from X_raw,
# then apply them to the same matrix.
scaler = StandardScaler().fit(X_raw)
X_std = scaler.transform(X_raw)

# Sanity check on the per-column mean and standard deviation after scaling.
col_means = X_std.mean(axis=0).round(6)
col_stds = X_std.std(axis=0).round(4)
print(f"After standardization - mean: {col_means}")
print(f"After standardization - std:  {col_stds}")

## 3. PCA

In [None]:
# Fraction of total variance the retained components must explain.
# Defined once so the reference line, its label, the selection, and the
# printed summary cannot drift apart.
VARIANCE_THRESHOLD = 0.95

# Fit with every component kept so the full variance spectrum is available.
pca = PCA()
X_pca = pca.fit_transform(X_std)

# Take the x-axis from the fitted spectrum rather than from n_elements:
# PCA() keeps min(n_regions, n_elements) components, so the two lengths
# differ (and the plots fail) whenever there are fewer regions than elements.
component_idx = np.arange(1, len(pca.explained_variance_ratio_) + 1)
cumvar = np.cumsum(pca.explained_variance_ratio_)

# Scree plot (left) and cumulative explained variance (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

axes[0].bar(component_idx, pca.explained_variance_ratio_)
axes[0].set_xlabel("Principal Component")
axes[0].set_ylabel("Explained Variance Ratio")
axes[0].set_title("Scree Plot")

axes[1].plot(component_idx, cumvar, "o-")
axes[1].axhline(VARIANCE_THRESHOLD, ls="--", color="red",
                label=f"{VARIANCE_THRESHOLD:.0%} threshold")
axes[1].set_xlabel("Number of Components")
axes[1].set_ylabel("Cumulative Variance")
axes[1].set_title("Cumulative Explained Variance")
axes[1].legend()

plt.tight_layout()
plt.show()

# cumvar is non-decreasing, so argmax on the boolean mask gives the index of
# the first entry at or above the threshold; +1 converts it to a count.
n_keep = np.argmax(cumvar >= VARIANCE_THRESHOLD) + 1
print(f"Components needed for {VARIANCE_THRESHOLD:.0%} variance: {n_keep}")

## 4. Visualize PC space and loadings

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# PC1 vs PC2 scatter, colored by the group label each region was simulated from
scatter = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=labels_true,
                          cmap="Set1", edgecolors="k", s=60)
axes[0].set_xlabel("PC1")
axes[0].set_ylabel("PC2")
axes[0].set_title("Regions in PC space")
# One legend entry per distinct label so the colors can be read off the figure.
axes[0].legend(*scatter.legend_elements(), title="True group")

# Element loadings: the first two rows of pca.components_, transposed so each
# row holds one element's (PC1, PC2) weights.
loadings = pca.components_[:2].T  # (n_elements, 2)
for i, name in enumerate(element_names):
    axes[1].arrow(0, 0, loadings[i, 0], loadings[i, 1],
                  head_width=0.03, head_length=0.02, fc="steelblue", ec="steelblue")
    # Place the label slightly beyond the arrow tip so it does not overlap the head.
    axes[1].text(loadings[i, 0] * 1.12, loadings[i, 1] * 1.12, name,
                 fontsize=9, ha="center")
axes[1].set_xlabel("PC1 loading")
axes[1].set_ylabel("PC2 loading")
axes[1].set_title("Element loadings")
# Thin reference lines through the origin.
axes[1].axhline(0, color="gray", lw=0.5)
axes[1].axvline(0, color="gray", lw=0.5)

plt.tight_layout()
plt.show()