# 03 - PCA Feature Extraction

After segmentation we extract per-region features and reduce dimensionality
with PCA. Steps:

1. Build a feature matrix (mean elemental concentration per region)
2. Standardize features
3. Run PCA
4. Visualize principal components and loadings

In [None]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from skimage.filters import threshold_otsu
from skimage.morphology import binary_opening, binary_closing, disk
from skimage.measure import label, regionprops

%matplotlib inline

## 1. Load Data and Segment (reuse pipeline from notebook 02)

In [None]:
# Location of the XRF HDF5 file, relative to this notebook.
DATA_PATH = "../data/sample_xrf.h5"

# Read the channel names (decoded to str) and the full stack of maps into memory.
with h5py.File(DATA_PATH, "r") as f:
    raw_names = f["/MAPS/channel_names"][:]
    all_maps = f["/MAPS/XRF_fits"][:]
names = [raw.decode() for raw in raw_names]

# Segment on the Zn channel: Otsu threshold, then morphological opening
# followed by closing with the same disk-shaped structuring element.
ref_idx = names.index("Zn")
ref_map = all_maps[ref_idx]
otsu_level = threshold_otsu(ref_map)
binary = ref_map > otsu_level
selem = disk(3)
opened = binary_opening(binary, selem)
cleaned = binary_closing(opened, selem)

# Label connected components and keep only regions whose area lies within
# the inclusive [MIN_AREA, MAX_AREA] bounds.
labels = label(cleaned)
MIN_AREA, MAX_AREA = 50, 5000
regions = [
    region
    for region in regionprops(labels)
    if MIN_AREA <= region.area <= MAX_AREA
]

print(f"Regions: {len(regions)}, Elements: {len(names)}")

## 2. Build Feature Matrix

For each region, compute the mean concentration of every element.

In [None]:
# Build the (n_regions, n_elements) matrix of per-region mean values:
# row i is region i, column j is the element named names[j].
n_regions = len(regions)
n_elements = len(names)
feature_matrix = np.zeros((n_regions, n_elements))

for i, r in enumerate(regions):
    # Boolean image selecting the pixels that carry this region's label.
    mask = labels == r.label
    # Index every element map with the mask in one step instead of looping
    # over elements in Python. The result has one row per element and one
    # column per region pixel, so the row means are the per-element means.
    feature_matrix[i] = all_maps[:n_elements, mask].mean(axis=1)

print(f"Feature matrix shape: {feature_matrix.shape}")

## 3. Standardize and Run PCA

In [None]:
# PCA needs at least two samples to estimate any variance; fail early with a
# clear message rather than an opaque error from scikit-learn.
n_samples, n_features = feature_matrix.shape
if n_samples < 2:
    raise ValueError(
        f"PCA needs at least 2 regions (rows of feature_matrix); got {n_samples}"
    )

# Standardize each feature column so that no single element dominates
# the decomposition purely because of its scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_matrix)

# scikit-learn requires n_components <= min(n_samples, n_features), so the
# cap of 10 is clamped by the number of regions as well as the number of
# elements. Clamping by elements alone raises when few regions survive.
n_components = min(10, n_samples, n_features)
pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

print("Explained variance ratio:")
for i, v in enumerate(pca.explained_variance_ratio_):
    print(f"  PC{i+1}: {v:.4f} ({v*100:.1f}%)")

## 4. Scree Plot and 2-D Projection

In [None]:
# Two panels side by side: scree plot on the left, PC1/PC2 scatter on the right.
fig, axes = plt.subplots(1, 2, figsize=(13, 5))
scree_ax, scatter_ax = axes

# Scree plot: per-component explained-variance ratio as bars, with the
# running cumulative total overlaid as a red line.
var_ratio = pca.explained_variance_ratio_
cumvar = np.cumsum(var_ratio)
pc_numbers = range(1, len(cumvar) + 1)
scree_ax.bar(pc_numbers, var_ratio, label="Individual")
scree_ax.plot(pc_numbers, cumvar, "ro-", label="Cumulative")
scree_ax.set(
    xlabel="Principal Component",
    ylabel="Explained Variance",
    title="Scree Plot",
)
scree_ax.legend()

# 2-D scatter of the regions projected onto the first two components.
pc1_scores = X_pca[:, 0]
pc2_scores = X_pca[:, 1]
scatter_ax.scatter(pc1_scores, pc2_scores, s=20, alpha=0.7)
scatter_ax.set(xlabel="PC1", ylabel="PC2", title="Regions in PC1-PC2 Space")

plt.tight_layout()
plt.show()

## 5. Loadings Plot

This plot shows which elements drive the first two principal components (PC1 and PC2).

In [None]:
# Keep only the first two rows of pca.components_ (PC1 and PC2).
loadings = pca.components_[:2]

# Shared appearance for every arrow, and how far past the arrow end the
# element label is placed.
arrow_style = dict(head_width=0.02, head_length=0.01, fc="steelblue", ec="steelblue")
label_scale = 1.12

fig, ax = plt.subplots(figsize=(8, 6))
for idx, element in enumerate(names):
    # One arrow per element, from the origin to its (PC1, PC2) loading.
    dx = loadings[0, idx]
    dy = loadings[1, idx]
    ax.arrow(0, 0, dx, dy, **arrow_style)
    ax.text(dx * label_scale, dy * label_scale, element, fontsize=9)

ax.set(
    xlabel="PC1 loading",
    ylabel="PC2 loading",
    title="Element Loadings on PC1 vs PC2",
)

# Thin reference lines through zero on both axes.
for draw_zero_line in (ax.axhline, ax.axvline):
    draw_zero_line(0, color="gray", lw=0.5)

ax.set_aspect("equal")
plt.tight_layout()
plt.show()