# 04 - Fuzzy C-Means Clustering and ROI Recommendation

This notebook applies fuzzy c-means clustering to PCA-reduced region
features and recommends the most representative ROIs per cluster.

1. Fuzzy c-means clustering
2. Membership visualization
3. ROI recommendation (highest membership per cluster)

In [None]:
import h5py
import matplotlib.pyplot as plt
import numpy as np
import skfuzzy as fuzz
from matplotlib.colors import ListedColormap
from skimage.filters import threshold_otsu
from skimage.measure import label, regionprops
from skimage.morphology import binary_opening, binary_closing, disk
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

%matplotlib inline

## 1. Rebuild Feature Matrix (from previous notebooks)

In [None]:
# Tunable parameters for rebuilding the per-region feature matrix.
DATA_PATH = "../data/sample_xrf.h5"
REF_CHANNEL = "Zn"              # channel whose map is thresholded to find regions
MORPH_RADIUS = 3                # radius of the disk used for opening/closing
MIN_AREA, MAX_AREA = 50, 5000   # keep regions whose pixel area is in this range
N_PCA_COMPONENTS = 5            # upper bound on retained principal components

with h5py.File(DATA_PATH, "r") as f:
    # Accept channel names stored either as bytes or as str.
    names = [n.decode() if isinstance(n, bytes) else str(n)
             for n in f["/MAPS/channel_names"][:]]
    all_maps = f["/MAPS/XRF_fits"][:]

# Segment the reference channel: Otsu threshold, then binary opening followed
# by closing with the same disk, then connected-component labelling.
ref_map = all_maps[names.index(REF_CHANNEL)]
binary = ref_map > threshold_otsu(ref_map)
selem = disk(MORPH_RADIUS)
cleaned = binary_closing(binary_opening(binary, selem), selem)
lab = label(cleaned)
regions = [r for r in regionprops(lab) if MIN_AREA <= r.area <= MAX_AREA]

# Fail early with an actionable message instead of an opaque error from
# StandardScaler/PCA when the area filter leaves too few regions.
if len(regions) < 2:
    raise ValueError(
        f"Found {len(regions)} region(s) with area in [{MIN_AREA}, {MAX_AREA}] "
        f"on channel {REF_CHANNEL!r}; at least 2 are needed for scaling and PCA."
    )

# One row per region, one column per channel: mean intensity inside the region.
n_channels = len(names)
feat = np.zeros((len(regions), n_channels))
for i, r in enumerate(regions):
    mask = lab == r.label
    # Boolean-index the two image axes once and average every channel together.
    feat[i] = all_maps[:n_channels, mask].mean(axis=1)

X = StandardScaler().fit_transform(feat)
# PCA cannot return more components than min(n_samples, n_features), so clamp
# the requested count rather than raising on small datasets.
n_components = min(N_PCA_COMPONENTS, *X.shape)
X_pca = PCA(n_components=n_components).fit_transform(X)
print(f"Regions: {X_pca.shape[0]}, PCA dims: {X_pca.shape[1]}")

## 2. Fuzzy C-Means Clustering

Fuzzy c-means assigns each region a membership degree to every cluster
rather than a hard label, which better captures mixed-phenotype cells.

In [None]:
N_CLUSTERS = 4
FUZZINESS = 2.0

# Solver settings gathered in one place; the data goes in transposed because
# skfuzzy expects (features, samples).
cmeans_kwargs = dict(c=N_CLUSTERS, m=FUZZINESS, error=1e-5, maxiter=500, seed=42)
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X_pca.T, **cmeans_kwargs)

# Hard label of a sample = index of the cluster with its largest membership.
hard_labels = u.argmax(axis=0)
print(f"Fuzzy partition coefficient (FPC): {fpc:.4f}")

# Count how many samples fall into each cluster, including empty clusters.
cluster_sizes = np.bincount(hard_labels, minlength=N_CLUSTERS)
for cluster_id, size in enumerate(cluster_sizes):
    print(f"  Cluster {cluster_id}: {size} regions")

## 3. Visualize Clusters in PC Space

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: hard cluster assignments.
# A bare cmap="tab10" stretches the labels over the whole colormap, so with
# four clusters they would get tab10 entries 0, 3, 6 and 9. Restricting the
# colormap to the first N_CLUSTERS entries and centring unit-width bins on the
# integer labels draws cluster k in tab10 entry k, which is the colour the ROI
# overlay uses for the same cluster.
cluster_cmap = ListedColormap(plt.cm.tab10.colors[:N_CLUSTERS])
scatter = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=hard_labels,
                          cmap=cluster_cmap, vmin=-0.5, vmax=N_CLUSTERS - 0.5,
                          s=25, alpha=0.8)
axes[0].set_xlabel("PC1")
axes[0].set_ylabel("PC2")
axes[0].set_title("Hard cluster assignments")
# Integer ticks only: cluster ids are categorical, not a continuous scale.
plt.colorbar(scatter, ax=axes[0], label="Cluster", ticks=list(range(N_CLUSTERS)))

# Right panel: membership degree of every region in cluster 0.
sc = axes[1].scatter(X_pca[:, 0], X_pca[:, 1], c=u[0], cmap="YlOrRd",
                     s=25, alpha=0.8)
axes[1].set_xlabel("PC1")
axes[1].set_ylabel("PC2")
axes[1].set_title("Membership degree -- Cluster 0")
plt.colorbar(sc, ax=axes[1], label="Membership")

plt.tight_layout()
plt.show()

## 4. ROI Recommendation

For each cluster, select the region with the highest membership as the
recommended ROI for further high-resolution analysis (e.g., nano-XRF,
XANES).

In [None]:
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(ref_map, cmap="gray", alpha=0.5)

colors = plt.cm.tab10.colors
recommended_rois = []

# For every cluster, pick the region whose membership in that cluster is the
# largest, outline its bounding box on the reference map and report it.
for k in range(N_CLUSTERS):
    top = u[k].argmax()
    roi = regions[top]
    recommended_rois.append(roi)

    # regionprops bounding box is (min_row, min_col, max_row, max_col);
    # matplotlib wants the (x, y) corner followed by width and height.
    row0, col0, row1, col1 = roi.bbox
    ax.add_patch(
        plt.Rectangle((col0, row0), col1 - col0, row1 - row0,
                      edgecolor=colors[k], facecolor="none", lw=2)
    )
    # Label sits a few pixels above the top-left corner of the box.
    ax.text(col0, row0 - 4, f"ROI-{k}", color=colors[k], fontsize=10, weight="bold")
    print(f"Cluster {k}: region label={roi.label}, centroid={roi.centroid}, "
          f"area={roi.area}, membership={u[k, top]:.3f}")

ax.set_title("Recommended ROIs per Cluster")
ax.axis("off")
plt.tight_layout()
plt.show()