# XRF Data Exploratory Data Analysis

This notebook walks through a complete EDA workflow for X-ray fluorescence (XRF)
microscopy data stored in MAPS HDF5 format. We inspect elemental maps, compute
per-channel statistics, assess signal-to-noise ratio, build correlation matrices,
detect dead and hot pixels, and check the stability of the incident flux (I0)
that is used for normalization.

**Prerequisites**: `pip install h5py numpy matplotlib seaborn scipy pandas`

In [None]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy.ndimage import median_filter

# Render figures at a higher resolution than the matplotlib default
plt.rcParams["figure.dpi"] = 120

# Location of the XRF HDF5 file to analyse -- edit this to point at your data
FILEPATH = "xrf_scan.h5"

# Pull every dataset we need into memory up front, so the file handle is
# released before any analysis starts.
with h5py.File(FILEPATH, "r") as h5file:
    # Fitted elemental maps and the matching channel labels
    maps = h5file["MAPS/XRF_Analyzed/Fitted/Counts_Per_Sec"][:]
    names = [raw.decode() for raw in h5file["MAPS/XRF_Analyzed/Channel_Names"][:]]

    # Scan coordinates along each axis
    x_axis = h5file["MAPS/Scan/x_axis"][:]
    y_axis = h5file["MAPS/Scan/y_axis"][:]

    # Scaler channels (I0 etc.) and their labels
    scaler_names = [raw.decode() for raw in h5file["MAPS/Scalers/Names"][:]]
    scalers = h5file["MAPS/Scalers/Values"][:]

# The map stack is unpacked as (element, row, column)
nelem, nrow, ncol = maps.shape

print(f"Elements: {nelem} -- {names}")
print(f"Map size: {nrow} x {ncol} pixels")
print(f"Scan area: {x_axis[-1]-x_axis[0]:.1f} x {y_axis[-1]-y_axis[0]:.1f} um")

In [None]:
# Per-channel statistics and SNR estimation
#
# SNR is estimated as (mean of the whole map) / (std of a small background ROI).
# NOTE(review): this assumes the top-left corner of the scan is sample-free
# background -- adjust `bg_roi` if that is not true for your scan.

bg_roi = (slice(0, max(1, nrow // 20)), slice(0, max(1, ncol // 20)))  # Top-left corner

rows = []
for i, name in enumerate(names):
    m = maps[i]
    bg = m[bg_roi]
    noise = np.std(bg)
    # A perfectly flat (or single-pixel) background gives no noise estimate.
    # Report the SNR as undefined (nan) rather than dividing by a tiny floor,
    # which would print a meaningless ~1e10-scaled number.
    snr = np.mean(m) / noise if noise > 0 else np.nan
    rows.append({
        "Element": name,
        "Min": f"{m.min():.2f}",
        "Max": f"{m.max():.2f}",
        "Mean": f"{m.mean():.2f}",
        "Median": f"{np.median(m):.2f}",
        "Std": f"{m.std():.2f}",
        "SNR": f"{snr:.1f}",
        "% Zeros": f"{100 * (m == 0).mean():.1f}",
    })

# Build the table once from the collected records
df = pd.DataFrame(rows)
print("Per-Element Statistics:")
print(df.to_string(index=False))

In [None]:
# Channel histograms
#
# One log-count histogram per element (first 15 channels at most), with the
# 99th percentile of the positive values marked as a dashed red line.

ncols_plot = 5
n_show = min(nelem, 15)
nrows_plot = int(np.ceil(n_show / ncols_plot))

fig, axes = plt.subplots(nrows_plot, ncols_plot, figsize=(4 * ncols_plot, 3 * nrows_plot))
axes = axes.ravel()

for idx, ax in enumerate(axes[:n_show]):
    channel = maps[idx]
    # Only strictly positive values are histogrammed
    positive = channel[channel > 0]
    if positive.size:
        ax.hist(positive, bins=80, log=True, color="steelblue", edgecolor="none")
        p99 = np.percentile(positive, 99)
        ax.axvline(p99, color="red", ls="--", lw=1, label=f"p99={p99:.1f}")
        ax.legend(fontsize=7)
    ax.set_title(names[idx], fontsize=10)
    ax.set_xlabel("Counts/sec", fontsize=8)

# Hide the unused panels in the last grid row
for spare_ax in axes[n_show:]:
    spare_ax.set_visible(False)

fig.suptitle("XRF Channel Histograms (log scale)", fontsize=13)
fig.tight_layout()
plt.show()

In [None]:
# Element correlation matrix
#
# Each element map is flattened to a 1-D pixel vector; the Pearson correlation
# between those vectors is shown as a heatmap.

flat_maps = maps.reshape(nelem, -1)
corr = np.corrcoef(flat_maps)

# Grow the figure with the number of elements so annotations stay legible
fig_width = max(8, nelem * 0.6)
fig_height = max(6, nelem * 0.5)
fig, ax = plt.subplots(figsize=(fig_width, fig_height))

heatmap_options = dict(
    xticklabels=names,
    yticklabels=names,
    cmap="RdBu_r",
    center=0,
    vmin=-1,
    vmax=1,
    annot=True,
    fmt=".2f",
    square=True,
    cbar_kws={"label": "Pearson r"},
)
sns.heatmap(corr, ax=ax, **heatmap_options)
ax.set_title("XRF Element Correlation Matrix")
fig.tight_layout()
plt.show()

# Report strongly correlated pairs (upper triangle only, so each pair appears once)
strong_pairs = [
    (i, j)
    for i in range(nelem)
    for j in range(i + 1, nelem)
    if abs(corr[i, j]) > 0.7
]
print("\nStrongly correlated element pairs (|r| > 0.7):")
for i, j in strong_pairs:
    print(f"  {names[i]:4s} -- {names[j]:4s}:  r = {corr[i, j]:.3f}")

In [None]:
# Dead/hot pixel detection

def detect_bad_pixels(image, threshold=5.0):
    """Flag pixels that deviate strongly from their 3x3 local median.

    A pixel is bad when ``|image - local_median|`` exceeds ``threshold`` times
    the MAD, where the MAD is the median of the *non-zero* absolute deviations
    over the whole image. Non-finite pixels (NaN/inf) are always flagged as bad
    and are replaced by the median of the finite pixels before filtering, so
    they cannot contaminate their neighbours' local medians or the MAD.

    Parameters
    ----------
    image : array_like
        2-D map; converted to float internally.
    threshold : float, optional
        Multiple of the MAD above which a pixel is flagged (default 5.0).

    Returns
    -------
    numpy.ndarray of bool
        Mask with the same shape as ``image``; True marks a bad pixel.
    """
    img = np.asarray(image, dtype=float)
    nonfinite = ~np.isfinite(img)
    if nonfinite.all():
        # Nothing finite to compare against: every pixel is bad (also covers
        # the empty-array case, which returns an empty mask).
        return nonfinite

    if nonfinite.any():
        # Neutralise NaN/inf so the median filter and MAD stay well-defined.
        work = np.where(nonfinite, np.median(img[~nonfinite]), img)
    else:
        work = img

    local_med = median_filter(work, size=3)
    diff = np.abs(work - local_med)

    # Zeros are excluded so that flat regions do not collapse the MAD to 0.
    nonzero_diff = diff[diff > 0]
    mad = np.median(nonzero_diff) if nonzero_diff.size else 1.0

    return (diff > threshold * max(mad, 1e-10)) | nonfinite

# Union of the per-element bad-pixel masks
combined_mask = np.zeros((nrow, ncol), dtype=bool)

print("Dead/hot pixel detection per element:")
for idx in range(len(names)):
    name = names[idx]
    bad = detect_bad_pixels(maps[idx])
    combined_mask = combined_mask | bad
    n_bad = bad.sum()
    if n_bad <= 0:
        # Elements with no flagged pixels are not listed
        continue
    print(f"  {name:4s}: {n_bad:5d} bad pixels ({100 * n_bad / (nrow * ncol):.2f}%)")

print(f"\nTotal unique bad pixel positions: {combined_mask.sum()}")

# Visualize bad pixel map in scan coordinates
scan_extent = [x_axis[0], x_axis[-1], y_axis[0], y_axis[-1]]
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(combined_mask.astype(int), cmap="Reds", origin="lower", extent=scan_extent)
ax.set_title(f"Combined Bad Pixel Map ({combined_mask.sum()} pixels)")
ax.set_xlabel("X (um)")
ax.set_ylabel("Y (um)")
fig.tight_layout()
plt.show()

In [None]:
# I0 (incident flux) normalization check
#
# Shows the I0 scaler as a spatial map, as a histogram, and as a row-averaged
# profile, then warns when any pixel falls below 50% of the scan-mean I0.

if "I0" in scaler_names:
    i0_idx = scaler_names.index("I0")
    i0_map = scalers[i0_idx]

    # Global scan statistics, computed once and reused below
    i0_mean = i0_map.mean()
    # Coefficient of variation is undefined for a zero mean
    i0_cv = i0_map.std() / i0_mean if i0_mean != 0 else float("nan")

    fig, axes = plt.subplots(1, 3, figsize=(16, 4))

    # I0 spatial map
    im0 = axes[0].imshow(i0_map, cmap="viridis", origin="lower",
                         extent=[x_axis[0], x_axis[-1], y_axis[0], y_axis[-1]])
    axes[0].set_title("I0 Spatial Map")
    plt.colorbar(im0, ax=axes[0], label="Counts")

    # I0 histogram
    axes[1].hist(i0_map.ravel(), bins=100, color="steelblue")
    axes[1].set_title(f"I0 Distribution (CV={i0_cv:.3f})")
    axes[1].set_xlabel("I0 Counts")

    # Row-averaged I0 (shows beam stability over scan)
    i0_row_avg = np.mean(i0_map, axis=1)
    axes[2].plot(y_axis, i0_row_avg, "b-", lw=0.8)
    # The tolerance band is anchored on the global scan mean, so rows that
    # drift by more than 5% visibly leave it. A band drawn around the row
    # average itself would always contain the curve and show nothing.
    axes[2].axhspan(0.95 * i0_mean, 1.05 * i0_mean,
                    alpha=0.2, color="orange", label="+/- 5% of scan mean")
    axes[2].set_xlabel("Y position (um)")
    axes[2].set_ylabel("Mean I0")
    axes[2].set_title("I0 Stability Over Scan")
    axes[2].legend()

    # Check for beam drops
    low_i0 = i0_map < 0.5 * i0_mean
    if low_i0.any():
        print(f"WARNING: {low_i0.sum()} pixels with I0 < 50% of mean (beam instability)")
    else:
        print("I0 check passed: no significant beam drops detected.")

    plt.tight_layout()
    plt.show()
else:
    print(f"I0 scaler not found. Available scalers: {scaler_names}")

## EDA Summary

After running this notebook, you should have assessed:

1. **Data dimensions** -- Verified map sizes and element list
2. **Per-element statistics** -- Min, max, mean, SNR for each channel
3. **Intensity distributions** -- Histogram shapes, dynamic range usage
4. **Element correlations** -- Co-localization patterns
5. **Dead/hot pixels** -- Identified and counted anomalous pixels
6. **I0 stability** -- Checked the incident-flux (I0) scaler for beam drops and drift before normalization

**Next steps**: Based on EDA findings, proceed to preprocessing (dead pixel
interpolation, background subtraction) and then quantitative analysis
(phase mapping, clustering, ROI identification).