# Spectroscopy Data Exploratory Data Analysis

This notebook demonstrates EDA techniques for X-ray Absorption Spectroscopy (XAS)
data, including XANES and EXAFS. We cover edge detection, normalization checks,
noise estimation, outlier identification, and Principal Component Analysis (PCA).

**Prerequisites**: `pip install numpy matplotlib scipy scikit-learn`

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.signal import savgol_filter
from sklearn.decomposition import PCA

plt.rcParams.update({"figure.dpi": 120})

# --- Synthetic demonstration data ---
# A real analysis would load measured spectra here (HDF5, Athena .prj, ASCII);
# this notebook simulates Fe K-edge XANES spectra instead.

# Seed NumPy's global random state so every run draws the same spectra.
np.random.seed(42)

# Nominal Fe K-edge position (~7112 eV) and a 400-point energy grid around it
e0_nominal = 7112.0
energy = np.linspace(start=7050, stop=7250, num=400)

def generate_xanes(energy, e0=7112, amplitude=1.0, white_line=0.3, noise=0.005):
    """Generate a synthetic XANES spectrum.

    The spectrum is the sum of a smooth edge step, a Gaussian white-line
    peak, a damped oscillation above the edge, and Gaussian noise.

    Parameters
    ----------
    energy : array_like
        Energy grid on which the spectrum is evaluated.
    e0 : float
        Centre of the edge step, in the same units as ``energy``.
    amplitude : float
        Height of the edge step.
    white_line : float
        Height of the Gaussian white-line peak (centred at ``e0 + 3``,
        standard deviation 2).
    noise : float
        Standard deviation of the additive Gaussian noise.

    Returns
    -------
    numpy.ndarray
        Simulated mu(E) with the same shape as ``energy``.

    Notes
    -----
    Noise is drawn from NumPy's global random state (``np.random.randn``),
    so call ``np.random.seed`` beforehand for reproducible output.
    """
    energy = np.asarray(energy, dtype=float)
    e_rel = energy - e0

    # Edge step: tanh sigmoid rising from 0 to ``amplitude`` around e0
    mu = amplitude * (0.5 + 0.5 * np.tanh(e_rel / 3.0))
    # White line peak
    mu += white_line * np.exp(-0.5 * ((e_rel - 3) / 2.0) ** 2)
    # Post-edge oscillations (EXAFS-like). Applied only above the edge: the
    # damping factor exp(-e_rel / 80) grows for e_rel < 0, so evaluating it on
    # the whole grid would put ever-larger ripples on the pre-edge baseline.
    above_edge = e_rel > 0
    rel_above = e_rel[above_edge]
    mu[above_edge] += 0.03 * np.sin(0.3 * rel_above) * np.exp(-rel_above / 80)
    # Add noise
    mu += noise * np.random.randn(*mu.shape)
    return mu

# Simulate 30 spectra with randomly varied edge position, white line and noise
n_spectra = 30
mu_spectra = np.zeros((n_spectra, len(energy)))
for spectrum in mu_spectra:
    # Draw order (edge shift, white line, noise level) fixes the random stream
    edge_jitter = np.random.normal(0, 0.5)       # small edge shifts
    peak_height = np.random.uniform(0.2, 0.4)    # varying white line
    sigma = np.random.uniform(0.003, 0.01)
    spectrum[:] = generate_xanes(energy, e0=e0_nominal + edge_jitter,
                                 white_line=peak_height, noise=sigma)

# Overwrite two rows with deliberately anomalous spectra (the outliers)
outlier_settings = {
    25: dict(e0=7115, white_line=0.05, noise=0.02),
    28: dict(e0=7110, white_line=0.6, noise=0.015),
}
for row, settings in outlier_settings.items():
    mu_spectra[row] = generate_xanes(energy, **settings)

print(f"Energy range: {energy[0]:.0f} to {energy[-1]:.0f} eV ({len(energy)} points)")
print(f"Number of spectra: {n_spectra}")

In [None]:
# Edge detection and alignment check

def find_edge(energy, mu, window_length=11, polyorder=3):
    """Find edge energy as the maximum of the smoothed first derivative.

    Parameters
    ----------
    energy : array_like
        1-D energy grid.
    mu : array_like
        Absorption values sampled on ``energy``.
    window_length : int
        Savitzky-Golay window used to smooth the derivative. It is reduced
        to the largest odd value that fits the spectrum when the spectrum is
        shorter than the window.
    polyorder : int
        Polynomial order of the Savitzky-Golay filter.

    Returns
    -------
    float
        The value of ``energy`` at which the (smoothed) derivative of ``mu``
        is largest.
    """
    energy = np.asarray(energy)
    mu = np.asarray(mu)
    dmu = np.gradient(mu, energy)

    # Clamp the window so short spectra do not make savgol_filter raise;
    # keep it odd so it stays centred on each sample.
    window = min(int(window_length), dmu.size)
    if window % 2 == 0:
        window -= 1
    # The filter needs window > polyorder; otherwise use the raw derivative.
    if window > polyorder:
        dmu = savgol_filter(dmu, window_length=window, polyorder=polyorder)
    return energy[np.argmax(dmu)]

edge_energies = np.array([find_edge(energy, spectrum) for spectrum in mu_spectra])

# Reference edge position. Computed once and reused by every panel below
# rather than re-evaluating the median for each spectrum and annotation.
median_edge = np.median(edge_energies)
# Spectra whose edge lies more than 2 eV from the median are drawn in red.
is_shifted = np.abs(edge_energies - median_edge) > 2

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# All spectra overlaid
for spectrum, shifted in zip(mu_spectra, is_shifted):
    axes[0].plot(energy, spectrum,
                 color="red" if shifted else "steelblue",
                 lw=0.6,
                 alpha=0.8 if shifted else 0.3)
axes[0].set_xlabel("Energy (eV)")
axes[0].set_ylabel("mu(E)")
axes[0].set_title("All Spectra (red = edge shift > 2 eV)")

# Edge energy distribution
axes[1].hist(edge_energies, bins=15, color="steelblue", edgecolor="white")
axes[1].axvline(median_edge, color="red", ls="--",
                label=f"Median = {median_edge:.1f} eV")
axes[1].set_xlabel("Edge Energy (eV)")
axes[1].set_ylabel("Count")
axes[1].set_title("Edge Energy Distribution")
axes[1].legend()

# Edge energy by spectrum index (drift detection)
axes[2].scatter(range(n_spectra), edge_energies, c="steelblue", s=30)
axes[2].axhline(median_edge, color="red", ls="--")
axes[2].fill_between(range(n_spectra),
                     median_edge - 1, median_edge + 1,
                     alpha=0.15, color="orange", label="+/- 1 eV")
axes[2].set_xlabel("Spectrum Index")
axes[2].set_ylabel("Edge Energy (eV)")
axes[2].set_title("Edge Energy Stability")
axes[2].legend()

plt.tight_layout()
plt.show()

print(f"Edge energy: {np.mean(edge_energies):.2f} +/- {np.std(edge_energies):.2f} eV")

In [None]:
# Normalization and noise level assessment

def normalize_xas(energy, mu, e0=None, pre=(-100, -30), post=(50, 200)):
    """Normalize XAS spectrum: subtract pre-edge, divide by edge step.

    Parameters
    ----------
    energy : array_like
        1-D energy grid.
    mu : array_like
        Absorption values sampled on ``energy``.
    e0 : float, optional
        Edge energy. When omitted it is estimated with ``find_edge``.
    pre : tuple of float
        ``(low, high)`` bounds of the pre-edge fit window, relative to ``e0``.
    post : tuple of float
        ``(low, high)`` bounds of the post-edge fit window, relative to ``e0``.

    Returns
    -------
    mu_norm : numpy.ndarray
        ``mu`` with the pre-edge line subtracted, divided by the edge step.
    e0 : float
        The edge energy that was used.
    edge_step : float
        Difference between the post-edge and pre-edge lines evaluated at
        ``e0``. This is the unclamped value: for division it is floored at
        1e-10, so a non-positive ``edge_step`` signals a failed normalization.

    Raises
    ------
    ValueError
        If the pre-edge or post-edge window contains fewer than two energy
        points, in which case a straight line cannot be fitted.
    """
    energy = np.asarray(energy)
    mu = np.asarray(mu)
    if e0 is None:
        e0 = find_edge(energy, mu)
    e_rel = energy - e0

    pre_mask = (e_rel >= pre[0]) & (e_rel <= pre[1])
    post_mask = (e_rel >= post[0]) & (e_rel <= post[1])

    # Fail early with a readable message instead of an obscure polyfit error
    # (empty window) or a rank-deficient fit (single point).
    for label, window, mask in (("pre", pre, pre_mask), ("post", post, post_mask)):
        if np.count_nonzero(mask) < 2:
            raise ValueError(
                f"{label}-edge window {tuple(window)} relative to e0={e0:g} "
                "contains fewer than 2 energy points; adjust the window or "
                "extend the energy range"
            )

    # Pre-edge linear fit, extrapolated over the full energy grid
    pre_coeffs = np.polyfit(energy[pre_mask], mu[pre_mask], 1)
    pre_line = np.polyval(pre_coeffs, energy)

    # Post-edge linear fit
    post_coeffs = np.polyfit(energy[post_mask], mu[post_mask], 1)

    # Edge step: gap between the two fitted lines at the edge position
    edge_step = np.polyval(post_coeffs, e0) - np.polyval(pre_coeffs, e0)
    # Floor the divisor to avoid division by zero
    mu_norm = (mu - pre_line) / max(edge_step, 1e-10)
    return mu_norm, e0, edge_step

# Normalize all spectra
normed = np.zeros_like(mu_spectra)
edge_steps = []
noise_levels = []

for i, spectrum in enumerate(mu_spectra):
    normed[i], e0_i, step_i = normalize_xas(energy, spectrum)
    edge_steps.append(step_i)
    # Estimate noise in the post-edge region (80-150 eV above the edge) as the
    # scatter of the normalized spectrum around a Savitzky-Golay smooth of itself
    e_rel = energy - e0_i
    noise_mask = (e_rel > 80) & (e_rel < 150)
    post_edge = normed[i][noise_mask]
    residual = post_edge - savgol_filter(post_edge, 15, 3)
    noise_levels.append(np.std(residual))

# The noise above is measured on the *normalized* spectra, where the edge step
# is 1 by construction, so the signal-to-noise ratio is 1 / noise. Dividing the
# raw edge step by this value would count the edge step twice.
snr = 1.0 / np.array(noise_levels)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Normalized spectra
for spectrum_norm in normed:
    axes[0].plot(energy, spectrum_norm, lw=0.5, alpha=0.5)
axes[0].axhline(0, color="gray", ls="--")
axes[0].axhline(1, color="gray", ls="--")
axes[0].set_xlabel("Energy (eV)")
axes[0].set_ylabel("Normalized mu(E)")
axes[0].set_title("Normalized Spectra")

# Noise level per spectrum
axes[1].bar(range(n_spectra), noise_levels, color="coral")
axes[1].set_xlabel("Spectrum Index")
axes[1].set_ylabel("Post-edge Noise (std)")
axes[1].set_title("Noise Level per Spectrum")

# SNR distribution
axes[2].bar(range(n_spectra), snr, color="forestgreen")
axes[2].axhline(np.median(snr), color="red", ls="--",
                label=f"Median SNR = {np.median(snr):.0f}")
axes[2].set_xlabel("Spectrum Index")
axes[2].set_ylabel("SNR")
axes[2].set_title("Signal-to-Noise Ratio")
axes[2].legend()

plt.tight_layout()
plt.show()

In [None]:
# Outlier detection using L2 distance from mean spectrum

mean_spectrum = np.mean(normed, axis=0)
# Euclidean (L2) distance of every normalized spectrum from the mean spectrum
distances = np.sqrt(np.sum((normed - mean_spectrum) ** 2, axis=1))

# Flag spectra lying more than three standard deviations above the mean distance
threshold = np.mean(distances) + 3 * np.std(distances)
is_outlier = distances > threshold
outliers = np.flatnonzero(is_outlier)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Distance bar chart
colors = ["crimson" if flagged else "steelblue" for flagged in is_outlier]
axes[0].bar(range(n_spectra), distances, color=colors)
axes[0].axhline(threshold, color="red", ls="--", label=f"3-sigma = {threshold:.3f}")
axes[0].set_xlabel("Spectrum Index")
axes[0].set_ylabel("L2 Distance from Mean")
axes[0].set_title("Spectral Outlier Detection")
axes[0].legend()

# Overlay: mean vs. outliers
axes[1].plot(energy, mean_spectrum, "k-", lw=2, label="Mean Spectrum")
# All outliers share one line style, so give them a single legend entry that
# lists every flagged index; labelling only the first would make the other
# red curves look like the same spectrum.
outlier_label = "Outlier(s): " + ", ".join(f"#{idx}" for idx in outliers)
for position, idx in enumerate(outliers):
    axes[1].plot(energy, normed[idx], "r-", lw=1, alpha=0.7,
                 label=outlier_label if position == 0 else "_nolegend_")
axes[1].set_xlabel("Energy (eV)")
axes[1].set_ylabel("Normalized mu(E)")
axes[1].set_title(f"{len(outliers)} Outlier(s) Detected")
axes[1].legend()

plt.tight_layout()
plt.show()

print(f"Outlier spectra indices: {outliers}")

In [None]:
# Principal Component Analysis (PCA)

# PCA cannot extract more components than min(n_samples, n_features), so cap
# the request by both dimensions of the data matrix, not just the spectrum count.
n_components = min(10, *normed.shape)
pca = PCA(n_components=n_components)
scores = pca.fit_transform(normed)

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Scree plot
component_ids = range(1, n_components + 1)
variance_pct = pca.explained_variance_ratio_ * 100
cumvar = np.cumsum(variance_pct)
axes[0].bar(component_ids, variance_pct, color="steelblue", label="Individual")
axes[0].plot(component_ids, cumvar, "ro-", label="Cumulative")
axes[0].set_xlabel("Component")
axes[0].set_ylabel("Variance Explained (%)")
axes[0].set_title("PCA Scree Plot")
axes[0].legend()

# First 3 principal components (loadings)
for i in range(min(3, n_components)):
    axes[1].plot(energy, pca.components_[i],
                 label=f"PC{i+1} ({pca.explained_variance_ratio_[i]*100:.1f}%)")
axes[1].set_xlabel("Energy (eV)")
axes[1].set_ylabel("Loading")
axes[1].set_title("Principal Component Loadings")
axes[1].legend(fontsize=8)

# Score scatter plot (PC1 vs PC2), coloured by detected edge energy
scatter = axes[2].scatter(scores[:, 0], scores[:, 1],
                          c=edge_energies, cmap="coolwarm", s=40)
if len(outliers) > 0:
    axes[2].scatter(scores[outliers, 0], scores[outliers, 1],
                    edgecolors="red", facecolors="none", s=100, linewidths=2,
                    label="Outliers")
    # Only draw the legend when the labelled outlier markers exist; calling
    # legend() with no labelled artists makes matplotlib emit a warning.
    axes[2].legend()
axes[2].set_xlabel("PC1 Score")
axes[2].set_ylabel("PC2 Score")
axes[2].set_title("PCA Score Plot")
plt.colorbar(scatter, ax=axes[2], label="Edge Energy (eV)")

plt.tight_layout()
plt.show()

print("Variance explained:")
for i in range(min(5, n_components)):
    print(f"  PC{i+1}: {pca.explained_variance_ratio_[i]*100:.2f}%  (cumulative: {cumvar[i]:.1f}%)")

## EDA Summary

After running this notebook, you should have assessed:

1. **Edge alignment** -- Verified that absorption edge energies are consistent
2. **Normalization quality** -- Pre-edge at 0, post-edge at 1, reasonable edge step
3. **Noise levels** -- Quantified per-spectrum noise and SNR
4. **Outliers** -- Identified spectra with anomalous shapes or shifts
5. **Component analysis** -- Estimated the number of significant spectral components (candidate end-members) from the PCA scree plot

**Next steps**: Remove or flag outliers, apply energy calibration corrections if needed,
then proceed to quantitative analysis (linear combination fitting, PCA target
transformation, EXAFS modeling).

**Tools for further analysis**:
- [Larch](https://xraypy.github.io/xraylarch/) -- Python XAS analysis
- [Athena/Artemis](https://bruceravel.github.io/demeter/) -- GUI-based XAS fitting