# Spectra Synthesis Explorer

Interactive notebook for testing, evaluating, and exploring NIRS spectra synthesis.

**Features:**
1. Load and display real datasets (nirs4all compatible)
2. Configure synthesis with all parameters inline
3. Optimize parameters to match real spectra
4. Analyze spectral properties (median, high-frequency, scatter, drift)
5. Compute similarity metrics and train discriminator

In [47]:
import sys
from pathlib import Path

# Add nirs4all to path
delete_root = Path.cwd().parent.parent.parent
if str(delete_root) not in sys.path:
    sys.path.insert(0, str(delete_root))

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# For comparison tools (from bench)
from comparator import SyntheticRealComparator, compute_spectral_properties
from scipy import signal, stats
from scipy.ndimage import gaussian_filter1d
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from visualizer import SyntheticSpectraVisualizer

import nirs4all
from nirs4all.data import DatasetConfigs, SpectroDataset
from nirs4all.data.synthetic import ComponentLibrary, NIRBand, SpectralComponent, SyntheticNIRSGenerator

%matplotlib inline
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['figure.dpi'] = 100

print(f"nirs4all loaded from: {nirs4all.__file__}")

nirs4all loaded from: None


---
## 1. Load Real Dataset

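Load a real NIRS dataset for comparison. The loader in the next cell reads `DATASET_PATH/<DATASET_NAME>.csv` through nirs4all `DatasetConfigs`. More generally, nirs4all supports: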
- nirs4all DatasetConfigs folders (with spectra.csv)
- Direct CSV/parquet files

In [46]:
# ============================================================================
# CONFIGURE: Path to your real dataset
# ============================================================================
DATASET_PATH = delete_root / "NIRS DB" / "x_bank"  # Change this to your dataset
DATASET_NAME = "Beef_Marbling_RandomSplit"

# Load dataset using nirs4all
def load_real_dataset(path: Path, name: str):
    """Read ``<path>/<name>.csv`` through nirs4all ``DatasetConfigs``.

    Args:
        path: Folder that contains the CSV file.
        name: File stem of the CSV; ".csv" is appended.

    Returns:
        Tuple ``(X, wavelengths, dataset_name)``: the 2D spectra layout of the
        first dataset, its wavelength axis for source 0 (falls back to
        ``0..n_features-1`` when the dataset reports none), and the dataset name.

    Side effects:
        Prints the header unit reported by the dataset for source 0.
    """
    # NOTE(review): the output recorded in this notebook shows an absorbance
    # maximum of 1070.0, identical to the largest wavelength. The header row
    # may be getting ingested as a spectrum -- verify against the CSV.
    source_file = Path(path) / (name + ".csv")

    # Root-level parameters are handed directly to DatasetConfigs.
    dataset = DatasetConfigs({
        "x_train": str(source_file),
        "delimiter": ",",
        "has_header": True,
        "header_unit": "nm",
    }).get_datasets()[0]

    spectra = dataset.x({}, layout='2d')
    print(dataset.header_unit(0))  # Example usage of new method

    axis_nm = dataset.wavelengths_nm(0)  # Method call with source index
    if axis_nm is None:
        axis_nm = np.arange(spectra.shape[1])
    return spectra, axis_nm, dataset.name

# Load the dataset, then report its basic statistics in a single write
X_real, wavelengths_real, dataset_name = load_real_dataset(DATASET_PATH, DATASET_NAME)

print("\n".join([
    f"Dataset: {dataset_name}",
    f"Shape: {X_real.shape} (samples x wavelengths)",
    f"Wavelength range: {wavelengths_real.min():.1f} - {wavelengths_real.max():.1f} nm",
    f"Absorbance range: [{X_real.min():.4f}, {X_real.max():.4f}]",
    f"Mean: {X_real.mean():.4f} +/- {X_real.std():.4f}",
]))

nm
Dataset: x_bank_Beef_Marbling_RandomSplit
Shape: (833, 331) (samples x wavelengths)
Wavelength range: 740.0 - 1070.0 nm
Absorbance range: [0.1327, 1070.0000]
Mean: 1.5853 +/- 31.4950


In [None]:
# Overview of the real dataset: individual spectra, mean/median envelope,
# and the histogram of all absorbance values.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
ax_spectra, ax_envelope, ax_hist = axes

# Panel 1: a random subset (at most 100) of individual spectra
n_display = min(100, X_real.shape[0])
sample_idx = np.random.choice(X_real.shape[0], n_display, replace=False)
for idx in sample_idx:
    ax_spectra.plot(wavelengths_real, X_real[idx], alpha=0.3, linewidth=0.5)
ax_spectra.set(xlabel='Wavelength (nm)', ylabel='Absorbance', title=f'Real Spectra (n={n_display})')
ax_spectra.grid(True, alpha=0.3)

# Panel 2: mean with a +/- 2 std band, plus the median for comparison
mean_spec = X_real.mean(axis=0)
std_spec = X_real.std(axis=0)
ax_envelope.fill_between(
    wavelengths_real,
    mean_spec - 2*std_spec,
    mean_spec + 2*std_spec,
    alpha=0.3,
    label='Mean +/- 2*std',
)
ax_envelope.plot(wavelengths_real, mean_spec, 'b-', linewidth=2, label='Mean')
ax_envelope.plot(wavelengths_real, np.median(X_real, axis=0), 'r--', linewidth=1.5, label='Median')
ax_envelope.set(xlabel='Wavelength (nm)', ylabel='Absorbance', title='Spectral Envelope')
ax_envelope.legend()
ax_envelope.grid(True, alpha=0.3)

# Panel 3: distribution of every absorbance value in the dataset
ax_hist.hist(X_real.flatten(), bins=1000, density=True, alpha=0.7, edgecolor='black', linewidth=0.5)
ax_hist.set(xlabel='Absorbance', ylabel='Density', title='Absorbance Distribution')
ax_hist.grid(True, alpha=0.3)

plt.suptitle(f'Real Dataset: {dataset_name}', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## 2. Analyze Real Spectra Properties

Extract key spectral properties to guide synthesis parameter tuning.

In [None]:
def analyze_spectral_properties(X, wavelengths, name="dataset"):
    """Summarize the statistical and structural properties of a spectra matrix.

    Args:
        X: 2D array of spectra, one row per sample and one column per wavelength.
        wavelengths: 1D array with one wavelength per column of ``X``.
        name: Label stored under ``'name'`` in the returned dictionary.

    Returns:
        dict with scalar summaries (``global_mean``, ``global_std``,
        ``mean_slope``, ``slope_std``, ``noise_estimate``, ``snr``,
        ``mean_curvature``, ``pca_n_components_95``, ``skewness``,
        ``kurtosis``), shape information (``n_samples``, ``n_wavelengths``,
        ``wavelength_range``, ``absorbance_range``) and per-wavelength or
        per-sample arrays (``mean_spectrum``, ``median_spectrum``,
        ``std_spectrum``, ``noise_per_wavelength``, ``slopes``,
        ``pca_variance``, ``pca_loadings``).
    """
    n_samples, n_wavelengths = X.shape
    wl_range = np.ptp(wavelengths)

    # Basic statistics
    mean_spectrum = X.mean(axis=0)
    median_spectrum = np.median(X, axis=0)
    std_spectrum = X.std(axis=0)

    # Global slope analysis (per 1000nm).
    # The fit runs on a 0..1 normalized axis, so the linear coefficient is the
    # change over the whole wavelength range; dividing by that range (in nm)
    # and multiplying by 1000 expresses it per 1000 nm.
    x_norm = (wavelengths - wavelengths.min()) / wl_range
    slope_scale = 1000.0 / wl_range
    slopes = np.array([np.polyfit(x_norm, spectrum, 1)[0] * slope_scale for spectrum in X])

    # High-frequency noise analysis (first difference).
    # Differencing two samples with independent noise doubles the variance,
    # hence the division by sqrt(2).
    first_diff = np.diff(X, axis=1)
    noise_estimate = first_diff.std() / np.sqrt(2)
    noise_per_wavelength = first_diff.std(axis=0) / np.sqrt(2)

    # Curvature analysis (second derivative) on at most the first 100 spectra.
    # The smoothing window is odd, capped at 21, and does not depend on the
    # sample, so it is computed once.
    window = min(21, n_wavelengths // 10 * 2 + 1)
    curvatures = []
    if window >= 3:
        for spectrum in X[:100]:
            smoothed = signal.savgol_filter(spectrum, window, 2)
            d2 = np.gradient(np.gradient(smoothed))
            curvatures.append(np.mean(np.abs(d2)))
    mean_curvature = np.mean(curvatures) if curvatures else 0

    # PCA analysis
    pca = PCA(n_components=min(20, n_samples, n_wavelengths))
    pca.fit(X)
    cumvar = np.cumsum(pca.explained_variance_ratio_)
    # If 95% is never reached within the fitted components, searchsorted
    # returns len(cumvar) and the reported count is one past the last component.
    n_components_95 = int(np.searchsorted(cumvar, 0.95) + 1)

    # Distribution statistics over all absorbance values
    flat_values = X.flatten()
    skewness = stats.skew(flat_values)
    kurtosis = stats.kurtosis(flat_values)

    # SNR estimate: between-sample variation relative to the noise level
    # (epsilon avoids division by zero for noise-free data)
    snr = std_spectrum.mean() / (noise_estimate + 1e-10)

    props = {
        'name': name,
        'n_samples': n_samples,
        'n_wavelengths': n_wavelengths,
        'wavelength_range': (wavelengths.min(), wavelengths.max()),
        'global_mean': X.mean(),
        'global_std': X.std(),
        'absorbance_range': (X.min(), X.max()),
        'mean_slope': slopes.mean(),
        'slope_std': slopes.std(),
        'noise_estimate': noise_estimate,
        'snr': snr,
        'mean_curvature': mean_curvature,
        'pca_n_components_95': n_components_95,
        'skewness': skewness,
        'kurtosis': kurtosis,
        'mean_spectrum': mean_spectrum,
        'median_spectrum': median_spectrum,
        'std_spectrum': std_spectrum,
        'noise_per_wavelength': noise_per_wavelength,
        'slopes': slopes,
        'pca_variance': pca.explained_variance_ratio_,
        'pca_loadings': pca.components_[:3],
    }

    return props

# Analyze real data and print the scalar summaries as one report
real_props = analyze_spectral_properties(X_real, wavelengths_real, dataset_name)

print("\n".join([
    "=" * 60,
    f"SPECTRAL PROPERTIES: {real_props['name']}",
    "=" * 60,
    f"Samples: {real_props['n_samples']}, Wavelengths: {real_props['n_wavelengths']}",
    f"Wavelength range: {real_props['wavelength_range'][0]:.1f} - {real_props['wavelength_range'][1]:.1f} nm",
    "",
    f"Global mean: {real_props['global_mean']:.4f}",
    f"Global std: {real_props['global_std']:.4f}",
    f"Absorbance range: [{real_props['absorbance_range'][0]:.4f}, {real_props['absorbance_range'][1]:.4f}]",
    "",
    f"Mean slope: {real_props['mean_slope']:.4f} per 1000nm",
    f"Slope std: {real_props['slope_std']:.4f}",
    "",
    f"Noise estimate (1st diff): {real_props['noise_estimate']:.5f}",
    f"SNR: {real_props['snr']:.1f}",
    f"Mean curvature: {real_props['mean_curvature']:.6f}",
    "",
    f"PCA components for 95%: {real_props['pca_n_components_95']}",
    f"Skewness: {real_props['skewness']:.3f}",
    f"Kurtosis: {real_props['kurtosis']:.3f}",
    "=" * 60,
]))

---
## 3. Synthesis Configuration

Configure all synthesis parameters inline. Adjust these to match your real dataset.

In [None]:
# ============================================================================
# SYNTHESIS PARAMETERS - All configurable in one place
# ============================================================================

def generate_synthetic_spectra(
    # === Sample configuration ===
    n_samples: int = real_props['n_samples'],
    random_state: int = 42,

    # === Wavelength grid ===
    wavelength_start: float = real_props['wavelength_range'][0],  # Start wavelength in nm
    wavelength_end: float = real_props['wavelength_range'][1],    # End wavelength in nm
    wavelength_step: float = 2.0,      # Wavelength resolution in nm

    # === Component configuration ===
    components: list | None = None,  # Predefined components
    concentration_method: str = "dirichlet",  # 'dirichlet', 'uniform', 'lognormal', 'correlated'
    dirichlet_alpha: float = 2.0,  # Concentration distribution shape (higher = more uniform)

    # === Path length variation ===
    path_length_std: float = 0.05,  # Std of optical path length factor (multiplicative)

    # === Baseline effects ===
    baseline_amplitude: float = 0.02,  # Polynomial baseline drift amplitude

    # === Global slope (common in NIR) ===
    global_slope_mean: float = 0.05,   # Mean slope per 1000nm (typical upward trend)
    global_slope_std: float = 0.03,    # Variation in slope between samples

    # === Scatter effects (SNV/MSC-like before correction) ===
    scatter_alpha_std: float = 0.05,   # Multiplicative scatter variation
    scatter_beta_std: float = 0.01,    # Additive scatter offset
    tilt_std: float = 0.01,            # Wavelength-dependent tilt

    # === Wavelength calibration ===
    shift_std: float = 0.5,            # Wavelength shift in nm
    stretch_std: float = 0.001,        # Wavelength stretch factor

    # === Instrumental effects ===
    instrumental_fwhm: float = 8.0,    # Spectral resolution broadening (FWHM in nm)

    # === Noise model ===
    noise_base: float = 0.005,         # Base noise level (constant)
    noise_signal_dep: float = 0.01,    # Signal-dependent noise (heteroscedastic)

    # === Artifacts ===
    artifact_prob: float = 0.02,       # Probability of spike/dead band artifacts
):
    """
    Generate synthetic NIRS spectra with full parameter control.

    A ``SyntheticNIRSGenerator`` is created with the "realistic" complexity
    preset, after which the effect parameters passed here overwrite the
    corresponding entries of ``generator.params``.

    Note:
        The defaults for ``n_samples``, ``wavelength_start`` and
        ``wavelength_end`` are read from the module-level ``real_props`` when
        this function is defined, not when it is called. Re-run this cell
        after re-analysing a different dataset.

        ``dirichlet_alpha`` is accepted for interface compatibility but is
        currently NOT forwarded to the generator: ``generate()`` is called
        with the concentration method only.
        TODO(review): confirm how the generator expects the Dirichlet
        concentration to be supplied and wire it through.

    Args:
        n_samples: Number of spectra to generate.
        random_state: Seed given to both the component library and the generator.
        wavelength_start, wavelength_end, wavelength_step: Wavelength grid in nm.
        components: Names of predefined components; defaults to
            water, protein, lipid, starch and cellulose.
        concentration_method: Concentration sampling method passed to ``generate()``.
        dirichlet_alpha: See the note above (currently unused).
        path_length_std, baseline_amplitude, global_slope_mean, global_slope_std,
        scatter_alpha_std, scatter_beta_std, tilt_std, shift_std, stretch_std,
        instrumental_fwhm, noise_base, noise_signal_dep, artifact_prob:
            Written into ``generator.params`` under the same key names.

    Returns:
        X: Spectra array (n_samples, n_wavelengths)
        wavelengths: Wavelength grid
        metadata: Generation parameters and additional info. On top of what
            the generator returns it holds ``'params'`` (a copy of the final
            generator parameters), ``'concentrations'`` and
            ``'component_spectra'``.
    """
    # Build component library
    if components is None:
        components = ["water", "protein", "lipid", "starch", "cellulose"]
    library = ComponentLibrary.from_predefined(components, random_state=random_state)

    # Create generator with custom parameters
    generator = SyntheticNIRSGenerator(
        wavelength_start=wavelength_start,
        wavelength_end=wavelength_end,
        wavelength_step=wavelength_step,
        component_library=library,
        complexity="realistic",  # Base complexity (will override params)
        random_state=random_state,
    )

    # Override the preset's parameters with our settings
    generator.params.update({
        "path_length_std": path_length_std,
        "baseline_amplitude": baseline_amplitude,
        "scatter_alpha_std": scatter_alpha_std,
        "scatter_beta_std": scatter_beta_std,
        "tilt_std": tilt_std,
        "global_slope_mean": global_slope_mean,
        "global_slope_std": global_slope_std,
        "shift_std": shift_std,
        "stretch_std": stretch_std,
        "instrumental_fwhm": instrumental_fwhm,
        "noise_base": noise_base,
        "noise_signal_dep": noise_signal_dep,
        "artifact_prob": artifact_prob,
    })

    # Generate spectra (X), concentrations (Y) and pure component spectra (E)
    X, Y, E, metadata = generator.generate(
        n_samples=n_samples,
        concentration_method=concentration_method,
        return_metadata=True,
    )

    # Store parameters in metadata; copy so later edits to the generator
    # do not alter the record of what produced these spectra.
    metadata['params'] = generator.params.copy()
    metadata['concentrations'] = Y
    metadata['component_spectra'] = E

    return X, generator.wavelengths, metadata

print("Synthesis function defined. Ready to generate spectra.")

In [None]:
# ============================================================================
# GENERATE SYNTHETIC SPECTRA - Adjust parameters here to match real data
# ============================================================================

# Derive the wavelength grid from the real data so both datasets share it
wl_start, wl_end = real_props['wavelength_range']
wl_step = (wl_end - wl_start) / (real_props['n_wavelengths'] - 1)

# All settings are collected in one mapping and passed as keyword arguments.
synthesis_settings = {
    # Sample count, seed and grid
    "n_samples": 500,
    "random_state": 42,
    "wavelength_start": wl_start,
    "wavelength_end": wl_end,
    "wavelength_step": wl_step,

    # Component settings
    "components": ["water", "protein", "lipid", "starch", "cellulose"],
    "concentration_method": "dirichlet",
    "dirichlet_alpha": 2.0,

    # --- TUNE THESE TO MATCH REAL DATA ---
    "path_length_std": 0.05,
    "baseline_amplitude": 0.02,

    # Global slope, taken from the real data analysis
    "global_slope_mean": real_props['mean_slope'],
    "global_slope_std": real_props['slope_std'],

    # Scatter effects
    "scatter_alpha_std": 0.05,
    "scatter_beta_std": 0.01,
    "tilt_std": 0.01,

    # Wavelength calibration
    "shift_std": 0.5,
    "stretch_std": 0.001,

    # Instrumental
    "instrumental_fwhm": 8.0,

    # Noise: the measured estimate is split evenly between the two terms
    "noise_base": real_props['noise_estimate'] * 0.5,
    "noise_signal_dep": real_props['noise_estimate'] * 0.5,

    "artifact_prob": 0.02,
}
X_synth, wavelengths_synth, synth_metadata = generate_synthetic_spectra(**synthesis_settings)

print("\n".join([
    f"Generated {X_synth.shape[0]} synthetic spectra",
    f"Shape: {X_synth.shape}",
    f"Wavelength range: {wavelengths_synth.min():.1f} - {wavelengths_synth.max():.1f} nm",
]))

In [None]:
# Overview of the synthetic dataset: individual spectra, mean/median
# envelope, and the pure component spectra it was built from.
fig, axes = plt.subplots(1, 3, figsize=(16, 5))
ax_spectra, ax_envelope, ax_library = axes

# Panel 1: a random subset (at most 100) of synthetic spectra
n_display = min(100, X_synth.shape[0])
sample_idx = np.random.choice(X_synth.shape[0], n_display, replace=False)
for idx in sample_idx:
    ax_spectra.plot(wavelengths_synth, X_synth[idx], alpha=0.3, linewidth=0.5, color='orange')
ax_spectra.set(xlabel='Wavelength (nm)', ylabel='Absorbance', title=f'Synthetic Spectra (n={n_display})')
ax_spectra.grid(True, alpha=0.3)

# Panel 2: mean with a +/- 2 std band, plus the median
mean_synth = X_synth.mean(axis=0)
std_synth = X_synth.std(axis=0)
ax_envelope.fill_between(
    wavelengths_synth,
    mean_synth - 2*std_synth,
    mean_synth + 2*std_synth,
    alpha=0.3,
    color='orange',
)
ax_envelope.plot(wavelengths_synth, mean_synth, color='darkorange', linewidth=2, label='Mean')
ax_envelope.plot(wavelengths_synth, np.median(X_synth, axis=0), 'r--', linewidth=1.5, label='Median')
ax_envelope.set(xlabel='Wavelength (nm)', ylabel='Absorbance', title='Spectral Envelope')
ax_envelope.legend()
ax_envelope.grid(True, alpha=0.3)

# Panel 3: component library, each spectrum min-max normalized and shifted
# upwards by 1.2 per component so the curves do not overlap
E = synth_metadata['component_spectra']
for level, (component, trace) in enumerate(zip(synth_metadata['component_names'], E, strict=False)):
    scaled = (trace - trace.min()) / (trace.max() - trace.min() + 1e-10)
    ax_library.plot(wavelengths_synth, scaled + level * 1.2, linewidth=1.5, label=component)
ax_library.set(xlabel='Wavelength (nm)', ylabel='Normalized Absorbance (stacked)', title='Component Library')
ax_library.legend(loc='upper right')
ax_library.grid(True, alpha=0.3)

plt.suptitle('Synthetic Data', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## 4. Compare Synthetic vs Real

Side-by-side comparison of spectral properties.

In [None]:
# Run the same property analysis on the synthetic spectra
synth_props = analyze_spectral_properties(X_synth, wavelengths_synth, "synthetic")

# Header of the comparison table
print("\n".join([
    "=" * 70,
    "PROPERTY COMPARISON: Real vs Synthetic",
    "=" * 70,
    f"{'Property':<25} {'Real':>15} {'Synthetic':>15} {'Diff %':>12}",
    "-" * 70,
]))

def fmt_diff(real_val, synth_val):
    """Format the signed deviation of ``synth_val`` from ``real_val`` in percent.

    Returns a string such as ``"+12.3%"``, or ``"N/A"`` when ``|real_val|`` is
    not greater than 1e-10, so no meaningful relative change exists.
    """
    # Written as a negated ">" so a NaN reference also falls through to "N/A".
    if not abs(real_val) > 1e-10:
        return "N/A"
    relative_change = (synth_val - real_val) / abs(real_val) * 100
    return f"{relative_change:+.1f}%"

# (row label, key in the property dictionaries) for every scalar compared
comparison_keys = [
    ('Global mean', 'global_mean'),
    ('Global std', 'global_std'),
    ('Mean slope', 'mean_slope'),
    ('Slope std', 'slope_std'),
    ('Noise estimate', 'noise_estimate'),
    ('SNR', 'snr'),
    ('Mean curvature', 'mean_curvature'),
    ('PCA 95% components', 'pca_n_components_95'),
    ('Skewness', 'skewness'),
    ('Kurtosis', 'kurtosis'),
]
comparisons = [(label, real_props[key], synth_props[key]) for label, key in comparison_keys]

# One table row per property: real value, synthetic value, relative deviation
for name, real_val, synth_val in comparisons:
    print(f"{name:<25} {real_val:>15.5f} {synth_val:>15.5f} {fmt_diff(real_val, synth_val):>12}")

print("=" * 70)

In [None]:
# Visual comparison of real vs synthetic data on a 2x3 grid of panels
fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# The synthetic grid may not coincide with the real one. Decide this once so
# that every panel which resamples synthetic statistics applies the same rule.
# The length check comes first because np.allclose needs matching shapes.
grids_differ = (
    len(wavelengths_real) != len(wavelengths_synth)
    or not np.allclose(wavelengths_real, wavelengths_synth)
)

# 1. Mean spectra overlay with +/- 1 std bands
axes[0, 0].plot(wavelengths_real, real_props['mean_spectrum'], 'b-', linewidth=2, label='Real')
axes[0, 0].plot(wavelengths_synth, synth_props['mean_spectrum'], 'orange', linestyle='--', linewidth=2, label='Synthetic')
axes[0, 0].fill_between(wavelengths_real,
                         real_props['mean_spectrum'] - real_props['std_spectrum'],
                         real_props['mean_spectrum'] + real_props['std_spectrum'],
                         alpha=0.2, color='blue')
axes[0, 0].fill_between(wavelengths_synth,
                         synth_props['mean_spectrum'] - synth_props['std_spectrum'],
                         synth_props['mean_spectrum'] + synth_props['std_spectrum'],
                         alpha=0.2, color='orange')
axes[0, 0].set_xlabel('Wavelength (nm)')
axes[0, 0].set_ylabel('Absorbance')
axes[0, 0].set_title('Mean Spectra +/- Std')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Slope distributions
axes[0, 1].hist(real_props['slopes'], bins=30, alpha=0.6, color='blue', label='Real', density=True)
axes[0, 1].hist(synth_props['slopes'], bins=30, alpha=0.6, color='orange', label='Synthetic', density=True)
axes[0, 1].set_xlabel('Slope (per 1000nm)')
axes[0, 1].set_ylabel('Density')
axes[0, 1].set_title('Slope Distributions')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Noise comparison
# The per-wavelength noise comes from first differences, so it has one value
# fewer than the grid; it is plotted against (and resampled on) grid[:-1].
if grids_differ:
    noise_synth_interp = np.interp(wavelengths_real[:-1], wavelengths_synth[:-1], synth_props['noise_per_wavelength'])
else:
    noise_synth_interp = synth_props['noise_per_wavelength']

axes[0, 2].plot(wavelengths_real[:-1], real_props['noise_per_wavelength'], 'b-', linewidth=1, label='Real', alpha=0.7)
axes[0, 2].plot(wavelengths_real[:-1], noise_synth_interp, 'orange', linewidth=1, label='Synthetic', alpha=0.7)
axes[0, 2].axhline(real_props['noise_estimate'], color='blue', linestyle='--', alpha=0.5)
axes[0, 2].axhline(synth_props['noise_estimate'], color='orange', linestyle='--', alpha=0.5)
axes[0, 2].set_xlabel('Wavelength (nm)')
axes[0, 2].set_ylabel('Noise (std)')
axes[0, 2].set_title('Wavelength-dependent Noise')
axes[0, 2].legend()
axes[0, 2].grid(True, alpha=0.3)

# 4. PCA comparison (cumulative explained variance, in percent)
axes[1, 0].plot(range(1, len(real_props['pca_variance']) + 1),
                np.cumsum(real_props['pca_variance']) * 100, 'bo-', linewidth=2, label='Real')
axes[1, 0].plot(range(1, len(synth_props['pca_variance']) + 1),
                np.cumsum(synth_props['pca_variance']) * 100, 'o-', color='orange', linewidth=2, label='Synthetic')
axes[1, 0].axhline(95, color='gray', linestyle='--', alpha=0.5)
axes[1, 0].set_xlabel('Principal Component')
axes[1, 0].set_ylabel('Cumulative Variance (%)')
axes[1, 0].set_title('PCA Cumulative Variance')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)

# 5. Std per wavelength
# Resample whenever the grids differ, including the case where both grids have
# the same length but different values.
if grids_differ:
    std_synth_interp = np.interp(wavelengths_real, wavelengths_synth, synth_props['std_spectrum'])
else:
    std_synth_interp = synth_props['std_spectrum']

axes[1, 1].plot(wavelengths_real, real_props['std_spectrum'], 'b-', linewidth=1.5, label='Real')
axes[1, 1].plot(wavelengths_real, std_synth_interp, color='orange', linewidth=1.5, label='Synthetic')
axes[1, 1].set_xlabel('Wavelength (nm)')
axes[1, 1].set_ylabel('Standard Deviation')
axes[1, 1].set_title('Sample Variation per Wavelength')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

# 6. Absorbance distributions
axes[1, 2].hist(X_real.flatten(), bins=100, density=True, alpha=0.6, color='blue', label='Real')
axes[1, 2].hist(X_synth.flatten(), bins=100, density=True, alpha=0.6, color='orange', label='Synthetic')
axes[1, 2].set_xlabel('Absorbance')
axes[1, 2].set_ylabel('Density')
axes[1, 2].set_title('Absorbance Distributions')
axes[1, 2].legend()
axes[1, 2].grid(True, alpha=0.3)

plt.suptitle('Real vs Synthetic Comparison', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## 5. Compute Similarity Metrics

Quantitative assessment of how well the synthetic spectra match the real ones.

In [None]:
def compute_similarity_metrics(X_real, X_synth, wavelengths_real, wavelengths_synth):
    """
    Compute comprehensive similarity metrics between real and synthetic spectra.

    When the two wavelength grids differ, the synthetic spectra are linearly
    interpolated onto the real grid first, and every metric is computed on
    that common grid.

    Args:
        X_real: 2D array of real spectra (samples x wavelengths).
        X_synth: 2D array of synthetic spectra (samples x wavelengths).
        wavelengths_real: Wavelength axis of ``X_real``.
        wavelengths_synth: Wavelength axis of ``X_synth``.

    Returns:
        dict with the keys ``mean_spectrum_correlation``,
        ``mean_spectrum_rmse``, ``std_spectrum_correlation``,
        ``mean_diff_pct``, ``std_diff_pct``, ``slope_ks_statistic``,
        ``slope_ks_pvalue``, ``noise_ratio``, ``pca_variance_correlation``,
        ``wasserstein_distance`` and ``overall_similarity_score``.

        The overall score is the mean of six sub-scores that are nominally on
        a 0-100 scale. Correlation sub-scores are not clipped, so a negative
        correlation lowers the score below that range. An undefined (NaN)
        correlation counts as a neutral 50.
    """
    metrics = {}

    # Interpolate if wavelengths differ (the length check guards np.allclose,
    # which needs matching shapes)
    grids_differ = (
        len(wavelengths_real) != len(wavelengths_synth)
        or not np.allclose(wavelengths_real, wavelengths_synth)
    )
    if grids_differ:
        X_synth_interp = np.array([np.interp(wavelengths_real, wavelengths_synth, spec) for spec in X_synth])
    else:
        X_synth_interp = X_synth

    def correlation_score(corr):
        """Turn a correlation into score points; NaN counts as a neutral 50."""
        return 50 if np.isnan(corr) else corr * 100

    # 1. Mean spectrum correlation
    mean_real = X_real.mean(axis=0)
    mean_synth = X_synth_interp.mean(axis=0)
    metrics['mean_spectrum_correlation'] = np.corrcoef(mean_real, mean_synth)[0, 1]

    # 2. Mean spectrum RMSE
    metrics['mean_spectrum_rmse'] = np.sqrt(np.mean((mean_real - mean_synth) ** 2))

    # 3. Std spectrum correlation
    std_real = X_real.std(axis=0)
    std_synth = X_synth_interp.std(axis=0)
    metrics['std_spectrum_correlation'] = np.corrcoef(std_real, std_synth)[0, 1]

    # 4. Global statistics comparison, on the common grid like all other
    #    metrics (epsilon avoids division by zero)
    metrics['mean_diff_pct'] = (X_synth_interp.mean() - X_real.mean()) / (abs(X_real.mean()) + 1e-10) * 100
    metrics['std_diff_pct'] = (X_synth_interp.std() - X_real.std()) / (X_real.std() + 1e-10) * 100

    # 5. Slope distribution similarity (KS test)
    # Slopes are fitted on a 0..1 normalized axis and rescaled to per 1000 nm.
    wl_range_real = np.ptp(wavelengths_real)
    x_norm_real = (wavelengths_real - wavelengths_real.min()) / wl_range_real
    slope_scale = 1000.0 / wl_range_real

    def per_sample_slopes(spectra):
        """Linear-fit slope of every spectrum, per 1000 nm."""
        return [np.polyfit(x_norm_real, spec, 1)[0] * slope_scale for spec in spectra]

    ks_stat, ks_pval = stats.ks_2samp(per_sample_slopes(X_real), per_sample_slopes(X_synth_interp))
    metrics['slope_ks_statistic'] = ks_stat
    metrics['slope_ks_pvalue'] = ks_pval

    # 6. Noise comparison (first-difference std; sqrt(2) undoes the variance
    #    doubling caused by differencing)
    noise_real = np.diff(X_real, axis=1).std() / np.sqrt(2)
    noise_synth = np.diff(X_synth_interp, axis=1).std() / np.sqrt(2)
    metrics['noise_ratio'] = noise_synth / (noise_real + 1e-10)

    # 7. PCA structure comparison
    pca_real = PCA(n_components=min(10, X_real.shape[0], X_real.shape[1]))
    pca_synth = PCA(n_components=min(10, X_synth_interp.shape[0], X_synth_interp.shape[1]))
    pca_real.fit(X_real)
    pca_synth.fit(X_synth_interp)

    n_common = min(len(pca_real.explained_variance_ratio_), len(pca_synth.explained_variance_ratio_))
    metrics['pca_variance_correlation'] = np.corrcoef(
        pca_real.explained_variance_ratio_[:n_common],
        pca_synth.explained_variance_ratio_[:n_common]
    )[0, 1]

    # 8. Absorbance distribution similarity (Wasserstein distance)
    # Subsample to limit computation. Values are drawn at random from the
    # whole matrix, because the leading values of the flattened array cover
    # only the first few spectra. The seed is fixed so that repeated calls
    # (e.g. from an optimizer) stay deterministic.
    rng = np.random.default_rng(0)

    def subsample(values, limit=10000):
        """Return at most ``limit`` values drawn without replacement."""
        flat = np.ravel(values)
        if flat.size <= limit:
            return flat
        return rng.choice(flat, size=limit, replace=False)

    metrics['wasserstein_distance'] = stats.wasserstein_distance(
        subsample(X_real),
        subsample(X_synth_interp),
    )

    # 9. Overall similarity score (nominally 0-100)
    scores = [
        correlation_score(metrics['mean_spectrum_correlation']),
        correlation_score(metrics['std_spectrum_correlation']),
        max(0, 100 - abs(metrics['mean_diff_pct'])),
        max(0, 100 - abs(metrics['std_diff_pct'])),
        max(0, 100 - abs(1 - metrics['noise_ratio']) * 100),
        correlation_score(metrics['pca_variance_correlation']),
    ]
    metrics['overall_similarity_score'] = np.mean(scores)

    return metrics

# Compute the metrics for the initial synthetic set and print them as one report
similarity_metrics = compute_similarity_metrics(X_real, X_synth, wavelengths_real, wavelengths_synth)

print("\n".join([
    "=" * 60,
    "SIMILARITY METRICS",
    "=" * 60,
    f"Mean spectrum correlation:    {similarity_metrics['mean_spectrum_correlation']:.4f}",
    f"Mean spectrum RMSE:           {similarity_metrics['mean_spectrum_rmse']:.5f}",
    f"Std spectrum correlation:     {similarity_metrics['std_spectrum_correlation']:.4f}",
    "",
    f"Global mean difference:       {similarity_metrics['mean_diff_pct']:+.2f}%",
    f"Global std difference:        {similarity_metrics['std_diff_pct']:+.2f}%",
    "",
    f"Slope KS statistic:           {similarity_metrics['slope_ks_statistic']:.4f}",
    f"Slope KS p-value:             {similarity_metrics['slope_ks_pvalue']:.4f}",
    "",
    f"Noise ratio (synth/real):     {similarity_metrics['noise_ratio']:.3f}",
    f"PCA variance correlation:     {similarity_metrics['pca_variance_correlation']:.4f}",
    f"Wasserstein distance:         {similarity_metrics['wasserstein_distance']:.5f}",
    "",
    f"{'='*60}",
    f"OVERALL SIMILARITY SCORE:     {similarity_metrics['overall_similarity_score']:.1f}/100",
    f"{'='*60}",
]))

---
## 6. Parameter Optimization

Automated optimization of synthesis parameters to match real data.

In [None]:
from scipy.optimize import differential_evolution, minimize


def objective_function(params, X_real, wavelengths_real, real_props):
    """
    Objective function for parameter optimization.
    Lower is better.

    Generates 400 synthetic spectra (fixed seed 42) on the real data's
    wavelength grid using ``params``, scores them against the real spectra
    and returns the negated overall similarity score.

    Args:
        params: Sequence of exactly seven values, in this order:
            path_length_std, baseline_amplitude, global_slope_mean,
            global_slope_std, scatter_alpha_std, noise_base, noise_signal_dep.
        X_real: Real spectra (samples x wavelengths).
        wavelengths_real: Wavelength axis of ``X_real``.
        real_props: Mapping providing ``'wavelength_range'`` and
            ``'n_wavelengths'``.

    Returns:
        ``-overall_similarity_score``, or a penalty of 1000 when generation or
        scoring raises, or when the score is not finite.

    Raises:
        ValueError: If ``params`` does not contain exactly seven values.
    """
    import warnings  # local import: keeps this notebook cell self-contained

    failure_penalty = 1000.0  # far worse than any reachable (negated) score

    # Unpack parameters
    (
        path_length_std,
        baseline_amplitude,
        global_slope_mean,
        global_slope_std,
        scatter_alpha_std,
        noise_base,
        noise_signal_dep,
    ) = params

    wl_start, wl_end = real_props['wavelength_range']
    wl_step = (wl_end - wl_start) / (real_props['n_wavelengths'] - 1)

    try:
        # Generate synthetic data with current parameters
        X_synth, wavelengths_synth, _ = generate_synthetic_spectra(
            n_samples=400,  # Smaller for speed
            random_state=42,
            wavelength_start=wl_start,
            wavelength_end=wl_end,
            wavelength_step=wl_step,
            path_length_std=path_length_std,
            baseline_amplitude=baseline_amplitude,
            global_slope_mean=global_slope_mean,
            global_slope_std=global_slope_std,
            scatter_alpha_std=scatter_alpha_std,
            noise_base=noise_base,
            noise_signal_dep=noise_signal_dep,
        )

        # Compute similarity metrics
        metrics = compute_similarity_metrics(X_real, X_synth, wavelengths_real, wavelengths_synth)
        score = metrics['overall_similarity_score']
    except Exception as exc:
        # Keep the optimizer running, but make the failure visible. If every
        # evaluation fails, the search would otherwise look like a flat
        # landscape of penalties with no hint of the cause.
        warnings.warn(
            f"Synthesis/evaluation failed for params={list(params)}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return failure_penalty

    # A NaN/inf score would poison the optimizer's comparisons.
    if not np.isfinite(score):
        return failure_penalty

    # Objective: maximize similarity (minimize negative score)
    return -score

# Starting point of the search: fixed guesses plus values measured on the
# real data. The measured noise is split evenly between the two noise terms.
initial_noise_share = real_props['noise_estimate'] * 0.5
initial_params = [
    0.05,                      # path_length_std
    0.02,                      # baseline_amplitude
    real_props['mean_slope'],  # global_slope_mean
    real_props['slope_std'],   # global_slope_std
    0.05,                      # scatter_alpha_std
    initial_noise_share,       # noise_base
    initial_noise_share,       # noise_signal_dep
]

# Search interval (low, high) per parameter, in the same order as above
bounds = [
    (0.01, 0.15),    # path_length_std
    (0.001, 0.1),    # baseline_amplitude
    (-0.2, 0.3),     # global_slope_mean
    (0.001, 0.1),    # global_slope_std
    (0.01, 0.15),    # scatter_alpha_std
    (0.0001, 0.05),  # noise_base
    (0.0001, 0.05),  # noise_signal_dep
]

print("Starting parameter optimization...")
# The objective returns the negated score, so flip the sign for display.
initial_score = -objective_function(initial_params, X_real, wavelengths_real, real_props)
print(f"Initial score: {initial_score:.1f}/100")
print("")
print("This may take a few minutes...")

In [None]:
# Global search with differential evolution; settings are collected in one
# mapping so they are easy to tweak.
de_settings = {
    "bounds": bounds,
    "args": (X_real, wavelengths_real, real_props),
    "maxiter": 30,
    "seed": 42,
    "polish": True,
    "disp": True,
    "workers": 1,
}
result = differential_evolution(objective_function, **de_settings)

# result.fun is the negated score, so flip the sign for display.
print("\n".join([
    "\n" + "=" * 60,
    "OPTIMIZATION RESULTS",
    "=" * 60,
    f"Final score: {-result.fun:.1f}/100",
    "",
    "Optimized parameters:",
]))

# Names in the order the objective function unpacks them
param_names = [
    'path_length_std',
    'baseline_amplitude',
    'global_slope_mean',
    'global_slope_std',
    'scatter_alpha_std',
    'noise_base',
    'noise_signal_dep',
]
for name, init_val, opt_val in zip(param_names, initial_params, result.x, strict=False):
    print(f"  {name:<20}: {init_val:.5f} -> {opt_val:.5f}")

In [None]:
# Regenerate spectra on the real grid using the optimizer's best parameters
wl_start, wl_end = real_props['wavelength_range']
wl_step = (wl_end - wl_start) / (real_props['n_wavelengths'] - 1)

# result.x follows the order in which the objective function unpacks it
optimized_settings = {
    "path_length_std": result.x[0],
    "baseline_amplitude": result.x[1],
    "global_slope_mean": result.x[2],
    "global_slope_std": result.x[3],
    "scatter_alpha_std": result.x[4],
    "noise_base": result.x[5],
    "noise_signal_dep": result.x[6],
}
X_opt, wavelengths_opt, opt_metadata = generate_synthetic_spectra(
    n_samples=500,
    random_state=42,
    wavelength_start=wl_start,
    wavelength_end=wl_end,
    wavelength_step=wl_step,
    **optimized_settings,
)

# Score the optimized set with the same metrics as the initial one
opt_metrics = compute_similarity_metrics(X_real, X_opt, wavelengths_real, wavelengths_opt)

print(f"\nFinal similarity score: {opt_metrics['overall_similarity_score']:.1f}/100")

In [None]:
# Compare the real data with the initial and the optimized synthetic sets
fig, axes = plt.subplots(1, 3, figsize=(16, 5))

# The same three datasets are drawn on the first two panels:
# (wavelength axis, spectra, format string, extra line style, legend label)
curves = [
    (wavelengths_real, X_real, 'b-', {}, 'Real'),
    (wavelengths_synth, X_synth, '--', {'color': 'orange'}, 'Initial'),
    (wavelengths_opt, X_opt, '--', {'color': 'green'}, 'Optimized'),
]
# (axis, per-wavelength statistic, line width, y label, title)
panels = [
    (axes[0], 'mean', 2, 'Absorbance', 'Mean Spectra Comparison'),
    (axes[1], 'std', 1.5, 'Standard Deviation', 'Sample Variation Comparison'),
]
for ax, statistic, width, y_label, panel_title in panels:
    for wl_axis, spectra, fmt, line_style, legend_label in curves:
        ax.plot(wl_axis, getattr(spectra, statistic)(axis=0), fmt, linewidth=width, label=legend_label, **line_style)
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True, alpha=0.3)

# Score comparison as a bar chart with the value written above each bar
scores = [
    similarity_metrics['overall_similarity_score'],
    opt_metrics['overall_similarity_score']
]
colors = ['orange', 'green']
score_ax = axes[2]
score_ax.bar(['Initial', 'Optimized'], scores, color=colors, edgecolor='black')
score_ax.set_ylabel('Similarity Score')
score_ax.set_title('Optimization Improvement')
score_ax.set_ylim(0, 100)
for i, score in enumerate(scores):
    score_ax.text(i, score + 2, f'{score:.1f}', ha='center', fontsize=12, fontweight='bold')
score_ax.grid(True, alpha=0.3, axis='y')

plt.suptitle('Initial vs Optimized Synthesis', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

---
## 7. Discriminator Test

Train a classifier to distinguish real from synthetic. A good synthetic dataset should be hard to discriminate.

In [None]:
def train_discriminator(X_real, X_synth, wavelengths_real, wavelengths_synth, test_size=0.3, random_state=42):
    """
    Train classifiers to discriminate between real and synthetic spectra.

    Synthetic spectra are resampled onto the real wavelength grid when the two
    grids differ, both classes are sub-sampled to the same size, and a Random
    Forest and a Gradient Boosting classifier are fitted on a stratified
    train/test split.

    Args:
        X_real: 2D array of real spectra, one row per sample, one column per
            entry of ``wavelengths_real``.
        X_synth: 2D array of synthetic spectra, one row per sample, one column
            per entry of ``wavelengths_synth``.
        wavelengths_real: 1D wavelength axis of ``X_real``.
        wavelengths_synth: 1D wavelength axis of ``X_synth``.
        test_size: Fraction of the balanced dataset held out for evaluation.
        random_state: Integer seed (or None) shared by the class balancing,
            the train/test split and both classifiers, so that repeated calls
            give the same result.

    Returns:
        results: dict with keys ``'rf'`` and ``'gb'``; each value is a dict with
            ``'accuracy'`` (lower is better for synthesis quality),
            ``'auc'`` (0.5 means indistinguishable),
            ``'importance'`` (per-wavelength feature importance) and
            ``'model'`` (the fitted classifier, trained on standardized features).
        wavelengths: ``wavelengths_real``, the axis the features (and therefore
            the importances) are expressed on.
    """
    X_real = np.asarray(X_real)
    X_synth = np.asarray(X_synth)
    grid_real = np.asarray(wavelengths_real)
    grid_synth = np.asarray(wavelengths_synth)

    # Interpolate if wavelengths differ
    if len(grid_real) != len(grid_synth) or not np.allclose(grid_real, grid_synth):
        # np.interp requires increasing sample points and does not check it,
        # so sort the synthetic axis (a no-op when it is already increasing).
        order = np.argsort(grid_synth)
        xp = grid_synth[order]
        X_synth_interp = np.array([np.interp(grid_real, xp, spec[order]) for spec in X_synth])
    else:
        X_synth_interp = X_synth

    # Balance classes with a seeded generator so the sub-sample is reproducible,
    # consistent with the seeded split and classifiers below.
    rng = np.random.default_rng(random_state)
    n_samples = min(len(X_real), len(X_synth_interp))
    idx_real = rng.choice(len(X_real), n_samples, replace=False)
    idx_synth = rng.choice(len(X_synth_interp), n_samples, replace=False)

    # Create dataset
    X = np.vstack([X_real[idx_real], X_synth_interp[idx_synth]])
    y = np.repeat([0, 1], n_samples)  # 0=real, 1=synthetic

    # Split first, then standardize using training statistics only, so that no
    # information about the held-out samples leaks into the preprocessing.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate every classifier through the same code path
    classifiers = {
        'rf': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=random_state, n_jobs=-1),
        'gb': GradientBoostingClassifier(n_estimators=100, max_depth=5, random_state=random_state),
    }
    results = {}
    for key, clf in classifiers.items():
        clf.fit(X_train, y_train)
        results[key] = {
            'accuracy': accuracy_score(y_test, clf.predict(X_test)),
            # Column 1 of predict_proba is the probability of class 1 (synthetic)
            'auc': roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]),
            'importance': clf.feature_importances_,
            'model': clf,
        }

    return results, wavelengths_real

print("Training discriminators...")
print("")

In [None]:
# Run the discriminator on the initial synthesis first, then on the optimized
# one. `wl` is the wavelength axis returned by the first run.
disc_initial, wl = train_discriminator(X_real, X_synth, wavelengths_real, wavelengths_synth)
disc_optimized, _ = train_discriminator(X_real, X_opt, wavelengths_real, wavelengths_opt)


def _report_discriminators(initial, optimized):
    """Print accuracy and AUC of both classifiers, initial vs optimized."""
    rf_i, rf_o = initial['rf'], optimized['rf']
    gb_i, gb_o = initial['gb'], optimized['gb']
    banner = "=" * 70

    print(banner)
    print("DISCRIMINATOR RESULTS")
    print(banner)
    print("")
    print("                              INITIAL          OPTIMIZED")
    print("-" * 70)
    print(f"Random Forest Accuracy:       {rf_i['accuracy']:.3f}            {rf_o['accuracy']:.3f}")
    print(f"Random Forest AUC:            {rf_i['auc']:.3f}            {rf_o['auc']:.3f}")
    print(f"Gradient Boosting Accuracy:   {gb_i['accuracy']:.3f}            {gb_o['accuracy']:.3f}")
    print(f"Gradient Boosting AUC:        {gb_i['auc']:.3f}            {gb_o['auc']:.3f}")
    print("")
    print("Interpretation:")
    print("  - Accuracy near 0.50 = indistinguishable (ideal)")
    print("  - AUC near 0.50 = indistinguishable (ideal)")
    print("  - Lower is better for synthesis quality")
    print(banner)


_report_discriminators(disc_initial, disc_optimized)

In [None]:
# Feature importance per wavelength: where the classifiers look to tell
# synthetic spectra from real ones
fig, axes = plt.subplots(1, 2, figsize=(16, 5))


def _draw_importance(ax, disc, heading):
    """Plot Random Forest and Gradient Boosting importances on one axis."""
    ax.plot(wl, disc['rf']['importance'], 'b-', linewidth=1, alpha=0.7, label='Random Forest')
    ax.plot(wl, disc['gb']['importance'], 'r-', linewidth=1, alpha=0.7, label='Gradient Boosting')
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('Feature Importance')
    ax.set_title(heading)
    ax.legend()
    ax.grid(True, alpha=0.3)


# Left: initial synthesis; right: optimized synthesis
_draw_importance(
    axes[0], disc_initial,
    f'Initial Synthesis (Acc: {disc_initial["rf"]["accuracy"]:.2f})',
)
_draw_importance(
    axes[1], disc_optimized,
    f'Optimized Synthesis (Acc: {disc_optimized["rf"]["accuracy"]:.2f})',
)

plt.suptitle('Discriminator Feature Importance (wavelengths that distinguish real vs synthetic)', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

print("\nHigh importance regions indicate where synthetic spectra differ most from real.")
print("Use this to identify which spectral features need tuning.")

---
## 8. Summary & Recommendations

In [None]:
def _rule(char):
    """Print a 70-character horizontal rule made of `char`."""
    print(char * 70)


def _metric_row(label, initial, optimized, spec=""):
    """Print one table row: a 30-wide label and two right-aligned 15-wide columns.

    `spec` is the precision suffix (e.g. ".3f") applied to both value columns;
    leave it empty for plain text cells such as the header.
    """
    print(f"{label:<30} {initial:>15{spec}} {optimized:>15{spec}}")


# --- Header -----------------------------------------------------------------
_rule("=")
print("SYNTHESIS QUALITY SUMMARY")
_rule("=")
print("")
print(f"Dataset: {dataset_name}")
print(f"Real samples: {X_real.shape[0]}")
print(f"Synthetic samples: {X_opt.shape[0]}")
print("")

# --- Metrics table: initial vs optimized ------------------------------------
print("METRICS COMPARISON:")
_rule("-")
_metric_row('Metric', 'Initial', 'Optimized')
_rule("-")
_metric_row(
    'Similarity Score',
    similarity_metrics['overall_similarity_score'],
    opt_metrics['overall_similarity_score'],
    ".1f",
)
_metric_row(
    'Mean Spectrum Correlation',
    similarity_metrics['mean_spectrum_correlation'],
    opt_metrics['mean_spectrum_correlation'],
    ".4f",
)
_metric_row(
    'Std Spectrum Correlation',
    similarity_metrics['std_spectrum_correlation'],
    opt_metrics['std_spectrum_correlation'],
    ".4f",
)
_metric_row(
    'Noise Ratio',
    similarity_metrics['noise_ratio'],
    opt_metrics['noise_ratio'],
    ".3f",
)
_metric_row(
    'Discriminator Accuracy',
    disc_initial['rf']['accuracy'],
    disc_optimized['rf']['accuracy'],
    ".3f",
)
_rule("-")
print("")

# --- Optimizer output -------------------------------------------------------
print("OPTIMIZED PARAMETERS:")
_rule("-")
for name, val in zip(param_names, result.x, strict=False):
    print(f"  {name:<25}: {val:.6f}")
print("")

# --- Recommendations derived from the optimized metrics ---------------------
print("RECOMMENDATIONS:")
_rule("-")

# Noise ratio outside [0.8, 1.2] triggers a noise-parameter hint
if opt_metrics['noise_ratio'] < 0.8:
    print("  - Increase noise_base or noise_signal_dep (synthetic is too clean)")
elif opt_metrics['noise_ratio'] > 1.2:
    print("  - Decrease noise_base or noise_signal_dep (synthetic is too noisy)")

# Random Forest accuracy decides the overall verdict
if disc_optimized['rf']['accuracy'] > 0.7:
    print("  - Synthetic is still distinguishable. Consider:")
    # The three wavelengths with the largest importance (ascending order)
    importance = disc_optimized['rf']['importance']
    top_idx = np.argsort(importance)[-3:]
    top_wl = wl[top_idx]
    print(f"    - Investigating wavelengths: {', '.join(f'{w:.0f}' for w in top_wl)} nm")
    print("    - Adjusting component spectra or adding more realistic effects")
elif disc_optimized['rf']['accuracy'] < 0.6:
    print("  - Excellent! Synthetic spectra are nearly indistinguishable from real.")
else:
    print("  - Good quality. Minor adjustments may improve further.")

_rule("=")

In [None]:
# Collect everything needed to reproduce the optimized synthesis:
# the wavelength grid, the tuned parameters (in optimizer order) and the
# quality figures reached with them.
optimized_config = {
    'wavelength_start': wl_start,
    'wavelength_end': wl_end,
    'wavelength_step': wl_step,
    # Position in this tuple is the index into result.x
    **{
        key: result.x[position]
        for position, key in enumerate((
            'path_length_std',
            'baseline_amplitude',
            'global_slope_mean',
            'global_slope_std',
            'scatter_alpha_std',
            'noise_base',
            'noise_signal_dep',
        ))
    },
    'similarity_score': opt_metrics['overall_similarity_score'],
    'discriminator_accuracy': disc_optimized['rf']['accuracy'],
}

# Emit the configuration as a copy-pasteable dict literal
print("Optimized configuration (copy for reuse):")
print("")
print("optimized_config = {")
for k, v in optimized_config.items():
    # Floats get a fixed six-decimal rendering; anything else prints as-is
    print(f"    '{k}': {v:.6f}," if isinstance(v, float) else f"    '{k}': {v},")
print("}")