# Tomography Data Exploratory Data Analysis

This notebook provides a systematic EDA workflow for synchrotron tomography data
stored in Data Exchange HDF5 format. We cover projection quality assessment,
sinogram visualization, ring artifact detection, rotation center estimation,
flat/dark field analysis, histogram analysis, and zinger (cosmic ray) detection.

**Prerequisites**: `pip install h5py numpy matplotlib scipy tomopy`

In [None]:
import h5py
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import median_filter

# Render every figure in this notebook at 120 DPI.
plt.rcParams["figure.dpi"] = 120

# Path of the tomography HDF5 file to analyse -- edit this before running.
FILEPATH = "tomo_scan.h5"

# Read the stack dimensions, the rotation angles, and the flat/dark reference
# frames. Only the *shape* of the projection stack is read here; the cells
# below read the projection data themselves.
with h5py.File(FILEPATH, "r") as f:
    exchange = f["/exchange"]
    nproj, nrow, ncol = exchange["data"].shape
    theta = exchange["theta"][:]
    flat, dark = (
        exchange[name][:].astype(np.float32)
        for name in ("data_white", "data_dark")
    )

# Average the reference frames along their first (frame) axis.
flat_mean = flat.mean(axis=0)
dark_mean = dark.mean(axis=0)

# theta is passed through np.degrees, i.e. it is treated as radians here.
first_deg = np.degrees(theta[0])
last_deg = np.degrees(theta[-1])
step_deg = np.degrees(np.median(np.diff(theta)))

print(f"Projections: {nproj}, Image: {nrow} x {ncol}")
print(f"Flat fields: {flat.shape[0]}, Dark fields: {dark.shape[0]}")
print(f"Theta: {first_deg:.2f} to {last_deg:.2f} deg")
print(f"Angular step: {step_deg:.4f} deg")

In [None]:
# Projection quality: mean intensity vs. angle and outlier detection

# Per-projection statistics, filled one frame at a time so only a single
# projection is held in memory at once.
means = np.zeros(nproj)
stds = np.zeros(nproj)

with h5py.File(FILEPATH, "r") as f:
    dset = f["/exchange/data"]
    for idx in range(nproj):
        img = dset[idx].astype(np.float32)
        means[idx] = img.mean()
        stds[idx] = img.std()

# Flag projections whose mean lies more than 5 MADs (median absolute
# deviations) from the median mean. The MAD is floored at 1e-6 so the
# threshold never collapses to zero.
med = np.median(means)
deviation = np.abs(means - med)
mad = np.median(deviation)
outlier_mask = deviation > 5 * max(mad, 1e-6)

# theta is treated as radians and converted once for both panels.
theta_deg = np.degrees(theta)

fig, axes = plt.subplots(2, 1, figsize=(14, 7), sharex=True)
ax_mean, ax_std = axes

ax_mean.plot(theta_deg, means, "b-", lw=0.5, label="Mean")
if outlier_mask.any():
    ax_mean.scatter(
        theta_deg[outlier_mask],
        means[outlier_mask],
        c="red",
        s=20,
        zorder=5,
        label=f"Outliers ({outlier_mask.sum()})",
    )
ax_mean.set_ylabel("Mean Intensity")
ax_mean.set_title("Projection Quality: Mean Intensity vs. Angle")
ax_mean.legend()

ax_std.plot(theta_deg, stds, "g-", lw=0.5)
ax_std.set_xlabel("Rotation Angle (degrees)")
ax_std.set_ylabel("Std Deviation")
ax_std.set_title("Projection Contrast (Std Dev)")

plt.tight_layout()
plt.show()

print(f"Intensity range: {means.min():.0f} to {means.max():.0f}")
print(f"Outlier projections: {np.flatnonzero(outlier_mask)}")

In [None]:
# Sinogram visualization and ring artifact detection

# Use the central detector row as the representative sinogram.
sino_row = nrow // 2

with h5py.File(FILEPATH, "r") as f:
    sino_raw = f["/exchange/data"][:, sino_row, :].astype(np.float32)

# Flat/dark-field normalization of the sinogram. The denominator is clipped to
# at least 1 so a flat-minus-dark difference of zero cannot divide by zero.
flat_row = flat_mean[sino_row, :]
dark_row = dark_mean[sino_row, :]
sino_norm = (sino_raw - dark_row) / np.clip(flat_row - dark_row, 1, None)

fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Raw sinogram
axes[0, 0].imshow(sino_raw, cmap="gray", aspect="auto", origin="lower")
axes[0, 0].set_title(f"Raw Sinogram (row={sino_row})")
axes[0, 0].set_ylabel("Projection Index")

# Normalized sinogram, displayed between its 1st and 99th percentiles so a few
# extreme pixels do not wash out the contrast.
axes[0, 1].imshow(sino_norm, cmap="gray", aspect="auto", origin="lower",
                  vmin=np.percentile(sino_norm, 1), vmax=np.percentile(sino_norm, 99))
axes[0, 1].set_title("Normalized Sinogram")

# Column-wise mean profile (ring artifact indicator): a column whose mean over
# all angles deviates strongly from the median column mean is flagged.
col_means = np.mean(sino_norm, axis=0)
col_median = np.median(col_means)
col_mad = np.median(np.abs(col_means - col_median))
ring_threshold = 3.0
# Floor the MAD (same 1e-6 floor as the projection-outlier check). Without it,
# a MAD of exactly 0 turns the threshold into 0 and every column that differs
# from the median at all would be reported as a ring.
col_deviation = np.abs(col_means - col_median)
ring_cols = np.where(col_deviation > ring_threshold * max(col_mad, 1e-6))[0]

axes[1, 0].plot(col_means, "b-", lw=0.5)
if len(ring_cols) > 0:
    axes[1, 0].scatter(ring_cols, col_means[ring_cols], c="red", s=10, zorder=5)
axes[1, 0].set_xlabel("Column Index")
axes[1, 0].set_ylabel("Mean Across Angles")
axes[1, 0].set_title(f"Column Profile ({len(ring_cols)} potential ring columns)")

# Flat field quality: the title reports the coefficient of variation (std / mean)
# of the averaged flat field.
axes[1, 1].imshow(flat_mean, cmap="gray", origin="lower")
axes[1, 1].set_title(f"Mean Flat Field (CV={flat_mean.std()/flat_mean.mean():.3f})")

plt.tight_layout()
plt.show()

# Only the first 20 flagged columns are printed to keep the output short.
print(f"Ring artifact columns: {ring_cols[:20]}{'...' if len(ring_cols) > 20 else ''}")

In [None]:
# Rotation center estimation

# tomopy is an optional dependency. Only the import itself is guarded, so an
# error raised by the analysis below is never misreported as a missing package.
try:
    import tomopy
except ImportError:
    tomopy = None
    print("tomopy not installed -- skipping rotation center analysis.")
    print("Install with: pip install tomopy")

if tomopy is not None:
    # Load a thin slab of rows around the detector mid-row for center finding.
    # The bounds are clamped so detectors with fewer than 10 rows do not
    # produce a negative start index.
    mid = nrow // 2
    row_lo = max(mid - 5, 0)
    row_hi = min(mid + 5, nrow)

    with h5py.File(FILEPATH, "r") as f:
        proj_chunk = f["/exchange/data"][:, row_lo:row_hi, :].astype(np.float32)

    # The mean flat/dark frames get a leading axis of length 1 so they are
    # passed to tomopy.normalize as 3-D stacks.
    proj_norm = tomopy.normalize(proj_chunk,
                                 flat_mean[row_lo:row_hi, :][np.newaxis, :, :],
                                 dark_mean[row_lo:row_hi, :][np.newaxis, :, :])
    proj_norm = tomopy.minus_log(proj_norm)

    # Find center using Vo's method
    center = tomopy.find_center_vo(proj_norm)
    print(f"Estimated rotation center (Vo): {center:.2f}")
    print(f"Image center: {ncol / 2:.1f}")
    print(f"Offset: {center - ncol / 2:.2f} pixels")

    # Visual check: reconstruct one slice at nine centers, 1 px apart,
    # spanning center - 4 .. center + 4.
    test_range = np.arange(center - 4, center + 4.5, 1)
    fig, axes = plt.subplots(1, len(test_range), figsize=(3 * len(test_range), 3))

    # Position of the detector mid-row inside the loaded slab.
    mid_local = mid - row_lo
    sino_1 = proj_norm[:, mid_local:mid_local + 1, :]

    # NOTE(review): tomopy.recon expects theta in radians. This notebook treats
    # /exchange/theta as radians throughout (it is converted with np.degrees
    # for display) -- confirm the units stored in the file.
    for ax, c in zip(axes, test_range):
        rec = tomopy.recon(sino_1, theta, center=c, algorithm="gridrec")
        ax.imshow(rec[0], cmap="gray",
                  vmin=np.percentile(rec, 2), vmax=np.percentile(rec, 98))
        # Two decimals: the estimated center is sub-pixel, and rounding to an
        # integer would mislabel the panels (and duplicate labels for x.5).
        ax.set_title(f"c={c:.2f}", fontsize=9)
        ax.axis("off")

    plt.suptitle("Rotation Center Scan", fontsize=12)
    plt.tight_layout()
    plt.show()

In [None]:
# Histogram analysis: raw, normalized, and absorption

# Work on at most 50 projections, evenly spaced across the scan, for efficiency
n_sample = min(50, nproj)
sample_idx = np.linspace(0, nproj - 1, n_sample, dtype=int)

with h5py.File(FILEPATH, "r") as f:
    dset = f["/exchange/data"]
    sample_proj = np.stack([dset[i].astype(np.float32) for i in sample_idx])

# Flat/dark normalization (denominator clipped to >= 1), then -log of the
# transmission, which is clipped at 1e-6 so the logarithm stays defined.
gain = np.clip(flat_mean - dark_mean, 1, None)
norm_proj = (sample_proj - dark_mean) / gain
absorp = -np.log(np.clip(norm_proj, 1e-6, None))
valid_abs = absorp[np.isfinite(absorp)]

fig, axes = plt.subplots(1, 3, figsize=(17, 4))

# One log-scaled, 200-bin histogram per panel: (values, colour, title, x label).
panels = (
    (sample_proj.ravel(), "steelblue", "Raw Projection Histogram", "Detector Counts"),
    (norm_proj.ravel(), "darkorange", "Normalized (I/I0) Histogram", "Transmission"),
    (valid_abs, "forestgreen", "Absorption (-log) Histogram", "Absorption (a.u.)"),
)
for ax, (values, colour, title, xlabel) in zip(axes, panels):
    ax.hist(values, bins=200, log=True, color=colour, edgecolor="none")
    ax.set_title(title)
    ax.set_xlabel(xlabel)

# Mark full transmission on the normalized panel.
axes[1].axvline(1.0, color="red", ls="--", label="I/I0 = 1")
axes[1].legend()

plt.tight_layout()
plt.show()

# Summary
over_unity = norm_proj > 1.0
print(f"Raw: min={sample_proj.min():.0f}, max={sample_proj.max():.0f}")
print(f"Normalized: min={norm_proj.min():.4f}, max={norm_proj.max():.4f}")
print(f"Absorption: min={valid_abs.min():.4f}, max={valid_abs.max():.4f}")
print(f"Pixels with transmission > 1.0: {over_unity.sum()} ({100*over_unity.mean():.2f}%)")

In [None]:
# Zinger (cosmic ray) detection

def detect_zingers(projection, threshold=10.0):
    """Return a boolean mask of pixels that stand out from their 3x3 median.

    Parameters
    ----------
    projection : numpy.ndarray
        Image to inspect; it is cast to float before filtering.
    threshold : float, optional
        Multiple of the noise scale a pixel's deviation must exceed.

    Returns
    -------
    numpy.ndarray
        Boolean array with the same shape as ``projection``; True where the
        absolute difference to the median-filtered image is greater than
        ``threshold`` times the noise scale.

    Notes
    -----
    The noise scale is the median of the non-zero absolute differences
    (1.0 when there are none) and is never allowed below 1.0.
    """
    image = projection.astype(float)
    residual = np.abs(image - median_filter(image, size=3))

    nonzero = residual[residual > 0]
    scale = np.median(nonzero) if nonzero.size else 1.0

    return residual > threshold * max(scale, 1.0)

# Check up to 100 projections, evenly spaced across the scan.
check_idx = np.linspace(0, nproj - 1, min(100, nproj), dtype=int)

# Count flagged pixels in each sampled projection.
with h5py.File(FILEPATH, "r") as f:
    dset = f["/exchange/data"]
    zinger_counts = [
        detect_zingers(dset[i].astype(np.float32)).sum()
        for i in check_idx
    ]
total_zingers = sum(zinger_counts)

fig, ax = plt.subplots(figsize=(12, 3))
bar_width = max(1, nproj / 200)
ax.bar(check_idx, zinger_counts, width=bar_width, color="crimson")
ax.set_xlabel("Projection Index")
ax.set_ylabel("Zinger Count")
ax.set_title(f"Zinger Detection (total: {total_zingers} in {len(check_idx)} sampled projections)")
plt.tight_layout()
plt.show()

# Report the average and the worst sampled projection.
worst_proj = check_idx[np.argmax(zinger_counts)]
print(f"Mean zingers per projection: {np.mean(zinger_counts):.1f}")
print(f"Max zingers in one projection: {np.max(zinger_counts)} (proj #{worst_proj})")

## EDA Summary

After running this notebook, you should have assessed:

1. **Data dimensions** -- Confirmed projection count, image size, angular range
2. **Projection stability** -- Mean intensity vs. angle, identified outlier frames
3. **Sinogram quality** -- Checked for ring artifacts via column profiles
4. **Rotation center** -- Estimated and visually verified
5. **Histogram distributions** -- Raw, normalized, and absorption distributions
6. **Zingers** -- Counted cosmic ray artifacts per projection

**Next steps**: Apply preprocessing corrections (stripe removal, zinger correction,
phase retrieval) based on findings, then proceed to reconstruction with TomoPy or
TomocuPy.