# HDF5 File Exploration with h5py

This notebook demonstrates how to open, navigate, and inspect HDF5 files commonly
produced at APS BER beamlines. We cover the essential h5py operations:
opening files, listing groups, reading datasets, examining attributes, and
understanding chunking and compression.

**Prerequisites**: `pip install h5py numpy`

In [None]:
import h5py
import numpy as np
from pathlib import Path

# Location of the HDF5 file explored throughout this notebook -- point it at a
# file on your machine. Sample files are available from TomoBank:
# https://tomobank.readthedocs.io/
FILEPATH = "sample_data.h5"

# Report the library versions in use (useful when comparing runs across machines).
print(
    f"h5py version: {h5py.__version__}",
    f"HDF5 version: {h5py.version.hdf5_version}",
    sep="\n",
)

In [None]:
# Opening an HDF5 file and printing the full tree structure

def print_hdf5_tree(filepath):
    """Print the complete tree structure of an HDF5 file.

    Opens the file read-only, prints its path, on-disk size and root-level
    keys, then walks every group and dataset with ``visititems`` and prints
    one line per object, indented by its depth in the hierarchy.

    Args:
        filepath: Path to the HDF5 file to inspect (anything accepted by both
            ``h5py.File`` and ``pathlib.Path``).
    """
    with h5py.File(filepath, "r") as f:
        print(f"File: {filepath}")
        print(f"File size: {Path(filepath).stat().st_size / 1e6:.1f} MB")
        print(f"Root groups: {list(f.keys())}")
        print("\nFull tree:")
        # Horizontal rule made of U+2500 (BOX DRAWINGS LIGHT HORIZONTAL).
        # Written as an escape so a wrong file encoding cannot turn it into
        # mojibake.
        print("\u2500" * 60)

        def visitor(name, obj):
            """Print one tree line for the object found at HDF5 path ``name``."""
            # The number of "/" separators in the visited path is used as the
            # nesting depth, so top-level entries get no indentation.
            indent = "  " * name.count("/")
            leaf = name.split("/")[-1]
            if isinstance(obj, h5py.Group):
                n_attrs = len(obj.attrs)
                attr_str = f"  ({n_attrs} attrs)" if n_attrs > 0 else ""
                print(f"{indent}[G] {leaf}/{attr_str}")
            elif isinstance(obj, h5py.Dataset):
                shape_str = str(obj.shape)
                dtype_str = str(obj.dtype)
                size_mb = obj.nbytes / 1e6  # uncompressed, in-memory size
                print(f"{indent}[D] {leaf}  {shape_str}  {dtype_str}  ({size_mb:.1f} MB)")

        f.visititems(visitor)

print_hdf5_tree(FILEPATH)

In [None]:
# Navigating groups and accessing datasets

with h5py.File(FILEPATH, "r") as f:
    # Files and groups behave like dictionaries keyed by member name
    root_keys = list(f.keys())
    print(f"Root-level entries: {root_keys}")

    # Access each root-level member (adjust paths based on your file)
    # Common paths: /exchange, /MAPS, /entry
    for key in root_keys:
        obj = f[key]
        if isinstance(obj, h5py.Group):
            print(f"\nGroup '/{key}/' contains: {list(obj.keys())}")
        elif isinstance(obj, h5py.Dataset):
            print(f"\nDataset '/{key}': shape={obj.shape}, dtype={obj.dtype}")

    def find_first_dataset(name, obj):
        """Return ``name`` if ``obj`` is a dataset, otherwise None.

        ``visititems`` keeps walking while the callback returns None and stops
        as soon as it returns anything else, passing that value back to the
        caller. Returning the path therefore ends the traversal at the first
        dataset, with no shared mutable state needed.
        """
        if isinstance(obj, h5py.Dataset):
            return name
        return None

    # Path of the first dataset found, or None if the file contains no datasets
    first_dataset_path = f.visititems(find_first_dataset)

    if first_dataset_path:
        # Indexing the file returns a lazy Dataset handle -- no data is read
        # from disk until the dataset itself is sliced.
        dset = f[first_dataset_path]
        print(f"\nFirst dataset: /{first_dataset_path}")
        print(f"  Shape: {dset.shape}")
        print(f"  Dtype: {dset.dtype}")
        print(f"  Chunks: {dset.chunks}")
        print(f"  Compression: {dset.compression}")
        print(f"  Compression opts: {dset.compression_opts}")

In [None]:
# Reading and inspecting attributes

def _format_attr_value(attr_value):
    """Return an attribute value with any byte strings decoded to ``str``.

    Scalar ``bytes`` values and NumPy byte-string arrays (dtype kind ``'S'``)
    of any shape are decoded as UTF-8, with undecodable bytes replaced rather
    than raising. Every other value is returned unchanged.
    """
    if isinstance(attr_value, bytes):
        return attr_value.decode("utf-8", errors="replace")
    if isinstance(attr_value, np.ndarray) and attr_value.dtype.kind == "S":
        # np.char.decode works element-wise on arrays of any rank; tolist()
        # yields (nested) lists of str for a compact printout.
        return np.char.decode(attr_value, "utf-8", errors="replace").tolist()
    return attr_value


with h5py.File(FILEPATH, "r") as f:

    def print_all_attributes(name, obj):
        """Print the attributes attached to the group or dataset at ``name``."""
        if len(obj.attrs) > 0:
            print(f"\n/{name}")
            for attr_name, attr_value in obj.attrs.items():
                print(f"  @{attr_name} = {_format_attr_value(attr_value)}")

    # Root-level attributes are printed here explicitly, before walking the
    # rest of the tree. They get the same byte-string handling as everything else.
    if len(f.attrs) > 0:
        print("/  (root attributes)")
        for k, v in f.attrs.items():
            print(f"  @{k} = {_format_attr_value(v)}")

    f.visititems(print_all_attributes)

In [None]:
# Reading data: full load vs. partial (slice) access

with h5py.File(FILEPATH, "r") as f:

    def find_nd_dataset(name, obj):
        """Return ``name`` for the first usable multi-dimensional dataset.

        A dataset qualifies when it has at least two dimensions, no
        zero-length axis (so a first frame exists), and a boolean/integer/float
        dtype (so the NaN-aware statistics below are defined). Returning a
        non-None value makes ``visititems`` stop and hand that value back;
        returning None continues the walk.
        """
        if (
            isinstance(obj, h5py.Dataset)
            and obj.shape is not None
            and len(obj.shape) >= 2
            and all(dim > 0 for dim in obj.shape)
            and obj.dtype.kind in "biuf"
        ):
            return name
        return None

    # A `with` block does not create a scope, so there is no enclosing function
    # for `nonlocal` to bind to at cell level. The search result is taken from
    # the return value of visititems instead.
    target = f.visititems(find_nd_dataset)

    if target:
        dset = f[target]
        print(f"Dataset: /{target}, shape={dset.shape}, size={dset.nbytes/1e6:.1f} MB")

        # Method 1: Load entire dataset into memory
        if dset.nbytes < 100e6:  # Only if < 100 MB
            full_data = dset[:]
            print(f"  Full load: {full_data.shape}, {full_data.nbytes/1e6:.1f} MB")

        # Method 2: Load a single slice (much more memory efficient).
        # Index 0 along axis 0; for a 2-D dataset this is a single row.
        single_frame = dset[0]
        print(f"  Single frame: {single_frame.shape}")

        # Method 3: Load a region of interest over the last two axes
        h, w = dset.shape[-2], dset.shape[-1]
        roi = dset[..., h//4:3*h//4, w//4:3*w//4]  # Central 50%
        print(f"  Central ROI: {roi.shape}")

        # Compute statistics without loading everything: reuse the frame that
        # was already read instead of hitting the file a second time.
        print("\n  Statistics (first frame):")
        frame = single_frame
        print(f"    Min:    {np.nanmin(frame)}")
        print(f"    Max:    {np.nanmax(frame)}")
        print(f"    Mean:   {np.nanmean(frame):.4f}")
        print(f"    Std:    {np.nanstd(frame):.4f}")
        print(f"    NaNs:   {np.isnan(frame).sum() if np.issubdtype(frame.dtype, np.floating) else 'N/A'}")
    else:
        print("No multi-dimensional dataset found.")

In [None]:
# Understanding chunking and compression

with h5py.File(FILEPATH, "r") as f:
    # Build the header once so the rule below always matches its width.
    header = f"{'Path':<50s} {'Shape':<20s} {'Chunks':<20s} {'Compression':<12s} {'Ratio':<8s}"
    print("Dataset storage analysis:")
    print(header)
    print("=" * len(header))

    def analyze_storage(name, obj):
        """Print one table row describing how the dataset at ``name`` is stored.

        Groups and zero-byte datasets are skipped. The ratio is the in-memory
        size (``nbytes``) divided by the space allocated in the file.
        """
        if not isinstance(obj, h5py.Dataset) or obj.nbytes <= 0:
            return

        raw_size = obj.nbytes
        # Space allocated in the file for this dataset
        stored_size = obj.id.get_storage_size()
        if stored_size > 0:
            ratio_str = f"{raw_size / stored_size:.1f}x"
        else:
            # No ratio can be computed from a stored size of zero; report
            # that instead of a meaningless, enormous number.
            ratio_str = "n/a"

        comp = obj.compression if obj.compression else "none"
        chunks = str(obj.chunks) if obj.chunks else "contiguous"

        # Pad the full "/path" to the same 50 columns as the header so the
        # remaining columns line up underneath their titles.
        path = f"/{name}"
        print(f"{path:<50s} {str(obj.shape):<20s} {chunks:<20s} {comp:<12s} {ratio_str}")

    f.visititems(analyze_storage)

# Reading guide for the storage table, emitted as one newline-separated print.
print(
    "\nNotes:",
    "  - Ratio > 1 means data is compressed (higher = more compression)",
    "  - Chunked datasets allow efficient partial reads",
    "  - Chunk shape should align with typical access patterns",
    sep="\n",
)

## Summary

Key h5py operations covered in this notebook:

| Operation | Code |
|-----------|------|
| Open file | `h5py.File(path, "r")` |
| List groups | `list(f.keys())` or `list(f["group"].keys())` |
| Walk tree | `f.visititems(callback)` |
| Read dataset | `data = f["/path/to/dataset"][:]` |
| Slice dataset | `frame = f["/path/to/dataset"][0, :, :]` |
| Read attribute | `val = f["/path"].attrs["name"]` |
| Check chunks | `f["/path/to/dataset"].chunks` |
| Check compression | `f["/path/to/dataset"].compression` |

**Next**: See [02_data_visualization.ipynb](02_data_visualization.ipynb) for visualizing
synchrotron data arrays.