# HDF5 I/O with h5io

This notebook demonstrates how to save and load arrays and DataFrames using `neural_analysis.utils.h5io` and how to filter pairs when loading via `load_hdf5`.

In [1]:
# Imports and helpers
from pathlib import Path
import tempfile
import numpy as np
import pandas as pd
from neural_analysis.utils import h5io
from neural_analysis.utils.io import load_hdf5

tmpdir = tempfile.TemporaryDirectory()
base = Path(tmpdir.name)
print("Using temp dir:", base)

Using temp dir: /tmp/tmp2q3ekzy9


In [2]:
# Example 1: Array roundtrip with labels and attrs
path = base / 'array_demo.h5'
data = np.random.randn(100, 10).astype(np.float32)
labels = np.array([f'sample_{i}' for i in range(data.shape[0])])
attrs = {
    'description': 'random normal features',
    'version': 1,
    'metadata': {'source': 'synthetic', 'dims': list(data.shape)}
}

# Save
h5io(path, task='save', data=data, labels=labels, attrs=attrs)
print('Saved to', path)

# Load
loaded_data, loaded_labels = h5io(path, task='load')

# Validate roundtrip
assert isinstance(loaded_data, np.ndarray)
np.testing.assert_allclose(loaded_data, data)
assert list(loaded_labels) == list(labels)
print('Array roundtrip OK:', loaded_data.shape)

Saved to /tmp/tmp2q3ekzy9/array_demo.h5
Array roundtrip OK: (100, 10)


In [3]:
# Example 2: DataFrame roundtrip
path_df = base / 'df_demo.h5'
df = pd.DataFrame({
    'neuron_id': [f'n{i}' for i in range(5)],
    'firing_rate': np.random.rand(5),
    'condition': ['A', 'B', 'A', 'B', 'A'],
})
labels_df = ['trial_1', 'trial_2', 'trial_3', 'trial_4', 'trial_5']

h5io(path_df, task='save', data=df, labels=labels_df)
loaded_df, loaded_labels_df = h5io(path_df, task='load')

# Validate
assert isinstance(loaded_df, pd.DataFrame)
pd.testing.assert_frame_equal(loaded_df.reset_index(drop=True), df.reset_index(drop=True))
assert list(loaded_labels_df) == labels_df
print('DataFrame roundtrip OK:', loaded_df.shape)

DataFrame roundtrip OK: (5, 3)


In [4]:
# Example 3: Filtering pairs on load (via load_hdf5)
path_pairs = base / 'pairs_demo.h5'
pairs_df = pd.DataFrame({
    'item_i': ['A', 'A', 'B', 'C'],
    'item_j': ['B', 'C', 'C', 'D'],
    'score': [0.1, 0.8, 0.5, 0.9],
})
h5io(path_pairs, task='save', data=pairs_df, labels=None)

wanted = [('A','C'), ('B','C')]
(loaded_filtered, _), attrs = load_hdf5(path_pairs, filter_pairs=wanted, return_attrs=True)
print('Filtered rows:')
display(loaded_filtered)

# Validate only desired pairs
assert set(zip(loaded_filtered['item_i'], loaded_filtered['item_j'])) == set(wanted)
print('Filter pairs OK')

Filtered rows:


Unnamed: 0,item_i,item_j,score
0,A,C,0.8
1,B,C,0.5


Filter pairs OK


## Advanced HDF5 Operations

The following examples demonstrate the hierarchical HDF5 structure and advanced functions for saving/loading result datasets with mixed scalar and array data.

In [5]:
# Example 4: save_result_to_hdf5_dataset with mixed data types
from neural_analysis.utils.io import (
    save_result_to_hdf5_dataset,
    load_results_from_hdf5_dataset,
    get_hdf5_dataset_names,
    get_hdf5_result_summary
)

# Create a hierarchical HDF5 file with multiple result datasets
hdf5_path = base / 'hierarchical_results.h5'

# Save multiple results under different comparison groups
for session in ['session_001', 'session_002']:
    for condition_pair in [('condA', 'condB'), ('condA', 'condC'), ('condB', 'condC')]:
        # Generate sample results
        result_key = f"{condition_pair[0]}_vs_{condition_pair[1]}_wasserstein"
        value = np.random.rand() * 100  # Scalar metric
        
        # Generate pairwise comparison data (could be large arrays)
        n_pairs = 50
        pair_indices = [(i, j) for i in range(10) for j in range(10) if i < j][:n_pairs]
        pair_values = np.random.rand(n_pairs)
        
        # Create structured array that HDF5 can handle
        # Store as separate arrays: indices and values
        pairs_indices = np.array(pair_indices, dtype=np.int32)
        pairs_vals = np.array(pair_values, dtype=np.float64)
        
        # Save with scalar attributes and array datasets
        save_result_to_hdf5_dataset(
            save_path=hdf5_path,
            dataset_name=session,
            result_key=result_key,
            scalar_data={
                'dataset_i': condition_pair[0],
                'dataset_j': condition_pair[1],
                'metric': 'wasserstein',
                'value': float(value),
                'n_samples_i': 100,
                'n_samples_j': 100,
                'timestamp': '2024-01-15'
            },
            array_data={
                'pairs_indices': pairs_indices,
                'pairs_values': pairs_vals
            }
        )

print(f"✓ Saved hierarchical results to {hdf5_path.name}")
print(f"  Structure: dataset_name / result_key / {{scalar_data, array_data}}")

✓ Saved hierarchical results to hierarchical_results.h5
  Structure: dataset_name / result_key / {scalar_data, array_data}


### Viewing Dataset Names

Use `get_hdf5_dataset_names()` to list all comparison groups and result keys in the HDF5 file.

In [6]:
# List all datasets in the hierarchical HDF5 file
dataset_names = get_hdf5_dataset_names(hdf5_path)

print("Dataset names in HDF5 file:")
for name in dataset_names:
    print(f"  {name}")

Dataset names in HDF5 file:
  session_001
  session_002


### Loading with Filtering

Use `load_results_from_hdf5_dataset()` to load results with optional filtering by comparison name, dataset names, or metric.

In [7]:
# Example 6a: Load all results from session_001
results_session1 = load_results_from_hdf5_dataset(
    save_path=hdf5_path,
    dataset_name='session_001'
)

print(f"Loaded {len(results_session1['session_001'])} results from session_001")
print("\nSample result keys:")
for key in list(results_session1['session_001'].keys())[:3]:
    print(f"  {key}")

Loaded 3 results from session_001

Sample result keys:
  condA_vs_condB_wasserstein
  condA_vs_condC_wasserstein
  condB_vs_condC_wasserstein


### Summary DataFrame

Use `get_hdf5_result_summary()` to generate a pandas DataFrame with all results and their metadata for easy analysis.

In [8]:
# Example 7: Generate summary DataFrame
summary_df = get_hdf5_result_summary(
    save_path=hdf5_path,
    dataset_name='session_001'
)

print("Summary DataFrame:")
print(summary_df[['dataset_i', 'dataset_j', 'metric', 'value', 'n_samples_i', 'n_samples_j']].head(10))

Summary DataFrame:
  dataset_i dataset_j       metric      value  n_samples_i  n_samples_j
0     condA     condB  wasserstein  10.734410          100          100
1     condA     condC  wasserstein  59.203765          100          100
2     condB     condC  wasserstein  22.089291          100          100


### Accessing Array Data

Results can contain both scalar attributes and large array datasets (like pairwise comparison matrices).

In [9]:
# Example 6b: Access specific result details
result_key = list(results_session1['session_001'].keys())[0]
result = results_session1['session_001'][result_key]

print(f"Result: {result_key}")
print(f"  Attributes: {result['attributes']}")
print(f"  Arrays available: {list(result['arrays'].keys())}")

# Access the pairs arrays (now split into indices and values)
pairs_indices = result['arrays']['pairs_indices']
pairs_values = result['arrays']['pairs_values']
print(f"  Pairs shape: {pairs_indices.shape}, values: {pairs_values.shape}")

Result: condA_vs_condB_wasserstein
  Attributes: {'dataset_i': 'condA', 'dataset_j': 'condB', 'metric': 'wasserstein', 'n_samples_i': np.int64(100), 'n_samples_j': np.int64(100), 'timestamp': '2024-01-15', 'value': np.float64(10.734410362419588)}
  Arrays available: ['pairs_indices', 'pairs_values']
  Pairs shape: (45, 2), values: (50,)


## Phase 4B: Auto-Save/Load with compare_datasets()

The `compare_datasets()` function now supports automatic result caching via HDF5:
- **save_path**: Path to HDF5 file for caching results
- **regenerate**: Force recomputation even if cached result exists
- **dataset_names**: Required for mode="between" to identify datasets in cache

Benefits:
- **Instant loading**: Skip expensive computations for repeated analyses
- **Reproducibility**: Cached results with metadata
- **Easy comparison**: Test different metrics without recomputing

In [10]:
# Import metrics functions
from neural_analysis.metrics.pairwise_metrics import compare_datasets
import numpy as np
import time

# Generate test datasets
np.random.seed(42)
control_data = np.random.randn(100, 10)
treatment_data = np.random.randn(100, 10) + 0.5  # Shifted distribution

# Path for caching
cache_file = base / "comparison_cache.h5"

print("="*60)
print("AUTO-SAVE/LOAD DEMONSTRATION")
print("="*60)

AUTO-SAVE/LOAD DEMONSTRATION


In [11]:
# First call: Compute and save
print("\n1. First call: Computing and saving...")
start = time.time()
result1 = compare_datasets(
    control_data,
    treatment_data,
    mode="between",
    metric="wasserstein",
    save_path=cache_file,
    dataset_names=("control", "treatment"),
)
elapsed1 = time.time() - start

# Result is a dict with 'value' key for between mode
result1_value = result1['value'] if isinstance(result1, dict) else result1
print(f"   Result: {result1_value:.6f}")
print(f"   Time: {elapsed1:.4f}s")
print(f"   Cache file created: {cache_file.exists()}")


1. First call: Computing and saving...
   Result: 5.586092
   Time: 1.9650s
   Cache file created: True


In [12]:
# Second call: Load from cache (instant!)
print("\n2. Second call: Loading from cache...")
start = time.time()
result2 = compare_datasets(
    control_data,  # These datasets are ignored - loads from cache
    treatment_data,
    mode="between",
    metric="wasserstein",
    save_path=cache_file,
    dataset_names=("control", "treatment"),
    regenerate=False,  # Default: use cached result
)
elapsed2 = time.time() - start

# When loading from cache, result is a float directly
result2_value = result2 if isinstance(result2, (int, float)) else result2['value']
print(f"   Result: {result2_value:.6f}")
print(f"   Time: {elapsed2:.4f}s")
print(f"   Speedup: {elapsed1/elapsed2:.1f}x faster!")
print(f"   Results match: {result2_value == result1_value}")


2. Second call: Loading from cache...
   Result: 5.586092
   Time: 0.0018s
   Speedup: 1110.7x faster!
   Results match: True


In [13]:
# Force regeneration with modified data
print("\n3. Force regeneration (regenerate=True)...")
treatment_modified = treatment_data + 0.5  # Further shift

start = time.time()
result3 = compare_datasets(
    control_data,
    treatment_modified,  # Different data
    mode="between",
    metric="wasserstein",
    save_path=cache_file,
    dataset_names=("control", "treatment"),
    regenerate=True,  # Force recomputation
)
elapsed3 = time.time() - start

# Result is a dict with 'value' key for between mode
result3_value = result3['value'] if isinstance(result3, dict) else result3
print(f"   Result: {result3_value:.6f}")
print(f"   Time: {elapsed3:.4f}s")
print(f"   Changed: {abs(result3_value - result1_value) > 0.001}")


3. Force regeneration (regenerate=True)...
   Result: 10.520632
   Time: 0.0012s
   Changed: True


### All-Pairs Mode with Caching

All-pairs mode also supports caching for efficiency when comparing multiple datasets.

In [14]:
# Create multiple datasets
datasets = {
    "control": np.random.randn(50, 8),
    "treatment_A": np.random.randn(50, 8) + 0.3,
    "treatment_B": np.random.randn(50, 8) + 0.7,
    "treatment_C": np.random.randn(50, 8) + 1.0,
}

cache_all_pairs = base / "all_pairs_cache.h5"

print("\n" + "="*60)
print("ALL-PAIRS CACHING")
print("="*60)

# First call: Compute all pairs
print("\n1. Computing all-pairs (6 comparisons)...")
start = time.time()
all_pairs_results = compare_datasets(
    datasets,
    mode="all-pairs",
    metric="wasserstein",
    save_path=cache_all_pairs,
    show_progress=False,
)
elapsed_compute = time.time() - start

print(f"   Time: {elapsed_compute:.4f}s")
print(f"   Comparisons: {sum(len(v) for v in all_pairs_results.values())} pairs")

# Display sample results
print("\n   Sample results:")
for i, (key_i, inner) in enumerate(all_pairs_results.items()):
    if i < 2:  # Show first 2 datasets
        for key_j, dist in inner.items():
            print(f"      {key_i} → {key_j}: {dist:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
Failed to save result: cannot access local variable 'save_val_all_pairs' where it is not associated with a value



ALL-PAIRS CACHING

1. Computing all-pairs (6 comparisons)...
   Time: 0.0497s
   Comparisons: 16 pairs

   Sample results:
      control → control: 0.0000
      control → treatment_A: 3.4233
      control → treatment_B: 5.3701
      control → treatment_C: 8.8483
      treatment_A → control: 3.4233
      treatment_A → treatment_A: 0.0000
      treatment_A → treatment_B: 2.6983
      treatment_A → treatment_C: 5.7887


In [15]:
# Second call: Load from cache
print("\n2. Loading from cache...")
start = time.time()
loaded_results = compare_datasets(
    datasets,
    mode="all-pairs",
    metric="wasserstein",
    save_path=cache_all_pairs,
    regenerate=False,
)
elapsed_load = time.time() - start

print(f"   Time: {elapsed_load:.4f}s")
print(f"   Speedup: {elapsed_compute/elapsed_load:.1f}x faster!")

# Verify results match
matches = all(
    loaded_results[k1][k2] == all_pairs_results[k1][k2]
    for k1 in all_pairs_results
    for k2 in all_pairs_results[k1]
)
print(f"   Results match: {matches}")


2. Loading from cache...


Computing wasserstein distances: 100%|██████████| 16/16 [00:00<00:00, 7110.50it/s]
Failed to save result: cannot access local variable 'save_val_all_pairs' where it is not associated with a value


   Time: 0.0046s
   Speedup: 10.9x faster!
   Results match: True


### Inspection with comparison_store

Use the comparison_store API to inspect cached comparisons:

In [16]:
# Query cached comparisons
from neural_analysis.utils.comparison_store import query_comparisons, load_comparison

print("\n" + "="*60)
print("INSPECTING CACHED COMPARISONS")
print("="*60)

# Query all comparisons in the between-mode cache
df = query_comparisons(cache_file)
print("\nCached comparisons (between-mode):")
print(df[["metric", "mode", "dataset_i", "dataset_j"]])

# Load specific comparison directly
direct_load = load_comparison(
    cache_file,
    metric="wasserstein",
    dataset_i="control",
    dataset_j="treatment",
)
print(f"\nDirect load result: {direct_load:.6f}")
print(f"Matches cached: {direct_load == result2_value}")


INSPECTING CACHED COMPARISONS

Cached comparisons (between-mode):
        metric mode dataset_i dataset_j
0  wasserstein                         

Direct load result: 10.520632
Matches cached: False


### Best Practices

**When to use auto-save:**
- ✅ Expensive computations (high-D data, many samples)
- ✅ Repeated analyses with same data
- ✅ Batch processing pipelines
- ✅ Exploratory data analysis workflows

**When regenerate=True:**
- ✅ Data has been updated
- ✅ Metric parameters changed
- ✅ Force cache refresh
- ✅ Debugging/validation

**Caching tips:**
- Use descriptive dataset_names for easy identification
- Organize cache files by experiment/session
- Query comparisons to avoid redundant computation
- Clean up old cache files periodically

In [17]:
# Cleanup
tmpdir.cleanup()
print("✓ Cleaned up temporary directory")

✓ Cleaned up temporary directory
