# HDF5 I/O with h5io

This notebook demonstrates how to save and load arrays and DataFrames using `neural_analysis.utils.h5io` and how to filter pairs when loading via `load_hdf5`.

In [7]:
# Imports and helpers
from pathlib import Path
import tempfile
import numpy as np
import pandas as pd
from neural_analysis.utils import h5io
from neural_analysis.utils.io import load_hdf5

tmpdir = tempfile.TemporaryDirectory()
base = Path(tmpdir.name)
print("Using temp dir:", base)

Using temp dir: /tmp/tmpm_hiu835


In [8]:
# Example 1: Array roundtrip with labels and attrs
path = base / 'array_demo.h5'
data = np.random.randn(100, 10).astype(np.float32)
labels = np.array([f'sample_{i}' for i in range(data.shape[0])])
attrs = {
    'description': 'random normal features',
    'version': 1,
    'metadata': {'source': 'synthetic', 'dims': list(data.shape)}
}

# Save
h5io(path, task='save', data=data, labels=labels, attrs=attrs)
print('Saved to', path)

# Load
loaded_data, loaded_labels = h5io(path, task='load')

# Validate roundtrip
assert isinstance(loaded_data, np.ndarray)
np.testing.assert_allclose(loaded_data, data)
assert list(loaded_labels) == list(labels)
print('Array roundtrip OK:', loaded_data.shape)

Saved to /tmp/tmpm_hiu835/array_demo.h5
Array roundtrip OK: (100, 10)


In [9]:
# Example 2: DataFrame roundtrip
path_df = base / 'df_demo.h5'
df = pd.DataFrame({
    'neuron_id': [f'n{i}' for i in range(5)],
    'firing_rate': np.random.rand(5),
    'condition': ['A', 'B', 'A', 'B', 'A'],
})
labels_df = ['trial_1', 'trial_2', 'trial_3', 'trial_4', 'trial_5']

h5io(path_df, task='save', data=df, labels=labels_df)
loaded_df, loaded_labels_df = h5io(path_df, task='load')

# Validate
assert isinstance(loaded_df, pd.DataFrame)
pd.testing.assert_frame_equal(loaded_df.reset_index(drop=True), df.reset_index(drop=True))
assert list(loaded_labels_df) == labels_df
print('DataFrame roundtrip OK:', loaded_df.shape)

DataFrame roundtrip OK: (5, 3)


In [10]:
# Example 3: Filtering pairs on load (via load_hdf5)
path_pairs = base / 'pairs_demo.h5'
pairs_df = pd.DataFrame({
    'item_i': ['A', 'A', 'B', 'C'],
    'item_j': ['B', 'C', 'C', 'D'],
    'score': [0.1, 0.8, 0.5, 0.9],
})
h5io(path_pairs, task='save', data=pairs_df, labels=None)

wanted = [('A','C'), ('B','C')]
(loaded_filtered, _), attrs = load_hdf5(path_pairs, filter_pairs=wanted, return_attrs=True)
print('Filtered rows:')
display(loaded_filtered)

# Validate only desired pairs
assert set(zip(loaded_filtered['item_i'], loaded_filtered['item_j'])) == set(wanted)
print('Filter pairs OK')

Filtered rows:


Unnamed: 0,item_i,item_j,score
0,A,C,0.8
1,B,C,0.5


Filter pairs OK


## Advanced HDF5 Operations

The following examples demonstrate the hierarchical HDF5 structure and advanced functions for saving/loading result datasets with mixed scalar and array data.

In [13]:
# Example 4: save_result_to_hdf5_dataset with mixed data types
from neural_analysis.utils.io import (
    save_result_to_hdf5_dataset,
    load_results_from_hdf5_dataset,
    get_hdf5_dataset_names,
    get_hdf5_result_summary
)

# Create a hierarchical HDF5 file with multiple result datasets
hdf5_path = base / 'hierarchical_results.h5'

# Save multiple results under different comparison groups
for session in ['session_001', 'session_002']:
    for condition_pair in [('condA', 'condB'), ('condA', 'condC'), ('condB', 'condC')]:
        # Generate sample results
        result_key = f"{condition_pair[0]}_vs_{condition_pair[1]}_wasserstein"
        value = np.random.rand() * 100  # Scalar metric
        
        # Generate pairwise comparison data (could be large arrays)
        n_pairs = 50
        pair_indices = [(i, j) for i in range(10) for j in range(10) if i < j][:n_pairs]
        pair_values = np.random.rand(n_pairs)
        
        # Create structured array that HDF5 can handle
        # Store as separate arrays: indices and values
        pairs_indices = np.array(pair_indices, dtype=np.int32)
        pairs_vals = np.array(pair_values, dtype=np.float64)
        
        # Save with scalar attributes and array datasets
        save_result_to_hdf5_dataset(
            save_path=hdf5_path,
            dataset_name=session,
            result_key=result_key,
            scalar_data={
                'dataset_i': condition_pair[0],
                'dataset_j': condition_pair[1],
                'metric': 'wasserstein',
                'value': float(value),
                'n_samples_i': 100,
                'n_samples_j': 100,
                'timestamp': '2024-01-15'
            },
            array_data={
                'pairs_indices': pairs_indices,
                'pairs_values': pairs_vals
            }
        )

print(f"✓ Saved hierarchical results to {hdf5_path.name}")
print(f"  Structure: dataset_name / result_key / {{scalar_data, array_data}}")

✓ Saved hierarchical results to hierarchical_results.h5
  Structure: dataset_name / result_key / {scalar_data, array_data}


### Viewing Dataset Names

Use `get_hdf5_dataset_names()` to list all comparison groups and result keys in the HDF5 file.

In [15]:
# List all datasets in the hierarchical HDF5 file
dataset_names = get_hdf5_dataset_names(hdf5_path)

print("Dataset names in HDF5 file:")
for name in dataset_names:
    print(f"  {name}")

Dataset names in HDF5 file:
  session_001
  session_002


### Loading with Filtering

Use `load_results_from_hdf5_dataset()` to load results with optional filtering by comparison name, dataset names, or metric.

In [17]:
# Example 6a: Load all results from session_001
results_session1 = load_results_from_hdf5_dataset(
    save_path=hdf5_path,
    dataset_name='session_001'
)

print(f"Loaded {len(results_session1['session_001'])} results from session_001")
print("\nSample result keys:")
for key in list(results_session1['session_001'].keys())[:3]:
    print(f"  {key}")

Loaded 3 results from session_001

Sample result keys:
  condA_vs_condB_wasserstein
  condA_vs_condC_wasserstein
  condB_vs_condC_wasserstein


### Summary DataFrame

Use `get_hdf5_result_summary()` to generate a pandas DataFrame with all results and their metadata for easy analysis.

In [19]:
# Example 7: Generate summary DataFrame
summary_df = get_hdf5_result_summary(
    save_path=hdf5_path,
    dataset_name='session_001'
)

print("Summary DataFrame:")
print(summary_df[['dataset_i', 'dataset_j', 'metric', 'value', 'n_samples_i', 'n_samples_j']].head(10))

Summary DataFrame:
  dataset_i dataset_j       metric      value  n_samples_i  n_samples_j
0     condA     condB  wasserstein  62.012778          100          100
1     condA     condC  wasserstein  46.645856          100          100
2     condB     condC  wasserstein  24.088261          100          100


### Accessing Array Data

Results can contain both scalar attributes and large array datasets (like pairwise comparison matrices).

In [22]:
# Example 6b: Access specific result details
result_key = list(results_session1['session_001'].keys())[0]
result = results_session1['session_001'][result_key]

print(f"Result: {result_key}")
print(f"  Attributes: {result['attributes']}")
print(f"  Arrays available: {list(result['arrays'].keys())}")

# Access the pairs arrays (now split into indices and values)
pairs_indices = result['arrays']['pairs_indices']
pairs_values = result['arrays']['pairs_values']
print(f"  Pairs shape: {pairs_indices.shape}, values: {pairs_values.shape}")

Result: condA_vs_condB_wasserstein
  Attributes: {'dataset_i': 'condA', 'dataset_j': 'condB', 'metric': 'wasserstein', 'n_samples_i': np.int64(100), 'n_samples_j': np.int64(100), 'timestamp': '2024-01-15', 'value': np.float64(62.01277798611857)}
  Arrays available: ['pairs_indices', 'pairs_values']
  Pairs shape: (45, 2), values: (50,)


In [21]:
# Cleanup
tmpdir.cleanup()
print("✓ Cleaned up temporary directory")

✓ Cleaned up temporary directory
