# HDF5 I/O with h5io

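This notebook demonstrates how to save and load arrays and DataFrames using `neural_analysis.utils.h5io`, how to filter pairs when loading via `load_hdf5`, and how to save, list, load, and summarize hierarchical result datasets with the helpers in `neural_analysis.utils.io`.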

In [None]:
# Imports and helpers
from pathlib import Path
import tempfile
import numpy as np
import pandas as pd
from neural_analysis.utils import h5io
from neural_analysis.utils.io import load_hdf5

# Every example file in this notebook is written beneath one temporary
# directory. The TemporaryDirectory handle is kept in `tmpdir` (not just its
# path) so the final cell can call tmpdir.cleanup() explicitly.
tmpdir = tempfile.TemporaryDirectory()
base = Path(tmpdir.name)
print("Using temp dir:", base)

Using temp dir: /tmp/tmpl6wbuqn_


In [2]:
# Example 1: Array roundtrip with labels and attrs
path = base / 'array_demo.h5'

# A seeded generator yields the same array on every execution, so re-running
# this cell (or Restart & Run All) is reproducible.
rng = np.random.default_rng(0)
data = rng.standard_normal((100, 10), dtype=np.float32)

# One label per row of `data`.
labels = np.array([f'sample_{i}' for i in range(data.shape[0])])

# Attributes mix scalars with a nested dict to exercise non-trivial metadata.
attrs = {
    'description': 'random normal features',
    'version': 1,
    'metadata': {'source': 'synthetic', 'dims': list(data.shape)}
}

# Save
h5io(path, task='save', data=data, labels=labels, attrs=attrs)
print('Saved to', path)

# Load: with task='load' the result is unpacked as (data, labels).
loaded_data, loaded_labels = h5io(path, task='load')

# Validate roundtrip
assert isinstance(loaded_data, np.ndarray)
np.testing.assert_allclose(loaded_data, data)
assert list(loaded_labels) == list(labels)
print('Array roundtrip OK:', loaded_data.shape)

Saved to /tmp/tmpl6wbuqn_/array_demo.h5
Array roundtrip OK: (100, 10)


In [3]:
# Example 2: DataFrame roundtrip
path_df = base / 'df_demo.h5'

# A seeded generator keeps the firing rates identical across runs, so the
# cell is reproducible under Restart & Run All.
rng = np.random.default_rng(0)
df = pd.DataFrame({
    'neuron_id': [f'n{i}' for i in range(5)],
    'firing_rate': rng.random(5),
    'condition': ['A', 'B', 'A', 'B', 'A'],
})
# One label per DataFrame row.
labels_df = ['trial_1', 'trial_2', 'trial_3', 'trial_4', 'trial_5']

h5io(path_df, task='save', data=df, labels=labels_df)
loaded_df, loaded_labels_df = h5io(path_df, task='load')

# Validate: indices are reset on both sides so only the contents are compared.
assert isinstance(loaded_df, pd.DataFrame)
pd.testing.assert_frame_equal(loaded_df.reset_index(drop=True), df.reset_index(drop=True))
assert list(loaded_labels_df) == labels_df
print('DataFrame roundtrip OK:', loaded_df.shape)

DataFrame roundtrip OK: (5, 3)


In [4]:
# Example 3: Filtering pairs on load (via load_hdf5)
path_pairs = base / 'pairs_demo.h5'
pairs_df = pd.DataFrame({
    'item_i': ['A', 'A', 'B', 'C'],
    'item_j': ['B', 'C', 'C', 'D'],
    'score': [0.1, 0.8, 0.5, 0.9],
})
h5io(path_pairs, task='save', data=pairs_df, labels=None)

wanted = [('A','C'), ('B','C')]
# With return_attrs=True the result unpacks as ((data, labels), attrs). The
# attrs get their own name so the `attrs` dict built in Example 1 is not
# overwritten by an unrelated object.
(loaded_filtered, _), loaded_attrs = load_hdf5(path_pairs, filter_pairs=wanted, return_attrs=True)
print('Filtered rows:')
display(loaded_filtered)

# Validate only desired pairs. Comparing sets alone would hide duplicated
# rows, so the row count is checked as well.
loaded_pairs = list(zip(loaded_filtered['item_i'], loaded_filtered['item_j']))
assert set(loaded_pairs) == set(wanted)
assert len(loaded_pairs) == len(wanted)
print('Filter pairs OK')

Filtered rows:


Unnamed: 0,item_i,item_j,score
0,A,C,0.8
1,B,C,0.5


Filter pairs OK


## Advanced HDF5 Operations

The following examples demonstrate the hierarchical HDF5 structure and advanced functions for saving/loading result datasets with mixed scalar and array data.

In [None]:
# Example 4: save_result_to_hdf5_dataset with mixed data types
from neural_analysis.utils.io import (
    save_result_to_hdf5_dataset,
    load_results_from_hdf5_dataset,
    get_hdf5_dataset_names,
    get_hdf5_result_summary
)

# Create a hierarchical HDF5 file with multiple result datasets
hdf5_path = base / 'hierarchical_results.h5'

# Seeded generator so the synthetic metrics are identical on every run.
rng = np.random.default_rng(0)

# All unordered index pairs (i < j) among n_items items. The list does not
# depend on the session or condition, so it is built once, outside the loops.
# n_pairs is taken from the list rather than hard-coded, so the number of
# values drawn always matches the number of pairs (zip() would silently drop
# any surplus values).
n_items = 10
pair_indices = [(i, j) for i in range(n_items) for j in range(i + 1, n_items)]
n_pairs = len(pair_indices)

condition_pairs = [('condA', 'condB'), ('condA', 'condC'), ('condB', 'condC')]

# Save multiple results under different comparison groups
for session in ['session_001', 'session_002']:
    for dataset_i, dataset_j in condition_pairs:
        # Generate sample results
        result_key = f"{dataset_i}_vs_{dataset_j}_wasserstein"
        value = rng.random() * 100  # Scalar metric

        # Fresh pairwise values for every result (could be large arrays)
        pair_values = rng.random(n_pairs)
        pairs_dict = dict(zip(pair_indices, pair_values))

        # Save with scalar attributes and array datasets
        save_result_to_hdf5_dataset(
            save_path=hdf5_path,
            comparison_name=session,
            result_key=result_key,
            attrs={
                'dataset_i': dataset_i,
                'dataset_j': dataset_j,
                'metric': 'wasserstein',
                'value': value,
                'n_samples_i': 100,
                'n_samples_j': 100,
                'timestamp': '2024-01-15'
            },
            arrays={'pairs': pairs_dict}
        )

print(f"✓ Saved hierarchical results to {hdf5_path.name}")
print("  Structure: comparison_name / result_key / {attributes, arrays}")

### Viewing Dataset Names

Use `get_hdf5_dataset_names()` to list all comparison groups and result keys in the HDF5 file.

In [None]:
# Example 5: List all datasets in the HDF5 file
dataset_names = get_hdf5_dataset_names(hdf5_path)

# Render the mapping as a small tree: one heading per comparison group,
# followed by one indented line per result key stored under it.
print("Comparison groups and result keys:")
for group in dataset_names:
    print(f"\n{group}/")
    for entry in dataset_names[group]:
        print(f"  ├── {entry}")

### Loading with Filtering

Use `load_results_from_hdf5_dataset()` to load results with optional filtering by comparison name, dataset names, or metric.

In [None]:
# Example 6a: Load all results from session_001
results_session1 = load_results_from_hdf5_dataset(
    save_path=hdf5_path,
    comparison_name='session_001',
)
# The loaded structure is keyed by comparison name, then by result key.
session1_results = results_session1['session_001']

print(f"Loaded {len(session1_results)} results from session_001")
print("\nSample result keys:")
# Show at most the first three results and their scalar 'value' attribute.
for key in list(session1_results.keys())[:3]:
    result = session1_results[key]
    print(f"  {key}: value={result['attrs']['value']:.3f}")

# Example 6b: Load only wasserstein metrics across all sessions
results_wasserstein = load_results_from_hdf5_dataset(save_path=hdf5_path, metric='wasserstein')

print(f"\n✓ Loaded wasserstein metrics from {len(results_wasserstein)} sessions")

# Example 6c: Load results for specific dataset comparison
results_condA_vs_condB = load_results_from_hdf5_dataset(
    save_path=hdf5_path, dataset_i='condA', dataset_j='condB'
)

# Total number of matching results, summed over every comparison group.
n_condA_vs_condB = sum(map(len, results_condA_vs_condB.values()))
print(f"✓ Loaded {n_condA_vs_condB} condA vs condB comparisons")

### Summary DataFrame

Use `get_hdf5_result_summary()` to generate a pandas DataFrame with all results and their metadata for easy analysis.

In [None]:
# Example 7: Generate summary DataFrame
summary_df = get_hdf5_result_summary(
    save_path=hdf5_path,
    comparison_name='session_001'
)

# Columns to show: the attributes that were stored with every result.
summary_columns = ['dataset_i', 'dataset_j', 'metric', 'value', 'n_samples_i', 'n_samples_j']

print("Summary DataFrame:")
# Rich display (as in Example 3) renders the frame as a table instead of the
# plain-text repr that print() would produce.
display(summary_df[summary_columns].head(10))

# Can also load all sessions: omitting comparison_name applies no group filter.
summary_all = get_hdf5_result_summary(save_path=hdf5_path)
print(f"\n✓ Total {len(summary_all)} results across all sessions")
print(f"  Columns: {list(summary_all.columns)}")

### Accessing Array Data

Results can contain both scalar attributes and large array datasets (like pairwise comparison matrices).

In [None]:
# Example 8: Access array data from a specific result
# Take the first result stored under session_001 (loaded in Example 6a).
session1_results = results_session1['session_001']
result_key = list(session1_results.keys())[0]
result = session1_results[result_key]

# Each result exposes its scalar metadata under 'attrs' and its array
# datasets under 'arrays'.
print(f"Result: {result_key}")
print(f"  Attributes: {result['attrs']}")
print(f"  Arrays available: {list(result['arrays'].keys())}")

# Access the pairs array
pairs_array = result['arrays']['pairs']
print(f"\nPairs array shape: {pairs_array.shape}")
print("First 5 pairs:")
# Never index past the end when fewer than five pairs were stored.
n_shown = min(5, len(pairs_array))
for row in range(n_shown):
    # Each entry unpacks into an index pair and the value stored for it.
    indices, value = pairs_array[row]
    print(f"  ({indices[0]}, {indices[1]}): {value:.4f}")

print("\n✓ Successfully demonstrated hierarchical HDF5 with mixed data types")

In [None]:
# Cleanup
# Remove the temporary directory created in the first cell, together with
# every HDF5 file the examples wrote beneath `base`. Paths such as `hdf5_path`
# no longer exist afterwards, so re-running an earlier example requires
# re-running the first cell first.
tmpdir.cleanup()
print("✓ Cleaned up temporary directory")