In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
import numpy as np
import h5py

# Save chunked HDF5 data file

In [None]:
# Open the original responses file read-only and bind its "responses" dataset.
# The file is not closed here, so the dataset handle remains usable in the
# cells that follow.
h5py_file = h5py.File(
    "/home/ssd2tb/dturcu/electric_fish_processed_data/data-2024_06_13-characterization_dataset-test_chunking/responses.hdf5",
    mode="r",
)["responses"]
# Show the dataset's shape and element type.
print(h5py_file.shape, h5py_file.dtype)

In [None]:
## Create an empty, manually chunked copy of the dataset.
# Each chunk covers int(sqrt(n_rows)) rows along the first axis and the full
# extent of every remaining axis.
with h5py.File("hdf5-chunking/responses-sqrt_chunk.hdf5", mode="w") as sqrt_file:
    full_shape = h5py_file.shape
    rows_per_chunk = int(np.sqrt(full_shape[0]))
    sqrt_file.create_dataset(
        "responses",
        shape=full_shape,
        dtype=h5py_file.dtype,
        chunks=(rows_per_chunk, *full_shape[1:]),
    )

In [None]:
# Fill the sqrt-chunked file from the original dataset in fixed-size row
# batches, printing a running batch counter as progress.
write_size = 100_000
with h5py.File("hdf5-chunking/responses-sqrt_chunk.hdf5", mode="r+") as sqrt_file:
    target = sqrt_file["responses"]
    for batch_idx, start in enumerate(range(0, h5py_file.shape[0], write_size)):
        stop = start + write_size
        print(batch_idx, end=", ")
        target[start:stop] = h5py_file[start:stop]
        # Start a new progress line after every 50th batch.
        if (batch_idx + 1) % 50 == 0:
            print()

# Test accuracy of chunked HDF5 data file with respect to original file

In [3]:
# Open the original file and both chunked copies read-only.
# NOTE(review): no visible cell writes "hdf5-chunking/responses.hdf5" --
# confirm where that file is produced.
h5py_file = h5py.File(
    "/home/ssd2tb/dturcu/electric_fish_processed_data/data-2024_06_13-characterization_dataset-test_chunking/responses.hdf5",
    mode="r",
)["responses"]
h5py_file_chunked = h5py.File(
    "hdf5-chunking/responses.hdf5",
    mode="r",
)["responses"]
h5py_file_chunked_sqrt = h5py.File(
    "hdf5-chunking/responses-sqrt_chunk.hdf5",
    mode="r",
)["responses"]

# Draw 10 distinct random row indices and sort them: h5py index lists must be
# given in increasing order.
ids = np.sort(np.random.permutation(h5py_file_chunked.shape[0])[:10])

# Read the sampled rows from the original once, then compare element-wise
# against each chunked copy. The cell displays (ids, match_1, match_2).
reference_rows = h5py_file[ids]
(
    ids,
    (reference_rows == h5py_file_chunked[ids]).all(),
    (reference_rows == h5py_file_chunked_sqrt[ids]).all(),
)

(array([ 4341987,  4423492, 15065574, 15535978, 19984428, 33439972,
        33969291, 35776807, 42689181, 44635674]),
 True,
 True)

# Test loading speed of chunked HDF5 data file with respect to original file

In [4]:
# Benchmark random single-row reads from the original file and both chunked
# copies. The same shuffled row ids are used for every dataset so the three
# timings are comparable.
ids = np.random.permutation(h5py_file.shape[0])[:27000]
# ids = np.sort(ids)  # uncomment to benchmark reads in increasing row order

# (label, dataset) pairs, timed in this order.
# NOTE(review): the datasets are timed back to back, so any caching of earlier
# reads could favour later entries -- confirm if the ordering matters.
read_benchmarks = [
    ("non-chunked", h5py_file),
    ("man chunked", h5py_file_chunked),
    ("sqt chunked", h5py_file_chunked_sqrt),
]

elapsed_by_label = []
for label, dataset in read_benchmarks:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock and can jump if the system clock is
    # adjusted during the run.
    start = time.perf_counter()
    for row_id in ids:
        _ = dataset[row_id]
    elapsed_by_label.append((label, time.perf_counter() - start))

# Report after all measurements so printing never falls inside a timed region.
for label, elapsed in elapsed_by_label:
    print(f"Time for {label}: {elapsed:.2f} s")

Time for non-chunked: 7.07 s
Time for man chunked: 7.70 s
Time for sqt chunked: 3.71 s


# Miscellaneous tests

E.g., manual one-row-per-chunk chunking vs. h5py's automatic chunking.

In [None]:
# Disabled alternative: open a "responses" dataset from a different data directory.
# h5py_file = h5py.File(f"/home/ssd2tb/dturcu/electric_fish_processed_data/data-2024_06_13-test_chunking/responses.hdf5",'r')["responses"]

# Open the "test-chunking" dataset of the chunked responses file read-only.
h5py_file_chunked_manual = h5py.File(
    "/home/ssd2tb/dturcu/electric_fish_processed_data/data-2024_06_13-characterization_dataset-test_chunking/responses-chunked.hdf5",
    mode="r",
)["test-chunking"]

# Disabled alternative: the copy of the same dataset under hdf5-chunking/.
# h5py_file_chunked_manual_main = h5py.File(f"hdf5-chunking/responses-chunked.hdf5",'r')["test-chunking"]


In [None]:
### Manual chunking (disabled): one row per chunk.
# with h5py.File(f"hdf5-chunking/responses-chunked.hdf5", 'w') as f:
#     f.create_dataset("test-chunking", shape=h5py_file.shape, dtype=h5py_file.dtype, chunks=tuple([1]+list(h5py_file.shape[1:])))

### Automatic chunking: chunks=True lets h5py choose the chunk shape.
# Creates an empty dataset with the same shape and dtype as the original.
with h5py.File("hdf5-chunking/responses-chunked-auto.hdf5", mode="w") as auto_file:
    auto_file.create_dataset(
        "test-chunking-auto",
        shape=h5py_file.shape,
        dtype=h5py_file.dtype,
        chunks=True,
    )

In [None]:
# Copy the original dataset into the auto-chunked file in batches of
# `write_size` rows, echoing the batch number as progress.
write_size = 100_000
with h5py.File("hdf5-chunking/responses-chunked-auto.hdf5", mode="r+") as auto_file:
    destination = auto_file["test-chunking-auto"]
    total_rows = h5py_file.shape[0]
    for batch_number, first_row in enumerate(range(0, total_rows, write_size)):
        rows = slice(first_row, first_row + write_size)
        print(batch_number, end=", ")
        destination[rows] = h5py_file[rows]
        # Wrap the progress output after every 50th batch.
        if (batch_number + 1) % 50 == 0:
            print()

In [None]:
# Read-only handles to the two chunked copies stored under hdf5-chunking/.
h5py_file_chunked_manual_main = h5py.File(
    "hdf5-chunking/responses-chunked.hdf5",
    mode="r",
)["test-chunking"]
h5py_file_chunked_auto = h5py.File(
    "hdf5-chunking/responses-chunked-auto.hdf5",
    mode="r",
)["test-chunking-auto"]

In [None]:
# Spot-check 3 random rows (sorted into increasing order) of the manually
# chunked copy against the original dataset.
ids = np.sort(np.random.permutation(h5py_file_chunked_manual.shape[0])[:3])
manual_matches = (h5py_file[ids] == h5py_file_chunked_manual[ids]).all()
# Disabled: the same comparison against the auto-chunked copy.
# auto_matches = (h5py_file[ids] == h5py_file_chunked_auto[ids]).all()

# Displayed result: the sampled ids and whether every element matched.
(ids, manual_matches)

In [None]:
# Benchmark random single-row reads from the two manually chunked copies and
# the auto-chunked copy, using the same shuffled row ids for each.
ids = np.random.permutation(h5py_file_chunked_manual.shape[0])[:5]
# ids = np.sort(ids)  # uncomment to benchmark reads in increasing row order

# (label, dataset) pairs, timed in this order. The first entry is the manually
# chunked copy under hdf5-chunking/; it was previously reported as
# "non-chunked", which did not describe the dataset being measured.
chunking_benchmarks = [
    ("man chunked (main)", h5py_file_chunked_manual_main),
    ("man chunked", h5py_file_chunked_manual),
    ("aut chunked", h5py_file_chunked_auto),
]

timings = []
for label, dataset in chunking_benchmarks:
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock and can jump if the system clock is
    # adjusted during the run.
    start = time.perf_counter()
    for row_id in ids:
        _ = dataset[row_id]
    timings.append((label, time.perf_counter() - start))

# Report after all measurements so printing never falls inside a timed region.
for label, elapsed in timings:
    print(f"Time for {label}: {elapsed:.2f} s")