# Convert an HDF5 compressed dataset to sparse matrices

This notebook presents different ways to convert a stack of images stored as an HDF5 dataset to a stack of sparse matrices.

Notebook license: [CC-0](https://creativecommons.org/public-domain/cc0/)

In [1]:
# Request four worker threads from OpenMP and Blosc. The variables are exported
# here, before the compression libraries are imported in the next cell.
# NOTE(review): the Blosc2 info printed further down reports nthreads=16, so
# confirm that these variables actually reach python-blosc2.
import os

os.environ.update({"OMP_NUM_THREADS": "4", "BLOSC_NTHREADS": "4"})

In [2]:
import b2h5py
import blosc2
import bslz4_to_sparse
import h5py
from hdf5plugin import Bitshuffle, Blosc2
import numpy as np

## Prepare the datasets

Download data (credits ID11@ESRF) and create datasets with different compressions.

In [3]:
# Fetch the source image stack once: the download is skipped when the file is
# already present in the working directory. `!wget` shells out, so the `wget`
# executable must be available on the machine running the notebook.
if not os.path.exists("sparse_image_stack.h5"):
    !wget http://www.silx.org/pub/leaps-innov/sparse_image_stack.h5

In [4]:
# Build the benchmark file only when it is missing (delete the file to force a
# rebuild). It stores the same first 10 frames under five compression settings.
if not os.path.exists("sparse_images_shuffle_dc_test.h5"):
    with h5py.File("sparse_image_stack.h5", "r") as h5f:
        data = h5f["entry_0000/measurement/data"][:10]

    # One HDF5 chunk per frame
    chunk_shape = (1,) + data.shape[1:]

    # Blosc2 variants keyed by dataset name:
    # (codec name given to the hdf5plugin filter, python-blosc2 compression parameters)
    blosc2_variants = {
        "blosc2_blosclz": (
            "blosclz",
            dict(clevel=9, filters=[blosc2.Filter.SHUFFLE], codec=blosc2.Codec.BLOSCLZ),
        ),
        "blosc2_lz4": (
            "lz4",
            dict(clevel=9, filters=[blosc2.Filter.SHUFFLE], codec=blosc2.Codec.LZ4),
        ),
        "blosc2_zstd": (
            "zstd",
            dict(clevel=8, filters=[blosc2.Filter.SHUFFLE], codec=blosc2.Codec.ZSTD),
        ),
    }

    # Blosc2 partitioning inside each HDF5 chunk: a frame is cut in half along
    # its rows and in quarters along its columns, each part being split in
    # 256 x 128 blocks. Derived from the data instead of a hard-coded frame size.
    chunks = (1, data.shape[1] // 2, data.shape[2] // 4)
    blocks = (1, 256, 128)

    with h5py.File("sparse_images_shuffle_dc_test.h5", "w") as h5f:
        # Bitshuffle datasets: compression is done by the HDF5 filter on write
        h5f.create_dataset(
            "bslz4",
            data=data,
            chunks=chunk_shape,
            compression=Bitshuffle(),
        )
        h5f.create_dataset(
            "bszstd",
            data=data,
            chunks=chunk_shape,
            compression=Bitshuffle(cname="zstd"),
        )

        # Blosc2 datasets are created empty: their chunks are compressed with
        # python-blosc2 below and written as-is.
        blosc2_datasets = {
            name: h5f.create_dataset(
                name,
                shape=data.shape,
                dtype=data.dtype,
                chunks=chunk_shape,
                compression=Blosc2(cname=cname, filters=Blosc2.SHUFFLE),
            )
            for name, (cname, _) in blosc2_variants.items()
        }

        for i in range(data.shape[0]):
            frame = data[i:i + 1, ...]
            for name, (_, cparams) in blosc2_variants.items():
                # Transform the numpy array to a blosc2 array. This is where compression happens.
                b2_frame = blosc2.asarray(frame, chunks=chunks, blocks=blocks, cparams=cparams)
                # Store the serialized Blosc2 frame directly as the HDF5 chunk of frame i
                blosc2_datasets[name].id.write_direct_chunk((i, 0, 0), b2_frame.schunk.to_cframe())

    del data

## Benchmark reading data

Compare different ways to read data from an HDF5 compressed dataset.

The test is done on the first frame of a 3D dataset

In [5]:
# Re-open the benchmark file read-only and grab one handle per compression
# variant. The file stays open until it is explicitly closed.
h5f = h5py.File("sparse_images_shuffle_dc_test.h5", mode="r")
(
    bslz4_dataset,
    bszstd_dataset,
    blosc2_blosclz_dataset,
    blosc2_lz4_dataset,
    blosc2_zstd_dataset,
) = (
    h5f[dataset_name]
    for dataset_name in ("bslz4", "bszstd", "blosc2_blosclz", "blosc2_lz4", "blosc2_zstd")
)

In [6]:
# Inspect how a chunk of the Blosc2+Zstd dataset is stored: open it in place,
# at its byte offset inside the HDF5 file, and print the Blosc2 metadata.
hdf5_path = blosc2_zstd_dataset.file.filename
h5_chunk_info = blosc2_zstd_dataset.id.get_chunk_info(0)
b2_schunk = blosc2.schunk.open(hdf5_path, mode='r', offset=h5_chunk_info.byte_offset)
print(b2_schunk.info)

type    : NDArray
shape   : (1, 2162, 2068)
chunks  : (1, 1081, 517)
blocks  : (1, 256, 128)
dtype   : uint32
cratio  : 1071.51
cparams : {'blocksize': 131072,
 'clevel': 8,
 'codec': <Codec.ZSTD: 5>,
 'codec_meta': 0,
 'filters': [<Filter.SHUFFLE: 1>,
             <Filter.NOFILTER: 0>,
             <Filter.NOFILTER: 0>,
             <Filter.NOFILTER: 0>,
             <Filter.NOFILTER: 0>,
             <Filter.NOFILTER: 0>],
 'filters_meta': [0, 0, 0, 0, 0, 0],
 'nthreads': 16,
 'splitmode': <SplitMode.ALWAYS_SPLIT: 1>,
 'typesize': 4,
 'use_dict': 0}
dparams : {'nthreads': 16}



Compressed size of first frame

In [7]:
def _first_chunk_nbytes(dataset):
    """Return the number of raw bytes stored for the chunk at offset (0, 0, 0)."""
    return len(dataset.id.read_direct_chunk((0, 0, 0))[1])


bslz4_nbytes = _first_chunk_nbytes(bslz4_dataset)
bszstd_nbytes = _first_chunk_nbytes(bszstd_dataset)
blosc2_blosclz_nbytes = _first_chunk_nbytes(blosc2_blosclz_dataset)
blosc2_lz4_nbytes = _first_chunk_nbytes(blosc2_lz4_dataset)
blosc2_zstd_nbytes = _first_chunk_nbytes(blosc2_zstd_dataset)
print(f"{bslz4_nbytes=}\n{bszstd_nbytes=}\n{blosc2_blosclz_nbytes=}\n{blosc2_lz4_nbytes=}\n{blosc2_zstd_nbytes=}")

bslz4_nbytes=183882
bszstd_nbytes=111675
blosc2_blosclz_nbytes=66378
blosc2_lz4_nbytes=70064
blosc2_zstd_nbytes=24780


* **With hdf5plugin**: Decompression is performed by the HDF5 filters

In [8]:
# Dense read of frame 0 through h5py for the Blosc2 dataset using the BloscLZ codec
%timeit blosc2_blosclz_dataset[0]

3.05 ms ± 509 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
# Dense read of frame 0 through h5py for the Blosc2 dataset using the LZ4 codec
%timeit blosc2_lz4_dataset[0]

2.78 ms ± 611 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [10]:
# Dense read of frame 0 through h5py for the Blosc2 dataset using the Zstd codec
%timeit blosc2_zstd_dataset[0]

2.43 ms ± 4.01 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
def array_to_sparse(array: np.ndarray) -> tuple[np.ndarray, np.ndarray]:
    """Extract the non-zero entries of an array.

    :param array: Input array of any dimensionality; it is flattened first.
    :returns: ``(values, indices)`` where ``indices`` are the positions of the
        non-zero entries in the flattened array and ``values`` are the entries
        found at those positions.
    """
    flat = np.ravel(array)
    nonzero_positions = np.flatnonzero(flat)
    return flat[nonzero_positions], nonzero_positions

In [12]:
def read_blosc2_h5_chunk(dataset: h5py.Dataset, chunk_index: int):
    """Read one chunk of a HDF5 dataset with python-blosc2 instead of h5py.

    The chunk is located through its byte offset in the HDF5 file, opened in
    place as a Blosc2 container and fully decompressed.

    :param dataset: Dataset whose chunks are stored as Blosc2 frames.
    :param chunk_index: Index of the chunk, as expected by ``get_chunk_info``.
    :returns: The decompressed chunk, viewed with the dtype of the dataset.
    """
    byte_offset = dataset.id.get_chunk_info(chunk_index).byte_offset
    b2_container = blosc2.schunk.open(dataset.file.filename, mode='r', offset=byte_offset)
    decompressed = b2_container[:]
    return decompressed.view(dtype=dataset.dtype)

In [13]:
# Read chunk 0 directly with python-blosc2 (BloscLZ codec), then convert it to (values, indices)
%timeit array_to_sparse(read_blosc2_h5_chunk(blosc2_blosclz_dataset, 0))

5.94 ms ± 7.58 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
# Read chunk 0 directly with python-blosc2 (LZ4 codec), then convert it to (values, indices)
%timeit array_to_sparse(read_blosc2_h5_chunk(blosc2_lz4_dataset, 0))

6.01 ms ± 32.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
# Read chunk 0 directly with python-blosc2 (Zstd codec), then convert it to (values, indices)
%timeit array_to_sparse(read_blosc2_h5_chunk(blosc2_zstd_dataset, 0))

6.05 ms ± 8.47 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


* **With bslz4_to_sparse** for bitshuffle+LZ4 compressed dataset

In [16]:
# Sparse conversion of the Bitshuffle+LZ4 dataset by the bslz4_to_sparse package.
# NOTE(review): the two trailing 0 arguments presumably select frame 0 and a zero
# threshold; confirm against the bslz4_to_sparse documentation.
%timeit bslz4_to_sparse.bslz4_to_sparse(bslz4_dataset, 0, 0)

4.16 ms ± 3.01 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [17]:
# Release the HDF5 file handle that was opened for the benchmarks
h5f.close()