From 94d849c37f81834bbf913c4899f0a2d07d44eb5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20M=C3=BCller?= Date: Thu, 16 May 2024 19:50:13 +0200 Subject: [PATCH] enh: introduce RTDCBase.features_local --- CHANGELOG | 4 + dclab/rtdc_dataset/core.py | 70 ++++++++++++----- dclab/rtdc_dataset/fmt_hierarchy/base.py | 4 + tests/test_rtdc_core_feat.py | 96 ++++++++++++++++++++++++ 4 files changed, 155 insertions(+), 19 deletions(-) create mode 100644 tests/test_rtdc_core_feat.py diff --git a/CHANGELOG b/CHANGELOG index 7ca5aacd..6e6574ce 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,8 @@ 0.58.7 + - enh: introduce `RTDCBase.features_local` for accessing features + that are exclusively local and not in remote locations + - enh: prevent `RTDCBase._get_ancillary_feature_data` from + unnecessarily accessing feature data for hashing - enh: introduce new feature "bg_off", a float-valued feature for event-wise background offset correction - enh: warn user about missing endpoint URLs in S3 format and raise an diff --git a/dclab/rtdc_dataset/core.py b/dclab/rtdc_dataset/core.py index be115393..a2bbb854 100644 --- a/dclab/rtdc_dataset/core.py +++ b/dclab/rtdc_dataset/core.py @@ -225,18 +225,24 @@ def _get_ancillary_feature_data(self, be found or was not computed. """ data = None + anhash = None # Try to find the feature in the ancillary features # (see feat_anc_core submodule for more information). # These features are cached in `self._ancillaries`. ancol = AncillaryFeature.available_features(self) if feat in ancol: - # The feature is available. - anhash = ancol[feat].hash(self) - if (feat in self._ancillaries and - self._ancillaries[feat][0] == anhash): - # Use cached value - data = self._ancillaries[feat][1] - elif not no_compute: + # The feature is generally available. + if feat in self._ancillaries: + # We have already computed the feature. Make sure that we + # have the updated one by checking the hash. 
+ anhash = ancol[feat].hash(self) + if self._ancillaries[feat][0] == anhash: + # Use cached value + data = self._ancillaries[feat][1] + # We either already have the ancillary feature or have to + # compute it. We only compute it if we are asked to. + if data is None and not no_compute: + anhash = anhash or ancol[feat].hash(self) # Compute new value data_dict = ancol[feat].compute(self) for okey in data_dict: @@ -462,18 +468,44 @@ def features_loaded(self): always included. They are defined in :const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`. """ - features_innate = self.features_innate - features_loaded = [] - for feat in self.features: - if (feat in features_innate - or feat in FEATURES_RAPID - or feat in self._usertemp - or feat in self._ancillaries): - # Note that there is no hash checking here for - # ancillary features. This might be interesting - # only in rare cases. - features_loaded.append(feat) - return features_loaded + features_loaded = self.features_local + self.features_innate + features_loaded += [f for f in self.features if f in FEATURES_RAPID] + return sorted(set(features_loaded)) + + @property + def features_local(self): + """All features that are, with certainty, really fast to access + + Local features is a slimmed down version of `features_loaded`. + Nothing needs to be computed, not even rapid features + (:const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`). + And features from remote sources that have not been downloaded + already are excluded. Ancillary and temporary features that are + available are included. + """ + features_local = [] + # Note that the hierarchy format just calls its hparent's + # `features_local`. + if hasattr(self._events, "_cached_events"): + features_local += list(self._events._cached_events.keys()) + + if self.format == "hdf5": + features_local += list(self._events.keys()) + + # Get into the basins. 
+ for bn in self.basins: + if (bn.basin_format == "hdf5" + and bn.basin_type == "file" + and bn.is_available()): + features_local += bn.ds.features_local + elif bn._ds is not None: + features_local += bn.ds.features_local + + # If they are here, then we use them: + features_local += list(self._ancillaries.keys()) + features_local += list(self._usertemp.keys()) + + return sorted(set(features_local)) @property def features_scalar(self): diff --git a/dclab/rtdc_dataset/fmt_hierarchy/base.py b/dclab/rtdc_dataset/fmt_hierarchy/base.py index d797e4e9..d6166b3f 100644 --- a/dclab/rtdc_dataset/fmt_hierarchy/base.py +++ b/dclab/rtdc_dataset/fmt_hierarchy/base.py @@ -178,6 +178,10 @@ def features_innate(self): def features_loaded(self): return self.hparent.features_loaded + @property + def features_local(self): + return self.hparent.features_local + @property def features_scalar(self): return self.hparent.features_scalar diff --git a/tests/test_rtdc_core_feat.py b/tests/test_rtdc_core_feat.py new file mode 100644 index 00000000..1761796a --- /dev/null +++ b/tests/test_rtdc_core_feat.py @@ -0,0 +1,96 @@ +import pytest + +import dclab +from dclab import RTDCWriter +from dclab.rtdc_dataset import rtdc_copy +from dclab.rtdc_dataset.fmt_http import RTDC_HTTP +import h5py + +from helper_methods import DCOR_AVAILABLE, retrieve_data + + +http_url = ("https://objectstore.hpccloud.mpcdf.mpg.de/" + "circle-5a7a053d-55fb-4f99-960c-f478d0bd418f/" + "resource/fb7/19f/b2-bd9f-817a-7d70-f4002af916f0") + + +def test_features_local_basic(): + h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip") + with dclab.new_dataset(h5path) as ds: + # access time and index so that they are in the local features + assert ds["index"][0] == 1 + assert len(ds["time"]) == 10 + assert ds.features_local == ds.features_loaded + + +def test_features_local_basin(): + h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip") + h5path_small = h5path.with_name("small.rtdc") + + with h5py.File(h5path, "a") as 
h5:
+        del h5["events/deform"]
+
+    with h5py.File(h5path) as src, RTDCWriter(h5path_small) as hw:
+        # first, copy all the scalar features to the new file
+        rtdc_copy(src_h5file=src,
+                  dst_h5file=hw.h5file,
+                  features=["area_um"])
+        hw.store_basin(basin_name="example basin",
+                       basin_type="file",
+                       basin_format="hdf5",
+                       basin_locs=[h5path],
+                       basin_descr="an example test basin",
+                       )
+
+    with dclab.new_dataset(h5path_small) as ds:
+        # check which features are available locally
+        assert "area_um" in ds.features_local
+        assert "area_msd" in ds.features_local
+        # not accessed, thus not locally available
+        assert "time" not in ds.features_local
+        assert "time" in ds.features_loaded
+        # deform was removed above, but can be computed (ancillary)
+        assert "deform" in ds.features_loaded
+        assert "deform" not in ds.features_local
+        assert "circ" in ds.features_local
+        assert ds["deform"] is not None  # just access the feature
+        assert "deform" in ds.features_local
+
+
+@pytest.mark.skipif(not DCOR_AVAILABLE, reason="DCOR not accessible")
+def test_features_local_remote():
+    """Open a remote dataset and see whether local features are empty"""
+    with RTDC_HTTP(http_url) as ds:
+        assert not ds.features_local
+        assert ds.features_loaded
+        assert ds["deform"] is not None  # access a feature
+        assert ds.features_local == ["deform"]
+
+
+@pytest.mark.skipif(not DCOR_AVAILABLE, reason="DCOR not accessible")
+def test_features_local_remote_basin(tmp_path):
+    tmp_path = tmp_path.resolve()
+    h5path = tmp_path / "test_basin_http.rtdc"
+
+    with h5py.File(h5path, "a") as dst, RTDC_HTTP(http_url) as src:
+        # Store non-existent basin information
+        with RTDCWriter(dst, mode="append") as hw:
+            meta = src.config.as_dict(pop_filtering=True)
+            hw.store_metadata(meta)
+            hw.store_basin(basin_name="example basin",
+                           basin_type="remote",
+                           basin_format="http",
+                           basin_locs=[http_url],
+                           basin_descr="an example http test basin",
+                           )
+
+    with dclab.new_dataset(h5path) as ds:
+        assert not ds.features_local
+        assert ds.features_loaded
+        assert ds["deform"] is not None  # access a feature
+        # The "circ" feature should not be downloaded for accessing the
+        # "deform" feature.
+        assert ds.features_local == ["deform"]
+        assert "deform" not in ds.features_innate
+        assert "deform" in ds.features_basin
+        assert ds.basins[0].ds.features_local == ["deform"]