Skip to content

Commit

Permalink
enh: introduce RTDCBase.features_local
Browse files Browse the repository at this point in the history
  • Loading branch information
paulmueller committed May 16, 2024
1 parent e970c8a commit 94d849c
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 19 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
0.58.7
- enh: introduce `RTDCBase.features_local` for accessing features
that are exclusively local and not in remote locations
- enh: prevent `RTDCBase._get_ancillary_feature_data` from
unnecessarily accessing feature data for hashing
- enh: introduce new feature "bg_off", a float-valued feature for
event-wise background offset correction
- enh: warn user about missing endpoint URLs in S3 format and raise an
Expand Down
70 changes: 51 additions & 19 deletions dclab/rtdc_dataset/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,18 +225,24 @@ def _get_ancillary_feature_data(self,
be found or was not computed.
"""
data = None
anhash = None
# Try to find the feature in the ancillary features
# (see feat_anc_core submodule for more information).
# These features are cached in `self._ancillaries`.
ancol = AncillaryFeature.available_features(self)
if feat in ancol:
# The feature is available.
anhash = ancol[feat].hash(self)
if (feat in self._ancillaries and
self._ancillaries[feat][0] == anhash):
# Use cached value
data = self._ancillaries[feat][1]
elif not no_compute:
# The feature is generally available.
if feat in self._ancillaries:
# We have already computed the feature. Make sure that we
# have the updated one by checking the hash.
anhash = ancol[feat].hash(self)
if self._ancillaries[feat][0] == anhash:
# Use cached value
data = self._ancillaries[feat][1]
# We either already have the ancillary feature or have to
# compute it. We only compute it if we are asked to.
if data is None and not no_compute:
anhash = anhash or ancol[feat].hash(self)
# Compute new value
data_dict = ancol[feat].compute(self)
for okey in data_dict:
Expand Down Expand Up @@ -462,18 +468,44 @@ def features_loaded(self):
always included. They are defined in
:const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`.
"""
features_innate = self.features_innate
features_loaded = []
for feat in self.features:
if (feat in features_innate
or feat in FEATURES_RAPID
or feat in self._usertemp
or feat in self._ancillaries):
# Note that there is no hash checking here for
# ancillary features. This might be interesting
# only in rare cases.
features_loaded.append(feat)
return features_loaded
features_loaded = self.features_local + self.features_innate
features_loaded += [f for f in self.features if f in FEATURES_RAPID]
return sorted(set(features_loaded))

@property
def features_local(self):
    """Names of all features that are available locally

    This is a slimmed-down variant of `features_loaded`: nothing has
    to be computed for these features, not even the rapid features
    (:const:`dclab.rtdc_dataset.feat_anc_core.FEATURES_RAPID`), and
    features from remote sources that have not been downloaded yet
    are left out. Ancillary and temporary features that are already
    present in this instance are included.

    Returns
    -------
    features_local: list of str
        Sorted list of unique feature names
    """
    local = set()

    # Events that have been cached by the events object (if the
    # object has such a cache). Note that the hierarchy format just
    # calls its hparent's `features_local`.
    if hasattr(self._events, "_cached_events"):
        local.update(self._events._cached_events.keys())

    # For the HDF5 format, all keys of the events object count.
    if self.format == "hdf5":
        local.update(self._events.keys())

    # Descend into the basins: either an available HDF5 file basin,
    # or any basin whose `_ds` attribute is already populated.
    for bn in self.basins:
        is_local_file = (bn.basin_format == "hdf5"
                         and bn.basin_type == "file"
                         and bn.is_available())
        if is_local_file or bn._ds is not None:
            local.update(bn.ds.features_local)

    # Whatever ancillary or temporary feature is stored in this
    # instance can be used right away.
    local.update(self._ancillaries.keys())
    local.update(self._usertemp.keys())

    return sorted(local)

@property
def features_scalar(self):
Expand Down
4 changes: 4 additions & 0 deletions dclab/rtdc_dataset/fmt_hierarchy/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,10 @@ def features_innate(self):
def features_loaded(self):
    """Return the `features_loaded` of the hierarchy parent"""
    parent = self.hparent
    return parent.features_loaded

@property
def features_local(self):
    """Return the `features_local` of the hierarchy parent"""
    parent = self.hparent
    return parent.features_local

@property
def features_scalar(self):
    """Return the `features_scalar` of the hierarchy parent"""
    parent = self.hparent
    return parent.features_scalar
Expand Down
96 changes: 96 additions & 0 deletions tests/test_rtdc_core_feat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import pytest

import dclab
from dclab import RTDCWriter
from dclab.rtdc_dataset import rtdc_copy
from dclab.rtdc_dataset.fmt_http import RTDC_HTTP
import h5py

from helper_methods import DCOR_AVAILABLE, retrieve_data


# URL of the dataset that the tests below open via `RTDC_HTTP` and
# register as a remote "http" basin; those tests are skipped when
# `DCOR_AVAILABLE` is false.
http_url = ("https://objectstore.hpccloud.mpcdf.mpg.de/"
"circle-5a7a053d-55fb-4f99-960c-f478d0bd418f/"
"resource/fb7/19f/b2-bd9f-817a-7d70-f4002af916f0")


def test_features_local_basic():
    """Local and loaded features agree for a plain HDF5 file

    The "index" and "time" features are read before the comparison,
    so that they are expected to show up in the local features.
    """
    path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")
    with dclab.new_dataset(path) as ds:
        # Read "index" and "time" first, then compare both lists.
        first_index = ds["index"][0]
        assert first_index == 1
        time_size = len(ds["time"])
        assert time_size == 10
        assert ds.features_local == ds.features_loaded


def test_features_local_basin():
    """Check `features_local` for a dataset with an HDF5 file basin

    The "deform" feature is deleted from the downloaded file, which
    is then referenced as a file basin by a new, small dataset that
    only contains "area_um".
    """
    basin_path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")
    small_path = basin_path.with_name("small.rtdc")

    # Remove "deform" from the file that will serve as the basin.
    with h5py.File(basin_path, "a") as h5:
        del h5["events/deform"]

    with h5py.File(basin_path) as src:
        with RTDCWriter(small_path) as hw:
            # Copy only the "area_um" feature to the new file ...
            rtdc_copy(src_h5file=src,
                      dst_h5file=hw.h5file,
                      features=["area_um"])
            # ... and reference the original file as its basin.
            hw.store_basin(basin_name="example basin",
                           basin_type="file",
                           basin_format="hdf5",
                           basin_locs=[basin_path],
                           basin_descr="an example test basin",
                           )

    with dclab.new_dataset(small_path) as ds:
        # "area_um" is stored in the small file itself; "area_msd" was
        # not copied and is presumably provided by the file basin.
        for feat in ("area_um", "area_msd"):
            assert feat in ds.features_local
        # "time" was not accessed, thus it is not locally available
        assert "time" not in ds.features_local
        assert "time" in ds.features_loaded
        # "deform" was removed above, but can be computed (ancillary)
        assert "deform" in ds.features_loaded
        assert "deform" not in ds.features_local
        assert "circ" in ds.features_local
        # Just access the feature; afterwards it must be local.
        deform = ds["deform"]
        assert deform is not None
        assert "deform" in ds.features_local


@pytest.mark.skipif(not DCOR_AVAILABLE,
                    reason="DCOR not accessible")
def test_features_local_remote():
    """Local features of a remote dataset are empty until accessed"""
    with RTDC_HTTP(http_url) as ds:
        # Nothing has been accessed yet: no local features, although
        # the list of loaded features is not empty.
        assert not ds.features_local
        assert ds.features_loaded
        # Access a feature; it must then be the only local one.
        deform = ds["deform"]
        assert deform is not None
        assert ds.features_local == ["deform"]


@pytest.mark.skipif(not DCOR_AVAILABLE,
                    reason="DCOR not accessible")
def test_features_local_remote_basin(tmp_path):
    """Check `features_local` for a dataset with a remote http basin

    The local file only gets the metadata of the remote dataset and a
    basin entry pointing to `http_url`.
    """
    workdir = tmp_path.resolve()
    h5path = workdir / "test_basin_http.rtdc"

    with h5py.File(h5path, "a") as dst:
        with RTDC_HTTP(http_url) as src:
            # Write the metadata and the basin information only
            # (no feature data is stored in the local file).
            with RTDCWriter(dst, mode="append") as hw:
                hw.store_metadata(src.config.as_dict(pop_filtering=True))
                hw.store_basin(basin_name="example basin",
                               basin_type="remote",
                               basin_format="http",
                               basin_locs=[http_url],
                               basin_descr="an example http test basin",
                               )

    with dclab.new_dataset(h5path) as ds:
        assert not ds.features_local
        assert ds.features_loaded
        # Access a feature
        deform = ds["deform"]
        assert deform is not None
        # The "circ" feature should not be downloaded for accessing
        # the "deform" feature.
        assert ds.features_local == ["deform"]
        assert "deform" not in ds.features_innate
        assert "deform" in ds.features_basin
        assert ds.basins[0].ds.features_local == ["deform"]

0 comments on commit 94d849c

Please sign in to comment.