Skip to content

Commit

Permalink
Merge branch 'master' into 251_export_log
Browse files Browse the repository at this point in the history
  • Loading branch information
NadiaSbaa committed Jun 11, 2024
2 parents 474291e + 5e1e5b1 commit 30c219d
Show file tree
Hide file tree
Showing 10 changed files with 231 additions and 14 deletions.
6 changes: 5 additions & 1 deletion CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
0.59.1
0.59.2
- enh: add log entry to the output file when exporting data
0.59.1
- fix: protection against cyclic basin dependencies
- fix: ValueError when exporting data from basin-mapped dataset
- fix: Check for basin availability when checking run identifier
0.59.0
- feat: support basins with blown indices
- enh: increase verbosity when failing to resolve basins
Expand Down
33 changes: 31 additions & 2 deletions dclab/rtdc_dataset/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@ def __init__(self, identifier=None, enable_basins=True):
self._usertemp = {}
# List of :class:`.Basin` for external features
self._basins = None
# List of basin identifiers that should be ignored, used to
# avoid circular basin dependencies
self._basins_ignored = []
#: Configuration of the measurement
self.config = None
#: Export functionalities; instance of
Expand Down Expand Up @@ -275,7 +278,7 @@ def _get_basin_feature_data(
"""
data = None
if self.basins:
for bn in self.basins:
for bn in list(self.basins):
if basin_type is not None and basin_type != bn.basin_type:
# User asked for specific basin type
continue
Expand All @@ -292,9 +295,19 @@ def _get_basin_feature_data(
data = bn.get_feature_data(feat)
# The data are available, we may abort the search.
break
except (KeyError, OSError, PermissionError, RecursionError):
except (KeyError, OSError, PermissionError):
# Basin data not available
pass
except feat_basin.BasinNotAvailableError:
# remove the basin from the list
# TODO:
# Check whether this has an actual effect. It could be
# that due to some iterative process `self`
# gets re-initialized and we have to go through this
# again.
self._basins.remove(bn)
warnings.warn(
f"Removed unavailable basin {bn} from {self}")
except BaseException:
warnings.warn(f"Could not access {feat} in {self}:\n"
f"{traceback.format_exc()}")
Expand Down Expand Up @@ -523,6 +536,13 @@ def features_scalar(self):
def hash(self):
"""Reproducible dataset hash (defined by derived classes)"""

def ignore_basins(self, basin_identifiers):
"""Ignore these basin identifiers when looking for features
This is used to avoid circular basin dependencies.
"""
self._basins_ignored += basin_identifiers

def apply_filter(self, force=None):
"""Compute the filters for the dataset"""
if force is None:
Expand Down Expand Up @@ -797,11 +817,18 @@ def basins_retrieve(self):
# Sort basins according to priority
bdicts_srt = sorted(self.basins_get_dicts(),
key=feat_basin.basin_priority_sorted_key)
bd_keys = [bd["key"] for bd in bdicts_srt if "key" in bd]
bd_keys += self._basins_ignored
for bdict in bdicts_srt:
if bdict["format"] not in bc:
warnings.warn(f"Encountered unsupported basin "
f"format '{bdict['format']}'!")
continue
if "key" in bdict and bdict["key"] in self._basins_ignored:
warnings.warn(
f"Encountered cyclic basin dependency '{bdict['key']}'",
feat_basin.CyclicBasinDependencyFoundWarning)
continue

# Basin initialization keyword arguments
kwargs = {
Expand All @@ -816,6 +843,8 @@ def basins_retrieve(self):
"mapping_referrer": self,
# Make sure the measurement identifier is checked.
"measurement_identifier": self.get_measurement_identifier(),
# allow to ignore basins
"ignored_basins": bd_keys,
}

# Check whether this basin is supported and exists
Expand Down
32 changes: 26 additions & 6 deletions dclab/rtdc_dataset/feat_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,26 @@
import numbers
import threading
from typing import Dict, List, Literal
import warnings
import weakref

import numpy as np


class CyclicBasinDependencyFoundWarning(UserWarning):
"""Used when a basin is defined in one of its sub-basins"""


class BasinmapFeatureMissingError(KeyError):
"""Used when one of the `basinmap` features is not defined"""
pass


class BasinNotAvailableError(BaseException):
"""Used to identify situations where the basin data is not available"""
pass


class BasinAvailabilityChecker(threading.Thread):
"""Helper thread for checking basin availability in the background"""
def __init__(self, basin, *args, **kwargs):
Expand Down Expand Up @@ -57,6 +67,7 @@ def __init__(self,
"basinmap9",
] = "same",
mapping_referrer: Dict = None,
ignored_basins: List[str] = None,
**kwargs):
"""
Expand Down Expand Up @@ -90,6 +101,8 @@ def __init__(self,
in situations where `mapping != "same"`. This can be a simple
dictionary of numpy arrays or e.g. an instance of
:class:`.RTDCBase`.
ignored_basins: list of str
List of basins to ignore in subsequent basin instantiations
kwargs:
Additional keyword arguments passed to the `load_dataset`
method of the `Basin` subclass.
Expand All @@ -110,6 +123,8 @@ def __init__(self,
#: measurement identifier of the referencing dataset
self.measurement_identifier = measurement_identifier
self._measurement_identifier_verified = False
#: ignored basins
self.ignored_basins = ignored_basins or []
#: additional keyword arguments passed to the basin
self.kwargs = kwargs
#: Event mapping strategy. If this is "same", it means that the
Expand Down Expand Up @@ -147,10 +162,8 @@ def __repr__(self):

def _assert_measurement_identifier(self):
"""Make sure the basin matches the measurement identifier
This method caches its result, i.e. only the first call is slow.
"""
if not self.verify_basin(run_identifier=True, availability=False):
if not self.verify_basin(run_identifier=True):
raise KeyError(f"Measurement identifier of basin {self.ds} "
f"({self.get_measurement_identifier()}) does "
f"not match {self.measurement_identifier}!")
Expand Down Expand Up @@ -195,8 +208,9 @@ def ds(self):
"""The :class:`.RTDCBase` instance represented by the basin"""
if self._ds is None:
if not self.is_available():
raise ValueError(f"Basin {self} is not available!")
raise BasinNotAvailableError(f"Basin {self} is not available!")
self._ds = self.load_dataset(self.location, **self.kwargs)
self._ds.ignore_basins(self.ignored_basins)
return self._ds

@property
Expand Down Expand Up @@ -278,7 +292,12 @@ def load_dataset(self, location, **kwargs):
ds_bn = ds
return ds_bn

def verify_basin(self, availability=True, run_identifier=True):
def verify_basin(self, run_identifier=True, availability=True):
if not availability:
warnings.warn("The keyword argument 'availability' is "
"deprecated, because it can lead to long waiting "
"times with many unavailable basins.",
DeprecationWarning)
if availability:
check_avail = self.is_available()
else:
Expand Down Expand Up @@ -360,6 +379,7 @@ def __getattr__(self, item):
"features_local",
"features_scalar",
"get_measurement_identifier",
"ignore_basins",
]:
return getattr(self.ds, item)
else:
Expand Down Expand Up @@ -414,7 +434,7 @@ def __getitem__(self, index):
return self.feat_obj[self.basinmap[index]]
elif not self.is_scalar:
# image, mask, etc
if index == slice(None):
if isinstance(index, slice) and index == slice(None):
indices = self.basinmap
else:
indices = self.basinmap[index]
Expand Down
1 change: 1 addition & 0 deletions dclab/rtdc_dataset/fmt_hdf5/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,5 +182,6 @@ def basins_get_dicts(self):
if isinstance(bdat[0], bytes):
bdat = [bi.decode("utf") for bi in bdat]
bdict = json.loads(" ".join(bdat))
bdict["key"] = bk
basins.append(bdict)
return basins
118 changes: 118 additions & 0 deletions tests/test_rtdc_feat_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,124 @@
from helper_methods import DCOR_AVAILABLE, retrieve_data


def test_basin_cyclic_dependency_found():
"""A basin can be defined in one of its sub-basins
This is something dclab identifies and then
raises a CyclicBasinDependencyFoundWarning
"""
h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")
paths_list = [
h5path.with_name("level1.rtdc"),
h5path.with_name("level2.rtdc"),
h5path.with_name("level3.rtdc"),
]

paths_list_rolled = np.roll(paths_list, 1)
basins_list = []
for pp in paths_list_rolled:
basins_list.append({
"basin_name": "user-defined basin",
"basin_type": "file",
"basin_format": "hdf5",
"basin_feats": ["userdef1"],
"basin_locs": [str(pp)],
"verify": False,
})

for bdict, pp in zip(basins_list, paths_list):
with h5py.File(h5path) as src, RTDCWriter(pp) as hw:
# copy all the scalar features to the new file
rtdc_dataset.rtdc_copy(src_h5file=src,
dst_h5file=hw.h5file,
features="scalar")
hw.store_basin(**bdict)

# Open the dataset
with dclab.new_dataset(paths_list[0]) as ds:
assert np.allclose(ds["deform"][0], 0.02494624), "sanity check"
with pytest.warns(feat_basin.CyclicBasinDependencyFoundWarning,
match="Encountered cyclic basin dependency"):
with pytest.raises(KeyError):
_ = ds["userdef1"]


def test_basin_cyclic_dependency_found_2():
"""A basin can be defined in one of its sub-basins
This is something dclab identifies and then
raises a CyclicBasinDependencyFoundWarning
"""
h5path = retrieve_data("fmt-hdf5_fl_wide-channel_2023.zip")
# Those are the files with a cyclic dependency
p1 = h5path.with_name("level1.rtdc")
p2 = h5path.with_name("level2.rtdc")
p3 = h5path.with_name("level3.rtdc")
# This is the file that will contain the userdef1 feature
pz = h5path.with_name("final.rtdc")

# Initialize datasets
for pp in [p1, p2, p3, pz]:
with h5py.File(h5path) as src, RTDCWriter(pp) as hw:
# copy all the scalar features to the new file
rtdc_dataset.rtdc_copy(src_h5file=src,
dst_h5file=hw.h5file,
features="scalar")

with RTDCWriter(pz) as hw:
hw.store_feature("userdef1", hw.h5file["events/deform"][:])

bn_kwargs = {
"basin_type": "file",
"basin_format": "hdf5",
"basin_feats": ["userdef1"],
"verify": False,
}

with RTDCWriter(p1) as hw:
hw.store_basin(
basin_name="link to path 2",
basin_locs=[str(p2)],
**bn_kwargs
)

# store a basin with a key that is sorted after the basin above
bdat = {
"name": "user-defined data",
"type": "file",
"format": "hdf5",
"features": ["userdef1"],
"paths": [str(pz)]
}
blines = json.dumps(bdat, indent=2).split("\n")
basins = hw.h5file.require_group("basins")
hw.write_text(basins, "zzzz99999zzzzz99999", blines)

with RTDCWriter(p2) as hw:
hw.store_basin(
basin_name="link to path 3",
basin_locs=[str(p3)],
**bn_kwargs
)

with RTDCWriter(p3) as hw:
hw.store_basin(
basin_name="link to path 1, completing the circle",
basin_locs=[str(p1)],
**bn_kwargs
)

# Open the dataset
with dclab.new_dataset(p1) as ds:
assert np.allclose(ds["deform"][0], 0.02494624), "sanity check"
assert "userdef1" in ds
assert ds.basins[0].name == "link to path 2", "order matters"
assert ds.basins[1].name == "user-defined data", "order matters"
with pytest.warns(feat_basin.CyclicBasinDependencyFoundWarning,
match="Encountered cyclic basin dependency"):
_ = ds["userdef1"]


@pytest.mark.filterwarnings(
"ignore::dclab.rtdc_dataset.config.WrongConfigurationTypeWarning")
@pytest.mark.skipif(not DCOR_AVAILABLE, reason="DCOR is not available")
Expand Down
41 changes: 41 additions & 0 deletions tests/test_rtdc_feat_basin_mapped.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,47 @@ def test_basin_mapped(basinmap):
assert np.all(ds1["mask"] == ds0["mask"][:][basinmap])


@pytest.mark.parametrize("basinmap", [
[1, 3, 4], # very simple case
[1, 1, 1, 2], # not trivial, not realizable with hierarchy children
])
def test_basin_mapped_export(basinmap):
path = retrieve_data("fmt-hdf5_image-mask-blood_2021.zip")
with h5py.File(path, "a") as h5:
# delete circularity to avoid ancillary feature computation in this
# test.
del h5["events"]["circ"]

path_out = path.with_name("level1.rtdc")
basinmap = np.array(basinmap, dtype=np.uint64)

# create basin
with dclab.new_dataset(path) as ds0, dclab.RTDCWriter(path_out) as hw1:
hw1.store_metadata(ds0.config.as_dict(pop_filtering=True))
hw1.store_basin(basin_name="level1",
basin_type="file",
basin_format="hdf5",
basin_locs=[path],
basin_map=basinmap
)

path_export = path.with_name("export.rtdc")
with dclab.new_dataset(path_out) as ds1:
ds1.filter.manual[:] = False
ds1.filter.manual[:2] = True
ds1.apply_filter()
ds1.export.hdf5(
path_export,
features=["deform", "image", "mask"]
)

# Export data
with dclab.new_dataset(path) as ds0, dclab.new_dataset(path_export) as ds1:
assert np.all(ds1["deform"] == ds0["deform"][basinmap[:2]])
assert np.all(ds1["image"] == ds0["image"][:][basinmap[:2]])
assert np.all(ds1["mask"] == ds0["mask"][:][basinmap[:2]])


def test_error_when_basinmap_not_given():
"""This is a test for when the basinmap feature for mapping is missing
Expand Down
4 changes: 2 additions & 2 deletions tests/test_rtdc_fmt_dcor_basin.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from dclab import new_dataset, RTDCWriter
from dclab.rtdc_dataset.fmt_dcor import DCORBasin, RTDC_DCOR

from dclab.rtdc_dataset.feat_basin import BasinNotAvailableError

from helper_methods import DCOR_AVAILABLE, retrieve_data

Expand Down Expand Up @@ -101,7 +101,7 @@ def test_basin_not_available(url):
bn = DCORBasin("https://dcor.mpl.mpg.de/api/3/action/dcserv?id="
"00000000-0000-0000-0000-000000000000")
assert not bn.is_available()
with pytest.raises(ValueError, match="is not available"):
with pytest.raises(BasinNotAvailableError, match="is not available"):
_ = bn.ds


Expand Down
Loading

0 comments on commit 30c219d

Please sign in to comment.