From f2a867a1f079047bf10b9e170de993966e4e2f51 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 8 May 2026 13:04:25 +0200 Subject: [PATCH 01/10] Add inline CTable support to TreeStore TreeStore can now store CTable objects as first-class inline leaves alongside NDArrays: ts["/arr"] = blosc2.arange(10) ts["/table"] = ctable # CTable stored inline table = ts["/table"] # returns CTable transparently Physical layout: CTable internals (_meta, _valid_rows, _cols/*, _indexes/*) are stored as ordinary Blosc2 leaves inside the outer store's working directory, avoiding nested ZIPs and keeping everything directly addressable by offset in .b2z bundles. Implementation: - Add TreeStoreTableStorage backend in ctable_storage.py that maps CTable logical keys onto an outer TreeStore's map_tree/working_dir - Refactor CTable.save() and CTable.open() around shared _save_to_storage() and _open_from_storage() helpers; add private _save_to_treestore() and _open_from_treestore() used by TreeStore - Add persistent object registry in TreeStore (embed-store vlmeta) to track object roots; probe _//_meta as fallback for old stores - Update TreeStore.__setitem__ to dispatch CTable and block writes to object internals - Update TreeStore.__getitem__ to return CTable for registered object roots - Update TreeStore.__delitem__ to remove all physical leaves of an object root and unregister it; block direct deletion of internals - Update keys(), __contains__, walk(), get_children(), get_descendants() to treat object roots as opaque leaves and hide their internals - get_subtree() raises ValueError on object root paths - TreeStore.close()/discard() flush inline CTable handles before repacking - Add 30 new tests covering b2d/b2z, append mode, traversal, guards, deletion, roundtrips, and string columns - Add plans/tree_store_ctable_ndarray.md with design rationale --- plans/tree_store_ctable_ndarray.md | 670 +++++++++++++++++++++++++++++ src/blosc2/ctable.py | 211 +++++---- 
src/blosc2/ctable_storage.py | 378 ++++++++++++++++ src/blosc2/tree_store.py | 335 +++++++++++---- tests/test_tree_store.py | 322 ++++++++++++++ 5 files changed, 1755 insertions(+), 161 deletions(-) create mode 100644 plans/tree_store_ctable_ndarray.md diff --git a/plans/tree_store_ctable_ndarray.md b/plans/tree_store_ctable_ndarray.md new file mode 100644 index 00000000..3a180916 --- /dev/null +++ b/plans/tree_store_ctable_ndarray.md @@ -0,0 +1,670 @@ +# TreeStore containing NDArrays and CTables + +## Goal + +Allow a persistent `TreeStore` to contain both ordinary Blosc2 leaves, such as +`NDArray`, and higher-level objects, specifically `CTable`, while keeping the +public API simple: + +```python +with blosc2.TreeStore("bundle.b2z", mode="w") as ts: + ts["/x"] = blosc2.arange(10) + ts["/table"] = table + +with blosc2.open("bundle.b2z", mode="r") as ts: + x = ts["/x"] # NDArray + table = ts["/table"] # CTable +``` + +The preferred design is **inline CTable subtree storage**, not nested `.b2z` +files. This avoids the ZIP-inside-ZIP problem for read-only `.b2z` bundles and +keeps all CTable components directly addressable as normal Blosc2 frame leaves +inside the outer store. + +## Non-goals for the first iteration + +- Full recursive `DictStore` / `TreeStore` values. +- Arbitrary Python object storage. +- Public APIs like `CTable.save_to_store()` or `CTable.open_from_store()`. +- Zero-copy linking to an external CTable path. Assignment should copy/materialize + the CTable into the destination TreeStore subtree. + +These may be considered later after the CTable use case is stable. + +## Current situation + +`CTable` is already persisted internally as a `TreeStore` with a layout like: + +```text +table.b2d or table.b2z + /_meta + /_valid_rows + /_cols/ + /_indexes/... +``` + +The `/_meta` SChunk has metadata such as: + +```python +meta.vlmeta["kind"] = "ctable" +meta.vlmeta["version"] = 1 +meta.vlmeta["schema"] = ... 
+``` + +`blosc2.open("table.b2z")` already detects this root manifest and returns a +`CTable` instead of a raw `TreeStore`. + +However, `TreeStore.__setitem__()` currently only supports array-like values via +`DictStore.__setitem__()`. It does not support assigning a `CTable` as a leaf, +and if CTable internals were placed below `/table`, `TreeStore.__getitem__()` +would currently return a subtree view for `/table`, not a `CTable`. + +## Proposed physical layout + +Store CTable internals inline below the assigned key: + +```text +bundle.b2z + embed.b2e + x.b2nd + table/_meta.b2f + table/_valid_rows.b2nd + table/_cols/name.b2nd + table/_cols/age.b2nd + table/_indexes/... +``` + +For a directory-backed outer store: + +```text +bundle.b2d/ + embed.b2e + x.b2nd + table/ + _meta.b2f + _valid_rows.b2nd + _cols/ + name.b2nd + age.b2nd + _indexes/ + ... +``` + +From the outer `TreeStore` point of view, `/table` is an object root. Its +internal paths are implementation details. + +## Proposed public semantics + +### Write + +```python +ts["/table"] = ctable +``` + +Materializes `ctable` into the subtree rooted at `/table` and registers `/table` +as a CTable object root. + +### Read + +```python +table = ts["/table"] +``` + +Returns a `CTable` when `/table` is registered as an object root or when +`/table/_meta` declares `kind == "ctable"`. + +### Traversal + +Normal high-level traversal should treat `/table` as one object: + +```python +sorted(ts.keys()) +# ['/table', '/x'] +``` + +not: + +```python +["/table/_meta", "/table/_valid_rows", "/table/_cols/name", ...] +``` + +A raw/internal view can be considered later if needed. + +## Metadata design + +Use two metadata layers: + +1. **CTable internal manifest** at `/table/_meta`. + - This remains authoritative for opening the CTable. + - It already contains `kind == "ctable"`, version and schema. + +2. **TreeStore object registry** in TreeStore-level metadata. 
+ - Used for efficient object-boundary detection and for hiding internals in + `keys()`, `items()`, `walk()`, deletion and conflict checks. + +Suggested TreeStore-level registry: + +```python +tstore.vlmeta["objects"] = { + "/table": { + "kind": "ctable", + "version": 1, + "layout": "inline-tree-subtree", + } +} +``` + +The registry is a convenience index. If missing, TreeStore should be able to +fall back to probing `/table/_meta` for backward compatibility and robustness. + +## Internal protocol + +Do not expose public `CTable.save_to_store()` / `CTable.open_from_store()` APIs +initially. Instead, implement private/internal hooks. + +Possible hooks: + +```text +CTable._save_to_treestore(store: TreeStore, key: str) -> None +CTable._open_from_treestore(store: TreeStore, key: str) -> CTable +``` + +or a more generic private object protocol later: + +```python +obj.__blosc2_store_into__(store, key) +``` + +For the first implementation, CTable-specific private methods are likely simpler. + +## Required implementation pieces + +### 1. `TreeStoreTableStorage` + +Add a new `TableStorage` backend in `src/blosc2/ctable_storage.py`: + +```python +class TreeStoreTableStorage(TableStorage): + def __init__( + self, + store: blosc2.TreeStore, + root_key: str, + mode: str, + owns_store: bool = False, + ): ... +``` + +It maps CTable logical storage keys: + +```text +/_meta +/_valid_rows +/_cols/ +/_indexes/... +``` + +onto outer TreeStore keys/paths: + +```text +/_meta +/_valid_rows +/_cols/ +/_indexes/... 
+``` + +For example: + +```text +_table_key("/_meta") -> "/table/_meta" +_table_key("/_valid_rows") -> "/table/_valid_rows" +_table_key("/_cols/x") -> "/table/_cols/x" +``` + +It should implement the same `TableStorage` interface currently implemented by +`FileTableStorage`: + +- `create_column()` +- `open_column()` +- `create_list_column()` +- `open_list_column()` +- `create_varlen_scalar_column()` +- `open_varlen_scalar_column()` +- `create_valid_rows()` +- `open_valid_rows()` +- `save_schema()` +- `load_schema()` +- `check_kind()` +- `table_exists()` +- `is_read_only()` +- `open_mode()` +- `delete_column()` +- `rename_column()` +- `close()` +- `discard()` +- index catalog / epoch helpers +- `index_anchor_path()` + +Important lifecycle rule: + +- If the backend is created from an existing outer `TreeStore`, it should not + close/discard the outer store unless it explicitly owns it. +- Use `owns_store=False` for tables returned by `ts["/table"]`. + +### 2. Refactor CTable open/save around `TableStorage` + +Currently `CTable.__init__()`, `CTable.open()` and `CTable.save()` are strongly +oriented around either `FileTableStorage(urlpath, mode)` or `InMemoryTableStorage()`. + +Add private helper paths so any `TableStorage` implementation can be used: + +```text +CTable._open_from_storage(storage: TableStorage) -> CTable +CTable._save_to_storage(storage: TableStorage) -> None +``` + +Then: + +- `CTable.open(urlpath, mode="r")` uses `FileTableStorage` and calls + `_open_from_storage()`. +- `CTable.save(urlpath, overwrite=False)` uses `FileTableStorage` and calls + `_save_to_storage()`. +- `CTable._open_from_treestore(store, key)` uses `TreeStoreTableStorage` and + calls `_open_from_storage()`. +- `CTable._save_to_treestore(store, key)` uses `TreeStoreTableStorage` and + calls `_save_to_storage()`. + +This should reduce duplication and keep the public API unchanged. + +### 3. 
TreeStore object registry helpers + +Add private helpers in `src/blosc2/tree_store.py`: + +```python +def _normalize_object_key(self, key: str) -> str: ... +def _objects_metadata(self) -> dict: ... +def _register_object( + self, key: str, *, kind: str, version: int, layout: str +) -> None: ... +def _unregister_object(self, key: str) -> None: ... +def _object_info(self, key: str) -> dict | None: ... +def _object_roots(self) -> set[str]: ... +``` + +Fallback probing helper: + +```text +def _probe_object_info(self, key: str) -> dict | None: + # Look for key + "/_meta" and inspect vlmeta["kind"]. +``` + +The registry should probably live in the TreeStore root vlmeta. Subtree views +need to translate object keys correctly between full and subtree-relative paths. + +### 4. TreeStore assignment integration + +In `TreeStore.__setitem__()` before falling back to `DictStore.__setitem__()`: + +```python +if isinstance(value, blosc2.CTable): + self._set_ctable_object(key, value) + return +``` + +`_set_ctable_object()` should: + +1. Validate key and structural conflicts. +2. Reject assigning inside an existing object subtree unless this is an internal + write performed by CTable storage. +3. Delete/overwrite an existing object root if overwrite semantics are allowed, + or raise if the key exists. This must be consistent with existing TreeStore + assignment behavior. +4. Materialize the CTable into `key` via `CTable._save_to_treestore()`. +5. Register object metadata: + + ```python + self._register_object(key, kind="ctable", version=1, layout="inline-tree-subtree") + ``` + +Need an internal bypass flag/mechanism so `TreeStoreTableStorage` can write +`/table/_meta`, `/table/_cols/x`, etc. without being blocked by object-boundary +protection. + +Possible approaches: + +- `DictStore.__setitem__()` direct calls from `TreeStoreTableStorage` after full + key translation. +- A private `TreeStore._set_internal(key, value)` method. 
+- A context manager `with store._raw_object_write(): ...`. + +Prefer a small private method so the bypass is explicit and limited. + +### 5. TreeStore retrieval integration + +In `TreeStore.__getitem__()` after key validation and before returning subtree +views: + +```python +info = self._object_info(key) or self._probe_object_info(key) +if info is not None: + if info["kind"] == "ctable": + return blosc2.CTable._open_from_treestore(self, key) +``` + +This ensures: + +```python +ts["/table"] +``` + +returns `CTable` instead of a raw subtree. + +### 6. Object-boundary protection + +Prevent accidental user mutation of CTable internals through the outer TreeStore: + +```python +ts["/table/_cols/x"] = arr # should raise by default +del ts["/table/_meta"] # should raise by default +``` + +Allowed operations: + +```python +ts["/table"] = new_ctable # replace whole object, if overwrite semantics permit +del ts["/table"] # delete whole object subtree and registry entry +table = ts["/table"] # object access +``` + +Private CTable storage code must be able to bypass this protection. + +### 7. Collapse object internals in traversal + +Update high-level `TreeStore` methods so object roots are treated as leaves: + +- `keys()` +- `items()` +- `values()` if present/added +- `walk()` +- `get_children()` +- `get_descendants()` +- `get_subtree()` behavior around object roots + +Suggested behavior: + +- Include object root key, e.g. `/table`. +- Exclude descendants under registered object roots from normal high-level + traversal. +- If a user asks for `get_subtree("/table")`, either: + - return the CTable via `__getitem__()` and document that object roots are not + normal subtrees, or + - add an explicit raw/internal method later. + +Avoid adding public raw APIs in the first iteration unless tests or development +needs require it. + +### 8. Deletion semantics + +`del ts["/table"]` should: + +1. Detect `/table` as object root. +2. Delete all physical keys/files under `/table/...`. +3. 
Remove object registry entry. +4. Mark store modified. + +Deleting normal subtrees should also remove object registry entries for any +objects inside the deleted subtree. + +### 9. `.b2z` read-only behavior + +This design avoids nested ZIPs. For fixed-width columns and metadata, read-only +outer `.b2z` access can continue to use zip offsets for cframe leaves. + +For list/varlen columns and index sidecars, mirror the existing +`FileTableStorage` logic: + +- `.b2b` leaves can be opened by offset from the outer `.b2z` because they are + Blosc2 cframes. +- Index sidecars that need filesystem paths may need extraction into the outer + store working directory, as current `FileTableStorage` already does for + `.b2z` tables. + +### 10. Index catalog handling + +`TreeStoreTableStorage` should store index sidecar paths consistently relative +to the outer store working directory, e.g.: + +```text +table/_indexes//... +``` + +Then `DictStore.to_b2z()` naturally packs them into the outer `.b2z`. + +Carefully port/adapt from `FileTableStorage`: + +- `_walk_descriptor_paths()` +- `_relativize_descriptor()` +- `_absolutize_descriptor()` +- `_ensure_index_files_extracted()` +- `load_index_catalog()` +- `save_index_catalog()` +- `index_anchor_path()` + +## Limitations of the design + +- This addresses CTable-as-object, not arbitrary recursive stores. +- Object internals are physically present in the TreeStore and must be protected. +- High-level traversal becomes semantic rather than purely physical. +- Multiple mutable handles to the same inline CTable may conflict unless handled + with caching or documented as unsupported. +- Assigning an in-memory CTable copies/materializes all columns. +- Assigning a persistent CTable should copy contents, not link to its source. +- Registry metadata and `/table/_meta` can get out of sync if manually edited; + `/table/_meta` should remain authoritative for opening. 
+- Mutation of inline CTables inside append-mode outer `.b2z` requires careful + flush/close ordering. + +## Suggested implementation phases + +### Phase 1: Storage refactor only + +- Add `CTable._open_from_storage()`. +- Add `CTable._save_to_storage()`. +- Update existing `CTable.open()` / `save()` to use the helpers. +- Ensure all current CTable tests pass unchanged. + +### Phase 2: Add `TreeStoreTableStorage` + +- Implement the backend. +- Add private `CTable._save_to_treestore()` and `_open_from_treestore()`. +- Add focused tests using private methods initially if necessary. + +### Phase 3: TreeStore object registry and dispatch + +- Add object registry metadata helpers. +- Add `TreeStore.__setitem__()` support for `CTable`. +- Add `TreeStore.__getitem__()` dispatch to return `CTable` for object roots. + +### Phase 4: Object-boundary traversal and deletion + +- Hide object internals from `keys()`, `items()`, `walk()`, etc. +- Protect internals from direct mutation. +- Implement whole-object deletion. 
+ +### Phase 5: Full CTable feature coverage + +Add/verify tests for: + +- fixed-width columns +- list columns +- varlen scalar columns +- computed/materialized column metadata +- index catalogs and sidecars +- read-only `.b2z` bundles +- append-mode `.b2d` bundles +- append-mode `.b2z` bundles + +## Test plan + +### Basic TreeStore with NDArray and CTable + +Parametrize over outer format `b2d` / `b2z`: + +```python +with blosc2.TreeStore(path, mode="w") as ts: + ts["/x"] = blosc2.arange(10) + ts["/table"] = ctable + +with blosc2.open(path, mode="r") as ts: + assert isinstance(ts["/x"], blosc2.NDArray) + assert isinstance(ts["/table"], blosc2.CTable) +``` + +### Traversal hides internals + +```python +assert "/table" in ts.keys() +assert "/table/_meta" not in ts.keys() +assert not any(k.startswith("/table/_cols") for k in ts.keys()) +``` + +### Raw physical persistence + +For debugging-level checks, inspect the filesystem/zip entries and confirm +physical internals exist: + +```text +table/_meta.b2f +table/_valid_rows.b2nd +table/_cols/... +``` + +### Structural conflict tests + +```python +ts["/table"] = ctable +with pytest.raises(ValueError): + ts["/table/_cols/x"] = arr +``` + +and reverse conflict: + +```python +ts["/table/foo"] = arr +with pytest.raises(ValueError): + ts["/table"] = ctable +``` + +### Deletion + +```python +del ts["/table"] +assert "/table" not in ts +# assert no physical table/* entries remain after reopen +``` + +### CTable feature tests + +- simple schema with numeric/string columns +- list columns +- nullable/varlen scalar columns +- indexes if applicable +- append/read after reopen + +## Decisions on initially open questions + +### Replacing an existing object root + +Do **not** allow implicit replacement. If `/table` already exists as an object +root, then: + +```python +ts["/table"] = new_table +``` + +should raise. 
Users must delete explicitly first: + +```python +del ts["/table"] +ts["/table"] = new_table +``` + +Rationale: replacing a CTable subtree is destructive and can involve many +physical leaves. Requiring an explicit delete avoids accidental data loss and +simplifies consistency handling. + +### `get_subtree()` on object roots + +`ts.get_subtree("/table")` should raise by default: + +```python +ValueError("'/table' is a CTable object root, not a TreeStore subtree") +``` + +Use: + +```python +ts["/table"] +``` + +to retrieve the `CTable` object. Returning a raw subtree would expose internals; +returning a `CTable` from `get_subtree()` would make the method misleading. + +### Public raw/internal inspection API + +Do **not** add a public raw/internal API initially. Keep object internals +private for the first implementation. If a real debugging or advanced-use need +appears later, consider an explicit API such as: + +```python +ts.get_subtree("/table", raw=True) +``` + +or: + +```python +ts.get_object_storage("/table") +``` + +Avoid exposing this too early so the inline object layout can still evolve. + +### Caching object handles + +Do **not** cache returned object handles initially. Multiple read-only handles +are fine. Multiple mutable handles to the same inline object should be +documented as unsupported initially. + +A weakref cache or writable-handle guard can be added later if practical issues +show up. + +### Write ordering and close semantics + +Returned inline CTable handles should be non-owning with respect to the outer +`TreeStore`, but the outer store should track inline handles it created so close +ordering is safe. + +Recommended behavior: + +- `TreeStore.__getitem__("/table")` returns a `CTable` backed by the outer store. +- The outer `TreeStore` keeps a private weak set/list of inline object handles it + opened. +- `TreeStore.close()` closes any still-open inline object handles before packing + an append/write-mode `.b2z` outer store. 
+- Then the outer store repacks as usual. + +This makes the following safe: + +```python +with blosc2.TreeStore("bundle.b2z", mode="a") as ts: + table = ts["/table"] + table.append(...) +# TreeStore.close() closes table first, then repacks bundle.b2z +``` + +Explicitly closing the table remains fine: + +```python +with blosc2.TreeStore("bundle.b2d", mode="a") as ts: + table = ts["/table"] + table.append(...) + table.close() +``` diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 0ff8c336..ffda0795 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -31,7 +31,7 @@ import blosc2 from blosc2 import compute_chunks_blocks -from blosc2.ctable_storage import FileTableStorage, InMemoryTableStorage, TableStorage +from blosc2.ctable_storage import FileTableStorage, InMemoryTableStorage, TableStorage, TreeStoreTableStorage from blosc2.info import InfoReporter, format_nbytes_info from blosc2.list_array import ListArray, coerce_list_cell from blosc2.scalar_array import _ScalarVarLenArray @@ -2050,44 +2050,7 @@ def open(cls, urlpath: str, *, mode: str = "r") -> CTable: storage = FileTableStorage(urlpath, mode) if not storage.table_exists(): raise FileNotFoundError(f"No CTable found at {urlpath!r}") - storage.check_kind() - schema_dict = storage.load_schema() - schema = schema_from_dict(schema_dict) - col_names = [c["name"] for c in schema_dict["columns"]] - - obj = cls.__new__(cls) - obj._row_type = None - obj._validate = True - obj._table_cparams = None - obj._table_dparams = None - obj._storage = storage - obj._read_only = storage.is_read_only() - obj._schema = schema - obj._cols = {} - obj._col_widths = {} - obj.col_names = col_names - obj.auto_compact = False - obj.base = None - - obj._valid_rows = storage.open_valid_rows() - for name in col_names: - cc = schema.columns_by_name[name] - if obj._is_list_column(cc): - obj._cols[name] = storage.open_list_column(name) - elif obj._is_varlen_scalar_column(cc): - obj._cols[name] = 
storage.open_varlen_scalar_column(name, cc.spec) - else: - obj._cols[name] = storage.open_column(name) - obj._col_widths[name] = max(len(name), cc.display_width) - - obj._n_rows = int(blosc2.count_nonzero(obj._valid_rows)) - obj._last_pos = None # resolve lazily on first write - obj._computed_cols = {} - obj._materialized_cols = {} - obj._expr_index_arrays = {} - obj._load_computed_cols_from_schema(schema_dict) - obj._load_materialized_cols_from_schema(schema_dict) - return obj + return cls._open_from_storage(storage) def to_b2z(self, urlpath: str, *, overwrite: bool = False, compact: bool = False) -> str: """Write this table to a compact ``.b2z`` container. @@ -2202,47 +2165,12 @@ def to_b2d(self, urlpath: str, *, overwrite: bool = False, compact: bool = False self.save(urlpath, overwrite=overwrite) return os.path.abspath(urlpath) - def save(self, urlpath: str, *, overwrite: bool = False) -> None: - """Persist this table to disk at *urlpath*. - - This writes a standalone copy and returns ``None``; use :meth:`copy` - directly when the copied :class:`CTable` object is needed. - - Only live rows are written — the on-disk table is always compacted. - A ``.b2z`` suffix selects the compact zip-backed format; any other - suffix creates a directory-backed store. Use a ``.b2d`` suffix for - directory-backed stores when possible so the format is clear. - - Parameters - ---------- - urlpath: - Destination path. Use a ``.b2z`` suffix for a compact zip-backed - store; any other suffix creates a directory-backed store. A - ``.b2d`` suffix is recommended for directory-backed stores. - overwrite: - If ``False`` (default), raise :exc:`ValueError` when *urlpath* - already exists. Set to ``True`` to replace an existing table. + def _save_to_storage(self, storage: TableStorage) -> None: + """Write all live rows and columns into *storage*. - Raises - ------ - ValueError - If *urlpath* already exists and ``overwrite=False``. 
+ The caller is responsible for calling ``storage.close()`` when done. + This method does **not** close *storage*. """ - if self.base is not None: - materialized = self.copy(compact=True) - materialized.save(urlpath, overwrite=overwrite) - return - - file_storage = FileTableStorage(urlpath, "w") - target_path = file_storage._root - if os.path.exists(target_path): - if not overwrite: - raise ValueError(f"Path {target_path!r} already exists. Use overwrite=True to replace.") - if os.path.isdir(target_path): - shutil.rmtree(target_path) - else: - os.remove(target_path) - self._flush_varlen_columns() # Collect live physical positions @@ -2254,7 +2182,7 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None: default_chunks, default_blocks = compute_chunks_blocks((capacity,)) # --- valid_rows (all True, compacted) --- - disk_valid = file_storage.create_valid_rows( + disk_valid = storage.create_valid_rows( shape=(capacity,), chunks=default_chunks, blocks=default_blocks, @@ -2266,7 +2194,7 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None: for col in self._schema.columns: name = col.name if self._is_list_column(col): - disk_col = file_storage.create_list_column( + disk_col = storage.create_list_column( name, spec=col.spec, cparams=col.config.cparams if col.config.cparams is not None else self._table_cparams, @@ -2277,7 +2205,7 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None: disk_col.flush() continue if self._is_varlen_scalar_column(col): - disk_col = file_storage.create_varlen_scalar_column( + disk_col = storage.create_varlen_scalar_column( name, spec=col.spec, cparams=col.config.cparams if col.config.cparams is not None else self._table_cparams, @@ -2289,7 +2217,7 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None: continue dtype_chunks, dtype_blocks = compute_chunks_blocks((capacity,), dtype=col.dtype) col_storage = self._resolve_column_storage(col, dtype_chunks, dtype_blocks) - disk_col = 
file_storage.create_column( + disk_col = storage.create_column( name, dtype=col.dtype, shape=(capacity,), @@ -2301,9 +2229,126 @@ def save(self, urlpath: str, *, overwrite: bool = False) -> None: if n_live > 0: disk_col[:n_live] = self._cols[name][live_pos] - file_storage.save_schema(self._schema_dict_with_computed()) + storage.save_schema(self._schema_dict_with_computed()) + + def save(self, urlpath: str, *, overwrite: bool = False) -> None: + """Persist this table to disk at *urlpath*. + + This writes a standalone copy and returns ``None``; use :meth:`copy` + directly when the copied :class:`CTable` object is needed. + + Only live rows are written — the on-disk table is always compacted. + A ``.b2z`` suffix selects the compact zip-backed format; any other + suffix creates a directory-backed store. Use a ``.b2d`` suffix for + directory-backed stores when possible so the format is clear. + + Parameters + ---------- + urlpath: + Destination path. Use a ``.b2z`` suffix for a compact zip-backed + store; any other suffix creates a directory-backed store. A + ``.b2d`` suffix is recommended for directory-backed stores. + overwrite: + If ``False`` (default), raise :exc:`ValueError` when *urlpath* + already exists. Set to ``True`` to replace an existing table. + + Raises + ------ + ValueError + If *urlpath* already exists and ``overwrite=False``. + """ + if self.base is not None: + materialized = self.copy(compact=True) + materialized.save(urlpath, overwrite=overwrite) + return + + file_storage = FileTableStorage(urlpath, "w") + target_path = file_storage._root + if os.path.exists(target_path): + if not overwrite: + raise ValueError(f"Path {target_path!r} already exists. 
Use overwrite=True to replace.") + if os.path.isdir(target_path): + shutil.rmtree(target_path) + else: + os.remove(target_path) + + self._save_to_storage(file_storage) file_storage.close() + @classmethod + def _open_from_storage(cls, storage: TableStorage) -> CTable: + """Construct a :class:`CTable` from an already-configured *storage* backend. + + The caller must have already verified that the storage target exists. + This is the common open path shared by :meth:`open` and + :meth:`_open_from_treestore`. + """ + storage.check_kind() + schema_dict = storage.load_schema() + schema = schema_from_dict(schema_dict) + col_names = [c["name"] for c in schema_dict["columns"]] + + obj = cls.__new__(cls) + obj._row_type = None + obj._validate = True + obj._table_cparams = None + obj._table_dparams = None + obj._storage = storage + obj._read_only = storage.is_read_only() + obj._schema = schema + obj._cols = {} + obj._col_widths = {} + obj.col_names = col_names + obj.auto_compact = False + obj.base = None + + obj._valid_rows = storage.open_valid_rows() + for name in col_names: + cc = schema.columns_by_name[name] + if obj._is_list_column(cc): + obj._cols[name] = storage.open_list_column(name) + elif obj._is_varlen_scalar_column(cc): + obj._cols[name] = storage.open_varlen_scalar_column(name, cc.spec) + else: + obj._cols[name] = storage.open_column(name) + obj._col_widths[name] = max(len(name), cc.display_width) + + obj._n_rows = int(blosc2.count_nonzero(obj._valid_rows)) + obj._last_pos = None + obj._computed_cols = {} + obj._materialized_cols = {} + obj._expr_index_arrays = {} + obj._load_computed_cols_from_schema(schema_dict) + obj._load_materialized_cols_from_schema(schema_dict) + return obj + + def _save_to_treestore(self, store: blosc2.TreeStore, full_key: str) -> None: + """Save this CTable inline into *store* under *full_key*. + + *full_key* must be the absolute (fully-translated) key within the + backing DictStore (not a subtree-relative key). 
+ Internal use only — called by :class:`blosc2.TreeStore`. + """ + if self.base is not None: + materialized = self.copy(compact=True) + materialized._save_to_treestore(store, full_key) + return + storage = TreeStoreTableStorage(store, full_key, mode="a", owns_store=False) + self._save_to_storage(storage) + # storage is non-owning; outer store handles persistence + + @classmethod + def _open_from_treestore(cls, store: blosc2.TreeStore, full_key: str) -> CTable: + """Open an inline CTable from *store* at *full_key*. + + *full_key* must be the absolute key within the backing DictStore. + Internal use only — called by :class:`blosc2.TreeStore`. + """ + storage = TreeStoreTableStorage(store, full_key, mode=store.mode, owns_store=False) + if not storage.table_exists(): + raise FileNotFoundError(f"No inline CTable found at key {full_key!r} in {store.localpath!r}") + return cls._open_from_storage(storage) + @classmethod def load(cls, urlpath: str) -> CTable: """Load a persistent table from *urlpath* into RAM. diff --git a/src/blosc2/ctable_storage.py b/src/blosc2/ctable_storage.py index 6ece618b..ec6fc337 100644 --- a/src/blosc2/ctable_storage.py +++ b/src/blosc2/ctable_storage.py @@ -603,3 +603,381 @@ def bump_visibility_epoch(self) -> int: def index_anchor_path(self, col_name: str) -> str | None: return os.path.join(self._open_store().working_dir, _INDEXES_DIR, col_name, "_anchor") + + +# --------------------------------------------------------------------------- +# TreeStore-backed backend (inline subtree layout) +# --------------------------------------------------------------------------- + + +class TreeStoreTableStorage(TableStorage): + """TableStorage backend that stores a CTable inline inside an outer TreeStore. + + All CTable components are written as normal external leaves under *root_key* + inside *store*'s working directory. This avoids nested ZIP files and allows + the entire bundle to be packed as a flat ``.b2z`` archive. 
+ + Parameters + ---------- + store: + The outer :class:`blosc2.TreeStore` (or its subtree view) that will + hold the CTable internals. + root_key: + Full absolute key where the CTable lives, e.g. ``"/table"``. + mode: + Open mode (``'r'``, ``'a'``, or ``'w'``). Should match ``store.mode``. + owns_store: + If ``True``, ``close()`` / ``discard()`` will also close / discard + *store*. Use ``False`` (default) when *store* is owned by the caller. + """ + + def __init__( + self, + store: blosc2.TreeStore, + root_key: str, + mode: str, + owns_store: bool = False, + ) -> None: + self._store = store + self._root_key = root_key.rstrip("/") + self._mode = mode + self._owns_store = owns_store + self._meta: blosc2.SChunk | None = None + + # ------------------------------------------------------------------ + # Key / path helpers + # ------------------------------------------------------------------ + + def _table_key(self, logical_key: str) -> str: + """Translate a CTable-internal logical key to an outer-store absolute key. + + For example, if *root_key* is ``"/table"`` and *logical_key* is + ``"/_meta"``, the result is ``"/table/_meta"``. 
+ """ + return self._root_key + logical_key + + def _working_dir(self) -> str: + return self._store.working_dir + + def _dest_path(self, logical_key: str, ext: str) -> str: + """Absolute filesystem path for the external leaf file.""" + rel = self._table_key(logical_key).lstrip("/") + return os.path.join(self._working_dir(), rel + ext) + + def _write_leaf(self, logical_key: str, value: Any, ext: str) -> None: + """Write *value* as a raw cframe file and register it in the outer + store's map_tree so DictStore can find it again on open.""" + dest_path = self._dest_path(logical_key, ext) + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + if isinstance(value, blosc2.SChunk) or hasattr(value, "to_cframe"): + with open(dest_path, "wb") as f: + f.write(value.to_cframe()) + else: + value.save(urlpath=dest_path, mode="w") + rel_path = os.path.relpath(dest_path, self._working_dir()).replace(os.sep, "/") + full_key = self._table_key(logical_key) + self._store.map_tree[full_key] = rel_path + self._store._modified = True + + def _open_leaf(self, logical_key: str) -> Any: + """Open a leaf via the outer store's map_tree / estore logic.""" + from blosc2.dict_store import DictStore + + full_key = self._table_key(logical_key) + return DictStore.__getitem__(self._store, full_key) + + def _list_col_path(self, name: str) -> str: + """Filesystem path for a list-style column (``.b2b``).""" + return self._dest_path(f"/_cols/{name}", ".b2b") + + # ------------------------------------------------------------------ + # TableStorage interface — lifecycle + # ------------------------------------------------------------------ + + def table_exists(self) -> bool: + full_key = self._table_key("/_meta") + return full_key in self._store.map_tree or full_key in self._store._estore + + def is_read_only(self) -> bool: + return self._mode == "r" + + def open_mode(self) -> str | None: + return self._mode + + def close(self) -> None: + if self._owns_store and self._store is not None: + 
self._store.close() + self._store = None + self._meta = None + + def discard(self) -> None: + if self._owns_store and self._store is not None: + self._store.discard() + self._store = None + self._meta = None + + # ------------------------------------------------------------------ + # TableStorage interface — columns and valid_rows + # ------------------------------------------------------------------ + + def create_column( + self, + name: str, + *, + dtype: np.dtype, + shape: tuple[int, ...], + chunks: tuple[int, ...], + blocks: tuple[int, ...], + cparams: dict[str, Any] | None, + dparams: dict[str, Any] | None, + ) -> blosc2.NDArray: + kwargs: dict[str, Any] = {"chunks": chunks, "blocks": blocks} + if cparams is not None: + kwargs["cparams"] = cparams + if dparams is not None: + kwargs["dparams"] = dparams + dest_path = self._dest_path(f"/_cols/{name}", ".b2nd") + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + col = blosc2.zeros(shape, dtype=dtype, urlpath=dest_path, mode="w", **kwargs) + rel_path = os.path.relpath(dest_path, self._working_dir()).replace(os.sep, "/") + self._store.map_tree[self._table_key(f"/_cols/{name}")] = rel_path + self._store._modified = True + return col + + def open_column(self, name: str) -> blosc2.NDArray: + return self._open_leaf(f"/_cols/{name}") + + def create_list_column( + self, + name: str, + *, + spec: ListSpec, + cparams: dict[str, Any] | None, + dparams: dict[str, Any] | None, + ) -> ListArray: + kwargs: dict[str, Any] = { + "urlpath": self._list_col_path(name), + "mode": "w", + "contiguous": True, + } + if cparams is not None: + kwargs["cparams"] = cparams + if dparams is not None: + kwargs["dparams"] = dparams + os.makedirs(os.path.dirname(self._list_col_path(name)), exist_ok=True) + return ListArray(spec=spec, **kwargs) + + def open_list_column(self, name: str) -> ListArray: + if self._store.is_zip_store and self._mode == "r": + rel = self._table_key(f"/_cols/{name}").lstrip("/") + ".b2b" + if rel not in 
self._store.offsets: + raise KeyError(f"List column {name!r} not found in {self._store.localpath!r}") + opened = blosc2.blosc2_ext.open( + self._store.b2z_path, + mode="r", + offset=self._store.offsets[rel]["offset"], + ) + return process_opened_object(opened) + return blosc2.open(self._list_col_path(name), mode=self._mode) + + def create_varlen_scalar_column( + self, + name: str, + *, + spec, + cparams=None, + dparams=None, + ) -> _ScalarVarLenArray: + urlpath = self._list_col_path(name) + os.makedirs(os.path.dirname(urlpath), exist_ok=True) + return _make_persistent_backend(spec, urlpath, "w", cparams=cparams, dparams=dparams) + + def open_varlen_scalar_column(self, name: str, spec) -> _ScalarVarLenArray: + if self._store.is_zip_store and self._mode == "r": + rel = self._table_key(f"/_cols/{name}").lstrip("/") + ".b2b" + if rel not in self._store.offsets: + raise KeyError(f"Varlen scalar column {name!r} not found in {self._store.localpath!r}") + backend = BatchArray( + _from_schunk=blosc2.blosc2_ext.open( + self._store.b2z_path, + mode="r", + offset=self._store.offsets[rel]["offset"], + ) + ) + else: + backend = _open_persistent_backend(self._list_col_path(name), self._mode, spec=spec) + _validate_role_metadata(backend, spec) + return _ScalarVarLenArray(spec, backend) + + def create_valid_rows( + self, + *, + shape: tuple[int, ...], + chunks: tuple[int, ...], + blocks: tuple[int, ...], + ) -> blosc2.NDArray: + dest_path = self._dest_path("/_valid_rows", ".b2nd") + os.makedirs(os.path.dirname(dest_path), exist_ok=True) + valid_rows = blosc2.zeros( + shape, + dtype=np.bool_, + chunks=chunks, + blocks=blocks, + urlpath=dest_path, + mode="w", + ) + rel_path = os.path.relpath(dest_path, self._working_dir()).replace(os.sep, "/") + self._store.map_tree[self._table_key("/_valid_rows")] = rel_path + self._store._modified = True + return valid_rows + + def open_valid_rows(self) -> blosc2.NDArray: + return self._open_leaf("/_valid_rows") + + # 
------------------------------------------------------------------ + # TableStorage interface — schema and manifest + # ------------------------------------------------------------------ + + def save_schema(self, schema_dict: dict[str, Any]) -> None: + meta = blosc2.SChunk() + meta.vlmeta["kind"] = "ctable" + meta.vlmeta["version"] = 1 + meta.vlmeta["schema"] = json.dumps(schema_dict) + self._write_leaf("/_meta", meta, ".b2f") + opened = self._open_leaf("/_meta") + if not isinstance(opened, blosc2.SChunk): + raise ValueError("CTable manifest '/_meta' must materialise as an SChunk.") + self._meta = opened + + def _open_meta(self) -> blosc2.SChunk: + if self._meta is None: + try: + opened = self._open_leaf("/_meta") + except KeyError as exc: + raise FileNotFoundError(f"No CTable manifest found at {self._root_key!r}") from exc + if not isinstance(opened, blosc2.SChunk): + raise ValueError(f"CTable manifest at {self._root_key!r} must be an SChunk.") + self._meta = opened + return self._meta + + def load_schema(self) -> dict[str, Any]: + raw = self._open_meta().vlmeta["schema"] + if isinstance(raw, bytes): + raw = raw.decode() + return json.loads(raw) + + def check_kind(self) -> None: + kind = self._open_meta().vlmeta["kind"] + if isinstance(kind, bytes): + kind = kind.decode() + if kind != "ctable": + raise ValueError(f"Object at {self._root_key!r} is not a CTable (kind={kind!r})") + + def column_names_from_schema(self) -> list[str]: + return [c["name"] for c in self.load_schema()["columns"]] + + def delete_column(self, name: str) -> None: + full_key = self._table_key(f"/_cols/{name}") + if full_key in self._store.map_tree: + filepath = self._store.map_tree.pop(full_key) + full_path = os.path.join(self._working_dir(), filepath) + if os.path.exists(full_path): + os.remove(full_path) + return + list_path = self._list_col_path(name) + if os.path.exists(list_path): + blosc2.remove_urlpath(list_path) + return + raise KeyError(name) + + def rename_column(self, old: str, new: 
str) -> blosc2.NDArray: + old_key = self._table_key(f"/_cols/{old}") + new_key = self._table_key(f"/_cols/{new}") + if old_key in self._store.map_tree: + new_dest = self._dest_path(f"/_cols/{new}", ".b2nd") + old_dest = os.path.join(self._working_dir(), self._store.map_tree[old_key]) + os.makedirs(os.path.dirname(new_dest), exist_ok=True) + os.replace(old_dest, new_dest) + del self._store.map_tree[old_key] + self._store.map_tree[new_key] = os.path.relpath(new_dest, self._working_dir()).replace( + os.sep, "/" + ) + self._store._modified = True + return blosc2.open(new_dest, mode=self._mode) + old_path = self._list_col_path(old) + new_path = self._list_col_path(new) + if os.path.exists(old_path): + os.makedirs(os.path.dirname(new_path), exist_ok=True) + os.replace(old_path, new_path) + return blosc2.open(new_path, mode=self._mode) + raise KeyError(old) + + # ------------------------------------------------------------------ + # TableStorage interface — index catalog and epoch counters + # ------------------------------------------------------------------ + + def load_index_catalog(self) -> dict: + meta = self._open_meta() + raw = meta.vlmeta.get("index_catalog") + if not isinstance(raw, dict): + return {} + catalog = copy.deepcopy(raw) + working_dir = self._working_dir() + store = self._store + rel_paths_needed = [] + for col_name, descriptor in catalog.items(): + catalog[col_name] = FileTableStorage._absolutize_descriptor(descriptor, working_dir) + if store.is_zip_store and self._mode == "r": + for obj, key in FileTableStorage._walk_descriptor_paths(catalog[col_name]): + v = obj[key] + if not os.path.exists(v): + rel_paths_needed.append(os.path.relpath(v, working_dir).replace(os.sep, "/")) + if rel_paths_needed: + self._ensure_index_files_extracted(rel_paths_needed) + return catalog + + def _ensure_index_files_extracted(self, rel_paths: list[str]) -> None: + import zipfile + + store = self._store + for rel in rel_paths: + dest = os.path.join(self._working_dir(), 
rel) + if os.path.exists(dest): + continue + info = store.offsets.get(rel) + if info is None: + continue + os.makedirs(os.path.dirname(dest), exist_ok=True) + with zipfile.ZipFile(store.b2z_path, "r") as zf: + with zf.open(rel) as src, open(dest, "wb") as dst: + dst.write(src.read()) + + def save_index_catalog(self, catalog: dict) -> None: + meta = self._open_meta() + working_dir = self._working_dir() + relativized = { + col: FileTableStorage._relativize_descriptor(desc, working_dir) for col, desc in catalog.items() + } + meta.vlmeta["index_catalog"] = relativized + + def get_epoch_counters(self) -> tuple[int, int]: + meta = self._open_meta() + ve = int(meta.vlmeta.get("value_epoch", 0) or 0) + vis_e = int(meta.vlmeta.get("visibility_epoch", 0) or 0) + return ve, vis_e + + def bump_value_epoch(self) -> int: + meta = self._open_meta() + ve = int(meta.vlmeta.get("value_epoch", 0) or 0) + 1 + meta.vlmeta["value_epoch"] = ve + return ve + + def bump_visibility_epoch(self) -> int: + meta = self._open_meta() + vis_e = int(meta.vlmeta.get("visibility_epoch", 0) or 0) + 1 + meta.vlmeta["visibility_epoch"] = vis_e + return vis_e + + def index_anchor_path(self, col_name: str) -> str | None: + table_rel = self._root_key.lstrip("/") + return os.path.join(self._working_dir(), table_rel, _INDEXES_DIR, col_name, "_anchor") diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index f1ac6f05..f250206c 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -167,6 +167,84 @@ def __init__(self, *args, _from_parent_store=None, **kwargs): super().__init__(*args, **kwargs, _storage_meta={"b2tree": {"version": 1}}) self.subtree_path = "" # Empty string means full tree + self._inline_handles: list = [] # inline object handles opened from this store + + # ------------------------------------------------------------------ + # Object registry helpers + # ------------------------------------------------------------------ + + def _objects_registry(self) -> dict: + 
"""Return the object registry dict stored in the embed-store SChunk vlmeta.""" + try: + reg = self._estore._store.vlmeta.get("_object_registry") + return dict(reg) if reg else {} + except Exception: + return {} + + def _register_object(self, full_key: str, *, kind: str, version: int, layout: str) -> None: + """Register *full_key* as an object root in the persistent registry.""" + try: + reg = self._objects_registry() + reg[full_key] = {"kind": kind, "version": version, "layout": layout} + self._estore._store.vlmeta["_object_registry"] = reg + except Exception: + pass # best-effort + + def _unregister_object(self, full_key: str) -> None: + """Remove *full_key* from the object registry.""" + try: + reg = self._objects_registry() + reg.pop(full_key, None) + self._estore._store.vlmeta["_object_registry"] = reg + except Exception: + pass + + def _object_info(self, full_key: str) -> dict | None: + """Return registry metadata for *full_key*, or ``None`` if not registered.""" + return self._objects_registry().get(full_key) + + def _object_roots(self) -> set: + """Return all registered object-root full keys.""" + return set(self._objects_registry().keys()) + + def _effective_object_roots(self) -> set: + """Object root keys relative to the current view (subtree or root).""" + all_roots = self._object_roots() + if not self.subtree_path: + return all_roots + result = set() + for full_key in all_roots: + relative = self._translate_key_from_full(full_key) + if relative is not None: + result.add(relative) + return result + + def _is_object_internal_key(self, key: str) -> bool: + """Return ``True`` when *key* (subtree-relative) is inside an object root.""" + return any(key != root and key.startswith(root + "/") for root in self._effective_object_roots()) + + def _probe_object_info(self, full_key: str) -> dict | None: + """Probe the physical store for a CTable manifest at *full_key*/_meta. + + Used as a fallback for stores written before the registry was introduced. 
+ """ + meta_full_key = full_key + "/_meta" + if meta_full_key not in self.map_tree and meta_full_key not in self._estore: + return None + try: + from blosc2.dict_store import DictStore + + meta_obj = DictStore.__getitem__(self, meta_full_key) + if not isinstance(meta_obj, blosc2.SChunk): + return None + kind = meta_obj.vlmeta.get("kind") + if isinstance(kind, bytes): + kind = kind.decode() + if kind == "ctable": + return {"kind": "ctable", "version": 1, "layout": "inline-tree-subtree"} + except Exception: + pass + return None def _is_vlmeta_key(self, key: str) -> bool: """Check if a key is a vlmeta key that should be hidden from regular access.""" @@ -241,7 +319,7 @@ def __setitem__( ---------- key : str Hierarchical node key. - value : np.ndarray or blosc2.NDArray or blosc2.C2Array or blosc2.SChunk + value : np.ndarray or blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.CTable to store. Raises @@ -253,6 +331,26 @@ def __setitem__( """ key = self._validate_key(key) + # --- CTable: store as inline subtree object --- + if isinstance(value, blosc2.CTable): + self._set_ctable_object(key, value) + return + + # Block writes to object internals + if self._is_object_internal_key(key): + raise ValueError( + f"Cannot write to '{key}': it is an internal component of an object root. " + f"Use the object's own API to modify it." + ) + + # Block overwriting an existing object root with a plain value + full_key = self._translate_key_to_full(key) + if self._object_info(full_key) is not None: + raise ValueError( + f"'{key}' is an object root (e.g. CTable). " + f"Delete it first with `del ts['{key}']` before assigning a new value." 
+ ) + # Check if this key already has children (is a structural subtree) children = self.get_children(key) if children: @@ -261,7 +359,6 @@ def __setitem__( ) # Check if we're trying to add a child to a path that already has data - # Extract parent path from the key if key != "/": parent_path = "/".join(key.split("/")[:-1]) if not parent_path: # Handle case where parent is root @@ -273,35 +370,48 @@ def __setitem__( f"Cannot add child '{key}' to path '{parent_path}' that already contains data" ) - full_key = self._translate_key_to_full(key) super().__setitem__(full_key, value) - def __getitem__( - self, key: str - ) -> NDArray | C2Array | SChunk | blosc2.ObjectArray | blosc2.BatchArray | TreeStore: - """Retrieve a node or subtree view. + def _set_ctable_object(self, key: str, value: blosc2.CTable) -> None: + """Materialise a CTable inline into this store at *key*.""" + if self.mode == "r": + raise ValueError("TreeStore is in read-only mode") - If the key points to a subtree (intermediate path with children), - returns a TreeStore view of that subtree. If the key points to - a final node (leaf), returns the stored array or schunk. + full_key = self._translate_key_to_full(key) - Parameters - ---------- - key : str - Hierarchical node key. + # Raise if already exists as object root (no silent replace) + if self._object_info(full_key) is not None: + raise ValueError( + f"'{key}' already exists as an object root. Delete it first with `del ts['{key}']`." + ) - Returns - ------- - out : blosc2.NDArray or blosc2.C2Array or blosc2.SChunk or blosc2.ObjectArray or blosc2.BatchArray or TreeStore - The stored array/chunk if key is a leaf node, or a TreeStore subtree view - if key is an intermediate path with children. + # Raise if already exists as data leaf + if super().__contains__(full_key): + raise ValueError(f"'{key}' already exists as a data leaf. Delete it first.") - Raises - ------ - KeyError - If key is not found. 
- ValueError - If key doesn't follow hierarchical structure rules. + # Raise if key is inside an existing object root + if self._is_object_internal_key(key): + raise ValueError(f"Cannot assign to '{key}': it is inside an existing object root.") + + # Raise if key already has structural children + children = self.get_children(key) + if children: + raise ValueError( + f"Cannot assign CTable to '{key}': structural children already exist: {children}." + ) + + value._save_to_treestore(self, full_key) + self._register_object(full_key, kind="ctable", version=1, layout="inline-tree-subtree") + self._modified = True + + def __getitem__( + self, key: str + ) -> NDArray | C2Array | SChunk | blosc2.ObjectArray | blosc2.BatchArray | blosc2.CTable | TreeStore: + """Retrieve a node, object, or subtree view. + + If the key is a registered object root (e.g. CTable) returns that object. + If the key is a structural intermediate path returns a subtree view. + If the key is a leaf returns the stored array/schunk. 
""" key = self._validate_key(key) if self._is_vlmeta_key(key): @@ -309,96 +419,117 @@ def __getitem__( full_key = self._translate_key_to_full(key) - # Check if this key has children (is a subtree) - children = self.get_children(key) + # --- Object root dispatch (registry first, then probe fallback) --- + info = self._object_info(full_key) + if info is None: + info = self._probe_object_info(full_key) + if info is not None and info["kind"] == "ctable": + ctable = blosc2.CTable._open_from_treestore(self, full_key) + self._inline_handles.append(ctable) + return ctable # Check if the key exists as an actual data node key_exists_as_data = super().__contains__(full_key) + # Check if this key has children (is a structural subtree) + children = self.get_children(key) + if children: - # If it has children, return a subtree view return self.get_subtree(key) elif key_exists_as_data: - # If no children but exists as data, it's a leaf node - get the actual data return super().__getitem__(full_key) else: - # Key doesn't exist at all raise KeyError(f"Key '{key}' not found") def __delitem__(self, key: str) -> None: - """Remove a node or subtree. + """Remove a node, object root, or subtree. - If the key points to a subtree (intermediate path with children), - removes all nodes in that subtree recursively. If the key points to a final - node (leaf), removes only that node. - - Parameters - ---------- - key : str - Hierarchical node key. - - Raises - ------ - KeyError - If key is not found. - ValueError - If key doesn't follow hierarchical structure rules. + If *key* is a registered object root, all its physical leaves and the + registry entry are removed. If *key* has children, all descendants are + removed recursively. Object internals cannot be deleted directly. 
""" key = self._validate_key(key) if self._is_vlmeta_key(key): raise KeyError(f"Key '{key}' not found; vlmeta keys are not directly accessible.") - # Check if the key exists (either as data or as a structural node with descendants) full_key = self._translate_key_to_full(key) + + # --- Object root deletion --- + if self._object_info(full_key) is not None: + self._delete_object_subtree(full_key) + return + + # Block direct deletion of object internals + if self._is_object_internal_key(key): + raise ValueError( + f"Cannot delete '{key}': it is an internal component of an object root. " + f"Delete the object root itself." + ) + + # Regular node / subtree deletion key_exists_as_data = super().__contains__(full_key) descendants = self.get_descendants(key) if not key_exists_as_data and not descendants: raise KeyError(f"Key '{key}' not found") - # Collect all keys to delete (leaf nodes only, since structural nodes don't exist as data) keys_to_delete = [] - - # If the key itself has data, include it if key_exists_as_data: keys_to_delete.append(key) - - # Add all descendant leaf nodes (only those that actually exist as data) for descendant in descendants: - full_descendant_key = self._translate_key_to_full(descendant) - if super().__contains__(full_descendant_key): + full_desc = self._translate_key_to_full(descendant) + if super().__contains__(full_desc): keys_to_delete.append(descendant) - # Delete all data keys in the subtree for k in keys_to_delete: - full_key_to_delete = self._translate_key_to_full(k) - super().__delitem__(full_key_to_delete) + super().__delitem__(self._translate_key_to_full(k)) + + def _delete_object_subtree(self, full_key: str) -> None: + """Delete all physical leaves under *full_key* and unregister it.""" + prefix = full_key + "/" + # Remove from map_tree + for k in list(self.map_tree.keys()): + if k == full_key or k.startswith(prefix): + filepath = self.map_tree.pop(k) + full_path = os.path.join(self.working_dir, filepath) + if 
os.path.exists(full_path) and not os.path.isdir(full_path): + os.remove(full_path) + # Remove any embedded entries + for k in list(self._estore.keys()): + if k == full_key or k.startswith(prefix): + import contextlib - def __contains__(self, key: str) -> bool: - """Check if a key exists. + with contextlib.suppress(KeyError): + del self._estore[k] + # Remove leftover directory (e.g. _indexes) + table_dir = os.path.join(self.working_dir, full_key.lstrip("/")) + if os.path.isdir(table_dir): + import shutil - Parameters - ---------- - key : str - Hierarchical node key. + shutil.rmtree(table_dir, ignore_errors=True) + self._unregister_object(full_key) + self._modified = True - Returns - ------- - exists : bool - True if key exists, False otherwise. - """ + def __contains__(self, key: str) -> bool: + """Check if a key exists (includes object roots).""" try: key = self._validate_key(key) if self._is_vlmeta_key(key): return False + if self._is_object_internal_key(key): + return False full_key = self._translate_key_to_full(key) - return super().__contains__(full_key) + return super().__contains__(full_key) or self._object_info(full_key) is not None except ValueError: return False def keys(self): - """Return all keys in the current subtree view.""" + """Return all keys in the current subtree view. + + Object root keys (e.g. CTable) are included as single entries. + Object-internal keys are hidden from normal traversal. 
+ """ if not self.subtree_path: all_keys = set(super().keys()) else: @@ -411,18 +542,24 @@ def keys(self): # Filter out vlmeta keys all_keys = {key for key in all_keys if not self._is_vlmeta_key(key)} - # Also include structural paths (intermediate nodes that have children but no data) + # Filter out object-internal keys + all_keys = {key for key in all_keys if not self._is_object_internal_key(key)} + + # Add object roots (they are not stored as DictStore keys themselves) + object_roots = self._effective_object_roots() + + # Build structural paths from both data leaves and object root keys + all_with_roots = all_keys | object_roots structural_keys = set() - for key in all_keys: - # For each leaf key, add all its parent paths + for key in all_with_roots: parts = key.split("/")[1:] # Remove empty first element from split current_path = "" for part in parts[:-1]: # Exclude the leaf itself current_path = current_path + "/" + part if current_path else "/" + part - if current_path and current_path != "/" and current_path not in all_keys: + if current_path and current_path != "/" and current_path not in all_with_roots: structural_keys.add(current_path) - return all_keys | structural_keys + return all_keys | structural_keys | object_roots def __iter__(self) -> Iterator[str]: """Iterate over keys, excluding vlmeta keys.""" @@ -562,14 +699,14 @@ def walk(self, path: str = "/", topdown: bool = True) -> Iterator[tuple[str, lis name for name in leaf_nodes if isinstance(name, str) and "/" not in name and name != "" ] - # 2) Ensure leaf nodes correspond to actual data nodes in the underlying store + # 2) Ensure leaf nodes correspond to actual data nodes or object roots valid_leaf_nodes: list[str] = [] for name in leaf_nodes: # Compose subtree-relative child path child_rel_path = path + "/" + name if path != "/" else "/" + name - # Translate to full key in the backing store and verify it's a data node + # Translate to full key in the backing store and verify it's a data node or 
object root full_key = self._translate_key_to_full(child_rel_path) - if super().__contains__(full_key): + if super().__contains__(full_key) or self._object_info(full_key) is not None: valid_leaf_nodes.append(name) leaf_nodes = valid_leaf_nodes @@ -618,6 +755,13 @@ def get_subtree(self, path: str) -> TreeStore: path = self._validate_key(path) full_path = self._translate_key_to_full(path) + # Object roots cannot be navigated as subtrees + if self._object_info(full_path) is not None: + raise ValueError( + f"'{path}' is an object root (e.g. CTable), not a TreeStore subtree. " + f"Use ts['{path}'] to access the object." + ) + # Create a new TreeStore instance that shares the same underlying storage # but with a different subtree_path subtree = TreeStore(_from_parent_store=self) @@ -688,6 +832,41 @@ def _persist_vlmeta(self) -> None: del self._estore[vlmeta_key] self._estore[vlmeta_key] = self._vlmeta + # ------------------------------------------------------------------ + # Lifecycle overrides (inline handle management) + # ------------------------------------------------------------------ + + def close(self) -> None: + """Flush inline object handles then delegate to DictStore.close().""" + if self._closed: + return + # Close any inline object handles (CTable etc.) before packing. 
+ for handle in list(getattr(self, "_inline_handles", [])): + try: + storage = getattr(handle, "_storage", None) + if storage is not None: + handle.close() + except Exception: + pass + if hasattr(self, "_inline_handles"): + self._inline_handles.clear() + super().close() + + def discard(self) -> None: + """Discard without repacking; also discard inline handle storage.""" + if self._closed: + return + for handle in list(getattr(self, "_inline_handles", [])): + try: + storage = getattr(handle, "_storage", None) + if storage is not None and hasattr(storage, "discard"): + storage.discard() + except Exception: + pass + if hasattr(self, "_inline_handles"): + self._inline_handles.clear() + super().discard() + if __name__ == "__main__": # Example usage diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 64249357..bec55b6b 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -5,6 +5,7 @@ # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +import dataclasses import os import shutil import zipfile @@ -1110,3 +1111,324 @@ def test_mmap_mode_validation(tmp_path): with pytest.raises(ValueError, match="mmap_mode='r' requires mode='r'"): TreeStore(str(path), mode="a", mmap_mode="r") + + +# =========================================================================== +# CTable inline object support tests +# =========================================================================== + + +@dataclasses.dataclass +class _Row: + x: int = 0 + y: float = 0.0 + + +@dataclasses.dataclass +class _RowStr: + name: str = "" + score: float = 0.0 + + +def _make_ctable(n=5): + t = blosc2.CTable(_Row) + for i in range(n): + t.append(_Row(x=i, y=i * 1.5)) + return t + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_ctable_basic_write_read(tmp_path, storage_type): + """Basic write/read of NDArray + CTable in one TreeStore.""" + path = str(tmp_path / f"bundle.{storage_type}") + t = 
@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_traversal_hides_internals(tmp_path, storage_type):
    """Object internals are hidden from keys(), walk(), __contains__."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/arr"] = np.arange(3)
        ts["/table"] = t

    with blosc2.open(path, mode="r") as ts:
        k = sorted(ts.keys())
        assert "/arr" in k
        assert "/table" in k
        # Internals must NOT appear
        assert not any(x.startswith("/table/_") for x in k)
        # __contains__
        assert "/table" in ts
        assert "/table/_meta" not in ts
        assert "/table/_valid_rows" not in ts
        assert "/table/_cols" not in ts
        # walk
        walked = list(ts.walk("/"))
        all_nodes = [n for _, _, nodes in walked for n in nodes]
        assert "table" in all_nodes
        # Fix: the original spelled this as
        #   any(n.startswith("_") for root, _, _ in walked for n in _)
        # which iterates the throwaway name ``_`` and only happened to hit
        # the leaf-node list because the last ``_`` binding wins in tuple
        # unpacking.  Name the iterated element explicitly instead.
        assert not any(n.startswith("_") for _, _, nodes in walked for n in nodes)


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_get_subtree_raises(tmp_path, storage_type):
    """get_subtree() on an object root must raise ValueError."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t
        with pytest.raises(ValueError, match="object root"):
            ts.get_subtree("/table")


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_write_to_internal_raises(tmp_path, storage_type):
    """Writing directly to object internals must raise ValueError."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t
        with pytest.raises(ValueError, match="internal component"):
            ts["/table/_cols/x"] = np.ones(3)
        with pytest.raises(ValueError, match="internal component"):
            ts["/table/_meta"] = np.ones(2)


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_replace_raises(tmp_path, storage_type):
    """Replacing an existing object root without deleting first must raise."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t
        with pytest.raises(ValueError, match="already exists as an object root"):
            ts["/table"] = t


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_structural_conflict(tmp_path, storage_type):
    """Cannot assign CTable where structural children exist, and vice-versa."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        # Structural children → CTable assignment blocked
        ts["/grp/leaf"] = np.ones(2)
        with pytest.raises(ValueError):
            ts["/grp"] = t

    path2 = str(tmp_path / f"bundle2.{storage_type}")
    with blosc2.TreeStore(path2, mode="w") as ts:
        ts["/table"] = t
        # CTable root exists → adding child blocked
        with pytest.raises(ValueError, match="internal component"):
            ts["/table/foo"] = np.ones(2)


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_deletion(tmp_path, storage_type):
    """Deleting an object root removes all its physical leaves."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/arr"] = np.arange(5)
        ts["/table"] = t
        del ts["/table"]
        assert "/table" not in ts
        assert "/arr" in ts

    # Stays gone after reopen
    with blosc2.open(path, mode="r") as ts:
        assert "/table" not in ts
        assert "/arr" in ts


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_delete_and_reassign(tmp_path, storage_type):
    """After deletion a CTable can be re-assigned at the same key."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t
        del ts["/table"]
        ts["/table"] = _make_ctable(n=3)

    with blosc2.open(path, mode="r") as ts:
        assert "/table" in ts
        t2 = ts["/table"]
        assert len(t2) == 3


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_delete_internal_raises(tmp_path, storage_type):
    """Direct deletion of object internals must raise."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t
        with pytest.raises(ValueError, match="internal component"):
            del ts["/table/_meta"]


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_append_mode(tmp_path, storage_type):
    """Inline CTable can be opened and extended in append mode."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable(n=3)

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t

    with blosc2.TreeStore(path, mode="a") as ts:
        table = ts["/table"]
        table.append(_Row(x=99, y=-1.0))
        table.close()

    with blosc2.open(path, mode="r") as ts:
        t2 = ts["/table"]
        assert len(t2) == 4
        assert list(t2["x"][:])[-1] == 99


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_coexists_with_nested_ndarray(tmp_path, storage_type):
    """CTables and nested NDArrays can coexist in the same bundle."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable()

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/group/arr"] = np.arange(6)
        ts["/group/sub/val"] = np.ones(2)
        ts["/table"] = t
        ts["/scalar"] = np.array([42])

    with blosc2.open(path, mode="r") as ts:
        k = sorted(ts.keys())
        assert "/group" in k
        assert "/group/arr" in k
        assert "/group/sub" in k
        assert "/group/sub/val" in k
        assert "/table" in k
        assert "/scalar" in k
        assert isinstance(ts["/table"], blosc2.CTable)


def test_ctable_b2d_to_b2z_roundtrip(tmp_path):
    """b2d bundle with CTable can be packed to b2z and read back."""
    b2d = str(tmp_path / "bundle.b2d")
    b2z = str(tmp_path / "bundle.b2z")
    t = _make_ctable()

    with blosc2.TreeStore(b2d, mode="w") as ts:
        ts["/arr"] = np.arange(4)
        ts["/table"] = t

    with blosc2.TreeStore(b2d, mode="r") as ts:
        ts.to_b2z(filename=b2z)

    with blosc2.open(b2z, mode="r") as ts:
        assert "/table" in ts
        t2 = ts["/table"]
        assert len(t2) == 5
        np.testing.assert_array_equal(list(t2["x"][:]), list(range(5)))


def test_ctable_b2z_to_b2d_roundtrip(tmp_path):
    """b2z bundle with CTable can be unpacked to b2d and read back."""
    b2z = str(tmp_path / "bundle.b2z")
    b2d = str(tmp_path / "bundle_out.b2d")
    t = _make_ctable()

    with blosc2.TreeStore(b2z, mode="w") as ts:
        ts["/arr"] = np.arange(4)
        ts["/table"] = t

    with blosc2.TreeStore(b2z, mode="r") as ts:
        ts.to_b2d(b2d)

    with blosc2.open(b2d, mode="r") as ts:
        assert "/table" in ts
        t2 = ts["/table"]
        assert len(t2) == 5


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_with_string_column(tmp_path, storage_type):
    """CTable with a string column round-trips correctly through TreeStore."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = blosc2.CTable(_RowStr)
    t.append(_RowStr(name="alice", score=9.5))
    t.append(_RowStr(name="bob", score=8.0))

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/tbl"] = t

    with blosc2.open(path, mode="r") as ts:
        t2 = ts["/tbl"]
        assert len(t2) == 2
        names = [str(n) for n in t2["name"][:]]
        assert "alice" in names
        assert "bob" in names


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_persistent_source_roundtrip(tmp_path, storage_type):
    """A persistent CTable can also be stored into a TreeStore bundle."""
    src_path = str(tmp_path / "standalone.b2d")
    bundle_path = str(tmp_path / f"bundle.{storage_type}")

    t = blosc2.CTable(_Row, urlpath=src_path, mode="w")
    for i in range(4):
        t.append(_Row(x=i * 10, y=float(i)))
    t.close()

    t_disk = blosc2.CTable.open(src_path, mode="r")
    with blosc2.TreeStore(bundle_path, mode="w") as ts:
        ts["/arr"] = np.zeros(3)
        ts["/table"] = t_disk
    t_disk.close()

    with blosc2.open(bundle_path, mode="r") as ts:
        t2 = ts["/table"]
        assert len(t2) == 4
        np.testing.assert_array_equal(list(t2["x"][:]), [0, 10, 20, 30])


@pytest.mark.parametrize("storage_type", ["b2d", "b2z"])
def test_ctable_context_manager_auto_close(tmp_path, storage_type):
    """Outer TreeStore auto-closes inline CTable handles on __exit__."""
    path = str(tmp_path / f"bundle.{storage_type}")
    t = _make_ctable(n=2)

    with blosc2.TreeStore(path, mode="w") as ts:
        ts["/table"] = t

    with blosc2.TreeStore(path, mode="a") as ts:
        table = ts["/table"]
        table.append(_Row(x=100, y=0.0))
        # Don't close table explicitly; outer __exit__ should handle it

    with blosc2.open(path, mode="r") as ts:
        t2 = ts["/table"]
        assert len(t2) == 3
        assert list(t2["x"][:])[-1] == 100
- >>> # Structural nodes like /child0 and /child0/child1 are created automatically. + >>> # Data lives in leaf nodes; structural nodes are created automatically. >>> tstore["/child0/leaf1"] = np.array([1, 2, 3]) >>> tstore["/child0/child1/leaf2"] = np.array([4, 5, 6]) >>> tstore["/child0/child2"] = np.array([7, 8, 9]) @@ -146,6 +147,27 @@ class TreeStore(DictStore): >>> sorted(list(subtree.keys())) ['/child1/leaf2', '/child2', '/leaf1'] + Mix NDArrays and CTables in the same bundle: + + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... x: int = 0 + ... y: float = 0.0 + >>> table = blosc2.CTable(Row) + >>> _ = table.append(Row(x=1, y=1.5)) + >>> _ = table.append(Row(x=2, y=3.0)) + >>> with blosc2.TreeStore("bundle.b2z", mode="w") as ts: + ... ts["/data/array"] = blosc2.arange(5) + ... ts["/data/table"] = table + >>> with blosc2.open("bundle.b2z", mode="r") as ts: + ... print(sorted(ts.keys())) + ... arr = ts["/data/array"] + ... tbl = ts["/data/table"] + ... print(type(tbl).__name__, len(tbl)) + ['/data', '/data/array', '/data/table'] + CTable 2 + """ # For some reason, we had to revert the explicit parametrisation of the @@ -328,6 +350,25 @@ def __setitem__( If key doesn't follow hierarchical structure rules, if trying to assign to a structural path that already has children, or if trying to add a child to a path that already contains data. + + Examples + -------- + Store an NDArray and a CTable together: + + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... x: int = 0 + >>> t = blosc2.CTable(Row) + >>> _ = t.append(Row(x=10)) + >>> with blosc2.TreeStore("store.b2z", mode="w") as ts: + ... ts["/arr"] = blosc2.zeros(5, dtype="i4") + ... ts["/table"] = t # CTable stored inline + + Replacing an existing object root requires an explicit delete first:: + + del ts["/table"] + ts["/table"] = new_table """ key = self._validate_key(key) @@ -412,6 +453,25 @@ def __getitem__( If the key is a registered object root (e.g. 
CTable) returns that object. If the key is a structural intermediate path returns a subtree view. If the key is a leaf returns the stored array/schunk. + + Examples + -------- + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... x: int = 0 + >>> t = blosc2.CTable(Row) + >>> _ = t.append(Row(x=42)) + >>> with blosc2.TreeStore("store.b2z", mode="w") as ts: + ... ts["/arr"] = blosc2.zeros(3, dtype="i4") + ... ts["/group/val"] = blosc2.ones(2, dtype="f4") + ... ts["/table"] = t + >>> with blosc2.open("store.b2z", mode="r") as ts: + ... arr = ts["/arr"] # NDArray leaf + ... sub = ts["/group"] # TreeStore subtree view + ... tbl = ts["/table"] # CTable object + ... print(type(arr).__name__, type(sub).__name__, type(tbl).__name__) + NDArray TreeStore CTable """ key = self._validate_key(key) if self._is_vlmeta_key(key): @@ -447,6 +507,21 @@ def __delitem__(self, key: str) -> None: If *key* is a registered object root, all its physical leaves and the registry entry are removed. If *key* has children, all descendants are removed recursively. Object internals cannot be deleted directly. + + Examples + -------- + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... x: int = 0 + >>> t = blosc2.CTable(Row) + >>> _ = t.append(Row(x=1)) + >>> with blosc2.TreeStore("store.b2z", mode="w") as ts: + ... ts["/arr"] = blosc2.zeros(3, dtype="i4") + ... ts["/table"] = t + ... del ts["/table"] # removes all CTable leaves + registry entry + ... print("/table" in ts) + False """ key = self._validate_key(key) @@ -512,7 +587,26 @@ def _delete_object_subtree(self, full_key: str) -> None: self._modified = True def __contains__(self, key: str) -> bool: - """Check if a key exists (includes object roots).""" + """Check if a key exists (includes object roots, excludes object internals). + + Examples + -------- + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... 
x: int = 0 + >>> t = blosc2.CTable(Row) + >>> _ = t.append(Row(x=7)) + >>> with blosc2.TreeStore("store.b2z", mode="w") as ts: + ... ts["/arr"] = blosc2.zeros(2, dtype="i4") + ... ts["/table"] = t + ... print("/table" in ts) # object root: True + ... print("/table/_meta" in ts) # internal key: False + ... print("/arr" in ts) # normal leaf: True + True + False + True + """ try: key = self._validate_key(key) if self._is_vlmeta_key(key): @@ -529,6 +623,21 @@ def keys(self): Object root keys (e.g. CTable) are included as single entries. Object-internal keys are hidden from normal traversal. + + Examples + -------- + >>> import dataclasses + >>> @dataclasses.dataclass + ... class Row: + ... x: int = 0 + >>> t = blosc2.CTable(Row) + >>> _ = t.append(Row(x=1)) + >>> with blosc2.TreeStore("store.b2z", mode="w") as ts: + ... ts["/arr"] = blosc2.zeros(3, dtype="i4") + ... ts["/group/val"] = blosc2.ones(2, dtype="f4") + ... ts["/table"] = t + ... print(sorted(ts.keys())) + ['/arr', '/group', '/group/val', '/table'] """ if not self.subtree_path: all_keys = set(super().keys()) From bbd9823c87c300647c9d01acb66c235e176ec344 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 8 May 2026 14:22:54 +0200 Subject: [PATCH 03/10] Add sections in tutorials and examples --- .../tutorials/13.containers.ipynb | 116 +++++++++++------- examples/tree-store.py | 47 ++++++- 2 files changed, 116 insertions(+), 47 deletions(-) diff --git a/doc/getting_started/tutorials/13.containers.ipynb b/doc/getting_started/tutorials/13.containers.ipynb index 417d1e97..3c4dec22 100644 --- a/doc/getting_started/tutorials/13.containers.ipynb +++ b/doc/getting_started/tutorials/13.containers.ipynb @@ -5,22 +5,7 @@ "id": "cell-01", "metadata": {}, "source": [ - "# Working with Containers\n", - "\n", - "This notebook is a guided tour of the main data containers in `python-blosc2`.\n", - "\n", - "The goal is to build a practical mental model first: what each container is, how the containers relate, and when each 
one is the right tool.\n", - "\n", - "We will cover these containers in this order:\n", - "\n", - "1. `SChunk`\n", - "2. `NDArray`\n", - "3. `ObjectArray`\n", - "4. `BatchArray`\n", - "5. `EmbedStore`\n", - "6. `DictStore`\n", - "7. `TreeStore`\n", - "8. `C2Array`" + "# Working with Containers\n\nThis notebook is a guided tour of the main data containers in `python-blosc2`.\n\nThe goal is to build a practical mental model first: what each container is, how the containers relate, and when each one is the right tool.\n\nWe will cover these containers in this order:\n\n1. `SChunk`\n2. `NDArray`\n3. `ObjectArray`\n4. `BatchArray`\n5. `EmbedStore`\n6. `DictStore`\n7. `TreeStore` (including inline `CTable` support)\n8. `C2Array`" ] }, { @@ -444,6 +429,73 @@ " show(\"/exp/run2/data\", tstore[\"/exp/run2/data\"][:])" ] }, + { + "cell_type": "markdown", + "id": "cell-17b", + "metadata": {}, + "source": [ + "### Storing CTables inside a TreeStore\n", + "\n", + "A `TreeStore` can hold **both NDArrays and CTables** in the same bundle. A `CTable` is stored inline as a named subtree — all its columns, metadata, and index sidecars live as ordinary Blosc2 leaves inside the outer store. From the outside it appears as a single key, exactly like any other leaf:\n", + "\n", + "* `ts[\"/table\"] = ctable` — stores the CTable inline (same syntax as NDArray).\n", + "* `ts[\"/table\"]` — returns a `CTable` object transparently.\n", + "* `\"/table/_meta\" not in ts` — internal keys are hidden from normal traversal.\n", + "* `del ts[\"/table\"]` — removes the whole object and all its leaves at once.\n", + "\n", + "The inline layout means there are **no nested ZIP files**: all leaves are flat members of the outer `.b2z` archive and can be opened by offset without extraction." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-17c", + "metadata": {}, + "outputs": [], + "source": [ + "from dataclasses import dataclass\n", + "\n", + "\n", + "@dataclass\n", + "class Reading:\n", + " sensor_id: int = 0\n", + " value: float = 0.0\n", + "\n", + "\n", + "bundle_path = reset(\"bundle.b2z\")\n", + "\n", + "# --- Write: mix NDArrays and CTables in one bundle ----------------------\n", + "t = blosc2.CTable(Reading)\n", + "for i in range(6):\n", + " t.append(Reading(sensor_id=i, value=round(i * 1.1, 2)))\n", + "\n", + "with blosc2.TreeStore(bundle_path, mode=\"w\") as ts:\n", + " ts[\"/raw/signal\"] = np.arange(8, dtype=np.float32)\n", + " ts[\"/tables/readings\"] = t # CTable stored inline\n", + " show(\"keys after write\", sorted(ts.keys()))\n", + " show(\"/tables/readings/_meta in ts (hidden)\", \"/tables/readings/_meta\" in ts)\n", + "\n", + "# --- Read back from the .b2z archive ------------------------------------\n", + "with blosc2.open(bundle_path, mode=\"r\") as ts:\n", + " readings = ts[\"/tables/readings\"] # returns CTable transparently\n", + " show(\"type\", type(readings).__name__)\n", + " show(\"rows\", len(readings))\n", + " show(\"sensor_id\", list(readings[\"sensor_id\"][:]))\n", + " show(\"value\", list(readings[\"value\"][:]))\n", + "\n", + "# --- Append a row in-place (append mode) --------------------------------\n", + "with blosc2.TreeStore(bundle_path, mode=\"a\") as ts:\n", + " r = ts[\"/tables/readings\"]\n", + " r.append(Reading(sensor_id=99, value=-1.0))\n", + " r.close() # optional; outer store also closes it on __exit__\n", + " show(\"rows after append\", len(ts[\"/tables/readings\"]))\n", + "\n", + "# --- Delete the CTable (all internal leaves removed) -------------------\n", + "with blosc2.TreeStore(bundle_path, mode=\"a\") as ts:\n", + " del ts[\"/tables/readings\"]\n", + " show(\"keys after delete\", sorted(ts.keys()))" + ] + }, { "cell_type": "markdown", "id": "cell-18", @@ -494,25 +546,7 
@@ "id": "cell-20", "metadata": {}, "source": [ - "## Choosing The Right Container\n", - "\n", - "| Container | Backing idea | Best for |\n", - "| --- | --- | --- |\n", - "| `SChunk` | raw compressed chunks | direct chunk-level storage control |\n", - "| `NDArray` | `SChunk` plus array metadata | dense numeric arrays |\n", - "| `ObjectArray` | one variable-length entry per chunk | ragged or heterogeneous Python values |\n", - "| `BatchArray` | one batch per chunk | batch-oriented ingestion and access |\n", - "| `EmbedStore` | one bundled object store | packaging a few Blosc2 objects together |\n", - "| `DictStore` | keyed collection of leaves | portable multi-object datasets |\n", - "| `TreeStore` | hierarchical keyed collection | tree-structured datasets |\n", - "| `C2Array` | remote array handle | arrays hosted by a remote Caterva2 service |\n", - "\n", - "A simple rule of thumb is:\n", - "\n", - "- start with `NDArray` for dense numeric data\n", - "- drop down to `SChunk` if you need chunk-level control\n", - "- use `ObjectArray` or `BatchArray` for variable-length Python objects\n", - "- use `EmbedStore`, `DictStore`, or `TreeStore` when your dataset contains multiple objects" + "## Choosing The Right Container\n\n| Container | Backing idea | Best for |\n| --- | --- | --- |\n| `SChunk` | raw compressed chunks | direct chunk-level storage control |\n| `NDArray` | `SChunk` plus array metadata | dense numeric arrays |\n| `ObjectArray` | one variable-length entry per chunk | ragged or heterogeneous Python values |\n| `BatchArray` | one batch per chunk | batch-oriented ingestion and access |\n| `EmbedStore` | one bundled object store | packaging a few Blosc2 objects together |\n| `DictStore` | keyed collection of leaves | portable multi-object datasets |\n| `TreeStore` | hierarchical keyed collection | tree-structured datasets with NDArrays and/or CTables |\n| `C2Array` | remote array handle | arrays hosted by a remote Caterva2 service |\n\nA simple rule of thumb 
is:\n\n- start with `NDArray` for dense numeric data\n- drop down to `SChunk` if you need chunk-level control\n- use `ObjectArray` or `BatchArray` for variable-length Python objects\n- use `EmbedStore`, `DictStore`, or `TreeStore` when your dataset contains multiple objects" ] }, { @@ -520,17 +554,7 @@ "id": "cell-21", "metadata": {}, "source": [ - "## Final Notes\n", - "\n", - "This notebook is intentionally organized from low-level storage to higher-level organization:\n", - "\n", - "- understand `SChunk` first\n", - "- use `NDArray` for most dense numeric workloads\n", - "- move to `ObjectArray` or `BatchArray` when entries stop being fixed-size arrays\n", - "- use `EmbedStore`, `DictStore`, or `TreeStore` when you need to package multiple objects together\n", - "- use `C2Array` when the data lives on a remote service\n", - "\n", - "For deeper details on a specific class, continue with the reference docs and the dedicated tutorials for `ObjectArray`, `BatchArray`, and indexing." + "## Final Notes\n\nThis notebook is intentionally organized from low-level storage to higher-level organization:\n\n- understand `SChunk` first\n- use `NDArray` for most dense numeric workloads\n- move to `ObjectArray` or `BatchArray` when entries stop being fixed-size arrays\n- use `EmbedStore`, `DictStore`, or `TreeStore` when you need to package multiple objects together\n- use `TreeStore` + `CTable` together when your bundle mixes dense arrays with structured tables\n- use `C2Array` when the data lives on a remote service\n\nFor deeper details on a specific class, continue with the reference docs and the dedicated tutorials for `ObjectArray`, `BatchArray`, and indexing." 
] }, { diff --git a/examples/tree-store.py b/examples/tree-store.py index 74eac272..e65aac3d 100644 --- a/examples/tree-store.py +++ b/examples/tree-store.py @@ -5,7 +5,9 @@ # SPDX-License-Identifier: BSD-3-Clause ####################################################################### -# Example usage of TreeStore with hierarchical navigation and vlmeta +# Example usage of TreeStore with hierarchical navigation, vlmeta, and CTables + +from dataclasses import dataclass import numpy as np @@ -66,3 +68,46 @@ rsub = tstore2["/child0"] print("/child0/new_leaf via subtree:", rsub["/new_leaf"][:]) print(f"TreeStore file at: {tstore2.localpath}") + +# --------------------------------------------------------------------------- +# Mixing NDArrays and CTables in the same TreeStore +# --------------------------------------------------------------------------- + + +@dataclass +class Reading: + sensor_id: int = 0 + value: float = 0.0 + + +with blosc2.TreeStore("example_tree.b2z", mode="a") as ts: + # Create a small CTable in memory and store it inline + t = blosc2.CTable(Reading) + for i in range(5): + t.append(Reading(sensor_id=i, value=float(i) * 1.1)) + + # Assignment syntax is identical to NDArray + ts["/readings"] = t + print("Keys after adding CTable:", sorted(ts.keys())) + + # Object internals are hidden from normal traversal + print("/readings/_meta in ts:", "/readings/_meta" in ts) # False + +with blosc2.open("example_tree.b2z", mode="r") as ts: + # CTable is returned transparently; no special open call needed + readings = ts["/readings"] + print(f"CTable type: {type(readings).__name__}, rows: {len(readings)}") + print("sensor_id column:", list(readings["sensor_id"][:])) + print("value column :", list(readings["value"][:])) + +# Append a row to an inline CTable via append mode +with blosc2.TreeStore("example_tree.b2z", mode="a") as ts: + readings = ts["/readings"] + readings.append(Reading(sensor_id=99, value=-1.0)) + readings.close() # explicit close before outer store 
# --- FileTableStorage (src/blosc2/ctable_storage.py) path helpers ----------
# NOTE(review): ``_walk_descriptor_paths`` is also touched in this region,
# but its tail (the list-node handling after ``stack.append(v)``) lies
# outside the visible hunk, so it is deliberately left untouched here.

@staticmethod
def _relativize_descriptor(descriptor: dict, working_dir: str) -> dict:
    """Replace absolute paths inside *working_dir* with working-dir relative paths."""
    # Normalise the anchor so "dir" and "dir/" compare identically.
    anchor = working_dir.rstrip(os.sep) + os.sep
    rewritten = copy.deepcopy(descriptor)
    for holder, name in FileTableStorage._walk_descriptor_paths(rewritten):
        value = holder[name]
        # Only rewrite absolute paths that actually live under working_dir,
        # and store them with forward slashes so descriptors stay portable.
        if os.path.isabs(value) and value.startswith(anchor):
            holder[name] = value[len(anchor):].replace(os.sep, "/")
    return rewritten


@staticmethod
def _absolutize_descriptor(descriptor: dict, working_dir: str) -> dict:
    """Expand working-dir relative paths back to absolute paths."""
    resolved = copy.deepcopy(descriptor)
    for holder, name in FileTableStorage._walk_descriptor_paths(resolved):
        candidate = holder[name]
        if not os.path.isabs(candidate):
            holder[name] = os.path.join(working_dir, candidate)
    return resolved


# --- module-level helper (src/blosc2/indexing.py) ---------------------------

def _bucket_worker_source(where_x):
    """Return the array a bucket worker should read from.

    Reopens *where_x* from disk (mmap'ed) when that is safe; otherwise the
    existing handle is reused as-is.
    """
    if not _supports_block_reads(where_x) or getattr(where_x, "urlpath", None) is None:
        return where_x
    location = str(where_x.urlpath)
    # Arrays opened from a b2z TreeStore/CTable are offset-backed leaves whose
    # urlpath points at the outer bundle, not at a standalone .b2nd file.
    # Reopening that path would materialize the whole TreeStore/CTable.
    if location.endswith(".b2z"):
        return where_x
    return blosc2.open(location, mode="r", mmap_mode=_INDEX_MMAP_MODE)


# --- TreeStore (src/blosc2/tree_store.py) object-root probing ---------------

def _probed_object_roots(self) -> set:
    """Return object roots discovered from physical CTable manifests."""
    suffix = "/_meta"
    candidate_keys = set(self.map_tree.keys()) | set(self._estore.keys())
    discovered = set()
    for candidate in candidate_keys:
        if not candidate.endswith(suffix):
            continue
        root = candidate[: -len(suffix)]
        # An empty root means the manifest sits at /_meta, i.e. the TreeStore
        # itself is a CTable backing store -- not an inline object root.
        if root and self._probe_object_info(root) is not None:
            discovered.add(root)
    return discovered


def _known_object_roots(self) -> set:
    """Return registered plus physically probed object-root full keys."""
    registered = self._object_roots()
    probed = self._probed_object_roots()
    return registered | probed
@@ -421,7 +441,7 @@ def _set_ctable_object(self, key: str, value: blosc2.CTable) -> None: full_key = self._translate_key_to_full(key) # Raise if already exists as object root (no silent replace) - if self._object_info(full_key) is not None: + if (self._object_info(full_key) or self._probe_object_info(full_key)) is not None: raise ValueError( f"'{key}' already exists as an object root. Delete it first with `del ts['{key}']`." ) @@ -531,7 +551,7 @@ def __delitem__(self, key: str) -> None: full_key = self._translate_key_to_full(key) # --- Object root deletion --- - if self._object_info(full_key) is not None: + if (self._object_info(full_key) or self._probe_object_info(full_key)) is not None: self._delete_object_subtree(full_key) return @@ -545,8 +565,14 @@ def __delitem__(self, key: str) -> None: # Regular node / subtree deletion key_exists_as_data = super().__contains__(full_key) descendants = self.get_descendants(key) - - if not key_exists_as_data and not descendants: + prefix = full_key + "/" if full_key != "/" else "/" + object_roots_to_delete = sorted( + [root for root in self._known_object_roots() if root.startswith(prefix)], + key=len, + reverse=True, + ) + + if not key_exists_as_data and not descendants and not object_roots_to_delete: raise KeyError(f"Key '{key}' not found") keys_to_delete = [] @@ -557,8 +583,18 @@ def __delitem__(self, key: str) -> None: if super().__contains__(full_desc): keys_to_delete.append(descendant) + for object_root in object_roots_to_delete: + self._delete_object_subtree(object_root) + for k in keys_to_delete: - super().__delitem__(self._translate_key_to_full(k)) + full_desc = self._translate_key_to_full(k) + if super().__contains__(full_desc): + super().__delitem__(full_desc) + + # Remove stale registry entries for any nested objects that were deleted as plain descendants. 
+ for root in list(self._object_roots()): + if root.startswith(prefix): + self._unregister_object(root) def _delete_object_subtree(self, full_key: str) -> None: """Delete all physical leaves under *full_key* and unregister it.""" @@ -614,7 +650,11 @@ def __contains__(self, key: str) -> bool: if self._is_object_internal_key(key): return False full_key = self._translate_key_to_full(key) - return super().__contains__(full_key) or self._object_info(full_key) is not None + return ( + super().__contains__(full_key) + or self._object_info(full_key) is not None + or self._probe_object_info(full_key) is not None + ) except ValueError: return False @@ -679,6 +719,15 @@ def items(self) -> Iterator[tuple[str, NDArray | C2Array | SChunk | TreeStore]]: for key in self.keys(): yield key, self[key] + def values( + self, + ) -> Iterator[ + NDArray | C2Array | SChunk | blosc2.ObjectArray | blosc2.BatchArray | blosc2.CTable | TreeStore + ]: + """Return values in the current subtree view, with object roots collapsed.""" + for key in self.keys(): + yield self[key] + def get_children(self, path: str) -> list[str]: """Get direct children of a given path. 
@@ -815,7 +864,11 @@ def walk(self, path: str = "/", topdown: bool = True) -> Iterator[tuple[str, lis child_rel_path = path + "/" + name if path != "/" else "/" + name # Translate to full key in the backing store and verify it's a data node or object root full_key = self._translate_key_to_full(child_rel_path) - if super().__contains__(full_key) or self._object_info(full_key) is not None: + if ( + super().__contains__(full_key) + or self._object_info(full_key) is not None + or self._probe_object_info(full_key) is not None + ): valid_leaf_nodes.append(name) leaf_nodes = valid_leaf_nodes @@ -865,7 +918,7 @@ def get_subtree(self, path: str) -> TreeStore: full_path = self._translate_key_to_full(path) # Object roots cannot be navigated as subtrees - if self._object_info(full_path) is not None: + if (self._object_info(full_path) or self._probe_object_info(full_path)) is not None: raise ValueError( f"'{path}' is an object root (e.g. CTable), not a TreeStore subtree. " f"Use ts['{path}'] to access the object." 
diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index bec55b6b..815129ab 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -1432,3 +1432,65 @@ def test_ctable_context_manager_auto_close(tmp_path, storage_type): t2 = ts["/table"] assert len(t2) == 3 assert list(t2["x"][:])[-1] == 100 + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_ctable_values_collapses_object_roots(tmp_path, storage_type): + """values() yields the CTable object, not its internal leaves.""" + path = str(tmp_path / f"bundle.{storage_type}") + with blosc2.TreeStore(path, mode="w") as ts: + ts["/table"] = _make_ctable(n=2) + values = list(ts.values()) + + assert len(values) == 1 + assert isinstance(values[0], blosc2.CTable) + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_ctable_delete_parent_subtree_removes_nested_object(tmp_path, storage_type): + """Deleting a normal subtree also deletes nested object roots and physical leaves.""" + path = str(tmp_path / f"bundle.{storage_type}") + with blosc2.TreeStore(path, mode="w") as ts: + ts["/grp/table"] = _make_ctable(n=2) + del ts["/grp"] + assert sorted(ts.keys()) == [] + + with blosc2.open(path, mode="r") as ts: + assert sorted(ts.keys()) == [] + assert "/grp/table" not in ts + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_ctable_inline_index_roundtrip(tmp_path, storage_type): + """Index catalogs and sidecars work for inline CTable objects.""" + path = str(tmp_path / f"bundle.{storage_type}") + with blosc2.TreeStore(path, mode="w") as ts: + ts["/table"] = _make_ctable(n=100) + + with blosc2.TreeStore(path, mode="a") as ts: + table = ts["/table"] + table.create_index("x") + np.testing.assert_array_equal(list(table.where(table["x"] > 95)["x"][:]), [96, 97, 98, 99]) + + with blosc2.open(path, mode="r") as ts: + table = ts["/table"] + assert len(table.indexes) == 1 + np.testing.assert_array_equal(list(table.where(table["x"] > 95)["x"][:]), [96, 97, 
98, 99]) + + +@pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) +def test_ctable_registry_missing_fallback_hides_and_protects_internals(tmp_path, storage_type): + """Physical CTable manifests are enough to detect object roots if registry is missing.""" + path = str(tmp_path / f"bundle.{storage_type}") + with blosc2.TreeStore(path, mode="w") as ts: + ts["/table"] = _make_ctable(n=2) + + with blosc2.TreeStore(path, mode="a") as ts: + del ts._estore._store.vlmeta["_object_registry"] + + with blosc2.open(path, mode="r") as ts: + assert sorted(ts.keys()) == ["/table"] + assert isinstance(ts["/table"], blosc2.CTable) + assert "/table/_meta" not in ts + with pytest.raises(ValueError, match="object root"): + ts.get_subtree("/table") From 14b0a72ee6b02300f67aee264222f033c64efafc Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 8 May 2026 18:29:30 +0200 Subject: [PATCH 05/10] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- tests/test_tree_store.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_tree_store.py b/tests/test_tree_store.py index 815129ab..8d1c8dc4 100644 --- a/tests/test_tree_store.py +++ b/tests/test_tree_store.py @@ -1180,8 +1180,10 @@ def test_ctable_traversal_hides_internals(tmp_path, storage_type): # walk walked = list(ts.walk("/")) all_nodes = [n for _, _, nodes in walked for n in nodes] + all_dirs = [d for _, dirs, _ in walked for d in dirs] assert "table" in all_nodes - assert not any(n.startswith("_") for root, _, _ in walked for n in _) + assert not any(name.startswith("_") for name in all_dirs) + assert not any(name.startswith("_") for name in all_nodes) @pytest.mark.parametrize("storage_type", ["b2d", "b2z"]) From 881f37b568da74f81a04ad599e2d4a479aa31f90 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 16:32:20 +0000 Subject: [PATCH 06/10] Cache 
effective object roots in TreeStore lookups Agent-Logs-Url: https://github.com/Blosc/python-blosc2/sessions/973a79eb-d0c3-43ac-8008-54a9bd392be0 Co-authored-by: FrancescAlted <314521+FrancescAlted@users.noreply.github.com> --- src/blosc2/tree_store.py | 44 ++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 5e642c79..c8296d76 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -190,6 +190,8 @@ def __init__(self, *args, _from_parent_store=None, **kwargs): self.subtree_path = "" # Empty string means full tree self._inline_handles: list = [] # inline object handles opened from this store + self._known_object_roots_cache: set[str] | None = None + self._effective_object_roots_cache: tuple[str, set[str]] | None = None # ------------------------------------------------------------------ # Object registry helpers @@ -203,12 +205,18 @@ def _objects_registry(self) -> dict: except Exception: return {} + def _invalidate_object_roots_cache(self) -> None: + """Invalidate cached object-root views.""" + self._known_object_roots_cache = None + self._effective_object_roots_cache = None + def _register_object(self, full_key: str, *, kind: str, version: int, layout: str) -> None: """Register *full_key* as an object root in the persistent registry.""" try: reg = self._objects_registry() reg[full_key] = {"kind": kind, "version": version, "layout": layout} self._estore._store.vlmeta["_object_registry"] = reg + self._invalidate_object_roots_cache() except Exception: pass # best-effort @@ -218,6 +226,7 @@ def _unregister_object(self, full_key: str) -> None: reg = self._objects_registry() reg.pop(full_key, None) self._estore._store.vlmeta["_object_registry"] = reg + self._invalidate_object_roots_cache() except Exception: pass @@ -247,23 +256,40 @@ def _probed_object_roots(self) -> set: def _known_object_roots(self) -> set: """Return registered plus physically 
probed object-root full keys.""" - return self._object_roots() | self._probed_object_roots() + if self._known_object_roots_cache is not None: + return set(self._known_object_roots_cache) + + registered = self._object_roots() + # Fast path: when registry exists, avoid costly full-store probing. + roots = registered if registered else self._probed_object_roots() + self._known_object_roots_cache = roots + return set(roots) def _effective_object_roots(self) -> set: """Object root keys relative to the current view (subtree or root).""" + current_subtree_path = self.subtree_path or "" + if ( + self._effective_object_roots_cache is not None + and self._effective_object_roots_cache[0] == current_subtree_path + ): + return set(self._effective_object_roots_cache[1]) + all_roots = self._known_object_roots() if not self.subtree_path: - return all_roots + self._effective_object_roots_cache = (current_subtree_path, all_roots) + return set(all_roots) result = set() for full_key in all_roots: relative = self._translate_key_from_full(full_key) if relative is not None: result.add(relative) - return result + self._effective_object_roots_cache = (current_subtree_path, result) + return set(result) - def _is_object_internal_key(self, key: str) -> bool: + def _is_object_internal_key(self, key: str, object_roots: set[str] | None = None) -> bool: """Return ``True`` when *key* (subtree-relative) is inside an object root.""" - return any(key != root and key.startswith(root + "/") for root in self._effective_object_roots()) + roots = self._effective_object_roots() if object_roots is None else object_roots + return any(key != root and key.startswith(root + "/") for root in roots) def _probe_object_info(self, full_key: str) -> dict | None: """Probe the physical store for a CTable manifest at *full_key*/_meta. 
@@ -647,7 +673,8 @@ def __contains__(self, key: str) -> bool: key = self._validate_key(key) if self._is_vlmeta_key(key): return False - if self._is_object_internal_key(key): + object_roots = self._effective_object_roots() + if self._is_object_internal_key(key, object_roots): return False full_key = self._translate_key_to_full(key) return ( @@ -692,11 +719,10 @@ def keys(self): all_keys = {key for key in all_keys if not self._is_vlmeta_key(key)} # Filter out object-internal keys - all_keys = {key for key in all_keys if not self._is_object_internal_key(key)} - - # Add object roots (they are not stored as DictStore keys themselves) object_roots = self._effective_object_roots() + all_keys = {key for key in all_keys if not self._is_object_internal_key(key, object_roots)} + # Add object roots (they are not stored as DictStore keys themselves) # Build structural paths from both data leaves and object root keys all_with_roots = all_keys | object_roots structural_keys = set() From 4b7c0d6371646f2bbd51d11b46b5ca4feaa110a6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 16:33:41 +0000 Subject: [PATCH 07/10] Harden TreeStore cache assignments with defensive copies Agent-Logs-Url: https://github.com/Blosc/python-blosc2/sessions/973a79eb-d0c3-43ac-8008-54a9bd392be0 Co-authored-by: FrancescAlted <314521+FrancescAlted@users.noreply.github.com> --- src/blosc2/tree_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index c8296d76..880333e6 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -262,7 +262,7 @@ def _known_object_roots(self) -> set: registered = self._object_roots() # Fast path: when registry exists, avoid costly full-store probing. 
roots = registered if registered else self._probed_object_roots() - self._known_object_roots_cache = roots + self._known_object_roots_cache = set(roots) return set(roots) def _effective_object_roots(self) -> set: @@ -276,7 +276,7 @@ def _effective_object_roots(self) -> set: all_roots = self._known_object_roots() if not self.subtree_path: - self._effective_object_roots_cache = (current_subtree_path, all_roots) + self._effective_object_roots_cache = (current_subtree_path, set(all_roots)) return set(all_roots) result = set() for full_key in all_roots: From 9db1c80714f298c9c5176abe2c5be87c8de0041e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 16:34:54 +0000 Subject: [PATCH 08/10] Clarify TreeStore object-root probe fast-path comment Agent-Logs-Url: https://github.com/Blosc/python-blosc2/sessions/973a79eb-d0c3-43ac-8008-54a9bd392be0 Co-authored-by: FrancescAlted <314521+FrancescAlted@users.noreply.github.com> --- src/blosc2/tree_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 880333e6..8cb054ef 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -260,7 +260,7 @@ def _known_object_roots(self) -> set: return set(self._known_object_roots_cache) registered = self._object_roots() - # Fast path: when registry exists, avoid costly full-store probing. + # Fast path: when registry is non-empty, avoid costly full-store probing. 
roots = registered if registered else self._probed_object_roots() self._known_object_roots_cache = set(roots) return set(roots) From 1d51b246bc69751965cee5a20ce6e2dbeb066d62 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 16:36:11 +0000 Subject: [PATCH 09/10] Use explicit fallback branch for object-root probing Agent-Logs-Url: https://github.com/Blosc/python-blosc2/sessions/973a79eb-d0c3-43ac-8008-54a9bd392be0 Co-authored-by: FrancescAlted <314521+FrancescAlted@users.noreply.github.com> --- src/blosc2/tree_store.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index 8cb054ef..e868f9bd 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -261,7 +261,10 @@ def _known_object_roots(self) -> set: registered = self._object_roots() # Fast path: when registry is non-empty, avoid costly full-store probing. - roots = registered if registered else self._probed_object_roots() + if registered: + roots = registered + else: + roots = self._probed_object_roots() self._known_object_roots_cache = set(roots) return set(roots) From 018cc7cd8eef8b89fef4593af57b07d3414c5f76 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 8 May 2026 16:38:09 +0000 Subject: [PATCH 10/10] Add blosc2.CTable to __setitem__ type annotation Agent-Logs-Url: https://github.com/Blosc/python-blosc2/sessions/83225dd5-9d5c-4ca4-ba33-6ab4021b552f Co-authored-by: FrancescAlted <314521+FrancescAlted@users.noreply.github.com> --- src/blosc2/tree_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blosc2/tree_store.py b/src/blosc2/tree_store.py index e868f9bd..53a32588 100644 --- a/src/blosc2/tree_store.py +++ b/src/blosc2/tree_store.py @@ -382,7 +382,7 @@ def _validate_key(self, key: str) -> str: return key def __setitem__( - self, key: str, value: blosc2.Array | 
SChunk | blosc2.ObjectArray | blosc2.BatchArray + self, key: str, value: blosc2.Array | SChunk | blosc2.ObjectArray | blosc2.BatchArray | blosc2.CTable ) -> None: """Add a node with hierarchical key validation.