diff --git a/doc/getting_started/tutorials.rst b/doc/getting_started/tutorials.rst index d2786d12..c446589b 100644 --- a/doc/getting_started/tutorials.rst +++ b/doc/getting_started/tutorials.rst @@ -20,3 +20,4 @@ Tutorials tutorials/12.batcharray tutorials/13.containers tutorials/14.indexing-arrays + tutorials/15.indexing-ctables diff --git a/doc/getting_started/tutorials/15.indexing-ctables.ipynb b/doc/getting_started/tutorials/15.indexing-ctables.ipynb new file mode 100644 index 00000000..6c8ea4da --- /dev/null +++ b/doc/getting_started/tutorials/15.indexing-ctables.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "44fdf4b9", + "metadata": {}, + "source": [ + "# Indexing CTables\n", + "\n", + "CTable supports **persistent, table-owned indexes** that speed up `where()` queries on numeric columns. \n", + "An index maps sorted-value ranges to the chunk positions that contain matching rows, allowing Blosc2 to skip large parts of the table without reading every row.\n", + "\n", + "This tutorial covers:\n", + "\n", + "1. Creating an index on a CTable column\n", + "2. Querying with an index (automatic)\n", + "3. Stale detection and automatic scan fallback\n", + "4. Rebuilding and dropping indexes\n", + "5. Persistent tables: indexes survive close/reopen\n", + "6. 
Views and indexes\n" + ] + }, + { + "cell_type": "markdown", + "id": "da26cc61", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "We will use a simple measurement table with three numeric columns.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b23746ca", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:02.229246Z", + "start_time": "2026-04-15T12:29:00.966071Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Table: 500 rows\n" + ] + } + ], + "source": [ + "import dataclasses\n", + "\n", + "import numpy as np\n", + "\n", + "import blosc2\n", + "\n", + "\n", + "@dataclasses.dataclass\n", + "class Measurement:\n", + " sensor_id: int = blosc2.field(blosc2.int32())\n", + " temperature: float = blosc2.field(blosc2.float64())\n", + " region: int = blosc2.field(blosc2.int32())\n", + "\n", + "\n", + "N = 500\n", + "t = blosc2.CTable(Measurement)\n", + "rng = np.random.default_rng(42)\n", + "for i in range(N):\n", + " t.append([i, 15.0 + rng.random() * 25, int(rng.integers(0, 4))])\n", + "\n", + "print(f\"Table: {N} rows\")" + ] + }, + { + "cell_type": "markdown", + "id": "2be47ee8", + "metadata": {}, + "source": [ + "## Creating an index\n", + "\n", + "Call `create_index(col_name)` to build a bucket index on a column. \n", + "The returned `CTableIndex` handle shows the column name, kind, and whether the index is stale.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2ac1f281", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:12.081628Z", + "start_time": "2026-04-15T12:29:12.033154Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "stale? 
False\n", + "all indexes: []\n" + ] + } + ], + "source": [ + "idx = t.create_index(\"sensor_id\")\n", + "print(idx)\n", + "print(\"stale?\", idx.stale)\n", + "print(\"all indexes:\", t.indexes)" + ] + }, + { + "cell_type": "markdown", + "id": "792416cc", + "metadata": {}, + "source": [ + "## Querying with an index\n", + "\n", + "`where()` automatically uses an available (non-stale) index when the filter expression matches the indexed column. \n", + "The result is identical to a full scan.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "dcc2dc87", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:18.333378Z", + "start_time": "2026-04-15T12:29:18.283229Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Rows sensor_id > 450: 49\n", + "sensor_ids: [451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499]\n" + ] + } + ], + "source": [ + "result = t.where(t[\"sensor_id\"] > 450)\n", + "print(\"Rows sensor_id > 450:\", len(result))\n", + "print(\"sensor_ids:\", sorted(int(v) for v in result[\"sensor_id\"].to_numpy()))" + ] + }, + { + "cell_type": "markdown", + "id": "8b3b9725", + "metadata": {}, + "source": [ + "## Stale detection\n", + "\n", + "Any mutation — `append`, `extend`, `Column.__setitem__`, `Column.assign`, `sort_by`, `compact` —\n", + "marks all indexes **stale**. \n", + "When an index is stale, `where()` falls back to a full scan automatically so results are always correct.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b0132381", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:24.936335Z", + "start_time": "2026-04-15T12:29:24.884590Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stale after append? 
True\n", + "Found row: 1\n" + ] + } + ], + "source": [ + "t.append([9999, 30.0, 1]) # any mutation marks indexes stale\n", + "\n", + "idx = t.index(\"sensor_id\")\n", + "print(\"stale after append?\", idx.stale)\n", + "\n", + "# Query still works — scan fallback\n", + "result_stale = t.where(t[\"sensor_id\"] == 9999)\n", + "print(\"Found row:\", len(result_stale))" + ] + }, + { + "cell_type": "markdown", + "id": "110f792f", + "metadata": {}, + "source": [ + "Note: `delete()` only bumps the *visibility epoch* (it does not change column values) so it does **not** mark indexes stale.\n", + "\n", + "## Rebuilding an index\n", + "\n", + "`rebuild_index(col_name)` drops the old index and builds a fresh one from the current table state.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dc4d2897", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:31.023914Z", + "start_time": "2026-04-15T12:29:30.970979Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "stale after rebuild? 
False\n", + "Found row via rebuilt index: 1\n" + ] + } + ], + "source": [ + "idx = t.rebuild_index(\"sensor_id\")\n", + "print(\"stale after rebuild?\", idx.stale)\n", + "\n", + "result_rebuilt = t.where(t[\"sensor_id\"] == 9999)\n", + "print(\"Found row via rebuilt index:\", len(result_rebuilt))" + ] + }, + { + "cell_type": "markdown", + "id": "38363aa3", + "metadata": {}, + "source": [ + "## Dropping an index\n", + "\n", + "`drop_index(col_name)` removes the index from the catalog and deletes any sidecar files (for persistent tables).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e1583b4f", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:34.666155Z", + "start_time": "2026-04-15T12:29:34.628535Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexes after drop: []\n" + ] + } + ], + "source": [ + "t.drop_index(\"sensor_id\")\n", + "print(\"Indexes after drop:\", t.indexes)" + ] + }, + { + "cell_type": "markdown", + "id": "aab1e6ec", + "metadata": {}, + "source": [ + "## Persistent tables\n", + "\n", + "Indexes on persistent tables (tables with a `urlpath`) survive close and reopen because the catalog is stored inside the table's own `/_meta` sidecar and the index data lives under `/_indexes//`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "85d42133", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:41.237153Z", + "start_time": "2026-04-15T12:29:39.916230Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Created: \n", + "Sidecar files: 7\n", + "Rows > 280 (before close): 19\n" + ] + } + ], + "source": [ + "import shutil\n", + "import tempfile\n", + "from pathlib import Path\n", + "\n", + "tmpdir = Path(tempfile.mkdtemp())\n", + "path = str(tmpdir / \"sensors.b2d\")\n", + "\n", + "# Create a persistent table and build an index\n", + "pt = blosc2.CTable(Measurement, urlpath=path, mode=\"w\")\n", 
+ "rng2 = np.random.default_rng(0)\n", + "for i in range(300):\n", + " pt.append([i, 15.0 + rng2.random() * 25, int(rng2.integers(0, 4))])\n", + "\n", + "pidx = pt.create_index(\"sensor_id\")\n", + "print(\"Created:\", pidx)\n", + "\n", + "# Sidecar files\n", + "index_dir = Path(path) / \"_indexes\" / \"sensor_id\"\n", + "print(\"Sidecar files:\", len(list(index_dir.glob(\"**/*.b2nd\"))))\n", + "\n", + "# Query before close\n", + "r1 = pt.where(pt[\"sensor_id\"] > 280)\n", + "print(\"Rows > 280 (before close):\", len(r1))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "149ddba5", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:45.139325Z", + "start_time": "2026-04-15T12:29:45.095849Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Indexes after reopen: []\n", + "Rows > 280 (after reopen): 19\n", + "Results match ✓\n" + ] + } + ], + "source": [ + "# Close and reopen — catalog is preserved\n", + "del pt\n", + "pt2 = blosc2.open(path)\n", + "\n", + "print(\"Indexes after reopen:\", pt2.indexes)\n", + "\n", + "r2 = pt2.where(pt2[\"sensor_id\"] > 280)\n", + "print(\"Rows > 280 (after reopen):\", len(r2))\n", + "\n", + "ids1 = sorted(int(v) for v in r1[\"sensor_id\"].to_numpy())\n", + "ids2 = sorted(int(v) for v in r2[\"sensor_id\"].to_numpy())\n", + "assert ids1 == ids2, \"Results differ!\"\n", + "print(\"Results match ✓\")\n", + "\n", + "shutil.rmtree(tmpdir, ignore_errors=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2743e784", + "metadata": {}, + "source": [ + "## Views and indexes\n", + "\n", + "A *view* (the result of `where()`) is a filtered window into the underlying table. 
\n", + "Index management methods (`create_index`, `drop_index`, `rebuild_index`, `compact_index`) are **not** available on views — they raise `ValueError`.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "83db418b", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-15T12:29:51.038611Z", + "start_time": "2026-04-15T12:29:50.906410Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "View type: CTable\n", + "create_index on view: Cannot create an index on a view.\n", + "drop_index on view: Cannot drop an index from a view.\n" + ] + } + ], + "source": [ + "t2 = blosc2.CTable(Measurement)\n", + "for i in range(50):\n", + " t2.append([i, 20.0, i % 3])\n", + "t2.create_index(\"sensor_id\")\n", + "\n", + "view = t2.where(t2[\"sensor_id\"] > 10)\n", + "print(\"View type:\", type(view).__name__)\n", + "\n", + "try:\n", + " view.create_index(\"sensor_id\")\n", + "except ValueError as e:\n", + " print(\"create_index on view:\", e)\n", + "\n", + "try:\n", + " view.drop_index(\"sensor_id\")\n", + "except ValueError as e:\n", + " print(\"drop_index on view:\", e)" + ] + }, + { + "cell_type": "markdown", + "id": "f5e87579", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "| Operation | Method |\n", + "|---|---|\n", + "| Build index | `t.create_index(col)` |\n", + "| Query (auto) | `t.where(expr)` — uses index when fresh |\n", + "| Check if stale | `t.index(col).stale` |\n", + "| Rebuild | `t.rebuild_index(col)` |\n", + "| Drop | `t.drop_index(col)` |\n", + "| Compact (full indexes) | `t.compact_index(col)` |\n", + "| List all | `t.indexes` |\n", + "\n", + "Key behaviours:\n", + "\n", + "- **Mutations** (`append`, `extend`, `setitem`, `assign`, `sort_by`, `compact`) mark indexes stale.\n", + "- **Stale indexes** trigger automatic scan fallback — no user intervention needed.\n", + "- **Persistent indexes** survive table close and reopen.\n", + "- **Views** cannot own indexes; only root tables can.\n" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "363827fec805190a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/ctable/indexing.py b/examples/ctable/indexing.py new file mode 100644 index 00000000..dd06436d --- /dev/null +++ b/examples/ctable/indexing.py @@ -0,0 +1,97 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +# CTable indexing with mixed dtypes, persistent sidecars, and a packed .b2z bundle. 
+ +import shutil +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import blosc2 + + +@dataclass +class Measurement: + sensor_id: int = blosc2.field(blosc2.int32()) + temperature: float = blosc2.field(blosc2.float64()) + region: str = blosc2.field(blosc2.string(max_length=12), default="") + active: bool = blosc2.field(blosc2.bool(), default=True) + status: str = blosc2.field(blosc2.string(max_length=12), default="") + + +def load_rows(table: blosc2.CTable, nrows: int = 240) -> None: + regions = ["north", "south", "east", "west"] + for i in range(nrows): + region = regions[i % len(regions)] + active = (i % 7) not in (0, 6) + status = "alert" if i % 23 == 0 else ("warm" if i % 11 == 0 else "ok") + table.append([i, 12.5 + (i % 40) * 0.35, region, active, status]) + + +bundle_path = Path("indexed_measurements.b2z").resolve() +workspace = Path(tempfile.mkdtemp()) +table_path = workspace / "indexed_measurements.b2d" + +pt = None +packed = None + +try: + print("Creating a CTable with mixed dtypes...") + pt = blosc2.CTable(Measurement, urlpath=str(table_path), mode="w") + load_rows(pt) + + # Create a couple of indexes on columns with different dtypes. + print("\nCreating indexes...") + idx_sensor = pt.create_index("sensor_id", kind=blosc2.IndexKind.FULL) + idx_active = pt.create_index("active") + print("Indexes created:", pt.indexes) + print("sensor_id stale?", idx_sensor.stale) + print("active stale?", idx_active.stale) + + # Queries can combine indexed and non-indexed predicates. + recent_active = pt.where((pt["sensor_id"] >= 180) & pt["active"] & (pt["region"] == "north")) + print("\nLive rows with sensor_id >= 180, active=True, region='north':", len(recent_active)) + print("sensor_ids:", recent_active["sensor_id"]) + print("statuses:", recent_active["status"].to_numpy()) + + # Close the table, pack the TreeStore into a single .b2z file, and reopen it. 
+ del pt + pt = None + + if bundle_path.exists(): + bundle_path.unlink() + + store = blosc2.TreeStore(str(table_path), mode="r") + try: + packed_path = store.to_b2z(filename=str(bundle_path), overwrite=True) + finally: + store.close() + + print(f"\nPacked bundle created at: {packed_path}") + + packed = blosc2.open(str(bundle_path), mode="r") + print("Reopened object type:", type(packed).__name__) + print("Indexes after reopen from .b2z:", packed.indexes) + + # Query directly against the .b2z bundle; no unpack step is needed. + warm_active = packed.where(packed["active"] & (packed["status"] == "warm") & (packed["sensor_id"] > 100)) + print("\nRows from .b2z with active=True, status='warm', sensor_id > 100:", len(warm_active)) + print("sensor_ids:", warm_active["sensor_id"]) + print("regions:", warm_active["region"].to_numpy()) + + print("\nThe packed file is kept on disk.") + print(f"Inspect it later with: f = blosc2.open({bundle_path.name!r}, mode='r')") + print("Then call: f.info()") + print("For a quick check of the available info entry point, print: f.info") + +finally: + if packed is not None: + del packed + if pt is not None: + del pt + shutil.rmtree(workspace, ignore_errors=True) diff --git a/examples/mmap-rw.py b/examples/mmap-rw.py index 2fee6d7c..f27e6c19 100644 --- a/examples/mmap-rw.py +++ b/examples/mmap-rw.py @@ -24,7 +24,7 @@ blosc2.asarray(a, urlpath=urlpath, mmap_mode="w+", initial_mapping_size=initial_mapping_size) # Read the ndarray back via the general open function -a_read = blosc2.open(urlpath, mmap_mode="r") +a_read = blosc2.open(urlpath, mode="r", mmap_mode="r") assert np.all(a == a_read) blosc2.remove_urlpath(urlpath) diff --git a/examples/ndarray/blosc2_3_10_demo.py b/examples/ndarray/blosc2_3_10_demo.py index 17a1c8b9..00725cd0 100644 --- a/examples/ndarray/blosc2_3_10_demo.py +++ b/examples/ndarray/blosc2_3_10_demo.py @@ -33,7 +33,7 @@ # Reopen persistent expression, compute, and write to disk with blosc2 t0 = time.time() -lexpr = 
blosc2.open(urlpath=url_path) +lexpr = blosc2.open(urlpath=url_path, mode="r") dt = time.time() - t0 print(f"In {round(dt * 1000, 3)} ms opened lazy expression: shape = {lexpr.shape}, dtype = {lexpr.dtype}") t1 = time.time() diff --git a/examples/ndarray/compute_expr.py b/examples/ndarray/compute_expr.py index 60ecf606..b6ed8ce3 100644 --- a/examples/ndarray/compute_expr.py +++ b/examples/ndarray/compute_expr.py @@ -50,7 +50,7 @@ # Get a LazyExpr instance (da**2 + db**2 + 2 * da * db + 1).save(urlpath="c.b2nd") -dc = blosc2.open("c.b2nd") +dc = blosc2.open("c.b2nd", mode="r") # Evaluate: output is a NDArray dc2 = dc.compute() diff --git a/examples/ndarray/dsl_save.py b/examples/ndarray/dsl_save.py index 24870afb..55e975e9 100644 --- a/examples/ndarray/dsl_save.py +++ b/examples/ndarray/dsl_save.py @@ -46,7 +46,7 @@ def heat_step(u, v): print("LazyUDF saved to heat_step.b2nd") # ── reload in a 'fresh' context (no reference to heat_step) ───────── -reloaded = blosc2.open("heat_step.b2nd") +reloaded = blosc2.open("heat_step.b2nd", mode="r") assert isinstance(reloaded, blosc2.LazyUDF), "Expected a LazyUDF after open()" assert isinstance(reloaded.func, DSLKernel), "func must be a DSLKernel after reload" assert reloaded.func.dsl_source is not None, "dsl_source must survive the round-trip" @@ -64,7 +64,7 @@ def heat_step(u, v): lazy2 = blosc2.lazyudf(heat_step, (u2, v), dtype=np.float64) lazy2.save(urlpath="heat_step2.b2nd") -reloaded2 = blosc2.open("heat_step2.b2nd") +reloaded2 = blosc2.open("heat_step2.b2nd", mode="r") result2 = reloaded2.compute() expected2 = u2[()] + 0.1 * (v[()] - u2[()]) assert np.allclose(result2[()], expected2) diff --git a/examples/ndarray/meta.py b/examples/ndarray/meta.py index c9d4a498..5fa13d12 100644 --- a/examples/ndarray/meta.py +++ b/examples/ndarray/meta.py @@ -27,7 +27,7 @@ print(a.info) # Read a b2nd array from disk -b = blosc2.open(urlpath) +b = blosc2.open(urlpath, mode="r") # Deal with meta m1 = b.schunk.meta.get("m5", b"0000") diff 
--git a/examples/ndarray/persistency.py b/examples/ndarray/persistency.py index 014519a5..4541da47 100644 --- a/examples/ndarray/persistency.py +++ b/examples/ndarray/persistency.py @@ -20,6 +20,6 @@ a = blosc2.asarray(nparray, urlpath=urlpath, mode="w") # Read the array from disk -b = blosc2.open(urlpath) +b = blosc2.open(urlpath, mode="r") # And see its contents print(b[...]) diff --git a/examples/ndarray/reduce_and_enlarge.py b/examples/ndarray/reduce_and_enlarge.py index e6841441..928a4fe8 100644 --- a/examples/ndarray/reduce_and_enlarge.py +++ b/examples/ndarray/reduce_and_enlarge.py @@ -49,7 +49,7 @@ url_path = "my_expr.b2nd" # Open the saved file -lazy_expr = blosc2.open(urlpath=url_path) +lazy_expr = blosc2.open(urlpath=url_path, mode="r") print(lazy_expr) print(f"expr (after open) shape: {lazy_expr.shape}; dtype: {lazy_expr.dtype}") # Evaluate and print the result of the lazy expression (should be a 2x4 arr) diff --git a/examples/ndarray/reduce_expr_save.py b/examples/ndarray/reduce_expr_save.py index 99f5c58a..1fdb257e 100644 --- a/examples/ndarray/reduce_expr_save.py +++ b/examples/ndarray/reduce_expr_save.py @@ -24,13 +24,13 @@ # Get a LazyExpr instance c = a**2 + b**2 + 2 * a * b + 1 c.save(urlpath="c.b2nd") -c = blosc2.open("c.b2nd") +c = blosc2.open("c.b2nd", mode="r") # Evaluate: output is a NDArray d = blosc2.lazyexpr("a + c.sum() + a.std()", operands={"a": a, "c": c}) d.save(urlpath="lazy-d.b2nd") # Load the expression from disk -d = blosc2.open("lazy-d.b2nd") +d = blosc2.open("lazy-d.b2nd", mode="r") print(f"Expression: {d}") assert isinstance(d, blosc2.LazyExpr) e = d.compute() diff --git a/plans/changing-default-open-mode.md b/plans/changing-default-open-mode.md new file mode 100644 index 00000000..9b97cc84 --- /dev/null +++ b/plans/changing-default-open-mode.md @@ -0,0 +1,341 @@ +# Plan For Changing `blosc2.open()` Default Mode To Read-Only + +## Goal + +Change the default mode for `blosc2.open(...)` from `"a"` to `"r"` so that +opening an 
existing object is non-mutating and unsurprising by default. + +The change should: + +- reduce accidental write access +- avoid implicit unpack / rewrite work for store-backed containers +- align with user expectations for a generic `open(...)` API +- preserve a smooth migration path for existing code that relied on writable + opens without an explicit `mode=` + +This plan is for later consideration and rollout design. It does not assume +that the change should land immediately. + +## Motivation + +Today, `blosc2.open(...)` defaults to `"a"` in +[src/blosc2/schunk.py](/Users/faltet/blosc/python-blosc2/src/blosc2/schunk.py). + +That means: + +- opening a `.b2z` store without `mode=` may create a writable working copy +- append-mode store opens may unpack zip-backed stores into a temporary working + directory immediately +- code that only intends to inspect metadata or query data can still enter a + mutation-capable path by accident + +This is especially surprising for: + +- `TreeStore` +- `DictStore` +- `CTable` +- other container-like objects opened through the generic dispatcher + +By contrast, users generally expect a bare `open(path)` call to be safe for +inspection unless they explicitly request write access. + +## Current Situation + +### Default values today + +The following default to `"a"` today: + +- `blosc2.open(...)` +- `DictStore(...)` +- `TreeStore(...)` +- `CTable(...)` constructor when opening/creating through `urlpath` + +At the same time: + +- `CTable.open(...)` already defaults to `"r"` + +This creates an inconsistency where: + +- `blosc2.open("table.b2z")` is writable by default +- `blosc2.CTable.open("table.b2z")` is read-only by default + +### Concrete user surprise + +For a `.b2z` store, append mode currently does extra work: + +1. create a working directory (usually temporary) +2. extract the archive into that working directory +3. serve reads/writes from the extracted layout +4. 
repack on close + +This is implemented in +[src/blosc2/dict_store.py](/Users/faltet/blosc/python-blosc2/src/blosc2/dict_store.py). + +That behavior is reasonable when the caller explicitly asked for `"a"`, but +surprising when it is triggered only because `mode` was omitted. + +## Desired End State + +The target behavior is: + +```python +blosc2.open(path) +``` + +should behave as if the user had written: + +```python +blosc2.open(path, mode="r") +``` + +unless the object category does not support read-only opening for technical +reasons. In such cases, the exception should be explicit and documented. + +The user should need to opt into mutation with: + +- `mode="a"` +- `mode="w"` + +## Design Principles + +The migration should follow these rules: + +- do not silently change semantics without a warning phase +- make the warning text concrete and actionable +- update all docs and examples before flipping the default +- keep the opt-in writable paths unchanged +- avoid introducing ambiguity about whether a store may be mutated +- prefer explicit `mode=` in library docs even after the default changes + +## Recommended Rollout + +### Phase 0: prepare the codebase + +Before warning users: + +1. audit internal calls to `blosc2.open(...)` +2. make all internal call sites spell out `mode=` +3. update examples, docs, and tests to use explicit modes +4. document the difference between: + - `mode="r"`: inspect/query only + - `mode="a"`: may unpack and repack stores + - `mode="w"`: overwrite/create + +This phase reduces ambiguity and makes later warning noise much more useful. + +### Phase 1: deprecation warning + +Keep the runtime default as `"a"`, but emit a `FutureWarning` when: + +- `blosc2.open(...)` is called without an explicit `mode=` + +The warning should fire only when `mode` was omitted, not when the caller +explicitly requested `"a"`. 
+ +Recommended warning text: + +```python +FutureWarning( + "blosc2.open() currently defaults to mode='a', but this will change " + "to mode='r' in a future release. Pass mode='a' explicitly to keep " + "writable behavior, or mode='r' for read-only access." +) +``` + +Notes: + +- the wording should mention both the current and future defaults +- the wording should explain how to preserve current behavior +- the wording should not be container-specific + +### Phase 2: flip the default + +In the next planned breaking-compatible release window: + +- change the default mode in `blosc2.open(...)` from `"a"` to `"r"` + +At that point: + +- calls with omitted `mode` become read-only +- code that needs writable behavior must use `mode="a"` explicitly + +### Phase 3: remove warning-specific scaffolding + +After the default flip has been out for one full release cycle: + +- remove temporary warning helpers and migration notes that are no longer + useful +- keep release notes and changelog entries for historical context + +## Implementation Notes + +### Tracking whether `mode` was omitted + +To emit a warning only when appropriate, `blosc2.open(...)` needs to +distinguish: + +- caller omitted `mode` +- caller passed `mode="a"` explicitly + +A practical implementation is: + +1. change the function signature internally to use a sentinel +2. resolve the effective mode inside the function +3. warn only when the sentinel path is used + +For example: + +```python +_MODE_SENTINEL = object() + + +def open(urlpath, mode=_MODE_SENTINEL, **kwargs): + mode_was_omitted = mode is _MODE_SENTINEL + if mode_was_omitted: + mode = "a" # Phase 1 + warnings.warn(...) +``` + +Later, in Phase 2: + +```python +if mode_was_omitted: + mode = "r" +``` + +This is better than relying on `mode="a"` in the signature because that +signature cannot tell whether the user explicitly passed `"a"`. + +### Scope of change + +This plan is specifically about `blosc2.open(...)`. 
+ +It does **not** require changing the defaults of: + +- `DictStore(...)` +- `TreeStore(...)` +- `CTable(...)` + +at the same time. + +However, the docs should explain that: + +- constructor-style APIs may still default to `"a"` +- generic `blosc2.open(...)` becomes read-only by default + +This narrower scope reduces breakage and focuses on the highest-surprise entry +point first. + +## Compatibility Risks + +The main breakage risk is downstream code that relies on: + +```python +obj = blosc2.open(path) +obj[...] = ... +``` + +without ever spelling out `mode="a"`. + +After the default flip, that code may: + +- fail with a read-only error +- stop persisting modifications +- expose behavior differences only at runtime + +This is why the warning phase is important. + +### Secondary risk: tests that mutate after open + +Internal and downstream tests may open objects generically and then mutate +them. These need to be found and updated during Phase 0. + +### Secondary risk: docs and notebooks + +Tutorials that currently omit `mode=` may accidentally teach users the old +behavior. These should be updated before the warning phase begins. 
+ +## Documentation Changes + +### API docs + +Update the docstring for `blosc2.open(...)` to: + +- describe the migration +- clearly document the meaning of each mode +- mention that read-only is the recommended mode for inspection/querying + +### Examples + +Update examples to use explicit modes consistently: + +- inspection/querying: `mode="r"` +- mutation of existing stores: `mode="a"` +- create/overwrite: `mode="w"` + +### User-facing migration note + +Add a short migration note to release notes: + +- “`blosc2.open()` now defaults to read-only; pass `mode='a'` explicitly if + you need writable behavior.” + +## Testing Plan + +### Phase 1 tests + +Add tests that verify: + +- omitted `mode` emits `FutureWarning` +- explicit `mode="a"` does not warn +- explicit `mode="r"` does not warn +- effective behavior remains writable during the warning phase + +### Phase 2 tests + +After the flip, add/update tests that verify: + +- omitted `mode` is read-only +- writes after omitted-mode open fail clearly +- explicit `mode="a"` still allows mutation +- `.b2z` omitted-mode open does not enter append-style write setup + +### Documentation tests + +Where practical, examples should use explicit `mode=` so doctests remain clear +and stable across the transition. + +## Optional Compatibility Escape Hatch + +If downstream breakage risk is considered high, one temporary option is an +environment-variable override for one transition cycle, for example: + +- `BLOSC2_OPEN_DEFAULT_MODE=a` + +This should only be used if needed. It adds complexity and should not become a +permanent configuration surface unless there is a strong operational reason. + +## Related Follow-Up Worth Considering + +Even if the default changes to `"r"`, append mode for `.b2z` may still be more +eager than desirable. 
+ +A separate improvement could make `.b2z` append behavior lazier: + +- open in `"a"` without extracting immediately +- extract only on first mutation +- keep read-only-style fast paths for pure reads + +That is orthogonal to the default-mode change and can be planned separately. + +## Summary + +The recommended path is: + +1. make internal/docs/example usage explicit +2. add a `FutureWarning` when `blosc2.open(...)` is called without `mode=` +3. flip the default from `"a"` to `"r"` in the next suitable release window +4. keep writable behavior available via explicit `mode="a"` + +This delivers a safer and less surprising user experience while still giving +existing code a clear migration path. diff --git a/plans/ctable-indexing.md b/plans/ctable-indexing.md new file mode 100644 index 00000000..077db1cc --- /dev/null +++ b/plans/ctable-indexing.md @@ -0,0 +1,713 @@ +# CTable Indexing Integration Plan + +## Goal + +Add persistent, table-owned indexing to `CTable` so that: + +- indexes can be created on `CTable` columns +- persistent indexes live inside the `TreeStore` that backs the table +- `CTable.where(...)` can reuse the existing index machinery as directly as possible +- index management feels aligned with the current `NDArray` indexing API + +This plan is for design and implementation guidance only. It does not assume +that all pieces must land in one patch. + +## Current Situation + +### What already exists + +`CTable` already supports persistent storage on top of `TreeStore`: + +- `/_meta` +- `/_valid_rows` +- `/_cols/` + +This is implemented in [src/blosc2/ctable_storage.py](/Users/faltet/blosc/python-blosc2/src/blosc2/ctable_storage.py) +and used by [src/blosc2/ctable.py](/Users/faltet/blosc/python-blosc2/src/blosc2/ctable.py). 
+ +The generic indexing engine already exists for 1-D `NDArray` targets: + +- summary / bucket / partial / full indexes +- persistent descriptors in `array.schunk.vlmeta` +- sidecar arrays stored next to the indexed array +- query planning via `plan_query(...)` +- ordered reuse via `plan_ordered_query(...)` + +This lives in [src/blosc2/indexing.py](/Users/faltet/blosc/python-blosc2/src/blosc2/indexing.py) +and is exposed through `NDArray.create_index()`, `drop_index()`, `rebuild_index()`, +`compact_index()`, `index()`, and `indexes`. + +### What is missing + +`CTable` cannot currently reuse that machinery cleanly because: + +1. `CTable.where(...)` eagerly computes a boolean filter and never gives the + planner a table-aware lazy query shape. +2. the current index engine assumes that one index belongs to one `NDArray` + and stores its descriptor in that array's `vlmeta`. +3. persistent sidecar path derivation is based on `array.urlpath`, which places + index files next to the array file rather than inside a table-owned subtree. +4. `CTable` has row visibility semantics through `_valid_rows`, which means + "row still exists" and "row currently matches" are distinct concerns. 
+
+## Design Principles
+
+The implementation should follow these rules:
+
+- indexes are table-managed, not column-autonomous
+- column indexes are still built from and logically targeted at individual column arrays
+- persistent index artifacts must be part of the table store layout
+- the public API should mirror existing `NDArray` indexing names where possible
+- delete visibility should not force index rebuilds when it can be handled by
+  post-filtering with `_valid_rows`
+- planner and evaluator logic should be reused, not reimplemented from scratch
+- unsupported queries must keep a correct scan fallback
+
+## Proposed Storage Layout
+
+Extend the persistent `CTable` layout with a reserved index subtree:
+
+- `/_meta`
+- `/_valid_rows`
+- `/_cols/`
+- `/_indexes/<token>/...`
+
+Recommended concrete shape:
+
+- `/_indexes/<token>/_meta`
+- `/_indexes/<token>/summary.chunk`
+- `/_indexes/<token>/summary.block`
+- `/_indexes/<token>/bucket.values`
+- `/_indexes/<token>/bucket.bucket_positions`
+- `/_indexes/<token>/bucket.offsets`
+- `/_indexes/<token>/bucket_nav.l1`
+- `/_indexes/<token>/bucket_nav.l2`
+- `/_indexes/<token>/partial.values`
+- `/_indexes/<token>/partial.positions`
+- `/_indexes/<token>/partial.offsets`
+- `/_indexes/<token>/partial_nav.l1`
+- `/_indexes/<token>/partial_nav.l2`
+- `/_indexes/<token>/full.values`
+- `/_indexes/<token>/full.positions`
+- `/_indexes/<token>/full_nav.l1`
+- `/_indexes/<token>/full_nav.l2`
+- `/_indexes/<token>/full_run.<n>.values`
+- `/_indexes/<token>/full_run.<n>.positions`
+
+Notes:
+
+- `token` should match the current indexing token model:
+  - field token for column indexes
+  - normalized expression token for expression indexes
+- all index payloads should stay under `/_indexes/<token>/`
+- query-cache payloads, if reused for `CTable`, should also be table-owned and
+  not emitted as sibling files outside the table root
+
+## Metadata Placement
+
+The top-level table manifest in `/_meta.vlmeta` should gain index catalog
+entries and epoch counters.
+ +Recommended fields: + +``` +{ + "kind": "ctable", + "version": 1, + "schema": {...}, + "index_catalog_version": 1, + "value_epoch": 0, + "visibility_epoch": 0, + "indexes": { + "id": { + "name": "id", + "token": "id", + "target": {"source": "column", "column": "id"}, + "kind": "full", + "version": 1, + "persistent": True, + "stale": False, + "built_value_epoch": 3, + ... + } + } +} +``` + +Notes: + +- do not keep a historical list of epochs +- overwrite descriptor metadata on rebuild +- descriptors remain small; large payloads stay in `/_indexes/...` +- index catalog ownership remains at the table level, not per-column + +## Public API + +The `CTable` surface should mirror `NDArray` as closely as possible: + +```python +table.create_index("id", kind=blosc2.IndexKind.FULL) +table.drop_index("id") +table.rebuild_index("id") +table.compact_index("id") +table.index("id") +table.indexes +``` + +### Initial target support + +Phase 1 should support column indexes only: + +- `table.create_index("id", kind=...)` +- `table.create_index(field="id", kind=...)` + +Phase 2 can add expression indexes: + +- `table.create_index(expression="abs(score - baseline)", operands=...)` + +but only when all operands resolve to columns from the same `CTable`. + +### Descriptor identity + +Use one active index per target, matching current `NDArray` behavior: + +- one index per column token +- one index per normalized expression token +- optional `name=` remains a label, not identity + +## Query Integration Model + +### Current `CTable` behavior + +Today, `CTable` column comparisons produce `NDArray` or `LazyExpr` results over +physical rows, and `CTable.where(...)` then: + +1. computes the filter +2. pads or trims it +3. intersects it with `_valid_rows` +4. returns a view + +This is correct but fully scan-based. + +### Proposed behavior + +Teach `CTable.where(...)` to detect when the incoming predicate is a `LazyExpr` +that can be interpreted as a table query over table-owned columns. 
+ +For such predicates: + +1. normalize the expression into a table-query descriptor +2. ask a new `CTable` planner for candidate physical row positions +3. intersect candidates with `_valid_rows` +4. evaluate any residual predicate only on surviving candidates +5. produce the final boolean mask or direct row-position set +6. return the usual `CTable` view + +If any step is unsupported, fall back to the current eager full-filter path. + +## Planner Strategy + +Do not build a second independent indexing engine for `CTable`. + +Instead, refactor the current engine into: + +- reusable target normalization +- reusable index build logic +- reusable query plan primitives +- storage backends: + - `NDArrayIndexStorage` + - `CTableIndexStorage` + +### Reusable concepts from current `indexing.py` + +The following should be kept conceptually unchanged: + +- index kinds: summary / bucket / partial / full +- descriptor structure where practical +- target token resolution +- exact and segment planning +- ordered full-index reuse +- full-index compaction model + +### New `CTable` planner layer + +Add a thin planner layer that: + +- maps expression operands back to `CTable` columns +- resolves which indexed columns can participate +- requests index plans from the underlying column index implementation +- intersects or combines candidate physical positions +- reports a reason when indexed planning is not possible + +For v1: + +- single-column predicates should be first-class +- multi-column conjunctions should be supported when each term can be planned independently +- disjunctions can initially fall back to scan if they complicate correctness + +## Row Visibility Semantics + +`CTable` indexes should be defined over physical row positions, not over the +current live-row numbering. 
+ +That means: + +- index payloads refer to physical positions in the backing arrays +- `_valid_rows` remains the source of truth for row visibility +- deleted rows are filtered at query execution time + +This is important because deletes in `CTable` do not rewrite columns; they only +flip visibility bits. + +## Epoch Model + +The epoch model is intentionally small. + +### Table-level counters + +Store only: + +- `value_epoch` +- `visibility_epoch` + +Both are monotonically increasing integers in top-level table metadata. + +### Per-index metadata + +Each descriptor stores: + +- `built_value_epoch` + +Optionally later: + +- `built_visibility_epoch` + +but this is not required in the first implementation. + +### Why this is enough + +- if indexed values or row order change, the index may be invalid: + bump `value_epoch` +- if only `_valid_rows` changes, the index still points to correct physical + rows; execution can intersect with current visibility: + bump `visibility_epoch` + +No epoch history is retained. There is no cleanup problem because only current +scalar values are stored. + +## Mutation Rules + +### Mutations that should bump `value_epoch` + +- `append(...)` +- `extend(...)` +- column writes through `Column.__setitem__` +- `Column.assign(...)` +- `compact()` +- `sort_by(inplace=True)` +- any future row rewrite / reorder operation +- add / drop / rename column for affected targets + +### Mutations that should bump `visibility_epoch` only + +- `delete(...)` + +### Initial stale policy + +For a first implementation, keep rebuild behavior conservative: + +- if a mutation changes indexed values or row positions: + - set affected indexes stale + - bump `value_epoch` +- if only visibility changes: + - do not set indexes stale + - bump `visibility_epoch` + +This is simpler than trying to preserve append-compatible incremental +maintenance on day one. 
+ +## Incremental Maintenance Policy + +The current `NDArray` engine supports limited append maintenance for some index +types. `CTable` does not need to replicate all of that immediately. + +Recommended rollout: + +### Phase 1 + +- create / drop / rebuild / compact indexes +- mark value-changing mutations stale +- keep deletes valid via `_valid_rows` + +### Phase 2 + +- optimize append / extend maintenance for column indexes +- reuse full-index append-run logic where practical +- decide whether summary / bucket / partial can be refreshed incrementally for + appended ranges without rebuilding everything + +The plan should prefer correctness and clear ownership before maintenance +optimizations. + +## Ordered Queries + +The smoothest integration with current `CTable` querying is: + +- filtering remains `table.where(predicate)` +- ordered access is added later in a table-appropriate way + +Possible later APIs: + +- `table.where(expr).sort_by("id")` with index reuse +- `table.where(expr).argsort(order="id")` on a row-index result abstraction +- dedicated row-position helpers for internal use + +For the first version, the main target should be indexed filtering, not full +ordered traversal. + +However, the storage format should not block future ordered reuse, so `full` +indexes should still store enough information to support: + +- ordered filtered row positions +- stable tie handling +- secondary refinement + +## Refactoring Needed in `indexing.py` + +The current implementation mixes three concerns: + +1. planner / evaluator logic +2. metadata ownership +3. sidecar path naming and opening + +To support `CTable`, split these concerns. 
+ +### Step A: storage abstraction + +Introduce an internal storage protocol with responsibilities like: + +- load/save index catalog +- derive payload location for a component +- open/store/remove sidecar arrays +- load/save query-cache catalog and payloads + +Concrete implementations: + +- `NDArrayIndexStorage` +- `CTableIndexStorage` + +### Step B: generic target abstraction + +Introduce an internal target wrapper that represents: + +- base length +- dtype +- chunks / blocks +- slice access for the indexed value stream +- optional block-read helpers +- identity for query cache keys + +For `CTable`, the target for a column index is the column `NDArray`, but +descriptor ownership and sidecar storage are table-owned. + +### Step C: planner entry points + +Keep the existing `NDArray` public entry points intact, but allow internal +planner functions to accept the new abstractions rather than hard-coded raw +`NDArray` ownership assumptions. + +## `CTable` Internal Changes + +### New helpers on `CTable` + +Add private helpers for: + +- resolving the root table from a view +- checking whether a `LazyExpr` is table-plannable +- mapping operands back to column names +- building a physical-position result into a boolean mask +- reading and writing index metadata via storage + +### New helpers on `FileTableStorage` + +Add persistent helpers for: + +- `index_root(token)` +- `index_component_key(token, component_name)` +- create/open/delete index sidecars under `/_indexes/...` +- load/save index catalog in `/_meta` +- load/save table epoch counters + +### View behavior + +Views should not own indexes. + +Rules: + +- creating or dropping indexes on a view should raise +- querying a view may reuse root-table indexes +- planner must always combine indexed matches with the view's current mask + +## Expression Index Scope + +Expression indexes are valuable but should not be part of the first patch +unless the column-index path is already stable. + +Recommended sequence: + +1. 
column indexes only +2. exact-match multi-column filtering using multiple column indexes +3. expression indexes over same-table columns +4. ordered reuse + +When expression indexes are added, require: + +- all operands belong to the same base `CTable` +- expression normalization produces a stable token +- dependencies are stored by column name, not transient operand aliases + +## Query Cache Scope + +The existing query cache in `indexing.py` is array-owned. + +For `CTable`, if reused, it should be table-owned as well: + +- cache identity should include the table root plus query descriptor +- cache invalidation should happen on `value_epoch` changes +- visibility-only changes can either: + - invalidate conservatively in v1, or + - be ignored if cached results are always post-filtered through current `_valid_rows` + +To keep the first version smaller, query-cache reuse can be deferred entirely. + +## Validation and Reserved Names + +Extend reserved internal names for persistent `CTable` layout: + +- `_meta` +- `_valid_rows` +- `_cols` +- `_indexes` + +If the schema compiler already blocks these, document it. If not, extend the +reserved-name validation explicitly. 
+
+## Error Handling
+
+Recommended behavior:
+
+- creating an index on a view: `ValueError`
+- creating an index on a missing column: `KeyError`
+- creating an unsupported index target: `TypeError` or `ValueError`
+- querying with a non-plannable expression: silent scan fallback
+- querying with malformed index metadata: clear error on open/use
+- compacting a non-`full` index: same semantics as current engine
+
+## Testing Plan
+
+### Storage and metadata
+
+Add tests for:
+
+- create persistent `CTable` column index
+- reopen table and see the index catalog
+- verify index payloads are stored under `/_indexes/...`
+- verify no sidecar siblings are emitted outside the table root layout
+- drop index removes `/_indexes/<token>/...`
+
+### Query correctness
+
+Add tests for:
+
+- equality and range predicates on indexed columns
+- same queries on reopened persistent tables
+- results match scan-based filtering
+- deleted rows are excluded without rebuilding the index
+- appending after index creation follows the chosen stale policy
+
+### View semantics
+
+Add tests for:
+
+- view queries can reuse parent indexes
+- creating indexes on views is rejected
+- view mask and `_valid_rows` are both respected
+
+### Mutation semantics
+
+Add tests for:
+
+- delete bumps visibility only and keeps index query correctness
+- overwrite of indexed column marks index stale
+- compact marks index stale
+- inplace sort marks index stale
+- rebuild refreshes `built_value_epoch`
+
+### Multi-column planning
+
+Add tests for:
+
+- one indexed term + one unindexed residual term
+- two indexed conjunctive terms
+- unsupported disjunction falls back correctly
+
+## Documentation Plan
+
+The feature should not land with code and tests only. It needs user-facing
+documentation from the start.
+ +### Examples + +Add runnable examples under `examples/ctable` covering at least: + +- creating a `CTable` index on one column +- querying a `CTable` with an indexed predicate +- reopening a persistent table and reusing the index +- basic index management such as `indexes`, `index(...)`, `drop_index(...)`, + and `rebuild_index(...)` + +### Tutorial + +Add a dedicated tutorial notebook at: + +- `doc/getting_started/tutorials/15.indexing-ctables.ipynb` + +The tutorial should explain: + +- what a `CTable` index is +- how indexes relate to columns and to the table as a whole +- how persistence works for indexed tables +- what kinds of queries benefit from indexes +- what happens after deletes and other mutations +- how to inspect and maintain indexes + +### API docstrings and Sphinx integration + +Do not treat docstrings as optional follow-up work. + +For every new public `CTable` indexing API entry point, add fully descriptive +docstrings with small examples, following the style already used elsewhere in +the codebase. + +This includes, as applicable: + +- `CTable.create_index(...)` +- `CTable.drop_index(...)` +- `CTable.rebuild_index(...)` +- `CTable.compact_index(...)` +- `CTable.index(...)` +- `CTable.indexes` + +The docstrings should cover: + +- parameters +- return values +- persistence behavior +- mutation / stale behavior where relevant +- short examples that show the intended usage + +These APIs should also be integrated into the Sphinx docs so they are reachable +from the generated documentation, not only from source docstrings. + +## Recommended Implementation Order + +### Phase 1: storage foundations + +1. add `/_indexes` reserved subtree conventions +2. extend `FileTableStorage` with index catalog and sidecar helpers +3. add table-level epoch metadata + +### Phase 2: API skeleton + +4. add `CTable.create_index`, `drop_index`, `rebuild_index`, `compact_index`, + `index`, and `indexes` +5. implement build/drop/rebuild against column targets only +6. 
keep query path unchanged initially + +### Phase 3: planner integration + +7. refactor `indexing.py` storage ownership assumptions +8. add `CTable` query planner shim +9. teach `CTable.where(...)` to use indexed planning when possible +10. keep scan fallback for everything else + +### Phase 4: mutation policy + +11. wire `value_epoch` / `visibility_epoch` +12. mark affected indexes stale on value-changing mutations +13. keep delete visibility index-safe without rebuild + +### Phase 5: follow-up optimizations + +14. consider append-aware maintenance +15. consider expression indexes +16. consider ordered reuse for table queries +17. consider query-cache reuse + +### Phase 6: documentation + +18. add `examples/ctable` indexing examples +19. add `doc/getting_started/tutorials/15.indexing-ctables.ipynb` +20. add full public docstrings with examples for the `CTable` indexing API +21. integrate the new API and tutorial into Sphinx documentation + +## Non-Goals for the First Implementation + +Do not include these in the first patch unless they come almost for free: + +- full expression-index support +- ordered query reuse for `CTable` +- disjunction planning across multiple indexes +- aggressive incremental maintenance for all index kinds +- index-aware query caching +- cross-table expression operands + +## Future Work + +One possible future storage evolution would be to make each persisted column a +subtree root instead of a single leaf object. 
+ +That would allow a layout more like: + +- `/_cols/id/data` +- `/_cols/id/indexes/...` +- `/_cols/id/missing/...` +- `/_cols/id/sidecars/...` +- `/_cols/score/data` +- `/_cols/score/indexes/...` + +Potential benefits: + +- stronger locality between a column and its derived artifacts +- easier `rename_column()` and `drop_column()` handling +- a natural home for future per-column sidecars beyond indexes +- room for explicit missing-value bitmaps, nullability metadata, sketches, or + other derived column structures + +Potential costs: + +- this would be a real `CTable` storage-schema change, not just an indexing feature +- current persisted tables would need migration or dual-layout support +- `FileTableStorage` and open/materialization logic would become more complex +- the benefit is broader than indexing, so it is better considered as part of a + larger storage-layout revision + +For that reason, this plan does not assume that redesign. It keeps the current +column-leaf layout and places indexes in a table-owned `/_indexes` subtree. + +## Summary + +The right model is: + +- indexes are table-managed, not column-autonomous +- column indexes are still built from and logically targeted at individual + column arrays +- persistent index artifacts live under `/_indexes` +- existing `indexing.py` logic is reused through refactoring, not duplicated +- deletes remain cheap by treating indexes as physical-row structures and + applying `_valid_rows` at execution time +- epoch tracking stays minimal: a small number of table-level counters, not a + growing history + +This keeps the user model coherent with current `CTable` persistence and as +close as possible to the existing `NDArray` indexing API. 
diff --git a/tree_store_extensions.md b/plans/tree_store_extensions.md similarity index 100% rename from tree_store_extensions.md rename to plans/tree_store_extensions.md diff --git a/src/blosc2/core.py b/src/blosc2/core.py index 809c209a..ceb78acd 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -834,7 +834,7 @@ def load_tensor(urlpath: str, dparams: dict | None = None) -> tensorflow.Tensor :func:`~blosc2.save_tensor` :func:`~blosc2.pack_tensor` """ - schunk = blosc2.open(urlpath, dparams=dparams) + schunk = blosc2.open(urlpath, mode="r", dparams=dparams) return _unpack_tensor(schunk) diff --git a/src/blosc2/ctable.py b/src/blosc2/ctable.py index 3b1f3863..13fb5080 100644 --- a/src/blosc2/ctable.py +++ b/src/blosc2/ctable.py @@ -12,10 +12,13 @@ import contextlib import dataclasses +import itertools import os +import pprint import shutil from collections.abc import Iterable from dataclasses import MISSING +from textwrap import TextWrapper from typing import Any, Generic, TypeVar import numpy as np @@ -37,6 +40,7 @@ def wrapper(*args, **kwargs): import blosc2 +from blosc2.info import InfoReporter, format_nbytes_info from blosc2.schema import SchemaSpec from blosc2.schema_compiler import ( ColumnConfig, @@ -47,6 +51,223 @@ def wrapper(*args, **kwargs): compute_display_width, ) +# --------------------------------------------------------------------------- +# Index proxy and CTableIndex +# --------------------------------------------------------------------------- + + +class _FakeVlMeta: + """Minimal vlmeta stand-in that accepts writes without touching a real SChunk.""" + + def __init__(self): + self._data: dict = {} + + def __getitem__(self, key): + return self._data[key] + + def __setitem__(self, key, value): + self._data[key] = value + + def get(self, key, default=None): + return self._data.get(key, default) + + +class _FakeSchunk: + """Minimal SChunk stand-in whose vlmeta stores in memory.""" + + def __init__(self): + self.vlmeta = _FakeVlMeta() + + 
+class _CTableIndexProxy: + """Minimal shim that lets the ``indexing`` module build sidecars for a + CTable column without touching the column's own ``schunk.vlmeta``. + + Attributes mirror those required by the internal build functions: + ``urlpath``, ``schunk``, ``shape``, ``ndim``, ``dtype``, ``chunks``, + ``blocks``, and item access via ``__getitem__``. + """ + + def __init__(self, col_array: blosc2.NDArray, anchor_urlpath: str | None) -> None: + self._col_array = col_array + self.urlpath = anchor_urlpath # controls sidecar placement + self.schunk = _FakeSchunk() + self.shape = col_array.shape + self.ndim = col_array.ndim + self.dtype = col_array.dtype + self.chunks = col_array.chunks + self.blocks = col_array.blocks + + def __getitem__(self, key): + return self._col_array[key] + + +class CTableIndex: + """A handle on an index attached to a :class:`CTable` column. + + Returned by :meth:`CTable.index` and items of :attr:`CTable.indexes`. + Provides :meth:`drop`, :meth:`rebuild`, and :meth:`compact` convenience + methods that delegate back to the owning table. 
+ """ + + def __init__(self, table: CTable, col_name: str, descriptor: dict) -> None: + self._table = table + self._col_name = col_name + self._descriptor = descriptor + + @property + def col_name(self) -> str: + """Column name this index targets.""" + return self._col_name + + @property + def kind(self) -> str: + """Index kind string (``'bucket'``, ``'partial'``, or ``'full'``).""" + return self._descriptor.get("kind", "") + + @property + def stale(self) -> bool: + """True if the index is stale and needs rebuilding.""" + return bool(self._descriptor.get("stale", False)) + + @property + def name(self) -> str | None: + """Optional human-readable name assigned at creation time.""" + return self._descriptor.get("name") or None + + @property + def nbytes(self) -> int: + """Total uncompressed size in bytes for this index payload.""" + from blosc2.indexing import _component_nbytes, iter_index_components + + root = self._table._root_table + col_arr = root._cols[self._col_name] + descriptor = self._descriptor + return sum( + _component_nbytes(col_arr, descriptor, component) + for component in iter_index_components(col_arr, descriptor) + ) + + @property + def cbytes(self) -> int: + """Total compressed size in bytes for this index payload.""" + from blosc2.indexing import _component_cbytes, iter_index_components + + root = self._table._root_table + col_arr = root._cols[self._col_name] + descriptor = self._descriptor + return sum( + _component_cbytes(col_arr, descriptor, component) + for component in iter_index_components(col_arr, descriptor) + ) + + @property + def cratio(self) -> float: + """Compression ratio for this index payload.""" + cbytes = self.cbytes + if cbytes == 0: + return float("inf") + return self.nbytes / cbytes + + def storage_stats(self) -> tuple[int, int, float] | None: + """Return ``(nbytes, cbytes, cratio)`` when sidecars are directly measurable.""" + try: + nbytes = self.nbytes + cbytes = self.cbytes + except (FileNotFoundError, OSError, RuntimeError, 
KeyError, ValueError): + root = self._table._root_table + if not isinstance(root._storage, FileTableStorage): + return None + + from blosc2.indexing import iter_index_components + + descriptor = self._descriptor + col_arr = root._cols[self._col_name] + store = root._storage._open_store() + nbytes = 0 + cbytes = 0 + try: + for component in iter_index_components(col_arr, descriptor): + if component.path is None: + return None + key = self._component_store_key(component.path) + obj = store[key] + nbytes += int(obj.nbytes) + cbytes += int(obj.cbytes) + except (FileNotFoundError, OSError, RuntimeError, KeyError, ValueError): + return None + cratio = float("inf") if cbytes == 0 else nbytes / cbytes + return nbytes, cbytes, cratio + + @staticmethod + def _component_store_key(path: str) -> str: + """Return the logical TreeStore key for an index component path.""" + normalized = path.replace("\\", "/") + marker = "_indexes/" + idx = normalized.find(marker) + if idx < 0: + raise KeyError(f"Cannot resolve index component path {path!r} inside table store.") + relpath = normalized[idx:] + for suffix in (".b2nd", ".b2f"): + if relpath.endswith(suffix): + relpath = relpath[: -len(suffix)] + break + return "/" + relpath.lstrip("/") + + def drop(self) -> None: + """Drop this index from the owning table.""" + self._table.drop_index(self._col_name) + + def rebuild(self) -> CTableIndex: + """Rebuild this index and return the updated handle.""" + return self._table.rebuild_index(self._col_name) + + def compact(self) -> CTableIndex: + """Compact this index (merge incremental runs) and return the updated handle.""" + return self._table.compact_index(self._col_name) + + def __repr__(self) -> str: + stale_str = " (stale)" if self.stale else "" + name_str = f" name={self.name!r}" if self.name else "" + return f"" + + +class _CTableInfoReporter(InfoReporter): + """Info reporter that also preserves the historic ``t.info()`` call style.""" + + def __repr__(self) -> str: + items = 
self.obj.info_items + max_key_len = max(len(k) for k, _ in items) + parts = [] + for key, value in items: + if isinstance(value, dict): + parts.append(f"{key.ljust(max_key_len)} :") + pretty = pprint.pformat(value, sort_dicts=False) + parts.extend(f" {line}" for line in pretty.splitlines()) + continue + + wrapper = TextWrapper( + width=96, + initial_indent=key.ljust(max_key_len) + " : ", + subsequent_indent=" " * max_key_len + " : ", + ) + parts.append(wrapper.fill(str(value))) + return "\n".join(parts) + "\n" + + def __call__(self) -> None: + print(repr(self), end="") + + +class _InfoLiteral: + """Pretty-printer helper for unquoted literal values inside info dicts.""" + + def __init__(self, text: str) -> None: + self.text = text + + def __repr__(self) -> str: + return self.text + + # RowT is intentionally left unbound so CTable works with both dataclasses # and legacy Pydantic models during the transition period. RowT = TypeVar("RowT") @@ -61,7 +282,9 @@ def _arange(start, stop=None, step=1) -> blosc2.NDArray | np.ndarray: if stop is None: start, stop = 0, start n = len(range(start, stop, step)) - return blosc2.arange(start, stop, step) if n >= _BLOSC2_ARANGE_THRESHOLD else np.arange(start, stop, step) + return ( + blosc2.arange(start, stop, step) if n >= _BLOSC2_ARANGE_THRESHOLD else np.arange(start, stop, step) + ) # --------------------------------------------------------------------------- @@ -253,6 +476,8 @@ def __getitem__(self, col_name: str): class Column: + _REPR_PREVIEW_ITEMS = 8 + def __init__(self, table: CTable, col_name: str, mask=None): self._table = table self._col_name = col_name @@ -351,6 +576,7 @@ def __setitem__(self, key: int | slice | list | np.ndarray, value): else: raise TypeError(f"Invalid index type: {type(key)}") + self._table._root_table._mark_all_indexes_stale() def __iter__(self): arr = self._valid_rows @@ -374,6 +600,21 @@ def __iter__(self): data_chunk = self._raw_col[chunk_start : chunk_start + actual_size] yield from 
data_chunk[mask_chunk] + def __repr__(self) -> str: + preview_items = [] + for value in itertools.islice(self, self._REPR_PREVIEW_ITEMS + 1): + if isinstance(value, np.generic): + value = value.item() + preview_items.append(repr(value)) + + truncated = len(preview_items) > self._REPR_PREVIEW_ITEMS + if truncated: + preview_items = preview_items[: self._REPR_PREVIEW_ITEMS] + preview_items.append("...") + + preview = ", ".join(preview_items) + return f"Column({self._col_name!r}, dtype={self.dtype}, len={len(self)}, values=[{preview}])" + def __len__(self): return blosc2.count_nonzero(self._valid_rows) @@ -500,6 +741,7 @@ def assign(self, data) -> None: raise TypeError(f"Cannot coerce data to column dtype {self.dtype!r}: {exc}") from exc live_pos = np.where(self._valid_rows[:])[0] self._raw_col[live_pos] = arr + self._table._root_table._mark_all_indexes_stale() def unique(self) -> np.ndarray: """Return sorted array of unique live values. @@ -1916,6 +2158,13 @@ def drop_column(self, name: str) -> None: if len(self.col_names) == 1: raise ValueError("Cannot drop the last column.") + catalog = self._storage.load_index_catalog() + if name in catalog: + descriptor = catalog.pop(name) + self._validate_index_descriptor(name, descriptor) + self._drop_index_descriptor(name, descriptor) + self._storage.save_index_catalog(catalog) + if isinstance(self._storage, FileTableStorage): self._storage.delete_column(name) @@ -1954,6 +2203,15 @@ def rename_column(self, old: str, new: str) -> None: raise ValueError(f"Column {new!r} already exists.") _validate_column_name(new) + catalog = self._storage.load_index_catalog() + rebuild_kwargs = None + if old in catalog: + descriptor = catalog.pop(old) + self._validate_index_descriptor(old, descriptor) + rebuild_kwargs = self._index_create_kwargs_from_descriptor(descriptor) + self._drop_index_descriptor(old, descriptor) + self._storage.save_index_catalog(catalog) + if isinstance(self._storage, FileTableStorage): self._cols[new] = 
self._storage.rename_column(old, new) else: @@ -1982,6 +2240,8 @@ def rename_column(self, old: str, new: str) -> None: ) if isinstance(self._storage, FileTableStorage): self._storage.save_schema(schema_to_dict(self._schema)) + if rebuild_kwargs is not None: + self.create_index(new, **rebuild_kwargs) # ------------------------------------------------------------------ # Column access @@ -2019,6 +2279,7 @@ def compact(self): self._valid_rows[: self._n_rows] = True self._valid_rows[self._n_rows :] = False self._last_pos = self._n_rows # next write goes right after live rows + self._mark_all_indexes_stale() def _normalise_sort_keys( self, @@ -2130,6 +2391,7 @@ def sort_by( self._valid_rows[n:] = False self._n_rows = n self._last_pos = n + self._mark_all_indexes_stale() return self else: # Build a new in-memory table with the sorted rows @@ -2209,6 +2471,13 @@ def nbytes(self) -> int: """Total uncompressed size in bytes (all columns + valid_rows mask).""" return sum(col.nbytes for col in self._cols.values()) + self._valid_rows.nbytes + @property + def cratio(self) -> float: + """Compression ratio for the whole table payload.""" + if self.cbytes == 0: + return float("inf") + return self.nbytes / self.cbytes + @property def schema(self) -> CompiledSchema: """The compiled schema that drives this table's columns and validation.""" @@ -2231,33 +2500,652 @@ def schema_dict(self) -> dict[str, Any]: """Return a JSON-compatible dict describing this table's schema.""" return schema_to_dict(self._schema) - def info(self) -> None: - """Print a concise summary of the CTable.""" - ratio = (self.nbytes / self.cbytes) if self.cbytes > 0 else 0.0 + # ------------------------------------------------------------------ + # Index management + # ------------------------------------------------------------------ - lines = [] - lines.append("") - lines.append(f"nºColumns: {self.ncols}") - lines.append(f"nºRows: {self.nrows}") - lines.append("") + @property + def _root_table(self) -> CTable: + 
"""Return the root (non-view) table; *self* if not a view.""" + t = self + while t.base is not None: + t = t.base + return t + + def _mark_all_indexes_stale(self) -> None: + """Bump value_epoch and mark every catalog entry stale on the root table.""" + root = self._root_table + root._storage.bump_value_epoch() + catalog = root._storage.load_index_catalog() + if not catalog: + return + changed = False + for desc in catalog.values(): + if not desc.get("stale", False): + desc["stale"] = True + changed = True + if changed: + root._storage.save_index_catalog(catalog) + + @staticmethod + def _validate_index_descriptor(col_name: str, descriptor: dict) -> None: + """Raise ValueError when an index catalog entry is malformed.""" + if not isinstance(descriptor, dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: descriptor must be a dict.") + token = descriptor.get("token") + if not isinstance(token, str) or not token: + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing token.") + kind = descriptor.get("kind") + if kind not in {"summary", "bucket", "partial", "full"}: + raise ValueError(f"Malformed index metadata for column {col_name!r}: invalid kind {kind!r}.") + if kind == "bucket" and not isinstance(descriptor.get("bucket"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing bucket payload.") + if kind == "partial" and not isinstance(descriptor.get("partial"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing partial payload.") + if kind == "full" and not isinstance(descriptor.get("full"), dict): + raise ValueError(f"Malformed index metadata for column {col_name!r}: missing full payload.") + + def _drop_index_descriptor(self, col_name: str, descriptor: dict) -> None: + """Delete sidecars/cache for a catalog descriptor without touching the column mapping.""" + from pathlib import Path + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + 
_PERSISTENT_INDEXES, + _array_key, + _clear_cached_data, + _drop_descriptor_sidecars, + _is_persistent_array, + ) - header = f" {'#':>3} {'Column':<15} {'Itemsize':<12} {'Dtype':<15}" - lines.append(header) - lines.append(f" {'---':>3} {'------':<15} {'--------':<12} {'-----':<15}") + col_arr = self._cols.get(col_name) + token = descriptor["token"] + + if col_arr is not None: + _clear_cached_data(col_arr, token) + + if col_arr is not None and _is_persistent_array(col_arr): + arr_key = _array_key(col_arr) + store = _PERSISTENT_INDEXES.get(arr_key) + if store is not None: + store["indexes"].pop(token, None) + elif col_arr is not None: + store = _IN_MEMORY_INDEXES.get(id(col_arr)) + if store is not None: + store["indexes"].pop(token, None) + + _drop_descriptor_sidecars(descriptor) + + anchor = self._storage.index_anchor_path(col_name) + if anchor is not None: + proxy_key = ("persistent", str(Path(anchor).resolve())) + _PERSISTENT_INDEXES.pop(proxy_key, None) + with contextlib.suppress(OSError): + os.rmdir(os.path.dirname(anchor)) + + def _index_create_kwargs_from_descriptor(self, descriptor: dict) -> dict[str, Any]: + """Return create_index kwargs that rebuild an existing descriptor.""" + build = "ooc" if bool(descriptor.get("ooc", False)) else "memory" + return { + "kind": descriptor["kind"], + "optlevel": int(descriptor.get("optlevel", 5)), + "name": descriptor.get("name") or None, + "build": build, + "cparams": descriptor.get("cparams"), + } - for i, name in enumerate(self.col_names): - col_array = self._cols[name] - dtype_str = str(col_array.dtype) - itemsize = f"{col_array.dtype.itemsize} B" - lines.append(f" {i:>3} {name:<15} {itemsize:<12} {dtype_str:<15}") + def _build_index_persistent( + self, + col_name: str, + col_arr: blosc2.NDArray, + *, + kind: str, + optlevel: int, + name_hint: str | None, + build: str, + tmpdir: str | None, + cparams_obj, + ) -> dict: + """Build index sidecar files for a persistent-table column; return the descriptor.""" + import 
tempfile + from pathlib import Path + + from blosc2.indexing import ( + _PERSISTENT_INDEXES, + _array_key, + _build_bucket_descriptor, + _build_bucket_descriptor_ooc, + _build_descriptor, + _build_full_descriptor, + _build_full_descriptor_ooc, + _build_levels_descriptor, + _build_levels_descriptor_ooc, + _build_partial_descriptor, + _build_partial_descriptor_ooc, + _copy_descriptor, + _field_target_descriptor, + _resolve_full_index_tmpdir, + _resolve_ooc_mode, + _target_token, + _values_for_target, + ) - lines.append("") - lines.append(f"memory usage: {_fmt_bytes(self.cbytes)}") - lines.append(f"uncompressed size: {_fmt_bytes(self.nbytes)}") - lines.append(f"compression ratio: {ratio:.2f}x") - lines.append("") + anchor = self._storage.index_anchor_path(col_name) + os.makedirs(os.path.dirname(anchor), exist_ok=True) + proxy = _CTableIndexProxy(col_arr, anchor) + proxy_key = _array_key(proxy) + _PERSISTENT_INDEXES.pop(proxy_key, None) # clear any stale cache entry + + target = _field_target_descriptor(None) + token = _target_token(target) + persistent = True + dtype = col_arr.dtype + use_ooc = _resolve_ooc_mode(kind, build) + + if use_ooc: + resolved_tmpdir = _resolve_full_index_tmpdir(proxy, tmpdir) + levels = _build_levels_descriptor_ooc(proxy, target, token, kind, dtype, persistent, cparams_obj) + bucket = ( + _build_bucket_descriptor_ooc( + proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj + ) + if kind == "bucket" + else None + ) + partial = ( + _build_partial_descriptor_ooc( + proxy, target, token, kind, dtype, optlevel, persistent, cparams_obj + ) + if kind == "partial" + else None + ) + full = None + if kind == "full": + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-", dir=resolved_tmpdir) as td: + full = _build_full_descriptor_ooc( + proxy, target, token, kind, dtype, persistent, Path(td), cparams_obj + ) + descriptor = _build_descriptor( + proxy, + target, + token, + kind, + optlevel, + persistent, + True, + name_hint, + 
dtype, + levels, + bucket, + partial, + full, + cparams_obj, + ) + else: + values = _values_for_target(proxy, target) + levels = _build_levels_descriptor( + proxy, target, token, kind, dtype, values, persistent, cparams_obj + ) + bucket = ( + _build_bucket_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) + if kind == "bucket" + else None + ) + partial = ( + _build_partial_descriptor(proxy, token, kind, values, optlevel, persistent, cparams_obj) + if kind == "partial" + else None + ) + full = ( + _build_full_descriptor(proxy, token, kind, values, persistent, cparams_obj) + if kind == "full" + else None + ) + descriptor = _build_descriptor( + proxy, + target, + token, + kind, + optlevel, + persistent, + False, + name_hint, + dtype, + levels, + bucket, + partial, + full, + cparams_obj, + ) - print("\n".join(lines)) + result = _copy_descriptor(descriptor) + _PERSISTENT_INDEXES.pop(proxy_key, None) # evict proxy to avoid memory leak + return result + + def create_index( + self, + col_name: str, + *, + kind: blosc2.IndexKind = blosc2.IndexKind.BUCKET, + optlevel: int = 5, + name: str | None = None, + build: str = "auto", + tmpdir: str | None = None, + **kwargs, + ) -> CTableIndex: + """Build and register an index for a column. + + Parameters + ---------- + col_name: + Name of the column to index. + kind: + Index kind. One of :attr:`blosc2.IndexKind.BUCKET` (default), + :attr:`blosc2.IndexKind.PARTIAL`, or :attr:`blosc2.IndexKind.FULL`. + optlevel: + Optimisation level (1–9). Higher values give more precise pruning + at the cost of larger index files. Default is 5. + name: + Optional human-readable label for the index. + build: + Build strategy: ``'auto'``, ``'memory'``, or ``'ooc'`` (out-of-core). + tmpdir: + Temporary directory for out-of-core builds. ``None`` means use the + column's own directory (persistent tables) or the system temporary + directory (in-memory tables). + **kwargs: + Pass ``cparams=`` to customise index compression. 
+ + Returns + ------- + CTableIndex + A handle on the newly created index. + + Raises + ------ + ValueError + If called on a view. + KeyError + If *col_name* is not a column of this table. + """ + if self.base is not None: + raise ValueError("Cannot create an index on a view.") + if col_name not in self._cols: + raise KeyError(f"No column named {col_name!r}. Available: {self.col_names}") + catalog = self._storage.load_index_catalog() + if col_name in catalog: + raise ValueError( + f"Index already exists for column {col_name!r}. " + "Call rebuild_index() to replace it or drop_index() first." + ) + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _copy_descriptor, + _normalize_build_mode, + _normalize_index_cparams, + _normalize_index_kind, + ) + from blosc2.indexing import ( + create_index as _ix_create_index, + ) + + cparams_obj = _normalize_index_cparams(kwargs.pop("cparams", None)) + if kwargs: + raise TypeError(f"unexpected keyword argument(s): {', '.join(sorted(kwargs))}") + + kind_str = _normalize_index_kind(kind) + build_str = _normalize_build_mode(build) + col_arr = self._cols[col_name] + is_persistent = self._storage.index_anchor_path(col_name) is not None + + if is_persistent: + descriptor = self._build_index_persistent( + col_name, + col_arr, + kind=kind_str, + optlevel=optlevel, + name_hint=name, + build=build_str, + tmpdir=tmpdir, + cparams_obj=cparams_obj, + ) + else: + _ix_create_index( + col_arr, + field=None, + kind=blosc2.IndexKind(kind_str), + optlevel=optlevel, + name=name, + build=build, + tmpdir=tmpdir, + cparams=cparams_obj, + ) + store = _IN_MEMORY_INDEXES[id(col_arr)] + descriptor = _copy_descriptor(store["indexes"]["__self__"]) + + value_epoch, _ = self._storage.get_epoch_counters() + descriptor["built_value_epoch"] = value_epoch + + catalog = self._storage.load_index_catalog() + catalog[col_name] = descriptor + self._storage.save_index_catalog(catalog) + return CTableIndex(self, col_name, descriptor) + + def drop_index(self, 
col_name: str) -> None: + """Remove the index for *col_name* and delete any sidecar files. + + Parameters + ---------- + col_name: + Column whose index should be dropped. + + Raises + ------ + ValueError + If called on a view. + KeyError + If no index exists for *col_name*. + """ + if self.base is not None: + raise ValueError("Cannot drop an index from a view.") + + catalog = self._storage.load_index_catalog() + if col_name not in catalog: + raise KeyError(f"No index found for column {col_name!r}.") + + descriptor = catalog.pop(col_name) + self._validate_index_descriptor(col_name, descriptor) + self._drop_index_descriptor(col_name, descriptor) + self._storage.save_index_catalog(catalog) + + def rebuild_index(self, col_name: str) -> CTableIndex: + """Drop and recreate the index for *col_name* with the same parameters. + + Parameters + ---------- + col_name: + Column whose index should be rebuilt. + + Returns + ------- + CTableIndex + A handle on the newly built index. + + Raises + ------ + ValueError + If called on a view. + KeyError + If no index exists for *col_name*. + """ + if self.base is not None: + raise ValueError("Cannot rebuild an index on a view.") + + catalog = self._storage.load_index_catalog() + if col_name not in catalog: + raise KeyError(f"No index found for column {col_name!r}.") + + old_desc = catalog[col_name] + self._validate_index_descriptor(col_name, old_desc) + create_kwargs = self._index_create_kwargs_from_descriptor(old_desc) + + self.drop_index(col_name) + return self.create_index(col_name, **create_kwargs) + + def compact_index(self, col_name: str) -> CTableIndex: + """Compact the index for *col_name*, merging any incremental append runs. + + Only meaningful for ``kind='full'`` indexes. For other kinds the call + is a no-op and returns the current handle. + + Parameters + ---------- + col_name: + Column whose index should be compacted. + + Returns + ------- + CTableIndex + A handle reflecting the (possibly updated) index descriptor. 
+ + Raises + ------ + ValueError + If called on a view. + KeyError + If no index exists for *col_name*. + """ + if self.base is not None: + raise ValueError("Cannot compact an index on a view.") + + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _copy_descriptor, + _default_index_store, + _is_persistent_array, + ) + from blosc2.indexing import ( + compact_index as _ix_compact_index, + ) + + catalog = self._storage.load_index_catalog() + if col_name not in catalog: + raise KeyError(f"No index found for column {col_name!r}.") + + col_arr = self._cols[col_name] + descriptor = catalog[col_name] + + if _is_persistent_array(col_arr): + anchor = self._storage.index_anchor_path(col_name) + proxy = _CTableIndexProxy(col_arr, anchor) + proxy_key = _array_key(proxy) + store = _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _PERSISTENT_INDEXES[proxy_key] = store + try: + _ix_compact_index(proxy) + updated_store = _PERSISTENT_INDEXES.get(proxy_key) or store + updated_desc = _copy_descriptor(updated_store["indexes"][descriptor["token"]]) + finally: + _PERSISTENT_INDEXES.pop(proxy_key, None) + updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) + catalog[col_name] = updated_desc + self._storage.save_index_catalog(catalog) + return CTableIndex(self, col_name, updated_desc) + else: + _ix_compact_index(col_arr) + store = _IN_MEMORY_INDEXES.get(id(col_arr)) + if store: + token = descriptor["token"] + updated_desc = _copy_descriptor(store["indexes"].get(token, descriptor)) + updated_desc["built_value_epoch"] = descriptor.get("built_value_epoch", 0) + catalog[col_name] = updated_desc + self._storage.save_index_catalog(catalog) + return CTableIndex(self, col_name, updated_desc) + return CTableIndex(self, col_name, descriptor) + + def index(self, col_name: str) -> CTableIndex: + """Return the index handle for *col_name*. + + Parameters + ---------- + col_name: + Column name to look up. 
+ + Returns + ------- + CTableIndex + + Raises + ------ + KeyError + If no index exists for *col_name*. + """ + catalog = self._root_table._storage.load_index_catalog() + if col_name not in catalog: + raise KeyError(f"No index found for column {col_name!r}.") + return CTableIndex(self, col_name, catalog[col_name]) + + @property + def indexes(self) -> list[CTableIndex]: + """Return a list of :class:`CTableIndex` handles for all active indexes.""" + catalog = self._root_table._storage.load_index_catalog() + return [CTableIndex(self, col_name, desc) for col_name, desc in catalog.items()] + + @staticmethod + def _find_indexed_columns(root_cols, catalog, operands): + """Return live indexed columns referenced by *operands* in expression order.""" + indexed = [] + seen = set() + for operand in operands.values(): + if not isinstance(operand, blosc2.NDArray): + continue + for col_name, col_arr in root_cols.items(): + if col_arr is not operand or col_name in seen or col_name not in catalog: + continue + descriptor = catalog[col_name] + CTable._validate_index_descriptor(col_name, descriptor) + if descriptor.get("stale", False): + continue + indexed.append((col_name, col_arr, descriptor)) + seen.add(col_name) + return indexed + + def _try_index_where(self, expr_result: blosc2.LazyExpr) -> np.ndarray | None: + """Attempt to resolve *expr_result* via a column index. + + Returns a 1-D int64 array of physical row positions that satisfy the + predicate, or ``None`` if no usable index was found (caller falls back + to a full scan). 
+ """ + from blosc2.indexing import ( + _IN_MEMORY_INDEXES, + _PERSISTENT_INDEXES, + _array_key, + _default_index_store, + _is_persistent_array, + evaluate_bucket_query, + evaluate_segment_query, + plan_query, + ) + + root = self._root_table + catalog = root._storage.load_index_catalog() + if not catalog: + return None + + expression = expr_result.expression + operands = dict(expr_result.operands) + + indexed_columns = self._find_indexed_columns(root._cols, catalog, operands) + if not indexed_columns: + return None + + primary_col_name, primary_col_arr, _ = indexed_columns[0] + + # Inject every usable table-owned descriptor so plan_query can combine them. + for _col_name, col_arr, descriptor in indexed_columns: + arr_key = _array_key(col_arr) + if _is_persistent_array(col_arr): + store = _PERSISTENT_INDEXES.get(arr_key) or _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _PERSISTENT_INDEXES[arr_key] = store + else: + store = _IN_MEMORY_INDEXES.get(id(col_arr)) or _default_index_store() + store["indexes"][descriptor["token"]] = descriptor + _IN_MEMORY_INDEXES[id(col_arr)] = store + + where_dict = {"_where_x": primary_col_arr} + merged_operands = {**operands, "_where_x": primary_col_arr} + + plan = plan_query(expression, merged_operands, where_dict) + if not plan.usable: + return None + + if plan.exact_positions is not None: + return np.asarray(plan.exact_positions, dtype=np.int64) + + if plan.bucket_masks is not None: + _, positions = evaluate_bucket_query( + expression, merged_operands, {}, where_dict, plan, return_positions=True + ) + return np.asarray(positions, dtype=np.int64) + + if plan.candidate_units is not None and plan.segment_len is not None: + _, positions = evaluate_segment_query( + expression, merged_operands, {}, where_dict, plan, return_positions=True + ) + return np.asarray(positions, dtype=np.int64) + + return None + + @property + def info_items(self) -> list[tuple[str, object]]: + """Structured summary items used by 
:meth:`info`.""" + storage_type = "persistent" if isinstance(self._storage, FileTableStorage) else "in-memory" + urlpath = self._storage._root if isinstance(self._storage, FileTableStorage) else None + schema_summary = { + name: _InfoLiteral(self._dtype_info_label(self._cols[name].dtype)) for name in self.col_names + } + + index_summary = {} + for idx in self.indexes: + stale = " stale" if idx.stale else "" + label = f" name={idx.name!r}" if idx.name and idx.name != "__self__" else "" + stats = idx.storage_stats() + if stats is None: + suffix = "size=n/a (sidecars not directly addressable)" + else: + _, cbytes, _ = stats + suffix = f"cbytes={format_nbytes_info(cbytes)}" + index_summary[idx.col_name] = f"[{idx.kind}{stale}{label}] {suffix}" + + items = [ + ("type", self.__class__.__name__), + ("storage", storage_type), + ("rows", self.nrows), + ("columns", self.ncols), + ("view", self.base is not None), + ("nbytes", format_nbytes_info(self.nbytes)), + ("cbytes", format_nbytes_info(self.cbytes)), + ("cratio", f"{self.cratio:.1f}x"), + ("schema", schema_summary), + ( + "valid_rows_mask", + f"cbytes={format_nbytes_info(self._valid_rows.cbytes)}", + ), + ("indexes", index_summary if index_summary else "none"), + ] + if urlpath is not None: + items.insert(2, ("urlpath", urlpath)) + open_mode = self._storage.open_mode() + if open_mode is not None: + items.insert(3, ("open_mode", open_mode)) + return items + + @staticmethod + def _dtype_info_label(dtype: np.dtype) -> str: + """Return a compact dtype label for info reports.""" + if dtype.kind == "U": + nchars = dtype.itemsize // 4 + return f"U{nchars} (Unicode, max {nchars} chars)" + if dtype.kind == "S": + return f"S{dtype.itemsize}" + return str(dtype) + + @property + def info(self) -> _CTableInfoReporter: + """Get information about this table. 
+ + Examples + -------- + >>> print(t.info) + >>> t.info() + """ + return _CTableInfoReporter(self) # ------------------------------------------------------------------ # Mutation: append / extend / delete @@ -2306,6 +3194,7 @@ def append(self, data: list | np.void | np.ndarray) -> None: self._valid_rows[pos] = True self._last_pos = pos + 1 self._n_rows += 1 + self._mark_all_indexes_stale() def delete(self, ind: int | slice | str | Iterable) -> None: if self._read_only: @@ -2327,6 +3216,7 @@ def delete(self, ind: int | slice | str | Iterable) -> None: self._valid_rows[:] = valid_rows_np # write back in-place; no new array created self._n_rows -= n_deleted self._last_pos = None # recalculate on next write + self._storage.bump_visibility_epoch() def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> None: if self._read_only: @@ -2388,6 +3278,7 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> self._valid_rows[start_pos:end_pos] = True self._last_pos = end_pos self._n_rows += new_nrows + self._mark_all_indexes_stale() # ------------------------------------------------------------------ # Filtering @@ -2395,12 +3286,26 @@ def extend(self, data: list | CTable | Any, *, validate: bool | None = None) -> @profile def where(self, expr_result) -> CTable: + if isinstance(expr_result, Column): + expr_result = expr_result._raw_col + if not ( isinstance(expr_result, (blosc2.NDArray, blosc2.LazyExpr)) and (getattr(expr_result, "dtype", None) == np.bool_) ): raise TypeError(f"Expected boolean blosc2.NDArray or LazyExpr, got {type(expr_result).__name__}") + # Attempt index-accelerated filtering before falling back to a full scan. 
+ if isinstance(expr_result, blosc2.LazyExpr): + positions = self._try_index_where(expr_result) + if positions is not None: + total = len(self._valid_rows) + mask = np.zeros(total, dtype=bool) + valid_pos = positions[(positions >= 0) & (positions < total)] + mask[valid_pos] = True + mask &= self._valid_rows[:] + return self.view(blosc2.asarray(mask)) + filter = expr_result.compute() if isinstance(expr_result, blosc2.LazyExpr) else expr_result target_len = len(self._valid_rows) diff --git a/src/blosc2/ctable_storage.py b/src/blosc2/ctable_storage.py index 226fc29b..b071ad02 100644 --- a/src/blosc2/ctable_storage.py +++ b/src/blosc2/ctable_storage.py @@ -19,6 +19,7 @@ from __future__ import annotations +import copy import json import os from typing import Any @@ -27,6 +28,10 @@ import blosc2 +# Directory inside the table root that holds per-column index sidecar files. +_INDEXES_DIR = "_indexes" + + # --------------------------------------------------------------------------- # Abstract base # --------------------------------------------------------------------------- @@ -75,6 +80,9 @@ def table_exists(self) -> bool: def is_read_only(self) -> bool: raise NotImplementedError + def open_mode(self) -> str | None: + raise NotImplementedError + def delete_column(self, name: str) -> None: raise NotImplementedError @@ -84,6 +92,36 @@ def rename_column(self, old: str, new: str) -> blosc2.NDArray: def close(self) -> None: raise NotImplementedError + # -- Index catalog and epoch helpers ------------------------------------- + + def load_index_catalog(self) -> dict: + """Return the current index catalog (column_name → descriptor dict).""" + raise NotImplementedError + + def save_index_catalog(self, catalog: dict) -> None: + """Persist *catalog* (column_name → descriptor dict).""" + raise NotImplementedError + + def get_epoch_counters(self) -> tuple[int, int]: + """Return ``(value_epoch, visibility_epoch)``.""" + raise NotImplementedError + + def bump_value_epoch(self) -> int: + 
"""Increment and return the value epoch (data values changed).""" + raise NotImplementedError + + def bump_visibility_epoch(self) -> int: + """Increment and return the visibility epoch (row set changed by delete).""" + raise NotImplementedError + + def index_anchor_path(self, col_name: str) -> str | None: + """Return the urlpath used as the anchor for index sidecar naming. + + Returns *None* for in-memory storage. For file-backed storage returns + a path of the form ``/_indexes//_anchor``. + """ + raise NotImplementedError + # --------------------------------------------------------------------------- # In-memory backend @@ -93,6 +131,11 @@ def close(self) -> None: class InMemoryTableStorage(TableStorage): """All arrays are plain in-memory blosc2.NDArray objects.""" + def __init__(self) -> None: + self._index_catalog: dict = {} + self._value_epoch: int = 0 + self._visibility_epoch: int = 0 + def create_column(self, name, *, dtype, shape, chunks, blocks, cparams, dparams): kwargs: dict[str, Any] = {"chunks": chunks, "blocks": blocks} if cparams is not None: @@ -122,6 +165,9 @@ def table_exists(self): def is_read_only(self): return False + def open_mode(self) -> str | None: + return None + def delete_column(self, name): raise RuntimeError("In-memory tables have no on-disk representation to mutate.") @@ -131,6 +177,28 @@ def rename_column(self, old: str, new: str): def close(self): pass + # -- Index catalog and epoch helpers ------------------------------------- + + def load_index_catalog(self) -> dict: + return copy.deepcopy(self._index_catalog) + + def save_index_catalog(self, catalog: dict) -> None: + self._index_catalog = copy.deepcopy(catalog) + + def get_epoch_counters(self) -> tuple[int, int]: + return self._value_epoch, self._visibility_epoch + + def bump_value_epoch(self) -> int: + self._value_epoch += 1 + return self._value_epoch + + def bump_visibility_epoch(self) -> int: + self._visibility_epoch += 1 + return self._visibility_epoch + + def 
index_anchor_path(self, col_name: str) -> str | None: + return None + # --------------------------------------------------------------------------- # File-backed backend @@ -207,6 +275,9 @@ def table_exists(self) -> bool: def is_read_only(self) -> bool: return self._mode == "r" + def open_mode(self) -> str | None: + return self._mode + def create_column(self, name, *, dtype, shape, chunks, blocks, cparams, dparams): kwargs: dict[str, Any] = { "chunks": chunks, @@ -298,3 +369,37 @@ def close(self) -> None: self._store.close() self._store = None self._meta = None + + # -- Index catalog and epoch helpers ------------------------------------- + + def load_index_catalog(self) -> dict: + meta = self._open_meta() + raw = meta.vlmeta.get("index_catalog") + if isinstance(raw, dict): + return copy.deepcopy(raw) + return {} + + def save_index_catalog(self, catalog: dict) -> None: + meta = self._open_meta() + meta.vlmeta["index_catalog"] = copy.deepcopy(catalog) + + def get_epoch_counters(self) -> tuple[int, int]: + meta = self._open_meta() + ve = int(meta.vlmeta.get("value_epoch", 0) or 0) + vis_e = int(meta.vlmeta.get("visibility_epoch", 0) or 0) + return ve, vis_e + + def bump_value_epoch(self) -> int: + meta = self._open_meta() + ve = int(meta.vlmeta.get("value_epoch", 0) or 0) + 1 + meta.vlmeta["value_epoch"] = ve + return ve + + def bump_visibility_epoch(self) -> int: + meta = self._open_meta() + vis_e = int(meta.vlmeta.get("visibility_epoch", 0) or 0) + 1 + meta.vlmeta["visibility_epoch"] = vis_e + return vis_e + + def index_anchor_path(self, col_name: str) -> str | None: + return os.path.join(self._root, _INDEXES_DIR, col_name, "_anchor") diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index a77a2da0..821c5a9c 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -935,7 +935,7 @@ def _open_sidecar_handle(array: blosc2.NDArray, token: str, category: str, name: raise RuntimeError("sidecar handle path is not available") handle = legacy if 
isinstance(legacy, blosc2.NDArray) else blosc2.asarray(np.asarray(legacy)) else: - handle = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + handle = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) _SIDECAR_HANDLE_CACHE[cache_key] = handle return handle @@ -1054,7 +1054,7 @@ def _compute_sorted_boundaries_from_sidecar( ) -> np.ndarray: nsegments = math.ceil(length / segment_len) boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) - sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + sidecar = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) start_value = np.empty(1, dtype=dtype) end_value = np.empty(1, dtype=dtype) for idx in range(nsegments): @@ -1301,7 +1301,7 @@ def _sidecar_storage_geometry( ) -> tuple[int, int]: if path is None: return fallback_chunk_len, fallback_block_len - sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + sidecar = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) return int(sidecar.chunks[0]), int(sidecar.blocks[0]) @@ -1385,7 +1385,7 @@ def _stream_copy_sidecar_array( blocks: tuple[int, ...], cparams: dict | None = None, ) -> None: - source = blosc2.open(str(source_path), mmap_mode=_INDEX_MMAP_MODE) + source = blosc2.open(str(source_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) blosc2.remove_urlpath(str(dest_path)) kwargs = {"chunks": chunks, "blocks": blocks, "urlpath": str(dest_path), "mode": "w"} if cparams is not None: @@ -2754,7 +2754,7 @@ def _copy_sidecar_to_temp_run( cparams: dict | None = None, ) -> Path: out_path = workdir / f"{prefix}.b2nd" - sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + sidecar = blosc2.open(path, mode="r", mmap_mode=_INDEX_MMAP_MODE) output = _create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams) chunk_len = int(sidecar.chunks[0]) for chunk_id, start in enumerate(range(0, length, chunk_len)): @@ -2814,10 +2814,10 @@ def _merge_run_pair( tracker: TempRunTracker | None = None, cparams: dict | None = None, ) -> SortedRun: - 
left_values_mm = blosc2.open(str(left.values_path), mmap_mode=_INDEX_MMAP_MODE) - left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode=_INDEX_MMAP_MODE) - right_values_mm = blosc2.open(str(right.values_path), mmap_mode=_INDEX_MMAP_MODE) - right_positions_mm = blosc2.open(str(right.positions_path), mmap_mode=_INDEX_MMAP_MODE) + left_values_mm = blosc2.open(str(left.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) + left_positions_mm = blosc2.open(str(left.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) + right_values_mm = blosc2.open(str(right.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) + right_positions_mm = blosc2.open(str(right.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE) out_values_path = workdir / f"full_merge_values_{merge_id}.b2nd" out_positions_path = workdir / f"full_merge_positions_{merge_id}.b2nd" @@ -3024,8 +3024,8 @@ def _build_full_descriptor_ooc( array, token, kind, full, final_run, dtype, persistent, tracker, cparams ) else: - sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] - positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] + sorted_values = blosc2.open(str(final_run.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] values_sidecar = _store_array_sidecar( array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams ) @@ -3257,14 +3257,14 @@ def iter_index_components(array: blosc2.NDArray, descriptor: dict): def _component_nbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: if component.path is not None: - return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).nbytes) + return int(blosc2.open(component.path, mode="r", mmap_mode=_INDEX_MMAP_MODE).nbytes) token = descriptor["token"] return int(_load_array_sidecar(array, token, component.category, component.name).nbytes) def 
_component_cbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: if component.path is not None: - return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).cbytes) + return int(blosc2.open(component.path, mode="r", mmap_mode=_INDEX_MMAP_MODE).cbytes) token = descriptor["token"] sidecar = _load_array_sidecar(array, token, component.category, component.name) kwargs = {} @@ -3804,8 +3804,8 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N array, descriptor, final_run.values_path, final_run.positions_path, final_run.length ) else: - sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] - positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] + sorted_values = blosc2.open(str(final_run.values_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mode="r", mmap_mode=_INDEX_MMAP_MODE)[:] _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) del sorted_values, positions final_run.values_path.unlink(missing_ok=True) @@ -4859,7 +4859,7 @@ def _bucket_batch_result_dtype(where_x) -> np.dtype: def _bucket_worker_source(where_x): if _supports_block_reads(where_x) and getattr(where_x, "urlpath", None) is not None: - return blosc2.open(str(where_x.urlpath), mmap_mode=_INDEX_MMAP_MODE) + return blosc2.open(str(where_x.urlpath), mode="r", mmap_mode=_INDEX_MMAP_MODE) return where_x @@ -4877,7 +4877,7 @@ def _gather_mmap_source(where_x): urlpath = str(urlpath) handle = _GATHER_MMAP_HANDLES.get(urlpath) if handle is None: - handle = blosc2.open(urlpath, mmap_mode=_INDEX_MMAP_MODE) + handle = blosc2.open(urlpath, mode="r", mmap_mode=_INDEX_MMAP_MODE) _GATHER_MMAP_HANDLES[urlpath] = handle return handle @@ -5104,17 +5104,17 @@ def process_batch(chunk_ids: np.ndarray) -> tuple[list[tuple[int, np.ndarray]], batch_values = ( values_sidecar if 
bucket.get("values_path") is None - else blosc2.open(bucket["values_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(bucket["values_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_buckets = ( bucket_sidecar if bucket.get("bucket_positions_path") is None - else blosc2.open(bucket["bucket_positions_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(bucket["bucket_positions_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_l2 = ( l2_sidecar if bucket.get("l2_path") is None - else blosc2.open(bucket["l2_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(bucket["l2_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_results = [] batch_candidate_segments = 0 @@ -5251,9 +5251,9 @@ def _partial_chunk_nav_positions_cython( def process_cython_batch(chunk_ids: np.ndarray) -> tuple[np.ndarray, int]: if len(chunk_ids) == 0: return np.empty(0, dtype=np.int64), 0 - batch_values = blosc2.open(partial["values_path"], mmap_mode=_INDEX_MMAP_MODE) - batch_positions = blosc2.open(partial["positions_path"], mmap_mode=_INDEX_MMAP_MODE) - batch_l2 = blosc2.open(partial["l2_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_values = blosc2.open(partial["values_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) + batch_positions = blosc2.open(partial["positions_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) + batch_l2 = blosc2.open(partial["l2_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) batch_l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype) batch_span_values = np.empty(chunk_len, dtype=dtype) batch_local_positions = np.empty(chunk_len, dtype=local_position_dtype) @@ -5300,17 +5300,17 @@ def process_batch(chunk_ids: np.ndarray) -> tuple[list[np.ndarray], int]: batch_values = ( values_sidecar if partial.get("values_path") is None - else blosc2.open(partial["values_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(partial["values_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_positions = ( positions_sidecar if partial.get("positions_path") is None - 
else blosc2.open(partial["positions_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(partial["positions_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_l2 = ( l2_sidecar if partial.get("l2_path") is None - else blosc2.open(partial["l2_path"], mmap_mode=_INDEX_MMAP_MODE) + else blosc2.open(partial["l2_path"], mode="r", mmap_mode=_INDEX_MMAP_MODE) ) batch_parts = [] batch_candidate_segments = 0 diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index b8410b9a..5c64eb4b 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -579,7 +579,7 @@ def save(self, **kwargs: Any) -> None: >>> # Save the LazyExpr to disk >>> expr.save(urlpath='lazy_array.b2nd', mode='w') >>> # Open and load the LazyExpr from disk - >>> disk_expr = blosc2.open('lazy_array.b2nd') + >>> disk_expr = blosc2.open('lazy_array.b2nd', mode='r') >>> disk_expr[:2] [[0. 1.25 2.5 ] [3.75 5. 6.25]] @@ -4682,7 +4682,7 @@ def open_lazyarray(array): if isinstance(v, str): v = parent_path / v try: - op = blosc2.open(v) + op = blosc2.open(v, mode="r") except FileNotFoundError: missing_ops[key] = v else: diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index fd1de79a..51f88f96 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -494,9 +494,17 @@ def reshape( return dst +def _normalize_expr_operand(value: Any) -> Any: + """Normalize foreign expression operands to the array-like object lazy ops expect.""" + raw_col = getattr(value, "_raw_col", None) + return raw_col if raw_col is not None else value + + def _check_allowed_dtypes( value: bool | int | float | str | blosc2.Array, ): + value = _normalize_expr_operand(value) + def _is_array_like(v: Any) -> bool: try: # Try Protocol runtime check first (works when possible) @@ -3426,6 +3434,7 @@ def __abs__(self) -> blosc2.LazyExpr: @is_documented_by(bitwise_and) def __and__(self, value: int | float | blosc2.Array, /) -> blosc2.LazyExpr: + value = _normalize_expr_operand(value) _check_allowed_dtypes(value) return 
blosc2.LazyExpr(new_op=(self, "&", value)) @@ -3437,6 +3446,8 @@ def __rand__(self, value: int | float | blosc2.Array, /) -> blosc2.LazyExpr: @is_documented_by(bitwise_xor) def __xor__(self, other) -> blosc2.LazyExpr: + other = _normalize_expr_operand(other) + _check_allowed_dtypes(other) return blosc2.LazyExpr(new_op=(self, "^", other)) def __ixor__(self, other) -> blosc2.LazyExpr: @@ -3447,6 +3458,8 @@ def __rxor__(self, other) -> blosc2.LazyExpr: @is_documented_by(bitwise_or) def __or__(self, other) -> blosc2.LazyExpr: + other = _normalize_expr_operand(other) + _check_allowed_dtypes(other) return blosc2.LazyExpr(new_op=(self, "|", other)) def __ior__(self, other) -> blosc2.LazyExpr: diff --git a/src/blosc2/ref.py b/src/blosc2/ref.py index c0acca8e..cc842f25 100644 --- a/src/blosc2/ref.py +++ b/src/blosc2/ref.py @@ -118,7 +118,7 @@ def open(self): import blosc2 if self.kind == "urlpath": - return blosc2.open(self.urlpath) + return blosc2.open(self.urlpath, mode="a") if self.kind == "dictstore_key": return blosc2.DictStore(self.urlpath, mode="r")[self.key] if self.kind == "c2array": diff --git a/src/blosc2/schema_compiler.py b/src/blosc2/schema_compiler.py index 1300cecd..19a3d0c1 100644 --- a/src/blosc2/schema_compiler.py +++ b/src/blosc2/schema_compiler.py @@ -207,7 +207,7 @@ def validate_annotation_matches_spec(name: str, annotation: Any, spec: SchemaSpe # --------------------------------------------------------------------------- -_RESERVED_COLUMN_NAMES: frozenset[str] = frozenset({"_meta", "_valid_rows", "_cols"}) +_RESERVED_COLUMN_NAMES: frozenset[str] = frozenset({"_meta", "_valid_rows", "_cols", "_indexes"}) def _validate_column_name(name: str) -> None: diff --git a/src/blosc2/schunk.py b/src/blosc2/schunk.py index 2fbb0bd1..5d9ecadc 100644 --- a/src/blosc2/schunk.py +++ b/src/blosc2/schunk.py @@ -9,6 +9,7 @@ import os import pathlib +import warnings import zipfile from collections import namedtuple from collections.abc import Iterator, Mapping, 
MutableMapping @@ -198,82 +199,82 @@ def __init__( # noqa: C901 ) -> None: """Create a new super-chunk, or open an existing one. - Parameters - ---------- - chunksize: int, optional - The size, in bytes, of the chunks in the super-chunk. If not provided, - it is set automatically to a reasonable value. - - data: bytes-like object, optional - The data to be split into different chunks of size :paramref:`chunksize`. - If None, the Schunk instance will be empty initially. - - kwargs: dict, optional - Storage parameters. The default values are in :class:`blosc2.Storage`. - Supported keyword arguments: - storage: :class:`blosc2.Storage` or dict - All the storage parameters that you want to use as - a :class:`blosc2.Storage` or dict instance. - cparams: :class:`blosc2.CParams` or dict - All the compression parameters that you want to use as - a :class:`blosc2.CParams` or dict instance. - dparams: :class:`blosc2.DParams` or dict - All the decompression parameters that you want to use as - a :class:`blosc2.DParams` or dict instance. - others: Any - If `storage` is not passed, all the parameters of a :class:`blosc2.Storage` - can be passed as keyword arguments. - - Examples - -------- - >>> import blosc2 - >>> import numpy as np - >>> import os.path - >>> import shutil - >>> import tempfile - >>> cparams = blosc2.CParams() - >>> dparams = blosc2.DParams() - >>> storage = blosc2.Storage(contiguous=True) - >>> schunk = blosc2.SChunk(cparams=cparams, dparams=dparams, storage=storage) - - In the following, we will write and read a super-chunk to and from disk - via memory-mapped files. - - >>> a = np.arange(3, dtype=np.int64) - >>> chunksize = a.size * a.itemsize - >>> n_chunks = 2 - >>> tmpdirname = tempfile.mkdtemp() - >>> urlpath = os.path.join(tmpdirname, 'schunk.b2frame') - - Optional: we intend to write 2 chunks of 24 bytes each, and we expect - the compressed size to be smaller than the original size. 
Therefore, we - generously set the initial size of the mapping to 48 bytes - effectively avoiding remappings. - - >>> initial_mapping_size = chunksize * n_chunks - >>> schunk_mmap = blosc2.SChunk( - ... chunksize=chunksize, - ... mmap_mode="w+", - ... initial_mapping_size=initial_mapping_size, - ... urlpath=urlpath, - ... ) - >>> schunk_mmap.append_data(a) - 1 - >>> schunk_mmap.append_data(a * 2) - 2 - - Optional: explicitly close the file and free the mapping. - - >>> del schunk_mmap - - Reading the data back again via memory-mapped files: - - >>> schunk_mmap = blosc2.open(urlpath, mmap_mode="r") - >>> np.frombuffer(schunk_mmap.decompress_chunk(0), dtype=np.int64).tolist() - [0, 1, 2] - >>> np.frombuffer(schunk_mmap.decompress_chunk(1), dtype=np.int64).tolist() - [0, 2, 4] - >>> shutil.rmtree(tmpdirname) + Parameters + ---------- + chunksize: int, optional + The size, in bytes, of the chunks in the super-chunk. If not provided, + it is set automatically to a reasonable value. + + data: bytes-like object, optional + The data to be split into different chunks of size :paramref:`chunksize`. + If None, the Schunk instance will be empty initially. + + kwargs: dict, optional + Storage parameters. The default values are in :class:`blosc2.Storage`. + Supported keyword arguments: + storage: :class:`blosc2.Storage` or dict + All the storage parameters that you want to use as + a :class:`blosc2.Storage` or dict instance. + cparams: :class:`blosc2.CParams` or dict + All the compression parameters that you want to use as + a :class:`blosc2.CParams` or dict instance. + dparams: :class:`blosc2.DParams` or dict + All the decompression parameters that you want to use as + a :class:`blosc2.DParams` or dict instance. + others: Any + If `storage` is not passed, all the parameters of a :class:`blosc2.Storage` + can be passed as keyword arguments. 
+ + Examples + -------- + >>> import blosc2 + >>> import numpy as np + >>> import os.path + >>> import shutil + >>> import tempfile + >>> cparams = blosc2.CParams() + >>> dparams = blosc2.DParams() + >>> storage = blosc2.Storage(contiguous=True) + >>> schunk = blosc2.SChunk(cparams=cparams, dparams=dparams, storage=storage) + + In the following, we will write and read a super-chunk to and from disk + via memory-mapped files. + + >>> a = np.arange(3, dtype=np.int64) + >>> chunksize = a.size * a.itemsize + >>> n_chunks = 2 + >>> tmpdirname = tempfile.mkdtemp() + >>> urlpath = os.path.join(tmpdirname, 'schunk.b2frame') + + Optional: we intend to write 2 chunks of 24 bytes each, and we expect + the compressed size to be smaller than the original size. Therefore, we + generously set the initial size of the mapping to 48 bytes + effectively avoiding remappings. + + >>> initial_mapping_size = chunksize * n_chunks + >>> schunk_mmap = blosc2.SChunk( + ... chunksize=chunksize, + ... mmap_mode="w+", + ... initial_mapping_size=initial_mapping_size, + ... urlpath=urlpath, + ... ) + >>> schunk_mmap.append_data(a) + 1 + >>> schunk_mmap.append_data(a * 2) + 2 + + Optional: explicitly close the file and free the mapping. 
+ + >>> del schunk_mmap + + Reading the data back again via memory-mapped files: + + >>> schunk_mmap = blosc2.open(urlpath, mode="r", mmap_mode="r") + >>> np.frombuffer(schunk_mmap.decompress_chunk(0), dtype=np.int64).tolist() + [0, 1, 2] + >>> np.frombuffer(schunk_mmap.decompress_chunk(1), dtype=np.int64).tolist() + [0, 2, 4] + >>> shutil.rmtree(tmpdirname) """ # Check only allowed kwarg are passed allowed_kwargs = [ @@ -1634,13 +1635,17 @@ def _set_default_dparams(kwargs): def process_opened_object(res): meta = getattr(res, "schunk", res).meta if "proxy-source" in meta: + proxy_cache = res + cache_schunk = getattr(res, "schunk", res) + if getattr(cache_schunk, "urlpath", None) is not None and getattr(cache_schunk, "mode", None) == "r": + proxy_cache = blosc2_ext.open(cache_schunk.urlpath, "a", 0) proxy_src = meta["proxy-source"] if proxy_src["local_abspath"] is not None: - src = blosc2.open(proxy_src["local_abspath"]) - return blosc2.Proxy(src, _cache=res) + src = blosc2.open(proxy_src["local_abspath"], mode="a") + return blosc2.Proxy(src, _cache=proxy_cache) elif proxy_src["urlpath"] is not None: src = blosc2.C2Array(proxy_src["urlpath"][0], proxy_src["urlpath"][1], proxy_src["urlpath"][2]) - return blosc2.Proxy(src, _cache=res) + return blosc2.Proxy(src, _cache=proxy_cache) elif not proxy_src["caterva2_env"]: raise RuntimeError("Could not find the source when opening a Proxy") @@ -1706,8 +1711,22 @@ def _open_treestore_root_object(store, urlpath, mode): return store +def _finalize_special_open(special, urlpath, mode): + if special is None: + return None + if isinstance(special, blosc2.TreeStore): + return _open_treestore_root_object(special, urlpath, mode) + return special + + +_OPEN_MODE_SENTINEL = object() + + def open( - urlpath: str | pathlib.Path | blosc2.URLPath, mode: str = "a", offset: int = 0, **kwargs: dict + urlpath: str | pathlib.Path | blosc2.URLPath, + mode: str = _OPEN_MODE_SENTINEL, + offset: int = 0, + **kwargs: dict, ) -> ( blosc2.SChunk | 
blosc2.NDArray @@ -1733,7 +1752,10 @@ def open( mode: str, optional Persistence mode: 'r' means read only (must exist); 'a' means read/write (create if it doesn't exist); - 'w' means create (overwrite if it exists). Default is 'a'. + 'w' means create (overwrite if it exists). Defaults to 'a' for now, + but will change to 'r' in a future release. Pass ``mode='a'`` + explicitly to preserve writable behavior, or ``mode='r'`` for + read-only access. offset: int, optional An offset in the file where super-chunk or array data is located (e.g. in a file containing several such objects). @@ -1793,7 +1815,7 @@ def open( >>> # Create SChunk and append data >>> schunk = blosc2.SChunk(chunksize=chunksize, data=data.tobytes(), storage=storage) >>> # Open SChunk - >>> sc_open = blosc2.open(urlpath=urlpath) + >>> sc_open = blosc2.open(urlpath=urlpath, mode="r") >>> for i in range(nchunks): ... dest = np.empty(nelem // nchunks, dtype=data.dtype) ... schunk.decompress_chunk(i, dest) @@ -1808,12 +1830,25 @@ def open( To open the same schunk memory-mapped, we simply need to pass the `mmap_mode` parameter: - >>> sc_open_mmap = blosc2.open(urlpath=urlpath, mmap_mode="r") + >>> sc_open_mmap = blosc2.open(urlpath=urlpath, mode="r", mmap_mode="r") >>> sc_open.nchunks == sc_open_mmap.nchunks True >>> all(sc_open.decompress_chunk(i, dest1) == sc_open_mmap.decompress_chunk(i, dest1) for i in range(nchunks)) True """ + # Resolve the sentinel before URLPath check so we can raise the correct + # error without also triggering the deprecation warning for invalid calls. + if mode is _OPEN_MODE_SENTINEL: + # TODO: remove the sentinel/FutureWarning path once blosc2.open() defaults to mode="r". + warnings.warn( + "blosc2.open() currently defaults to mode='a', but this will change " + "to mode='r' in a future release. 
Pass mode='a' explicitly to keep " + "writable behavior, or mode='r' for read-only access.", + FutureWarning, + stacklevel=2, + ) + mode = "a" + if isinstance(urlpath, blosc2.URLPath): if mode != "r" or offset != 0 or kwargs != {}: raise NotImplementedError( @@ -1829,9 +1864,8 @@ def open( # more expensive store probing when that fails. if urlpath.endswith((".b2d", ".b2z", ".b2e")): special = _open_special_store(urlpath, mode, offset, **kwargs) + special = _finalize_special_open(special, urlpath, mode) if special is not None: - if isinstance(special, blosc2.TreeStore): - return _open_treestore_root_object(special, urlpath, mode) return special regular_exc = None @@ -1845,11 +1879,12 @@ def open( return process_opened_object(res) resolved_urlpath = _resolve_store_alias(urlpath) - special_path = resolved_urlpath if resolved_urlpath != urlpath or not os.path.exists(urlpath) else urlpath + special_path = ( + resolved_urlpath if resolved_urlpath != urlpath or not os.path.exists(urlpath) else urlpath + ) special = _open_special_store(special_path, mode, offset, **kwargs) + special = _finalize_special_open(special, special_path, mode) if special is not None: - if isinstance(special, blosc2.TreeStore): - return _open_treestore_root_object(special, special_path, mode) return special if regular_exc is not None: diff --git a/tests/ctable/test_column.py b/tests/ctable/test_column.py index 46cfca28..838d4c75 100644 --- a/tests/ctable/test_column.py +++ b/tests/ctable/test_column.py @@ -675,5 +675,74 @@ def test_repr_is_single_line(): assert "\n" not in repr(t) +def test_column_repr_shows_preview_values(): + t = CTable(Row, new_data=DATA20) + r = repr(t["id"][:]) + assert "Column('id'" in r + assert "dtype=int64" in r + assert "len=20" in r + assert "values=[0, 1, 2" in r + assert "..." 
in r + + +def test_info_omits_capacity_and_read_only_for_in_memory_table(): + t = CTable(Row, new_data=DATA20) + info = repr(t.info) + assert "capacity" not in info + assert "read_only" not in info + assert "open_mode" not in info + + +def test_info_shows_open_mode_for_persistent_table(tmp_path): + path = str(tmp_path / "table.b2d") + t = CTable(Row, new_data=DATA20, urlpath=path, mode="w") + t.close() + + opened = CTable.open(path) + info = repr(opened.info) + assert "capacity" not in info + assert "read_only" not in info + assert "open_mode : r" in info + opened.close() + + +def test_info_schema_expands_unicode_dtype_labels(): + t = CTable(StrRow, new_data=[("alpha",), ("beta",)]) + info = repr(t.info) + assert "U16 (Unicode, max 16 chars)" in info + + +def test_info_valid_rows_mask_only_reports_cbytes(): + t = CTable(Row, new_data=DATA20) + info = repr(t.info) + assert "valid_rows_mask : cbytes=" in info + assert "valid_rows_mask : nbytes=" not in info + + +def test_info_indexes_only_report_cbytes(tmp_path): + @dataclass + class IndexedRow: + id: int = blosc2.field(blosc2.int32()) + active: bool = blosc2.field(blosc2.bool(), default=True) + + data = [(i, i % 2 == 0) for i in range(32)] + path = str(tmp_path / "indexed.b2d") + t = CTable(IndexedRow, new_data=data, urlpath=path, mode="w") + t.create_index("id", kind=blosc2.IndexKind.FULL) + + info = repr(t.info) + index_block = info.split("indexes :", 1)[1] + assert "cbytes=" in index_block + assert "nbytes=" not in index_block + assert "cratio=" not in index_block + + +def test_info_cratio_uses_one_decimal_with_suffix(): + t = CTable(Row, new_data=DATA20) + info = repr(t.info) + assert "cratio :" in info + assert "x" in next(line for line in info.splitlines() if line.startswith("cratio")) + + if __name__ == "__main__": pytest.main(["-v", __file__]) diff --git a/tests/ctable/test_ctable_indexing.py b/tests/ctable/test_ctable_indexing.py new file mode 100644 index 00000000..eb643810 --- /dev/null +++ 
b/tests/ctable/test_ctable_indexing.py @@ -0,0 +1,438 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +"""Tests for CTable persistent and in-memory indexing.""" + +import dataclasses +import shutil +import tempfile +from pathlib import Path + +import numpy as np +import pytest + +import blosc2 + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +@dataclasses.dataclass +class Row: + id: int = blosc2.field(blosc2.int32()) + value: float = blosc2.field(blosc2.float64()) + category: int = blosc2.field(blosc2.int32()) + + +def _make_table(n=100, persistent_path=None): + """Return a CTable with *n* rows, optionally persistent at *persistent_path*.""" + if persistent_path is not None: + t = blosc2.CTable(Row, urlpath=persistent_path, mode="w") + else: + t = blosc2.CTable(Row) + for i in range(n): + t.append([i, float(i) * 1.5, i % 5]) + return t + + +# --------------------------------------------------------------------------- +# In-memory table tests +# --------------------------------------------------------------------------- + + +def test_create_index_in_memory(): + t = _make_table(50) + idx = t.create_index("id") + assert idx is not None + assert idx.col_name == "id" + assert not idx.stale + assert len(t.indexes) == 1 + assert t.indexes[0].col_name == "id" + + +def test_create_index_in_memory_duplicate_raises(): + t = _make_table(20) + t.create_index("id") + with pytest.raises(ValueError, match="Index already exists"): + t.create_index("id") + + +def test_drop_index_in_memory(): + t = _make_table(20) + t.create_index("id") + t.drop_index("id") + assert len(t.indexes) == 0 + with pytest.raises(KeyError): + t.index("id") + + 
+def test_drop_nonexistent_index_raises(): + t = _make_table(20) + with pytest.raises(KeyError, match="No index found"): + t.drop_index("id") + + +def test_drop_indexed_column_clears_catalog(): + t = _make_table(20) + t.create_index("id") + t.drop_column("id") + assert [idx.col_name for idx in t.indexes] == [] + with pytest.raises(KeyError, match="No index found"): + t.index("id") + + +def test_where_with_index_matches_scan_in_memory(): + t = _make_table(200) + t.create_index("id") + result_idx = t.where(t["id"] > 100) + # Drop index to force scan + t.drop_index("id") + result_scan = t.where(t["id"] > 100) + ids_idx = sorted(int(v) for v in result_idx["id"].to_numpy()) + ids_scan = sorted(int(v) for v in result_scan["id"].to_numpy()) + assert ids_idx == ids_scan + + +def test_bool_column_composes_naturally_in_where(): + @dataclasses.dataclass + class BoolRow: + sensor_id: int = blosc2.field(blosc2.int32()) + region: str = blosc2.field(blosc2.string(max_length=8), default="") + active: bool = blosc2.field(blosc2.bool(), default=True) + + t = blosc2.CTable(BoolRow) + for i in range(20): + t.append([i, "north" if i % 4 == 0 else "south", i % 2 == 0]) + + result = t.where((t["sensor_id"] >= 8) & t["active"] & (t["region"] == "north")) + assert sorted(int(v) for v in result["sensor_id"].to_numpy()) == [8, 12, 16] + + result_bare = t.where(t["active"]) + assert sorted(int(v) for v in result_bare["sensor_id"].to_numpy()) == list(range(0, 20, 2)) + + +def test_rebuild_index_in_memory(): + t = _make_table(30) + t.create_index("id") + t.append([999, 999.0, 4]) # marks stale + assert t.index("id").stale + idx2 = t.rebuild_index("id") + assert not idx2.stale + result = t.where(t["id"] == 999) + assert len(result) == 1 + + +def test_stale_on_append_in_memory(): + t = _make_table(20) + t.create_index("id") + t.append([100, 100.0, 0]) + assert t.index("id").stale + + +def test_stale_on_extend_in_memory(): + t = _make_table(20) + t.create_index("id") + t.extend([[101, 101.0, 0], 
[102, 102.0, 1]]) + assert t.index("id").stale + + +def test_stale_on_column_setitem_in_memory(): + t = _make_table(20) + t.create_index("id") + t["id"][0] = 999 + assert t.index("id").stale + + +def test_stale_on_column_assign_in_memory(): + t = _make_table(20) + t.create_index("id") + t["id"].assign(np.arange(20, dtype=np.int32)) + assert t.index("id").stale + + +def test_delete_bumps_visibility_epoch_not_stale_in_memory(): + t = _make_table(20) + t.create_index("id") + t.delete(0) + idx = t.index("id") + # delete should NOT mark stale (only bumps visibility_epoch) + assert not idx.stale + _, vis_e = t._storage.get_epoch_counters() + assert vis_e >= 1 + + +def test_stale_fallback_to_scan_in_memory(): + t = _make_table(50) + t.create_index("id") + t.append([200, 200.0, 0]) # marks stale + # Query should still work (falls back to scan) + result = t.where(t["id"] > 40) + ids = sorted(int(v) for v in result["id"].to_numpy()) + assert 200 in ids + assert 41 in ids + + +def test_compact_index_in_memory(): + t = _make_table(50, persistent_path=None) + t.create_index("id", kind=blosc2.IndexKind.FULL) + # compact_index should not raise for full indexes + t.compact_index("id") + + +def test_multi_column_conjunction_uses_multiple_indexes_in_memory(): + t = _make_table(200) + t.create_index("id", kind=blosc2.IndexKind.FULL) + t.create_index("category", kind=blosc2.IndexKind.FULL) + expr = (t["id"] >= 50) & (t["id"] < 120) & (t["category"] == 3) + result_idx = t.where(expr) + t.drop_index("id") + t.drop_index("category") + result_scan = t.where(expr) + ids_idx = sorted(int(v) for v in result_idx["id"].to_numpy()) + ids_scan = sorted(int(v) for v in result_scan["id"].to_numpy()) + assert ids_idx == ids_scan + + +# --------------------------------------------------------------------------- +# Persistent table tests +# --------------------------------------------------------------------------- + + +@pytest.fixture +def tmpdir(): + d = tempfile.mkdtemp() + yield Path(d) + 
shutil.rmtree(d, ignore_errors=True) + + +def test_create_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + idx = t.create_index("id") + assert not idx.stale + # Sidecar directory must exist + index_dir = Path(path) / "_indexes" / "id" + assert index_dir.exists() + # At least one .b2nd sidecar file + sidecars = list(index_dir.glob("**/*.b2nd")) + assert sidecars, "No sidecar .b2nd files found" + + +def test_catalog_survives_reopen(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + del t # close + + t2 = blosc2.open(path, mode="r") + idxs = t2.indexes + assert len(idxs) == 1 + assert idxs[0].col_name == "id" + assert not idxs[0].stale + + +def test_where_with_index_matches_scan_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(200, persistent_path=path) + t.create_index("id") + result_idx = t.where(t["id"] > 150) + + t.drop_index("id") + result_scan = t.where(t["id"] > 150) + + ids_idx = sorted(int(v) for v in result_idx["id"].to_numpy()) + ids_scan = sorted(int(v) for v in result_scan["id"].to_numpy()) + assert ids_idx == ids_scan + + +def test_drop_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + t.drop_index("id") + assert len(t.indexes) == 0 + index_dir = Path(path) / "_indexes" / "id" + # After drop, index dir should be gone (or empty) + sidecars = list(index_dir.glob("**/*.b2nd")) if index_dir.exists() else [] + assert sidecars == [] + + +def test_drop_index_persistent_catalog_cleared(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + t.create_index("id") + t.drop_index("id") + del t + + t2 = blosc2.open(path, mode="r") + assert len(t2.indexes) == 0 + + +def test_drop_indexed_column_removes_persistent_sidecars(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(30, persistent_path=path) + 
t.create_index("id") + t.drop_column("id") + assert len(t.indexes) == 0 + assert not (Path(path) / "_indexes" / "id").exists() + + +def test_rebuild_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + t.create_index("id") + t.append([500, 750.0, 2]) # marks stale + assert t.index("id").stale + idx2 = t.rebuild_index("id") + assert not idx2.stale + result = t.where(t["id"] == 500) + assert len(result) == 1 + + +def test_compact_index_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(50, persistent_path=path) + t.create_index("id", kind=blosc2.IndexKind.FULL) + t.compact_index("id") + # Query should still work after compact + result = t.where(t["id"] > 40) + ids = sorted(int(v) for v in result["id"].to_numpy()) + expected = list(range(41, 50)) + assert ids == expected + + +def test_stale_on_append_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(20, persistent_path=path) + t.create_index("id") + t.append([200, 300.0, 1]) + assert t.index("id").stale + + +def test_stale_persists_after_reopen(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(20, persistent_path=path) + t.create_index("id") + t.append([200, 300.0, 1]) # marks stale + del t + + t2 = blosc2.open(path, mode="r") + assert t2.index("id").stale + + +def test_delete_bumps_visibility_epoch_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(20, persistent_path=path) + t.create_index("id") + t.delete(0) + idx = t.index("id") + # delete should NOT mark index stale + assert not idx.stale + _, vis_e = t._storage.get_epoch_counters() + assert vis_e >= 1 + + +def test_query_after_reopen_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(100, persistent_path=path) + t.create_index("id") + del t + + t2 = blosc2.open(path, mode="r") + result = t2.where(t2["id"] > 90) + ids = sorted(int(v) for v in result["id"].to_numpy()) + assert ids == list(range(91, 100)) + + 
+def test_rename_indexed_column_rebuilds_catalog_persistent(tmpdir): + path = str(tmpdir / "table.b2d") + t = _make_table(40, persistent_path=path) + t.create_index("id") + t.rename_column("id", "newid") + assert [idx.col_name for idx in t.indexes] == ["newid"] + assert not (Path(path) / "_indexes" / "id").exists() + assert (Path(path) / "_indexes" / "newid").exists() + result = t.where(t["newid"] > 35) + assert sorted(int(v) for v in result["newid"].to_numpy()) == [36, 37, 38, 39] + + +# --------------------------------------------------------------------------- +# View tests +# --------------------------------------------------------------------------- + + +def test_view_cannot_create_index(): + t = _make_table(20) + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.create_index("id") + + +def test_view_cannot_drop_index(): + t = _make_table(20) + t.create_index("id") + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.drop_index("id") + + +def test_view_cannot_rebuild_index(): + t = _make_table(20) + t.create_index("id") + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.rebuild_index("id") + + +def test_view_cannot_compact_index(): + t = _make_table(20) + t.create_index("id") + view = t.where(t["id"] > 5) + with pytest.raises(ValueError, match="view"): + view.compact_index("id") + + +def test_view_query_uses_root_index(): + t = _make_table(200) + t.create_index("id") + # Query on the original table + result_direct = t.where(t["id"] > 180) + ids_direct = sorted(int(v) for v in result_direct["id"].to_numpy()) + assert ids_direct == list(range(181, 200)) + + +def test_malformed_catalog_entry_raises_clear_error(): + t = _make_table(20) + t._storage.save_index_catalog({"id": {"kind": "bucket"}}) + with pytest.raises(ValueError, match="Malformed index metadata"): + t.where(t["id"] > 5) + + +# --------------------------------------------------------------------------- 
+# index() and indexes property +# --------------------------------------------------------------------------- + + +def test_index_lookup_missing_raises(): + t = _make_table(10) + with pytest.raises(KeyError): + t.index("nonexistent") + + +def test_indexes_empty_on_new_table(): + t = _make_table(10) + assert t.indexes == [] + + +def test_indexes_multiple_columns(): + t = _make_table(30) + t.create_index("id") + t.create_index("category") + assert len(t.indexes) == 2 + col_names = {idx.col_name for idx in t.indexes} + assert col_names == {"id", "category"} diff --git a/tests/ndarray/test_c2array_expr.py b/tests/ndarray/test_c2array_expr.py index e65d1805..c4b778f1 100644 --- a/tests/ndarray/test_c2array_expr.py +++ b/tests/ndarray/test_c2array_expr.py @@ -186,7 +186,7 @@ def test_save(cat2_context): for op in ops: del op del expr - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") res = expr.compute() assert res.dtype == np.float64 np.testing.assert_allclose(res[:], nres, rtol=tol, atol=tol) diff --git a/tests/ndarray/test_dsl_kernels.py b/tests/ndarray/test_dsl_kernels.py index ac37beeb..8261536f 100644 --- a/tests/ndarray/test_dsl_kernels.py +++ b/tests/ndarray/test_dsl_kernels.py @@ -872,7 +872,7 @@ def _save_reload_compute(kernel, inputs_np, inputs_b2, dtype, urlpaths, extra_kw """Save a LazyUDF backed by *kernel*, reload it, and return (reloaded_expr, result).""" lazy = blosc2.lazyudf(kernel, inputs_b2, dtype=dtype, **(extra_kwargs or {})) lazy.save(urlpath=urlpaths["lazy"]) - reloaded = blosc2.open(urlpaths["lazy"]) + reloaded = blosc2.open(urlpaths["lazy"], mode="r") return reloaded, reloaded.compute() @@ -951,7 +951,7 @@ def test_dsl_save_getitem(tmp_path): lazy = blosc2.lazyudf(kernel_save_simple, (a, b), dtype=np.float64) lazy.save(urlpath=str(tmp_path / "lazy.b2nd")) - reloaded = blosc2.open(str(tmp_path / "lazy.b2nd")) + reloaded = blosc2.open(str(tmp_path / "lazy.b2nd"), mode="r") assert isinstance(reloaded.func, DSLKernel) expected = 
(na + nb) ** 2 @@ -970,7 +970,7 @@ def test_dsl_save_input_names_match(tmp_path): lazy = blosc2.lazyudf(kernel_save_simple, (a, b), dtype=np.float64) lazy.save(urlpath=str(tmp_path / "lazy.b2nd")) - reloaded = blosc2.open(str(tmp_path / "lazy.b2nd")) + reloaded = blosc2.open(str(tmp_path / "lazy.b2nd"), mode="r") assert isinstance(reloaded.func, DSLKernel) assert reloaded.func.input_names == ["x", "y"] diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 3faa1daf..77f03232 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -618,8 +618,8 @@ def test_persistent_chunk_local_sidecars_use_cparams(tmp_path, kind): meta = descriptor["bucket"] if kind == "bucket" else descriptor["partial"] aux_key = "bucket_positions_path" if kind == "bucket" else "positions_path" - values_sidecar = blosc2.open(meta["values_path"]) - aux_sidecar = blosc2.open(meta[aux_key]) + values_sidecar = blosc2.open(meta["values_path"], mode="r") + aux_sidecar = blosc2.open(meta[aux_key], mode="r") for sidecar in (values_sidecar, aux_sidecar): assert sidecar.cparams.codec == blosc2.Codec.LZ4 @@ -1102,9 +1102,9 @@ def test_compact_full_index_clears_runs_and_preserves_results(tmp_path): assert reopened.indexes[0]["full"]["runs"] == [] for values_path, positions_path in run_paths: with pytest.raises(FileNotFoundError): - blosc2.open(values_path) + blosc2.open(values_path, mode="r") with pytest.raises(FileNotFoundError): - blosc2.open(positions_path) + blosc2.open(positions_path, mode="r") expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", reopened.fields).where(reopened) explained = expr.explain() @@ -1150,8 +1150,8 @@ def test_forced_ooc_full_index_merge_preserves_sorted_sidecars(monkeypatch, tmp_ descriptor = arr.create_index(kind=blosc2.IndexKind.FULL) meta = descriptor["full"] - values_sidecar = blosc2.open(meta["values_path"]) - positions_sidecar = blosc2.open(meta["positions_path"]) + values_sidecar = blosc2.open(meta["values_path"], 
mode="r") + positions_sidecar = blosc2.open(meta["positions_path"], mode="r") np.testing.assert_array_equal(values_sidecar[:], np.sort(data, kind="stable")) np.testing.assert_array_equal(values_sidecar[:], data[positions_sidecar[:]]) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 20dba823..6525a07d 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -466,7 +466,7 @@ def test_arctan2_pow(urlpath, shape_fixture, dtype_fixture, function, value1, va expr = blosc2.LazyExpr(new_op=(a1, function, a2)) if urlpath is not None: expr.save(urlpath=urlpath_save) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr if function == "**": @@ -480,7 +480,7 @@ def test_arctan2_pow(urlpath, shape_fixture, dtype_fixture, function, value1, va expr = blosc2.LazyExpr(new_op=(a1, function, value2)) if urlpath is not None: expr.save(urlpath=urlpath_save) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr if function == "**": @@ -496,7 +496,7 @@ def test_arctan2_pow(urlpath, shape_fixture, dtype_fixture, function, value1, va expr = blosc2.LazyExpr(new_op=(value1, function, a2)) if urlpath is not None: expr.save(urlpath=urlpath_save) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr if function == "**": @@ -716,7 +716,7 @@ def test_save(): ) np.testing.assert_allclose(res[:], nres, rtol=tol, atol=tol) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") # After opening, check that a lazy expression does have an array # and schunk attributes. This is to allow the .info() method to work. 
assert hasattr(expr, "array") is True @@ -735,7 +735,7 @@ def test_save(): var_dict = {"a1": ops[0], "a2": ops[1], "a3": ops[2], "a4": ops[3], "x": x} lazy_expr = eval(expr, var_dict) lazy_expr.save(urlpath=urlpath_save2) - expr = blosc2.open(urlpath_save2) + expr = blosc2.open(urlpath_save2, mode="r") assert expr.array.dtype == np.float64 res = expr.compute() nres = ne_evaluate("na1 / na2 + na2 - na3 * na4**3") @@ -759,7 +759,7 @@ def test_save_unsafe(): expr.save(urlpath=urlpath) disk_arrays.append(urlpath) - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") # Replace expression by a (potentially) unsafe expression expr.expression = "import os; os.system('touch /tmp/unsafe')" with pytest.raises(ValueError) as excinfo: @@ -807,7 +807,7 @@ def test_save_functions(function, dtype_fixture, shape_fixture): expr = blosc2.LazyExpr(new_op=(a1, function, None)) expr.save(urlpath=urlpath_save) del expr - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() # Evaluate using NumExpr @@ -823,7 +823,7 @@ def test_save_functions(function, dtype_fixture, shape_fixture): res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol) @@ -847,7 +847,7 @@ def test_save_contains(values): # Construct the lazy expression expr_lazy = blosc2.LazyExpr(new_op=(a1_blosc, "contains", value2)) expr_lazy.save(urlpath=urlpath_save) - expr_lazy = blosc2.open(urlpath_save) + expr_lazy = blosc2.open(urlpath_save, mode="r") # Evaluate using NumExpr expr_numexpr = f"{'contains'}(a1, value2)" res_numexpr = ne_evaluate(expr_numexpr) @@ -857,7 +857,7 @@ def test_save_contains(values): # Construct the lazy expression expr_lazy = blosc2.LazyExpr(new_op=(a1_blosc, "contains", a2_blosc)) expr_lazy.save(urlpath=urlpath_save) - 
expr_lazy = blosc2.open(urlpath_save) + expr_lazy = blosc2.open(urlpath_save, mode="r") # Evaluate using NumExpr res_numexpr = ne_evaluate("contains(a2, a1)") else: # ("str", "NDArray") @@ -867,7 +867,7 @@ def test_save_contains(values): # Construct the lazy expression expr_lazy = blosc2.LazyExpr(new_op=(value1, "contains", a2_blosc)) expr_lazy.save(urlpath=urlpath_save) - expr_lazy = blosc2.open(urlpath_save) + expr_lazy = blosc2.open(urlpath_save, mode="r") # Evaluate using NumExpr res_numexpr = ne_evaluate("contains(value1, a2)") res_lazyexpr = expr_lazy.compute() @@ -901,7 +901,7 @@ def test_save_many_functions(dtype_fixture, shape_fixture): res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol, atol=atol) - expr = blosc2.open(urlpath_save) + expr = blosc2.open(urlpath_save, mode="r") res_lazyexpr = expr.compute() np.testing.assert_allclose(res_lazyexpr[:], res_numexpr, rtol=rtol, atol=atol) @@ -946,7 +946,7 @@ def test_save_constructor(disk, shape, dtype, constructor): a = b2func(lshape, dtype=dtype, shape=shape, urlpath=urlpath, mode="w") expr = f"a + {constructor}({lshape}, dtype={dtype}, shape={shape}) + 1" if disk: - a = blosc2.open(urlpath) + a = blosc2.open(urlpath, mode="r") npfunc = getattr(np, constructor) if constructor == "linspace": na = npfunc(0, 10, lshape, dtype=dtype).reshape(shape) @@ -964,7 +964,7 @@ def test_save_constructor(disk, shape, dtype, constructor): assert lexpr.shape == a.shape if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() nres = na + na + 1 assert np.allclose(res[()], nres) @@ -986,7 +986,7 @@ def test_save_2_constructors(shape, disk): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() na = np.arange(lshape).reshape(shape) nb = np.ones(shape) @@ -1013,7 +1013,7 @@ def 
test_save_constructor_reshape(shape, disk): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() na = np.arange(lshape).reshape(shape) nb = np.ones(shape) @@ -1037,7 +1037,7 @@ def test_save_2equal_constructors(shape, disk): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() na = np.ones(shape, dtype=np.int8) nb = np.ones(shape) @@ -1360,9 +1360,9 @@ def test_fill_disk_operands(chunks, blocks, disk, fill_value): b = blosc2.zeros((N, N), urlpath=bpath, mode="w", chunks=chunks, blocks=blocks) c = blosc2.zeros((N, N), urlpath=cpath, mode="w", chunks=chunks, blocks=blocks) if disk: - a = blosc2.open("a.b2nd") - b = blosc2.open("b.b2nd") - c = blosc2.open("c.b2nd") + a = blosc2.open("a.b2nd", mode="r") + b = blosc2.open("b.b2nd", mode="r") + c = blosc2.open("c.b2nd", mode="r") expr = ((a**3 + blosc2.sin(c * 2)) < b) & ~(c > 0) @@ -1709,7 +1709,7 @@ def test_missing_operator(): blosc2.remove_urlpath("b.b2nd") # Re-open the lazy expression with pytest.raises(blosc2.exceptions.MissingOperands) as excinfo: - blosc2.open("expr.b2nd") + blosc2.open("expr.b2nd", mode="r") # Check that some operand is missing assert "a" not in excinfo.value.missing_ops @@ -1746,7 +1746,7 @@ def test_save_dictstore_operands(tmp_path): "b": {"kind": "dictstore_key", "version": 1, "urlpath": str(store_path), "key": "/b"}, } - restored = blosc2.open(expr_path) + restored = blosc2.open(expr_path, mode="r") assert isinstance(restored, blosc2.LazyExpr) np.testing.assert_array_equal(restored[:], expected) @@ -1762,7 +1762,7 @@ def test_save_proxy_operands_reopen_default_mode(tmp_path): expr = proxy + proxy expr.save(str(expr_path)) - restored = blosc2.open(str(expr_path)) + restored = blosc2.open(str(expr_path), mode="r") assert isinstance(restored, blosc2.LazyExpr) 
np.testing.assert_array_equal(restored[:], np.arange(10, dtype=np.int64) * 2) @@ -1780,13 +1780,13 @@ def test_lazyexpr_vlmeta_in_memory_and_persisted(tmp_path): expr_path = tmp_path / "expr_vlmeta.b2nd" expr.save(str(expr_path)) - restored = blosc2.open(str(expr_path)) + restored = blosc2.open(str(expr_path), mode="r") assert restored.vlmeta["name"] == "sum" assert restored.vlmeta["config"] == {"scale": 1} restored.vlmeta["note"] = "persisted" - reopened = blosc2.open(str(expr_path)) + reopened = blosc2.open(str(expr_path), mode="r") assert reopened.vlmeta["note"] == "persisted" np.testing.assert_array_equal(reopened[:], np.arange(5, dtype=np.int64) * 2) @@ -1862,19 +1862,19 @@ def test_chain_persistentexpressions(): le1_ = blosc2.lazyexpr("a ** 3 + sin(a ** 2)", {"a": a}) le1_.save("expr1.b2nd", mode="w") - myle1 = blosc2.open("expr1.b2nd") + myle1 = blosc2.open("expr1.b2nd", mode="r") le2_ = blosc2.lazyexpr("(le1 < c)", {"le1": myle1, "c": c}) le2_.save("expr2.b2nd", mode="w") - myle2 = blosc2.open("expr2.b2nd") + myle2 = blosc2.open("expr2.b2nd", mode="r") le3_ = blosc2.lazyexpr("(b < 0)", {"b": b}) le3_.save("expr3.b2nd", mode="w") - myle3 = blosc2.open("expr3.b2nd") + myle3 = blosc2.open("expr3.b2nd", mode="r") le4_ = blosc2.lazyexpr("(le2 & le3)", {"le2": myle2, "le3": myle3}) le4_.save("expr4.b2nd", mode="w") - myle4 = blosc2.open("expr4.b2nd") + myle4 = blosc2.open("expr4.b2nd", mode="r") assert (myle4[:] == le4[:]).all() # Remove files diff --git a/tests/ndarray/test_lazyudf.py b/tests/ndarray/test_lazyudf.py index 3aa76cb6..ab0cd814 100644 --- a/tests/ndarray/test_lazyudf.py +++ b/tests/ndarray/test_lazyudf.py @@ -483,7 +483,7 @@ def test_save_ludf(): expr.save(urlpath=urlpath) del expr - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") assert isinstance(expr, blosc2.LazyUDF) res_lazyexpr = expr.compute() np.testing.assert_array_equal(res_lazyexpr[:], npc) @@ -493,7 +493,7 @@ def test_save_ludf(): expr = blosc2.lazyudf(udf1p_numba, 
(array,), np.float64) expr.save(urlpath=urlpath) del expr - expr = blosc2.open(urlpath) + expr = blosc2.open(urlpath, mode="r") assert isinstance(expr, blosc2.LazyUDF) res_lazyexpr = expr.compute() np.testing.assert_array_equal(res_lazyexpr[:], npc) @@ -511,7 +511,7 @@ def test_lazyudf_vlmeta_roundtrip(tmp_path): expr.vlmeta["attrs"] = {"version": 1} expr.save(urlpath=str(expr_path)) - restored = blosc2.open(str(expr_path)) + restored = blosc2.open(str(expr_path), mode="r") assert isinstance(restored, blosc2.LazyUDF) assert restored.vlmeta["name"] == "increment" diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index 5a4f376d..c21c774a 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -77,7 +77,7 @@ def test_shape_with_zeros(shape, urlpath): data = np.zeros(shape, dtype="int32") ndarray = blosc2.asarray(data, urlpath=urlpath, mode="w") if urlpath is not None: - ndarray = blosc2.open(urlpath) + ndarray = blosc2.open(urlpath, mode="r") assert isinstance(ndarray, blosc2.NDArray) assert ndarray.shape == shape assert ndarray.size == 0 @@ -502,11 +502,11 @@ def test_argsort_scalar(): def test_save(): a = blosc2.arange(0, 10, 1, dtype="i4", shape=(10,)) blosc2.save(a, "test.b2nd") - c = blosc2.open("test.b2nd") + c = blosc2.open("test.b2nd", mode="r") assert np.array_equal(a[:], c[:]) blosc2.remove_urlpath("test.b2nd") with pytest.raises(FileNotFoundError): - blosc2.open("test.b2nd") + blosc2.open("test.b2nd", mode="r") def test_oindex(): diff --git a/tests/ndarray/test_persistency.py b/tests/ndarray/test_persistency.py index ee79c680..939cd5a8 100644 --- a/tests/ndarray/test_persistency.py +++ b/tests/ndarray/test_persistency.py @@ -32,7 +32,7 @@ def test_persistency(shape, chunks, blocks, urlpath, contiguous, dtype): size = int(np.prod(shape)) nparray = np.arange(size, dtype=dtype).reshape(shape) _ = blosc2.asarray(nparray, chunks=chunks, blocks=blocks, urlpath=urlpath, contiguous=contiguous) - b = 
blosc2.open(urlpath) + b = blosc2.open(urlpath, mode="r") bc = b[:] diff --git a/tests/ndarray/test_proxy.py b/tests/ndarray/test_proxy.py index 15648e99..fc4577d9 100644 --- a/tests/ndarray/test_proxy.py +++ b/tests/ndarray/test_proxy.py @@ -90,10 +90,10 @@ def test_open(urlpath, shape, chunks, blocks, slices, dtype): del b if urlpath is None: with pytest.raises(RuntimeError): - _ = blosc2.open(proxy_urlpath) + _ = blosc2.open(proxy_urlpath, mode="a") else: - b = blosc2.open(proxy_urlpath) - a = blosc2.open(urlpath) + b = blosc2.open(proxy_urlpath, mode="a") + a = blosc2.open(urlpath, mode="r") if not struct_dtype: np.testing.assert_almost_equal(b[...], a[...]) else: diff --git a/tests/ndarray/test_proxy_c2array.py b/tests/ndarray/test_proxy_c2array.py index fea92163..1f7d427f 100644 --- a/tests/ndarray/test_proxy_c2array.py +++ b/tests/ndarray/test_proxy_c2array.py @@ -86,7 +86,7 @@ def test_open(cat2_context): del a del b - b = blosc2.open(urlpath) + b = blosc2.open(urlpath, mode="r") a = get_array(shape, chunks_blocks) np.testing.assert_allclose(b[...], a[...]) diff --git a/tests/ndarray/test_proxy_expr.py b/tests/ndarray/test_proxy_expr.py index e4aea5a5..17b9f6b0 100644 --- a/tests/ndarray/test_proxy_expr.py +++ b/tests/ndarray/test_proxy_expr.py @@ -76,7 +76,7 @@ def test_expr_proxy_operands(chunks_blocks, cat2_context): urlpath = "expr_proxies.b2nd" expr.save(urlpath=urlpath, mode="w") del expr - expr_opened = blosc2.open("expr_proxies.b2nd") + expr_opened = blosc2.open("expr_proxies.b2nd", mode="r") assert isinstance(expr_opened, blosc2.LazyExpr) # All diff --git a/tests/ndarray/test_reductions.py b/tests/ndarray/test_reductions.py index 91bd17e6..c5ece156 100644 --- a/tests/ndarray/test_reductions.py +++ b/tests/ndarray/test_reductions.py @@ -427,7 +427,7 @@ def test_fast_path(chunks, blocks, disk, fill_value, reduce_op, axis): else: a = blosc2.zeros(shape, dtype=np.float64, chunks=chunks, blocks=blocks, urlpath=urlpath, mode="w") if disk: - a = 
blosc2.open(urlpath) + a = blosc2.open(urlpath, mode="r") na = a[:] if reduce_op in {"cumulative_sum", "cumulative_prod"}: axis = 0 if axis is None else axis @@ -473,7 +473,7 @@ def test_miniexpr_slice(chunks, blocks, disk, fill_value, reduce_op): else: a = blosc2.zeros(shape, dtype=np.float64, chunks=chunks, blocks=blocks, urlpath=urlpath, mode="w") if disk: - a = blosc2.open(urlpath) + a = blosc2.open(urlpath, mode="r") na = a[:] # Test slice # TODO: Make this work with miniexpr (currently just skips to normal reduction eval) @@ -520,8 +520,8 @@ def test_save_version1(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] nb = b[:] @@ -531,7 +531,7 @@ def test_save_version1(disk, fill_value, reduce_op, axis): assert lexpr.shape == a.shape if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = "npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -581,8 +581,8 @@ def test_save_version2(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] nb = b[:] @@ -591,7 +591,7 @@ def test_save_version2(disk, fill_value, reduce_op, axis): lexpr = blosc2.lazyexpr(expr, operands={"a": a, "b": b}) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = 
"npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -641,8 +641,8 @@ def test_save_version3(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] nb = b[:] @@ -651,7 +651,7 @@ def test_save_version3(disk, fill_value, reduce_op, axis): lexpr = blosc2.lazyexpr(expr, operands={"a": a, "b": b}) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = "npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -701,8 +701,8 @@ def test_save_version4(disk, fill_value, reduce_op, axis): a = blosc2.zeros(shape, dtype=np.float64, urlpath=urlpath, mode="w") b = blosc2.zeros(shape, dtype=np.float64, urlpath="b.b2nd", mode="w") - 0.1 if disk: - a = blosc2.open(urlpath) - b = blosc2.open("b.b2nd") + a = blosc2.open(urlpath, mode="r") + b = blosc2.open("b.b2nd", mode="r") na = a[:] # Just a single reduction @@ -710,7 +710,7 @@ def test_save_version4(disk, fill_value, reduce_op, axis): lexpr = blosc2.lazyexpr(expr, operands={"a": a}) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") res = lexpr.compute() if reduce_op in {"cumulative_sum", "cumulative_prod"}: oploc = "npcumsum" if reduce_op == "cumulative_sum" else "npcumprod" @@ -738,7 +738,7 @@ def test_save_constructor_reduce(shape, disk, compute): lexpr = blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") if compute: res = lexpr.compute() res = res[()] # for later comparison with nres @@ -767,7 +767,7 @@ def test_save_constructor_reduce2(shape, disk, compute): lexpr 
= blosc2.lazyexpr(expr) if disk: lexpr.save("out.b2nd") - lexpr = blosc2.open("out.b2nd") + lexpr = blosc2.open("out.b2nd", mode="r") if compute: res = lexpr.compute() res = res[()] # for later comparison with nres diff --git a/tests/ndarray/test_stringarrays.py b/tests/ndarray/test_stringarrays.py index a512622a..e6f37114 100644 --- a/tests/ndarray/test_stringarrays.py +++ b/tests/ndarray/test_stringarrays.py @@ -124,7 +124,7 @@ def test_unicode_roundtrip_on_disk(tmp_path, shape): ) # Re-open from disk - out = blosc2.open(path) + out = blosc2.open(path, mode="r") assert out.dtype == arr.dtype assert np.array_equal(out, arr) @@ -153,7 +153,7 @@ def test_unicode_on_disk_partial_io(tmp_path): b2[6:10] = replacement arr[6:10] = replacement - reopened = blosc2.open(path) + reopened = blosc2.open(path, mode="r") assert np.array_equal(reopened, arr) @@ -167,7 +167,7 @@ def test_unicode_on_disk_persistence(tmp_path): b2 = blosc2.open(path, mode="a") b2[:] = arr2 - reopened = blosc2.open(path) + reopened = blosc2.open(path, mode="r") assert np.array_equal(reopened, arr2) diff --git a/tests/ndarray/test_struct_dtype.py b/tests/ndarray/test_struct_dtype.py index 9f2c3055..dc2b736e 100644 --- a/tests/ndarray/test_struct_dtype.py +++ b/tests/ndarray/test_struct_dtype.py @@ -36,7 +36,7 @@ def test_scalar(shape, dtype, urlpath): assert a.dtype == b.dtype if urlpath is not None: - c = blosc2.open(urlpath) + c = blosc2.open(urlpath, mode="r") assert np.array_equal(c[:], b) assert c.shape == a.shape assert c.dtype == a.dtype diff --git a/tests/test_mmap.py b/tests/test_mmap.py index 53dfc27c..c0870d1f 100644 --- a/tests/test_mmap.py +++ b/tests/test_mmap.py @@ -53,7 +53,9 @@ def test_initial_mapping_size(tmp_path, monkeypatch, capfd, initial_mapping_size # Reading via open for mmap_mode in ["r", "r+", "c"]: open_mapping_size = None if mmap_mode == "r" else initial_mapping_size - schunk_open = blosc2.open(urlpath, mmap_mode=mmap_mode, initial_mapping_size=open_mapping_size) + 
schunk_open = blosc2.open( + urlpath, mode="r", mmap_mode=mmap_mode, initial_mapping_size=open_mapping_size + ) for i in range(nchunks): buffer = i * np.arange(chunk_nitems, dtype=dtype) bytes_obj = buffer.tobytes() @@ -95,13 +97,13 @@ def test_initial_mapping_size(tmp_path, monkeypatch, capfd, initial_mapping_size # Error handling with pytest.raises(ValueError, match=r"w\+ mmap_mode cannot be used to open an existing file"): - blosc2.open(urlpath, mmap_mode="w+") + blosc2.open(urlpath, mode="a", mmap_mode="w+") with pytest.raises(ValueError, match="initial_mapping_size can only be used with writing modes"): - blosc2.open(urlpath, mmap_mode="r", initial_mapping_size=100) + blosc2.open(urlpath, mode="a", mmap_mode="r", initial_mapping_size=100) with pytest.raises(ValueError, match="initial_mapping_size can only be used with mmap_mode"): - blosc2.open(urlpath, mmap_mode=None, initial_mapping_size=100) + blosc2.open(urlpath, mode="a", mmap_mode=None, initial_mapping_size=100) with pytest.raises(ValueError, match="initial_mapping_size can only be used with writing modes"): blosc2.SChunk(mmap_mode="r", initial_mapping_size=100, **storage) diff --git a/tests/test_open.py b/tests/test_open.py index c7f78c88..a8610fac 100644 --- a/tests/test_open.py +++ b/tests/test_open.py @@ -110,7 +110,7 @@ def test_open(contiguous, urlpath, cparams, dparams, nchunks, chunk_nitems, dtyp def test_open_fake(): with pytest.raises(FileNotFoundError): - _ = blosc2.open("none.b2nd") + _ = blosc2.open("none.b2nd", mode="r") @pytest.mark.parametrize("offset", [0, 42]) @@ -148,3 +148,34 @@ def test_open_offset(offset, urlpath, mode, mmap_mode): blosc2.open(urlpath, mode, mmap_mode=mmap_mode) blosc2.remove_urlpath(urlpath) + + +def test_open_no_mode_warns(tmp_path): + """FutureWarning is emitted when mode is omitted.""" + urlpath = str(tmp_path / "test.b2nd") + blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") + with pytest.warns(FutureWarning, match="mode='a'"): + _ = blosc2.open(urlpath) 
+ + +def test_open_explicit_mode_no_warn(tmp_path): + """No FutureWarning is emitted when mode is explicitly given.""" + import warnings + + urlpath = str(tmp_path / "test.b2nd") + blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") + with warnings.catch_warnings(): + warnings.simplefilter("error", FutureWarning) + _ = blosc2.open(urlpath, mode="r") + _ = blosc2.open(urlpath, mode="a") + + +def test_open_mmap_without_mode_warns(tmp_path): + """FutureWarning is emitted when mode is omitted, even with mmap_mode.""" + if blosc2.IS_WASM: + pytest.skip("mmap_mode is not supported reliably on wasm32") + + urlpath = str(tmp_path / "test.b2nd") + blosc2.asarray(np.arange(10), urlpath=urlpath, mode="w") + with pytest.warns(FutureWarning, match="mode='a'"): + _ = blosc2.open(urlpath, mmap_mode="r") diff --git a/tests/test_open_c2array.py b/tests/test_open_c2array.py index 14bb6710..8d4458ac 100644 --- a/tests/test_open_c2array.py +++ b/tests/test_open_c2array.py @@ -43,7 +43,7 @@ def test_open_c2array(cat2_context): assert a1.cratio == a_open.cratio with pytest.raises(NotImplementedError): - _ = blosc2.open(urlpath) + _ = blosc2.open(urlpath, mode="a") with pytest.raises(NotImplementedError): _ = blosc2.open(urlpath, mode="r", offset=0, cparams={}) diff --git a/tests/test_proxy_schunk.py b/tests/test_proxy_schunk.py index 4245b1aa..7155834c 100644 --- a/tests/test_proxy_schunk.py +++ b/tests/test_proxy_schunk.py @@ -68,9 +68,9 @@ def test_open(urlpath, chunksize, nchunks): del schunk if urlpath is None: with pytest.raises(RuntimeError): - _ = blosc2.open(proxy_urlpath) + _ = blosc2.open(proxy_urlpath, mode="a") else: - proxy = blosc2.open(proxy_urlpath) + proxy = blosc2.open(proxy_urlpath, mode="a") assert proxy[0 : len(data) * 4] == bytes_obj blosc2.remove_urlpath(urlpath)