From d887d83f6d74f717ad88dc2814e253f8835be345 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 7 Nov 2024 06:33:46 +0100 Subject: [PATCH 01/11] Preliminary support for numpy type inference --- examples/ndarray/reduce_and_enlarge.py | 27 ++-- src/blosc2/lazyexpr.py | 185 +++++++++++++++++++++++-- tests/ndarray/test_lazyexpr.py | 5 +- 3 files changed, 190 insertions(+), 27 deletions(-) diff --git a/examples/ndarray/reduce_and_enlarge.py b/examples/ndarray/reduce_and_enlarge.py index b16eff9d4..eadccc451 100644 --- a/examples/ndarray/reduce_and_enlarge.py +++ b/examples/ndarray/reduce_and_enlarge.py @@ -14,12 +14,14 @@ # In particular, note how re-opening a stored expression can adapt to # changes in the operands. +import numpy as np + import blosc2 # Create arrays with specific dimensions a = blosc2.full((2, 3, 4), 1, urlpath="a.b2nd", mode="w") # 3D array with dimensions (2, 3, 4) b = blosc2.full((2, 4), 2, urlpath="b.b2nd", mode="w") # 2D array with dimensions (2, 4) -c = blosc2.full(4, 3, urlpath="c.b2nd", mode="w") # 1D array with dimensions (4,) +c = blosc2.full(4, 3, dtype=np.int8, urlpath="c.b2nd", mode="w") # 1D array with dimensions (4,) print("Array a:", a[:]) print("Array b:", b[:]) @@ -28,11 +30,18 @@ # Define an expression using the arrays above # expression = "a.sum() + b * c" # expression = "a.sum(axis=1) + b * c" -expression = "sum(a, axis=1) + b * c" +# expression = "sum(a, axis=1) + b * c + 0" +expression = "c + np.int8(0)" +# expression = "sum(a, axis=1) + b * sin(c)" # Define the operands for the expression -operands = {"a": a, "b": b, "c": c} +# operands = {"a": a, "b": b, "c": c} +operands = {"c": c} # Create a lazy expression +print("expression:", expression, type(expression), operands["c"].dtype) lazy_expression = blosc2.lazyexpr(expression, operands) +print(lazy_expression.shape, lazy_expression.dtype) # Print the shape of the lazy expression +# lazy_expression = blosc2.lazyexpr(expression) +print(lazy_expression[:]) # Evaluate and print the result of the lazy expression (should be a 2x4 arr) # Store and reload the expressions url_path = "my_expr.b2nd" # Define the path for the file @@ -41,15 +50,5 @@ url_path = "my_expr.b2nd" # Define the path for the file lazy_expression = blosc2.open(urlpath=url_path) # Open the saved file print(lazy_expression) # Print the lazy expression -print(lazy_expression.shape) # Print the shape of the lazy expression +print(lazy_expression.shape, lazy_expression.dtype) # Print the shape of the lazy expression print(lazy_expression[:]) # Evaluate and print the result of the lazy expression (should be a 2x4 arr) - -# Enlarge the arrays and re-evaluate the expression -a.resize((3, 3, 4)) -a[2] = 3 -b.resize((3, 4)) -b[2] = 5 -lazy_expression = blosc2.open(urlpath=url_path) # Open the saved file -print(lazy_expression.shape) -result = lazy_expression.compute() -print(result[:]) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 0ce8d9787..8fcd265e2 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -277,7 +277,7 @@ def compute_broadcast_shape(arrays): Returns the shape of the outcome of an operation with the input arrays. """ # When dealing with UDFs, one can arrive params that are not arrays - shapes = [arr.shape for arr in arrays if hasattr(arr, "shape")] + shapes = [arr.shape for arr in arrays if hasattr(arr, "shape") and arr is not np] return np.broadcast_shapes(*shapes) @@ -338,6 +338,8 @@ def compute_smaller_slice(larger_shape, smaller_shape, larger_slice): # Define valid method names valid_methods = {"sum", "prod", "min", "max", "std", "mean", "var", "any", "all", "where"} +valid_methods |= {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"} +valid_methods |= {"float32", "float64", "complex64", "complex128"} def validate_expr(expr: str) -> None: @@ -410,6 +412,48 @@ def visit_Call(self, node): return set(visitor.operands) +class InferDtypeVisitor(ast.NodeVisitor): + def __init__(self, dtype_dict): + self.dtype_dict = dtype_dict + self.result_dtype = None + + def visit_Name(self, node): + if node.id in self.dtype_dict: + if self.result_dtype is None: + self.result_dtype = self.dtype_dict[node.id] + else: + self.result_dtype = np.result_type(self.result_dtype, self.dtype_dict[node.id]) + + def visit_Constant(self, node): + const_dtype = np.array(node.value).dtype + if self.result_dtype is None: + self.result_dtype = const_dtype + else: + self.result_dtype = np.result_type(self.result_dtype, const_dtype) + + def visit_BinOp(self, node): + self.visit(node.left) + left_dtype = self.result_dtype + + self.result_dtype = None + self.visit(node.right) + right_dtype = self.result_dtype + + self.result_dtype = np.result_type(left_dtype, right_dtype) + + def infer_dtype(self, expr): + self.result_dtype = None + tree = ast.parse(expr, mode="eval") + self.visit(tree.body) + return self.result_dtype + + +def infer_expression_dtype(expr, array_dict): + dtype_dict = {k: v.dtype for k, v in array_dict.items()} + visitor = InferDtypeVisitor(dtype_dict) + return visitor.infer_dtype(expr) + + def validate_inputs(inputs: dict, out=None) -> tuple: # noqa: C901 """Validate the inputs for the expression.""" if len(inputs) == 0: @@ -417,7 +461,7 @@ def validate_inputs(inputs: dict, out=None) -> tuple: # noqa: C901 "You need to pass at least one array. Use blosc2.empty() if values are not really needed." ) - inputs = [input for input in inputs.values() if hasattr(input, "shape")] + inputs = [input for input in inputs.values() if hasattr(input, "shape") and input is not np] shape = compute_broadcast_shape(inputs) if not all(np.array_equal(shape, input.shape) for input in inputs): @@ -1893,7 +1937,7 @@ def info_items(self): items += [("dtype", self.dtype)] return items - def save(self, urlpath=None, **kwargs): + def _save(self, urlpath=None, **kwargs): if urlpath is None: raise ValueError("To save a LazyArray you must provide an urlpath") @@ -1918,6 +1962,9 @@ def save(self, urlpath=None, **kwargs): "urlbase": value.urlbase, } continue + if key in {"numpy", "np"}: + # Provide access to cast funcs like int8 et al. + continue if isinstance(value, blosc2.Proxy): # Take the required info from the Proxy._cache container value = value._cache @@ -1934,6 +1981,57 @@ def save(self, urlpath=None, **kwargs): "operands": operands, } + def save(self, urlpath=None, **kwargs): + if urlpath is None: + raise ValueError("To save a LazyArray you must provide an urlpath") + + expression = self.expression_tosave if hasattr(self, "expression_tosave") else self.expression + operands_ = self.operands_tosave if hasattr(self, "operands_tosave") else self.operands + # Validate expression + validate_expr(expression) + + meta = kwargs.get("meta", {}) + meta["LazyArray"] = LazyArrayEnum.Expr.value + kwargs["urlpath"] = urlpath + kwargs["meta"] = meta + kwargs["mode"] = "w" # always overwrite the file in urlpath + + # Create an empty array; useful for providing the shape and dtype of the outcome + array = blosc2.empty(shape=self.shape, dtype=self.dtype, **kwargs) + + # Save the expression and operands in the metadata + operands = {} + print(operands_) + for key, value in operands_.items(): + if isinstance(value, blosc2.C2Array): + operands[key] = { + "path": str(value.path), + "urlbase": value.urlbase, + } + continue + # TODO: can this be removed? + if key in {"numpy", "np"}: + # Provide access to cast funcs like int8 et al. + continue + if isinstance(value, blosc2.Proxy): + # Take the required info from the Proxy._cache container + value = value._cache + if not hasattr(value, "schunk"): + print(f"expression: {expression}") + raise ValueError( + "To save a LazyArray, all operands must be blosc2.NDArray or blosc2.C2Array objects" + ) + if value.schunk.urlpath is None: + raise ValueError("To save a LazyArray, all operands must be stored on disk/network") + operands[key] = value.schunk.urlpath + print("expression:", expression) + print("operands:", operands) + array.schunk.vlmeta["_LazyArray"] = { + "expression": expression, + "UDF": None, + "operands": operands, + } + @classmethod def _new_expr(cls, expression, operands, guess, out=None, where=None): # Validate the expression @@ -1941,14 +2039,23 @@ def _new_expr(cls, expression, operands, guess, out=None, where=None): if guess: # The expression has been validated, so we can evaluate it # in guessing mode to avoid computing reductions + # Extract possible numpy scalars + _expression, local_vars = transform_expression(expression) + # Let's include numpy and blosc2 as operands so that some functions can be used + # Most in particular, castings like np.int8 et al. can be very useful to allow + # for desired data types in the output. + _operands = operands | local_vars _globals = {func: getattr(blosc2, func) for func in functions if func in expression} - new_expr = eval(expression, _globals, operands) - _dtype = new_expr.dtype + _globals |= {"np": np, "numpy": np} + new_expr = eval(_expression, _globals, _operands) + _dtype = infer_expression_dtype(_expression, _operands) _shape = new_expr.shape if isinstance(new_expr, blosc2.LazyExpr): # Restore the original expression and operands - new_expr.expression = expression - new_expr.operands = operands + new_expr.expression = _expression + new_expr.expression_tosave = expression + new_expr.operands = _operands + new_expr.operands_tosave = operands else: # An immediate evaluation happened (e.g. all operands are numpy arrays) new_expr = cls(None) @@ -2093,6 +2200,46 @@ def save(self, **kwargs): raise NotImplementedError("For safety reasons, this is not implemented for UDFs") +class TransformNumpyCalls(ast.NodeTransformer): + def __init__(self): + self.replacements = {} + self.tmp_counter = 0 + + def visit_Call(self, node): + # Check if the call is a numpy type-casting call + if ( + isinstance(node.func, ast.Attribute) + and isinstance(node.func.value, ast.Name) + and node.func.value.id in ["np", "numpy"] + and isinstance(node.args[0], ast.Constant) + ): + # Create a new temporary variable name + tmp_var = f"tmp{self.tmp_counter}" + self.tmp_counter += 1 + + # Evaluate the type-casting call to create the new variable's value + numpy_type = getattr(np, node.func.attr) + self.replacements[tmp_var] = numpy_type(node.args[0].value) + + # Replace the call node with a variable node + return ast.copy_location(ast.Name(id=tmp_var, ctx=ast.Load()), node) + return self.generic_visit(node) + + +def transform_expression(expr: str): + # Parse the expression into an AST + tree = ast.parse(expr, mode="eval") + + # Transform the AST + transformer = TransformNumpyCalls() + transformed_tree = transformer.visit(tree) + + # Generate the modified expression + transformed_expr = ast.unparse(transformed_tree) + + return transformed_expr, transformer.replacements + + def _open_lazyarray(array): value = array.schunk.meta["LazyArray"] if value == LazyArrayEnum.UDF.value: @@ -2119,16 +2266,31 @@ def _open_lazyarray(array): expr = lazyarray["expression"] globals = {func: getattr(blosc2, func) for func in functions if func in expr} + globals |= {"np": np, "numpy": np} # Validate the expression (prevent security issues) validate_expr(expr) + # Extract possible numpy scalars + # _expr, local_vars = transform_expression(expr) + # operands_dict |= local_vars + # Extract possible numpy scalars + _expression, local_vars = transform_expression(expr) + # Let's include numpy and blosc2 as operands so that some functions can be used + # Most in particular, castings like np.int8 et al. can be very useful to allow + # for desired data types in the output. + # local_vars |= {"np": np, "numpy": np} + _operands = operands_dict | local_vars # Create the expression as such - new_expr = eval(expr, globals, operands_dict) - _dtype = new_expr.dtype + new_expr = eval(_expression, globals, _operands) + _dtype = infer_expression_dtype(_expression, _operands) + # _dtype = new_expr.dtype _shape = new_expr.shape if isinstance(new_expr, blosc2.LazyExpr): # Restore the original expression and operands - new_expr.expression = expr - new_expr.operands = operands_dict + print("new_expr:", new_expr.expression, expr, operands_dict) + new_expr.expression = _expression + new_expr.expression_tosave = expr + new_expr.operands = _operands + new_expr.operands_tosave = operands_dict # Make the array info available for the user (only available when opened from disk) new_expr.array = array # We want to expose schunk too, so that .info() can be used on the LazyArray @@ -2317,6 +2479,7 @@ def lazyexpr( where_args = {"_where_x": where[0], "_where_y": where[1]} expression._where_args = where_args return expression + if operands is None: # Try to get operands from variables in the stack operands = get_expr_operands(expression) diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 5dd16da00..1e79ce645 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -589,11 +589,12 @@ def test_save_unsafe(): assert expr.expression in str(excinfo.value) # Check that an invalid expression cannot be easily saved. - # As this can easily be worked around, the best protection is + # Although, as this can easily be worked around, the best protection is # during loading time (tested above). + expr.expression_tosave = "import os; os.system('touch /tmp/unsafe')" with pytest.raises(ValueError) as excinfo: expr.save(urlpath=urlpath) - assert expr.expression in str(excinfo.value) + assert expr.expression_tosave in str(excinfo.value) for urlpath in disk_arrays: blosc2.remove_urlpath(urlpath) From 2d615cc43109591c69fc4b456b9a2b4cb8bd6e57 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 7 Nov 2024 07:41:22 +0100 Subject: [PATCH 02/11] Initial test --- src/blosc2/lazyexpr.py | 22 +++++++++++-------- tests/ndarray/test_lazyexpr.py | 40 ++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 8fcd265e2..b62c7801d 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -747,6 +747,7 @@ def fast_eval( # noqa: C901 The output array. """ out = kwargs.pop("_output", None) + dtype = kwargs.pop("dtype", None) where: dict | None = kwargs.pop("_where_args", None) if isinstance(out, blosc2.NDArray): # If 'out' has been passed, and is a NDArray, use it as the base array @@ -817,11 +818,9 @@ def fast_eval( # noqa: C901 if out is None: # We can enter here when using any of the eval() or __getitem__() methods if getitem: - out = np.empty(shape, dtype=result.dtype) + out = np.empty(shape, dtype=dtype) else: - out = blosc2.empty( - shape, chunks=chunks, blocks=basearr.blocks, dtype=result.dtype, **kwargs - ) + out = blosc2.empty(shape, chunks=chunks, blocks=basearr.blocks, dtype=dtype, **kwargs) # Store the result in the output array if getitem: @@ -872,6 +871,7 @@ def slices_eval( # noqa: C901 out = kwargs.pop("_output", None) chunks = kwargs.get("chunks") where: dict | None = kwargs.pop("_where_args", None) + dtype = kwargs.pop("dtype", None) # Compute the shape and chunks of the output array, including broadcasting shape = compute_broadcast_shape(operands.values()) @@ -972,23 +972,24 @@ def slices_eval( # noqa: C901 # The result is a linear array shape_ = math.prod(shape) if getitem: - out = np.empty(shape_, dtype=result.dtype) + out = np.empty(shape_, dtype=dtype) else: if "chunks" not in kwargs and (where is None or len(where) == 2): # Let's use the same chunks as the first operand (it could have been automatic too) - out = blosc2.empty(shape_, chunks=chunks, dtype=result.dtype, **kwargs) + out = blosc2.empty(shape_, chunks=chunks, dtype=dtype, **kwargs) elif "chunks" in kwargs and (where is not None and len(where) < 2 and len(shape_) > 1): # Remove the chunks argument if the where condition is not a tuple with two elements kwargs.pop("chunks") - out = blosc2.empty(shape_, dtype=result.dtype, **kwargs) + out = blosc2.empty(shape_, dtype=dtype, **kwargs) else: - out = blosc2.empty(shape_, dtype=result.dtype, **kwargs) + out = blosc2.empty(shape_, dtype=dtype, **kwargs) # Check if the in out partitions are well-behaved (i.e. no padding) behaved = blosc2.are_partitions_behaved(out.shape, out.chunks, out.blocks) if where is None or len(where) == 2: if behaved: # Fast path + result = np.asarray(result, dtype=dtype) out.schunk.update_data(nchunk, result, copy=False) else: out[slice_] = result @@ -1051,7 +1052,8 @@ def reduce_slices( # noqa: C901 reduce_op = reduce_args.pop("op") axis = reduce_args["axis"] keepdims = reduce_args["keepdims"] - dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None + # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None + dtype = kwargs.get("dtype") # Compute the shape and chunks of the output array, including broadcasting shape = compute_broadcast_shape(operands.values()) @@ -1906,6 +1908,8 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray: kwargs["_output"] = self._output if hasattr(self, "_where_args"): kwargs["_where_args"] = self._where_args + if hasattr(self, "_dtype"): + kwargs["dtype"] = self._dtype return self._compute_expr(item, kwargs) def __getitem__(self, item): diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 1e79ce645..c5fcd1dc4 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -975,3 +975,43 @@ def test_fill_disk_operands(chunks, blocks, disk, fill_value): ) def test_get_expr_operands(expression, expected_operands): assert blosc2.get_expr_operands(expression) == set(expected_operands) + + +@pytest.mark.parametrize( + "scalar", + [ + "np.int8(0)", + # "np.uint16(0)", + ], +) +@pytest.mark.parametrize( + ("dtype1", "dtype2"), + [ + (np.int8, np.int8), + # (np.int8, np.int16), + # (np.int8, np.int32), + # (np.int8, np.int64), + # (np.int8, np.float32), + # (np.int8, np.float64), + # (np.uint16, np.uint16), + # (np.uint16, np.uint32), + # (np.uint16, np.uint64), + # (np.uint16, np.float32), + # (np.uint16, np.float64), + # (np.float32, np.float32), + # (np.float32, np.float64), + ], +) +def test_dtype_infer(dtype1, dtype2, scalar): + shape = (5, 10) + na = np.linspace(0, 1, np.prod(shape), dtype=dtype1).reshape(shape) + nb = np.linspace(1, 2, np.prod(shape), dtype=dtype2).reshape(shape) + a = blosc2.asarray(na) + b = blosc2.asarray(nb) + + expr = blosc2.lazyexpr(f"a + b * {scalar}", operands={"a": a, "b": b}) + nres = na + nb * eval(scalar) + # res = expr[()]. # TODO + res = expr.compute() + np.testing.assert_allclose(res[()], nres) + assert res.dtype == nres.dtype From e995529bbee3c62a7603164358c359aaa2230fa9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 8 Nov 2024 18:06:44 +0100 Subject: [PATCH 03/11] Use actual numpy/numexpr type inference instead of naive infer_expression_dtype() --- src/blosc2/lazyexpr.py | 98 +++++++++++---------------- src/blosc2/ndarray.py | 9 ++- tests/ndarray/test_lazyexpr.py | 26 +++---- tests/ndarray/test_lazyexpr_fields.py | 11 ++- 4 files changed, 67 insertions(+), 77 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index b62c7801d..6fe5eec5e 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -24,6 +24,8 @@ from queue import Empty, Queue from typing import TYPE_CHECKING +from numpy.exceptions import ComplexWarning + if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -412,48 +414,6 @@ def visit_Call(self, node): return set(visitor.operands) -class InferDtypeVisitor(ast.NodeVisitor): - def __init__(self, dtype_dict): - self.dtype_dict = dtype_dict - self.result_dtype = None - - def visit_Name(self, node): - if node.id in self.dtype_dict: - if self.result_dtype is None: - self.result_dtype = self.dtype_dict[node.id] - else: - self.result_dtype = np.result_type(self.result_dtype, self.dtype_dict[node.id]) - - def visit_Constant(self, node): - const_dtype = np.array(node.value).dtype - if self.result_dtype is None: - self.result_dtype = const_dtype - else: - self.result_dtype = np.result_type(self.result_dtype, const_dtype) - - def visit_BinOp(self, node): - self.visit(node.left) - left_dtype = self.result_dtype - - self.result_dtype = None - self.visit(node.right) - right_dtype = self.result_dtype - - self.result_dtype = np.result_type(left_dtype, right_dtype) - - def infer_dtype(self, expr): - self.result_dtype = None - tree = ast.parse(expr, mode="eval") - self.visit(tree.body) - return self.result_dtype - - -def infer_expression_dtype(expr, array_dict): - dtype_dict = {k: v.dtype for k, v in array_dict.items()} - visitor = InferDtypeVisitor(dtype_dict) - return visitor.infer_dtype(expr) - - def validate_inputs(inputs: dict, out=None) -> tuple: # noqa: C901 """Validate the inputs for the expression.""" if len(inputs) == 0: @@ -824,9 +784,15 @@ def fast_eval( # noqa: C901 # Store the result in the output array if getitem: - out[slice_] = result + try: + out[slice_] = result + except ComplexWarning: + # The result is a complex number, so we need to convert it to real. + # This is a workaround for rigidness of NumExpr with type casting. + result = result.real.astype(out.dtype) + out[slice_] = result else: - if behaved and result.shape == chunks_: + if behaved and result.shape == chunks_ and result.dtype == out.dtype: # Fast path only works for results that are full chunks out.schunk.update_data(nchunk, result, copy=False) else: @@ -1052,8 +1018,8 @@ def reduce_slices( # noqa: C901 reduce_op = reduce_args.pop("op") axis = reduce_args["axis"] keepdims = reduce_args["keepdims"] - # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None - dtype = kwargs.get("dtype") + dtype = kwargs.pop("dtype", None) + # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else _dtype # Compute the shape and chunks of the output array, including broadcasting shape = compute_broadcast_shape(operands.values()) @@ -1205,8 +1171,8 @@ def reduce_slices( # noqa: C901 result = reduce_op.value.reduce(result, **reduce_args) if out is None: - if dtype is None: - dtype = result.dtype + # if dtype is None: + # dtype = result.dtype if is_inside_eval(): # We already have the dtype and reduced_shape, so return immediately # Use a blosc2 container, as it consumes less memory in general @@ -1594,7 +1560,18 @@ def dtype(self): # In some situations, we already know the dtype return self._dtype operands = {key: np.ones(1, dtype=value.dtype) for key, value in self.operands.items()} - _out = ne.evaluate(self.expression, local_dict=operands) + if "contains" in self.expression: + _out = ne.evaluate(self.expression, local_dict=operands) + else: + # Create a globals dict with the functions of numpy + globals_dict = {f: getattr(np, f) for f in functions if f != "contains"} + try: + _out = eval(self.expression, globals_dict, operands) + except RuntimeWarning: + # Sometimes, numpy gets a RuntimeWarning when evaluating expressions + # with synthetic operands (1's). Let's try with numexpr, which is not so picky + # about this. + _out = ne.evaluate(self.expression, local_dict=operands) return _out.dtype @property @@ -1770,17 +1747,26 @@ def __ge__(self, value): def where(self, value1=None, value2=None): if self.dtype != np.bool_: raise ValueError("where() can only be used with boolean expressions") + dtype = self.dtype # This just acts as a 'decorator' for the existing expression if value1 is not None and value2 is not None: + # Guess the outcome dtype for value1 and value2 + dtype = np.result_type(np.asarray(value1), np.asarray(value2)) args = {"_where_x": value1, "_where_y": value2} elif value1 is not None: + dtype = np.asarray(value1).dtype args = {"_where_x": value1} elif value2 is not None: raise ValueError("where() requires value1 when using value2") else: args = {} - self._where_args = args - return self + # Create a new expression + new_expr = blosc2.LazyExpr(new_op=(self, None, None)) + new_expr.expression = self.expression + new_expr.operands = self.operands + new_expr._where_args = args + new_expr._dtype = dtype + return new_expr def sum(self, axis=None, dtype=None, keepdims=False, **kwargs): reduce_args = { @@ -1908,8 +1894,7 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray: kwargs["_output"] = self._output if hasattr(self, "_where_args"): kwargs["_where_args"] = self._where_args - if hasattr(self, "_dtype"): - kwargs["dtype"] = self._dtype + kwargs["dtype"] = self.dtype return self._compute_expr(item, kwargs) def __getitem__(self, item): @@ -2052,7 +2037,7 @@ def _new_expr(cls, expression, operands, guess, out=None, where=None): _globals = {func: getattr(blosc2, func) for func in functions if func in expression} _globals |= {"np": np, "numpy": np} new_expr = eval(_expression, _globals, _operands) - _dtype = infer_expression_dtype(_expression, _operands) + _dtype = new_expr.dtype _shape = new_expr.shape if isinstance(new_expr, blosc2.LazyExpr): # Restore the original expression and operands @@ -2283,10 +2268,9 @@ def _open_lazyarray(array): # for desired data types in the output. # local_vars |= {"np": np, "numpy": np} _operands = operands_dict | local_vars - # Create the expression as such + # Create the expression as such, but in guessing mode new_expr = eval(_expression, globals, _operands) - _dtype = infer_expression_dtype(_expression, _operands) - # _dtype = new_expr.dtype + _dtype = new_expr.dtype _shape = new_expr.shape if isinstance(new_expr, blosc2.LazyExpr): # Restore the original expression and operands diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index b9b5734fb..6408b5ebc 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -13,6 +13,8 @@ from collections import namedtuple from typing import TYPE_CHECKING, NamedTuple +from numpy.exceptions import ComplexWarning + if TYPE_CHECKING: from collections.abc import Iterator, Sequence @@ -1133,7 +1135,12 @@ def __setitem__(self, key: int | slice | Sequence[slice], value: object): value = np.full(shape, value, dtype=self.dtype) elif isinstance(value, np.ndarray): if value.dtype != self.dtype: - raise ValueError("The dtype of the value should be the same as the array") + try: + value = value.astype(self.dtype) + except ComplexWarning: + # numexpr type inference can lead to unnecessary type promotions + # when using complex functions (e.g. conj) with real arrays + value = value.real.astype(self.dtype) if value.shape == (): value = np.full(shape, value, dtype=self.dtype) elif isinstance(value, NDArray): diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index c5fcd1dc4..15fe1d19e 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -981,25 +981,25 @@ def test_get_expr_operands(expression, expected_operands): "scalar", [ "np.int8(0)", - # "np.uint16(0)", + "np.uint16(0)", ], ) @pytest.mark.parametrize( ("dtype1", "dtype2"), [ (np.int8, np.int8), - # (np.int8, np.int16), - # (np.int8, np.int32), - # (np.int8, np.int64), - # (np.int8, np.float32), - # (np.int8, np.float64), - # (np.uint16, np.uint16), - # (np.uint16, np.uint32), - # (np.uint16, np.uint64), - # (np.uint16, np.float32), - # (np.uint16, np.float64), - # (np.float32, np.float32), - # (np.float32, np.float64), + (np.int8, np.int16), + (np.int8, np.int32), + (np.int8, np.int64), + (np.int8, np.float32), + (np.int8, np.float64), + (np.uint16, np.uint16), + (np.uint16, np.uint32), + (np.uint16, np.uint64), + (np.uint16, np.float32), + (np.uint16, np.float64), + (np.float32, np.float32), + (np.float32, np.float64), ], ) def test_dtype_infer(dtype1, dtype2, scalar): diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py index 97f7be6f9..3873c206d 100644 --- a/tests/ndarray/test_lazyexpr_fields.py +++ b/tests/ndarray/test_lazyexpr_fields.py @@ -276,8 +276,8 @@ def test_where_fusion(array_fixture): # Two where() calls with a reduction (and using broadcasting) axis = None if sa1.ndim == 1 else 1 - res = expr.where(0, 1) + expr.where(0, 1).sum(axis=axis) - nres = np.where(npexpr, 0, 1) + np.where(npexpr, 0, 1).sum(axis=axis) + res = expr.where(0.5, 0.2) + expr.where(0.3, 0.6).sum(axis=axis) + nres = np.where(npexpr, 0.5, 0.2) + np.where(npexpr, 0.3, 0.6).sum(axis=axis) np.testing.assert_allclose(res[:], nres) # Reuse the result in another expression @@ -290,11 +290,10 @@ def test_where_fusion(array_fixture): nres = 2 * nres + 4 * nres np.testing.assert_allclose(res[:], nres) - # TODO: this is not working yet # Reuse the result in another expression twice II - # res = 2 * res + blosc2.sqrt(res) - # nres = 2 * nres + nres.sqrt() - # np.testing.assert_allclose(res[:], nres) + res = 2 * res + blosc2.sqrt(res) + nres = 2 * nres + np.sqrt(nres) + np.testing.assert_allclose(res[:], nres) # TODO: this is not working yet # Reuse the result in another expression twice III From e0ef9116f1ddb122d717a31c882de91fc364c1d4 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 07:16:20 +0100 Subject: [PATCH 04/11] Fix dtype fo sum and prod reductions --- src/blosc2/lazyexpr.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 6fe5eec5e..d5a94443d 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1018,8 +1018,12 @@ def reduce_slices( # noqa: C901 reduce_op = reduce_args.pop("op") axis = reduce_args["axis"] keepdims = reduce_args["keepdims"] - dtype = kwargs.pop("dtype", None) - # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else _dtype + # For sum and prod, we can specify the dtype of the output array + dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None + if dtype is None: + dtype = kwargs.pop("dtype", None) + else: + del kwargs["dtype"] # Compute the shape and chunks of the output array, including broadcasting shape = compute_broadcast_shape(operands.values()) From fea712e06322f8820d9d8966e82efe341f2aeaf9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 11:19:14 +0100 Subject: [PATCH 05/11] Fix for inferring types when scalars are in expressions --- src/blosc2/lazyexpr.py | 24 ++++++++++++------------ tests/ndarray/test_lazyexpr.py | 28 +++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 17 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index d5a94443d..d30cd4ac8 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -753,13 +753,15 @@ def fast_eval( # noqa: C901 operands, slice_, chunks_, full_chunk, aligned, nchunk, iter_disk, chunk_operands ) - if isinstance(out, np.ndarray) and not where: - # Fast path: put the result straight in the output array (avoiding a memory copy) - if callable(expression): - expression(tuple(chunk_operands.values()), out[slice_], offset=offset) - else: - ne.evaluate(expression, chunk_operands, out=out[slice_]) - continue + # Since ne.evaluate() can return a dtype larger than the one in computed in the expression, + # we cannot take this fast path + # if isinstance(out, np.ndarray) and not where: + # # Fast path: put the result straight in the output array (avoiding a memory copy) + # if callable(expression): + # expression(tuple(chunk_operands.values()), out[slice_], offset=offset) + # else: + # ne.evaluate(expression, chunk_operands, out=out[slice_]) + # continue if callable(expression): result = np.empty(chunks_, dtype=out.dtype) expression(tuple(chunk_operands.values()), result, offset=offset) @@ -1436,6 +1438,7 @@ def __init__(self, new_op): # noqa: C901 self.expression = f"{op}(o0, o1)" return + self._dtype = np.result_type(value1, value2) if np.isscalar(value1) and np.isscalar(value2): self.expression = f"({value1} {op} {value2})" elif np.isscalar(value2): @@ -1511,6 +1514,7 @@ def update_expr(self, new_op): # noqa: C901 value1 = value1.compute() if hasattr(value2, "_where_args"): value2 = value2.compute() + self._dtype = np.result_type(value1, value2) if not isinstance(value1, LazyExpr) and not isinstance(value2, LazyExpr): # We converted some of the operands to NDArray (where() handling above) new_operands = {"o0": value1, "o1": value2} @@ -1903,11 +1907,7 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray: def __getitem__(self, item): kwargs = {"_getitem": True} - if hasattr(self, "_output"): - kwargs["_output"] = self._output - if hasattr(self, "_where_args"): - kwargs["_where_args"] = self._where_args - return self._compute_expr(item, kwargs) + return self.compute(item, **kwargs) def __str__(self): return f"{self.expression}" diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py index 15fe1d19e..158134783 100644 --- a/tests/ndarray/test_lazyexpr.py +++ b/tests/ndarray/test_lazyexpr.py @@ -297,9 +297,9 @@ def test_functions(function, dtype_fixture, shape_fixture): # Compare the results np.testing.assert_allclose(res_lazyexpr[:], res_numexpr) - # For some reason real is not supported by numpy's assert_allclose - # (TypeError: bad operand type for abs(): 'LazyExpr') - if function == "real": + # For some reason real and imag are not supported by numpy's assert_allclose + # (TypeError: bad operand type for abs(): 'LazyExpr' and segfaults are observed) + if function in ("real", "imag"): return # Using numpy functions @@ -981,7 +981,16 @@ def test_get_expr_operands(expression, expected_operands): "scalar", [ "np.int8(0)", + "np.uint8(0)", + "np.int16(0)", "np.uint16(0)", + "np.int32(0)", + "np.uint32(0)", + "np.int64(0)", + "np.float32(0)", + "np.float64(0)", + "np.complex64(0)", + "np.complex128(0)", ], ) @pytest.mark.parametrize( @@ -995,11 +1004,15 @@ def test_get_expr_operands(expression, expected_operands): (np.int8, np.float64), (np.uint16, np.uint16), (np.uint16, np.uint32), - (np.uint16, np.uint64), + # (np.uint16, np.uint64), # numexpr does not support uint64 (np.uint16, np.float32), (np.uint16, np.float64), + (np.int32, np.int32), + (np.int32, np.int64), (np.float32, np.float32), (np.float32, np.float64), + (np.complex64, np.complex64), + (np.complex64, np.complex128), ], ) def test_dtype_infer(dtype1, dtype2, scalar): @@ -1009,9 +1022,14 @@ def test_dtype_infer(dtype1, dtype2, scalar): a = blosc2.asarray(na) b = blosc2.asarray(nb) + # Using compute() expr = blosc2.lazyexpr(f"a + b * {scalar}", operands={"a": a, "b": b}) nres = na + nb * eval(scalar) - # res = expr[()]. # TODO res = expr.compute() np.testing.assert_allclose(res[()], nres) assert res.dtype == nres.dtype + + # Using __getitem__ + res = expr[()] + np.testing.assert_allclose(res, nres) + assert res.dtype == nres.dtype From 4b64f0c105df17e922030fedfe44a1df08de18fb Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 11:47:20 +0100 Subject: [PATCH 06/11] Fix remaining tests on fields in lazy expressions --- src/blosc2/lazyexpr.py | 8 +++++--- src/blosc2/proxy.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index d30cd4ac8..5c08c513b 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1755,14 +1755,16 @@ def __ge__(self, value): def where(self, value1=None, value2=None): if self.dtype != np.bool_: raise ValueError("where() can only be used with boolean expressions") - dtype = self.dtype # This just acts as a 'decorator' for the existing expression if value1 is not None and value2 is not None: # Guess the outcome dtype for value1 and value2 - dtype = np.result_type(np.asarray(value1), np.asarray(value2)) + dtype = np.result_type(value1, value2) args = {"_where_x": value1, "_where_y": value2} elif value1 is not None: - dtype = np.asarray(value1).dtype + if hasattr(value1, "dtype"): + dtype = value1.dtype + else: + dtype = np.asarray(value1).dtype args = {"_where_x": value1} elif value2 is not None: raise ValueError("where() requires value1 when using value2") diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py index 6129ca94a..2203d1f8a 100644 --- a/src/blosc2/proxy.py +++ b/src/blosc2/proxy.py @@ -512,8 +512,8 @@ class ProxyNDField(blosc2.Operand): def __init__(self, proxy: Proxy, field: str): self.proxy = proxy self.field = field + self.dtype = proxy.dtype[field] self.shape = proxy.shape - self.dtype = proxy.dtype def __getitem__(self, item: slice | list[slice]) -> np.ndarray: """ From ac3e16c60bb5a05583c8bc6d19a270d0804cb549 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 12:46:27 +0100 Subject: [PATCH 07/11] Assignments with values with different dtype are allowed now --- tests/ndarray/test_setitem.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/ndarray/test_setitem.py b/tests/ndarray/test_setitem.py index 41643ec44..feab7ed0a 100644 --- a/tests/ndarray/test_setitem.py +++ b/tests/ndarray/test_setitem.py @@ -62,5 +62,6 @@ def test_setitem_different_dtype(shape, slices): nparray = np.arange(size, dtype=np.int32).reshape(shape) a = blosc2.empty(nparray.shape, dtype=np.float64) - with pytest.raises(ValueError): - a[slices] = nparray + a[slices] = nparray[slices] + nparray_ = nparray.astype(a.dtype) + np.testing.assert_almost_equal(a[slices], nparray_[slices]) From e3dfd45f3d9ef7f3c01903a1d7fbba6eeb73ec7c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 13:02:08 +0100 Subject: [PATCH 08/11] Cache LazyExpr.dtype, but not .shape --- src/blosc2/lazyexpr.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 5c08c513b..84c19da7a 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1567,6 +1567,13 @@ def dtype(self): if hasattr(self, "_dtype"): # In some situations, we already know the dtype return self._dtype + if ( + hasattr(self, "_dtype_") + and hasattr(self, "_expression_") + and self._expression_ == self.expression + ): + # Use the cached dtype + return self._dtype_ operands = {key: np.ones(1, dtype=value.dtype) for key, value in self.operands.items()} if "contains" in self.expression: _out = ne.evaluate(self.expression, local_dict=operands) @@ -1580,13 +1587,13 @@ def dtype(self): # with synthetic operands (1's). Let's try with numexpr, which is not so picky # about this. _out = ne.evaluate(self.expression, local_dict=operands) - return _out.dtype + self._dtype_ = _out.dtype + self._expression_ = self.expression + return self._dtype_ @property def shape(self): - if hasattr(self, "_shape"): - # In some situations, we already know the shape - return self._shape + # Operands shape can change, so we always need to recompute this _shape, chunks, blocks, fast_path = validate_inputs(self.operands) if fast_path: # fast_path ensure that all the operands have the same partitions From 7ec8f12cf17680f95b7421bbc167ddb65a2a66ce Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 13:35:18 +0100 Subject: [PATCH 09/11] Code cleaning --- src/blosc2/lazyexpr.py | 123 ++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 68 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 84c19da7a..a266d8a83 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -342,6 +342,7 @@ def compute_smaller_slice(larger_shape, smaller_shape, larger_slice): valid_methods = {"sum", "prod", "min", "max", "std", "mean", "var", "any", "all", "where"} valid_methods |= {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"} valid_methods |= {"float32", "float64", "complex64", "complex128"} +valid_methods |= {"bool", "str", "bytes"} def validate_expr(expr: str) -> None: @@ -414,6 +415,46 @@ def visit_Call(self, node): return set(visitor.operands) +class TransformNumpyCalls(ast.NodeTransformer): + def __init__(self): + self.replacements = {} + self.tmp_counter = 0 + + def visit_Call(self, node): + # Check if the call is a numpy type-casting call + if ( + isinstance(node.func, ast.Attribute) + and isinstance(node.func.value, ast.Name) + and node.func.value.id in ["np", "numpy"] + and isinstance(node.args[0], ast.Constant) + ): + # Create a new temporary variable name + tmp_var = f"tmp{self.tmp_counter}" + self.tmp_counter += 1 + + # Evaluate the type-casting call to create the new variable's value + numpy_type = getattr(np, node.func.attr) + self.replacements[tmp_var] = numpy_type(node.args[0].value) + + # Replace the call node with a variable node + return ast.copy_location(ast.Name(id=tmp_var, ctx=ast.Load()), node) + return self.generic_visit(node) + + +def extract_numpy_scalars(expr: str): + # Parse the expression into an AST + tree = ast.parse(expr, mode="eval") + + # Transform the AST + transformer = TransformNumpyCalls() + transformed_tree = transformer.visit(tree) + + # Generate the modified expression + transformed_expr = ast.unparse(transformed_tree) + + return transformed_expr, transformer.replacements + + def validate_inputs(inputs: dict, out=None) -> tuple: # noqa: C901 """Validate the inputs for the expression.""" if len(inputs) == 0: @@ -1177,8 +1218,6 @@ def reduce_slices( # noqa: C901 result = reduce_op.value.reduce(result, **reduce_args) if out is None: - # if dtype is None: - # dtype = result.dtype if is_inside_eval(): # We already have the dtype and reduced_shape, so return immediately # Use a blosc2 container, as it consumes less memory in general @@ -1794,6 +1833,15 @@ def sum(self, axis=None, dtype=None, keepdims=False, **kwargs): } return self.compute(_reduce_args=reduce_args, **kwargs) + def prod(self, axis=None, dtype=None, keepdims=False, **kwargs): + reduce_args = { + "op": ReduceOp.PROD, + "axis": axis, + "dtype": dtype, + "keepdims": keepdims, + } + return self.compute(_reduce_args=reduce_args, **kwargs) + def get_num_elements(self, axis, item): if np.isscalar(axis): axis = (axis,) @@ -1852,15 +1900,6 @@ def var(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs): out = blosc2.asarray(out, **kwargs) return out - def prod(self, axis=None, dtype=None, keepdims=False, **kwargs): - reduce_args = { - "op": ReduceOp.PROD, - "axis": axis, - "dtype": dtype, - "keepdims": keepdims, - } - return self.compute(_reduce_args=reduce_args, **kwargs) - def min(self, axis=None, keepdims=False, **kwargs): reduce_args = { "op": ReduceOp.MIN, @@ -2011,23 +2050,16 @@ def save(self, urlpath=None, **kwargs): "urlbase": value.urlbase, } continue - # TODO: can this be removed? - if key in {"numpy", "np"}: - # Provide access to cast funcs like int8 et al. - continue if isinstance(value, blosc2.Proxy): # Take the required info from the Proxy._cache container value = value._cache if not hasattr(value, "schunk"): - print(f"expression: {expression}") raise ValueError( "To save a LazyArray, all operands must be blosc2.NDArray or blosc2.C2Array objects" ) if value.schunk.urlpath is None: raise ValueError("To save a LazyArray, all operands must be stored on disk/network") operands[key] = value.schunk.urlpath - print("expression:", expression) - print("operands:", operands) array.schunk.vlmeta["_LazyArray"] = { "expression": expression, "UDF": None, @@ -2042,7 +2074,7 @@ def _new_expr(cls, expression, operands, guess, out=None, where=None): # The expression has been validated, so we can evaluate it # in guessing mode to avoid computing reductions # Extract possible numpy scalars - _expression, local_vars = transform_expression(expression) + _expression, local_vars = extract_numpy_scalars(expression) # Let's include numpy and blosc2 as operands so that some functions can be used # Most in particular, castings like np.int8 et al. can be very useful to allow # for desired data types in the output. @@ -2202,46 +2234,6 @@ def save(self, **kwargs): raise NotImplementedError("For safety reasons, this is not implemented for UDFs") -class TransformNumpyCalls(ast.NodeTransformer): - def __init__(self): - self.replacements = {} - self.tmp_counter = 0 - - def visit_Call(self, node): - # Check if the call is a numpy type-casting call - if ( - isinstance(node.func, ast.Attribute) - and isinstance(node.func.value, ast.Name) - and node.func.value.id in ["np", "numpy"] - and isinstance(node.args[0], ast.Constant) - ): - # Create a new temporary variable name - tmp_var = f"tmp{self.tmp_counter}" - self.tmp_counter += 1 - - # Evaluate the type-casting call to create the new variable's value - numpy_type = getattr(np, node.func.attr) - self.replacements[tmp_var] = numpy_type(node.args[0].value) - - # Replace the call node with a variable node - return ast.copy_location(ast.Name(id=tmp_var, ctx=ast.Load()), node) - return self.generic_visit(node) - - -def transform_expression(expr: str): - # Parse the expression into an AST - tree = ast.parse(expr, mode="eval") - - # Transform the AST - transformer = TransformNumpyCalls() - transformed_tree = transformer.visit(tree) - - # Generate the modified expression - transformed_expr = ast.unparse(transformed_tree) - - return transformed_expr, transformer.replacements - - def _open_lazyarray(array): value = array.schunk.meta["LazyArray"] if value == LazyArrayEnum.UDF.value: @@ -2268,18 +2260,14 @@ def _open_lazyarray(array): expr = lazyarray["expression"] globals = {func: getattr(blosc2, func) for func in functions if func in expr} + # Let's include numpy as operands so that some functions can be used. + # Most in particular, castings like np.int8 et al. can be very useful to allow + # for desired data types in the output. globals |= {"np": np, "numpy": np} # Validate the expression (prevent security issues) validate_expr(expr) # Extract possible numpy scalars - # _expr, local_vars = transform_expression(expr) - # operands_dict |= local_vars - # Extract possible numpy scalars - _expression, local_vars = transform_expression(expr) - # Let's include numpy and blosc2 as operands so that some functions can be used - # Most in particular, castings like np.int8 et al. can be very useful to allow - # for desired data types in the output. - # local_vars |= {"np": np, "numpy": np} + _expression, local_vars = extract_numpy_scalars(expr) _operands = operands_dict | local_vars # Create the expression as such, but in guessing mode new_expr = eval(_expression, globals, _operands) @@ -2287,7 +2275,6 @@ def _open_lazyarray(array): _shape = new_expr.shape if isinstance(new_expr, blosc2.LazyExpr): # Restore the original expression and operands - print("new_expr:", new_expr.expression, expr, operands_dict) new_expr.expression = _expression new_expr.expression_tosave = expr new_expr.operands = _operands From 5b809678dcccb848fd47270145b3e0492ec2967d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 13:48:07 +0100 Subject: [PATCH 10/11] Refactor to avoid duplicated code --- src/blosc2/lazyexpr.py | 37 +++++-------------------------------- 1 file changed, 5 insertions(+), 32 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index a266d8a83..e7fe26356 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -2259,38 +2259,11 @@ def _open_lazyarray(array): raise TypeError("Error when retrieving the operands") expr = lazyarray["expression"] - globals = {func: getattr(blosc2, func) for func in functions if func in expr} - # Let's include numpy as operands so that some functions can be used. - # Most in particular, castings like np.int8 et al. can be very useful to allow - # for desired data types in the output. - globals |= {"np": np, "numpy": np} - # Validate the expression (prevent security issues) - validate_expr(expr) - # Extract possible numpy scalars - _expression, local_vars = extract_numpy_scalars(expr) - _operands = operands_dict | local_vars - # Create the expression as such, but in guessing mode - new_expr = eval(_expression, globals, _operands) - _dtype = new_expr.dtype - _shape = new_expr.shape - if isinstance(new_expr, blosc2.LazyExpr): - # Restore the original expression and operands - new_expr.expression = _expression - new_expr.expression_tosave = expr - new_expr.operands = _operands - new_expr.operands_tosave = operands_dict - # Make the array info available for the user (only available when opened from disk) - new_expr.array = array - # We want to expose schunk too, so that .info() can be used on the LazyArray - new_expr.schunk = array.schunk - else: - # An immediate evaluation happened (e.g. all operands are numpy arrays) - new_expr = LazyExpr(None) - new_expr.expression = expr - new_expr.operands = operands_dict - # Cache the dtype and shape (should be immutable) - new_expr._dtype = _dtype - new_expr._shape = _shape + new_expr = LazyExpr._new_expr(expr, operands_dict, guess=True, out=None, where=None) + # Make the array info available for the user (only available when opened from disk) + new_expr.array = array + # We want to expose schunk too, so that .info() can be used on the LazyArray + new_expr.schunk = array.schunk return new_expr From 9af89e59222db18d6968a537db67bec5be8de79e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 11 Nov 2024 14:17:30 +0100 Subject: [PATCH 11/11] Split large test in subtests --- src/blosc2/lazyexpr.py | 1 - tests/ndarray/test_lazyexpr_fields.py | 65 +++++++++++++++++++++------ 2 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index e7fe26356..be91952fc 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -2042,7 +2042,6 @@ def save(self, urlpath=None, **kwargs): # Save the expression and operands in the metadata operands = {} - print(operands_) for key, value in operands_.items(): if isinstance(value, blosc2.C2Array): operands[key] = { diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py index 3873c206d..c138e5dba 100644 --- a/tests/ndarray/test_lazyexpr_fields.py +++ b/tests/ndarray/test_lazyexpr_fields.py @@ -261,42 +261,81 @@ def test_where_reduction(array_fixture): np.testing.assert_allclose(res, nres) -# This is a more complex case with where() calls combined with reductions, +# More complex cases with where() calls combined with reductions, # broadcasting, reusing the result in another expression and other -# funny stuff that is not working yet. -def test_where_fusion(array_fixture): +# funny stuff + + +# Two where() calls +def test_where_fusion1(array_fixture): sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture expr = a1**2 + a2**2 > 2 * a1 * a2 + 1 npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1 - # Two where() calls res = expr.where(0, 1) + expr.where(0, 1) nres = np.where(npexpr, 0, 1) + np.where(npexpr, 0, 1) np.testing.assert_allclose(res[:], nres) - # Two where() calls with a reduction (and using broadcasting) + +# Two where() calls with a reduction (and using broadcasting) +def test_where_fusion2(array_fixture): + sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture + expr = a1**2 + a2**2 > 2 * a1 * a2 + 1 + npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1 + axis = None if sa1.ndim == 1 else 1 res = expr.where(0.5, 0.2) + expr.where(0.3, 0.6).sum(axis=axis) nres = np.where(npexpr, 0.5, 0.2) + np.where(npexpr, 0.3, 0.6).sum(axis=axis) np.testing.assert_allclose(res[:], nres) - # Reuse the result in another expression + +# Reuse the result in another expression +def test_where_fusion3(array_fixture): + sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture + expr = a1**2 + a2**2 > 2 * a1 * a2 + 1 + npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1 + + res = expr.where(0, 1) + expr.where(0, 1) + nres = np.where(npexpr, 0, 1) + np.where(npexpr, 0, 1) res = expr.where(0, 1) + res.sum() nres = np.where(npexpr, 0, 1) + nres.sum() np.testing.assert_allclose(res[:], nres) - # Reuse the result in another expression twice + +# Reuse the result in another expression twice +def test_where_fusion4(array_fixture): + sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture + expr = a1**2 + a2**2 > 2 * a1 * a2 + 1 + npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1 + + res = expr.where(0.1, 0.7) + expr.where(0.2, 5) + nres = np.where(npexpr, 0.1, 0.7) + np.where(npexpr, 0.2, 5) res = 2 * res + 4 * res nres = 2 * nres + 4 * nres np.testing.assert_allclose(res[:], nres) - # Reuse the result in another expression twice II + +# Reuse the result in another expression twice II +def test_where_fusion5(array_fixture): + sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture + expr = a1**2 + a2**2 > 2 * a1 * a2 + 1 + npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1 + + res = expr.where(-1, 7) + expr.where(2, 5) + nres = np.where(npexpr, -1, 7) + np.where(npexpr, 2, 5) res = 2 * res + blosc2.sqrt(res) nres = 2 * nres + np.sqrt(nres) np.testing.assert_allclose(res[:], nres) - # TODO: this is not working yet - # Reuse the result in another expression twice III - # res = expr.where(0, 1) + res - # nres = np.where(npexpr, 0, 1) + nres - # np.testing.assert_allclose(res[:], nres) + +# Reuse the result in another expression twice III +def test_where_fusion6(array_fixture): + sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture + expr = a1**2 + a2**2 > 2 * a1 * a2 + 1 + npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1 + + res = expr.where(-1, 1) + expr.where(2, 1) + nres = np.where(npexpr, -1, 1) + np.where(npexpr, 2, 1) + res = expr.where(6.1, 1) + res + nres = np.where(npexpr, 6.1, 1) + nres + np.testing.assert_allclose(res[:], nres)