From d887d83f6d74f717ad88dc2814e253f8835be345 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 7 Nov 2024 06:33:46 +0100
Subject: [PATCH 01/11] Preliminary support for numpy type inference

---
 examples/ndarray/reduce_and_enlarge.py |  27 ++--
 src/blosc2/lazyexpr.py                 | 185 +++++++++++++++++++++++--
 tests/ndarray/test_lazyexpr.py         |   5 +-
 3 files changed, 190 insertions(+), 27 deletions(-)

diff --git a/examples/ndarray/reduce_and_enlarge.py b/examples/ndarray/reduce_and_enlarge.py
index b16eff9d4..eadccc451 100644
--- a/examples/ndarray/reduce_and_enlarge.py
+++ b/examples/ndarray/reduce_and_enlarge.py
@@ -14,12 +14,14 @@
 # In particular, note how re-opening a stored expression can adapt to
 # changes in the operands.
 
+import numpy as np
+
 import blosc2
 
 # Create arrays with specific dimensions
 a = blosc2.full((2, 3, 4), 1, urlpath="a.b2nd", mode="w")  # 3D array with dimensions (2, 3, 4)
 b = blosc2.full((2, 4), 2, urlpath="b.b2nd", mode="w")  # 2D array with dimensions (2, 4)
-c = blosc2.full(4, 3, urlpath="c.b2nd", mode="w")  # 1D array with dimensions (4,)
+c = blosc2.full(4, 3, dtype=np.int8, urlpath="c.b2nd", mode="w")  # 1D array with dimensions (4,)
 
 print("Array a:", a[:])
 print("Array b:", b[:])
@@ -28,11 +30,18 @@
 # Define an expression using the arrays above
 # expression = "a.sum() + b * c"
 # expression = "a.sum(axis=1) + b * c"
-expression = "sum(a, axis=1) + b * c"
+# expression = "sum(a, axis=1) + b * c + 0"
+expression = "c + np.int8(0)"
+# expression = "sum(a, axis=1) + b * sin(c)"
 # Define the operands for the expression
-operands = {"a": a, "b": b, "c": c}
+# operands = {"a": a, "b": b, "c": c}
+operands = {"c": c}
 # Create a lazy expression
+print("expression:", expression, type(expression), operands["c"].dtype)
 lazy_expression = blosc2.lazyexpr(expression, operands)
+print(lazy_expression.shape, lazy_expression.dtype)  # Print the shape of the lazy expression
+# lazy_expression = blosc2.lazyexpr(expression)
+print(lazy_expression[:])  # Evaluate and print the result of the lazy expression (should be a 2x4 arr)
 
 # Store and reload the expressions
 url_path = "my_expr.b2nd"  # Define the path for the file
@@ -41,15 +50,5 @@
 url_path = "my_expr.b2nd"  # Define the path for the file
 lazy_expression = blosc2.open(urlpath=url_path)  # Open the saved file
 print(lazy_expression)  # Print the lazy expression
-print(lazy_expression.shape)  # Print the shape of the lazy expression
+print(lazy_expression.shape, lazy_expression.dtype)  # Print the shape of the lazy expression
 print(lazy_expression[:])  # Evaluate and print the result of the lazy expression (should be a 2x4 arr)
-
-# Enlarge the arrays and re-evaluate the expression
-a.resize((3, 3, 4))
-a[2] = 3
-b.resize((3, 4))
-b[2] = 5
-lazy_expression = blosc2.open(urlpath=url_path)  # Open the saved file
-print(lazy_expression.shape)
-result = lazy_expression.compute()
-print(result[:])
diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 0ce8d9787..8fcd265e2 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -277,7 +277,7 @@ def compute_broadcast_shape(arrays):
     Returns the shape of the outcome of an operation with the input arrays.
     """
     # When dealing with UDFs, one can arrive params that are not arrays
-    shapes = [arr.shape for arr in arrays if hasattr(arr, "shape")]
+    shapes = [arr.shape for arr in arrays if hasattr(arr, "shape") and arr is not np]
     return np.broadcast_shapes(*shapes)
 
 
@@ -338,6 +338,8 @@ def compute_smaller_slice(larger_shape, smaller_shape, larger_slice):
 
 # Define valid method names
 valid_methods = {"sum", "prod", "min", "max", "std", "mean", "var", "any", "all", "where"}
+valid_methods |= {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"}
+valid_methods |= {"float32", "float64", "complex64", "complex128"}
 
 
 def validate_expr(expr: str) -> None:
@@ -410,6 +412,48 @@ def visit_Call(self, node):
     return set(visitor.operands)
 
 
+class InferDtypeVisitor(ast.NodeVisitor):
+    def __init__(self, dtype_dict):
+        self.dtype_dict = dtype_dict
+        self.result_dtype = None
+
+    def visit_Name(self, node):
+        if node.id in self.dtype_dict:
+            if self.result_dtype is None:
+                self.result_dtype = self.dtype_dict[node.id]
+            else:
+                self.result_dtype = np.result_type(self.result_dtype, self.dtype_dict[node.id])
+
+    def visit_Constant(self, node):
+        const_dtype = np.array(node.value).dtype
+        if self.result_dtype is None:
+            self.result_dtype = const_dtype
+        else:
+            self.result_dtype = np.result_type(self.result_dtype, const_dtype)
+
+    def visit_BinOp(self, node):
+        self.visit(node.left)
+        left_dtype = self.result_dtype
+
+        self.result_dtype = None
+        self.visit(node.right)
+        right_dtype = self.result_dtype
+
+        self.result_dtype = np.result_type(left_dtype, right_dtype)
+
+    def infer_dtype(self, expr):
+        self.result_dtype = None
+        tree = ast.parse(expr, mode="eval")
+        self.visit(tree.body)
+        return self.result_dtype
+
+
+def infer_expression_dtype(expr, array_dict):
+    dtype_dict = {k: v.dtype for k, v in array_dict.items()}
+    visitor = InferDtypeVisitor(dtype_dict)
+    return visitor.infer_dtype(expr)
+
+
 def validate_inputs(inputs: dict, out=None) -> tuple:  # noqa: C901
     """Validate the inputs for the expression."""
     if len(inputs) == 0:
@@ -417,7 +461,7 @@ def validate_inputs(inputs: dict, out=None) -> tuple:  # noqa: C901
             "You need to pass at least one array.  Use blosc2.empty() if values are not really needed."
         )
 
-    inputs = [input for input in inputs.values() if hasattr(input, "shape")]
+    inputs = [input for input in inputs.values() if hasattr(input, "shape") and input is not np]
     shape = compute_broadcast_shape(inputs)
 
     if not all(np.array_equal(shape, input.shape) for input in inputs):
@@ -1893,7 +1937,7 @@ def info_items(self):
         items += [("dtype", self.dtype)]
         return items
 
-    def save(self, urlpath=None, **kwargs):
+    def _save(self, urlpath=None, **kwargs):
         if urlpath is None:
             raise ValueError("To save a LazyArray you must provide an urlpath")
 
@@ -1918,6 +1962,9 @@ def save(self, urlpath=None, **kwargs):
                     "urlbase": value.urlbase,
                 }
                 continue
+            if key in {"numpy", "np"}:
+                # Provide access to cast funcs like int8 et al.
+                continue
             if isinstance(value, blosc2.Proxy):
                 # Take the required info from the Proxy._cache container
                 value = value._cache
@@ -1934,6 +1981,57 @@ def save(self, urlpath=None, **kwargs):
             "operands": operands,
         }
 
+    def save(self, urlpath=None, **kwargs):
+        if urlpath is None:
+            raise ValueError("To save a LazyArray you must provide an urlpath")
+
+        expression = self.expression_tosave if hasattr(self, "expression_tosave") else self.expression
+        operands_ = self.operands_tosave if hasattr(self, "operands_tosave") else self.operands
+        # Validate expression
+        validate_expr(expression)
+
+        meta = kwargs.get("meta", {})
+        meta["LazyArray"] = LazyArrayEnum.Expr.value
+        kwargs["urlpath"] = urlpath
+        kwargs["meta"] = meta
+        kwargs["mode"] = "w"  # always overwrite the file in urlpath
+
+        # Create an empty array; useful for providing the shape and dtype of the outcome
+        array = blosc2.empty(shape=self.shape, dtype=self.dtype, **kwargs)
+
+        # Save the expression and operands in the metadata
+        operands = {}
+        print(operands_)
+        for key, value in operands_.items():
+            if isinstance(value, blosc2.C2Array):
+                operands[key] = {
+                    "path": str(value.path),
+                    "urlbase": value.urlbase,
+                }
+                continue
+            # TODO: can this be removed?
+            if key in {"numpy", "np"}:
+                # Provide access to cast funcs like int8 et al.
+                continue
+            if isinstance(value, blosc2.Proxy):
+                # Take the required info from the Proxy._cache container
+                value = value._cache
+            if not hasattr(value, "schunk"):
+                print(f"expression: {expression}")
+                raise ValueError(
+                    "To save a LazyArray, all operands must be blosc2.NDArray or blosc2.C2Array objects"
+                )
+            if value.schunk.urlpath is None:
+                raise ValueError("To save a LazyArray, all operands must be stored on disk/network")
+            operands[key] = value.schunk.urlpath
+        print("expression:", expression)
+        print("operands:", operands)
+        array.schunk.vlmeta["_LazyArray"] = {
+            "expression": expression,
+            "UDF": None,
+            "operands": operands,
+        }
+
     @classmethod
     def _new_expr(cls, expression, operands, guess, out=None, where=None):
         # Validate the expression
@@ -1941,14 +2039,23 @@ def _new_expr(cls, expression, operands, guess, out=None, where=None):
         if guess:
             # The expression has been validated, so we can evaluate it
             # in guessing mode to avoid computing reductions
+            # Extract possible numpy scalars
+            _expression, local_vars = transform_expression(expression)
+            # Let's include numpy and blosc2 as operands so that some functions can be used
+            # Most in particular, castings like np.int8 et al. can be very useful to allow
+            # for desired data types in the output.
+            _operands = operands | local_vars
             _globals = {func: getattr(blosc2, func) for func in functions if func in expression}
-            new_expr = eval(expression, _globals, operands)
-            _dtype = new_expr.dtype
+            _globals |= {"np": np, "numpy": np}
+            new_expr = eval(_expression, _globals, _operands)
+            _dtype = infer_expression_dtype(_expression, _operands)
             _shape = new_expr.shape
             if isinstance(new_expr, blosc2.LazyExpr):
                 # Restore the original expression and operands
-                new_expr.expression = expression
-                new_expr.operands = operands
+                new_expr.expression = _expression
+                new_expr.expression_tosave = expression
+                new_expr.operands = _operands
+                new_expr.operands_tosave = operands
             else:
                 # An immediate evaluation happened (e.g. all operands are numpy arrays)
                 new_expr = cls(None)
@@ -2093,6 +2200,46 @@ def save(self, **kwargs):
         raise NotImplementedError("For safety reasons, this is not implemented for UDFs")
 
 
+class TransformNumpyCalls(ast.NodeTransformer):
+    def __init__(self):
+        self.replacements = {}
+        self.tmp_counter = 0
+
+    def visit_Call(self, node):
+        # Check if the call is a numpy type-casting call
+        if (
+            isinstance(node.func, ast.Attribute)
+            and isinstance(node.func.value, ast.Name)
+            and node.func.value.id in ["np", "numpy"]
+            and isinstance(node.args[0], ast.Constant)
+        ):
+            # Create a new temporary variable name
+            tmp_var = f"tmp{self.tmp_counter}"
+            self.tmp_counter += 1
+
+            # Evaluate the type-casting call to create the new variable's value
+            numpy_type = getattr(np, node.func.attr)
+            self.replacements[tmp_var] = numpy_type(node.args[0].value)
+
+            # Replace the call node with a variable node
+            return ast.copy_location(ast.Name(id=tmp_var, ctx=ast.Load()), node)
+        return self.generic_visit(node)
+
+
+def transform_expression(expr: str):
+    # Parse the expression into an AST
+    tree = ast.parse(expr, mode="eval")
+
+    # Transform the AST
+    transformer = TransformNumpyCalls()
+    transformed_tree = transformer.visit(tree)
+
+    # Generate the modified expression
+    transformed_expr = ast.unparse(transformed_tree)
+
+    return transformed_expr, transformer.replacements
+
+
 def _open_lazyarray(array):
     value = array.schunk.meta["LazyArray"]
     if value == LazyArrayEnum.UDF.value:
@@ -2119,16 +2266,31 @@ def _open_lazyarray(array):
 
     expr = lazyarray["expression"]
     globals = {func: getattr(blosc2, func) for func in functions if func in expr}
+    globals |= {"np": np, "numpy": np}
     # Validate the expression (prevent security issues)
     validate_expr(expr)
+    # Extract possible numpy scalars
+    # _expr, local_vars = transform_expression(expr)
+    # operands_dict |= local_vars
+    # Extract possible numpy scalars
+    _expression, local_vars = transform_expression(expr)
+    # Let's include numpy and blosc2 as operands so that some functions can be used
+    # Most in particular, castings like np.int8 et al. can be very useful to allow
+    # for desired data types in the output.
+    # local_vars |= {"np": np, "numpy": np}
+    _operands = operands_dict | local_vars
     # Create the expression as such
-    new_expr = eval(expr, globals, operands_dict)
-    _dtype = new_expr.dtype
+    new_expr = eval(_expression, globals, _operands)
+    _dtype = infer_expression_dtype(_expression, _operands)
+    # _dtype = new_expr.dtype
     _shape = new_expr.shape
     if isinstance(new_expr, blosc2.LazyExpr):
         # Restore the original expression and operands
-        new_expr.expression = expr
-        new_expr.operands = operands_dict
+        print("new_expr:", new_expr.expression, expr, operands_dict)
+        new_expr.expression = _expression
+        new_expr.expression_tosave = expr
+        new_expr.operands = _operands
+        new_expr.operands_tosave = operands_dict
         # Make the array info available for the user (only available when opened from disk)
         new_expr.array = array
         # We want to expose schunk too, so that .info() can be used on the LazyArray
@@ -2317,6 +2479,7 @@ def lazyexpr(
             where_args = {"_where_x": where[0], "_where_y": where[1]}
             expression._where_args = where_args
         return expression
+
     if operands is None:
         # Try to get operands from variables in the stack
         operands = get_expr_operands(expression)
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 5dd16da00..1e79ce645 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -589,11 +589,12 @@ def test_save_unsafe():
     assert expr.expression in str(excinfo.value)
 
     # Check that an invalid expression cannot be easily saved.
-    # As this can easily be worked around, the best protection is
+    # Although, as this can easily be worked around, the best protection is
     # during loading time (tested above).
+    expr.expression_tosave = "import os; os.system('touch /tmp/unsafe')"
     with pytest.raises(ValueError) as excinfo:
         expr.save(urlpath=urlpath)
-    assert expr.expression in str(excinfo.value)
+    assert expr.expression_tosave in str(excinfo.value)
 
     for urlpath in disk_arrays:
         blosc2.remove_urlpath(urlpath)

From 2d615cc43109591c69fc4b456b9a2b4cb8bd6e57 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Thu, 7 Nov 2024 07:41:22 +0100
Subject: [PATCH 02/11] Initial test

---
 src/blosc2/lazyexpr.py         | 22 +++++++++++--------
 tests/ndarray/test_lazyexpr.py | 40 ++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 8fcd265e2..b62c7801d 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -747,6 +747,7 @@ def fast_eval(  # noqa: C901
         The output array.
     """
     out = kwargs.pop("_output", None)
+    dtype = kwargs.pop("dtype", None)
     where: dict | None = kwargs.pop("_where_args", None)
     if isinstance(out, blosc2.NDArray):
         # If 'out' has been passed, and is a NDArray, use it as the base array
@@ -817,11 +818,9 @@ def fast_eval(  # noqa: C901
             if out is None:
                 # We can enter here when using any of the eval() or __getitem__() methods
                 if getitem:
-                    out = np.empty(shape, dtype=result.dtype)
+                    out = np.empty(shape, dtype=dtype)
                 else:
-                    out = blosc2.empty(
-                        shape, chunks=chunks, blocks=basearr.blocks, dtype=result.dtype, **kwargs
-                    )
+                    out = blosc2.empty(shape, chunks=chunks, blocks=basearr.blocks, dtype=dtype, **kwargs)
 
         # Store the result in the output array
         if getitem:
@@ -872,6 +871,7 @@ def slices_eval(  # noqa: C901
     out = kwargs.pop("_output", None)
     chunks = kwargs.get("chunks")
     where: dict | None = kwargs.pop("_where_args", None)
+    dtype = kwargs.pop("dtype", None)
     # Compute the shape and chunks of the output array, including broadcasting
     shape = compute_broadcast_shape(operands.values())
 
@@ -972,23 +972,24 @@ def slices_eval(  # noqa: C901
                 # The result is a linear array
                 shape_ = math.prod(shape)
             if getitem:
-                out = np.empty(shape_, dtype=result.dtype)
+                out = np.empty(shape_, dtype=dtype)
             else:
                 if "chunks" not in kwargs and (where is None or len(where) == 2):
                     # Let's use the same chunks as the first operand (it could have been automatic too)
-                    out = blosc2.empty(shape_, chunks=chunks, dtype=result.dtype, **kwargs)
+                    out = blosc2.empty(shape_, chunks=chunks, dtype=dtype, **kwargs)
                 elif "chunks" in kwargs and (where is not None and len(where) < 2 and len(shape_) > 1):
                     # Remove the chunks argument if the where condition is not a tuple with two elements
                     kwargs.pop("chunks")
-                    out = blosc2.empty(shape_, dtype=result.dtype, **kwargs)
+                    out = blosc2.empty(shape_, dtype=dtype, **kwargs)
                 else:
-                    out = blosc2.empty(shape_, dtype=result.dtype, **kwargs)
+                    out = blosc2.empty(shape_, dtype=dtype, **kwargs)
                 # Check if the in out partitions are well-behaved (i.e. no padding)
                 behaved = blosc2.are_partitions_behaved(out.shape, out.chunks, out.blocks)
 
         if where is None or len(where) == 2:
             if behaved:
                 # Fast path
+                result = np.asarray(result, dtype=dtype)
                 out.schunk.update_data(nchunk, result, copy=False)
             else:
                 out[slice_] = result
@@ -1051,7 +1052,8 @@ def reduce_slices(  # noqa: C901
     reduce_op = reduce_args.pop("op")
     axis = reduce_args["axis"]
     keepdims = reduce_args["keepdims"]
-    dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None
+    # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None
+    dtype = kwargs.get("dtype")
 
     # Compute the shape and chunks of the output array, including broadcasting
     shape = compute_broadcast_shape(operands.values())
@@ -1906,6 +1908,8 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray:
             kwargs["_output"] = self._output
         if hasattr(self, "_where_args"):
             kwargs["_where_args"] = self._where_args
+        if hasattr(self, "_dtype"):
+            kwargs["dtype"] = self._dtype
         return self._compute_expr(item, kwargs)
 
     def __getitem__(self, item):
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 1e79ce645..c5fcd1dc4 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -975,3 +975,43 @@ def test_fill_disk_operands(chunks, blocks, disk, fill_value):
 )
 def test_get_expr_operands(expression, expected_operands):
     assert blosc2.get_expr_operands(expression) == set(expected_operands)
+
+
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        "np.int8(0)",
+        # "np.uint16(0)",
+    ],
+)
+@pytest.mark.parametrize(
+    ("dtype1", "dtype2"),
+    [
+        (np.int8, np.int8),
+        # (np.int8, np.int16),
+        # (np.int8, np.int32),
+        # (np.int8, np.int64),
+        # (np.int8, np.float32),
+        # (np.int8, np.float64),
+        # (np.uint16, np.uint16),
+        # (np.uint16, np.uint32),
+        # (np.uint16, np.uint64),
+        # (np.uint16, np.float32),
+        # (np.uint16, np.float64),
+        # (np.float32, np.float32),
+        # (np.float32, np.float64),
+    ],
+)
+def test_dtype_infer(dtype1, dtype2, scalar):
+    shape = (5, 10)
+    na = np.linspace(0, 1, np.prod(shape), dtype=dtype1).reshape(shape)
+    nb = np.linspace(1, 2, np.prod(shape), dtype=dtype2).reshape(shape)
+    a = blosc2.asarray(na)
+    b = blosc2.asarray(nb)
+
+    expr = blosc2.lazyexpr(f"a + b * {scalar}", operands={"a": a, "b": b})
+    nres = na + nb * eval(scalar)
+    # res = expr[()]. # TODO
+    res = expr.compute()
+    np.testing.assert_allclose(res[()], nres)
+    assert res.dtype == nres.dtype

From e995529bbee3c62a7603164358c359aaa2230fa9 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Fri, 8 Nov 2024 18:06:44 +0100
Subject: [PATCH 03/11] Use actual numpy/numexpr type inference instead of
 naive infer_expression_dtype()

---
 src/blosc2/lazyexpr.py                | 98 +++++++++++----------------
 src/blosc2/ndarray.py                 |  9 ++-
 tests/ndarray/test_lazyexpr.py        | 26 +++----
 tests/ndarray/test_lazyexpr_fields.py | 11 ++-
 4 files changed, 67 insertions(+), 77 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index b62c7801d..6fe5eec5e 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -24,6 +24,8 @@
 from queue import Empty, Queue
 from typing import TYPE_CHECKING
 
+from numpy.exceptions import ComplexWarning
+
 if TYPE_CHECKING:
     from collections.abc import Callable, Sequence
 
@@ -412,48 +414,6 @@ def visit_Call(self, node):
     return set(visitor.operands)
 
 
-class InferDtypeVisitor(ast.NodeVisitor):
-    def __init__(self, dtype_dict):
-        self.dtype_dict = dtype_dict
-        self.result_dtype = None
-
-    def visit_Name(self, node):
-        if node.id in self.dtype_dict:
-            if self.result_dtype is None:
-                self.result_dtype = self.dtype_dict[node.id]
-            else:
-                self.result_dtype = np.result_type(self.result_dtype, self.dtype_dict[node.id])
-
-    def visit_Constant(self, node):
-        const_dtype = np.array(node.value).dtype
-        if self.result_dtype is None:
-            self.result_dtype = const_dtype
-        else:
-            self.result_dtype = np.result_type(self.result_dtype, const_dtype)
-
-    def visit_BinOp(self, node):
-        self.visit(node.left)
-        left_dtype = self.result_dtype
-
-        self.result_dtype = None
-        self.visit(node.right)
-        right_dtype = self.result_dtype
-
-        self.result_dtype = np.result_type(left_dtype, right_dtype)
-
-    def infer_dtype(self, expr):
-        self.result_dtype = None
-        tree = ast.parse(expr, mode="eval")
-        self.visit(tree.body)
-        return self.result_dtype
-
-
-def infer_expression_dtype(expr, array_dict):
-    dtype_dict = {k: v.dtype for k, v in array_dict.items()}
-    visitor = InferDtypeVisitor(dtype_dict)
-    return visitor.infer_dtype(expr)
-
-
 def validate_inputs(inputs: dict, out=None) -> tuple:  # noqa: C901
     """Validate the inputs for the expression."""
     if len(inputs) == 0:
@@ -824,9 +784,15 @@ def fast_eval(  # noqa: C901
 
         # Store the result in the output array
         if getitem:
-            out[slice_] = result
+            try:
+                out[slice_] = result
+            except ComplexWarning:
+                # The result is a complex number, so we need to convert it to real.
+                # This is a workaround for rigidness of NumExpr with type casting.
+                result = result.real.astype(out.dtype)
+                out[slice_] = result
         else:
-            if behaved and result.shape == chunks_:
+            if behaved and result.shape == chunks_ and result.dtype == out.dtype:
                 # Fast path only works for results that are full chunks
                 out.schunk.update_data(nchunk, result, copy=False)
             else:
@@ -1052,8 +1018,8 @@ def reduce_slices(  # noqa: C901
     reduce_op = reduce_args.pop("op")
     axis = reduce_args["axis"]
     keepdims = reduce_args["keepdims"]
-    # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None
-    dtype = kwargs.get("dtype")
+    dtype = kwargs.pop("dtype", None)
+    # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else _dtype
 
     # Compute the shape and chunks of the output array, including broadcasting
     shape = compute_broadcast_shape(operands.values())
@@ -1205,8 +1171,8 @@ def reduce_slices(  # noqa: C901
             result = reduce_op.value.reduce(result, **reduce_args)
 
         if out is None:
-            if dtype is None:
-                dtype = result.dtype
+            # if dtype is None:
+            #     dtype = result.dtype
             if is_inside_eval():
                 # We already have the dtype and reduced_shape, so return immediately
                 # Use a blosc2 container, as it consumes less memory in general
@@ -1594,7 +1560,18 @@ def dtype(self):
             # In some situations, we already know the dtype
             return self._dtype
         operands = {key: np.ones(1, dtype=value.dtype) for key, value in self.operands.items()}
-        _out = ne.evaluate(self.expression, local_dict=operands)
+        if "contains" in self.expression:
+            _out = ne.evaluate(self.expression, local_dict=operands)
+        else:
+            # Create a globals dict with the functions of numpy
+            globals_dict = {f: getattr(np, f) for f in functions if f != "contains"}
+            try:
+                _out = eval(self.expression, globals_dict, operands)
+            except RuntimeWarning:
+                # Sometimes, numpy gets a RuntimeWarning when evaluating expressions
+                # with synthetic operands (1's). Let's try with numexpr, which is not so picky
+                # about this.
+                _out = ne.evaluate(self.expression, local_dict=operands)
         return _out.dtype
 
     @property
@@ -1770,17 +1747,26 @@ def __ge__(self, value):
     def where(self, value1=None, value2=None):
         if self.dtype != np.bool_:
             raise ValueError("where() can only be used with boolean expressions")
+        dtype = self.dtype
         # This just acts as a 'decorator' for the existing expression
         if value1 is not None and value2 is not None:
+            # Guess the outcome dtype for value1 and value2
+            dtype = np.result_type(np.asarray(value1), np.asarray(value2))
             args = {"_where_x": value1, "_where_y": value2}
         elif value1 is not None:
+            dtype = np.asarray(value1).dtype
             args = {"_where_x": value1}
         elif value2 is not None:
             raise ValueError("where() requires value1 when using value2")
         else:
             args = {}
-        self._where_args = args
-        return self
+        # Create a new expression
+        new_expr = blosc2.LazyExpr(new_op=(self, None, None))
+        new_expr.expression = self.expression
+        new_expr.operands = self.operands
+        new_expr._where_args = args
+        new_expr._dtype = dtype
+        return new_expr
 
     def sum(self, axis=None, dtype=None, keepdims=False, **kwargs):
         reduce_args = {
@@ -1908,8 +1894,7 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray:
             kwargs["_output"] = self._output
         if hasattr(self, "_where_args"):
             kwargs["_where_args"] = self._where_args
-        if hasattr(self, "_dtype"):
-            kwargs["dtype"] = self._dtype
+        kwargs["dtype"] = self.dtype
         return self._compute_expr(item, kwargs)
 
     def __getitem__(self, item):
@@ -2052,7 +2037,7 @@ def _new_expr(cls, expression, operands, guess, out=None, where=None):
             _globals = {func: getattr(blosc2, func) for func in functions if func in expression}
             _globals |= {"np": np, "numpy": np}
             new_expr = eval(_expression, _globals, _operands)
-            _dtype = infer_expression_dtype(_expression, _operands)
+            _dtype = new_expr.dtype
             _shape = new_expr.shape
             if isinstance(new_expr, blosc2.LazyExpr):
                 # Restore the original expression and operands
@@ -2283,10 +2268,9 @@ def _open_lazyarray(array):
     # for desired data types in the output.
     # local_vars |= {"np": np, "numpy": np}
     _operands = operands_dict | local_vars
-    # Create the expression as such
+    # Create the expression as such, but in guessing mode
     new_expr = eval(_expression, globals, _operands)
-    _dtype = infer_expression_dtype(_expression, _operands)
-    # _dtype = new_expr.dtype
+    _dtype = new_expr.dtype
     _shape = new_expr.shape
     if isinstance(new_expr, blosc2.LazyExpr):
         # Restore the original expression and operands
diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py
index b9b5734fb..6408b5ebc 100644
--- a/src/blosc2/ndarray.py
+++ b/src/blosc2/ndarray.py
@@ -13,6 +13,8 @@
 from collections import namedtuple
 from typing import TYPE_CHECKING, NamedTuple
 
+from numpy.exceptions import ComplexWarning
+
 if TYPE_CHECKING:
     from collections.abc import Iterator, Sequence
 
@@ -1133,7 +1135,12 @@ def __setitem__(self, key: int | slice | Sequence[slice], value: object):
             value = np.full(shape, value, dtype=self.dtype)
         elif isinstance(value, np.ndarray):
             if value.dtype != self.dtype:
-                raise ValueError("The dtype of the value should be the same as the array")
+                try:
+                    value = value.astype(self.dtype)
+                except ComplexWarning:
+                    # numexpr type inference can lead to unnecessary type promotions
+                    # when using complex functions (e.g. conj) with real arrays
+                    value = value.real.astype(self.dtype)
             if value.shape == ():
                 value = np.full(shape, value, dtype=self.dtype)
         elif isinstance(value, NDArray):
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index c5fcd1dc4..15fe1d19e 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -981,25 +981,25 @@ def test_get_expr_operands(expression, expected_operands):
     "scalar",
     [
         "np.int8(0)",
-        # "np.uint16(0)",
+        "np.uint16(0)",
     ],
 )
 @pytest.mark.parametrize(
     ("dtype1", "dtype2"),
     [
         (np.int8, np.int8),
-        # (np.int8, np.int16),
-        # (np.int8, np.int32),
-        # (np.int8, np.int64),
-        # (np.int8, np.float32),
-        # (np.int8, np.float64),
-        # (np.uint16, np.uint16),
-        # (np.uint16, np.uint32),
-        # (np.uint16, np.uint64),
-        # (np.uint16, np.float32),
-        # (np.uint16, np.float64),
-        # (np.float32, np.float32),
-        # (np.float32, np.float64),
+        (np.int8, np.int16),
+        (np.int8, np.int32),
+        (np.int8, np.int64),
+        (np.int8, np.float32),
+        (np.int8, np.float64),
+        (np.uint16, np.uint16),
+        (np.uint16, np.uint32),
+        (np.uint16, np.uint64),
+        (np.uint16, np.float32),
+        (np.uint16, np.float64),
+        (np.float32, np.float32),
+        (np.float32, np.float64),
     ],
 )
 def test_dtype_infer(dtype1, dtype2, scalar):
diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py
index 97f7be6f9..3873c206d 100644
--- a/tests/ndarray/test_lazyexpr_fields.py
+++ b/tests/ndarray/test_lazyexpr_fields.py
@@ -276,8 +276,8 @@ def test_where_fusion(array_fixture):
 
     # Two where() calls with a reduction (and using broadcasting)
     axis = None if sa1.ndim == 1 else 1
-    res = expr.where(0, 1) + expr.where(0, 1).sum(axis=axis)
-    nres = np.where(npexpr, 0, 1) + np.where(npexpr, 0, 1).sum(axis=axis)
+    res = expr.where(0.5, 0.2) + expr.where(0.3, 0.6).sum(axis=axis)
+    nres = np.where(npexpr, 0.5, 0.2) + np.where(npexpr, 0.3, 0.6).sum(axis=axis)
     np.testing.assert_allclose(res[:], nres)
 
     # Reuse the result in another expression
@@ -290,11 +290,10 @@ def test_where_fusion(array_fixture):
     nres = 2 * nres + 4 * nres
     np.testing.assert_allclose(res[:], nres)
 
-    # TODO: this is not working yet
     # Reuse the result in another expression twice II
-    # res = 2 * res + blosc2.sqrt(res)
-    # nres = 2 * nres + nres.sqrt()
-    # np.testing.assert_allclose(res[:], nres)
+    res = 2 * res + blosc2.sqrt(res)
+    nres = 2 * nres + np.sqrt(nres)
+    np.testing.assert_allclose(res[:], nres)
 
     # TODO: this is not working yet
     # Reuse the result in another expression twice III

From e0ef9116f1ddb122d717a31c882de91fc364c1d4 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 07:16:20 +0100
Subject: [PATCH 04/11] Fix dtype fo sum and prod reductions

---
 src/blosc2/lazyexpr.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 6fe5eec5e..d5a94443d 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1018,8 +1018,12 @@ def reduce_slices(  # noqa: C901
     reduce_op = reduce_args.pop("op")
     axis = reduce_args["axis"]
     keepdims = reduce_args["keepdims"]
-    dtype = kwargs.pop("dtype", None)
-    # dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else _dtype
+    # For sum and prod, we can specify the dtype of the output array
+    dtype = reduce_args["dtype"] if reduce_op in (ReduceOp.SUM, ReduceOp.PROD) else None
+    if dtype is None:
+        dtype = kwargs.pop("dtype", None)
+    else:
+        del kwargs["dtype"]
 
     # Compute the shape and chunks of the output array, including broadcasting
     shape = compute_broadcast_shape(operands.values())

From fea712e06322f8820d9d8966e82efe341f2aeaf9 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 11:19:14 +0100
Subject: [PATCH 05/11] Fix for inferring types when scalars are in expressions

---
 src/blosc2/lazyexpr.py         | 24 ++++++++++++------------
 tests/ndarray/test_lazyexpr.py | 28 +++++++++++++++++++++++-----
 2 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index d5a94443d..d30cd4ac8 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -753,13 +753,15 @@ def fast_eval(  # noqa: C901
             operands, slice_, chunks_, full_chunk, aligned, nchunk, iter_disk, chunk_operands
         )
 
-        if isinstance(out, np.ndarray) and not where:
-            # Fast path: put the result straight in the output array (avoiding a memory copy)
-            if callable(expression):
-                expression(tuple(chunk_operands.values()), out[slice_], offset=offset)
-            else:
-                ne.evaluate(expression, chunk_operands, out=out[slice_])
-            continue
+        # Since ne.evaluate() can return a dtype larger than the one in computed in the expression,
+        # we cannot take this fast path
+        # if isinstance(out, np.ndarray) and not where:
+        #     # Fast path: put the result straight in the output array (avoiding a memory copy)
+        #     if callable(expression):
+        #         expression(tuple(chunk_operands.values()), out[slice_], offset=offset)
+        #     else:
+        #         ne.evaluate(expression, chunk_operands, out=out[slice_])
+        #     continue
         if callable(expression):
             result = np.empty(chunks_, dtype=out.dtype)
             expression(tuple(chunk_operands.values()), result, offset=offset)
@@ -1436,6 +1438,7 @@ def __init__(self, new_op):  # noqa: C901
                 self.expression = f"{op}(o0, o1)"
             return
 
+        self._dtype = np.result_type(value1, value2)
         if np.isscalar(value1) and np.isscalar(value2):
             self.expression = f"({value1} {op} {value2})"
         elif np.isscalar(value2):
@@ -1511,6 +1514,7 @@ def update_expr(self, new_op):  # noqa: C901
             value1 = value1.compute()
         if hasattr(value2, "_where_args"):
             value2 = value2.compute()
+        self._dtype = np.result_type(value1, value2)
         if not isinstance(value1, LazyExpr) and not isinstance(value2, LazyExpr):
             # We converted some of the operands to NDArray (where() handling above)
             new_operands = {"o0": value1, "o1": value2}
@@ -1903,11 +1907,7 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray:
 
     def __getitem__(self, item):
         kwargs = {"_getitem": True}
-        if hasattr(self, "_output"):
-            kwargs["_output"] = self._output
-        if hasattr(self, "_where_args"):
-            kwargs["_where_args"] = self._where_args
-        return self._compute_expr(item, kwargs)
+        return self.compute(item, **kwargs)
 
     def __str__(self):
         return f"{self.expression}"
diff --git a/tests/ndarray/test_lazyexpr.py b/tests/ndarray/test_lazyexpr.py
index 15fe1d19e..158134783 100644
--- a/tests/ndarray/test_lazyexpr.py
+++ b/tests/ndarray/test_lazyexpr.py
@@ -297,9 +297,9 @@ def test_functions(function, dtype_fixture, shape_fixture):
     # Compare the results
     np.testing.assert_allclose(res_lazyexpr[:], res_numexpr)
 
-    # For some reason real is not supported by numpy's assert_allclose
-    # (TypeError: bad operand type for abs(): 'LazyExpr')
-    if function == "real":
+    # For some reason real and imag are not supported by numpy's assert_allclose
+    # (TypeError: bad operand type for abs(): 'LazyExpr' and segfaults are observed)
+    if function in ("real", "imag"):
         return
 
     # Using numpy functions
@@ -981,7 +981,16 @@ def test_get_expr_operands(expression, expected_operands):
     "scalar",
     [
         "np.int8(0)",
+        "np.uint8(0)",
+        "np.int16(0)",
         "np.uint16(0)",
+        "np.int32(0)",
+        "np.uint32(0)",
+        "np.int64(0)",
+        "np.float32(0)",
+        "np.float64(0)",
+        "np.complex64(0)",
+        "np.complex128(0)",
     ],
 )
 @pytest.mark.parametrize(
@@ -995,11 +1004,15 @@ def test_get_expr_operands(expression, expected_operands):
         (np.int8, np.float64),
         (np.uint16, np.uint16),
         (np.uint16, np.uint32),
-        (np.uint16, np.uint64),
+        # (np.uint16, np.uint64), # numexpr does not support uint64
         (np.uint16, np.float32),
         (np.uint16, np.float64),
+        (np.int32, np.int32),
+        (np.int32, np.int64),
         (np.float32, np.float32),
         (np.float32, np.float64),
+        (np.complex64, np.complex64),
+        (np.complex64, np.complex128),
     ],
 )
 def test_dtype_infer(dtype1, dtype2, scalar):
@@ -1009,9 +1022,14 @@ def test_dtype_infer(dtype1, dtype2, scalar):
     a = blosc2.asarray(na)
     b = blosc2.asarray(nb)
 
+    # Using compute()
     expr = blosc2.lazyexpr(f"a + b * {scalar}", operands={"a": a, "b": b})
     nres = na + nb * eval(scalar)
-    # res = expr[()]. # TODO
     res = expr.compute()
     np.testing.assert_allclose(res[()], nres)
     assert res.dtype == nres.dtype
+
+    # Using __getitem__
+    res = expr[()]
+    np.testing.assert_allclose(res, nres)
+    assert res.dtype == nres.dtype

From 4b64f0c105df17e922030fedfe44a1df08de18fb Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 11:47:20 +0100
Subject: [PATCH 06/11] Fix remaining tests on fields in lazy expressions

---
 src/blosc2/lazyexpr.py | 8 +++++---
 src/blosc2/proxy.py    | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index d30cd4ac8..5c08c513b 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1755,14 +1755,16 @@ def __ge__(self, value):
     def where(self, value1=None, value2=None):
         if self.dtype != np.bool_:
             raise ValueError("where() can only be used with boolean expressions")
-        dtype = self.dtype
         # This just acts as a 'decorator' for the existing expression
         if value1 is not None and value2 is not None:
             # Guess the outcome dtype for value1 and value2
-            dtype = np.result_type(np.asarray(value1), np.asarray(value2))
+            dtype = np.result_type(value1, value2)
             args = {"_where_x": value1, "_where_y": value2}
         elif value1 is not None:
-            dtype = np.asarray(value1).dtype
+            if hasattr(value1, "dtype"):
+                dtype = value1.dtype
+            else:
+                dtype = np.asarray(value1).dtype
             args = {"_where_x": value1}
         elif value2 is not None:
             raise ValueError("where() requires value1 when using value2")
diff --git a/src/blosc2/proxy.py b/src/blosc2/proxy.py
index 6129ca94a..2203d1f8a 100644
--- a/src/blosc2/proxy.py
+++ b/src/blosc2/proxy.py
@@ -512,8 +512,8 @@ class ProxyNDField(blosc2.Operand):
     def __init__(self, proxy: Proxy, field: str):
         self.proxy = proxy
         self.field = field
+        self.dtype = proxy.dtype[field]
         self.shape = proxy.shape
-        self.dtype = proxy.dtype
 
     def __getitem__(self, item: slice | list[slice]) -> np.ndarray:
         """

From ac3e16c60bb5a05583c8bc6d19a270d0804cb549 Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 12:46:27 +0100
Subject: [PATCH 07/11] Assignments with values with different dtype are
 allowed now

---
 tests/ndarray/test_setitem.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/ndarray/test_setitem.py b/tests/ndarray/test_setitem.py
index 41643ec44..feab7ed0a 100644
--- a/tests/ndarray/test_setitem.py
+++ b/tests/ndarray/test_setitem.py
@@ -62,5 +62,6 @@ def test_setitem_different_dtype(shape, slices):
     nparray = np.arange(size, dtype=np.int32).reshape(shape)
     a = blosc2.empty(nparray.shape, dtype=np.float64)
 
-    with pytest.raises(ValueError):
-        a[slices] = nparray
+    a[slices] = nparray[slices]
+    nparray_ = nparray.astype(a.dtype)
+    np.testing.assert_almost_equal(a[slices], nparray_[slices])

From e3dfd45f3d9ef7f3c01903a1d7fbba6eeb73ec7c Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 13:02:08 +0100
Subject: [PATCH 08/11] Cache LazyExpr.dtype, but not .shape

---
 src/blosc2/lazyexpr.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 5c08c513b..84c19da7a 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -1567,6 +1567,13 @@ def dtype(self):
         if hasattr(self, "_dtype"):
             # In some situations, we already know the dtype
             return self._dtype
+        if (
+            hasattr(self, "_dtype_")
+            and hasattr(self, "_expression_")
+            and self._expression_ == self.expression
+        ):
+            # Use the cached dtype
+            return self._dtype_
         operands = {key: np.ones(1, dtype=value.dtype) for key, value in self.operands.items()}
         if "contains" in self.expression:
             _out = ne.evaluate(self.expression, local_dict=operands)
@@ -1580,13 +1587,13 @@ def dtype(self):
                 # with synthetic operands (1's). Let's try with numexpr, which is not so picky
                 # about this.
                 _out = ne.evaluate(self.expression, local_dict=operands)
-        return _out.dtype
+        self._dtype_ = _out.dtype
+        self._expression_ = self.expression
+        return self._dtype_
 
     @property
     def shape(self):
-        if hasattr(self, "_shape"):
-            # In some situations, we already know the shape
-            return self._shape
+        # Operands shape can change, so we always need to recompute this
         _shape, chunks, blocks, fast_path = validate_inputs(self.operands)
         if fast_path:
             # fast_path ensure that all the operands have the same partitions

From 7ec8f12cf17680f95b7421bbc167ddb65a2a66ce Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 13:35:18 +0100
Subject: [PATCH 09/11] Code cleaning

---
 src/blosc2/lazyexpr.py | 123 ++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 68 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index 84c19da7a..a266d8a83 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -342,6 +342,7 @@ def compute_smaller_slice(larger_shape, smaller_shape, larger_slice):
 valid_methods = {"sum", "prod", "min", "max", "std", "mean", "var", "any", "all", "where"}
 valid_methods |= {"int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64"}
 valid_methods |= {"float32", "float64", "complex64", "complex128"}
+valid_methods |= {"bool", "str", "bytes"}
 
 
 def validate_expr(expr: str) -> None:
@@ -414,6 +415,46 @@ def visit_Call(self, node):
     return set(visitor.operands)
 
 
+class TransformNumpyCalls(ast.NodeTransformer):
+    def __init__(self):
+        self.replacements = {}
+        self.tmp_counter = 0
+
+    def visit_Call(self, node):
+        # Check if the call is a numpy type-casting call
+        if (
+            isinstance(node.func, ast.Attribute)
+            and isinstance(node.func.value, ast.Name)
+            and node.func.value.id in ["np", "numpy"]
+            and isinstance(node.args[0], ast.Constant)
+        ):
+            # Create a new temporary variable name
+            tmp_var = f"tmp{self.tmp_counter}"
+            self.tmp_counter += 1
+
+            # Evaluate the type-casting call to create the new variable's value
+            numpy_type = getattr(np, node.func.attr)
+            self.replacements[tmp_var] = numpy_type(node.args[0].value)
+
+            # Replace the call node with a variable node
+            return ast.copy_location(ast.Name(id=tmp_var, ctx=ast.Load()), node)
+        return self.generic_visit(node)
+
+
+def extract_numpy_scalars(expr: str):
+    # Parse the expression into an AST
+    tree = ast.parse(expr, mode="eval")
+
+    # Transform the AST
+    transformer = TransformNumpyCalls()
+    transformed_tree = transformer.visit(tree)
+
+    # Generate the modified expression
+    transformed_expr = ast.unparse(transformed_tree)
+
+    return transformed_expr, transformer.replacements
+
+
 def validate_inputs(inputs: dict, out=None) -> tuple:  # noqa: C901
     """Validate the inputs for the expression."""
     if len(inputs) == 0:
@@ -1177,8 +1218,6 @@ def reduce_slices(  # noqa: C901
             result = reduce_op.value.reduce(result, **reduce_args)
 
         if out is None:
-            # if dtype is None:
-            #     dtype = result.dtype
             if is_inside_eval():
                 # We already have the dtype and reduced_shape, so return immediately
                 # Use a blosc2 container, as it consumes less memory in general
@@ -1794,6 +1833,15 @@ def sum(self, axis=None, dtype=None, keepdims=False, **kwargs):
         }
         return self.compute(_reduce_args=reduce_args, **kwargs)
 
+    def prod(self, axis=None, dtype=None, keepdims=False, **kwargs):
+        reduce_args = {
+            "op": ReduceOp.PROD,
+            "axis": axis,
+            "dtype": dtype,
+            "keepdims": keepdims,
+        }
+        return self.compute(_reduce_args=reduce_args, **kwargs)
+
     def get_num_elements(self, axis, item):
         if np.isscalar(axis):
             axis = (axis,)
@@ -1852,15 +1900,6 @@ def var(self, axis=None, dtype=None, keepdims=False, ddof=0, **kwargs):
             out = blosc2.asarray(out, **kwargs)
         return out
 
-    def prod(self, axis=None, dtype=None, keepdims=False, **kwargs):
-        reduce_args = {
-            "op": ReduceOp.PROD,
-            "axis": axis,
-            "dtype": dtype,
-            "keepdims": keepdims,
-        }
-        return self.compute(_reduce_args=reduce_args, **kwargs)
-
     def min(self, axis=None, keepdims=False, **kwargs):
         reduce_args = {
             "op": ReduceOp.MIN,
@@ -2011,23 +2050,16 @@ def save(self, urlpath=None, **kwargs):
                     "urlbase": value.urlbase,
                 }
                 continue
-            # TODO: can this be removed?
-            if key in {"numpy", "np"}:
-                # Provide access to cast funcs like int8 et al.
-                continue
             if isinstance(value, blosc2.Proxy):
                 # Take the required info from the Proxy._cache container
                 value = value._cache
             if not hasattr(value, "schunk"):
-                print(f"expression: {expression}")
                 raise ValueError(
                     "To save a LazyArray, all operands must be blosc2.NDArray or blosc2.C2Array objects"
                 )
             if value.schunk.urlpath is None:
                 raise ValueError("To save a LazyArray, all operands must be stored on disk/network")
             operands[key] = value.schunk.urlpath
-        print("expression:", expression)
-        print("operands:", operands)
         array.schunk.vlmeta["_LazyArray"] = {
             "expression": expression,
             "UDF": None,
@@ -2042,7 +2074,7 @@ def _new_expr(cls, expression, operands, guess, out=None, where=None):
             # The expression has been validated, so we can evaluate it
             # in guessing mode to avoid computing reductions
             # Extract possible numpy scalars
-            _expression, local_vars = transform_expression(expression)
+            _expression, local_vars = extract_numpy_scalars(expression)
             # Let's include numpy and blosc2 as operands so that some functions can be used
             # Most in particular, castings like np.int8 et al. can be very useful to allow
             # for desired data types in the output.
@@ -2202,46 +2234,6 @@ def save(self, **kwargs):
         raise NotImplementedError("For safety reasons, this is not implemented for UDFs")
 
 
-class TransformNumpyCalls(ast.NodeTransformer):
-    def __init__(self):
-        self.replacements = {}
-        self.tmp_counter = 0
-
-    def visit_Call(self, node):
-        # Check if the call is a numpy type-casting call
-        if (
-            isinstance(node.func, ast.Attribute)
-            and isinstance(node.func.value, ast.Name)
-            and node.func.value.id in ["np", "numpy"]
-            and isinstance(node.args[0], ast.Constant)
-        ):
-            # Create a new temporary variable name
-            tmp_var = f"tmp{self.tmp_counter}"
-            self.tmp_counter += 1
-
-            # Evaluate the type-casting call to create the new variable's value
-            numpy_type = getattr(np, node.func.attr)
-            self.replacements[tmp_var] = numpy_type(node.args[0].value)
-
-            # Replace the call node with a variable node
-            return ast.copy_location(ast.Name(id=tmp_var, ctx=ast.Load()), node)
-        return self.generic_visit(node)
-
-
-def transform_expression(expr: str):
-    # Parse the expression into an AST
-    tree = ast.parse(expr, mode="eval")
-
-    # Transform the AST
-    transformer = TransformNumpyCalls()
-    transformed_tree = transformer.visit(tree)
-
-    # Generate the modified expression
-    transformed_expr = ast.unparse(transformed_tree)
-
-    return transformed_expr, transformer.replacements
-
-
 def _open_lazyarray(array):
     value = array.schunk.meta["LazyArray"]
     if value == LazyArrayEnum.UDF.value:
@@ -2268,18 +2260,14 @@ def _open_lazyarray(array):
 
     expr = lazyarray["expression"]
     globals = {func: getattr(blosc2, func) for func in functions if func in expr}
+    # Let's include numpy as operands so that some functions can be used.
+    # Most in particular, castings like np.int8 et al. can be very useful to allow
+    # for desired data types in the output.
     globals |= {"np": np, "numpy": np}
     # Validate the expression (prevent security issues)
     validate_expr(expr)
     # Extract possible numpy scalars
-    # _expr, local_vars = transform_expression(expr)
-    # operands_dict |= local_vars
-    # Extract possible numpy scalars
-    _expression, local_vars = transform_expression(expr)
-    # Let's include numpy and blosc2 as operands so that some functions can be used
-    # Most in particular, castings like np.int8 et al. can be very useful to allow
-    # for desired data types in the output.
-    # local_vars |= {"np": np, "numpy": np}
+    _expression, local_vars = extract_numpy_scalars(expr)
     _operands = operands_dict | local_vars
     # Create the expression as such, but in guessing mode
     new_expr = eval(_expression, globals, _operands)
@@ -2287,7 +2275,6 @@ def _open_lazyarray(array):
     _shape = new_expr.shape
     if isinstance(new_expr, blosc2.LazyExpr):
         # Restore the original expression and operands
-        print("new_expr:", new_expr.expression, expr, operands_dict)
         new_expr.expression = _expression
         new_expr.expression_tosave = expr
         new_expr.operands = _operands

From 5b809678dcccb848fd47270145b3e0492ec2967d Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 13:48:07 +0100
Subject: [PATCH 10/11] Refactor to avoid duplicated code

---
 src/blosc2/lazyexpr.py | 37 +++++--------------------------------
 1 file changed, 5 insertions(+), 32 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index a266d8a83..e7fe26356 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -2259,38 +2259,11 @@ def _open_lazyarray(array):
             raise TypeError("Error when retrieving the operands")
 
     expr = lazyarray["expression"]
-    globals = {func: getattr(blosc2, func) for func in functions if func in expr}
-    # Let's include numpy as operands so that some functions can be used.
-    # Most in particular, castings like np.int8 et al. can be very useful to allow
-    # for desired data types in the output.
-    globals |= {"np": np, "numpy": np}
-    # Validate the expression (prevent security issues)
-    validate_expr(expr)
-    # Extract possible numpy scalars
-    _expression, local_vars = extract_numpy_scalars(expr)
-    _operands = operands_dict | local_vars
-    # Create the expression as such, but in guessing mode
-    new_expr = eval(_expression, globals, _operands)
-    _dtype = new_expr.dtype
-    _shape = new_expr.shape
-    if isinstance(new_expr, blosc2.LazyExpr):
-        # Restore the original expression and operands
-        new_expr.expression = _expression
-        new_expr.expression_tosave = expr
-        new_expr.operands = _operands
-        new_expr.operands_tosave = operands_dict
-        # Make the array info available for the user (only available when opened from disk)
-        new_expr.array = array
-        # We want to expose schunk too, so that .info() can be used on the LazyArray
-        new_expr.schunk = array.schunk
-    else:
-        # An immediate evaluation happened (e.g. all operands are numpy arrays)
-        new_expr = LazyExpr(None)
-        new_expr.expression = expr
-        new_expr.operands = operands_dict
-    # Cache the dtype and shape (should be immutable)
-    new_expr._dtype = _dtype
-    new_expr._shape = _shape
+    new_expr = LazyExpr._new_expr(expr, operands_dict, guess=True, out=None, where=None)
+    # Make the array info available for the user (only available when opened from disk)
+    new_expr.array = array
+    # We want to expose schunk too, so that .info() can be used on the LazyArray
+    new_expr.schunk = array.schunk
     return new_expr
 
 

From 9af89e59222db18d6968a537db67bec5be8de79e Mon Sep 17 00:00:00 2001
From: Francesc Alted <francesc@blosc.org>
Date: Mon, 11 Nov 2024 14:17:30 +0100
Subject: [PATCH 11/11] Split large test in subtests

---
 src/blosc2/lazyexpr.py                |  1 -
 tests/ndarray/test_lazyexpr_fields.py | 65 +++++++++++++++++++++------
 2 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py
index e7fe26356..be91952fc 100644
--- a/src/blosc2/lazyexpr.py
+++ b/src/blosc2/lazyexpr.py
@@ -2042,7 +2042,6 @@ def save(self, urlpath=None, **kwargs):
 
         # Save the expression and operands in the metadata
         operands = {}
-        print(operands_)
         for key, value in operands_.items():
             if isinstance(value, blosc2.C2Array):
                 operands[key] = {
diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py
index 3873c206d..c138e5dba 100644
--- a/tests/ndarray/test_lazyexpr_fields.py
+++ b/tests/ndarray/test_lazyexpr_fields.py
@@ -261,42 +261,81 @@ def test_where_reduction(array_fixture):
     np.testing.assert_allclose(res, nres)
 
 
-# This is a more complex case with where() calls combined with reductions,
+# More complex cases with where() calls combined with reductions,
 # broadcasting, reusing the result in another expression and other
-# funny stuff that is not working yet.
-def test_where_fusion(array_fixture):
+# funny stuff
+
+
+# Two where() calls
+def test_where_fusion1(array_fixture):
     sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
     expr = a1**2 + a2**2 > 2 * a1 * a2 + 1
     npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1
 
-    # Two where() calls
     res = expr.where(0, 1) + expr.where(0, 1)
     nres = np.where(npexpr, 0, 1) + np.where(npexpr, 0, 1)
     np.testing.assert_allclose(res[:], nres)
 
-    # Two where() calls with a reduction (and using broadcasting)
+
+# Two where() calls with a reduction (and using broadcasting)
+def test_where_fusion2(array_fixture):
+    sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    expr = a1**2 + a2**2 > 2 * a1 * a2 + 1
+    npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1
+
     axis = None if sa1.ndim == 1 else 1
     res = expr.where(0.5, 0.2) + expr.where(0.3, 0.6).sum(axis=axis)
     nres = np.where(npexpr, 0.5, 0.2) + np.where(npexpr, 0.3, 0.6).sum(axis=axis)
     np.testing.assert_allclose(res[:], nres)
 
-    # Reuse the result in another expression
+
+# Reuse the result in another expression
+def test_where_fusion3(array_fixture):
+    sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    expr = a1**2 + a2**2 > 2 * a1 * a2 + 1
+    npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1
+
+    res = expr.where(0, 1) + expr.where(0, 1)
+    nres = np.where(npexpr, 0, 1) + np.where(npexpr, 0, 1)
     res = expr.where(0, 1) + res.sum()
     nres = np.where(npexpr, 0, 1) + nres.sum()
     np.testing.assert_allclose(res[:], nres)
 
-    # Reuse the result in another expression twice
+
+# Reuse the result in another expression twice
+def test_where_fusion4(array_fixture):
+    sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    expr = a1**2 + a2**2 > 2 * a1 * a2 + 1
+    npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1
+
+    res = expr.where(0.1, 0.7) + expr.where(0.2, 5)
+    nres = np.where(npexpr, 0.1, 0.7) + np.where(npexpr, 0.2, 5)
     res = 2 * res + 4 * res
     nres = 2 * nres + 4 * nres
     np.testing.assert_allclose(res[:], nres)
 
-    # Reuse the result in another expression twice II
+
+# Reuse the result in another expression twice II
+def test_where_fusion5(array_fixture):
+    sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    expr = a1**2 + a2**2 > 2 * a1 * a2 + 1
+    npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1
+
+    res = expr.where(-1, 7) + expr.where(2, 5)
+    nres = np.where(npexpr, -1, 7) + np.where(npexpr, 2, 5)
     res = 2 * res + blosc2.sqrt(res)
     nres = 2 * nres + np.sqrt(nres)
     np.testing.assert_allclose(res[:], nres)
 
-    # TODO: this is not working yet
-    # Reuse the result in another expression twice III
-    # res = expr.where(0, 1) + res
-    # nres = np.where(npexpr, 0, 1) + nres
-    # np.testing.assert_allclose(res[:], nres)
+
+# Reuse the result in another expression twice III
+def test_where_fusion6(array_fixture):
+    sa1, sa2, nsa1, nsa2, a1, a2, a3, a4, na1, na2, na3, na4 = array_fixture
+    expr = a1**2 + a2**2 > 2 * a1 * a2 + 1
+    npexpr = na1**2 + na2**2 > 2 * na1 * na2 + 1
+
+    res = expr.where(-1, 1) + expr.where(2, 1)
+    nres = np.where(npexpr, -1, 1) + np.where(npexpr, 2, 1)
+    res = expr.where(6.1, 1) + res
+    nres = np.where(npexpr, 6.1, 1) + nres
+    np.testing.assert_allclose(res[:], nres)