Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
56da964
Numel and Nbytes Validation (#19121)
JacobSzwejbka Apr 24, 2026
60ffe19
portable: accumulate in fp32 for Half/BFloat16 in grid_sampler_2d bil…
jgibson2 Apr 24, 2026
2330652
Allow chunked prefill when num_prompt_tokens > max_seq_len
navsud Apr 25, 2026
0a43e2f
MLX delegate: add integer support for aten.bitwise_not (#19053)
AlessandroVacca Apr 25, 2026
222711e
Add safe_numel()
lucylq Apr 25, 2026
e1cd352
Use safe numel() in ET (retake) (#19130)
lucylq Apr 25, 2026
b8f04aa
Android: Module implements Closeable (#19124)
psiddh Apr 25, 2026
7e2ff8a
Android: consistent error types across all modules (#19099)
psiddh Apr 25, 2026
c1d482e
Merge back to back slices on the same dim
DrJessop Apr 25, 2026
5252704
Add C++ unit tests for cadence::quantized_conv2d_nhwc + add depthwise…
hsharma35 Apr 25, 2026
6b175ff
Revert Android PRs #19099, #19124, #19092, #19028 (#19133)
JacobSzwejbka Apr 26, 2026
bdf1bf4
Back out D102011505 and D101260086 (#19134)
JacobSzwejbka Apr 26, 2026
2d9bbc1
Add top-k sampling support to llm Sampler (#19122)
kirklandsign Apr 26, 2026
563be2f
Re-apply D101260086: Android unified error reporting
JacobSzwejbka Apr 26, 2026
bf64fa1
Back out "Re-apply D101260086: Android unified error reporting" (#19137)
JacobSzwejbka Apr 26, 2026
dd5f6b1
delete aihub readme under executorch/examples/qualcomm/README.md
winskuo-quic Apr 21, 2026
bafc40c
Delete more aihub readme under executorch/examples/qualcomm/README.md
winskuo-quic Apr 21, 2026
69d56bc
Update README to fix minor error
winskuo-quic Apr 22, 2026
dea0731
More fix on readme removal
winskuo-quic Apr 22, 2026
3d4c00d
Remove SKILL.md info related to aihub and aihub info under backends/q…
winskuo-quic Apr 22, 2026
98bc518
Remove cpp, python, and cmake files
winskuo-quic Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .claude/skills/qualcomm/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,5 @@ Required flags: `-m` (SoC model), `-b` (Android build dir). Optional: `-s` (devi
| `TestExampleLLMScript` | LLM script tests |
| `TestExampleMultimodalityScript` | Multimodality script tests |
| `TestExampleOssScript` | OSS model script tests |
| `TestExampleQaihubScript` | QAI Hub script tests |
| `TestExampleScript` | General example script tests |
| `TestUtilsScript` | Utility script tests |
2 changes: 2 additions & 0 deletions backends/cadence/aot/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ fbcode_target(_kind = executorch_generated_lib,
"//executorch/backends/cadence/generic/operators:op_quantized_conv2d",
"//executorch/backends/cadence/generic/operators:op_quantized_conv1d_ncl",
"//executorch/backends/cadence/generic/operators:op_quantized_conv1d_nlc",
"//executorch/backends/cadence/generic/operators:op_quantized_depthwise_conv1d_ncl",
"//executorch/backends/cadence/generic/operators:op_quantized_depthwise_conv1d_nlc",
"//executorch/backends/cadence/generic/operators:op_quantized_fully_connected",
"//executorch/backends/cadence/generic/operators:op_quantized_layer_norm",
"//executorch/backends/cadence/generic/operators:op_quantized_linear",
Expand Down
5 changes: 5 additions & 0 deletions backends/cadence/aot/functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,11 @@
- arg_meta: null
kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out

- func: cadence::quantized_conv2d_depthwise_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: impl::generic::quantized_conv2d_depthwise_nhwc_out

- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
Expand Down
71 changes: 71 additions & 0 deletions backends/cadence/aot/fuse_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
HierarchicalInplacePassInterface,
register_cadence_pass,
RemoveOrReplacePassInterface,
set_arg,
)
from executorch.backends.cadence.aot.utils import get_edge_overload_packet
from executorch.backends.transforms.fuse_cascaded_transpose_or_permute_ops import (
Expand Down Expand Up @@ -1003,6 +1004,75 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
return True


@register_cadence_pass(CadencePassAttribute(opt_level=0))
class FuseSliceSameDimPass(RemoveOrReplacePassInterface):
    """Fuse chained slices on the same dim into a single slice.

    When a slice_copy's input is another slice_copy on the same dimension
    with step=1, the child slice can read directly from the grandparent
    with merged indices, eliminating the intermediate slice.

    Handles negative start/end indices by canonicalizing them against the
    relevant dimension size before merging.
    """

    @staticmethod
    def _canonicalize(val: int, dim_size: int) -> int:
        # Map a negative (from-the-end) index to its non-negative
        # equivalent; non-negative indices pass through unchanged.
        return val + dim_size if val < 0 else val

    @property
    def targets(self) -> list[EdgeOpOverload]:
        # Only slice_copy nodes are candidates for this fusion.
        return [exir_ops.edge.aten.slice_copy.Tensor]

    def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
        """Try to fuse `node` (a slice_copy) with its producer slice_copy.

        Returns True when `node` was rewired to slice the grandparent
        directly with merged start/end indices, False when fusion does
        not apply or cannot be proven safe.
        """
        parent = get_arg(node, "input", torch.fx.Node)
        if parent.target != exir_ops.edge.aten.slice_copy.Tensor:
            return False

        grandparent = get_arg(parent, "input", torch.fx.Node)
        ndim = len(grandparent.meta["val"].shape)
        # Normalize negative dims so e.g. dim=-1 and dim=ndim-1 compare equal.
        child_dim = get_arg(node, "dim", int) % ndim
        parent_dim = get_arg(parent, "dim", int) % ndim
        if child_dim != parent_dim:
            return False

        child_start = get_arg(node, "start", Optional[int])
        child_end = get_arg(node, "end", Optional[int])
        child_step = get_arg(node, "step", int)
        parent_start = get_arg(parent, "start", Optional[int])
        parent_end = get_arg(parent, "end", Optional[int])
        parent_step = get_arg(parent, "step", int)

        # Only step=1 slices compose by plain index addition; any other
        # step would require multiplying steps together, which this pass
        # deliberately does not attempt.
        if child_step != 1 or parent_step != 1:
            return False
        # None means "use the dimension's implicit bound"; bail out rather
        # than reconstruct the defaults here.
        if (
            child_start is None
            or child_end is None
            or parent_start is None
            or parent_end is None
        ):
            return False

        grandparent_dim_size = grandparent.meta["val"].shape[parent_dim]
        parent_dim_size = parent.meta["val"].shape[parent_dim]

        # Parent indices are relative to the grandparent; child indices are
        # relative to the parent's already-sliced extent — canonicalize each
        # against its own frame of reference.
        p_start = self._canonicalize(parent_start, grandparent_dim_size)
        p_end = self._canonicalize(parent_end, grandparent_dim_size)
        c_start = self._canonicalize(child_start, parent_dim_size)
        c_end = self._canonicalize(child_end, parent_dim_size)

        # Shift the child's window into grandparent coordinates; the merged
        # end can never extend past what the parent slice exposed.
        new_start = p_start + c_start
        new_end = min(p_start + c_end, p_end)

        # NOTE(review): canonicalized ends are not clamped to the dim size,
        # so an oversized (but PyTorch-legal, runtime-clamped) `end` makes
        # this pass conservatively skip fusion instead of merging wrongly.
        if new_end > grandparent_dim_size:
            return False

        # Rewire: node now reads the grandparent directly with the merged
        # window; the intermediate parent slice becomes dead if nothing
        # else uses it.
        node.replace_input_with(parent, grandparent)
        set_arg(node, "start", new_start)
        set_arg(node, "end", new_end)
        return True


class HierarchicalCSEPass(HierarchicalInplacePassInterface):
"""
A hierarchical Common Subexpression Elimination (CSE) pass that recursively
Expand Down Expand Up @@ -1035,4 +1105,5 @@ class CadenceFuseOpsInGraph:
FuseMulScalarIntoDequantPass,
FuseFullThenReshapePass,
FuseTransposeOrPermuteOpPairsPass,
FuseSliceSameDimPass,
]
49 changes: 49 additions & 0 deletions backends/cadence/aot/ops_registrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@ def register_fake(
lib.define(
"quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_conv2d_depthwise_nhwc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)"
)
lib.define(
"quantized_conv2d_depthwise_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)"
)
Expand Down Expand Up @@ -2105,6 +2111,49 @@ def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_meta(
return input.new_empty(output_size, dtype=input.dtype)


@register_fake("cadence::quantized_conv2d_depthwise_nhwc")
def quantized_conv2d_depthwise_nhwc_meta(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: Tuple[int],
    padding: Tuple[int],
    dilation: Tuple[int],
    groups: int,
    in_zero_point: int,
    weight_zero_point: int,
    bias_scale: float,
    output_scale: float,
    output_zero_point: int,
    out_multiplier: int,
    out_shift: int,
) -> torch.Tensor:
    """Shape-inference (fake) kernel for cadence::quantized_conv2d_depthwise_nhwc.

    Computes only the output shape/dtype; no convolution is performed.
    Returns an uninitialized tensor with the inferred output size and the
    input's dtype.
    """
    shape = input.shape
    # Supported input ranks: 3 (1D conv), 4 or 5 (2D conv variants).
    assert len(shape) > 2
    assert len(shape) < 6
    # Weight layout: the trailing axis is the output-channel count and the
    # leading axes are the spatial kernel dims (1D: [K, OC]; 2D: [KH, KW, OC]).
    *spatial_kernel, num_out_channels = weight.shape

    if len(shape) == 3:
        # Rank-3 input -> 1D convolution over the last spatial axis; pass
        # the trailing stride/padding/dilation entries and the single
        # kernel extent. Final True selects the channel-last layout.
        out_size = get_conv1d_output_size(
            shape,
            num_out_channels,
            stride[-1],
            padding[-1],
            dilation[-1],
            spatial_kernel[0],
            True,
        )
    else:
        out_size = get_conv2d_output_size(
            shape, num_out_channels, stride, padding, dilation, spatial_kernel, True
        )

    return input.new_empty(out_size, dtype=input.dtype)


@register_fake("cadence::quantized_layer_norm")
def quantized_layer_norm_meta(
input: torch.Tensor,
Expand Down
35 changes: 35 additions & 0 deletions backends/cadence/aot/ref_implementations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1556,6 +1556,41 @@ def quantized_conv2d_nhwc(
)


@impl_tracked(m, "quantized_conv2d_depthwise_nhwc")
def quantized_conv2d_depthwise_nhwc(
    input_tensor: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: tuple[int, int],
    padding: tuple[int, int],
    dilation: tuple[int, int],
    groups: int,
    in_zero_point: int,
    weight_zero_point: int,
    bias_scale: float,
    output_scale: float,
    output_zero_point: int,
    out_multiplier: int,
    out_shift: int,
) -> torch.Tensor:
    """Reference impl for the depthwise NHWC quantized conv2d op.

    Thin wrapper: delegates unchanged to the generic per-tensor NHWC conv
    kernel (which presumably covers the depthwise case via `groups` —
    confirm against the generic kernel's implementation).
    """
    conv_args = (
        input_tensor,
        weight,
        bias,
        stride,
        padding,
        dilation,
        groups,
        in_zero_point,
        weight_zero_point,
        bias_scale,
        output_scale,
        output_zero_point,
        out_multiplier,
        out_shift,
    )
    return quantized_conv2d_nhwc_per_tensor(*conv_args)


def quantized_conv_variant(
layout: str,
input_dtype: torch.dtype,
Expand Down
Loading
Loading