Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
56da964
Numel and Nbytes Validation (#19121)
JacobSzwejbka Apr 24, 2026
60ffe19
portable: accumulate in fp32 for Half/BFloat16 in grid_sampler_2d bil…
jgibson2 Apr 24, 2026
2330652
Allow chunked prefill when num_prompt_tokens > max_seq_len
navsud Apr 25, 2026
0a43e2f
MLX delegate: add integer support for aten.bitwise_not (#19053)
AlessandroVacca Apr 25, 2026
222711e
Add safe_numel()
lucylq Apr 25, 2026
e1cd352
Use safe numel() in ET (retake) (#19130)
lucylq Apr 25, 2026
b8f04aa
Android: Module implements Closeable (#19124)
psiddh Apr 25, 2026
7e2ff8a
Android: consistent error types across all modules (#19099)
psiddh Apr 25, 2026
c1d482e
Merge back to back slices on the same dim
DrJessop Apr 25, 2026
5252704
Add C++ unit tests for cadence::quantized_conv2d_nhwc + add depthwise…
hsharma35 Apr 25, 2026
6b175ff
Revert Android PRs #19099, #19124, #19092, #19028 (#19133)
JacobSzwejbka Apr 26, 2026
bdf1bf4
Back out D102011505 and D101260086 (#19134)
JacobSzwejbka Apr 26, 2026
2d9bbc1
Add top-k sampling support to llm Sampler (#19122)
kirklandsign Apr 26, 2026
563be2f
Re-apply D101260086: Android unified error reporting
JacobSzwejbka Apr 26, 2026
bf64fa1
Back out "Re-apply D101260086: Android unified error reporting" (#19137)
JacobSzwejbka Apr 26, 2026
dd5f6b1
delete aihub readme under executorch/examples/qualcomm/README.md
winskuo-quic Apr 21, 2026
bafc40c
Delete more aihub readme under executorch/examples/qualcomm/README.md
winskuo-quic Apr 21, 2026
69d56bc
Update README to fix minor error
winskuo-quic Apr 22, 2026
dea0731
More fix on readme removal
winskuo-quic Apr 22, 2026
3d4c00d
Remove SKILL.md info related to aihub and aihub info under backends/q…
winskuo-quic Apr 22, 2026
98bc518
Remove cpp, python, and cmake files
winskuo-quic Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .claude/skills/qualcomm/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,5 @@ Required flags: `-m` (SoC model), `-b` (Android build dir). Optional: `-s` (devi
| `TestExampleLLMScript` | LLM script tests |
| `TestExampleMultimodalityScript` | Multimodality script tests |
| `TestExampleOssScript` | OSS model script tests |
| `TestExampleQaihubScript` | QAI Hub script tests |
| `TestExampleScript` | General example script tests |
| `TestUtilsScript` | Utility script tests |
2 changes: 2 additions & 0 deletions backends/cadence/aot/BUCK
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ fbcode_target(_kind = executorch_generated_lib,
"//executorch/backends/cadence/generic/operators:op_quantized_conv2d",
"//executorch/backends/cadence/generic/operators:op_quantized_conv1d_ncl",
"//executorch/backends/cadence/generic/operators:op_quantized_conv1d_nlc",
"//executorch/backends/cadence/generic/operators:op_quantized_depthwise_conv1d_ncl",
"//executorch/backends/cadence/generic/operators:op_quantized_depthwise_conv1d_nlc",
"//executorch/backends/cadence/generic/operators:op_quantized_fully_connected",
"//executorch/backends/cadence/generic/operators:op_quantized_layer_norm",
"//executorch/backends/cadence/generic/operators:op_quantized_linear",
Expand Down
5 changes: 5 additions & 0 deletions backends/cadence/aot/functions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,11 @@
- arg_meta: null
kernel_name: impl::generic::quantized_conv2d_nhwc_per_tensor_out

- func: cadence::quantized_conv2d_depthwise_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
kernel_name: impl::generic::quantized_conv2d_depthwise_nhwc_out

- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
Expand Down
71 changes: 71 additions & 0 deletions backends/cadence/aot/fuse_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
HierarchicalInplacePassInterface,
register_cadence_pass,
RemoveOrReplacePassInterface,
set_arg,
)
from executorch.backends.cadence.aot.utils import get_edge_overload_packet
from executorch.backends.transforms.fuse_cascaded_transpose_or_permute_ops import (
Expand Down Expand Up @@ -1003,6 +1004,75 @@ def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
return True


@register_cadence_pass(CadencePassAttribute(opt_level=0))
class FuseSliceSameDimPass(RemoveOrReplacePassInterface):
    """Fuse chained slices on the same dim into a single slice.

    When a slice_copy's input is another slice_copy on the same dimension
    with step=1, the child slice can read directly from the grandparent
    with merged indices, eliminating the intermediate slice.

    Handles negative start/end indices by canonicalizing them against the
    relevant dimension size before merging.
    """

    @staticmethod
    def _canonicalize(val: int, dim_size: int) -> int:
        # Map a negative (from-the-end) index to its non-negative
        # equivalent; non-negative indices pass through unchanged.
        return val + dim_size if val < 0 else val

    @property
    def targets(self) -> list[EdgeOpOverload]:
        # Only slice_copy nodes are candidates for this fusion.
        return [exir_ops.edge.aten.slice_copy.Tensor]

    def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool:
        """Try to fuse `node` (a slice_copy) with its producer slice_copy.

        Returns True when `node` was rewired to slice the grandparent
        directly with merged start/end indices, False when fusion does
        not apply or cannot be proven safe.
        """
        parent = get_arg(node, "input", torch.fx.Node)
        if parent.target != exir_ops.edge.aten.slice_copy.Tensor:
            return False

        grandparent = get_arg(parent, "input", torch.fx.Node)
        ndim = len(grandparent.meta["val"].shape)
        # Normalize negative dims so e.g. dim=-1 and dim=ndim-1 compare equal.
        child_dim = get_arg(node, "dim", int) % ndim
        parent_dim = get_arg(parent, "dim", int) % ndim
        if child_dim != parent_dim:
            return False

        child_start = get_arg(node, "start", Optional[int])
        child_end = get_arg(node, "end", Optional[int])
        child_step = get_arg(node, "step", int)
        parent_start = get_arg(parent, "start", Optional[int])
        parent_end = get_arg(parent, "end", Optional[int])
        parent_step = get_arg(parent, "step", int)

        # Only step=1 slices compose by plain index addition; any other
        # step would require multiplying steps together, which this pass
        # deliberately does not attempt.
        if child_step != 1 or parent_step != 1:
            return False
        # None means "use the dimension's implicit bound"; bail out rather
        # than reconstruct the defaults here.
        if (
            child_start is None
            or child_end is None
            or parent_start is None
            or parent_end is None
        ):
            return False

        grandparent_dim_size = grandparent.meta["val"].shape[parent_dim]
        parent_dim_size = parent.meta["val"].shape[parent_dim]

        # Parent indices are relative to the grandparent; child indices are
        # relative to the parent's already-sliced extent — canonicalize each
        # against its own frame of reference.
        p_start = self._canonicalize(parent_start, grandparent_dim_size)
        p_end = self._canonicalize(parent_end, grandparent_dim_size)
        c_start = self._canonicalize(child_start, parent_dim_size)
        c_end = self._canonicalize(child_end, parent_dim_size)

        # Shift the child's window into grandparent coordinates; the merged
        # end can never extend past what the parent slice exposed.
        new_start = p_start + c_start
        new_end = min(p_start + c_end, p_end)

        # NOTE(review): canonicalized ends are not clamped to the dim size,
        # so an oversized (but PyTorch-legal, runtime-clamped) `end` makes
        # this pass conservatively skip fusion instead of merging wrongly.
        if new_end > grandparent_dim_size:
            return False

        # Rewire: node now reads the grandparent directly with the merged
        # window; the intermediate parent slice becomes dead if nothing
        # else uses it.
        node.replace_input_with(parent, grandparent)
        set_arg(node, "start", new_start)
        set_arg(node, "end", new_end)
        return True


class HierarchicalCSEPass(HierarchicalInplacePassInterface):
"""
A hierarchical Common Subexpression Elimination (CSE) pass that recursively
Expand Down Expand Up @@ -1035,4 +1105,5 @@ class CadenceFuseOpsInGraph:
FuseMulScalarIntoDequantPass,
FuseFullThenReshapePass,
FuseTransposeOrPermuteOpPairsPass,
FuseSliceSameDimPass,
]
49 changes: 49 additions & 0 deletions backends/cadence/aot/ops_registrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@ def register_fake(
lib.define(
"quantized_conv2d_nhwc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, Tensor? offset=None, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_conv2d_depthwise_nhwc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)"
)
lib.define(
"quantized_conv2d_depthwise_nhwc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)"
)
lib.define(
"quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)"
)
Expand Down Expand Up @@ -2105,6 +2111,49 @@ def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_meta(
return input.new_empty(output_size, dtype=input.dtype)


@register_fake("cadence::quantized_conv2d_depthwise_nhwc")
def quantized_conv2d_depthwise_nhwc_meta(
    input: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: Tuple[int],
    padding: Tuple[int],
    dilation: Tuple[int],
    groups: int,
    in_zero_point: int,
    weight_zero_point: int,
    bias_scale: float,
    output_scale: float,
    output_zero_point: int,
    out_multiplier: int,
    out_shift: int,
) -> torch.Tensor:
    """Shape-inference (fake) kernel for cadence::quantized_conv2d_depthwise_nhwc.

    Computes only the output shape/dtype; no convolution is performed.
    Returns an uninitialized tensor with the inferred output size and the
    input's dtype.
    """
    shape = input.shape
    # Supported input ranks: 3 (1D conv), 4 or 5 (2D conv variants).
    assert len(shape) > 2
    assert len(shape) < 6
    # Weight layout: the trailing axis is the output-channel count and the
    # leading axes are the spatial kernel dims (1D: [K, OC]; 2D: [KH, KW, OC]).
    *spatial_kernel, num_out_channels = weight.shape

    if len(shape) == 3:
        # Rank-3 input -> 1D convolution over the last spatial axis; pass
        # the trailing stride/padding/dilation entries and the single
        # kernel extent. Final True selects the channel-last layout.
        out_size = get_conv1d_output_size(
            shape,
            num_out_channels,
            stride[-1],
            padding[-1],
            dilation[-1],
            spatial_kernel[0],
            True,
        )
    else:
        out_size = get_conv2d_output_size(
            shape, num_out_channels, stride, padding, dilation, spatial_kernel, True
        )

    return input.new_empty(out_size, dtype=input.dtype)


@register_fake("cadence::quantized_layer_norm")
def quantized_layer_norm_meta(
input: torch.Tensor,
Expand Down
35 changes: 35 additions & 0 deletions backends/cadence/aot/ref_implementations.py
Original file line number Diff line number Diff line change
Expand Up @@ -1556,6 +1556,41 @@ def quantized_conv2d_nhwc(
)


@impl_tracked(m, "quantized_conv2d_depthwise_nhwc")
def quantized_conv2d_depthwise_nhwc(
    input_tensor: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    stride: tuple[int, int],
    padding: tuple[int, int],
    dilation: tuple[int, int],
    groups: int,
    in_zero_point: int,
    weight_zero_point: int,
    bias_scale: float,
    output_scale: float,
    output_zero_point: int,
    out_multiplier: int,
    out_shift: int,
) -> torch.Tensor:
    """Reference impl for the depthwise NHWC quantized conv2d op.

    Thin wrapper: delegates unchanged to the generic per-tensor NHWC conv
    kernel (which presumably covers the depthwise case via `groups` —
    confirm against the generic kernel's implementation).
    """
    conv_args = (
        input_tensor,
        weight,
        bias,
        stride,
        padding,
        dilation,
        groups,
        in_zero_point,
        weight_zero_point,
        bias_scale,
        output_scale,
        output_zero_point,
        out_multiplier,
        out_shift,
    )
    return quantized_conv2d_nhwc_per_tensor(*conv_args)


def quantized_conv_variant(
layout: str,
input_dtype: torch.dtype,
Expand Down
Loading
Loading