Skip to content

Commit

Permalink
NEON QS8 rsum: use vpadal to sum int8 into int16
Browse files Browse the repository at this point in the history
- Replace two vaddw (widening add) instructions with a single vpadal (pairwise add and accumulate long)

PiperOrigin-RevId: 635027760
  • Loading branch information
fbarchard authored and xnnpack-bot committed May 19, 2024
1 parent fcb3669 commit 30d3c7b
Show file tree
Hide file tree
Showing 19 changed files with 739 additions and 881 deletions.
52 changes: 16 additions & 36 deletions bench/qs8-rsum-minmax-fp32.cc
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,
->UseRealTime();

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u16,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u16,
BENCHMARK_CAPTURE(qs8_rsum, neon_u16,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_u16,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
Expand All @@ -48,8 +48,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u32,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u32,
BENCHMARK_CAPTURE(qs8_rsum, neon_u32,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_u32,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
Expand All @@ -58,8 +58,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u64,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u64,
BENCHMARK_CAPTURE(qs8_rsum, neon_u32_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_u32_acc2,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
Expand All @@ -68,8 +68,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u16_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u16_acc2,
BENCHMARK_CAPTURE(qs8_rsum, neon_u64,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_u64,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
Expand All @@ -78,8 +78,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u32_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u32_acc2,
BENCHMARK_CAPTURE(qs8_rsum, neon_u64_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_u64_acc2,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
Expand All @@ -88,28 +88,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u64_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u64_acc2,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u32_acc4,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u32_acc4,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE(qs8_rsum, neon_addw_u64_acc4,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_addw_u64_acc4,
BENCHMARK_CAPTURE(qs8_rsum, neon_u64_acc4,
xnn_qs8_rsum_minmax_fp32_ukernel__neon_u64_acc4,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkRSUM)
Expand Down Expand Up @@ -138,8 +118,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u64,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u64,
BENCHMARK_CAPTURE(qs8_rsum, neondot_u32_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u32_acc2,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
Expand All @@ -148,8 +128,8 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u32_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u32_acc2,
BENCHMARK_CAPTURE(qs8_rsum, neondot_u64,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u64,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
Expand Down
15 changes: 7 additions & 8 deletions cmake/gen/neon_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -684,14 +684,13 @@ SET(ALL_NEON_MICROKERNEL_SRCS
src/qs8-requantization/qs8-requantization-rndna-neon.c
src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c
src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u16-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u16.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32-acc4.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u64-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u64-acc4.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u64.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u16-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u16.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc4.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64.c
src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-u8.c
src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-u16.c
src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-u24.c
Expand Down
15 changes: 7 additions & 8 deletions gen/neon_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -680,14 +680,13 @@ ALL_NEON_MICROKERNEL_SRCS = [
"src/qs8-requantization/qs8-requantization-rndna-neon.c",
"src/qs8-requantization/qs8-requantization-rndnu-neon-mull.c",
"src/qs8-requantization/qs8-requantization-rndnu-neon-qdmulh.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u16-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u16.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32-acc4.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u64-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u64-acc4.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u64.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u16-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u16.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc4.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-u8.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-u16.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-neon-ld64-u24.c",
Expand Down
94 changes: 0 additions & 94 deletions src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32-acc2.c

This file was deleted.

88 changes: 0 additions & 88 deletions src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-addw-u32.c

This file was deleted.

0 comments on commit 30d3c7b

Please sign in to comment.