Skip to content

Commit

Permalink
sse41 qs8 rsum accumulating microkernels
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 633200354
  • Loading branch information
alankelly authored and xnnpack-bot committed May 17, 2024
1 parent 3e0f89e commit 04a11cf
Show file tree
Hide file tree
Showing 36 changed files with 2,325 additions and 13 deletions.
60 changes: 60 additions & 0 deletions bench/qs8-rsum-minmax-fp32.cc
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,66 @@ BENCHMARK_CAPTURE(qs8_rsum, scalar_imagic_u4,
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u16,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u16,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u32,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u32,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u64,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u64,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u32_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u32_acc2,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u64_acc2,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u64_acc2,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(qs8_rsum, neondot_u64_acc4,
xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u64_acc4,
xnn_init_qs8_avgpool_minmax_fp32_neon_params,
benchmark::utils::CheckNEONDOT)
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_DOTPROD && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#ifndef XNNPACK_BENCHMARK_NO_MAIN
BENCHMARK_MAIN();
#endif
8 changes: 7 additions & 1 deletion cmake/gen/neondot_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,10 @@ SET(ALL_NEONDOT_MICROKERNEL_SRCS
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x8c4-minmax-fp32-neondot.c
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x16c4-minmax-fp32-neondot.c
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c4-minmax-fp32-neondot.c
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-fp32-neondot.c)
src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-fp32-neondot.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u16.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64-acc4.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64.c)
8 changes: 8 additions & 0 deletions cmake/gen/sse41_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,14 @@ SET(ALL_SSE41_MICROKERNEL_SRCS
src/qs8-requantization/qs8-requantization-rndna-sse41.c
src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c
src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u16-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u16.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32-acc4.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64-acc2.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64-acc4.c
src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64.c
src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u8.c
src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u16.c
src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u24.c
Expand Down
6 changes: 6 additions & 0 deletions gen/neondot_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,10 @@ ALL_NEONDOT_MICROKERNEL_SRCS = [
"src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-6x16c4-minmax-fp32-neondot.c",
"src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x8c4-minmax-fp32-neondot.c",
"src/qs8-qc8w-igemm/gen/qs8-qc8w-igemm-8x16c4-minmax-fp32-neondot.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u16.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64-acc4.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64.c",
]
8 changes: 8 additions & 0 deletions gen/sse41_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,14 @@ ALL_SSE41_MICROKERNEL_SRCS = [
"src/qs8-requantization/qs8-requantization-rndna-sse41.c",
"src/qs8-requantization/qs8-requantization-rndnu-sse41-sra.c",
"src/qs8-requantization/qs8-requantization-rndnu-sse41-srl.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u16-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u16.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32-acc4.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64-acc2.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64-acc4.c",
"src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u8.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u16.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-sse41-mul16-ld64-u24.c",
Expand Down
21 changes: 21 additions & 0 deletions scripts/generate-qs8-rsum.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,24 @@ tools/xngen src/qs8-rsum/neon-mlal.c.in -D ACCUMULATORS=2 -D CHANNEL_TILE=64 -D

tools/xngen src/qs8-rsum/neon-mlal.c.in -D ACCUMULATORS=4 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -D ARMV8=0 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc4.c &
tools/xngen src/qs8-rsum/neon-mlal.c.in -D ACCUMULATORS=4 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -D ARMV8=0 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc4.c &

tools/xngen src/qs8-rsum/neondot.c.in -D ACCUMULATORS=1 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u16.c &
tools/xngen src/qs8-rsum/neondot.c.in -D ACCUMULATORS=1 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32.c &
tools/xngen src/qs8-rsum/neondot.c.in -D ACCUMULATORS=1 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64.c &

tools/xngen src/qs8-rsum/neondot.c.in -D ACCUMULATORS=2 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32-acc2.c &
tools/xngen src/qs8-rsum/neondot.c.in -D ACCUMULATORS=2 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64-acc2.c &

tools/xngen src/qs8-rsum/neondot.c.in -D ACCUMULATORS=4 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u64-acc4.c &

################################### x86 SSE ###################################
tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=1 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u16.c &
tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=1 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32.c &
tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=1 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64.c &

tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=2 -D CHANNEL_TILE=16 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u16-acc2.c &
tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=2 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32-acc2.c &
tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=2 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64-acc2.c &

tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=4 -D CHANNEL_TILE=32 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u32-acc4.c &
tools/xngen src/qs8-rsum/sse41.c.in -D ACCUMULATORS=4 -D CHANNEL_TILE=64 -D REQUANTIZATION=FP32 -o src/qs8-rsum/gen/qs8-rsum-minmax-fp32-sse41-u64-acc4.c &
14 changes: 12 additions & 2 deletions src/microparams-init.c
Original file line number Diff line number Diff line change
Expand Up @@ -1120,13 +1120,23 @@ size_t xnn_init_qs8_avgpool_minmax_fp32_sse4_params(
for (uint32_t i = 0; i < 4; i++) {
params->fp32_sse4.init_bias[i] = init_bias;
params->fp32_sse4.scale[i] = scale;
params->fp32_sse4.magic_bias[i] = 12582912.0f;
params->fp32_sse4.magic_bias_less_output_zero_point[i] = INT32_C(0x4B400000) - (int32_t) output_zero_point;
params->fp32_sse4.output_max_less_zero_point[i] = output_max_less_zero_point;
params->fp32_sse4.magic_bias_less_output_zero_point[i] = INT32_C(0x4B400000) - (int32_t) output_zero_point;
}
for (uint32_t i = 0; i < 8; i++) {
params->fp32_sse4.output_zero_point[i] = (int16_t) output_zero_point;
}
for (uint32_t i = 0; i < 16; i++) {
params->fp32_sse4.output_min[i] = output_min;
params->fp32_sse4.output_max[i] = output_max;
}
for (uint32_t i = 0; i < 7; i++) {
params->fp32_sse4.mask_table[i] = -1;
}
for (uint32_t i = 7; i < 14; i++) {
params->fp32_sse4.mask_table[i] = 0;
}
return sizeof(params->fp32_sse4);
}
Expand Down Expand Up @@ -1164,10 +1174,10 @@ size_t xnn_init_qs8_avgpool_minmax_fp32_neon_params(
params->fp32_neon.magic_bias_less_output_zero_point = INT32_C(0x4B400000) - (int32_t) output_zero_point;
params->fp32_neon.output_min = output_min;
params->fp32_neon.output_max = output_max;
for (uint32_t i = 0; i < 7; i++) {
for (uint32_t i = 0; i < 15; i++) {
params->fp32_neon.mask_table[i] = 1;
}
for (uint32_t i = 7; i < 14; i++) {
for (uint32_t i = 15; i < 30; i++) {
params->fp32_neon.mask_table[i] = 0;
}
return sizeof(params->fp32_neon);
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u16-acc2.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u16_acc2(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u16.c
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u16(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc2.c
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u32_acc2(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc4.c
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u32_acc4(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u32(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc2.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u64_acc2(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc4.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u64_acc4(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u64(
}
if (XNN_UNLIKELY(batch != 0)) {
int8x8_t vt = vld1_s8(input);
vone = vld1_s8(&params->fp32_neon.mask_table[7 - batch]);
vone = vld1_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc16_0 = vmlal_s8(vacc16_0, vt, vone);
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
Expand Down
63 changes: 63 additions & 0 deletions src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u16.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-rsum/neondot.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/reduce.h>

void xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u16(
size_t batch,
const int8_t* input,
int8_t* output,
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(batch != 0);
assert(input != NULL);
assert(output != NULL);

int8x16_t vone = vdupq_n_s8(1);
int32x4_t vacc0 = vmovq_n_s32(0);
for (; batch >= 16; batch -= 16) {
const int8x16_t vt0 = vld1q_s8(input); input += 16;

vacc0 = vdotq_s32(vacc0, vt0, vone);
}
if (XNN_UNLIKELY(batch != 0)) {
for (; batch >= 16; batch -= 16) {
const int8x16_t vt = vld1q_s8(input); input += 16;
vacc0 = vdotq_s32(vacc0, vt, vone);
}
if (XNN_UNLIKELY(batch != 0)) {
int8x16_t vt = vld1q_s8(input);
vone = vld1q_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc0 = vdotq_s32(vacc0, vt, vone);
}
}
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);

const int32_t vinit_bias = params->fp32_neon.init_bias;
const float vscale = params->fp32_neon.scale;
const int32_t output_min = params->fp32_neon.output_min;
const int32_t output_max = params->fp32_neon.output_max;
const float vmagic_bias = params->fp32_neon.magic_bias;
const int32_t vmagic_bias_less_output_zero_point = params->fp32_neon.magic_bias_less_output_zero_point;

float vfpacc = (float) (vget_lane_s32(vacc_lo, 0) + vinit_bias) * vscale;
vfpacc += vmagic_bias;
int32_t vout = (int32_t) float_as_uint32(vfpacc);
vout -= vmagic_bias_less_output_zero_point;
vout = math_max_s32(vout, output_min);
vout = math_min_s32(vout, output_max);
*output += (int8_t) vout;
}
67 changes: 67 additions & 0 deletions src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neondot-u32-acc2.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-rsum/neondot.c.in
// Generator: tools/xngen
//
// Copyright 2024 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <arm_neon.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/reduce.h>

void xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u32_acc2(
size_t batch,
const int8_t* input,
int8_t* output,
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(batch != 0);
assert(input != NULL);
assert(output != NULL);

int8x16_t vone = vdupq_n_s8(1);
int32x4_t vacc0 = vmovq_n_s32(0);
int32x4_t vacc1 = vmovq_n_s32(0);
for (; batch >= 32; batch -= 32) {
const int8x16_t vt0 = vld1q_s8(input); input += 16;
const int8x16_t vt1 = vld1q_s8(input); input += 16;

vacc0 = vdotq_s32(vacc0, vt0, vone);
vacc1 = vdotq_s32(vacc1, vt1, vone);
}
if (XNN_UNLIKELY(batch != 0)) {
for (; batch >= 16; batch -= 16) {
const int8x16_t vt = vld1q_s8(input); input += 16;
vacc0 = vdotq_s32(vacc0, vt, vone);
}
if (XNN_UNLIKELY(batch != 0)) {
int8x16_t vt = vld1q_s8(input);
vone = vld1q_s8(&params->fp32_neon.mask_table[15 - batch]);
vacc0 = vdotq_s32(vacc0, vt, vone);
}
}
vacc0 = vaddq_s32(vacc0, vacc1);
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);

const int32_t vinit_bias = params->fp32_neon.init_bias;
const float vscale = params->fp32_neon.scale;
const int32_t output_min = params->fp32_neon.output_min;
const int32_t output_max = params->fp32_neon.output_max;
const float vmagic_bias = params->fp32_neon.magic_bias;
const int32_t vmagic_bias_less_output_zero_point = params->fp32_neon.magic_bias_less_output_zero_point;

float vfpacc = (float) (vget_lane_s32(vacc_lo, 0) + vinit_bias) * vscale;
vfpacc += vmagic_bias;
int32_t vout = (int32_t) float_as_uint32(vfpacc);
vout -= vmagic_bias_less_output_zero_point;
vout = math_max_s32(vout, output_min);
vout = math_min_s32(vout, output_max);
*output += (int8_t) vout;
}

0 comments on commit 04a11cf

Please sign in to comment.