Skip to content

Commit 3e0f89e

Browse files
fbarchard authored and xnnpack-bot committed
neon mlal qs8 rsum use sliced accumulator sum to allow co-issuing of adds
PiperOrigin-RevId: 634581950
1 parent f79f872 commit 3e0f89e

File tree

5 files changed

+27
-4
lines changed

5 files changed

+27
-4
lines changed

bench/f32-rsum.cc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,17 @@
258258
->UseRealTime();
259259
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
260260

261+
262+
#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
263+
BENCHMARK_CAPTURE(f32_rsum, rvv_u1v,
264+
xnn_f32_rsum_ukernel__rvv_u1v,
265+
xnn_init_f32_scale_scalar_params,
266+
benchmark::utils::CheckRVV)
267+
->Apply(BenchmarkRSUM)
268+
->UseRealTime();
269+
#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
270+
271+
261272
BENCHMARK_CAPTURE(f32_rsum, scalar_u1,
262273
xnn_f32_rsum_ukernel__scalar_u1,
263274
xnn_init_f32_scale_scalar_params)

src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc4.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,8 +83,8 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u32_acc4(
8383
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
8484
}
8585
vacc0 = vaddq_s32(vacc0, vacc1);
86+
vacc2 = vaddq_s32(vacc2, vacc3);
8687
vacc0 = vaddq_s32(vacc0, vacc2);
87-
vacc0 = vaddq_s32(vacc0, vacc3);
8888
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
8989
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);
9090

src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc4.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,8 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u64_acc4(
9999
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
100100
}
101101
vacc0 = vaddq_s32(vacc0, vacc1);
102+
vacc2 = vaddq_s32(vacc2, vacc3);
102103
vacc0 = vaddq_s32(vacc0, vacc2);
103-
vacc0 = vaddq_s32(vacc0, vacc3);
104104
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
105105
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);
106106

src/qs8-rsum/neon-mlal.c.in

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,8 +70,13 @@ void xnn_qs8_rsum_minmax_${REQUANTIZATION.lower()}_ukernel__neon_mlal_u${CHANNEL
7070
}
7171
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
7272
}
73-
$for A in range(1, ACCUMULATORS):
74-
vacc0 = vaddq_s32(vacc0, vacc${A});
73+
$if ACCUMULATORS > 1:
74+
$ACC_SLICE = 1
75+
$while ACC_SLICE < ACCUMULATORS:
76+
$for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
77+
$if A + ACC_SLICE < ACCUMULATORS:
78+
vacc${A} = vaddq_s32(vacc${A}, vacc${A + ACC_SLICE});
79+
$ACC_SLICE *= 2
7580
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
7681
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);
7782

test/f32-rsum.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1388,6 +1388,13 @@
13881388
.Test(xnn_f32_rsum_ukernel__rvv_u1v, xnn_init_f32_scale_scalar_params);
13891389
}
13901390
}
1391+
1392+
TEST(F32_RSUM__RVV_U1V, overflow_accumulator) {
1393+
TEST_REQUIRES_RISCV_VECTOR;
1394+
RSumMicrokernelTester()
1395+
.batch_size(128)
1396+
.Test(xnn_f32_rsum_ukernel__rvv_u1v, xnn_init_f32_scale_scalar_params);
1397+
}
13911398
#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
13921399

13931400

0 commit comments

Comments
 (0)