Skip to content

Commit

Permalink
NEON MLAL QS8 rsum: use sliced accumulator sum to allow co-issuing of adds
Browse files Browse the repository at this point in the history

PiperOrigin-RevId: 634581950
  • Loading branch information
fbarchard authored and xnnpack-bot committed May 17, 2024
1 parent f79f872 commit 3e0f89e
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 4 deletions.
11 changes: 11 additions & 0 deletions bench/f32-rsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,17 @@
->UseRealTime();
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD


// Registers the f32_rsum benchmark under the capture name `rvv_u1v`, binding it
// to xnn_f32_rsum_ukernel__rvv_u1v with xnn_init_f32_scale_scalar_params as the
// params initializer. The registration is compiled only when both
// XNN_ENABLE_RISCV_VECTOR and XNN_ARCH_RISCV are set.
// NOTE(review): benchmark::utils::CheckRVV is passed as the trailing argument,
// presumably as a runtime ISA check that skips the benchmark on hardware
// without RVV -- confirm against the f32_rsum benchmark function signature.
#if XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV
BENCHMARK_CAPTURE(f32_rsum, rvv_u1v,
xnn_f32_rsum_ukernel__rvv_u1v,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckRVV)
// Benchmark arguments are supplied by BenchmarkRSUM; timing uses wall-clock
// (real) time rather than CPU time.
->Apply(BenchmarkRSUM)
->UseRealTime();
#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV


BENCHMARK_CAPTURE(f32_rsum, scalar_u1,
xnn_f32_rsum_ukernel__scalar_u1,
xnn_init_f32_scale_scalar_params)
Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u32-acc4.c
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u32_acc4(
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
}
vacc0 = vaddq_s32(vacc0, vacc1);
vacc2 = vaddq_s32(vacc2, vacc3);
vacc0 = vaddq_s32(vacc0, vacc2);
vacc0 = vaddq_s32(vacc0, vacc3);
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);

Expand Down
2 changes: 1 addition & 1 deletion src/qs8-rsum/gen/qs8-rsum-minmax-fp32-neon-u64-acc4.c
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,8 @@ void xnn_qs8_rsum_minmax_fp32_ukernel__neon_mlal_u64_acc4(
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
}
vacc0 = vaddq_s32(vacc0, vacc1);
vacc2 = vaddq_s32(vacc2, vacc3);
vacc0 = vaddq_s32(vacc0, vacc2);
vacc0 = vaddq_s32(vacc0, vacc3);
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);

Expand Down
9 changes: 7 additions & 2 deletions src/qs8-rsum/neon-mlal.c.in
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,13 @@ void xnn_qs8_rsum_minmax_${REQUANTIZATION.lower()}_ukernel__neon_mlal_u${CHANNEL
}
vacc0 = vaddq_s32(vacc0, vaddq_s32(vmovl_s16(vget_low_s16(vacc16_0)), vmovl_s16(vget_high_s16(vacc16_0))));
}
$for A in range(1, ACCUMULATORS):
vacc0 = vaddq_s32(vacc0, vacc${A});
$if ACCUMULATORS > 1:
$ACC_SLICE = 1
$while ACC_SLICE < ACCUMULATORS:
$for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
$if A + ACC_SLICE < ACCUMULATORS:
vacc${A} = vaddq_s32(vacc${A}, vacc${A + ACC_SLICE});
$ACC_SLICE *= 2
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0));
vacc_lo = vpadd_s32(vacc_lo, vacc_lo);

Expand Down
7 changes: 7 additions & 0 deletions test/f32-rsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1388,6 +1388,13 @@
.Test(xnn_f32_rsum_ukernel__rvv_u1v, xnn_init_f32_scale_scalar_params);
}
}

// Runs xnn_f32_rsum_ukernel__rvv_u1v through RSumMicrokernelTester with a fixed
// batch size of 128, using xnn_init_f32_scale_scalar_params to initialize the
// kernel params.
// NOTE(review): the test name suggests it targets accumulator overflow; how a
// batch size of 128 relates to the kernel's accumulator is not visible from
// this hunk -- confirm against the test generator / tester implementation.
TEST(F32_RSUM__RVV_U1V, overflow_accumulator) {
// Presumably skips the test when the RISC-V vector extension is unavailable
// at runtime -- confirm against the macro definition.
TEST_REQUIRES_RISCV_VECTOR;
RSumMicrokernelTester()
.batch_size(128)
.Test(xnn_f32_rsum_ukernel__rvv_u1v, xnn_init_f32_scale_scalar_params);
}
#endif // XNN_ENABLE_RISCV_VECTOR && XNN_ARCH_RISCV


Expand Down

0 comments on commit 3e0f89e

Please sign in to comment.