Skip to content

Commit

Permalink
sse41 qs8 rsum accumulating microkernels
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 633200354
  • Loading branch information
alankelly authored and xnnpack-bot committed May 16, 2024
1 parent 7fabcac commit becf393
Show file tree
Hide file tree
Showing 62 changed files with 4,645 additions and 376 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2991,6 +2991,12 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(qs8-requantization-test PRIVATE hardware-config logging microkernels-all)
ADD_TEST(NAME qs8-requantization-test COMMAND qs8-requantization-test)

# Builds the qs8-rsum-minmax-fp32 test executable and registers it with CTest.
# The source path follows the sibling convention of "test/<target name without
# the -test suffix>.cc"; it previously read "mimax", which does not match the
# target name or the benchmark source of the same kernel family.
ADD_EXECUTABLE(qs8-rsum-minmax-fp32-test test/qs8-rsum-minmax-fp32.cc)
TARGET_INCLUDE_DIRECTORIES(qs8-rsum-minmax-fp32-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main microparams-init)
TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE hardware-config logging microkernels-all)
ADD_TEST(NAME qs8-rsum-minmax-fp32-test COMMAND qs8-rsum-minmax-fp32-test)

ADD_EXECUTABLE(qs8-vadd-minmax-test test/qs8-vadd-minmax.cc)
SET_TARGET_PROPERTIES(qs8-vadd-minmax-test PROPERTIES CXX_EXTENSIONS YES)
TARGET_INCLUDE_DIRECTORIES(qs8-vadd-minmax-test PRIVATE include src test)
Expand Down
22 changes: 19 additions & 3 deletions bench/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ xnnpack_benchmark(
deps = MICROKERNEL_BENCHMARK_DEPS + ["//:requantization_stubs"],
)

# Benchmark target built from qs8-rsum-minmax-fp32.cc. On top of the common
# microkernel benchmark dependencies it pulls in the local ":rsum_benchmark"
# target.
xnnpack_benchmark(
    name = "qs8_rsum_minmax_fp32_bench",
    srcs = ["qs8-rsum-minmax-fp32.cc"],
    deps = MICROKERNEL_BENCHMARK_DEPS + [":rsum_benchmark"],
)

xnnpack_benchmark(
name = "qs8_vadd_bench",
srcs = [
Expand Down Expand Up @@ -442,15 +452,19 @@ xnnpack_benchmark(
srcs = [
"f16-rsum.cc",
],
deps = MICROKERNEL_BENCHMARK_DEPS,
deps = MICROKERNEL_BENCHMARK_DEPS + [
":rsum_benchmark",
],
)

xnnpack_benchmark(
name = "f16_f32acc_rsum_bench",
srcs = [
"f16-f32acc-rsum.cc",
],
deps = MICROKERNEL_BENCHMARK_DEPS,
deps = MICROKERNEL_BENCHMARK_DEPS + [
":rsum_benchmark",
],
)

xnnpack_benchmark(
Expand Down Expand Up @@ -839,7 +853,9 @@ xnnpack_benchmark(
srcs = [
"f32-rsum.cc",
],
deps = MICROKERNEL_BENCHMARK_DEPS,
deps = MICROKERNEL_BENCHMARK_DEPS + [
":rsum_benchmark",
],
)

xnnpack_benchmark(
Expand Down
68 changes: 54 additions & 14 deletions bench/f16-f32acc-rdsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,71 +20,111 @@


#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, neonfp16arith_c16,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, neonfp16arith_c16,
xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c16,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckNEONFP16ARITH)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, neonfp16arith_c32,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, neonfp16arith_c32,
xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c32,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckNEONFP16ARITH)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, neonfp16arith_c64,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, neonfp16arith_c64,
xnn_f16_f32acc_rdsum_ukernel_7p7x__neonfp16arith_c64,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckNEONFP16ARITH)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64)


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, avx512skx_c16,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, f16c_c16,
xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c16,
xnn_init_f16_f32acc_scale_avx_params,
benchmark::utils::CheckF16C)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registers the benchmark case "f16_f32acc_rdsum/f16c_c32" (x86 / x86-64
// builds only). The captured arguments handed to f16_f32acc_rdsum are the
// 7p7x F16C c32 microkernel, its params initializer and the CheckF16C
// predicate. BenchmarkRDSUM supplies the benchmark arguments, and timing is
// reported in real (wall-clock) time.
BENCHMARK_CAPTURE(
    f16_f32acc_rdsum, f16c_c32,
    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c32,
    xnn_init_f16_f32acc_scale_avx_params,
    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkRDSUM)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registers the benchmark case "f16_f32acc_rdsum/f16c_c64" (x86 / x86-64
// builds only). The captured arguments handed to f16_f32acc_rdsum are the
// 7p7x F16C c64 microkernel, its params initializer and the CheckF16C
// predicate. BenchmarkRDSUM supplies the benchmark arguments, and timing is
// reported in real (wall-clock) time.
BENCHMARK_CAPTURE(
    f16_f32acc_rdsum, f16c_c64,
    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c64,
    xnn_init_f16_f32acc_scale_avx_params,
    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkRDSUM)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
// Registers the benchmark case "f16_f32acc_rdsum/f16c_c128" (x86 / x86-64
// builds only). The captured arguments handed to f16_f32acc_rdsum are the
// 7p7x F16C c128 microkernel, its params initializer and the CheckF16C
// predicate. BenchmarkRDSUM supplies the benchmark arguments, and timing is
// reported in real (wall-clock) time.
BENCHMARK_CAPTURE(
    f16_f32acc_rdsum, f16c_c128,
    xnn_f16_f32acc_rdsum_ukernel_7p7x__f16c_c128,
    xnn_init_f16_f32acc_scale_avx_params,
    benchmark::utils::CheckF16C)
    ->Apply(BenchmarkRDSUM)
    ->UseRealTime();
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rdsum, avx512skx_c16,
xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c16,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckAVX512SKX)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, avx512skx_c32,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, avx512skx_c32,
xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c32,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckAVX512SKX)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, avx512skx_c64,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, avx512skx_c64,
xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c64,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckAVX512SKX)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE(f16_f32acc_rsum_discontig, avx512skx_c128,
BENCHMARK_CAPTURE(f16_f32acc_rdsum, avx512skx_c128,
xnn_f16_f32acc_rdsum_ukernel_7p7x__avx512skx_c128,
xnn_init_f16_f32acc_scale_scalar_params,
benchmark::utils::CheckAVX512SKX)
->Apply(BenchmarkBatch)
->Apply(BenchmarkRDSUM)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

Expand Down

0 comments on commit becf393

Please sign in to comment.