Skip to content

Commit

Permalink
Accumulating AVX rdsum microkernels
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 627355959
  • Loading branch information
alankelly authored and xnnpack-bot committed Apr 24, 2024
1 parent 8637b70 commit a564ce7
Show file tree
Hide file tree
Showing 36 changed files with 4,786 additions and 989 deletions.
105 changes: 58 additions & 47 deletions bench/f32-rdsum.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,91 +10,102 @@
#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gavgpool.h>
#include <xnnpack/reduce.h>
#include <xnnpack/microfnptr.h>
#include <xnnpack/microparams-init.h>


// Registers the portable scalar variant of the f32 rdsum benchmark.
//
// BENCHMARK_CAPTURE binds the shared f32_rsum_discontig benchmark body to:
//   - case name:        scalar_c4
//   - microkernel:      xnn_f32_rdsum_ukernel_7p7x__scalar_c4
//   - params init:      xnn_init_f32_scale_scalar_params
// No ISA-check argument is passed here (the NEON/AVX registrations pass one),
// so this case is not gated on a CPU-feature probe.
BENCHMARK_CAPTURE( f32_rsum_discontig, scalar_c4,
xnn_f32_rdsum_ukernel_7p7x__scalar_c4,
xnn_init_f32_scale_scalar_params)
// BenchmarkBatch is defined outside this view; presumably it supplies the
// argument sets the benchmark is run over -- confirm.
->Apply(BenchmarkBatch)
// Report wall-clock time rather than CPU time.
->UseRealTime();

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c4,
xnn_f32_gavgpool_minmax_ukernel_7p7x__neon_c4,
xnn_init_f32_scaleminmax_scalar_params,
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c16,
xnn_f32_rdsum_ukernel_7p7x__neon_c16,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, sse_c4,
xnn_f32_gavgpool_minmax_ukernel_7p7x__sse_c4,
xnn_init_f32_scaleminmax_sse_params)
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c32,
xnn_f32_rdsum_ukernel_7p7x__neon_c32,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE( f32_rsum_discontig, wasmsimd_arm_c4,
xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_arm_c4,
xnn_init_f32_scaleminmax_scalar_params)
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c64,
xnn_f32_rdsum_ukernel_7p7x__neon_c64,
xnn_init_f32_scale_scalar_params,
benchmark::utils::CheckNEON)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64


#if XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE( f32_rsum_discontig, wasmsimd_x86_c4,
xnn_f32_gavgpool_minmax_ukernel_7p7x__wasmsimd_x86_c4,
xnn_init_f32_scaleminmax_scalar_params)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, sse_c16,
xnn_f32_rdsum_ukernel_7p7x__sse_c16,
xnn_init_f32_scale_sse_params)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
BENCHMARK_CAPTURE( f32_rsum_discontig, wasm_c1,
xnn_f32_gavgpool_minmax_ukernel_7p7x__wasm_c1,
xnn_init_f32_scaleminmax_scalar_params)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, sse_c32,
xnn_f32_rdsum_ukernel_7p7x__sse_c32,
xnn_init_f32_scale_sse_params)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_WASM || XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


BENCHMARK_CAPTURE( f32_rsum_discontig, scalar_c1,
xnn_f32_gavgpool_minmax_ukernel_7p7x__scalar_c1,
xnn_init_f32_scaleminmax_scalar_params)
->Apply(BenchmarkBatch)
->UseRealTime();
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, sse_c64,
xnn_f32_rdsum_ukernel_7p7x__sse_c64,
xnn_init_f32_scale_sse_params)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c16,
xnn_f32_rdsum_minmax_ukernel_7p7x__neon_c16,
xnn_init_f32_scaleminmax_scalar_params,
benchmark::utils::CheckNEON)

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, avx_c16,
xnn_f32_rdsum_ukernel_7p7x__avx_c16,
xnn_init_f32_scale_avx_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c32,
xnn_f32_rdsum_minmax_ukernel_7p7x__neon_c32,
xnn_init_f32_scaleminmax_scalar_params,
benchmark::utils::CheckNEON)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, avx_c32,
xnn_f32_rdsum_ukernel_7p7x__avx_c32,
xnn_init_f32_scale_avx_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#if XNN_ARCH_ARM || XNN_ARCH_ARM64
BENCHMARK_CAPTURE( f32_rsum_discontig, neon_c64,
xnn_f32_rdsum_minmax_ukernel_7p7x__neon_c64,
xnn_init_f32_scaleminmax_scalar_params,
benchmark::utils::CheckNEON)
#if XNN_ARCH_X86 || XNN_ARCH_X86_64
BENCHMARK_CAPTURE( f32_rsum_discontig, avx_c64,
xnn_f32_rdsum_ukernel_7p7x__avx_c64,
xnn_init_f32_scale_avx_params,
benchmark::utils::CheckAVX)
->Apply(BenchmarkBatch)
->UseRealTime();
#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64


#ifndef XNNPACK_BENCHMARK_NO_MAIN
Expand Down
14 changes: 6 additions & 8 deletions bench/rsum-benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
#include <xnnpack.h>
#include <xnnpack/aligned-allocator.h>
#include <xnnpack/common.h>
#include <xnnpack/gavgpool.h>
#include <xnnpack/reduce.h>
#include <xnnpack/microfnptr.h>

namespace {
void f32_rsum_discontig(
benchmark::State& state,
xnn_f32_gavgpool_minmax_multipass_ukernel_fn rsum_discontig,
xnn_init_f32_scaleminmax_params_fn init_params,
xnn_f32_rdsum_ukernel_fn rdsum,
xnn_init_f32_scale_params_fn init_params,
benchmark::utils::IsaCheckFunction isa_check = nullptr)
{
if (isa_check != nullptr && !isa_check(state)) {
Expand All @@ -37,17 +37,15 @@ void f32_rsum_discontig(

std::vector<float, AlignedAllocator<float, 64>> input(rows * channels + XNN_EXTRA_BYTES / sizeof(float));
std::vector<float> output(channels);
std::vector<float> buffer(channels + XNN_EXTRA_BYTES / sizeof(float));
std::vector<float> zero(channels + XNN_EXTRA_BYTES / sizeof(float), 0.f);
std::iota(input.begin(), input.end(), 0.0f);

// Prepare parameters.
union xnn_f32_scaleminmax_params params;
init_params(&params,
/*scale=*/1.0f / rows, -std::numeric_limits<float>::infinity(), +std::numeric_limits<float>::infinity());
union xnn_f32_scale_params params;
init_params(&params, /*scale=*/1.0f / rows);

for (auto _ : state) {
rsum_discontig(rows, channels, input.data(), rows * sizeof(float), zero.data(), buffer.data(), output.data(), &params);
rdsum(rows, channels, input.data(), rows * sizeof(float), zero.data(), output.data(), &params);
}

const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
Expand Down
6 changes: 6 additions & 0 deletions cmake/microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ SET(ALL_AVX_MICROKERNEL_SRCS
src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c
src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c
src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c
src/f32-rminmax/gen/f32-rmax-avx-u8.c
src/f32-rminmax/gen/f32-rmax-avx-u16-acc2.c
src/f32-rminmax/gen/f32-rmax-avx-u24-acc3.c
Expand Down Expand Up @@ -6577,6 +6580,9 @@ SET(ALL_SSE_MICROKERNEL_SRCS
src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c
src/f32-prelu/gen/f32-prelu-sse-2x4.c
src/f32-prelu/gen/f32-prelu-sse-2x8.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c
src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c
src/f32-rminmax/gen/f32-rmax-sse-u4.c
src/f32-rminmax/gen/f32-rmax-sse-u8-acc2.c
src/f32-rminmax/gen/f32-rmax-sse-u12-acc3.c
Expand Down
6 changes: 6 additions & 0 deletions microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ ALL_AVX_MICROKERNEL_SRCS = [
"src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u16.c",
"src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u24.c",
"src/f32-qu8-vcvt/gen/f32-qu8-vcvt-avx-u32.c",
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c",
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c",
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c",
"src/f32-rminmax/gen/f32-rmax-avx-u8.c",
"src/f32-rminmax/gen/f32-rmax-avx-u16-acc2.c",
"src/f32-rminmax/gen/f32-rmax-avx-u24-acc3.c",
Expand Down Expand Up @@ -6605,6 +6608,9 @@ ALL_SSE_MICROKERNEL_SRCS = [
"src/f32-ppmm/gen/f32-ppmm-4x8-minmax-sse.c",
"src/f32-prelu/gen/f32-prelu-sse-2x4.c",
"src/f32-prelu/gen/f32-prelu-sse-2x8.c",
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c",
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c",
"src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c",
"src/f32-rminmax/gen/f32-rmax-sse-u4.c",
"src/f32-rminmax/gen/f32-rmax-sse-u8-acc2.c",
"src/f32-rminmax/gen/f32-rmax-sse-u12-acc3.c",
Expand Down
2 changes: 1 addition & 1 deletion scripts/generate-benchmarks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,6 @@ tools/generate-vunary-benchmark.py --spec test/f16-vhswish.yaml --output bench/f
tools/generate-vunary-benchmark.py --spec test/f32-vhswish.yaml --output bench/f32-vhswish.cc &

### Benchmarks for RDSum micro-kernels
tools/generate-rdsum-benchmark.py --spec test/f32-gavgpool-minmax.yaml --output bench/f32-rdsum.cc
tools/generate-rdsum-benchmark.py --spec test/f32-rdsum.yaml --output bench/f32-rdsum.cc

wait
10 changes: 10 additions & 0 deletions scripts/generate-f32-rdsum.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,14 @@ tools/xngen src/f32-rdsum/neon.c.in -D CHANNELS=16 -D ACCUMULATORS=7 -o src/f32-
tools/xngen src/f32-rdsum/neon.c.in -D CHANNELS=32 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c32.c &
tools/xngen src/f32-rdsum/neon.c.in -D CHANNELS=64 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-neon-c64.c &

#################################### SSE ####################################
tools/xngen src/f32-rdsum/sse.c.in -D CHANNELS=16 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c16.c &
tools/xngen src/f32-rdsum/sse.c.in -D CHANNELS=32 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c32.c &
tools/xngen src/f32-rdsum/sse.c.in -D CHANNELS=64 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-sse-c64.c &

#################################### AVX ####################################
tools/xngen src/f32-rdsum/avx.c.in -D CHANNELS=16 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c16.c &
tools/xngen src/f32-rdsum/avx.c.in -D CHANNELS=32 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c32.c &
tools/xngen src/f32-rdsum/avx.c.in -D CHANNELS=64 -D ACCUMULATORS=7 -o src/f32-rdsum/gen/f32-rdsum-7p7x-minmax-avx-c64.c &

wait
1 change: 1 addition & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,7 @@ tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/f32-rsu

tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/u8-rmax.yaml --output test/u8-rmax.cc &

tools/generate-rdsum-test.py --spec test/f32-rdsum.yaml --output test/f32-rdsum.cc &
### Tests for Fill micro-kernels
tools/generate-fill-test.py --spec test/xx-fill.yaml --output test/xx-fill.cc &

Expand Down

0 comments on commit a564ce7

Please sign in to comment.