From f49c9ce7e0d497593fe5afab3ebbaabba68647a0 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Sat, 4 May 2024 17:45:02 -0700 Subject: [PATCH] F16-RMINMAX microkernels using AVX512 FP16 arithmetics - F16-RMINMAX and F16-RMIN microkernels using FP16 arithmetics PiperOrigin-RevId: 630721869 --- bench/f16-rmin.cc | 33 ++++ bench/f16-rminmax.cc | 33 ++++ cmake/microkernels.cmake | 12 +- microkernels.bzl | 10 + scripts/generate-f16-rminmax.sh | 12 ++ src/amalgam/gen/avx512skx.c | 24 ++- .../gen/f16-rmin-avx512fp16-u128-acc2.c | 72 +++++++ .../gen/f16-rmin-avx512fp16-u128-acc4.c | 76 +++++++ src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c | 58 ++++++ .../gen/f16-rmin-avx512fp16-u64-acc2.c | 68 +++++++ .../gen/f16-rmin-avx512fp16-u96-acc3.c | 72 +++++++ .../gen/f16-rminmax-avx512fp16-u128-acc2.c | 87 ++++++++ .../gen/f16-rminmax-avx512fp16-u128-acc4.c | 95 +++++++++ .../gen/f16-rminmax-avx512fp16-u32.c | 67 +++++++ .../gen/f16-rminmax-avx512fp16-u64-acc2.c | 81 ++++++++ .../gen/f16-rminmax-avx512fp16-u96-acc3.c | 88 +++++++++ src/xnnpack/reduce.h | 12 ++ test/f16-rmin.cc | 185 ++++++++++++++++++ test/f16-rmin.yaml | 7 + test/f16-rminmax.cc | 185 ++++++++++++++++++ test/f16-rminmax.yaml | 7 + 21 files changed, 1277 insertions(+), 7 deletions(-) create mode 100644 src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c create mode 100644 src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c create mode 100644 src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c create mode 100644 src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c create mode 100644 src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c create mode 100644 src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c create mode 100644 src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c create mode 100644 src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c create mode 100644 src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c create mode 100644 src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c diff --git a/bench/f16-rmin.cc b/bench/f16-rmin.cc index 008a330f411..793487406a7 100644 --- a/bench/f16-rmin.cc +++ b/bench/f16-rmin.cc @@ -98,6 +98,39 @@ static void f16_rmin( ->UseRealTime(); #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + BENCHMARK_CAPTURE(f16_rmin, avx512fp16_u32, + xnn_f16_rmin_ukernel__avx512fp16_u32, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rmin, avx512fp16_u64_acc2, + xnn_f16_rmin_ukernel__avx512fp16_u64_acc2, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rmin, avx512fp16_u96_acc3, + xnn_f16_rmin_ukernel__avx512fp16_u96_acc3, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rmin, avx512fp16_u128_acc2, + xnn_f16_rmin_ukernel__avx512fp16_u128_acc2, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rmin, avx512fp16_u128_acc4, + xnn_f16_rmin_ukernel__avx512fp16_u128_acc4, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_rmin, avx512skx_u16, 
xnn_f16_rmin_ukernel__avx512skx_u16, diff --git a/bench/f16-rminmax.cc b/bench/f16-rminmax.cc index efa0522c5b6..ea2b2cf75d4 100644 --- a/bench/f16-rminmax.cc +++ b/bench/f16-rminmax.cc @@ -98,6 +98,39 @@ static void f16_rminmax( ->UseRealTime(); #endif // XNN_ARCH_ARM || XNN_ARCH_ARM64 +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + BENCHMARK_CAPTURE(f16_rminmax, avx512fp16_u32, + xnn_f16_rminmax_ukernel__avx512fp16_u32, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rminmax, avx512fp16_u64_acc2, + xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rminmax, avx512fp16_u96_acc3, + xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rminmax, avx512fp16_u128_acc2, + xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); + BENCHMARK_CAPTURE(f16_rminmax, avx512fp16_u128_acc4, + xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4, + /*init_params=*/nullptr, + benchmark::utils::CheckAVX512FP16) + ->Apply(benchmark::utils::ReductionParameters) + ->UseRealTime(); +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 BENCHMARK_CAPTURE(f16_rminmax, avx512skx_u16, xnn_f16_rminmax_ukernel__avx512skx_u16, diff --git a/cmake/microkernels.cmake b/cmake/microkernels.cmake index 74365d24070..f53f1b404d2 100644 --- a/cmake/microkernels.cmake +++ b/cmake/microkernels.cmake @@ -1548,7 +1548,17 @@ SET(ALL_AVX512FP16_MICROKERNEL_SRCS src/f16-rminmax/gen/f16-rmax-avx512fp16-u64-acc2.c src/f16-rminmax/gen/f16-rmax-avx512fp16-u96-acc3.c src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc2.c - src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc4.c) + src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc4.c + src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c + src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c + src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c + src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c + src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c + src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c + src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c + src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c + src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c + src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c) SET(ALL_AVX512SKX_MICROKERNEL_SRCS src/f16-f32-vcvt/gen/f16-f32-vcvt-avx512skx-u16.c diff --git a/microkernels.bzl b/microkernels.bzl index 7a123b50bfb..69d221f0bf8 100644 --- a/microkernels.bzl +++ b/microkernels.bzl @@ -1550,6 +1550,16 @@ ALL_AVX512FP16_MICROKERNEL_SRCS = [ "src/f16-rminmax/gen/f16-rmax-avx512fp16-u96-acc3.c", "src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc2.c", "src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc4.c", + "src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c", + "src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c", + "src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c", + "src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c", + "src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c", + "src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c", + 
"src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c", + "src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c", + "src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c", + "src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c", ] ALL_AVX512SKX_MICROKERNEL_SRCS = [ diff --git a/scripts/generate-f16-rminmax.sh b/scripts/generate-f16-rminmax.sh index 1e464ff4862..32439cff1c1 100755 --- a/scripts/generate-f16-rminmax.sh +++ b/scripts/generate-f16-rminmax.sh @@ -30,6 +30,18 @@ tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=128 -D ACCUMULATORS=2 -D OP=MAX -o src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc2.c & tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=128 -D ACCUMULATORS=4 -D OP=MAX -o src/f16-rminmax/gen/f16-rmax-avx512fp16-u128-acc4.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -D OP=MIN -o src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -D OP=MIN -o src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 -D OP=MIN -o src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=128 -D ACCUMULATORS=2 -D OP=MIN -o src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=128 -D ACCUMULATORS=4 -D OP=MIN -o src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c & + +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=32 -D ACCUMULATORS=1 -D OP=MINMAX -o src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=64 -D ACCUMULATORS=2 -D OP=MINMAX -o src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=96 -D ACCUMULATORS=3 -D OP=MINMAX -o src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=128 -D ACCUMULATORS=2 -D OP=MINMAX -o src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c & +tools/xngen src/f16-rminmax/avx512fp16.c.in -D BATCH_TILE=128 -D ACCUMULATORS=4 -D OP=MINMAX -o src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c & + ################################## x86 AVX512SKX ################################# tools/xngen src/f16-rminmax/avx512skx.c.in -D BATCH_TILE=16 -D ACCUMULATORS=1 -D OP=MAX -o src/f16-rminmax/gen/f16-rmax-avx512skx-u16.c & tools/xngen src/f16-rminmax/avx512skx.c.in -D BATCH_TILE=32 -D ACCUMULATORS=2 -D OP=MAX -o src/f16-rminmax/gen/f16-rmax-avx512skx-u32-acc2.c & diff --git a/src/amalgam/gen/avx512skx.c b/src/amalgam/gen/avx512skx.c index fc47b8b0177..50d456330ab 100644 --- a/src/amalgam/gen/avx512skx.c +++ b/src/amalgam/gen/avx512skx.c @@ -1313,10 +1313,18 @@ void xnn_f32_vtanh_ukernel__avx512skx_expm1minus_rr1_lut4_p4h3ts_perm_div_u64( const __m512 vx3 = _mm512_loadu_ps(input + 48); input += 64; - const __m512 vz0 = _mm512_range_ps(vsat_cutoff, vx0, 0xA); - const __m512 vz1 = _mm512_range_ps(vsat_cutoff, vx1, 0xA); - const __m512 vz2 = _mm512_range_ps(vsat_cutoff, vx2, 0xA); - const __m512 vz3 = _mm512_range_ps(vsat_cutoff, vx3, 0xA); + const __mmask16 vnan_mask0 = _mm512_cmp_ps_mask(vx0, vx0, _CMP_EQ_OQ); + __m512 vz0 = _mm512_range_ps(vsat_cutoff, vx0, 0xA); + vz0 = _mm512_mask_blend_ps(vnan_mask0, vx0, vz0); + const __mmask16 vnan_mask1 = _mm512_cmp_ps_mask(vx1, vx1, _CMP_EQ_OQ); + 
__m512 vz1 = _mm512_range_ps(vsat_cutoff, vx1, 0xA); + vz1 = _mm512_mask_blend_ps(vnan_mask1, vx1, vz1); + const __mmask16 vnan_mask2 = _mm512_cmp_ps_mask(vx2, vx2, _CMP_EQ_OQ); + __m512 vz2 = _mm512_range_ps(vsat_cutoff, vx2, 0xA); + vz2 = _mm512_mask_blend_ps(vnan_mask2, vx2, vz2); + const __mmask16 vnan_mask3 = _mm512_cmp_ps_mask(vx3, vx3, _CMP_EQ_OQ); + __m512 vz3 = _mm512_range_ps(vsat_cutoff, vx3, 0xA); + vz3 = _mm512_mask_blend_ps(vnan_mask3, vx3, vz3); __m512 vn0 = _mm512_fmadd_ps(vz0, vminus_log2e, vmagic_bias); __m512 vn1 = _mm512_fmadd_ps(vz1, vminus_log2e, vmagic_bias); __m512 vn2 = _mm512_fmadd_ps(vz2, vminus_log2e, vmagic_bias); @@ -1399,7 +1407,9 @@ void xnn_f32_vtanh_ukernel__avx512skx_expm1minus_rr1_lut4_p4h3ts_perm_div_u64( const __m512 vx = _mm512_loadu_ps(input); input += 16; - const __m512 vz = _mm512_range_ps(vsat_cutoff, vx, 0xA); + const __mmask16 vnan_mask = _mm512_cmp_ps_mask(vx, vx, _CMP_EQ_OQ); + __m512 vz = _mm512_range_ps(vsat_cutoff, vx, 0xA); + vz = _mm512_mask_blend_ps(vnan_mask, vx, vz); __m512 vn = _mm512_fmadd_ps(vz, vminus_log2e, vmagic_bias); const __m512i ve = _mm512_slli_epi32(_mm512_castps_si512(vn), 21); @@ -1438,7 +1448,9 @@ void xnn_f32_vtanh_ukernel__avx512skx_expm1minus_rr1_lut4_p4h3ts_perm_div_u64( const __m512 vx = _mm512_maskz_loadu_ps(vmask, input); - const __m512 vz = _mm512_range_ps(vsat_cutoff, vx, 0xA); + const __mmask16 vnan_mask = _mm512_cmp_ps_mask(vx, vx, _CMP_EQ_OQ); + __m512 vz = _mm512_range_ps(vsat_cutoff, vx, 0xA); + vz = _mm512_mask_blend_ps(vnan_mask, vx, vz); __m512 vn = _mm512_fmadd_ps(vz, vminus_log2e, vmagic_bias); const __m512i ve = _mm512_slli_epi32(_mm512_castps_si512(vn), 21); diff --git a/src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c new file mode 100644 index 00000000000..41b0a25e544 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc2.c @@ -0,0 +1,72 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rmin_ukernel__avx512fp16_u128_acc2( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmin1 = vmin0; + for (; batch >= 128 * sizeof(uint16_t); batch -= 128 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + const __m512h vt2 = _mm512_loadu_ph((i + 64)); + const __m512h vt3 = _mm512_loadu_ph((i + 96)); + i += 128; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmin0 = _mm512_min_ph(vmin0, vt2); + vmin1 = _mm512_min_ph(vmin1, vt3); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). 
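+    // batch is still a byte count here; shifting by XNN_LOG2_SIZEOF_HALF turns it
+    // into an element count, and ((1 << batch) - 1) sets one mask bit per
+    // remaining fp16 lane for the masked load below.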
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c new file mode 100644 index 00000000000..080cc97b109 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u128-acc4.c @@ -0,0 +1,76 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rmin_ukernel__avx512fp16_u128_acc4( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmin1 = vmin0; + __m512h vmin2 = vmin0; + __m512h vmin3 = vmin0; + for (; batch >= 128 * sizeof(uint16_t); batch -= 128 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + const __m512h vt2 = _mm512_loadu_ph((i + 64)); + const __m512h vt3 = _mm512_loadu_ph((i + 96)); + i += 128; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmin2 = _mm512_min_ph(vmin2, vt2); + vmin3 = _mm512_min_ph(vmin3, vt3); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + vmin2 = _mm512_min_ph(vmin2, vmin3); + vmin0 = _mm512_min_ph(vmin0, vmin2); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c new file mode 100644 index 00000000000..9ba8a2d7256 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u32.c @@ -0,0 +1,58 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rmin_ukernel__avx512fp16_u32( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). + batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c new file mode 100644 index 00000000000..c3577dcd9cc --- /dev/null +++ b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u64-acc2.c @@ -0,0 +1,68 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rmin_ukernel__avx512fp16_u64_acc2( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmin1 = vmin0; + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + i += 64; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). + batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c new file mode 100644 index 00000000000..9fb399e3cd6 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rmin-avx512fp16-u96-acc3.c @@ -0,0 +1,72 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include +#include + + +void xnn_f16_rmin_ukernel__avx512fp16_u96_acc3( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmin1 = vmin0; + __m512h vmin2 = vmin0; + for (; batch >= 96 * sizeof(uint16_t); batch -= 96 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + const __m512h vt2 = _mm512_loadu_ph((i + 64)); + i += 96; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmin2 = _mm512_min_ph(vmin2, vt2); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + vmin0 = _mm512_min_ph(vmin0, vmin2); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). + batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c new file mode 100644 index 00000000000..86df8921a88 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc2.c @@ -0,0 +1,87 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include +#include + + +void xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmax0 = vmin0; + __m512h vmin1 = vmin0; + __m512h vmax1 = vmax0; + for (; batch >= 128 * sizeof(uint16_t); batch -= 128 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + const __m512h vt2 = _mm512_loadu_ph((i + 64)); + const __m512h vt3 = _mm512_loadu_ph((i + 96)); + i += 128; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmax0 = _mm512_max_ph(vmax0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmax1 = _mm512_max_ph(vmax1, vt1); + vmin0 = _mm512_min_ph(vmin0, vt2); + vmax0 = _mm512_max_ph(vmax0, vt2); + vmin1 = _mm512_min_ph(vmin1, vt3); + vmax1 = _mm512_max_ph(vmax1, vt3); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + vmax0 = _mm512_max_ph(vmax0, vmax1); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + vmax0 = _mm512_max_ph(vmax0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). + batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + vmax0 = _mm512_mask_max_ph(vmax0, vmask, vmax0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m256h vmax256 = _mm256_max_ph(_mm512_castph512_ph256(vmax0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmax0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + __m128h vmax = _mm_max_ph(_mm256_castph256_ph128(vmax256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmax256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmax), _mm_castph_ps(vmax)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmax)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + vmax = _mm_max_sh(vmax, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmax), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); + *((uint16_t*) output + 1) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmax), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c new file mode 100644 index 00000000000..bf804f9412b --- /dev/null +++ b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u128-acc4.c @@ -0,0 +1,95 @@ +// Auto-generated file. Do not edit! 
+// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmax0 = vmin0; + __m512h vmin1 = vmin0; + __m512h vmax1 = vmax0; + __m512h vmin2 = vmin0; + __m512h vmax2 = vmax0; + __m512h vmin3 = vmin0; + __m512h vmax3 = vmax0; + for (; batch >= 128 * sizeof(uint16_t); batch -= 128 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + const __m512h vt2 = _mm512_loadu_ph((i + 64)); + const __m512h vt3 = _mm512_loadu_ph((i + 96)); + i += 128; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmax0 = _mm512_max_ph(vmax0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmax1 = _mm512_max_ph(vmax1, vt1); + vmin2 = _mm512_min_ph(vmin2, vt2); + vmax2 = _mm512_max_ph(vmax2, vt2); + vmin3 = _mm512_min_ph(vmin3, vt3); + vmax3 = _mm512_max_ph(vmax3, vt3); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + vmax0 = _mm512_max_ph(vmax0, vmax1); + vmin2 = _mm512_min_ph(vmin2, vmin3); + vmax2 = _mm512_max_ph(vmax2, vmax3); + vmin0 = _mm512_min_ph(vmin0, vmin2); + vmax0 = _mm512_max_ph(vmax0, vmax2); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + vmax0 = _mm512_max_ph(vmax0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + vmax0 = _mm512_mask_max_ph(vmax0, vmask, vmax0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m256h vmax256 = _mm256_max_ph(_mm512_castph512_ph256(vmax0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmax0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + __m128h vmax = _mm_max_ph(_mm256_castph256_ph128(vmax256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmax256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmax), _mm_castph_ps(vmax)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmax)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + vmax = _mm_max_sh(vmax, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmax), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); + *((uint16_t*) output + 1) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmax), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c new file mode 100644 index 00000000000..b7ed03bff51 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u32.c @@ -0,0 +1,67 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rminmax_ukernel__avx512fp16_u32( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmax0 = vmin0; + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + vmax0 = _mm512_max_ph(vmax0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). 
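+    // The zero-masked load fills inactive lanes with zero, so the updates use the
+    // merge-masked min/max forms: lanes outside the mask keep the running
+    // accumulator values instead of being polluted by those zeros.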
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + vmax0 = _mm512_mask_max_ph(vmax0, vmask, vmax0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m256h vmax256 = _mm256_max_ph(_mm512_castph512_ph256(vmax0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmax0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + __m128h vmax = _mm_max_ph(_mm256_castph256_ph128(vmax256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmax256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmax), _mm_castph_ps(vmax)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmax)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + vmax = _mm_max_sh(vmax, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmax), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); + *((uint16_t*) output + 1) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmax), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c new file mode 100644 index 00000000000..8ae7c447c54 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u64-acc2.c @@ -0,0 +1,81 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. + +#include + +#include + +#include +#include + + +void xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmax0 = vmin0; + __m512h vmin1 = vmin0; + __m512h vmax1 = vmax0; + for (; batch >= 64 * sizeof(uint16_t); batch -= 64 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + i += 64; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmax0 = _mm512_max_ph(vmax0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmax1 = _mm512_max_ph(vmax1, vt1); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + vmax0 = _mm512_max_ph(vmax0, vmax1); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + vmax0 = _mm512_max_ph(vmax0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). 
+ batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + vmax0 = _mm512_mask_max_ph(vmax0, vmask, vmax0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m256h vmax256 = _mm256_max_ph(_mm512_castph512_ph256(vmax0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmax0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + __m128h vmax = _mm_max_ph(_mm256_castph256_ph128(vmax256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmax256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmax), _mm_castph_ps(vmax)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmax)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + vmax = _mm_max_sh(vmax, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmax), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); + *((uint16_t*) output + 1) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmax), 0); +#endif //defined(__clang__) +} diff --git a/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c new file mode 100644 index 00000000000..5ad5806abc5 --- /dev/null +++ b/src/f16-rminmax/gen/f16-rminmax-avx512fp16-u96-acc3.c @@ -0,0 +1,88 @@ +// Auto-generated file. Do not edit! +// Template: src/f16-rminmax/avx512fp16.c.in +// Generator: tools/xngen +// +// Copyright 2024 Google LLC +// +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
+ +#include + +#include + +#include +#include + + +void xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3( + size_t batch, + const void* input, + void* output, + const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) +{ + assert(batch != 0); + assert(batch % sizeof(uint16_t) == 0); + assert(input != NULL); + assert(output != NULL); + +#if defined(__clang__) + const uint16_t* i = (const uint16_t*) input; + __m512h vmin0 = _mm512_castsi512_ph(_mm512_set1_epi16(*i)); + __m512h vmax0 = vmin0; + __m512h vmin1 = vmin0; + __m512h vmax1 = vmax0; + __m512h vmin2 = vmin0; + __m512h vmax2 = vmax0; + for (; batch >= 96 * sizeof(uint16_t); batch -= 96 * sizeof(uint16_t)) { + const __m512h vt0 = _mm512_loadu_ph(i); + const __m512h vt1 = _mm512_loadu_ph((i + 32)); + const __m512h vt2 = _mm512_loadu_ph((i + 64)); + i += 96; + + vmin0 = _mm512_min_ph(vmin0, vt0); + vmax0 = _mm512_max_ph(vmax0, vt0); + vmin1 = _mm512_min_ph(vmin1, vt1); + vmax1 = _mm512_max_ph(vmax1, vt1); + vmin2 = _mm512_min_ph(vmin2, vt2); + vmax2 = _mm512_max_ph(vmax2, vt2); + } + vmin0 = _mm512_min_ph(vmin0, vmin1); + vmax0 = _mm512_max_ph(vmax0, vmax1); + vmin0 = _mm512_min_ph(vmin0, vmin2); + vmax0 = _mm512_max_ph(vmax0, vmax2); + for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) { + const __m512h vt = _mm512_loadu_ph(i); + i += 32; + + vmin0 = _mm512_min_ph(vmin0, vt); + vmax0 = _mm512_max_ph(vmax0, vt); + } + if XNN_UNLIKELY(batch != 0) { + assert(batch >= 1 * sizeof(uint16_t)); + assert(batch <= 31 * sizeof(uint16_t)); + + // Prepare mask for valid elements (depends on batch). + batch >>= XNN_LOG2_SIZEOF_HALF; + const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << batch) - UINT32_C(1))); + + const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, i)); + + vmin0 = _mm512_mask_min_ph(vmin0, vmask, vmin0, vt); + vmax0 = _mm512_mask_max_ph(vmax0, vmask, vmax0, vt); + } + __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin0), 1))); + __m256h vmax256 = _mm256_max_ph(_mm512_castph512_ph256(vmax0), _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmax0), 1))); + __m128h vmin = _mm_min_ph(_mm256_castph256_ph128(vmin256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1))); + __m128h vmax = _mm_max_ph(_mm256_castph256_ph128(vmax256), _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmax256), 1))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin), _mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmax), _mm_castph_ps(vmax)))); + vmin = _mm_min_ph(vmin, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin)))); + vmax = _mm_max_ph(vmax, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmax)))); + vmin = _mm_min_sh(vmin, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin), 16))); + vmax = _mm_max_sh(vmax, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmax), 16))); + + *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin), 0); + *((uint16_t*) output + 1) = (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmax), 0); +#endif //defined(__clang__) +} diff --git a/src/xnnpack/reduce.h b/src/xnnpack/reduce.h index ad1a0b3ad49..6b0a58818c5 100644 --- a/src/xnnpack/reduce.h +++ b/src/xnnpack/reduce.h @@ -87,6 +87,18 @@ DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmax_ukernel__avx512fp16_u96_acc3) DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmax_ukernel__avx512fp16_u128_acc2) 
DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmax_ukernel__avx512fp16_u128_acc4) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmin_ukernel__avx512fp16_u32) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmin_ukernel__avx512fp16_u64_acc2) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmin_ukernel__avx512fp16_u96_acc3) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmin_ukernel__avx512fp16_u128_acc2) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmin_ukernel__avx512fp16_u128_acc4) + +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rminmax_ukernel__avx512fp16_u32) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2) +DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4) + DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmax_ukernel__avx512skx_u16) DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmax_ukernel__avx512skx_u32_acc2) DECLARE_F16_REDUCE_UKERNEL_FUNCTION(xnn_f16_rmax_ukernel__avx512skx_u48_acc3) diff --git a/test/f16-rmin.cc b/test/f16-rmin.cc index 4d29ca166cb..fab237ec464 100644 --- a/test/f16-rmin.cc +++ b/test/f16-rmin.cc @@ -203,6 +203,191 @@ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMIN__AVX512FP16_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(32) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::Min); + } + + TEST(F16_RMIN__AVX512FP16_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::Min); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMIN__AVX512FP16_U64_ACC2, batch_eq_64) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(64) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::Min); + } + + TEST(F16_RMIN__AVX512FP16_U64_ACC2, batch_div_64) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 128; batch_size < 640; batch_size += 64) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U64_ACC2, batch_lt_64) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 64; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U64_ACC2, batch_gt_64) { + 
TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 65; batch_size < 128; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::Min); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMIN__AVX512FP16_U96_ACC3, batch_eq_96) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(96) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::Min); + } + + TEST(F16_RMIN__AVX512FP16_U96_ACC3, batch_div_96) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 192; batch_size < 960; batch_size += 96) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U96_ACC3, batch_lt_96) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 96; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U96_ACC3, batch_gt_96) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 97; batch_size < 192; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::Min); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMIN__AVX512FP16_U128_ACC2, batch_eq_128) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(128) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::Min); + } + + TEST(F16_RMIN__AVX512FP16_U128_ACC2, batch_div_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 256; batch_size < 1280; batch_size += 128) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U128_ACC2, batch_lt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 128; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U128_ACC2, batch_gt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 129; batch_size < 256; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::Min); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMIN__AVX512FP16_U128_ACC4, batch_eq_128) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(128) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::Min); + } + + TEST(F16_RMIN__AVX512FP16_U128_ACC4, batch_div_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 256; batch_size < 1280; batch_size += 128) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U128_ACC4, 
batch_lt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 128; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::Min); + } + } + + TEST(F16_RMIN__AVX512FP16_U128_ACC4, batch_gt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 129; batch_size < 256; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rmin_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::Min); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_RMIN__AVX512SKX_U16, batch_eq_16) { TEST_REQUIRES_X86_AVX512SKX; diff --git a/test/f16-rmin.yaml b/test/f16-rmin.yaml index e07f76bcd20..ee4ee8b0aee 100644 --- a/test/f16-rmin.yaml +++ b/test/f16-rmin.yaml @@ -10,6 +10,13 @@ - name: xnn_f16_rmin_ukernel__neonfp16arith_u32_acc2 - name: xnn_f16_rmin_ukernel__neonfp16arith_u32_acc4 +# x86 AVX512FP16 +- name: xnn_f16_rmin_ukernel__avx512fp16_u32 +- name: xnn_f16_rmin_ukernel__avx512fp16_u64_acc2 +- name: xnn_f16_rmin_ukernel__avx512fp16_u96_acc3 +- name: xnn_f16_rmin_ukernel__avx512fp16_u128_acc2 +- name: xnn_f16_rmin_ukernel__avx512fp16_u128_acc4 + # x86 AVX512SKX - name: xnn_f16_rmin_ukernel__avx512skx_u16 - name: xnn_f16_rmin_ukernel__avx512skx_u32_acc2 diff --git a/test/f16-rminmax.cc b/test/f16-rminmax.cc index 2d544a803da..3aba99c29fe 100644 --- a/test/f16-rminmax.cc +++ b/test/f16-rminmax.cc @@ -203,6 +203,191 @@ #endif // XNN_ENABLE_ARM_FP16_VECTOR && (XNN_ARCH_ARM || XNN_ARCH_ARM64) +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMINMAX__AVX512FP16_U32, batch_eq_32) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(32) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::MinMax); + } + + TEST(F16_RMINMAX__AVX512FP16_U32, batch_div_32) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 64; batch_size < 320; batch_size += 32) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U32, batch_lt_32) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 32; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U32, batch_gt_32) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 33; batch_size < 64; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u32, ReduceMicrokernelTester::OpType::MinMax); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMINMAX__AVX512FP16_U64_ACC2, batch_eq_64) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(64) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + + TEST(F16_RMINMAX__AVX512FP16_U64_ACC2, batch_div_64) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 128; batch_size < 640; batch_size += 64) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + } + + 
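+  // The batch_lt and batch_gt sweeps below cover the 32-element remainder loop
+  // and the masked tail path in addition to the unrolled main loop.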
TEST(F16_RMINMAX__AVX512FP16_U64_ACC2, batch_lt_64) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 64; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U64_ACC2, batch_gt_64) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 65; batch_size < 128; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMINMAX__AVX512FP16_U96_ACC3, batch_eq_96) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(96) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::MinMax); + } + + TEST(F16_RMINMAX__AVX512FP16_U96_ACC3, batch_div_96) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 192; batch_size < 960; batch_size += 96) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U96_ACC3, batch_lt_96) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 96; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U96_ACC3, batch_gt_96) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 97; batch_size < 192; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3, ReduceMicrokernelTester::OpType::MinMax); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMINMAX__AVX512FP16_U128_ACC2, batch_eq_128) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(128) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + + TEST(F16_RMINMAX__AVX512FP16_U128_ACC2, batch_div_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 256; batch_size < 1280; batch_size += 128) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U128_ACC2, batch_lt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 128; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U128_ACC2, batch_gt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 129; batch_size < 256; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2, ReduceMicrokernelTester::OpType::MinMax); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + +#if XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + TEST(F16_RMINMAX__AVX512FP16_U128_ACC4, batch_eq_128) { + TEST_REQUIRES_X86_AVX512FP16; + ReduceMicrokernelTester() + .batch_size(128) + 
.Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::MinMax); + } + + TEST(F16_RMINMAX__AVX512FP16_U128_ACC4, batch_div_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 256; batch_size < 1280; batch_size += 128) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U128_ACC4, batch_lt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 1; batch_size < 128; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::MinMax); + } + } + + TEST(F16_RMINMAX__AVX512FP16_U128_ACC4, batch_gt_128) { + TEST_REQUIRES_X86_AVX512FP16; + for (size_t batch_size = 129; batch_size < 256; batch_size++) { + ReduceMicrokernelTester() + .batch_size(batch_size) + .Test(xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4, ReduceMicrokernelTester::OpType::MinMax); + } + } +#endif // XNN_ENABLE_AVX512FP16 && (XNN_ARCH_X86 || XNN_ARCH_X86_64) + + #if XNN_ARCH_X86 || XNN_ARCH_X86_64 TEST(F16_RMINMAX__AVX512SKX_U16, batch_eq_16) { TEST_REQUIRES_X86_AVX512SKX; diff --git a/test/f16-rminmax.yaml b/test/f16-rminmax.yaml index b1ae9439874..ef180bed3cf 100644 --- a/test/f16-rminmax.yaml +++ b/test/f16-rminmax.yaml @@ -10,6 +10,13 @@ - name: xnn_f16_rminmax_ukernel__neonfp16arith_u32_acc2 - name: xnn_f16_rminmax_ukernel__neonfp16arith_u32_acc4 +# x86 AVX512FP16 +- name: xnn_f16_rminmax_ukernel__avx512fp16_u32 +- name: xnn_f16_rminmax_ukernel__avx512fp16_u64_acc2 +- name: xnn_f16_rminmax_ukernel__avx512fp16_u96_acc3 +- name: xnn_f16_rminmax_ukernel__avx512fp16_u128_acc2 +- name: xnn_f16_rminmax_ukernel__avx512fp16_u128_acc4 + # x86 AVX512SKX - name: xnn_f16_rminmax_ukernel__avx512skx_u16 - name: xnn_f16_rminmax_ukernel__avx512skx_u32_acc2
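
The generated kernels above all follow the same three-step pattern: an unrolled main loop over 32-element __m512h vectors, a masked tail for the final 1-31 elements, and a bit-cast horizontal reduction down to a single half-precision value. The following is a minimal standalone sketch of that pattern for a plain min reduction; it is illustrative only and not part of this patch. The function name and interface are hypothetical (it takes an element count rather than XNNPACK's byte count and drops the params argument), and, like the generated files with their #if defined(__clang__) guard, it assumes a recent clang with AVX512FP16 support (e.g. -march=sapphirerapids).

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <immintrin.h>

// Hypothetical helper, not part of this patch: reduces `count` IEEE fp16 values
// (raw uint16_t bit patterns) to their minimum, using the same masked-tail and
// horizontal-reduction pattern as the kernels above.
static uint16_t example_f16_rmin(const uint16_t* input, size_t count) {
  assert(count != 0);

  // Seed the accumulator with the first element broadcast to all 32 lanes.
  __m512h vmin = _mm512_castsi512_ph(_mm512_set1_epi16((short) *input));

  // Main loop: 32 half-precision elements per iteration.
  for (; count >= 32; count -= 32) {
    const __m512h vt = _mm512_loadu_ph(input);
    input += 32;
    vmin = _mm512_min_ph(vmin, vt);
  }

  // Tail: build a per-lane mask, zero-mask the load, and merge-mask the min so
  // the zero-filled lanes cannot affect the result.
  if (count != 0) {
    const __mmask32 vmask = _cvtu32_mask32((uint32_t) ((UINT32_C(1) << count) - UINT32_C(1)));
    const __m512h vt = _mm512_castsi512_ph(_mm512_maskz_loadu_epi16(vmask, input));
    vmin = _mm512_mask_min_ph(vmin, vmask, vmin, vt);
  }

  // Horizontal reduction: fold 512 -> 256 -> 128 bits, then reduce within the
  // low 128 bits, using bit-casts around the float/integer shuffles.
  __m256h vmin256 = _mm256_min_ph(_mm512_castph512_ph256(vmin),
      _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(vmin), 1)));
  __m128h vmin128 = _mm_min_ph(_mm256_castph256_ph128(vmin256),
      _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(vmin256), 1)));
  vmin128 = _mm_min_ph(vmin128, _mm_castps_ph(_mm_movehl_ps(_mm_castph_ps(vmin128), _mm_castph_ps(vmin128))));
  vmin128 = _mm_min_ph(vmin128, _mm_castps_ph(_mm_movehdup_ps(_mm_castph_ps(vmin128))));
  vmin128 = _mm_min_sh(vmin128, _mm_castsi128_ph(_mm_srli_epi32(_mm_castph_si128(vmin128), 16)));

  return (uint16_t) _mm_extract_epi16(_mm_castph_si128(vmin128), 0);
}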