scalar qs8 rsum accumulating microkernels

PiperOrigin-RevId: 631504843
google · May 16, 2024 · cda8726 · cda8726
1 parent 7fabcac
commit cda8726
Show file tree

Hide file tree

Showing 19 changed files with 934 additions and 9 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -2991,6 +2991,12 @@ IF(XNNPACK_BUILD_TESTS)
   TARGET_LINK_LIBRARIES(qs8-requantization-test PRIVATE hardware-config logging microkernels-all)
   ADD_TEST(NAME qs8-requantization-test COMMAND qs8-requantization-test)
 
+  ADD_EXECUTABLE(qs8-rsum-minmax-fp32-test test/qs8-rsum-minmax-fp32.cc)  # source is written by scripts/generate-tests.sh
+  TARGET_INCLUDE_DIRECTORIES(qs8-rsum-minmax-fp32-test PRIVATE include src test)
+  TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main microparams-init)
+  TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE hardware-config logging microkernels-all)
+  ADD_TEST(NAME qs8-rsum-minmax-fp32-test COMMAND qs8-rsum-minmax-fp32-test)
+
   ADD_EXECUTABLE(qs8-vadd-minmax-test test/qs8-vadd-minmax.cc)
   SET_TARGET_PROPERTIES(qs8-vadd-minmax-test PROPERTIES CXX_EXTENSIONS YES)
   TARGET_INCLUDE_DIRECTORIES(qs8-vadd-minmax-test PRIVATE include src test)

diff --git a/cmake/gen/scalar_microkernels.cmake b/cmake/gen/scalar_microkernels.cmake
@@ -827,6 +827,9 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
   src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c
   src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c
   src/qs8-requantization/qs8-requantization-rndnu-scalar.c
+  src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c
+  src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c
+  src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c
   src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u1.c
   src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c
   src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u4.c

diff --git a/gen/scalar_microkernels.bzl b/gen/scalar_microkernels.bzl
@@ -823,6 +823,9 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
     "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c",
     "src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c",
     "src/qs8-requantization/qs8-requantization-rndnu-scalar.c",
+    "src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c",
+    "src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c",
+    "src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c",
     "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u1.c",
     "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c",
     "src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u4.c",

diff --git a/scripts/generate-qs8-rsum.sh b/scripts/generate-qs8-rsum.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+# Copyright 2024 Google LLC
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#################################### Scalar ###################################
+tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=1 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c &
+tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=2 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c &
+tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=4 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c &
+
+# Every xngen job above runs in the background; block until all of them have
+wait
diff --git a/scripts/generate-tests.sh b/scripts/generate-tests.sh
@@ -241,6 +241,7 @@ tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-r
 tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-rmin.yaml --output test/f32-rmin.cc &
 tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-rminmax.yaml --output test/f32-rminmax.cc &
 
+tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/qs8-rsum-minmax-fp32.yaml --output test/qs8-rsum-minmax-fp32.cc &
 tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/f32-rsum.yaml --output test/f32-rsum.cc &
 
 tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/u8-rmax.yaml --output test/u8-rmax.cc &

diff --git a/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c
@@ -0,0 +1,47 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-rsum/scalar.c.in
+//   Generator: tools/xngen
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/reduce.h>
+
+
+void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1(  // Sums `batch` int8 inputs one at a time, requantizes the sum, and adds it into *output.
+    size_t batch,
+    const int8_t* restrict input,
+    int8_t* restrict output,
+    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);  // The do/while below always executes once, so an empty input is not supported.
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+  int32_t vacc0 = vinit_bias;  // 32-bit accumulator, seeded with init_bias.
+  do {
+    const int32_t vt = (int32_t) *input++;
+    vacc0 += vt;
+    batch -= sizeof(int8_t);  // One input element consumed per iteration.
+  } while (batch != 0);
+
+  const float vscale = params->fp32_scalar_imagic.scale;
+  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+
+  float vfpacc = (float) vacc0 * vscale;
+  vfpacc += vmagic_bias;
+  int32_t vout = (int32_t) float_as_uint32(vfpacc);  // Integer arithmetic from here on; float_as_uint32 is defined elsewhere (presumably a bit-cast -- confirm).
+  vout = math_max_s32(vout, vmagic_min);  // Lower bound, then upper bound on the next line.
+  vout = math_min_s32(vout, vmagic_max);
+  vout -= vmagic_bias_less_zero_point;
+
+  *output += (int8_t) vout;  // Read-modify-write: the existing value at *output is added to, not overwritten.
+}
diff --git a/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c
@@ -0,0 +1,55 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-rsum/scalar.c.in
+//   Generator: tools/xngen
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/reduce.h>
+
+
+void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2(  // Sums `batch` int8 inputs two at a time, requantizes the sum, and adds it into *output.
+    size_t batch,
+    const int8_t* restrict input,
+    int8_t* restrict output,
+    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+  int32_t vacc0 = vinit_bias;  // 32-bit accumulator, seeded with init_bias.
+  for (; batch >= 2; batch -= 2) {  // Main loop: two inputs per iteration.
+    const int32_t vt0 = (int32_t) input[0];
+    const int32_t vt1 = (int32_t) input[1];
+    input += 2;
+
+    vacc0 += vt0;
+    vacc0 += vt1;
+  }
+
+  if XNN_UNLIKELY(batch != 0) {  // The loop exits with batch < 2, so at most one input is left.
+    const int32_t vt = (int32_t) *input;
+    vacc0 += vt;
+  }
+
+  const float vscale = params->fp32_scalar_imagic.scale;
+  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+
+  float vfpacc = (float) vacc0 * vscale;
+  vfpacc += vmagic_bias;
+  int32_t vout = (int32_t) float_as_uint32(vfpacc);  // Integer arithmetic from here on; float_as_uint32 is defined elsewhere (presumably a bit-cast -- confirm).
+  vout = math_max_s32(vout, vmagic_min);  // Lower bound, then upper bound on the next line.
+  vout = math_min_s32(vout, vmagic_max);
+  vout -= vmagic_bias_less_zero_point;
+
+  *output += (int8_t) vout;  // Read-modify-write: the existing value at *output is added to, not overwritten.
+}
diff --git a/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c b/src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c
@@ -0,0 +1,62 @@
+// Auto-generated file. Do not edit!
+//   Template: src/qs8-rsum/scalar.c.in
+//   Generator: tools/xngen
+//
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/reduce.h>
+
+
+void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4(  // Sums `batch` int8 inputs four at a time, requantizes the sum, and adds it into *output.
+    size_t batch,
+    const int8_t* restrict input,
+    int8_t* restrict output,
+    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
+  int32_t vacc0 = vinit_bias;  // 32-bit accumulator, seeded with init_bias.
+  for (; batch >= 4; batch -= 4) {  // Main loop: four inputs per iteration.
+    const int32_t vt0 = (int32_t) input[0];
+    const int32_t vt1 = (int32_t) input[1];
+    const int32_t vt2 = (int32_t) input[2];
+    const int32_t vt3 = (int32_t) input[3];
+    input += 4;
+
+    vacc0 += vt0;
+    vacc0 += vt1;
+    vacc0 += vt2;
+    vacc0 += vt3;
+  }
+
+  if XNN_UNLIKELY(batch != 0) {  // The loop exits with batch < 4, so one to three inputs are left.
+    do {
+      const int32_t vt = (int32_t) *input++;
+      vacc0 += vt;
+      batch -= sizeof(int8_t);
+    } while (batch != 0);
+  }
+
+  const float vscale = params->fp32_scalar_imagic.scale;
+  const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+  const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+  const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+  const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+
+  float vfpacc = (float) vacc0 * vscale;
+  vfpacc += vmagic_bias;
+  int32_t vout = (int32_t) float_as_uint32(vfpacc);  // Integer arithmetic from here on; float_as_uint32 is defined elsewhere (presumably a bit-cast -- confirm).
+  vout = math_max_s32(vout, vmagic_min);  // Lower bound, then upper bound on the next line.
+  vout = math_min_s32(vout, vmagic_max);
+  vout -= vmagic_bias_less_zero_point;
+
+  *output += (int8_t) vout;  // Read-modify-write: the existing value at *output is added to, not overwritten.
+}
diff --git a/src/qs8-rsum/scalar.c.in b/src/qs8-rsum/scalar.c.in
@@ -0,0 +1,99 @@
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree.
+
+$assert CHANNEL_TILE >= 1
+$assert VARIANT in ("FMAGIC", "IMAGIC", "LRINTF")
+#include <assert.h>
+
+#include <xnnpack/common.h>
+#include <xnnpack/math.h>
+#include <xnnpack/reduce.h>
+
+
+$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower()
+$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
+$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
+void xnn_qs8_rsum_minmax_${REQUANTIZATION.lower()}_ukernel__scalar_${VARIANT.lower()}_u${CHANNEL_TILE}(
+    size_t batch,
+    const int8_t* restrict input,
+    int8_t* restrict output,
+    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
+{
+  assert(batch != 0);
+  assert(input != NULL);
+  assert(output != NULL);
+
+  const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias;
+  $for A in range(ACCUMULATORS):
+    int32_t vacc${A} = vinit_bias;  // 32-bit accumulator, seeded with init_bias.
+  $if CHANNEL_TILE == 1:
+    do {
+      const int32_t vt = (int32_t) *input++;
+      vacc0 += vt;
+      batch -= sizeof(int8_t);
+    } while (batch != 0);
+  $else:
+    for (; batch >= ${CHANNEL_TILE}; batch -= ${CHANNEL_TILE}) {
+      $for N in range(CHANNEL_TILE):
+        const int32_t vt${N} = (int32_t) input[${N}];
+      input += ${CHANNEL_TILE};
+
+      $for N in range(CHANNEL_TILE):
+        vacc${N % ACCUMULATORS} += vt${N};
+    }
+    $if ACCUMULATORS > 1:
+      $ACC_SLICE = 1
+      $while ACC_SLICE < ACCUMULATORS:
+        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
+          $if A + ACC_SLICE < ACCUMULATORS:
+            vacc${A} += vacc${A + ACC_SLICE};  /* Fold the partial sums pairwise; the total ends up in vacc0. */
+        $ACC_SLICE *= 2
+
+    if XNN_UNLIKELY(batch != 0) {  // Leftover inputs after the unrolled loop.
+      $if CHANNEL_TILE == 2:
+        const int32_t vt = (int32_t) *input;
+        vacc0 += vt;
+      $else:
+        do {
+          const int32_t vt = (int32_t) *input++;
+          vacc0 += vt;
+          batch -= sizeof(int8_t);
+        } while (batch != 0);
+    }
+
+  const float vscale = params->${PARAMS_STRUCT}.scale;
+  $if VARIANT == "FMAGIC":
+    const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
+    const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
+    const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
+    const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
+  $elif VARIANT == "IMAGIC":
+    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
+    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
+    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
+    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
+  $elif VARIANT == "LRINTF":
+    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
+    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
+    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;
+
+  float vfpacc = (float) vacc0 * vscale;
+  $if VARIANT == "FMAGIC":
+    vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point);
+    vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point);
+    vfpacc += vmagic_bias;
+    int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
+  $elif VARIANT == "IMAGIC":
+    vfpacc += vmagic_bias;
+    int32_t vout = (int32_t) float_as_uint32(vfpacc);
+    vout = math_max_s32(vout, vmagic_min);
+    vout = math_min_s32(vout, vmagic_max);
+    vout -= vmagic_bias_less_zero_point;
+  $elif VARIANT == "LRINTF":
+    vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point);
+    vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point);
+    const int32_t vrndacc = (int32_t) lrintf(vfpacc);
+    int32_t vout = vrndacc + voutput_zero_point;
+
+  *output += (int8_t) vout;  // Read-modify-write: the existing value at *output is added to, not overwritten.
+}
diff --git a/src/xnnpack/microfnptr.h b/src/xnnpack/microfnptr.h
@@ -1597,6 +1597,12 @@ typedef void (*xnn_f32_rsum_ukernel_fn)(
     float* output,
     const union xnn_f32_scale_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]);
 
+typedef void (*xnn_qs8_rsum_ukernel_fn)(  // Function-pointer type for QS8 RSUM microkernels.
+    size_t batch,
+    const int8_t* input,
+    int8_t* output,
+    const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]);
+
 // RMAX: Reduce-MAX
 
 typedef void (*xnn_rmax_ukernel_fn)(

diff --git a/src/xnnpack/reduce.h b/src/xnnpack/reduce.h
@@ -329,6 +329,17 @@ DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u12_acc3)
 DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u16_acc2)
 DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u16_acc4)
 
+#define DECLARE_QS8_RSUM_UKERNEL_FUNCTION(fn_name) \
+  XNN_INTERNAL void fn_name(                       \
+      size_t batch,                                \
+      const int8_t* input,                         \
+      int8_t* output,                              \
+      const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]);
+
+DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1)  // Scalar IMAGIC kernels; the uN suffix is the CHANNEL_TILE passed to xngen.
+DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2)
+DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4)
+
 #define DECLARE_F32_RDSUM_UKERNEL_FUNCTION(fn_name) \
   XNN_INTERNAL void fn_name(                        \
       size_t rows,                                  \

diff --git a/test/BUILD.bazel b/test/BUILD.bazel
@@ -1084,6 +1084,15 @@ xnnpack_unit_test(
     deps = MICROKERNEL_TEST_DEPS,
 )
 
+xnnpack_unit_test(
+    name = "qs8_rsum_minmax_fp32_test",
+    srcs = [
+        "qs8-rsum-minmax-fp32.cc",  # written by scripts/generate-tests.sh from test/qs8-rsum-minmax-fp32.yaml
+        "rsum-microkernel-tester.h",
+    ],
+    deps = MICROKERNEL_TEST_DEPS,
+)
+
 xnnpack_unit_test(
     name = "f16_f32acc_rdsum_test",
     srcs = [