Skip to content

Commit

Permalink
scalar qs8 rsum accumulating microkernels
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 631504843
  • Loading branch information
alankelly authored and xnnpack-bot committed May 16, 2024
1 parent 7fabcac commit cda8726
Show file tree
Hide file tree
Showing 19 changed files with 934 additions and 9 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2991,6 +2991,12 @@ IF(XNNPACK_BUILD_TESTS)
TARGET_LINK_LIBRARIES(qs8-requantization-test PRIVATE hardware-config logging microkernels-all)
ADD_TEST(NAME qs8-requantization-test COMMAND qs8-requantization-test)

# Unit test for the QS8 RSUM minmax FP32 microkernels. The test source is
# produced by scripts/generate-tests.sh as test/qs8-rsum-minmax-fp32.cc, so the
# path below must match that output name exactly.
ADD_EXECUTABLE(qs8-rsum-minmax-fp32-test test/qs8-rsum-minmax-fp32.cc)
TARGET_INCLUDE_DIRECTORIES(qs8-rsum-minmax-fp32-test PRIVATE include src test)
TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE fp16 pthreadpool GTest::gtest GTest::gtest_main microparams-init)
TARGET_LINK_LIBRARIES(qs8-rsum-minmax-fp32-test PRIVATE hardware-config logging microkernels-all)
ADD_TEST(NAME qs8-rsum-minmax-fp32-test COMMAND qs8-rsum-minmax-fp32-test)

ADD_EXECUTABLE(qs8-vadd-minmax-test test/qs8-vadd-minmax.cc)
SET_TARGET_PROPERTIES(qs8-vadd-minmax-test PROPERTIES CXX_EXTENSIONS YES)
TARGET_INCLUDE_DIRECTORIES(qs8-vadd-minmax-test PRIVATE include src test)
Expand Down
3 changes: 3 additions & 0 deletions cmake/gen/scalar_microkernels.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,9 @@ SET(ALL_SCALAR_MICROKERNEL_SRCS
src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c
src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c
src/qs8-requantization/qs8-requantization-rndnu-scalar.c
src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c
src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c
src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c
src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u1.c
src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c
src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u4.c
Expand Down
3 changes: 3 additions & 0 deletions gen/scalar_microkernels.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,9 @@ ALL_SCALAR_MICROKERNEL_SRCS = [
"src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned32.c",
"src/qs8-requantization/qs8-requantization-rndna-scalar-unsigned64.c",
"src/qs8-requantization/qs8-requantization-rndnu-scalar.c",
"src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c",
"src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c",
"src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u1.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u2.c",
"src/qs8-vadd/gen/qs8-vadd-minmax-scalar-u4.c",
Expand Down
10 changes: 10 additions & 0 deletions scripts/generate-qs8-rsum.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/sh
# Copyright 2024 Google LLC
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Regenerates the scalar QS8 RSUM microkernel sources from the xngen template
# src/qs8-rsum/scalar.c.in. All paths are relative, so run this from the
# directory that contains tools/ and src/. Each generator invocation is started
# as a background job so the variants are produced in parallel.

#################################### Scalar ###################################
tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=1 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c &
tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=2 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c &
tools/xngen src/qs8-rsum/scalar.c.in -D CHANNEL_TILE=4 -D ACCUMULATORS=1 -D REQUANTIZATION=FP32 -D VARIANT=IMAGIC -D WASM=0 -o src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c &

# Block until every background generator job has finished, so the generated
# files are complete by the time this script returns to its caller.
wait
1 change: 1 addition & 0 deletions scripts/generate-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,7 @@ tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-r
tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-rmin.yaml --output test/f32-rmin.cc &
tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/f32-rminmax.yaml --output test/f32-rminmax.cc &

tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/qs8-rsum-minmax-fp32.yaml --output test/qs8-rsum-minmax-fp32.cc &
tools/generate-reduce-test.py --tester RSumMicrokernelTester --spec test/f32-rsum.yaml --output test/f32-rsum.cc &

tools/generate-reduce-test.py --tester ReduceMicrokernelTester --spec test/u8-rmax.yaml --output test/u8-rmax.cc &
Expand Down
47 changes: 47 additions & 0 deletions src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u1-acc1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-rsum/scalar.c.in
// Generator: tools/xngen
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/reduce.h>


// Accumulating QS8 reduce-sum, one element per iteration: adds every int8_t
// element of `input` to the initial bias, requantizes the total with the
// fp32_scalar_imagic parameters, and adds the requantized value into *output.
void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1(
    size_t batch,
    const int8_t* restrict input,
    int8_t* restrict output,
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(input != NULL);
  assert(output != NULL);

  // Requantization constants.
  const float vmultiplier = params->fp32_scalar_imagic.scale;
  const float vmagic = params->fp32_scalar_imagic.magic_bias;
  const int32_t vclamp_lo = params->fp32_scalar_imagic.magic_min;
  const int32_t vclamp_hi = params->fp32_scalar_imagic.magic_max;
  const int32_t voffset = params->fp32_scalar_imagic.magic_bias_less_zero_point;

  // Running 32-bit sum, seeded with the initial bias.
  int32_t vsum = params->fp32_scalar_imagic.init_bias;
  do {
    vsum += (int32_t) *input++;
    batch -= sizeof(int8_t);
  } while (batch != 0);

  // Scale first, then add the magic bias as a separate statement.
  float vscaled = (float) vsum * vmultiplier;
  vscaled += vmagic;

  // Convert through float_as_uint32, clamp to [magic_min, magic_max], and
  // subtract magic_bias_less_zero_point.
  const int32_t vbits = (int32_t) float_as_uint32(vscaled);
  const int32_t vclamped = math_min_s32(math_max_s32(vbits, vclamp_lo), vclamp_hi);

  // The kernel accumulates into the existing output value.
  *output += (int8_t) (vclamped - voffset);
}
55 changes: 55 additions & 0 deletions src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u2-acc1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-rsum/scalar.c.in
// Generator: tools/xngen
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/reduce.h>


void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2(
size_t batch,
const int8_t* restrict input,
int8_t* restrict output,
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(batch != 0);
assert(input != NULL);
assert(output != NULL);

const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t vacc0 = vinit_bias;
for (; batch >= 2; batch -= 2) {
const int32_t vt0 = (int32_t) input[0];
const int32_t vt1 = (int32_t) input[1];
input += 2;

vacc0 += vt0;
vacc0 += vt1;
}

if XNN_UNLIKELY(batch != 0) {
const int32_t vt = (int32_t) *input;
vacc0 += vt;
}

const float vscale = params->fp32_scalar_imagic.scale;
const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;

float vfpacc = (float) vacc0 * vscale;
vfpacc += vmagic_bias;
int32_t vout = (int32_t) float_as_uint32(vfpacc);
vout = math_max_s32(vout, vmagic_min);
vout = math_min_s32(vout, vmagic_max);
vout -= vmagic_bias_less_zero_point;

*output += (int8_t) vout;
}
62 changes: 62 additions & 0 deletions src/qs8-rsum/gen/qs8-rdsum-minmax-fp32-scalar-imagic-u4-acc1.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
// Auto-generated file. Do not edit!
// Template: src/qs8-rsum/scalar.c.in
// Generator: tools/xngen
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/reduce.h>


void xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4(
size_t batch,
const int8_t* restrict input,
int8_t* restrict output,
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
assert(batch != 0);
assert(input != NULL);
assert(output != NULL);

const int32_t vinit_bias = params->fp32_scalar_imagic.init_bias;
int32_t vacc0 = vinit_bias;
for (; batch >= 4; batch -= 4) {
const int32_t vt0 = (int32_t) input[0];
const int32_t vt1 = (int32_t) input[1];
const int32_t vt2 = (int32_t) input[2];
const int32_t vt3 = (int32_t) input[3];
input += 4;

vacc0 += vt0;
vacc0 += vt1;
vacc0 += vt2;
vacc0 += vt3;
}

if XNN_UNLIKELY(batch != 0) {
do {
const int32_t vt = (int32_t) *input++;
vacc0 += vt;
batch -= sizeof(int8_t);
} while (batch != 0);
}

const float vscale = params->fp32_scalar_imagic.scale;
const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;

float vfpacc = (float) vacc0 * vscale;
vfpacc += vmagic_bias;
int32_t vout = (int32_t) float_as_uint32(vfpacc);
vout = math_max_s32(vout, vmagic_min);
vout = math_min_s32(vout, vmagic_max);
vout -= vmagic_bias_less_zero_point;

*output += (int8_t) vout;
}
99 changes: 99 additions & 0 deletions src/qs8-rsum/scalar.c.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

$assert CHANNEL_TILE >= 1
$assert ACCUMULATORS >= 1
$assert VARIANT in ("FMAGIC", "IMAGIC", "LRINTF")
#include <assert.h>

#include <xnnpack/common.h>
#include <xnnpack/math.h>
#include <xnnpack/reduce.h>


$PARAMS_STRUCT = "fp32_scalar_" + VARIANT.lower()
$MIN_F32 = "__builtin_wasm_min_f32" if WASM else "math_min_f32"
$MAX_F32 = "__builtin_wasm_max_f32" if WASM else "math_max_f32"
void xnn_qs8_rsum_minmax_${REQUANTIZATION.lower()}_ukernel__scalar_${VARIANT.lower()}_u${CHANNEL_TILE}(
    size_t batch,
    const int8_t* restrict input,
    int8_t* restrict output,
    const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
{
  assert(batch != 0);
  assert(input != NULL);
  assert(output != NULL);

  const int32_t vinit_bias = params->${PARAMS_STRUCT}.init_bias;
  int32_t vacc0 = vinit_bias;
  $if ACCUMULATORS > 1:
    // The remaining accumulators start from zero so that the initial bias is
    // counted exactly once after the partial sums are added together.
    $for A in range(1, ACCUMULATORS):
      int32_t vacc${A} = 0;
  $if CHANNEL_TILE == 1:
    do {
      const int32_t vt = (int32_t) *input++;
      vacc0 += vt;
      batch -= sizeof(int8_t);
    } while (batch != 0);
  $else:
    for (; batch >= ${CHANNEL_TILE}; batch -= ${CHANNEL_TILE}) {
      $for N in range(CHANNEL_TILE):
        const int32_t vt${N} = (int32_t) input[${N}];
      input += ${CHANNEL_TILE};

      $for N in range(CHANNEL_TILE):
        vacc${N % ACCUMULATORS} += vt${N};
    }
    $if ACCUMULATORS > 1:
      // Fold the partial sums pairwise into vacc0.
      $ACC_SLICE = 1
      $while ACC_SLICE < ACCUMULATORS:
        $for A in range(0, ACCUMULATORS, ACC_SLICE * 2):
          $if A + ACC_SLICE < ACCUMULATORS:
            vacc${A} += vacc${A + ACC_SLICE};
        $ACC_SLICE *= 2

    if XNN_UNLIKELY(batch != 0) {
      $if CHANNEL_TILE == 2:
        const int32_t vt = (int32_t) *input;
        vacc0 += vt;
      $else:
        do {
          const int32_t vt = (int32_t) *input++;
          vacc0 += vt;
          batch -= sizeof(int8_t);
        } while (batch != 0);
    }

  const float vscale = params->${PARAMS_STRUCT}.scale;
  $if VARIANT == "FMAGIC":
    const float voutput_min_less_zero_point = params->fp32_scalar_fmagic.output_min_less_zero_point;
    const float voutput_max_less_zero_point = params->fp32_scalar_fmagic.output_max_less_zero_point;
    const float vmagic_bias = params->fp32_scalar_fmagic.magic_bias;
    const int32_t vmagic_bias_less_output_zero_point = params->fp32_scalar_fmagic.magic_bias_less_output_zero_point;
  $elif VARIANT == "IMAGIC":
    const float vmagic_bias = params->fp32_scalar_imagic.magic_bias;
    const int32_t vmagic_min = params->fp32_scalar_imagic.magic_min;
    const int32_t vmagic_max = params->fp32_scalar_imagic.magic_max;
    const int32_t vmagic_bias_less_zero_point = params->fp32_scalar_imagic.magic_bias_less_zero_point;
  $elif VARIANT == "LRINTF":
    const float voutput_min_less_zero_point = params->fp32_scalar_lrintf.output_min_less_zero_point;
    const float voutput_max_less_zero_point = params->fp32_scalar_lrintf.output_max_less_zero_point;
    const int32_t voutput_zero_point = params->fp32_scalar_lrintf.output_zero_point;

  float vfpacc = (float) vacc0 * vscale;
  $if VARIANT == "FMAGIC":
    // Clamp in the float domain before adding the magic bias.
    vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point);
    vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point);
    vfpacc += vmagic_bias;
    int32_t vout = (int32_t) float_as_uint32(vfpacc) - vmagic_bias_less_output_zero_point;
  $elif VARIANT == "IMAGIC":
    vfpacc += vmagic_bias;
    int32_t vout = (int32_t) float_as_uint32(vfpacc);
    vout = math_max_s32(vout, vmagic_min);
    vout = math_min_s32(vout, vmagic_max);
    vout -= vmagic_bias_less_zero_point;
  $elif VARIANT == "LRINTF":
    // Clamp in the float domain, round with lrintf, then add the zero point.
    vfpacc = ${MAX_F32}(vfpacc, voutput_min_less_zero_point);
    vfpacc = ${MIN_F32}(vfpacc, voutput_max_less_zero_point);
    const int32_t vrndacc = (int32_t) lrintf(vfpacc);
    int32_t vout = vrndacc + voutput_zero_point;

  *output += (int8_t) vout;
}
6 changes: 6 additions & 0 deletions src/xnnpack/microfnptr.h
Original file line number Diff line number Diff line change
Expand Up @@ -1597,6 +1597,12 @@ typedef void (*xnn_f32_rsum_ukernel_fn)(
float* output,
const union xnn_f32_scale_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]);

// RSUM: Reduce-SUM over int8_t (QS8) data.
//
// Function pointer type for the QS8 RSUM microkernels. Arguments, in order:
// the batch size, the int8_t input to reduce, the int8_t output location, and
// the requantization parameters (the QS8 avgpool minmax parameter union).
// NOTE(review): the scalar implementations consume `batch` one int8_t at a time
// and add their result into *output rather than overwriting it - confirm that
// every implementation behind this pointer follows the same contract.
typedef void (*xnn_qs8_rsum_ukernel_fn)(
size_t batch,
const int8_t* input,
int8_t* output,
const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]);

// RMAX: Reduce-MAX

typedef void (*xnn_rmax_ukernel_fn)(
Expand Down
11 changes: 11 additions & 0 deletions src/xnnpack/reduce.h
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,17 @@ DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u12_acc3)
DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u16_acc2)
DECLARE_F32_RSUM_UKERNEL_FUNCTION(xnn_f32_rsum_ukernel__wasmsimd_u16_acc4)

// Declares the prototype of a QS8 RSUM microkernel named fn_name, with internal
// (XNN_INTERNAL) visibility. The parameter list mirrors the
// xnn_qs8_rsum_ukernel_fn function pointer type: batch, input, output, params.
#define DECLARE_QS8_RSUM_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t batch, \
const int8_t* input, \
int8_t* output, \
const union xnn_qs8_avgpool_minmax_params params[XNN_RESTRICT XNN_MIN_ELEMENTS(1)]);

// Scalar FP32-requantization "imagic" kernels; the uN suffix is the number of
// elements processed per main-loop iteration (1, 2, or 4).
DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u1)
DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u2)
DECLARE_QS8_RSUM_UKERNEL_FUNCTION(xnn_qs8_rsum_minmax_fp32_ukernel__scalar_imagic_u4)

#define DECLARE_F32_RDSUM_UKERNEL_FUNCTION(fn_name) \
XNN_INTERNAL void fn_name( \
size_t rows, \
Expand Down
9 changes: 9 additions & 0 deletions test/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -1084,6 +1084,15 @@ xnnpack_unit_test(
deps = MICROKERNEL_TEST_DEPS,
)

# Unit test for the QS8 RSUM minmax FP32 microkernels. It is built from
# qs8-rsum-minmax-fp32.cc (an output of scripts/generate-tests.sh) together with
# the rsum-microkernel-tester.h header.
xnnpack_unit_test(
name = "qs8_rsum_minmax_fp32_test",
srcs = [
"qs8-rsum-minmax-fp32.cc",
"rsum-microkernel-tester.h",
],
deps = MICROKERNEL_TEST_DEPS,
)

xnnpack_unit_test(
name = "f16_f32acc_rdsum_test",
srcs = [
Expand Down

0 comments on commit cda8726

Please sign in to comment.