-
Notifications
You must be signed in to change notification settings - Fork 326
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
sse41 qs8 rsum accumulating microkernels
PiperOrigin-RevId: 633200354
- Loading branch information
1 parent
3e0f89e
commit 04a11cf
Showing
36 changed files
with
2,325 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
// Auto-generated file. Do not edit! | ||
// Template: src/qs8-rsum/neondot.c.in | ||
// Generator: tools/xngen | ||
// | ||
// Copyright 2024 Google LLC | ||
// | ||
// This source code is licensed under the BSD-style license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#include <assert.h> | ||
|
||
#include <arm_neon.h> | ||
|
||
#include <xnnpack/common.h> | ||
#include <xnnpack/math.h> | ||
#include <xnnpack/reduce.h> | ||
|
||
void xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u16( | ||
size_t batch, | ||
const int8_t* input, | ||
int8_t* output, | ||
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) | ||
{ | ||
assert(batch != 0); | ||
assert(input != NULL); | ||
assert(output != NULL); | ||
|
||
int8x16_t vone = vdupq_n_s8(1); | ||
int32x4_t vacc0 = vmovq_n_s32(0); | ||
for (; batch >= 16; batch -= 16) { | ||
const int8x16_t vt0 = vld1q_s8(input); input += 16; | ||
|
||
vacc0 = vdotq_s32(vacc0, vt0, vone); | ||
} | ||
if (XNN_UNLIKELY(batch != 0)) { | ||
for (; batch >= 16; batch -= 16) { | ||
const int8x16_t vt = vld1q_s8(input); input += 16; | ||
vacc0 = vdotq_s32(vacc0, vt, vone); | ||
} | ||
if (XNN_UNLIKELY(batch != 0)) { | ||
int8x16_t vt = vld1q_s8(input); | ||
vone = vld1q_s8(¶ms->fp32_neon.mask_table[15 - batch]); | ||
vacc0 = vdotq_s32(vacc0, vt, vone); | ||
} | ||
} | ||
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0)); | ||
vacc_lo = vpadd_s32(vacc_lo, vacc_lo); | ||
|
||
const int32_t vinit_bias = params->fp32_neon.init_bias; | ||
const float vscale = params->fp32_neon.scale; | ||
const int32_t output_min = params->fp32_neon.output_min; | ||
const int32_t output_max = params->fp32_neon.output_max; | ||
const float vmagic_bias = params->fp32_neon.magic_bias; | ||
const int32_t vmagic_bias_less_output_zero_point = params->fp32_neon.magic_bias_less_output_zero_point; | ||
|
||
float vfpacc = (float) (vget_lane_s32(vacc_lo, 0) + vinit_bias) * vscale; | ||
vfpacc += vmagic_bias; | ||
int32_t vout = (int32_t) float_as_uint32(vfpacc); | ||
vout -= vmagic_bias_less_output_zero_point; | ||
vout = math_max_s32(vout, output_min); | ||
vout = math_min_s32(vout, output_max); | ||
*output += (int8_t) vout; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
// Auto-generated file. Do not edit! | ||
// Template: src/qs8-rsum/neondot.c.in | ||
// Generator: tools/xngen | ||
// | ||
// Copyright 2024 Google LLC | ||
// | ||
// This source code is licensed under the BSD-style license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
#include <assert.h> | ||
|
||
#include <arm_neon.h> | ||
|
||
#include <xnnpack/common.h> | ||
#include <xnnpack/math.h> | ||
#include <xnnpack/reduce.h> | ||
|
||
void xnn_qs8_rsum_minmax_fp32_ukernel__neondot_u32_acc2( | ||
size_t batch, | ||
const int8_t* input, | ||
int8_t* output, | ||
const union xnn_qs8_avgpool_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) | ||
{ | ||
assert(batch != 0); | ||
assert(input != NULL); | ||
assert(output != NULL); | ||
|
||
int8x16_t vone = vdupq_n_s8(1); | ||
int32x4_t vacc0 = vmovq_n_s32(0); | ||
int32x4_t vacc1 = vmovq_n_s32(0); | ||
for (; batch >= 32; batch -= 32) { | ||
const int8x16_t vt0 = vld1q_s8(input); input += 16; | ||
const int8x16_t vt1 = vld1q_s8(input); input += 16; | ||
|
||
vacc0 = vdotq_s32(vacc0, vt0, vone); | ||
vacc1 = vdotq_s32(vacc1, vt1, vone); | ||
} | ||
if (XNN_UNLIKELY(batch != 0)) { | ||
for (; batch >= 16; batch -= 16) { | ||
const int8x16_t vt = vld1q_s8(input); input += 16; | ||
vacc0 = vdotq_s32(vacc0, vt, vone); | ||
} | ||
if (XNN_UNLIKELY(batch != 0)) { | ||
int8x16_t vt = vld1q_s8(input); | ||
vone = vld1q_s8(¶ms->fp32_neon.mask_table[15 - batch]); | ||
vacc0 = vdotq_s32(vacc0, vt, vone); | ||
} | ||
} | ||
vacc0 = vaddq_s32(vacc0, vacc1); | ||
int32x2_t vacc_lo = vadd_s32(vget_low_s32(vacc0), vget_high_s32(vacc0)); | ||
vacc_lo = vpadd_s32(vacc_lo, vacc_lo); | ||
|
||
const int32_t vinit_bias = params->fp32_neon.init_bias; | ||
const float vscale = params->fp32_neon.scale; | ||
const int32_t output_min = params->fp32_neon.output_min; | ||
const int32_t output_max = params->fp32_neon.output_max; | ||
const float vmagic_bias = params->fp32_neon.magic_bias; | ||
const int32_t vmagic_bias_less_output_zero_point = params->fp32_neon.magic_bias_less_output_zero_point; | ||
|
||
float vfpacc = (float) (vget_lane_s32(vacc_lo, 0) + vinit_bias) * vscale; | ||
vfpacc += vmagic_bias; | ||
int32_t vout = (int32_t) float_as_uint32(vfpacc); | ||
vout -= vmagic_bias_less_output_zero_point; | ||
vout = math_max_s32(vout, output_min); | ||
vout = math_min_s32(vout, output_max); | ||
*output += (int8_t) vout; | ||
} |
Oops, something went wrong.