Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add WAsmSIMD rdsum accumulating microkernels
PiperOrigin-RevId: 627686402
- Loading branch information
1 parent
7189ed9
commit aecf0c5
Showing
17 changed files
with
3,397 additions
and
6 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
// Copyright 2024 Google LLC | ||
// | ||
// This source code is licensed under the BSD-style license found in the | ||
// LICENSE file in the root directory of this source tree. | ||
|
||
$ABC = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" | ||
#include <assert.h> | ||
|
||
#include <immintrin.h> | ||
|
||
#include <xnnpack/common.h> | ||
#include <xnnpack/reduce.h> | ||
#include <xnnpack/math.h> | ||
|
||
|
||
$UNROLL = CHANNELS >> 3 | ||
void xnn_f32_rdsum_ukernel_${ACCUMULATORS}p${ACCUMULATORS}x__avx_c${CHANNELS}( | ||
size_t rows, | ||
size_t channels, | ||
const float* input, | ||
size_t input_stride, | ||
const float* zero, | ||
float* output, | ||
const union xnn_f32_scale_params params[restrict XNN_MIN_ELEMENTS(1)]) | ||
{ | ||
assert(rows != 0); | ||
assert(channels != 0); | ||
assert(input != NULL); | ||
assert(output != NULL); | ||
|
||
const __m256 vscale = _mm256_set1_ps(params->avx.scale); | ||
|
||
size_t input_increment = ${ACCUMULATORS} * input_stride; | ||
for (; channels >= ${CHANNELS}; channels -= ${CHANNELS}) { | ||
const float* i0 = input; | ||
$for i in range(1, ACCUMULATORS): | ||
const float* i${i} = (const float*) ((uintptr_t) input + ${i} * input_stride); | ||
|
||
$for i in range(UNROLL): | ||
__m256 vacc${i} = _mm256_setzero_ps(); | ||
|
||
for (int r = rows; r > 0; r -= ${ACCUMULATORS}) { | ||
$for N in range(1, ACCUMULATORS, 2): | ||
if XNN_UNPREDICTABLE(r < ${N+1}) { | ||
i${N} = zero; | ||
} | ||
if XNN_UNPREDICTABLE(r <= ${N+1}) { | ||
i${N+1} = zero; | ||
} | ||
$for c in range(UNROLL): | ||
__m256 vin${c}; | ||
$for j in range(ACCUMULATORS): | ||
$for c in range(UNROLL): | ||
vin${c} = _mm256_loadu_ps(&i${j}[${c*8}]); | ||
$for c in range(UNROLL): | ||
vacc${c} = _mm256_add_ps(vin${c}, vacc${c}); | ||
$for N in range(0, ACCUMULATORS): | ||
i${N} = (const float*) ((uintptr_t) i${N} + input_increment); | ||
} | ||
$for i in range(UNROLL): | ||
vacc${i} = _mm256_mul_ps(vacc${i}, vscale); | ||
|
||
const float* o = output; | ||
$for i in range(0, UNROLL): | ||
__m256 vo${i} = _mm256_loadu_ps(o); o += 8; | ||
$for i in range(0, UNROLL): | ||
vacc${i} = _mm256_add_ps(vo${i}, vacc${i}); | ||
$for i in range(0, UNROLL): | ||
_mm256_storeu_ps(output, vacc${i}); output += 8; | ||
|
||
input = (const float*) ((uintptr_t) input + ${CHANNELS} * sizeof(float)); | ||
} | ||
__m256i vmask; | ||
if (channels != 0) { | ||
input_increment = ${ACCUMULATORS} * input_stride; | ||
const float* i0 = input; | ||
$for i in range(1, ACCUMULATORS): | ||
const float* i${i} = (const float*) ((uintptr_t) input + ${i} * input_stride); | ||
__m256 vacc[${UNROLL}]; | ||
$for i in range(UNROLL): | ||
vacc[${i}] = _mm256_setzero_ps(); | ||
|
||
const size_t num_full_chunks = channels >> 3; | ||
const size_t num_chunks = round_up_po2(channels, 8) >> 3; | ||
const size_t remainder = channels & 0x7; | ||
for (int r = rows; r > 0; r -= ${ACCUMULATORS}) { | ||
$for N in range(1, ACCUMULATORS, 2): | ||
if XNN_UNPREDICTABLE(r < ${N+1}) { | ||
i${N} = zero; | ||
} | ||
if XNN_UNPREDICTABLE(r <= ${N+1}) { | ||
i${N+1} = zero; | ||
} | ||
for (int i = 0; i < num_full_chunks; ++i) { | ||
$for c in range(ACCUMULATORS): | ||
vacc[i] = _mm256_add_ps(_mm256_loadu_ps(&i${c}[i*8]), vacc[i]); | ||
} | ||
|
||
if (remainder) { | ||
vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) ¶ms->avx.mask_table[7] - (channels & 0x7) * sizeof(float))); | ||
$for c in range(ACCUMULATORS): | ||
vacc[num_full_chunks] = _mm256_add_ps(_mm256_maskload_ps(&i${c}[num_full_chunks*8], vmask), vacc[num_full_chunks]); | ||
} | ||
$for N in range(ACCUMULATORS): | ||
i${N} = (const float*) ((uintptr_t) i${N} + input_increment); | ||
} | ||
for (size_t i = 0; i < num_chunks; ++i) { | ||
vacc[i] = _mm256_mul_ps(vacc[i], vscale); | ||
} | ||
|
||
__m256 vo[${UNROLL}]; | ||
const float* o = output; | ||
for (int i = 0; i < channels >> 3; ++i) { | ||
vo[i] = _mm256_loadu_ps(o); o += 8; | ||
} | ||
for (int i = 0; i < channels >> 3; ++i) { | ||
vacc[i] = _mm256_add_ps(vo[i], vacc[i]); | ||
} | ||
for (int i = 0; i < channels >> 3; ++i) { | ||
_mm256_storeu_ps(output, vacc[i]); output += 8; | ||
} | ||
if (remainder) { | ||
const size_t pos = num_full_chunks; | ||
__m256 vout = vacc[pos]; | ||
const __m256 vdata = _mm256_maskload_ps(output, vmask); | ||
vout = _mm256_add_ps(vout, vdata); | ||
__m128 vout_lo = _mm256_castps256_ps128(vout); | ||
if (channels & 4) { | ||
_mm_storeu_ps(output, vout_lo); | ||
vout_lo = _mm256_extractf128_ps(vout, 1); | ||
output += 4; | ||
} | ||
if (channels & 2) { | ||
_mm_storel_pi((__m64*) output, vout_lo); | ||
vout_lo = _mm_movehl_ps(vout_lo, vout_lo); | ||
output += 2; | ||
} | ||
if (channels & 1) { | ||
_mm_store_ss(output, vout_lo); | ||
} | ||
} | ||
} | ||
} |
Oops, something went wrong.