Skip to content

Commit

Permalink
AMX GEMM fix variable names to have more letters.
Browse files Browse the repository at this point in the history
- MRx64c4 requires 64 letters for variable name.
- Use offsets when loading from weights and increment w by NR once.
- kremainder const.
- Increment c pointers after each store.
- Add assert for NR == 16 for QC8.  Not necessary but that is only size tested.
  QC8 outputs 16 bytes at a time, and with NR > 16 would output multiple vectors.
  Consider outputing up to 64 bytes per vector.

PiperOrigin-RevId: 621915276
  • Loading branch information
fbarchard authored and xnnpack-bot committed Apr 4, 2024
1 parent 8b30931 commit 85071b8
Show file tree
Hide file tree
Showing 18 changed files with 612 additions and 715 deletions.
40 changes: 16 additions & 24 deletions src/amalgam/gen/avx512amx.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512amx(
__attribute__((aligned(64))) int32_t res0[1 * 16];

kc = round_up_po2(kc, 4 * sizeof(int8_t));
size_t kremainder = kc & 63;
if (kremainder == 0) { // zero is invalid config
kremainder = 64;
}
const size_t kremainder = (kc & 63) ? (kc & 63) : 64;

// Define tile config data structure
struct __tile_config {
Expand Down Expand Up @@ -126,10 +123,10 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512amx(

vscaled0x0123456789ABCDEF = _mm512_mul_ps(vscaled0x0123456789ABCDEF, _mm512_set1_ps(quantization_params[0].inv_scale));

const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;

vscaled0x0123456789ABCDEF = _mm512_fmadd_ps(vscaled0x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);

Expand All @@ -139,7 +136,6 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x16c4__avx512amx(

if XNN_LIKELY(nc >= 16) {
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);

c0 = (float*) ((uintptr_t) c0 + cn_stride);

a -= kc;
Expand Down Expand Up @@ -185,10 +181,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512amx(
__attribute__((aligned(64))) int32_t res0[7 * 16];

kc = round_up_po2(kc, 4 * sizeof(int8_t));
size_t kremainder = kc & 63;
if (kremainder == 0) { // zero is invalid config
kremainder = 64;
}
const size_t kremainder = (kc & 63) ? (kc & 63) : 64;

// Define tile config data structure
struct __tile_config {
Expand Down Expand Up @@ -318,10 +311,10 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512amx(
vscaled5x0123456789ABCDEF = _mm512_mul_ps(vscaled5x0123456789ABCDEF, _mm512_set1_ps(quantization_params[5].inv_scale));
vscaled6x0123456789ABCDEF = _mm512_mul_ps(vscaled6x0123456789ABCDEF, _mm512_set1_ps(quantization_params[6].inv_scale));

const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;

vscaled0x0123456789ABCDEF = _mm512_fmadd_ps(vscaled0x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
vscaled1x0123456789ABCDEF = _mm512_fmadd_ps(vscaled1x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
Expand Down Expand Up @@ -349,19 +342,18 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_7x16c4__avx512amx(

if XNN_LIKELY(nc >= 16) {
_mm512_storeu_ps(c6 + 0, vscaled6x0123456789ABCDEF);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);

c6 = (float*) ((uintptr_t) c6 + cn_stride);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a -= kc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx_prfm(
__attribute__((aligned(64))) int32_t res0[16 * 16];

kc = round_up_po2(kc, 4 * sizeof(int8_t));
size_t kremainder = kc & 63;
if (kremainder == 0) { // zero is invalid config
kremainder = 64;
}
const size_t kremainder = (kc & 63) ? (kc & 63) : 64;

// Define tile config data structure
struct __tile_config {
Expand Down Expand Up @@ -283,10 +280,10 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx_prfm(
vscaled14x0123456789ABCDEF = _mm512_mul_ps(vscaled14x0123456789ABCDEF, _mm512_set1_ps(quantization_params[14].inv_scale));
vscaled15x0123456789ABCDEF = _mm512_mul_ps(vscaled15x0123456789ABCDEF, _mm512_set1_ps(quantization_params[15].inv_scale));

const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;

vscaled0x0123456789ABCDEF = _mm512_fmadd_ps(vscaled0x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
vscaled1x0123456789ABCDEF = _mm512_fmadd_ps(vscaled1x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
Expand Down Expand Up @@ -341,37 +338,36 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx_prfm(

if XNN_LIKELY(nc >= 16) {
_mm512_storeu_ps(c15 + 0, vscaled15x0123456789ABCDEF);
_mm512_storeu_ps(c14 + 0, vscaled14x0123456789ABCDEF);
_mm512_storeu_ps(c13 + 0, vscaled13x0123456789ABCDEF);
_mm512_storeu_ps(c12 + 0, vscaled12x0123456789ABCDEF);
_mm512_storeu_ps(c11 + 0, vscaled11x0123456789ABCDEF);
_mm512_storeu_ps(c10 + 0, vscaled10x0123456789ABCDEF);
_mm512_storeu_ps(c9 + 0, vscaled9x0123456789ABCDEF);
_mm512_storeu_ps(c8 + 0, vscaled8x0123456789ABCDEF);
_mm512_storeu_ps(c7 + 0, vscaled7x0123456789ABCDEF);
_mm512_storeu_ps(c6 + 0, vscaled6x0123456789ABCDEF);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);

c15 = (float*) ((uintptr_t) c15 + cn_stride);
_mm512_storeu_ps(c14 + 0, vscaled14x0123456789ABCDEF);
c14 = (float*) ((uintptr_t) c14 + cn_stride);
_mm512_storeu_ps(c13 + 0, vscaled13x0123456789ABCDEF);
c13 = (float*) ((uintptr_t) c13 + cn_stride);
_mm512_storeu_ps(c12 + 0, vscaled12x0123456789ABCDEF);
c12 = (float*) ((uintptr_t) c12 + cn_stride);
_mm512_storeu_ps(c11 + 0, vscaled11x0123456789ABCDEF);
c11 = (float*) ((uintptr_t) c11 + cn_stride);
_mm512_storeu_ps(c10 + 0, vscaled10x0123456789ABCDEF);
c10 = (float*) ((uintptr_t) c10 + cn_stride);
_mm512_storeu_ps(c9 + 0, vscaled9x0123456789ABCDEF);
c9 = (float*) ((uintptr_t) c9 + cn_stride);
_mm512_storeu_ps(c8 + 0, vscaled8x0123456789ABCDEF);
c8 = (float*) ((uintptr_t) c8 + cn_stride);
_mm512_storeu_ps(c7 + 0, vscaled7x0123456789ABCDEF);
c7 = (float*) ((uintptr_t) c7 + cn_stride);
_mm512_storeu_ps(c6 + 0, vscaled6x0123456789ABCDEF);
c6 = (float*) ((uintptr_t) c6 + cn_stride);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a -= kc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx(
__attribute__((aligned(64))) int32_t res0[16 * 16];

kc = round_up_po2(kc, 4 * sizeof(int8_t));
size_t kremainder = kc & 63;
if (kremainder == 0) { // zero is invalid config
kremainder = 64;
}
const size_t kremainder = (kc & 63) ? (kc & 63) : 64;

// Define tile config data structure
struct __tile_config {
Expand Down Expand Up @@ -250,10 +247,10 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx(
vscaled14x0123456789ABCDEF = _mm512_mul_ps(vscaled14x0123456789ABCDEF, _mm512_set1_ps(quantization_params[14].inv_scale));
vscaled15x0123456789ABCDEF = _mm512_mul_ps(vscaled15x0123456789ABCDEF, _mm512_set1_ps(quantization_params[15].inv_scale));

const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
w = (const int32_t*) w + 16;

vscaled0x0123456789ABCDEF = _mm512_fmadd_ps(vscaled0x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
vscaled1x0123456789ABCDEF = _mm512_fmadd_ps(vscaled1x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
Expand Down Expand Up @@ -308,37 +305,36 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x16c4__avx512amx(

if XNN_LIKELY(nc >= 16) {
_mm512_storeu_ps(c15 + 0, vscaled15x0123456789ABCDEF);
_mm512_storeu_ps(c14 + 0, vscaled14x0123456789ABCDEF);
_mm512_storeu_ps(c13 + 0, vscaled13x0123456789ABCDEF);
_mm512_storeu_ps(c12 + 0, vscaled12x0123456789ABCDEF);
_mm512_storeu_ps(c11 + 0, vscaled11x0123456789ABCDEF);
_mm512_storeu_ps(c10 + 0, vscaled10x0123456789ABCDEF);
_mm512_storeu_ps(c9 + 0, vscaled9x0123456789ABCDEF);
_mm512_storeu_ps(c8 + 0, vscaled8x0123456789ABCDEF);
_mm512_storeu_ps(c7 + 0, vscaled7x0123456789ABCDEF);
_mm512_storeu_ps(c6 + 0, vscaled6x0123456789ABCDEF);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);

c15 = (float*) ((uintptr_t) c15 + cn_stride);
_mm512_storeu_ps(c14 + 0, vscaled14x0123456789ABCDEF);
c14 = (float*) ((uintptr_t) c14 + cn_stride);
_mm512_storeu_ps(c13 + 0, vscaled13x0123456789ABCDEF);
c13 = (float*) ((uintptr_t) c13 + cn_stride);
_mm512_storeu_ps(c12 + 0, vscaled12x0123456789ABCDEF);
c12 = (float*) ((uintptr_t) c12 + cn_stride);
_mm512_storeu_ps(c11 + 0, vscaled11x0123456789ABCDEF);
c11 = (float*) ((uintptr_t) c11 + cn_stride);
_mm512_storeu_ps(c10 + 0, vscaled10x0123456789ABCDEF);
c10 = (float*) ((uintptr_t) c10 + cn_stride);
_mm512_storeu_ps(c9 + 0, vscaled9x0123456789ABCDEF);
c9 = (float*) ((uintptr_t) c9 + cn_stride);
_mm512_storeu_ps(c8 + 0, vscaled8x0123456789ABCDEF);
c8 = (float*) ((uintptr_t) c8 + cn_stride);
_mm512_storeu_ps(c7 + 0, vscaled7x0123456789ABCDEF);
c7 = (float*) ((uintptr_t) c7 + cn_stride);
_mm512_storeu_ps(c6 + 0, vscaled6x0123456789ABCDEF);
c6 = (float*) ((uintptr_t) c6 + cn_stride);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a -= kc;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x32c4__avx512amx_prfm(
__attribute__((aligned(64))) int32_t res1[16 * 16];

kc = round_up_po2(kc, 4 * sizeof(int8_t));
size_t kremainder = kc & 63;
if (kremainder == 0) { // zero is invalid config
kremainder = 64;
}
const size_t kremainder = (kc & 63) ? (kc & 63) : 64;

// Define tile config data structure
struct __tile_config {
Expand Down Expand Up @@ -371,14 +368,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x32c4__avx512amx_prfm(
vscaled15x0123456789ABCDEF = _mm512_mul_ps(vscaled15x0123456789ABCDEF, _mm512_set1_ps(quantization_params[15].inv_scale));
vscaled15xGHIJKLMNOPQRSTUV = _mm512_mul_ps(vscaled15xGHIJKLMNOPQRSTUV, _mm512_set1_ps(quantization_params[15].inv_scale));

const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vfilter_output_scaleGHIJKLMNOPQRSTUV = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vbiasGHIJKLMNOPQRSTUV = _mm512_load_ps((const float*) w);
w = (const float*) w + 16;
const __m512 vfilter_output_scale0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
const __m512 vfilter_output_scaleGHIJKLMNOPQRSTUV = _mm512_load_ps((const float*) w + 16);
w = (const int32_t*) w + 32;
const __m512 vbias0123456789ABCDEF = _mm512_load_ps((const float*) w + 0);
const __m512 vbiasGHIJKLMNOPQRSTUV = _mm512_load_ps((const float*) w + 16);
w = (const int32_t*) w + 32;

vscaled0x0123456789ABCDEF = _mm512_fmadd_ps(vscaled0x0123456789ABCDEF, vfilter_output_scale0123456789ABCDEF, vbias0123456789ABCDEF);
vscaled0xGHIJKLMNOPQRSTUV = _mm512_fmadd_ps(vscaled0xGHIJKLMNOPQRSTUV, vfilter_output_scaleGHIJKLMNOPQRSTUV, vbiasGHIJKLMNOPQRSTUV);
Expand Down Expand Up @@ -482,52 +477,51 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_16x32c4__avx512amx_prfm(
if XNN_LIKELY(nc >= 32) {
_mm512_storeu_ps(c15 + 0, vscaled15x0123456789ABCDEF);
_mm512_storeu_ps(c15 + 16, vscaled15xGHIJKLMNOPQRSTUV);
c15 = (float*) ((uintptr_t) c15 + cn_stride);
_mm512_storeu_ps(c14 + 0, vscaled14x0123456789ABCDEF);
_mm512_storeu_ps(c14 + 16, vscaled14xGHIJKLMNOPQRSTUV);
c14 = (float*) ((uintptr_t) c14 + cn_stride);
_mm512_storeu_ps(c13 + 0, vscaled13x0123456789ABCDEF);
_mm512_storeu_ps(c13 + 16, vscaled13xGHIJKLMNOPQRSTUV);
c13 = (float*) ((uintptr_t) c13 + cn_stride);
_mm512_storeu_ps(c12 + 0, vscaled12x0123456789ABCDEF);
_mm512_storeu_ps(c12 + 16, vscaled12xGHIJKLMNOPQRSTUV);
c12 = (float*) ((uintptr_t) c12 + cn_stride);
_mm512_storeu_ps(c11 + 0, vscaled11x0123456789ABCDEF);
_mm512_storeu_ps(c11 + 16, vscaled11xGHIJKLMNOPQRSTUV);
c11 = (float*) ((uintptr_t) c11 + cn_stride);
_mm512_storeu_ps(c10 + 0, vscaled10x0123456789ABCDEF);
_mm512_storeu_ps(c10 + 16, vscaled10xGHIJKLMNOPQRSTUV);
c10 = (float*) ((uintptr_t) c10 + cn_stride);
_mm512_storeu_ps(c9 + 0, vscaled9x0123456789ABCDEF);
_mm512_storeu_ps(c9 + 16, vscaled9xGHIJKLMNOPQRSTUV);
c9 = (float*) ((uintptr_t) c9 + cn_stride);
_mm512_storeu_ps(c8 + 0, vscaled8x0123456789ABCDEF);
_mm512_storeu_ps(c8 + 16, vscaled8xGHIJKLMNOPQRSTUV);
c8 = (float*) ((uintptr_t) c8 + cn_stride);
_mm512_storeu_ps(c7 + 0, vscaled7x0123456789ABCDEF);
_mm512_storeu_ps(c7 + 16, vscaled7xGHIJKLMNOPQRSTUV);
c7 = (float*) ((uintptr_t) c7 + cn_stride);
_mm512_storeu_ps(c6 + 0, vscaled6x0123456789ABCDEF);
_mm512_storeu_ps(c6 + 16, vscaled6xGHIJKLMNOPQRSTUV);
c6 = (float*) ((uintptr_t) c6 + cn_stride);
_mm512_storeu_ps(c5 + 0, vscaled5x0123456789ABCDEF);
_mm512_storeu_ps(c5 + 16, vscaled5xGHIJKLMNOPQRSTUV);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
_mm512_storeu_ps(c4 + 0, vscaled4x0123456789ABCDEF);
_mm512_storeu_ps(c4 + 16, vscaled4xGHIJKLMNOPQRSTUV);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
_mm512_storeu_ps(c3 + 0, vscaled3x0123456789ABCDEF);
_mm512_storeu_ps(c3 + 16, vscaled3xGHIJKLMNOPQRSTUV);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm512_storeu_ps(c2 + 0, vscaled2x0123456789ABCDEF);
_mm512_storeu_ps(c2 + 16, vscaled2xGHIJKLMNOPQRSTUV);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm512_storeu_ps(c1 + 0, vscaled1x0123456789ABCDEF);
_mm512_storeu_ps(c1 + 16, vscaled1xGHIJKLMNOPQRSTUV);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm512_storeu_ps(c0 + 0, vscaled0x0123456789ABCDEF);
_mm512_storeu_ps(c0 + 16, vscaled0xGHIJKLMNOPQRSTUV);

c15 = (float*) ((uintptr_t) c15 + cn_stride);
c14 = (float*) ((uintptr_t) c14 + cn_stride);
c13 = (float*) ((uintptr_t) c13 + cn_stride);
c12 = (float*) ((uintptr_t) c12 + cn_stride);
c11 = (float*) ((uintptr_t) c11 + cn_stride);
c10 = (float*) ((uintptr_t) c10 + cn_stride);
c9 = (float*) ((uintptr_t) c9 + cn_stride);
c8 = (float*) ((uintptr_t) c8 + cn_stride);
c7 = (float*) ((uintptr_t) c7 + cn_stride);
c6 = (float*) ((uintptr_t) c6 + cn_stride);
c5 = (float*) ((uintptr_t) c5 + cn_stride);
c4 = (float*) ((uintptr_t) c4 + cn_stride);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
c1 = (float*) ((uintptr_t) c1 + cn_stride);
c0 = (float*) ((uintptr_t) c0 + cn_stride);

a -= kc;
Expand Down

0 comments on commit 85071b8

Please sign in to comment.