xxhash: support SVE by intrinsic code #752

Merged · 2 commits · Nov 9, 2022

7 changes: 6 additions & 1 deletion .github/workflows/ci.yml
@@ -378,11 +378,16 @@ jobs:
CPPFLAGS="-DXXH_VECTOR=XXH_SCALAR" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_NEON" CFLAGS="-O3 -march=armv7-a -fPIC -mfloat-abi=softfp -mfpu=neon-vfpv4" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check

- name: ARM64 (XXH_VECTOR=[ scalar, NEON ])
- name: ARM64 (XXH_VECTOR=[ scalar, NEON, SVE ])
if: ${{ matrix.name == 'ARM64' }}
run: |
CPPFLAGS="-DXXH_VECTOR=XXH_SCALAR" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_NEON" LDFLAGS="-static" CC=$XCC RUN_ENV=$XEMU make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_SVE" LDFLAGS="-static" CC=$XCC RUN_ENV="$XEMU -cpu max,sve128=on,sve256=off,sve512=off,sve1024=off,sve2048=off" make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_SVE" LDFLAGS="-static" CC=$XCC RUN_ENV="$XEMU -cpu max,sve128=on,sve256=on,sve512=off,sve1024=off,sve2048=off" make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_SVE" LDFLAGS="-static" CC=$XCC RUN_ENV="$XEMU -cpu max,sve128=on,sve256=on,sve512=on,sve1024=off,sve2048=off" make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_SVE" LDFLAGS="-static" CC=$XCC RUN_ENV="$XEMU -cpu max,sve128=on,sve256=on,sve512=on,sve1024=on,sve2048=off" make clean check
CPPFLAGS="-DXXH_VECTOR=XXH_SVE" LDFLAGS="-static" CC=$XCC RUN_ENV="$XEMU -cpu max,sve128=on,sve256=on,sve512=on,sve1024=on,sve2048=on" make clean check

- name: ${{ matrix.name }} (XXH_VECTOR=[ scalar, VSX ])
if: ${{ startsWith(matrix.name, 'PPC64') }}
88 changes: 86 additions & 2 deletions xxhash.h
@@ -3004,7 +3004,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src
#endif

#if defined(__GNUC__) || defined(__clang__)
# if defined(__ARM_NEON__) || defined(__ARM_NEON) \
# if defined(__ARM_FEATURE_SVE)
# include <arm_sve.h>
# elif defined(__ARM_NEON__) || defined(__ARM_NEON) \
|| defined(__aarch64__) || defined(_M_ARM) \
|| defined(_M_ARM64) || defined(_M_ARM64EC)
# define inline __inline__ /* circumvent a clang bug */
@@ -3131,6 +3133,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */
XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */
XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */
XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */
};
/*!
* @ingroup tuning
@@ -3152,10 +3155,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_AVX512 3
# define XXH_NEON 4
# define XXH_VSX 5
# define XXH_SVE 6
#endif

#ifndef XXH_VECTOR /* can be defined on command line */
# if ( \
# if defined(__ARM_FEATURE_SVE)
# define XXH_VECTOR XXH_SVE
# elif ( \
defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \
|| defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \
) && ( \
@@ -3178,6 +3184,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# endif
#endif

/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */
#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE)
Owner

Note: maybe minor, but surely worth mentioning.

This transparent fallback method will make compilation succeed when SVE is selected but not actually present on the compilation target.

But it also means that a developer may believe the library is compiled with SVE enabled, because it was instructed to be, when it actually isn't and uses SCALAR instead, and the developer would be none the wiser.

I'm wondering if it would be preferable to let compilation fail instead.

An intermediate position would be a warning (though unfortunately, I believe #warning is not part of C90 syntax, and #pragma warning depends on compiler support. But then, so does __ARM_FEATURE_SVE).
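For illustration only (not part of this patch), a consumer that insists on the SVE path could turn the silent fallback into a hard failure on its own side, by checking XXH_VECTOR after pulling in the implementation. This sketch assumes the single-header XXH_INLINE_ALL mode, so the XXH_VECTOR / XXH_SVE macros are visible to the includer:

/* hypothetical consumer-side guard, sketched for illustration only */
#define XXH_INLINE_ALL
#include "xxhash.h"

#if XXH_VECTOR != XXH_SVE
#  error "expected the SVE code path, but xxhash fell back to another one"
#endif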

Contributor Author

OK. I'll update it.

# ifdef _MSC_VER
# pragma warning(once : 4606)
# else
# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead."
# endif
# undef XXH_VECTOR
# define XXH_VECTOR XXH_SCALAR
#endif

/*
* Controls the alignment of the accumulator,
* for compatibility with aligned vector loads, which are usually faster.
@@ -3197,12 +3214,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ {
# define XXH_ACC_ALIGN 16
# elif XXH_VECTOR == XXH_AVX512 /* avx512 */
# define XXH_ACC_ALIGN 64
# elif XXH_VECTOR == XXH_SVE /* sve */
# define XXH_ACC_ALIGN 64
Owner

Quick question:

This variant seems to directly target a 512-bit width.

My understanding of SVE is that it's more flexible with regard to vector width than Intel's AVX.
That being said, the qemu documentation implies that there are multiple hardware implementations of SVE to expect, from 128 to 512 bits.

Will this vector extension work with all of them, or only specifically with the 512-bit one?

Contributor Author

Yes, you're right. SVE ranges from 128-bit to 2048-bit. Should I create a macro to calculate it for SVE?

Contributor Author

I've configured CI to run code from SVE128 to SVE2048. All of them could work with this XXH_ACC_ALIGN.
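
As a side note, the run-time vector width can be queried with svcntd() from <arm_sve.h>, which returns the number of 64-bit lanes per vector; the new XXH3_accumulate_512_sve below branches on exactly this value (2 lanes for SVE128, 4 for SVE256, 8 or more from SVE512 upward). A minimal, self-contained sketch (not part of the patch):

#include <arm_sve.h>   /* compile with e.g. -march=armv8-a+sve */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* svcntd() counts the 64-bit elements per SVE vector:
     * 2 on a 128-bit implementation, 8 on a 512-bit one, etc. */
    uint64_t lanes = svcntd();
    printf("SVE vector length: %llu bits\n", (unsigned long long)(lanes * 64));
    return 0;
}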

# endif
#endif

#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \
|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512
# define XXH_SEC_ALIGN XXH_ACC_ALIGN
#elif XXH_VECTOR == XXH_SVE
# define XXH_SEC_ALIGN XXH_ACC_ALIGN
#else
# define XXH_SEC_ALIGN 8
#endif
@@ -3524,6 +3545,20 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b)
# endif /* XXH_vec_mulo, XXH_vec_mule */
#endif /* XXH_VECTOR == XXH_VSX */

#if XXH_VECTOR == XXH_SVE
#define ACCRND(acc, offset) \
do { \
svuint64_t input_vec = svld1_u64(mask, xinput + offset); \
svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \
svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \
svuint64_t swapped = svtbl_u64(input_vec, kSwap); \
svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \
svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \
svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \
acc = svadd_u64_x(mask, acc, mul); \
} while (0)
#endif /* XXH_VECTOR == XXH_SVE */


/* prefetch
* can be disabled, by declaring XXH_NO_PREFETCH build macro */
@@ -4651,6 +4686,50 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)

#endif

#if (XXH_VECTOR == XXH_SVE)

XXH_FORCE_INLINE void
XXH3_accumulate_512_sve( void* XXH_RESTRICT acc,
const void* XXH_RESTRICT input,
const void* XXH_RESTRICT secret)
{
uint64_t *xacc = (uint64_t *)acc;
const uint64_t *xinput = (const uint64_t *)(const void *)input;
const uint64_t *xsecret = (const uint64_t *)(const void *)secret;
svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1);
uint64_t element_count = svcntd();
if (element_count >= 8) {
svbool_t mask = svptrue_pat_b64(SV_VL8);
svuint64_t vacc = svld1_u64(mask, xacc);
ACCRND(vacc, 0);
svst1_u64(mask, xacc, vacc);
} else if (element_count == 2) { /* sve128 */
svbool_t mask = svptrue_pat_b64(SV_VL2);
svuint64_t acc0 = svld1_u64(mask, xacc + 0);
svuint64_t acc1 = svld1_u64(mask, xacc + 2);
svuint64_t acc2 = svld1_u64(mask, xacc + 4);
svuint64_t acc3 = svld1_u64(mask, xacc + 6);
ACCRND(acc0, 0);
ACCRND(acc1, 2);
ACCRND(acc2, 4);
ACCRND(acc3, 6);
svst1_u64(mask, xacc + 0, acc0);
svst1_u64(mask, xacc + 2, acc1);
svst1_u64(mask, xacc + 4, acc2);
svst1_u64(mask, xacc + 6, acc3);
} else {
svbool_t mask = svptrue_pat_b64(SV_VL4);
svuint64_t acc0 = svld1_u64(mask, xacc + 0);
svuint64_t acc1 = svld1_u64(mask, xacc + 4);
ACCRND(acc0, 0);
ACCRND(acc1, 4);
svst1_u64(mask, xacc + 0, acc0);
svst1_u64(mask, xacc + 4, acc1);
}
}

#endif

/* scalar variants - universal */

/*!
@@ -4844,6 +4923,11 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64);
#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#elif (XXH_VECTOR == XXH_SVE)
#define XXH3_accumulate_512 XXH3_accumulate_512_sve
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar

#else /* scalar */

#define XXH3_accumulate_512 XXH3_accumulate_512_scalar