New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
xxhash: support SVE by intrinsic code #752
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3004,7 +3004,9 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src | |
#endif | ||
|
||
#if defined(__GNUC__) || defined(__clang__) | ||
# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ | ||
# if defined(__ARM_FEATURE_SVE) | ||
# include <arm_sve.h> | ||
# elif defined(__ARM_NEON__) || defined(__ARM_NEON) \ | ||
|| defined(__aarch64__) || defined(_M_ARM) \ | ||
|| defined(_M_ARM64) || defined(_M_ARM64EC) | ||
# define inline __inline__ /* circumvent a clang bug */ | ||
|
@@ -3131,6 +3133,7 @@ enum XXH_VECTOR_TYPE /* fake enum */ { | |
XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ | ||
XXH_NEON = 4, /*!< NEON for most ARMv7-A and all AArch64 */ | ||
XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ | ||
XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ | ||
}; | ||
/*! | ||
* @ingroup tuning | ||
|
@@ -3152,10 +3155,13 @@ enum XXH_VECTOR_TYPE /* fake enum */ { | |
# define XXH_AVX512 3 | ||
# define XXH_NEON 4 | ||
# define XXH_VSX 5 | ||
# define XXH_SVE 6 | ||
#endif | ||
|
||
#ifndef XXH_VECTOR /* can be defined on command line */ | ||
# if ( \ | ||
# if defined(__ARM_FEATURE_SVE) | ||
# define XXH_VECTOR XXH_SVE | ||
# elif ( \ | ||
defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ | ||
|| defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ | ||
) && ( \ | ||
|
@@ -3178,6 +3184,17 @@ enum XXH_VECTOR_TYPE /* fake enum */ { | |
# endif | ||
#endif | ||
|
||
/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ | ||
#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) | ||
# ifdef _MSC_VER | ||
# pragma warning(once : 4606) | ||
# else | ||
# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." | ||
# endif | ||
# undef XXH_VECTOR | ||
# define XXH_VECTOR XXH_SCALAR | ||
#endif | ||
|
||
/* | ||
* Controls the alignment of the accumulator, | ||
* for compatibility with aligned vector loads, which are usually faster. | ||
|
@@ -3197,12 +3214,16 @@ enum XXH_VECTOR_TYPE /* fake enum */ { | |
# define XXH_ACC_ALIGN 16 | ||
# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ | ||
# define XXH_ACC_ALIGN 64 | ||
# elif XXH_VECTOR == XXH_SVE /* sve */ | ||
# define XXH_ACC_ALIGN 64 | ||
There was a problem hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Quick question: this variant seems to directly target a 512-bit width. My understanding of SVE is that the vector width is implementation-defined — presumably it can vary across CPUs (TODO: confirm the exact range the reviewer meant). Will this vector extension work with all of them, or only specifically on the 512-bit one? — There was a problem hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, you're right. SVE ranges from 128-bit to 2048-bit. Should I create a macro to calculate it for SVE? — There was a problem hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've configured CI to run code from SVE128 to SVE2048. All of them could work with this XXH_ACC_ALIGN.
||
# endif | ||
#endif | ||
|
||
#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ | ||
|| XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 | ||
# define XXH_SEC_ALIGN XXH_ACC_ALIGN | ||
#elif XXH_VECTOR == XXH_SVE | ||
# define XXH_SEC_ALIGN XXH_ACC_ALIGN | ||
#else | ||
# define XXH_SEC_ALIGN 8 | ||
#endif | ||
|
@@ -3524,6 +3545,20 @@ XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) | |
# endif /* XXH_vec_mulo, XXH_vec_mule */ | ||
#endif /* XXH_VECTOR == XXH_VSX */ | ||
|
||
#if XXH_VECTOR == XXH_SVE
/*
 * One round of the XXH3 accumulator update, vectorized with SVE.
 *
 * Expects the call site to have in scope:
 *   mask    - svbool_t predicate selecting the active 64-bit lanes
 *   kSwap   - svuint64_t index vector swapping each pair of adjacent lanes
 *   xinput  - const uint64_t* cursor into the input stream
 *   xsecret - const uint64_t* cursor into the secret
 *
 * Updates acc in place:
 *   acc += swap(input) + lo32(input ^ secret) * hi32(input ^ secret)
 */
#define ACCRND(acc, offset)                                              \
do {                                                                     \
    svuint64_t data_vec = svld1_u64(mask, xinput + offset);              \
    svuint64_t key_vec  = svld1_u64(mask, xsecret + offset);             \
    /* lane-swapped copy of the input, folded back in by the madd */     \
    svuint64_t swap_vec = svtbl_u64(data_vec, kSwap);                    \
    svuint64_t data_key = sveor_u64_x(mask, key_vec, data_vec);          \
    /* 32x32 -> 64 multiply of the low and high halves of data_key */    \
    svuint64_t dk_lo = svextw_u64_x(mask, data_key);                     \
    svuint64_t dk_hi = svlsr_n_u64_x(mask, data_key, 32);                \
    svuint64_t prod  = svmad_u64_x(mask, dk_lo, dk_hi, swap_vec);        \
    acc = svadd_u64_x(mask, acc, prod);                                  \
} while (0)
#endif /* XXH_VECTOR == XXH_SVE */
|
||
|
||
/* prefetch | ||
* can be disabled, by declaring XXH_NO_PREFETCH build macro */ | ||
|
@@ -4651,6 +4686,50 @@ XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) | |
|
||
#endif | ||
|
||
#if (XXH_VECTOR == XXH_SVE) | ||
|
||
XXH_FORCE_INLINE void | ||
XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, | ||
const void* XXH_RESTRICT input, | ||
const void* XXH_RESTRICT secret) | ||
{ | ||
uint64_t *xacc = (uint64_t *)acc; | ||
const uint64_t *xinput = (const uint64_t *)(const void *)input; | ||
const uint64_t *xsecret = (const uint64_t *)(const void *)secret; | ||
svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); | ||
uint64_t element_count = svcntd(); | ||
if (element_count >= 8) { | ||
svbool_t mask = svptrue_pat_b64(SV_VL8); | ||
svuint64_t vacc = svld1_u64(mask, xacc); | ||
ACCRND(vacc, 0); | ||
svst1_u64(mask, xacc, vacc); | ||
} else if (element_count == 2) { /* sve128 */ | ||
svbool_t mask = svptrue_pat_b64(SV_VL2); | ||
svuint64_t acc0 = svld1_u64(mask, xacc + 0); | ||
svuint64_t acc1 = svld1_u64(mask, xacc + 2); | ||
svuint64_t acc2 = svld1_u64(mask, xacc + 4); | ||
svuint64_t acc3 = svld1_u64(mask, xacc + 6); | ||
ACCRND(acc0, 0); | ||
ACCRND(acc1, 2); | ||
ACCRND(acc2, 4); | ||
ACCRND(acc3, 6); | ||
svst1_u64(mask, xacc + 0, acc0); | ||
svst1_u64(mask, xacc + 2, acc1); | ||
svst1_u64(mask, xacc + 4, acc2); | ||
svst1_u64(mask, xacc + 6, acc3); | ||
} else { | ||
svbool_t mask = svptrue_pat_b64(SV_VL4); | ||
svuint64_t acc0 = svld1_u64(mask, xacc + 0); | ||
svuint64_t acc1 = svld1_u64(mask, xacc + 4); | ||
ACCRND(acc0, 0); | ||
ACCRND(acc1, 4); | ||
svst1_u64(mask, xacc + 0, acc0); | ||
svst1_u64(mask, xacc + 4, acc1); | ||
} | ||
} | ||
|
||
#endif | ||
|
||
/* scalar variants - universal */ | ||
|
||
/*! | ||
|
@@ -4844,6 +4923,11 @@ typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); | |
#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx | ||
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar | ||
|
||
#elif (XXH_VECTOR == XXH_SVE) | ||
#define XXH3_accumulate_512 XXH3_accumulate_512_sve | ||
#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar | ||
#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar | ||
|
||
#else /* scalar */ | ||
|
||
#define XXH3_accumulate_512 XXH3_accumulate_512_scalar | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note: maybe minor, but surely worth mentioning.
This transparent fallback method will make compilation succeed when SVE is selected but actually not present on the compilation target. But it also means that a developer may think that the library is compiled with SVE enabled, because it's instructed to, but actually it's not and uses SCALAR instead — and the developer would be none the wiser. I'm wondering if it would be preferable to let compilation fail instead.
Intermediate position: maybe a warning (though unfortunately, I believe `#warning` is not part of C90 syntax, and `#pragma warning` depends on compiler support. But then, so does `__ARM_FEATURE_SVE`).
).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. I'll update it.