Skip to content
Permalink
Browse files

Adds detection and support for "SSE4" vs. "SSE" vector implementations.

In ef206c4, I made our "SSE" vector implementation depend on <=SSE4.1
instead of <=SSE2. I was rewriting the H4 SSV filter to use signed
8-bit ints and an esl_sse_hmax_epi8() macro that uses the SSE4.1
_mm_max_epi8() intrinsic. Presciently, I noted that if this change
were going to cause trouble, it would be on AMD platforms;
incorrectly, I proposed that since AMD processors have supported
SSE4.1 since AMD Bulldozer (2011), I didn't expect trouble. Well,
trouble: Odyssey head nodes are AMD Phenom II X4 910e processors,
circa January 2010. I hadn't noticed a problem before because I've
been using eddyfs01 as my head node, which has Intel processors.

This commit separates detection and support for "SSE" versus "SSE4"
vector implementations. SSE requires <=SSE2 (HMMER3); SSE4 requires
<=SSE4.1 (current HMMER4).

This commit will temporarily break H4, which will need to change its
ESL_SSE() autoconf macro call to ESL_SSE4(), eslENABLE_SSE ->
eslENABLE_SSE4, and suchlike.
  • Loading branch information...
cryptogenomicon committed Jun 6, 2018
1 parent fd94007 commit b3980dd3eaf9f713189e67472a9a13fdaf57fcca
Showing with 212 additions and 94 deletions.
  1. +32 −12 configure.ac
  2. +1 −0 esl_config.h.in
  3. +62 −10 esl_cpu.c
  4. +1 −0 esl_cpu.h
  5. +12 −10 esl_sse.c
  6. +9 −2 esl_sse.h
  7. +1 −1 esl_vmx.c
  8. +0 −44 m4/README.md
  9. +2 −2 m4/esl_avx512.m4
  10. +13 −13 m4/esl_sse.m4
  11. +79 −0 m4/esl_sse4.m4
@@ -59,6 +59,7 @@ m4_include([m4/ax_check_compile_flag.m4])
m4_include([m4/ax_gcc_func_attribute.m4])

m4_include([m4/esl_sse.m4])
m4_include([m4/esl_sse4.m4])
m4_include([m4/esl_avx.m4])
m4_include([m4/esl_avx512.m4])
m4_include([m4/esl_neon.m4])
@@ -168,6 +169,7 @@ AC_ARG_ENABLE(gcov, [AS_HELP_STRING([--enable-gcov], [compile for code cov
AC_ARG_ENABLE(gprof, [AS_HELP_STRING([--enable-gprof], [compile for gcc code profiling])], enable_gprof=$enableval, enable_gprof=no)

AC_ARG_ENABLE(sse, [AS_HELP_STRING([--enable-sse], [enable our SSE vector code])], enable_sse=$enableval, enable_sse=check)
AC_ARG_ENABLE(sse4, [AS_HELP_STRING([--enable-sse4], [enable our SSE4 vector code])], enable_sse4=$enableval, enable_sse4=check)
AC_ARG_ENABLE(avx, [AS_HELP_STRING([--enable-avx], [enable our AVX vector code])], enable_avx=$enableval, enable_avx=check)
AC_ARG_ENABLE(avx512, [AS_HELP_STRING([--enable-avx512], [enable our AVX-512 vector code])], enable_avx512=$enableval, enable_avx512=check)
AC_ARG_ENABLE(neon, [AS_HELP_STRING([--enable-neon], [enable our NEON vector code])] , enable_neon=$enableval, enable_neon=check)
@@ -249,13 +251,15 @@ fi

# Support for vector implementations (xref SRE:H3/28)
#
# We only worry about compile time here: do the compiler and linker
# support an instruction set. A program must also check at runtime
# that its processor supports the instruction set.
# We only worry about compile time here: i.e. do the compiler and
# linker support an instruction set. A program should also check at
# runtime that its processor supports the instruction set, using code
# in esl_cpu.
#
# Our SSE implementations require SSE+SSE2+SSE4.1;
# AVX implementations require AVX+AVX2;
# AVX-512 requires the F, ER, and BW subsets.
# Our "SSE" implementations require <=SSE2;
# "SSE4" requires <= SSE4.1;
# "AVX" requires <= AVX2;
# "AVX512" requires the F, ER, and BW subsets.
#
# If we were explicitly told to enable one ($enable_foo="yes") and we
# can't, fail with an error.
@@ -274,7 +278,7 @@ fi
#
if test "$enable_sse" = "yes" || test "$enable_sse" = "check"; then
ESL_SSE([
AC_DEFINE(eslENABLE_SSE, 1, [Set to enable the SSE vector implementation])
AC_DEFINE(eslENABLE_SSE, 1, [Set to enable SSE vector implementations])
SSE_CFLAGS=$esl_sse_cflags
AC_SUBST(SSE_CFLAGS)
enable_sse=yes
@@ -286,9 +290,23 @@ if test "$enable_sse" = "yes" || test "$enable_sse" = "check"; then
])
fi

# SSE4 vector support (<= SSE4.1). Attempted when --enable-sse4 was
# given explicitly ("yes") or left at its default ("check").
#   - On success: define eslENABLE_SSE4, export SSE4_CFLAGS, and
#     record enable_sse4=yes for the configuration summary.
#   - On failure: abort configure only if SSE4 was explicitly
#     requested; otherwise record enable_sse4=no and carry on.
if test "$enable_sse4" = "yes" || test "$enable_sse4" = "check"; then
ESL_SSE4([
AC_DEFINE(eslENABLE_SSE4, 1, [Set to enable SSE4 vector implementations])
SSE4_CFLAGS=$esl_sse4_cflags
AC_SUBST(SSE4_CFLAGS)
enable_sse4=yes
],[
if test "$enable_sse4" = "yes"; then
AC_MSG_FAILURE([Unable to compile SSE4. Try another compiler, or --disable-sse4])
fi
enable_sse4=no
])
fi

if test "$enable_avx" = "yes" || test "$enable_avx" = "check"; then
ESL_AVX([
AC_DEFINE(eslENABLE_AVX, 1, [Set to enable the AVX vector implementation])
AC_DEFINE(eslENABLE_AVX, 1, [Set to enable AVX vector implementations])
AVX_CFLAGS=$esl_avx_cflags
AC_SUBST(AVX_CFLAGS)
enable_avx=yes
@@ -302,7 +320,7 @@ fi

if test "$enable_avx512" = "yes" || test "$enable_avx512" = "check"; then
ESL_AVX512([
AC_DEFINE(eslENABLE_AVX512, 1, [Set to enable the AVX-512 vector implementation])
AC_DEFINE(eslENABLE_AVX512, 1, [Set to enable AVX-512 vector implementations])
AVX512_CFLAGS=$esl_avx512_cflags
AC_SUBST(AVX512_CFLAGS)
enable_avx512=yes
@@ -316,7 +334,7 @@ fi

if test "$enable_neon" = "yes" || test "$enable_neon" = "check"; then
ESL_NEON([
AC_DEFINE(eslENABLE_NEON, 1, [Set to enable the ARM NEON vector implementation])
AC_DEFINE(eslENABLE_NEON, 1, [Set to enable ARM NEON vector implementations])
AS_VAR_IF([esl_have_neon_aarch64],[yes],[AC_DEFINE(eslHAVE_NEON_AARCH64, 1, [Set to enable the ARM AARCH64 version of NEON])])
NEON_CFLAGS=$esl_neon_cflags
AC_SUBST(NEON_CFLAGS)
@@ -331,7 +349,7 @@ fi

if test "$enable_vmx" = "yes" || test "$enable_vmx" = "check"; then
ESL_VMX([
AC_DEFINE(eslENABLE_VMX, 1, [Set to enable the Altivec/VMX vector implementation])
AC_DEFINE(eslENABLE_VMX, 1, [Set to enable Altivec/VMX vector implementations])
VMX_CFLAGS=$esl_vmx_cflags
AC_SUBST(VMX_CFLAGS)
enable_vmx=yes
@@ -351,7 +369,7 @@ fi
# algorithms underflow to zero by design.
#
esl_save_cflags="$CFLAGS"
CFLAGS="$CFLAGS $SSE_CFLAGS"
CFLAGS="$CFLAGS $SSE_CFLAGS $SSE4_CFLAGS"

AC_MSG_CHECKING([whether flush-to-zero (FTZ) is supported])
AC_COMPILE_IFELSE(
@@ -582,13 +600,15 @@ Compiler:
CFLAGS= $CFLAGS
PTHREAD_CFLAGS= $PTHREAD_CFLAGS
SSE_CFLAGS= $SSE_CFLAGS
SSE4_CFLAGS= $SSE4_CFLAGS
AVX_CFLAGS= $AVX_CFLAGS
AVX512_CFLAGS= $AVX512_CFLAGS
NEON_CFLAGS= $NEON_CFLAGS
VMX_CFLAGS= $VMX_CFLAGS

Vector implementations enabled:
sse: $enable_sse
sse4: $enable_sse4
avx: $enable_avx
avx512: $enable_avx512
neon: $enable_neon
@@ -23,6 +23,7 @@

/* Optional parallel implementation support */
#undef eslENABLE_SSE
#undef eslENABLE_SSE4
#undef eslENABLE_AVX
#undef eslENABLE_AVX512
#undef eslENABLE_NEON
@@ -25,12 +25,15 @@
#include "esl_cpu.h"

/* declarations of static functions that come in section (2) */
#if defined(eslENABLE_SSE) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
static void cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd);
#endif
#ifdef eslENABLE_SSE
static int cpu_has_sse(void);
#endif
#ifdef eslENABLE_SSE4
static int cpu_has_sse4(void);
#endif
#ifdef eslENABLE_AVX
static int cpu_check_xcr0_ymm(void);
static int cpu_has_avx(void);
@@ -45,12 +48,12 @@ static int cpu_has_avx512(void);
*****************************************************************/

/* Function: esl_cpu_has_sse()
* Synopsis: Check if processor supports x86 SSE/SSE2/SSE4.1
* Synopsis: Check if processor supports x86 SSE/SSE2
* Incept: SRE, Wed Feb 1 09:19:11 2017
*
* Purpose: Returns TRUE if our code has an available SSE vector
* implementation compiled in, and the processor we're
* running on can support it (i.e. has SSE+SSE2+SSE4.1).
* running on can support it (i.e. has SSE+SSE2).
* Else returns FALSE.
*
* Note: Although these use static flags, they are thread-safe.
@@ -73,6 +76,29 @@ esl_cpu_has_sse(void)
}


/* Function:  esl_cpu_has_sse4()
 * Synopsis:  Check if processor supports x86 <= SSE4.1
 * Incept:    SRE, Wed Jun 6 11:49:46 2018 [OdjBox, Otto Croy]
 *
 * Purpose:   Report whether the "SSE4" vector implementation can be
 *            used: it must have been compiled in (<eslENABLE_SSE4>
 *            defined) and the processor we are running on must
 *            support it (SSE+SSE2+SSE4.1).
 *
 *            The processor is probed on the first call only; the
 *            answer is remembered in a function-local static and
 *            returned directly on later calls.
 *
 * Returns:   TRUE if SSE4 code is available and supported;
 *            FALSE otherwise, including whenever <eslENABLE_SSE4>
 *            was not defined at compile time.
 */
int
esl_cpu_has_sse4(void)
{
#ifdef eslENABLE_SSE4
  static int cached_answer = -1;        // negative: not probed yet

  if (cached_answer >= 0) return cached_answer;

  cached_answer = cpu_has_sse4();
  return cached_answer;
#else
  return 0;
#endif
}



/* Function: esl_cpu_has_avx()
* Synopsis: Check if processor supports x86 AVX/AVX2.
@@ -137,6 +163,9 @@ esl_cpu_Get(void)
#ifdef eslENABLE_AVX
if (esl_cpu_has_avx()) return "AVX";
#endif
#ifdef eslENABLE_SSE4
if (esl_cpu_has_sse4()) return "SSE4";
#endif
#ifdef eslENABLE_SSE
if (esl_cpu_has_sse()) return "SSE";
#endif
@@ -156,7 +185,7 @@ esl_cpu_Get(void)
* 2. Internal code used in x86 vector code checks
*****************************************************************/

#if defined(eslENABLE_SSE) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4) || defined(eslENABLE_AVX) || defined(eslENABLE_AVX512)
/* cpu_run_id()
*
* Bit flags in EAX (and maybe ECX) registers specify the information
@@ -185,7 +214,7 @@ cpu_run_id(uint32_t eax, uint32_t ecx, uint32_t *abcd)
abcd[0] = eax; abcd[1] = ebx; abcd[2] = ecx; abcd[3] = edx;
#endif // ! _MSC_VER
}
#endif // eslENABLE_SSE | eslENABLE_AVX | eslENABLE_AVX512
#endif // eslENABLE_SSE | eslENABLE_SSE4 | eslENABLE_AVX | eslENABLE_AVX512



@@ -245,11 +274,32 @@ cpu_check_xcr0_zmm(void)
#ifdef eslENABLE_SSE
/* cpu_has_sse()
*
* Test whether processor supports SSE/SSE2/SSE4.1 instructions.
* Note that Easel's "SSE" vector code means SSE+SSE2+SSE4.1.
* Test whether processor supports SSE/SSE2 instructions.
* Note that Easel's "SSE" vector code means SSE+SSE2.
*/
/* Returns 1 if the processor reports both SSE and SSE2, else 0.
 * Calls cpu_run_id() with eax=1, ecx=0 and inspects the fourth
 * result word (edx).
 */
static int
cpu_has_sse(void)
{
  const uint32_t required_edx = (1u << 25) |   // edx: SSE
                                (1u << 26);    //      SSE2
  uint32_t       regs[4];

  cpu_run_id(1, 0, regs);

  // Both feature bits must be set, not just one of them.
  return ((regs[3] & required_edx) == required_edx) ? 1 : 0;
}
#endif // eslENABLE_SSE


#ifdef eslENABLE_SSE4
/* cpu_has_sse4()
*
* Test whether processor supports SSE/SSE2/SSE4.1 instructions.
* Note that Easel's "SSE4" vector code means SSE+SSE2+SSE4.1.
*/
static int
cpu_has_sse4(void)
{
uint32_t abcd[4];
uint32_t sse2_mask = (1 << 25) | // edx: SSE
@@ -262,7 +312,7 @@ cpu_has_sse(void)
return 0;
return 1;
}
#endif // eslENABLE_SSE
#endif // eslENABLE_SSE4



@@ -363,8 +413,9 @@ utest_consistency(void)
{
char msg[] = "utest_consistency() failed";

if (esl_cpu_has_avx512() && ! esl_cpu_has_avx()) esl_fatal(msg);
if (esl_cpu_has_avx() && ! esl_cpu_has_sse()) esl_fatal(msg);
if (esl_cpu_has_avx512() && ! esl_cpu_has_avx()) esl_fatal(msg);
if (esl_cpu_has_avx() && ! esl_cpu_has_sse4()) esl_fatal(msg);
if (esl_cpu_has_sse4() && ! esl_cpu_has_sse()) esl_fatal(msg);
}

#endif // eslCPU_TESTDRIVE
@@ -402,6 +453,7 @@ int
main(int argc, char **argv)
{
printf("your cpu supports our SSE code : %s\n", esl_cpu_has_sse() ? "yes" : "no");
printf(" ...our SSE4 code : %s\n", esl_cpu_has_sse4() ? "yes" : "no");
printf(" ...our AVX code : %s\n", esl_cpu_has_avx() ? "yes" : "no");
printf(" ...our AVX512 code : %s\n", esl_cpu_has_avx512() ? "yes" : "no");
printf("Our dispatchers will choose : %s\n", esl_cpu_Get());
@@ -2,6 +2,7 @@
#define eslCPU_INCLUDED

extern int esl_cpu_has_sse(void);
extern int esl_cpu_has_sse4(void);
extern int esl_cpu_has_avx(void);
extern int esl_cpu_has_avx512(void);
extern char *esl_cpu_Get(void);
@@ -10,12 +10,13 @@
*
*****************************************************************
*
* This code is conditionally compiled, only when <eslENABLE_SSE> was
* set in <esl_config.h> by the configure script, and that will only
* happen on x86 platforms. When <eslENABLE_SSE> is not set, we
* include some dummy code to silence compiler and ranlib warnings
* about empty translation units and no symbols, and dummy drivers
* that do nothing but declare success.
* This code is conditionally compiled, only when <eslENABLE_SSE> or
* <eslENABLE_SSE4> was set in <esl_config.h> by the configure script,
* and that will only happen on x86 platforms. When neither
* <eslENABLE_SSE> nor <eslENABLE_SSE4> are set, we include some dummy
* code to silence compiler and ranlib warnings about empty
* translation units and no symbols, and dummy drivers that do nothing
* but declare success.
*
*****************************************************************
* Credits:
@@ -29,7 +30,7 @@
* information is appended at the end of the file.
*/
#include "esl_config.h"
#ifdef eslENABLE_SSE
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4)

#include <stdlib.h>
#include <stdio.h>
@@ -489,6 +490,7 @@ utest_hmax_epu8(ESL_RANDOMNESS *rng)
static void
utest_hmax_epi8(ESL_RANDOMNESS *rng)
{
#ifdef eslENABLE_SSE4 // no-op if eslENABLE_SSE only
union { __m128i v; int8_t x[16]; } u;
int8_t r1, r2;
int i,z;
@@ -504,9 +506,9 @@ utest_hmax_epi8(ESL_RANDOMNESS *rng)
r2 = esl_sse_hmax_epi8(u.v);
if (r1 != r2) esl_fatal("hmax_epi8 utest failed");
}
#endif // eslENABLE_SSE4
}


static void
utest_hmax_epi16(ESL_RANDOMNESS *rng)
{
@@ -625,7 +627,7 @@ main(int argc, char **argv)



#else // ! eslENABLE_SSE
#else // ! (eslENABLE_SSE || eslENABLE_SSE4)

/* If we don't have SSE compiled in, provide some nothingness to:
* a. prevent Mac OS/X ranlib from bitching about .o file that "has no symbols"
@@ -636,7 +638,7 @@ void esl_sse_silence_hack(void) { return; }
#if defined eslSSE_TESTDRIVE || eslSSE_EXAMPLE || eslSSE_BENCHMARK
int main(void) { return 0; }
#endif
#endif // eslENABLE_SSE or not
#endif // (eslENABLE_SSE || eslENABLE_SSE4) or not



0 comments on commit b3980dd

Please sign in to comment.
You can’t perform that action at this time.