Skip to content
Permalink
Browse files

Adds esl_sse_hmax_epi8(); SSE now requires SSE4.1

Starting work on H4's SSV filter, which will depend on SSE4.1
instructions, so we can work with signed epi8/int8_t.  I think we can
get away with this. SSE4.1 has been around since 2007 on Intel (Penryn
processors), and since 2011 on AMD processors (Bulldozer). If we have
a problem, it will be on AMD; there are some modern AMD processors
(Bobcat) that do not have SSE4.1, but I think they should be
disappearing.

Vector configuration code (esl_sse.m4; esl_cpu.c) changed to check for
compile-time SSE4.1 support in order to define our eslENABLE_SSE flag,
and run-time support to pass our CPU dispatch check.

Added esl_sse_hmax_epi8(), horizontal 16-way int8_t max, using SSE4.1
_mm_max_epi8() instruction.
  • Loading branch information...
cryptogenomicon committed May 18, 2017
1 parent ef206c4 commit e8087b597a744a74ca61fe5420c1ad8e568b5721
Showing with 135 additions and 52 deletions.
  1. +2 −2 configure.ac
  2. +11 −8 esl_cpu.c
  3. +0 −4 esl_sqio.c
  4. +72 −7 esl_sse.c
  5. +33 −17 esl_sse.h
  6. +5 −3 esl_sse.tex
  7. +12 −11 m4/esl_sse.m4
@@ -249,8 +249,8 @@ fi
# support an instruction set. A program must also check at runtime
# that its processor supports the instruction set.
#
# Our SSE implementations require SSE and SSE2;
# AVX implementations require AVX and AVX2;
# Our SSE implementations require SSE+SSE2+SSE4.1;
# AVX implementations require AVX+AVX2;
# AVX-512 requires the F, ER, and BW subsets.
#
# If we were explicitly told to enable one ($enable_foo="yes") and we
@@ -10,6 +10,7 @@
* References:
* https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
* https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
* https://en.wikipedia.org/wiki/CPUID
*/
#include "esl_config.h"

@@ -44,13 +45,13 @@ static int cpu_has_avx512(void);
*****************************************************************/

/* Function: esl_cpu_has_sse()
* Synopsis: Check if processor supports x86 SSE/SSE2.
* Synopsis: Check if processor supports x86 SSE/SSE2/SSE4.1
* Incept: SRE, Wed Feb 1 09:19:11 2017
*
* Purpose: Returns TRUE if our code has an available SSE vector
* implementation compiled in, and the processor we're
* running on can support it (i.e. has SSE+SSE2). Else
* returns FALSE.
* running on can support it (i.e. has SSE+SSE2+SSE4.1).
* Else returns FALSE.
*
* Note: Although these use static flags, they are thread-safe.
* They can only go in one direction, from a not-set-yet
@@ -213,18 +214,20 @@ cpu_check_xcr0_zmm(void)
#ifdef eslENABLE_SSE
/* cpu_has_sse()
*
* Test whether processor supports SSE/SSE2 instructions.
* Note that Easel's "SSE" vector code means SSE+SSE2.
* Test whether processor supports SSE/SSE2/SSE4.1 instructions.
* Note that Easel's "SSE" vector code means SSE+SSE2+SSE4.1.
*/
static int
cpu_has_sse(void)
{
uint32_t abcd[4];
uint32_t sse2_mask = (1 << 25) | // SSE
(1 << 26); // SSE2
uint32_t sse2_mask = (1 << 25) | // edx: SSE
(1 << 26); // SSE2
uint32_t sse41_mask = (1 << 19); // ecx: SSE4.1

cpu_run_id( 1, 0, abcd );
if ( (abcd[3] & sse2_mask) != sse2_mask) // edx
if ( (abcd[3] & sse2_mask) != sse2_mask || // edx check
(abcd[2] & sse41_mask) != sse41_mask) // ecx check
return 0;
return 1;
}
@@ -1464,10 +1464,6 @@ convert_sq_to_msa(ESL_SQ *sq, ESL_MSA **ret_msa)
* ./benchmark -d2w /misc/data0/genomes/c.elegans/genome/allWS120
* CPU Time: 2.16u 0.31s 00:00:02.47 Elapsed: 00:00:03
*/
/* gcc -std=gnu99 -O3 -fomit-frame-pointer -malign-double -fstrict-aliasing -msse2 -pthread -I. -L. -o esl_sqio_benchmark -DeslSQIO_BENCHMARK esl_sqio.c -leasel -lm
* icc -O3 -ansi_alias -xW -static -I. -L. -o esl_sqio_benchmark -DeslSQIO_BENCHMARK esl_sqio.c -leasel -lm
* ./esl_sqio_benchmark <seqfile>
*/
#ifdef eslSQIO_BENCHMARK
#include <stdlib.h>
#include <stdio.h>
@@ -37,8 +37,7 @@
#include <math.h>
#include <float.h>

#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */
#include <x86intrin.h>

#include "easel.h"
#include "esl_sse.h"
@@ -466,11 +465,74 @@ utest_odds(ESL_GETOPTS *go, ESL_RANDOMNESS *r)
if (avgerr2 > 1e-8) esl_fatal("average error on expf() is intolerable\n");
if (maxerr2 > 1e-6) esl_fatal("maximum error on expf() is intolerable\n");
}


/* utest_hmax_epu8()
 *
 * Cross-checks esl_sse_hmax_epu8() against a scalar running maximum.
 * Each of 100 trials fills a 16-lane uint8_t vector with values drawn
 * from <rng> via esl_rnd_Roll(rng, 256), tracks the largest value seen
 * while filling, and dies with esl_fatal() if the vector routine
 * returns something different.
 */
static void
utest_hmax_epu8(ESL_RANDOMNESS *rng)
{
  union { __m128i v; uint8_t x[16]; } vec;   /* same 16 bytes viewed as a vector or as lanes */
  uint8_t expected, observed;
  int     trial, lane;

  for (trial = 0; trial < 100; trial++)
    {
      expected = 0;                          /* smallest possible uint8_t: any sample is >= this */
      for (lane = 0; lane < 16; lane++)
        {
          vec.x[lane] = (uint8_t) (esl_rnd_Roll(rng, 256));  /* intended range 0..255 */
          if (expected < vec.x[lane]) expected = vec.x[lane];
        }

      observed = esl_sse_hmax_epu8(vec.v);
      if (expected != observed) esl_fatal("hmax_epu8 utest failed");
    }
}

/* utest_hmax_epi8()
 *
 * Cross-checks esl_sse_hmax_epi8() against a scalar running maximum.
 * Each of 100 trials fills a 16-lane int8_t vector with values computed
 * as esl_rnd_Roll(rng, 256) - 128, tracks the largest value seen while
 * filling, and dies with esl_fatal() if the vector routine disagrees.
 */
static void
utest_hmax_epi8(ESL_RANDOMNESS *rng)
{
  union { __m128i v; int8_t x[16]; } vec;    /* same 16 bytes viewed as a vector or as lanes */
  int8_t expected, observed;
  int    trial, lane;

  for (trial = 0; trial < 100; trial++)
    {
      expected = -128;                       /* smallest possible int8_t: any sample is >= this */
      for (lane = 0; lane < 16; lane++)
        {
          vec.x[lane] = (int8_t) (esl_rnd_Roll(rng, 256) - 128);  /* intended range -128..127 */
          if (expected < vec.x[lane]) expected = vec.x[lane];
        }

      observed = esl_sse_hmax_epi8(vec.v);
      if (expected != observed) esl_fatal("hmax_epi8 utest failed");
    }
}


/* utest_hmax_epi16()
 *
 * Cross-checks esl_sse_hmax_epi16() against a scalar running maximum.
 * Each of 100 trials fills an 8-lane int16_t vector with values computed
 * as esl_rnd_Roll(rng, 65536) - 32768, tracks the largest value seen
 * while filling, and dies with esl_fatal() (reporting the scalar and
 * vector results, in that order) if the two disagree.
 */
static void
utest_hmax_epi16(ESL_RANDOMNESS *rng)
{
  union { __m128i v; int16_t x[8]; } vec;    /* same 16 bytes viewed as a vector or as lanes */
  int16_t expected, observed;
  int     trial, lane;

  for (trial = 0; trial < 100; trial++)
    {
      expected = -32768;                     /* smallest possible int16_t: any sample is >= this */
      for (lane = 0; lane < 8; lane++)
        {
          vec.x[lane] = (int16_t) (esl_rnd_Roll(rng, 65536) - 32768);  /* intended range -32768..32767 */
          if (expected < vec.x[lane]) expected = vec.x[lane];
        }

      observed = esl_sse_hmax_epi16(vec.v);
      if (expected != observed) esl_fatal("hmax_epi16 utest failed: %d != %d", expected, observed);
    }
}
#endif /*eslSSE_TESTDRIVE*/





/*****************************************************************
* 5. Test driver
*****************************************************************/
@@ -491,7 +553,7 @@ static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of random test points", 0 },
{ "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
{ "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
{ "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be verbose: show test report", 0 },
{ "--vv", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be very verbose: show individual test samples", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
@@ -502,14 +564,17 @@ static char banner[] = "test driver for sse module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_RANDOMNESS *r = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));;
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));;

utest_logf(go);
utest_expf(go);
utest_odds(go, r);
utest_odds(go, rng);
utest_hmax_epu8(rng);
utest_hmax_epi8(rng);
utest_hmax_epi16(rng);

esl_randomness_Destroy(r);
esl_randomness_Destroy(rng);
esl_getopts_Destroy(go);
return 0;
}
@@ -5,7 +5,7 @@
*
* Contents:
* 1. Function declarations (from esl_sse.c)
* 2. Inlined utilities for ps vectors (4 floats in __m128)
* 2. Inlined utilities for ps vectors (4 floats in __m128)
* 3. Inlined utilities for epu8 vectors (16 uchars in __m128i)
*/
#ifndef eslSSE_INCLUDED
@@ -16,8 +16,7 @@
#include "easel.h"

#include <stdio.h>
#include <xmmintrin.h> /* SSE */
#include <emmintrin.h> /* SSE2 */
#include <x86intrin.h>


/*****************************************************************
@@ -188,10 +187,7 @@ esl_sse_any_gt_epi16(__m128i a, __m128i b)


/* Function: esl_sse_hmax_epu8()
* Synopsis: Return the max of the 16 elements in epu8 vector.
*
* Purpose: Returns the maximum value of the 16 elements in
* an <epu8> vector.
* Synopsis: Return max of 16 uint8_t elements in epu8 vector.
*/
static inline uint8_t
esl_sse_hmax_epu8(__m128i a)
@@ -203,22 +199,42 @@ esl_sse_hmax_epu8(__m128i a)
return (uint8_t) _mm_extract_epi16(a, 0); /* only low-order 8 bits set; so _epi16 or _epi8 equiv; _epi8 is SSE4.1 */
}



/*****************************************************************
* 4. Inlined utilities for epi8 vectors
*****************************************************************/

/* Function:  esl_sse_hmax_epi8()
 * Synopsis:  Return max of 16 int8_t elements in epi8 vector.
 *
 * Purpose:   Returns the largest of the 16 signed 8-bit elements in
 *            vector <a>. The reduction is four rounds of "pair every
 *            element with a partner, keep the signed byte-wise max",
 *            halving the distance between partners each round, after
 *            which the answer sits in the lowest byte.
 *
 * Note:      Uses _mm_max_epi8(), which is an SSE4.1 instruction.
 */
static inline int8_t
esl_sse_hmax_epi8(__m128i a)
{
  __m128i partner;

  /* _MM_SHUFFLE() lists its selectors from the highest destination
   * position down to the lowest, so _MM_SHUFFLE(3,2,1,0) is the identity.
   */
  partner = _mm_shuffle_epi32  (a, _MM_SHUFFLE(2,3,0,1));  /* swap the two 32-bit lanes inside each 64-bit half */
  a       = _mm_max_epi8(a, partner);

  partner = _mm_shuffle_epi32  (a, _MM_SHUFFLE(0,1,2,3));  /* reverse the order of the four 32-bit lanes        */
  a       = _mm_max_epi8(a, partner);

  partner = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1));  /* swap adjacent 16-bit lanes in the low 64 bits     */
  a       = _mm_max_epi8(a, partner);

  partner = _mm_srli_epi16     (a, 8);                     /* bring the high byte of each 16-bit lane down      */
  a       = _mm_max_epi8(a, partner);

  /* Only the lowest byte of the low 32 bits is the result; the cast discards the rest. */
  return (int8_t) _mm_cvtsi128_si32(a);
}


/*****************************************************************
* 5. Inlined utilities for epi16 vectors
*****************************************************************/

/* Function: esl_sse_hmax_epi16()
* Synopsis: Return the max of the 8 elements in epi16 vector.
*
* Purpose: Returns the maximum value of the 16 elements in
* an <epu8> vector.
* Synopsis: Return max of 8 int16_t elements in epi16 vector.
*/
static inline int16_t
esl_sse_hmax_epi16(__m128i a)
{
a = _mm_max_epi16(a, _mm_srli_si128(a, 8));
a = _mm_max_epi16(a, _mm_srli_si128(a, 4));
a = _mm_max_epi16(a, _mm_srli_si128(a, 2));
return (int16_t) _mm_extract_epi16(a, 0); /* only low-order 8 bits set; so _epi16 or _epi8 equiv; _epi8 is SSE4.1 */
a = _mm_max_epi16(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(1,0,3,2)));
a = _mm_max_epi16(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1,0,3,2)));
a = _mm_max_epi16(a, _mm_srli_epi32(a, 16));
return (int16_t) _mm_cvtsi128_si32(a);
}


#endif /*HAVE_SSE2*/
#endif /*eslENABLE_SSE*/
#endif /*eslSSE_INCLUDED*/

@@ -4,9 +4,11 @@
importantly, vectorized \ccode{logf()} and \ccode{expf()} routines.

The \eslmod{sse} module is only available on platforms that support
SSE2 instructions. This includes all modern Intel and AMD processors,
but nor PowerPC processors. By default, the Easel configure script
enables SSE if it is available on the compilation machine.
SSE, SSE2, and SSE4.1 instructions. This includes all modern Intel and
AMD processors (since Intel Penryn in 2007, and AMD Bulldozer, Jaguar,
and Piledriver processors since 2011), but not PowerPC processors. By
default, the Easel configure script enables SSE if it is available on
the compilation machine.


\begin{table}[hbp]
@@ -1,20 +1,20 @@
# ESL_SSE([ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
#
# Check whether compiler supports features we need in our SSE
# implementations (including SSE2).
# implementations (including SSE2, SSE4.1).
#
# We call this "SSE" in the generic sense of the lab's vector
# implementations in Easel, HMMER, Infernal, etc; more precisely,
# we're also checking for SSE2 intrinsic support.
# implementations in Easel, HMMER, Infernal, etc; but more precisely,
# we're checking for everything up to SSE4.1 intrinsic support.
#
# Tries to compile and link a test program using the current CC,
# CFLAGS, and (optionally) any SSE_CFLAGS passed by the user. If no
# SSE_CFLAGS are provided, then we try to determine them, by trying
# nothing (i.e. the compiler deals with SSE intrinsics by default),
# then trying -msse2.
# then trying -msse4.1.
#
# Sets $esl_have_sse = yes | no
# Sets $esl_sse_cflags to any needed CFLAGS, such as -msse2
# Sets $esl_sse_cflags to any needed CFLAGS, such as -msse4.1
#
# A typical ACTION-IF-FOUND might be:
# AC_DEFINE(HAVE_SSE)
@@ -32,7 +32,7 @@ AC_DEFUN([ESL_SSE], [
if test "x$SSE_CFLAGS" != x; then
esl_sse_try_flags=$SSE_CFLAGS
else
esl_sse_try_flags="none -mavx2"
esl_sse_try_flags="none -msse4.1"
fi

save_CFLAGS=$CFLAGS
@@ -47,11 +47,12 @@ AC_DEFUN([ESL_SSE], [
#include <x86intrin.h>
#include <stdint.h>
int stub_sse(void) {
__m128i v1 = _mm_set1_epi32(42);
__m128i v2 = _mm_set1_epi32(86);
union { __m128i v; int32_t x[4]; } v3;
v3.v = _mm_add_epi32(v1, v2);
return (int) v3.x[0];
__m128i v1 = _mm_set1_epi8(-42);
__m128i v2 = _mm_set1_epi8(-86);
union { __m128i v; int8_t x[16]; } v3;
v3.v = _mm_adds_epi8(v1, v2);
v2 = _mm_max_epi8(v1, v1);
return (int) -v3.x[0];
}
int main(void) { if (stub_sse() != 128) return 1; else return 0; }
]])], [ esl_have_sse=yes; break; ], [])

0 comments on commit e8087b5

Please sign in to comment.
You can’t perform that action at this time.