/* Vectorized routines for x86 Streaming SIMD Extensions (SSE).
*
* This header file, unusually, provides many complete function
* implementations so they can be inlined by the compiler.
*
* esl_sse supports both our SSE implementations and our SSE4
* implementations. In a plain "SSE" build (i.e. SSE/SSE2, as used
* in HMMER3), the SSE4.1-dependent code is ifdef'ed out.
*
* Contents:
* 1. Function declarations for esl_sse.c
* 2. Inlined functions: horizontal max, min, sum
* 3. Inlined functions: left, right shift
* 4. Inlined functions: any_gt
* 5. Inlined functions: select
*/
#ifndef eslSSE_INCLUDED
#define eslSSE_INCLUDED
#include "esl_config.h"
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4)
#include "easel.h"
#include <stdio.h>
#include <x86intrin.h>
/*****************************************************************
* 1. Function declarations (from esl_sse.c)
*****************************************************************/
extern __m128 esl_sse_logf(__m128 x);
extern __m128 esl_sse_expf(__m128 x);
extern void esl_sse_dump_ps(FILE *fp, __m128 v);
/*****************************************************************
* 2. Inlined functions: horizontal max, min, sum
*****************************************************************/
/* Function: esl_sse_hmax_epu8()
* Synopsis: Return max of 16 uint8_t elements in epu8 vector.
*/
static inline uint8_t
esl_sse_hmax_epu8(__m128i a)
{
a = _mm_max_epu8(a, _mm_srli_si128(a, 8));
a = _mm_max_epu8(a, _mm_srli_si128(a, 4));
a = _mm_max_epu8(a, _mm_srli_si128(a, 2));
a = _mm_max_epu8(a, _mm_srli_si128(a, 1));
return (uint8_t) _mm_extract_epi16(a, 0); /* the cast keeps only the low-order 8 bits, so _extract_epi16 suffices; _mm_extract_epi8() is equivalent but needs SSE4.1 */
}
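/* Example: an illustrative usage sketch (not from Easel's docs). The
* four max/shift pairs above are a log2(16)-step binary reduction
* over the 16 bytes.
*
* \begin{cchunk}
*   __m128i v = _mm_setr_epi8(3,1,4,1, 5,9,2,6, 5,3,5,8, 9,7,9,3);
*   uint8_t m = esl_sse_hmax_epu8(v);   // m == 9
* \end{cchunk}
*/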
#ifdef eslENABLE_SSE4
/* Function: esl_sse_hmax_epi8()
* Synopsis: Return max of 16 int8_t elements in epi8 vector.
* (SSE4.1)
*/
static inline int8_t
esl_sse_hmax_epi8(__m128i a)
{
a = _mm_max_epi8(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(2,3,0,1))); // _MM_SHUFFLE() args are listed in reverse order; _MM_SHUFFLE(3,2,1,0) is a no-op, for example.
a = _mm_max_epi8(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(0,1,2,3)));
a = _mm_max_epi8(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(2,3,0,1)));
a = _mm_max_epi8(a, _mm_srli_epi16 (a, 8));
return (int8_t) _mm_cvtsi128_si32(a);
}
#endif
/* Function: esl_sse_hmax_epi16()
* Synopsis: Return max of 8 int16_t elements in epi16 vector.
*/
static inline int16_t
esl_sse_hmax_epi16(__m128i a)
{
a = _mm_max_epi16(a, _mm_shuffle_epi32 (a, _MM_SHUFFLE(1,0,3,2)));
a = _mm_max_epi16(a, _mm_shufflelo_epi16(a, _MM_SHUFFLE(1,0,3,2)));
a = _mm_max_epi16(a, _mm_srli_epi32(a, 16));
return (int16_t) _mm_cvtsi128_si32(a);
}
/* Function: esl_sse_hmax_ps()
* Synopsis: Find the maximum of elements in a vector.
*
* Purpose: Find the maximum valued element in the four float elements
* in <a>, and return that maximum value in <*ret_max>.
*
* Xref: J3/90 for benchmarking of some alternative implementations.
*/
static inline void
esl_sse_hmax_ps(__m128 a, float *ret_max)
{
a = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)));
a = _mm_max_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
_mm_store_ss(ret_max, a);
}
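/* Example: an illustrative usage sketch (not from Easel's docs):
*
* \begin{cchunk}
*   __m128 v = _mm_setr_ps(0.2f, 1.5f, -3.0f, 0.7f);
*   float vmax;
*   esl_sse_hmax_ps(v, &vmax);   // vmax == 1.5
* \end{cchunk}
*/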
/* Function: esl_sse_hmin_ps()
* Synopsis: Find the minimum of elements in a vector.
*
* Purpose: Find the minimum valued element in the four float elements
* in <a> and return that minimum value in <*ret_min>.
*/
static inline void
esl_sse_hmin_ps(__m128 a, float *ret_min)
{
a = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)));
a = _mm_min_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
_mm_store_ss(ret_min, a);
}
/* Function: esl_sse_hsum_ps()
* Synopsis: Takes the horizontal sum of elements in a vector.
*
* Purpose: Add the four float elements in vector <a>; return
* that sum in <*ret_sum>.
*/
static inline void
esl_sse_hsum_ps(__m128 a, float *ret_sum)
{
a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(0, 3, 2, 1)));
a = _mm_add_ps(a, _mm_shuffle_ps(a, a, _MM_SHUFFLE(1, 0, 3, 2)));
_mm_store_ss(ret_sum, a);
}
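/* Example: an illustrative sketch (not from Easel's docs) of the
* usual final step of a vectorized sum, collapsing four lane-wise
* accumulators into one scalar:
*
* \begin{cchunk}
*   __m128 acc = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
*   float sum;
*   esl_sse_hsum_ps(acc, &sum);   // sum == 10.0
* \end{cchunk}
*/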
/*****************************************************************
* 3. Inlined functions: left, right shift
*****************************************************************/
/* Function: esl_sse_rightshift_int8()
* Synopsis: Shift int8 vector elements to the right, shifting -inf on.
* Incept: SRE, Sun Jun 4 09:52:45 2017
*
* Purpose: Given an int8 vector <{ a0 .. a15 }>, and a mask <{ -inf,
* 0*15 }> with the desired value of -inf in slot 0
* (and zeros elsewhere), return <{ -inf, a0 .. a14 }>;
* i.e. shift the values in <a> to the right, while
* shifting $-\infty$ on.
*
* By our convention, "right" and "left" refer to memory
* order (low addresses on the left). On a little-endian
* (x86) architecture, this is a left shift in the hardware
* register.
*
* This can be used both for signed (epi8) and unsigned
* (epu8) int8 vectors.
*
* Xref: HMMER's simdvec.md: on our left/right convention.
*/
static inline __m128i
esl_sse_rightshift_int8(__m128i a, __m128i neginfmask)
{
return _mm_or_si128(_mm_slli_si128(a, 1), neginfmask);
}
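/* Example: an illustrative sketch (not from Easel's docs; the -128
* sentinel is an assumption, not necessarily what a caller uses for
* -inf). Build <neginfmask> once, with the sentinel in slot 0 and
* zeros elsewhere, then shift:
*
* \begin{cchunk}
*   __m128i v          = _mm_set1_epi8(42);   // some int8 score vector
*   __m128i neginfmask = _mm_setr_epi8(-128, 0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0);
*   __m128i shifted    = esl_sse_rightshift_int8(v, neginfmask);  // { -128, v0..v14 }
* \end{cchunk}
*/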
/* Function: esl_sse_rightshift_int16()
* Synopsis: Shift int16 vector elements to the right, shifting -inf on.
* Incept: SRE, Sun Jun 4 10:12:24 2017 [Gary Jules, Mad World]
*
* Purpose: Same as <esl_sse_rightshift_int8()> but for int16.
*/
static inline __m128i
esl_sse_rightshift_int16(__m128i a, __m128i neginfmask)
{
return _mm_or_si128( _mm_slli_si128(a, 2), neginfmask);
}
/* Function: esl_sse_rightshiftz_float()
* Synopsis: Shift float vector elements to the right, shifting zero on.
*
* Purpose: Same as <esl_sse_rightshift_int8()> but for floats,
* and the value that is shifted on is a zero.
*/
static inline __m128
esl_sse_rightshiftz_float(__m128 a)
{
// Tricky. IEEE754 representation of zero is all 0 bits, so shift alone suffices.
return _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(a), 4));
}
/* Function: esl_sse_leftshiftz_float()
* Synopsis: Shift float vector elements to the left, shifting zero on.
*
* Purpose: Same as <esl_sse_rightshiftz_float()> but leftwise: <[ a0 a1 a2
* a3 ]> becomes <[ a1 a2 a3 0 ]>. Used in Backwards.
*/
static inline __m128
esl_sse_leftshiftz_float(__m128 a)
{
// Same trick. IEEE754 representation of zero is all 0 bits.
return _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(a), 4));
}
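/* Example: an illustrative sketch (not from Easel's docs) of the
* zero-shifting pair:
*
* \begin{cchunk}
*   __m128 v = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
*   __m128 r = esl_sse_rightshiftz_float(v);  // { 0.0, 1.0, 2.0, 3.0 }
*   __m128 l = esl_sse_leftshiftz_float(v);   // { 2.0, 3.0, 4.0, 0.0 }
* \end{cchunk}
*/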
/* Function: esl_sse_rightshift_ps()
* Synopsis: Shift vector elements to the right, shifting b[0] on.
*
* Purpose: Returns a vector containing
* <{ b[0] a[0] a[1] a[2] }>:
* i.e. shift the values in <a> to the
* right, and load the first value of
* <b> into the first slot.
*
* Note: used in Infernal.
*/
static inline __m128
esl_sse_rightshift_ps(__m128 a, __m128 b)
{
return _mm_move_ss(_mm_shuffle_ps(a, a, _MM_SHUFFLE(2, 1, 0, 0)), b);
}
/* Function: esl_sse_leftshift_ps()
* Synopsis: Shift vector elements to the left, shifting b[0] on.
*
* Purpose: Returns a vector containing
* <{ a[1] a[2] a[3] b[0]}>:
* i.e. shift the values in <a> to the
* left and load the first value of
* <b> into the first slot.
*
* Note: used in Infernal.
*/
static inline __m128
esl_sse_leftshift_ps(__m128 a, __m128 b)
{
register __m128 v = _mm_move_ss(a, b); /* now b[0] a[1] a[2] a[3] */
return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)); /* now a[1] a[2] a[3] b[0] */
}
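/* Example: an illustrative sketch (not from Easel's docs). <b>
* carries an edge value in from a neighboring vector, so these two
* functions can chain values across vector boundaries in a DP row:
*
* \begin{cchunk}
*   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
*   __m128 b = _mm_set1_ps(9.0f);
*   __m128 r = esl_sse_rightshift_ps(a, b);  // { 9.0, 1.0, 2.0, 3.0 }
*   __m128 l = esl_sse_leftshift_ps(a, b);   // { 2.0, 3.0, 4.0, 9.0 }
* \end{cchunk}
*/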
/*****************************************************************
* 4. Inlined functions: any_gt
*****************************************************************/
/* Function: esl_sse_any_gt_epu8()
* Synopsis: Returns TRUE if any a[z] > b[z].
*
* Purpose: Return TRUE if any <a[z] > b[z]> for <z=0..15>
* in two <epu8> vectors of unsigned chars.
*
* We need this incantation because SSE provides
* no <cmpgt_epu8> instruction.
*
* For equality tests, note that <cmpeq_epi8> works fine
* for unsigned ints (though there is no <cmpeq_epu8>
* instruction either).
*
* See also Altivec's <vec_any_gt>.
*/
static inline int
esl_sse_any_gt_epu8(__m128i a, __m128i b)
{
__m128i mask = _mm_cmpeq_epi8(_mm_max_epu8(a,b), b); /* anywhere a>b, mask[z] = 0x0; elsewhere 0xff */
int maskbits = _mm_movemask_epi8(_mm_xor_si128(mask, _mm_cmpeq_epi8(mask, mask))); /* the xor incantation is a bitwise inversion */
return maskbits != 0;
}
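/* Example: an illustrative sketch (not from Easel's docs) of testing
* unsigned scores against a threshold vector:
*
* \begin{cchunk}
*   __m128i sc     = _mm_set1_epi8((char) 200);  // 200, read as unsigned epu8
*   __m128i thresh = _mm_set1_epi8((char) 100);
*   int hit = esl_sse_any_gt_epu8(sc, thresh);   // hit is nonzero (TRUE)
* \end{cchunk}
*/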
/* Function: esl_sse_any_gt_epi16()
* Synopsis: Returns TRUE if any a[z] > b[z].
*/
static inline int
esl_sse_any_gt_epi16(__m128i a, __m128i b)
{
return (_mm_movemask_epi8(_mm_cmpgt_epi16(a,b)) != 0);
}
/* Function: esl_sse_any_gt_ps()
* Synopsis: Returns TRUE if any a[z] > b[z].
*
* Xref: From Apple Altivec/SSE migration guide.
*/
static inline int
esl_sse_any_gt_ps(__m128 a, __m128 b)
{
__m128 mask = _mm_cmpgt_ps(a,b);
int maskbits = _mm_movemask_ps( mask );
return maskbits != 0;
}
/*****************************************************************
* 5. Inlined functions: select
*****************************************************************/
/* Function: esl_sse_select_ps()
* Synopsis: SSE equivalent of <vec_sel()>
*
* Purpose: Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
* is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
*
* Useful for avoiding conditional branches. For example,
* to implement \ccode{if (a > 0) a += a;}:
*
* \begin{cchunk}
* mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
* twoa = _mm_add_ps(a, a);
* a = esl_sse_select_ps(a, twoa, mask);
* \end{cchunk}
*
* Notes: As recommended by the Altivec/SSE Migration Guide,
* Apple Computer, Inc.
*/
static inline __m128
esl_sse_select_ps(__m128 a, __m128 b, __m128 mask)
{
b = _mm_and_ps(b, mask);
a = _mm_andnot_ps(mask, a);
return _mm_or_ps(a,b);
}
#endif /*eslENABLE_SSE || eslENABLE_SSE4 */
#endif /*eslSSE_INCLUDED*/