
Renamed left,right shift functions in esl_avx for consistency.

cryptogenomicon committed Jun 4, 2017
1 parent 7352bd6 commit 6619945637d5ac847b587c27dad1064ce22c7add
Showing with 103 additions and 91 deletions.
  1. +34 −25 esl_avx.h
  2. +4 −3 esl_avx512.h
  3. +65 −63 esl_neon.h
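
The sketch below is not part of the commit. It is a minimal standalone illustration, with hypothetical names, of the _mm256_permute2x128_si256 + _mm256_alignr_epi8 idiom that the renamed esl_avx shift functions share, and of the naming ambiguity the rename resolves: byte-wise the operation shifts toward higher byte indices (the direction the old leftshift_* names described, after the SSE shift instruction being emulated), while in terms of striped vector elements it is a shift to the right (the direction the new rightshift_* names describe). Requires AVX2; compile with, e.g., gcc -mavx2.

#include <stdio.h>
#include <stdint.h>
#include <immintrin.h>

/* Shift all 32 bytes of v up by one position, bringing a zero into byte 0.
 * _mm256_alignr_epi8 works within each 128-bit lane, so the low lane of v
 * is first swung into the high lane (low lane zeroed) with
 * _mm256_permute2x128_si256; alignr then carries one byte across the lane
 * boundary. This mirrors what the old esl_avx_leftshift_one() body did. */
static inline __m256i
demo_byteshift_up_one(__m256i v)
{
  __m256i carry = _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)); /* low lane = 0, high lane = v's low lane */
  return _mm256_alignr_epi8(v, carry, 15);
}

int
main(void)
{
  uint8_t in[32], out[32];
  for (int i = 0; i < 32; i++) in[i] = (uint8_t) i;

  __m256i v = _mm256_loadu_si256((const __m256i *) in);
  _mm256_storeu_si256((__m256i *) out, demo_byteshift_up_one(v));

  for (int i = 0; i < 32; i++) printf("%d ", out[i]);   /* expect 0, then 0..30 */
  printf("\n");
  return 0;
}
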
@@ -113,38 +113,49 @@ esl_avx_hsum_ps(__m256 a, float *ret_sum)
* 3. Inlined functions: left and right shift
******************************************************************/

- /* naming conventions: The left/right in the names of these functions refers to the direction of the SSE shift instruction
- that they emulate, because that's what the first filters to be ported to AVX used. The esl_sse_(left/right)shift functions
- are vector-logical, meaning that, on x86, they mirror the function of the shift instruction with the opposite name. For
- self-consistency, I'm sticking with names that match the direction of the instruction, even though this means that the SSE
- and AVX filters call different functions.
- */

- // shifts vector left by one byte
- static inline __m256i esl_avx_leftshift_one(__m256i vector)
+ /* Function: esl_avx_rightshift_int8()
+ * Synopsis: Shift int8 vector elements to the right.
+ * Incept: SRE, Sun Jun 4 17:12:07 2017
+ * See: esl_sse.h::esl_sse_rightshift_int8()
+ */
+ static inline __m256i
+ esl_avx_rightshift_int8(__m256i v, __m256i neginfmask)
{
- register __m256i temp_mask_AVX = _mm256_permute2x128_si256(vector, vector, _MM_SHUFFLE(0,0,3,0) );
- return(_mm256_alignr_epi8(vector, temp_mask_AVX,15));
+ return _mm256_or_si256(_mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)), 15), neginfmask);
}

- // shifts vector left by two bytes
- static inline __m256i esl_avx_leftshift_two(__m256i vector)
+ /* Function: esl_avx_rightshift_int16()
+ * Synopsis: Shift int16 vector elements to the right.
+ * Incept: SRE, Sun Jun 4 17:13:58 2017
+ * See: esl_sse.h::esl_sse_rightshift_int16()
+ */
+ static inline __m256i
+ esl_avx_rightshift_int16(__m256i v, __m256i neginfmask)
{
- register __m256i temp_mask_AVX = _mm256_permute2x128_si256(vector, vector, _MM_SHUFFLE(0,0,3,0) );
- return(_mm256_alignr_epi8(vector, temp_mask_AVX,14));
+ return _mm256_or_si256(_mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)), 14), neginfmask);
}
- // shifts vector left by four bytes (one float)
- static inline __m256 esl_avx_leftshift_ps(__m256 vector)

+ /* Function: esl_avx_rightshiftz_float()
+ * Synopsis: Shift float vector elements to the right, shifting zero on.
+ * Incept: SRE, Sun Jun 4 17:16:42 2017
+ * See: esl_sse.h::esl_sse_rightshiftz_float()
+ */
+ static inline __m256
+ esl_avx_rightshiftz_float(__m256 v)
{
- register __m256i temp_mask_AVX = _mm256_permute2x128_si256((__m256i) vector, (__m256i) vector, _MM_SHUFFLE(0,0,3,0) );
- return((__m256) _mm256_alignr_epi8((__m256i) vector, temp_mask_AVX,12));
+ return ((__m256) _mm256_alignr_epi8((__m256i) v, _mm256_permute2x128_si256((__m256i) v, (__m256i) v, _MM_SHUFFLE(0,0,3,0) ), 12));
}

- // shifts vector right by four bytes (one float)
- static inline __m256 esl_avx_rightshift_ps(__m256 vector)
+ /* Function: esl_avx_leftshiftz_float()
+ * Synopsis: Shift float vector elements to the left, shifting zero on.
+ * Incept: SRE, Sun Jun 4 17:27:52 2017
+ * See: esl_sse.h::esl_sse_leftshiftz_float()
+ */
+ static inline __m256
+ esl_avx_leftshiftz_float(__m256 v)
{
- register __m256i temp1 = _mm256_permute2x128_si256((__m256i) vector, (__m256i) vector, 0x81); //result has vector[255:128] in low 128 bits, 0 in high 128
- return((__m256) _mm256_alignr_epi8(temp1, (__m256i) vector,4));
+ //permute result has vector[255:128] in low 128 bits, 0 in high 128
+ return ((__m256) _mm256_alignr_epi8(_mm256_permute2x128_si256((__m256i) v, (__m256i) v, 0x81), v, 4));
}


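A difference visible in this hunk is that the renamed int8/int16 functions take an explicit neginfmask argument instead of always shifting a zero on. The fragment below is a hypothetical caller-side sketch, not taken from this commit: the choice of -128 as the int8 "minus infinity" sentinel, the mask construction, and the demo_* name are assumptions for illustration only.

#include <immintrin.h>
#include "esl_avx.h"   /* declares esl_avx_rightshift_int8() */

/* Shift one striped int8 score vector right by one element, pulling an
 * assumed -128 "minus infinity" sentinel into element 0. In a real filter
 * the mask would be built once, outside the inner loop. */
static inline __m256i
demo_rightshift_with_neginf(__m256i sv)
{
  __m256i neginfmask = _mm256_insert_epi8(_mm256_setzero_si256(), -128, 0);
  return esl_avx_rightshift_int8(sv, neginfmask);
}
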
@@ -161,7 +172,5 @@ esl_avx_any_gt_epi16(__m256i a, __m256i b)
return (_mm256_movemask_epi8(_mm256_cmpgt_epi16(a,b)) != 0);
}



#endif /*eslAVX_INCLUDED*/
#endif // eslENABLE_AVX
@@ -6,7 +6,7 @@
* Contents:
* 1. Function declarations for esl_avx512.c
* 2. Inlined functions: horizontal max, sum
- * 3. Inlined functions: left and right shifts
+ * 3. Inlined functions: left, right shift
*/
#ifndef eslAVX512_INCLUDED
#define eslAVX512_INCLUDED
@@ -25,8 +25,6 @@
extern void esl_avx512_dump_512i_hex8(__m512i v);




/*****************************************************************
* 2. Inlined functions: horizontal max, sum
*****************************************************************/
@@ -115,6 +113,9 @@ esl_avx512_hsum_ps(__m512 a, float *ret_sum)

// shifts vector left by one byte. Uses a similar technique to the AVX macro, but is complicated by the
// lack of permute2x128 instruction in AVX-512



static inline __m512i
esl_avx512_leftshift_one(__m512i vector)
{
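The comment in this hunk notes that AVX-512 has no permute2x128 equivalent, so the lane-crossing shift needs a different construction. The sketch below is one possible way to express the same byte-wise shift with AVX-512F/BW intrinsics (a zero-masked 128-bit-lane shuffle feeding _mm512_alignr_epi8); it illustrates the general idea, uses a hypothetical name, and is not necessarily the construction used in esl_avx512.

#include <immintrin.h>   /* requires AVX-512F and AVX-512BW */

/* Shift all 64 bytes of v up by one position, bringing a zero into byte 0.
 * First build a vector whose 128-bit lanes are { 0, v.lane0, v.lane1, v.lane2 }:
 * the lane shuffle moves each lane up by one slot and the zero-mask on
 * elements 0-3 clears the low lane. _mm512_alignr_epi8 then carries one byte
 * across each lane boundary, as in the AVX2 version. */
static inline __m512i
demo_avx512_byteshift_up_one(__m512i v)
{
  __m512i carry = _mm512_maskz_shuffle_i32x4(0xFFF0, v, v, _MM_SHUFFLE(2,1,0,0));
  return _mm512_alignr_epi8(v, carry, 15);
}
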
@@ -14,8 +14,9 @@
* 1. Data structures for ARM/Intel intrinsics compatibility
* 2. Function declarations for esl_neon
* 3. Inlined functions: horizontal max, sum
- * 4. Inlined utilities for float vectors (4 floats in esl_neon_128f_t)
- * 4. Inlined utilities for epu8 vectors (16 uchars in esl_neon_128i_t)
+ * 4. Inlined functions: left, right shift
+ * 5. Inlined functions: any_gt
+ * 6. Inlined functions: select
*
*/
#include "esl_config.h"
@@ -219,67 +220,9 @@ esl_neon_hsum_float(esl_neon_128f_t a, float *ret_sum)
}

/*****************************************************************
- * 3. Inline utilities for ps vectors (4 floats in esl_neon_128f_t)
+ * 4. Inlined functions: left, right shifts
*****************************************************************/

- /* Function: esl_neon_select_float()
- * Synopsis: NEON equivalent of <vec_sel()>
- *
- * Purpose: Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
- * is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
- *
- * Useful for avoiding conditional branches. For example,
- * to implement \ccode{if (a > 0) a += a;}:
- *
- * \begin{cchunk}
- * mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
- * twoa = _mm_add_ps(a, a);
- * a = esl_sse_select_ps(a, twoa, mask);
- * \end{cchunk}
- *
- */
- static inline esl_neon_128f_t
- esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
- {
- esl_neon_128i_t aview, bview, maskview, masknot;
- esl_neon_128f_t ret;
-
- maskview.s64x2 = vreinterpretq_s64_f32(mask.f32x4);
- bview.s64x2 = vreinterpretq_s64_f32(b.f32x4);
- aview.s64x2 = vreinterpretq_s64_f32(a.f32x4);
- bview.s64x2 = vandq_s64(bview.s64x2, maskview.s64x2);
- masknot.s32x4 = vmvnq_s32(maskview.s32x4);
- aview.s64x2 = vandq_s64(masknot.s64x2, aview.s64x2);
- ret.f32x4 = vreinterpretq_f32_s64(vorrq_s64(aview.s64x2,bview.s64x2));
- return ret;
- }
-
-
- /* Function: esl_neon_any_gt_float()
- * Synopsis: Returns TRUE if any a[z] > b[z]
- *
- * Purpose: Returns TRUE if any a[z] > b[z] in two
- * <ps> vectors of floats.
- *
- * Note: Ported from esl_sse.c::esl_sse_any_gt_float().
- */
- static inline int
- esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
- {
- esl_neon_128i_t mask;
- int l0, l1;
- int maskbits;
-
- mask.u32x4 = vcgtq_f32(a.f32x4,b.f32x4);
- l0 = vgetq_lane_u64(mask.u64x2, 0);
- l1 = vgetq_lane_u64(mask.u64x2, 1);
- maskbits = l0 | l1;
- return maskbits != 0;
- }





/* Function: esl_neon_rightshift_float()
* Synopsis: Shift vector elements to the right.
@@ -320,8 +263,8 @@ esl_neon_leftshift_float(esl_neon_128f_t a, esl_neon_128f_t b)


/*****************************************************************
- * 4. Inlined utilities for epu8 vectors (16 uchars in __m128i)
- *****************************************************************/
+ * 5. Inlined functions: any_gt
+ *****************************************************************/

/* Function: esl_neon_any_gt_s16()
* Synopsis: Returns TRUE if any a[z] > b[z].
@@ -343,6 +286,65 @@ esl_neon_any_gt_s16(esl_neon_128i_t a, esl_neon_128i_t b)
return maskbits != 0;
}

+ /* Function: esl_neon_any_gt_float()
+ * Synopsis: Returns TRUE if any a[z] > b[z]
+ *
+ * Purpose: Returns TRUE if any a[z] > b[z] in two
+ * <ps> vectors of floats.
+ *
+ * Note: Ported from esl_sse.c::esl_sse_any_gt_float().
+ */
+ static inline int
+ esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
+ {
+ esl_neon_128i_t mask;
+ int l0, l1;
+ int maskbits;
+
+ mask.u32x4 = vcgtq_f32(a.f32x4,b.f32x4);
+ l0 = vgetq_lane_u64(mask.u64x2, 0);
+ l1 = vgetq_lane_u64(mask.u64x2, 1);
+ maskbits = l0 | l1;
+ return maskbits != 0;
+ }
+
+
+
+ /*****************************************************************
+ * 6. Inlined functions: select
+ *****************************************************************/
+
+ /* Function: esl_neon_select_float()
+ * Synopsis: NEON equivalent of <vec_sel()>
+ *
+ * Purpose: Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
+ * is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
+ *
+ * Useful for avoiding conditional branches. For example,
+ * to implement \ccode{if (a > 0) a += a;}:
+ *
+ * \begin{cchunk}
+ * mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
+ * twoa = _mm_add_ps(a, a);
+ * a = esl_sse_select_ps(a, twoa, mask);
+ * \end{cchunk}
+ *
+ */
+ static inline esl_neon_128f_t
+ esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
+ {
+ esl_neon_128i_t aview, bview, maskview, masknot;
+ esl_neon_128f_t ret;
+
+ maskview.s64x2 = vreinterpretq_s64_f32(mask.f32x4);
+ bview.s64x2 = vreinterpretq_s64_f32(b.f32x4);
+ aview.s64x2 = vreinterpretq_s64_f32(a.f32x4);
+ bview.s64x2 = vandq_s64(bview.s64x2, maskview.s64x2);
+ masknot.s32x4 = vmvnq_s32(maskview.s32x4);
+ aview.s64x2 = vandq_s64(masknot.s64x2, aview.s64x2);
+ ret.f32x4 = vreinterpretq_f32_s64(vorrq_s64(aview.s64x2,bview.s64x2));
+ return ret;
+ }


