Skip to content

Commit

Permalink
Renamed left,right shift functions in esl_avx for consistency.
Browse files Browse the repository at this point in the history
  • Loading branch information
cryptogenomicon committed Jun 4, 2017
1 parent 7352bd6 commit 6619945
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 91 deletions.
59 changes: 34 additions & 25 deletions esl_avx.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -113,38 +113,49 @@ esl_avx_hsum_ps(__m256 a, float *ret_sum)
* 3. Inlined functions: left and right shift * 3. Inlined functions: left and right shift
******************************************************************/ ******************************************************************/


/* naming conventions: The left/right in the names of these functions refers to the direction of the SSE shift instruction /* Function: esl_avx_rightshift_int8()
that they emulate, because that's what the first filters to be ported to AVX used. The esl_sse_(left/right)shift functions * Synopsis: Shift int8 vector elements to the right.
are vector-logical, meaning that, on x86, they mirror the function of the shift instruction with the opposite name. For * Incept: SRE, Sun Jun 4 17:12:07 2017
self-consistency, I'm sticking with names that match the direction of the instruction, even though this means that the SSE * See: esl_sse.h::esl_sse_rightshift_int8()
and AVX filters call different functions. */
*/ static inline __m256i

esl_avx_rightshift_int8(__m256i v, __m256i neginfmask)
// shifts vector left by one byte
static inline __m256i esl_avx_leftshift_one(__m256i vector)
{ {
register __m256i temp_mask_AVX = _mm256_permute2x128_si256(vector, vector, _MM_SHUFFLE(0,0,3,0) ); return _mm256_or_si256(_mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)), 15), neginfmask);
return(_mm256_alignr_epi8(vector, temp_mask_AVX,15));
} }


// shifts vector left by two bytes /* Function: esl_avx_rightshift_int16()
static inline __m256i esl_avx_leftshift_two(__m256i vector) * Synopsis: Shift int16 vector elements to the right.
* Incept: SRE, Sun Jun 4 17:13:58 2017
* See: esl_sse.h::esl_sse_rightshift_int16()
*/
static inline __m256i
esl_avx_rightshift_int16(__m256i v, __m256i neginfmask)
{ {
register __m256i temp_mask_AVX = _mm256_permute2x128_si256(vector, vector, _MM_SHUFFLE(0,0,3,0) ); return _mm256_or_si256(_mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)), 14), neginfmask);
return(_mm256_alignr_epi8(vector, temp_mask_AVX,14));
} }
// shifts vector left by four bytes (one float)
static inline __m256 esl_avx_leftshift_ps(__m256 vector) /* Function: esl_avx_rightshiftz_float()
* Synopsis: Shift float vector elements to the right, shifting zero on.
* Incept: SRE, Sun Jun 4 17:16:42 2017
* See: esl_sse.h::esl_sse_rightshiftz_float()
*/
static inline __m256
esl_avx_rightshiftz_float(__m256 v)
{ {
register __m256i temp_mask_AVX = _mm256_permute2x128_si256((__m256i) vector, (__m256i) vector, _MM_SHUFFLE(0,0,3,0) ); return ((__m256) _mm256_alignr_epi8((__m256i) v, _mm256_permute2x128_si256((__m256i) v, (__m256i) v, _MM_SHUFFLE(0,0,3,0) ), 12));
return((__m256) _mm256_alignr_epi8((__m256i) vector, temp_mask_AVX,12));
} }


// shifts vector right by four bytes (one float) /* Function: esl_avx_leftshiftz_float()
static inline __m256 esl_avx_rightshift_ps(__m256 vector) * Synopsis: Shift float vector elements to the left, shifting zero on.
* Incept: SRE, Sun Jun 4 17:27:52 2017
* See: esl_sse.h::esl_sse_leftshiftz_float()
*/
static inline __m256
esl_avx_leftshiftz_float(__m256 v)
{ {
register __m256i temp1 = _mm256_permute2x128_si256((__m256i) vector, (__m256i) vector, 0x81); //result has vector[255:128] in low 128 bits, 0 in high 128 //permute result has vector[255:128] in low 128 bits, 0 in high 128
return((__m256) _mm256_alignr_epi8(temp1, (__m256i) vector,4)); return ((__m256) _mm256_alignr_epi8(_mm256_permute2x128_si256((__m256i) v, (__m256i) v, 0x81), v, 4));
} }




Expand All @@ -161,7 +172,5 @@ esl_avx_any_gt_epi16(__m256i a, __m256i b)
return (_mm256_movemask_epi8(_mm256_cmpgt_epi16(a,b)) != 0); return (_mm256_movemask_epi8(_mm256_cmpgt_epi16(a,b)) != 0);
} }




#endif /*eslAVX_INCLUDED*/ #endif /*eslAVX_INCLUDED*/
#endif // eslENABLE_AVX #endif // eslENABLE_AVX
7 changes: 4 additions & 3 deletions esl_avx512.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* Contents: * Contents:
* 1. Function declarations for esl_avx512.c * 1. Function declarations for esl_avx512.c
* 2. Inlined functions: horizontal max, sum * 2. Inlined functions: horizontal max, sum
* 3. Inlined functions: left and right shifts * 3. Inlined functions: left, right shift
*/ */
#ifndef eslAVX512_INCLUDED #ifndef eslAVX512_INCLUDED
#define eslAVX512_INCLUDED #define eslAVX512_INCLUDED
Expand All @@ -25,8 +25,6 @@
extern void esl_avx512_dump_512i_hex8(__m512i v); extern void esl_avx512_dump_512i_hex8(__m512i v);






/***************************************************************** /*****************************************************************
* 2. Inlined functions: horizontal max, sum * 2. Inlined functions: horizontal max, sum
*****************************************************************/ *****************************************************************/
Expand Down Expand Up @@ -115,6 +113,9 @@ esl_avx512_hsum_ps(__m512 a, float *ret_sum)


// shifts vector left by one byte. Uses a similar technique to the AVX macro, but is complicated by the // shifts vector left by one byte. Uses a similar technique to the AVX macro, but is complicated by the
// lack of permute2x128 instruction in AVX-512 // lack of permute2x128 instruction in AVX-512



static inline __m512i static inline __m512i
esl_avx512_leftshift_one(__m512i vector) esl_avx512_leftshift_one(__m512i vector)
{ {
Expand Down
128 changes: 65 additions & 63 deletions esl_neon.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
* 1. Data structures for ARM/Intel intrinsics compatibility * 1. Data structures for ARM/Intel intrinsics compatibility
* 2. Function declarations for esl_neon * 2. Function declarations for esl_neon
* 3. Inlined functions: horizontal max, sum * 3. Inlined functions: horizontal max, sum
* 4. Inlined utilities for float vectors (4 floats in esl_neon_128f_t) * 4. Inlined functions: left, right shift
* 4. Inlined utilities for epu8 vectors (16 uchars in esl_neon_128i_t) * 5. Inlined functions: any_gt
* 6. Inlined functions: select
* *
*/ */
#include "esl_config.h" #include "esl_config.h"
Expand Down Expand Up @@ -219,67 +220,9 @@ esl_neon_hsum_float(esl_neon_128f_t a, float *ret_sum)
} }


/***************************************************************** /*****************************************************************
* 3. Inline utilities for ps vectors (4 floats in esl_neon_128f_t) * 4. Inlined functions: left, right shifts
*****************************************************************/ *****************************************************************/


/* Function: esl_neon_select_float()
 * Synopsis: NEON equivalent of <vec_sel()>
 *
 * Purpose:  Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
 *           is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
 *
 *           Useful for avoiding conditional branches. For example,
 *           to implement \ccode{if (a > 0) a += a;}:
 *
 *           \begin{cchunk}
 *             mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
 *             twoa = _mm_add_ps(a, a);
 *             a    = esl_sse_select_ps(a, twoa, mask);
 *           \end{cchunk}
 *
 */
static inline esl_neon_128f_t
esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
{
  esl_neon_128i_t av, bv, mv, mnot;
  esl_neon_128f_t r;

  /* Reinterpret all three float vectors as integer bit patterns,
   * so the select can be done with bitwise logic: (b & mask) | (a & ~mask).
   */
  av.s64x2 = vreinterpretq_s64_f32(a.f32x4);
  bv.s64x2 = vreinterpretq_s64_f32(b.f32x4);
  mv.s64x2 = vreinterpretq_s64_f32(mask.f32x4);

  mnot.s32x4 = vmvnq_s32(mv.s32x4);              /* ~mask                   */
  av.s64x2   = vandq_s64(mnot.s64x2, av.s64x2);  /* keep a where mask = 0's */
  bv.s64x2   = vandq_s64(bv.s64x2, mv.s64x2);    /* keep b where mask = 1's */

  r.f32x4 = vreinterpretq_f32_s64(vorrq_s64(av.s64x2, bv.s64x2));
  return r;
}


/* Function: esl_neon_any_gt_float()
 * Synopsis: Returns TRUE if any a[z] > b[z]
 *
 * Purpose:  Returns TRUE if any a[z] > b[z] in two
 *           <ps> vectors of floats.
 *
 * Note:     Ported from esl_sse.c::esl_sse_any_gt_float().
 */
static inline int
esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128i_t mask;
  uint64_t l0, l1;

  /* vcgtq_f32 sets each 32-bit lane to all 1's where a[z] > b[z], all 0's otherwise. */
  mask.u32x4 = vcgtq_f32(a.f32x4,b.f32x4);
  /* Extract as full 64-bit lanes. These must stay 64-bit: truncating to a
   * 32-bit int would discard the upper half of each lane, so a hit in float
   * lane 1 or 3 alone (lane pattern 0xFFFFFFFF00000000) would be lost and
   * the function would wrongly return FALSE.
   */
  l0 = vgetq_lane_u64(mask.u64x2, 0);
  l1 = vgetq_lane_u64(mask.u64x2, 1);
  return (l0 | l1) != 0;
}






/* Function: esl_neon_rightshift_float() /* Function: esl_neon_rightshift_float()
* Synopsis: Shift vector elements to the right. * Synopsis: Shift vector elements to the right.
Expand Down Expand Up @@ -320,8 +263,8 @@ esl_neon_leftshift_float(esl_neon_128f_t a, esl_neon_128f_t b)




/***************************************************************** /*****************************************************************
* 4. Inlined utilities for epu8 vectors (16 uchars in __m128i) * 5. Inlined functions: any_gt
*****************************************************************/ *****************************************************************/


/* Function: esl_neon_any_gt_s16() /* Function: esl_neon_any_gt_s16()
* Synopsis: Returns TRUE if any a[z] > b[z]. * Synopsis: Returns TRUE if any a[z] > b[z].
Expand All @@ -343,6 +286,65 @@ esl_neon_any_gt_s16(esl_neon_128i_t a, esl_neon_128i_t b)
return maskbits != 0; return maskbits != 0;
} }


/* Function: esl_neon_any_gt_float()
 * Synopsis: Returns TRUE if any a[z] > b[z]
 *
 * Purpose:  Returns TRUE if any a[z] > b[z] in two
 *           <ps> vectors of floats.
 *
 * Note:     Ported from esl_sse.c::esl_sse_any_gt_float().
 */
static inline int
esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128i_t mask;
  uint64_t l0, l1;

  /* vcgtq_f32 sets each 32-bit lane to all 1's where a[z] > b[z], all 0's otherwise. */
  mask.u32x4 = vcgtq_f32(a.f32x4,b.f32x4);
  /* Extract as full 64-bit lanes. These must stay 64-bit: truncating to a
   * 32-bit int would discard the upper half of each lane, so a hit in float
   * lane 1 or 3 alone (lane pattern 0xFFFFFFFF00000000) would be lost and
   * the function would wrongly return FALSE.
   */
  l0 = vgetq_lane_u64(mask.u64x2, 0);
  l1 = vgetq_lane_u64(mask.u64x2, 1);
  return (l0 | l1) != 0;
}



/*****************************************************************
* 6. Inlined functions: select
*****************************************************************/

/* Function: esl_neon_select_float()
 * Synopsis: NEON equivalent of <vec_sel()>
 *
 * Purpose:  Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
 *           is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
 *
 *           Useful for avoiding conditional branches. For example,
 *           to implement \ccode{if (a > 0) a += a;}:
 *
 *           \begin{cchunk}
 *             mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
 *             twoa = _mm_add_ps(a, a);
 *             a    = esl_sse_select_ps(a, twoa, mask);
 *           \end{cchunk}
 *
 */
static inline esl_neon_128f_t
esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
{
  esl_neon_128i_t av, bv, mv, mnot;
  esl_neon_128f_t r;

  /* Reinterpret all three float vectors as integer bit patterns,
   * so the select can be done with bitwise logic: (b & mask) | (a & ~mask).
   */
  av.s64x2 = vreinterpretq_s64_f32(a.f32x4);
  bv.s64x2 = vreinterpretq_s64_f32(b.f32x4);
  mv.s64x2 = vreinterpretq_s64_f32(mask.f32x4);

  mnot.s32x4 = vmvnq_s32(mv.s32x4);              /* ~mask                   */
  av.s64x2   = vandq_s64(mnot.s64x2, av.s64x2);  /* keep a where mask = 0's */
  bv.s64x2   = vandq_s64(bv.s64x2, mv.s64x2);    /* keep b where mask = 1's */

  r.f32x4 = vreinterpretq_f32_s64(vorrq_s64(av.s64x2, bv.s64x2));
  return r;
}






Expand Down

0 comments on commit 6619945

Please sign in to comment.