Skip to content

Commit

Permalink
Renamed left,right shift functions in esl_avx for consistency.
Browse files Browse the repository at this point in the history
  • Loading branch information
cryptogenomicon committed Jun 4, 2017
1 parent 7352bd6 commit 6619945
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 91 deletions.
59 changes: 34 additions & 25 deletions esl_avx.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -113,38 +113,49 @@ esl_avx_hsum_ps(__m256 a, float *ret_sum)
* 3. Inlined functions: left and right shift * 3. Inlined functions: left and right shift
******************************************************************/ ******************************************************************/


/* naming conventions: The left/right in the names of these functions refers to the direction of the SSE shift instruction /* Function: esl_avx_rightshift_int8()
that they emulate, because that's what the first filters to be ported to AVX used. The esl_sse_(left/right)shift functions * Synopsis: Shift int8 vector elements to the right.
are vector-logical, meaning that, on x86, they mirror the function of the shift instruction with the opposite name. For * Incept: SRE, Sun Jun 4 17:12:07 2017
self-consistency, I'm sticking with names that match the direction of the instruction, even though this means that the SSE * See: esl_sse.h::esl_sse_rightshift_int8()
and AVX filters call different functions. */
*/ static inline __m256i

esl_avx_rightshift_int8(__m256i v, __m256i neginfmask)
// shifts vector left by one byte
static inline __m256i esl_avx_leftshift_one(__m256i vector)
{ {
register __m256i temp_mask_AVX = _mm256_permute2x128_si256(vector, vector, _MM_SHUFFLE(0,0,3,0) ); return _mm256_or_si256(_mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)), 15), neginfmask);
return(_mm256_alignr_epi8(vector, temp_mask_AVX,15));
} }


// shifts vector left by two bytes /* Function: esl_avx_rightshift_int16()
static inline __m256i esl_avx_leftshift_two(__m256i vector) * Synopsis: Shift int16 vector elements to the right.
* Incept: SRE, Sun Jun 4 17:13:58 2017
* See: esl_sse.h::esl_sse_rightshift_int16()
*/
static inline __m256i
esl_avx_rightshift_int16(__m256i v, __m256i neginfmask)
{ {
register __m256i temp_mask_AVX = _mm256_permute2x128_si256(vector, vector, _MM_SHUFFLE(0,0,3,0) ); return _mm256_or_si256(_mm256_alignr_epi8(v, _mm256_permute2x128_si256(v, v, _MM_SHUFFLE(0,0,3,0)), 14), neginfmask);
return(_mm256_alignr_epi8(vector, temp_mask_AVX,14));
} }
// shifts vector left by four bytes (one float)
static inline __m256 esl_avx_leftshift_ps(__m256 vector) /* Function: esl_avx_rightshiftz_float()
* Synopsis: Shift float vector elements to the right, shifting zero on.
* Incept: SRE, Sun Jun 4 17:16:42 2017
* See: esl_sse.h::esl_sse_rightshiftz_float()
*/
static inline __m256
esl_avx_rightshiftz_float(__m256 v)
{ {
register __m256i temp_mask_AVX = _mm256_permute2x128_si256((__m256i) vector, (__m256i) vector, _MM_SHUFFLE(0,0,3,0) ); return ((__m256) _mm256_alignr_epi8((__m256i) v, _mm256_permute2x128_si256((__m256i) v, (__m256i) v, _MM_SHUFFLE(0,0,3,0) ), 12));
return((__m256) _mm256_alignr_epi8((__m256i) vector, temp_mask_AVX,12));
} }


// shifts vector right by four bytes (one float) /* Function: esl_avx_leftshiftz_float()
static inline __m256 esl_avx_rightshift_ps(__m256 vector) * Synopsis: Shift float vector elements to the left, shifting zero on.
* Incept: SRE, Sun Jun 4 17:27:52 2017
* See: esl_sse.h::esl_sse_leftshiftz_float()
*/
static inline __m256
esl_avx_leftshiftz_float(__m256 v)
{ {
register __m256i temp1 = _mm256_permute2x128_si256((__m256i) vector, (__m256i) vector, 0x81); //result has vector[255:128] in low 128 bits, 0 in high 128 //permute result has vector[255:128] in low 128 bits, 0 in high 128
return((__m256) _mm256_alignr_epi8(temp1, (__m256i) vector,4)); return ((__m256) _mm256_alignr_epi8(_mm256_permute2x128_si256((__m256i) v, (__m256i) v, 0x81), v, 4));
} }




Expand All @@ -161,7 +172,5 @@ esl_avx_any_gt_epi16(__m256i a, __m256i b)
return (_mm256_movemask_epi8(_mm256_cmpgt_epi16(a,b)) != 0); return (_mm256_movemask_epi8(_mm256_cmpgt_epi16(a,b)) != 0);
} }




#endif /*eslAVX_INCLUDED*/ #endif /*eslAVX_INCLUDED*/
#endif // eslENABLE_AVX #endif // eslENABLE_AVX
7 changes: 4 additions & 3 deletions esl_avx512.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
* Contents: * Contents:
* 1. Function declarations for esl_avx512.c * 1. Function declarations for esl_avx512.c
* 2. Inlined functions: horizontal max, sum * 2. Inlined functions: horizontal max, sum
* 3. Inlined functions: left and right shifts * 3. Inlined functions: left, right shift
*/ */
#ifndef eslAVX512_INCLUDED #ifndef eslAVX512_INCLUDED
#define eslAVX512_INCLUDED #define eslAVX512_INCLUDED
Expand All @@ -25,8 +25,6 @@
extern void esl_avx512_dump_512i_hex8(__m512i v); extern void esl_avx512_dump_512i_hex8(__m512i v);






/***************************************************************** /*****************************************************************
* 2. Inlined functions: horizontal max, sum * 2. Inlined functions: horizontal max, sum
*****************************************************************/ *****************************************************************/
Expand Down Expand Up @@ -115,6 +113,9 @@ esl_avx512_hsum_ps(__m512 a, float *ret_sum)


// shifts vector left by one byte. Uses a similar technique to the AVX macro, but is complicated by the // shifts vector left by one byte. Uses a similar technique to the AVX macro, but is complicated by the
// lack of permute2x128 instruction in AVX-512 // lack of permute2x128 instruction in AVX-512



static inline __m512i static inline __m512i
esl_avx512_leftshift_one(__m512i vector) esl_avx512_leftshift_one(__m512i vector)
{ {
Expand Down
128 changes: 65 additions & 63 deletions esl_neon.h
Original file line number Original file line Diff line number Diff line change
Expand Up @@ -14,8 +14,9 @@
* 1. Data structures for ARM/Intel intrinsics compatibility * 1. Data structures for ARM/Intel intrinsics compatibility
* 2. Function declarations for esl_neon * 2. Function declarations for esl_neon
* 3. Inlined functions: horizontal max, sum * 3. Inlined functions: horizontal max, sum
* 4. Inlined utilities for float vectors (4 floats in esl_neon_128f_t) * 4. Inlined functions: left, right shift
* 4. Inlined utilities for epu8 vectors (16 uchars in esl_neon_128i_t) * 5. Inlined functions: any_gt
* 6. Inlined functions: select
* *
*/ */
#include "esl_config.h" #include "esl_config.h"
Expand Down Expand Up @@ -219,67 +220,9 @@ esl_neon_hsum_float(esl_neon_128f_t a, float *ret_sum)
} }


/***************************************************************** /*****************************************************************
* 3. Inline utilities for ps vectors (4 floats in esl_neon_128f_t) * 4. Inlined functions: left, right shifts
*****************************************************************/ *****************************************************************/


/* Function: esl_neon_select_float()
 * Synopsis: NEON equivalent of <vec_sel()>
 *
 * Purpose:  Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
 *           is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
 *
 *           Useful for avoiding conditional branches. For example,
 *           to implement \ccode{if (a > 0) a += a;}:
 *
 *           \begin{cchunk}
 *             mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
 *             twoa = _mm_add_ps(a, a);
 *             a    = esl_sse_select_ps(a, twoa, mask);
 *           \end{cchunk}
 *
 */
static inline esl_neon_128f_t
esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
{
  esl_neon_128i_t av, bv, mv, mnot;
  esl_neon_128f_t r;

  /* Reinterpret all three float vectors as integer bit patterns,
   * so the select can be done with bitwise logic: (b & mask) | (a & ~mask).
   */
  av.s64x2 = vreinterpretq_s64_f32(a.f32x4);
  bv.s64x2 = vreinterpretq_s64_f32(b.f32x4);
  mv.s64x2 = vreinterpretq_s64_f32(mask.f32x4);

  mnot.s32x4 = vmvnq_s32(mv.s32x4);              /* ~mask                   */
  av.s64x2   = vandq_s64(mnot.s64x2, av.s64x2);  /* keep a where mask = 0's */
  bv.s64x2   = vandq_s64(bv.s64x2, mv.s64x2);    /* keep b where mask = 1's */

  r.f32x4 = vreinterpretq_f32_s64(vorrq_s64(av.s64x2, bv.s64x2));
  return r;
}


/* Function: esl_neon_any_gt_float()
 * Synopsis: Returns TRUE if any a[z] > b[z]
 *
 * Purpose:  Returns TRUE if any a[z] > b[z] in two
 *           <ps> vectors of floats.
 *
 * Note:     Ported from esl_sse.c::esl_sse_any_gt_float().
 */
static inline int
esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128i_t mask;
  uint64_t l0, l1;

  /* vcgtq_f32 sets each 32-bit lane to all 1's where a[z] > b[z], all 0's otherwise. */
  mask.u32x4 = vcgtq_f32(a.f32x4,b.f32x4);
  /* Extract as full 64-bit lanes. These must stay 64-bit: truncating to a
   * 32-bit int would discard the upper half of each lane, so a hit in float
   * lane 1 or 3 alone (lane pattern 0xFFFFFFFF00000000) would be lost and
   * the function would wrongly return FALSE.
   */
  l0 = vgetq_lane_u64(mask.u64x2, 0);
  l1 = vgetq_lane_u64(mask.u64x2, 1);
  return (l0 | l1) != 0;
}






/* Function: esl_neon_rightshift_float() /* Function: esl_neon_rightshift_float()
* Synopsis: Shift vector elements to the right. * Synopsis: Shift vector elements to the right.
Expand Down Expand Up @@ -320,8 +263,8 @@ esl_neon_leftshift_float(esl_neon_128f_t a, esl_neon_128f_t b)




/***************************************************************** /*****************************************************************
* 4. Inlined utilities for epu8 vectors (16 uchars in __m128i) * 5. Inlined functions: any_gt
*****************************************************************/ *****************************************************************/


/* Function: esl_neon_any_gt_s16() /* Function: esl_neon_any_gt_s16()
* Synopsis: Returns TRUE if any a[z] > b[z]. * Synopsis: Returns TRUE if any a[z] > b[z].
Expand All @@ -343,6 +286,65 @@ esl_neon_any_gt_s16(esl_neon_128i_t a, esl_neon_128i_t b)
return maskbits != 0; return maskbits != 0;
} }


/* Function: esl_neon_any_gt_float()
 * Synopsis: Returns TRUE if any a[z] > b[z]
 *
 * Purpose:  Returns TRUE if any a[z] > b[z] in two
 *           <ps> vectors of floats.
 *
 * Note:     Ported from esl_sse.c::esl_sse_any_gt_float().
 */
static inline int
esl_neon_any_gt_float(esl_neon_128f_t a, esl_neon_128f_t b)
{
  esl_neon_128i_t mask;
  uint64_t l0, l1;

  /* vcgtq_f32 sets each 32-bit lane to all 1's where a[z] > b[z], all 0's otherwise. */
  mask.u32x4 = vcgtq_f32(a.f32x4,b.f32x4);
  /* Extract as full 64-bit lanes. These must stay 64-bit: truncating to a
   * 32-bit int would discard the upper half of each lane, so a hit in float
   * lane 1 or 3 alone (lane pattern 0xFFFFFFFF00000000) would be lost and
   * the function would wrongly return FALSE.
   */
  l0 = vgetq_lane_u64(mask.u64x2, 0);
  l1 = vgetq_lane_u64(mask.u64x2, 1);
  return (l0 | l1) != 0;
}



/*****************************************************************
* 6. Inlined functions: select
*****************************************************************/

/* Function: esl_neon_select_float()
 * Synopsis: NEON equivalent of <vec_sel()>
 *
 * Purpose:  Vector select. Returns a vector <r[z] = a[z]> where <mask[z]>
 *           is all 0's; <r[z] = b[z]> where <mask[z]> is all 1's.
 *
 *           Useful for avoiding conditional branches. For example,
 *           to implement \ccode{if (a > 0) a += a;}:
 *
 *           \begin{cchunk}
 *             mask = _mm_cmpgt_ps(a, _mm_setzero_ps());
 *             twoa = _mm_add_ps(a, a);
 *             a    = esl_sse_select_ps(a, twoa, mask);
 *           \end{cchunk}
 *
 */
static inline esl_neon_128f_t
esl_neon_select_float(esl_neon_128f_t a, esl_neon_128f_t b, esl_neon_128f_t mask)
{
  esl_neon_128i_t av, bv, mv, mnot;
  esl_neon_128f_t r;

  /* Reinterpret all three float vectors as integer bit patterns,
   * so the select can be done with bitwise logic: (b & mask) | (a & ~mask).
   */
  av.s64x2 = vreinterpretq_s64_f32(a.f32x4);
  bv.s64x2 = vreinterpretq_s64_f32(b.f32x4);
  mv.s64x2 = vreinterpretq_s64_f32(mask.f32x4);

  mnot.s32x4 = vmvnq_s32(mv.s32x4);              /* ~mask                   */
  av.s64x2   = vandq_s64(mnot.s64x2, av.s64x2);  /* keep a where mask = 0's */
  bv.s64x2   = vandq_s64(bv.s64x2, mv.s64x2);    /* keep b where mask = 1's */

  r.f32x4 = vreinterpretq_f32_s64(vorrq_s64(av.s64x2, bv.s64x2));
  return r;
}






Expand Down

0 comments on commit 6619945

Please sign in to comment.