Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
165 lines (140 sloc) 6.7 KB
/* Vectorized routines for x86 AVX-512 instructions.
*
* This header file, unusually, provides many complete function
* implementations so they can be inlined by the compiler.
*
* Contents:
* 1. Function declarations for esl_avx512.c
* 2. Inlined functions: horizontal max, sum
* 3. Inlined functions: left, right shift
*/
#ifndef eslAVX512_INCLUDED
#define eslAVX512_INCLUDED
#include "esl_config.h"
#ifdef eslENABLE_AVX512
#include "easel.h"
#include <stdio.h>
#include <x86intrin.h>
/*****************************************************************
* 1. Function declarations for esl_avx512.c
*****************************************************************/
/* Implemented in esl_avx512.c, not inlined here.
 * NOTE(review): judging by the name alone, this dumps the bytes of <v> in
 * hexadecimal for debugging -- confirm against the definition in esl_avx512.c.
 */
extern void esl_avx512_dump_512i_hex8(__m512i v);
/*****************************************************************
* 2. Inlined functions: horizontal max, sum
*****************************************************************/
/* Function: esl_avx512_hmax_epu8()
* Synopsis: Return max of 64 unsigned uint8_t elements in epu8 vector.
*/
static inline uint8_t
esl_avx512_hmax_epu8(__m512i a)
{
// Use AVX instructions for this because AVX-512 can't extract 8-bit quantities
// Intel has stated that there will be no performance penalty for switching between AVX-512 and AVX
__m256i b = _mm256_max_epu8(_mm512_extracti32x8_epi32(a, 0), _mm512_extracti32x8_epi32(a, 1));
b = _mm256_max_epu8(b, _mm256_permute2x128_si256(b, b, 0x01));
b = _mm256_max_epu8(b, _mm256_shuffle_epi32 (b, 0x4e));
b = _mm256_max_epu8(b, _mm256_shuffle_epi32 (b, 0xb1));
b = _mm256_max_epu8(b, _mm256_shufflelo_epi16 (b, 0xb1));
b = _mm256_max_epu8(b, _mm256_srli_si256 (b, 1));
return _mm256_extract_epi8(b, 0); // epi8 is fine here. gets cast properly to uint8_t on return.
}
/* Function: esl_avx512_hmax_epu8()
* Synopsis: Return max of 64 unsigned uint8_t elements in epu8 vector.
* Incept: SRE, Thu May 25 13:20:45 2017 [Old 97's, Oppenheimer]
*/
static inline int8_t
esl_avx512_hmax_epi8(__m512i a)
{
__m256i b = _mm256_max_epi8(_mm512_extracti32x8_epi32(a, 0), _mm512_extracti32x8_epi32(a, 1));
b = _mm256_max_epi8(b, _mm256_permute2x128_si256(b, b, 0x01));
b = _mm256_max_epi8(b, _mm256_shuffle_epi32 (b, 0x4e));
b = _mm256_max_epi8(b, _mm256_shuffle_epi32 (b, 0xb1));
b = _mm256_max_epi8(b, _mm256_shufflelo_epi16 (b, 0xb1));
b = _mm256_max_epi8(b, _mm256_srli_si256 (b, 1));
return _mm256_extract_epi8(b, 0);
}
/* Function: esl_avx512_hmax_epi16()
* Synopsis: Return max of 32 signed int8_t elements in epi16 vector.
*/
static inline int16_t
esl_avx512_hmax_epi16(__m512i a)
{
__m256i b = _mm256_max_epi16(_mm512_extracti32x8_epi32(a, 0), _mm512_extracti32x8_epi32(a, 1));
b = _mm256_max_epi16(b, _mm256_permute2x128_si256(b, b, 0x01));
b = _mm256_max_epi16(b, _mm256_shuffle_epi32 (b, 0x4e));
b = _mm256_max_epi16(b, _mm256_shuffle_epi32 (b, 0xb1));
b = _mm256_max_epi16(b, _mm256_shufflelo_epi16 (b, 0xb1));
return _mm256_extract_epi16(b, 0);
}
/* Function:  esl_avx512_hsum_ps()
 * Synopsis:  Sum the 16 floats of an __m512 vector; result in <*ret_sum>.
 *
 * Purpose:   Computes the sum of the sixteen 32-bit float elements of <a>
 *            with four swap-and-add steps (halves, 128-bit lanes, 64-bit
 *            pairs, adjacent floats). After the last step every element of
 *            the working vector holds the total, and element 0 is stored
 *            to <*ret_sum>.
 *
 * Args:      a       - vector of 16 floats to sum
 *            ret_sum - RETURN: the sum; must point to a valid float
 *
 * Returns:   (void)
 */
static inline void
esl_avx512_hsum_ps(__m512 a, float *ret_sum)
{
  __m512 swapped;
  __m512 sum;

  swapped = _mm512_shuffle_f32x4(a, a, 0x4e);      /* exchange the two 256-bit halves            */
  sum     = _mm512_add_ps(a, swapped);
  swapped = _mm512_shuffle_f32x4(sum, sum, 0xb1);  /* exchange adjacent 128-bit lanes            */
  sum     = _mm512_add_ps(sum, swapped);
  swapped = _mm512_shuffle_ps(sum, sum, 0x4e);     /* exchange the 64-bit halves of each lane    */
  sum     = _mm512_add_ps(sum, swapped);
  swapped = _mm512_shuffle_ps(sum, sum, 0xb1);     /* exchange adjacent floats                   */
  sum     = _mm512_add_ps(sum, swapped);           /* every element now holds the full sum       */

  /* Read element 0 as a float directly. Writing an int through a cast of
   * <ret_sum> would break strict aliasing; the 512->128 cast is free and
   * _mm_cvtss_f32 returns the low float unchanged.
   */
  *ret_sum = _mm_cvtss_f32(_mm512_castps512_ps128(sum));
}
/*****************************************************************
* 3. Inlined functions: left and right shifts
*****************************************************************/
/* Function:  esl_avx512_rightshift_int8()
 * Synopsis:  Shift int8 vector elements to the right, shifting -inf on.
 * Incept:    SRE, Sun Jun  4 17:43:14 2017
 * See:       esl_sse.h::esl_sse_rightshift_int8()
 *
 * Purpose:   Moves every byte of <v> up by one position (element i of the
 *            result is element i-1 of <v>), leaves zero in element 0, and
 *            then ORs <neginfmask> onto the result.
 *
 * Args:      v          - vector of 64 int8 elements
 *            neginfmask - OR'ed onto the shifted vector. Presumably holds
 *                         the -infinity byte in element 0 and zeros
 *                         elsewhere; verify against callers.
 *
 * Returns:   the shifted vector.
 */
static inline __m512i
esl_avx512_rightshift_int8(__m512i v, __m512i neginfmask)
{
  /* Each 128-bit lane receives its lower neighbour; lane 0 is zeroed by the mask.
   * (AVX-512 has no permute2x128, hence the masked lane shuffle.)
   */
  __m512i below   = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90);
  /* Per lane: take the top byte of <below>, followed by the low 15 bytes of <v>. */
  __m512i shifted = _mm512_alignr_epi8(v, below, 15);

  return _mm512_or_si512(shifted, neginfmask);
}
/* Function:  esl_avx512_rightshift_int16()
 * Synopsis:  Shift int16 vector elements to the right, shifting -inf on.
 * Incept:    SRE, Sun Jun  4 17:49:02 2017
 * See:       esl_sse.h::esl_sse_rightshift_int16()
 *
 * Purpose:   Moves every 16-bit element of <v> up by one position (element
 *            i of the result is element i-1 of <v>), leaves zero in
 *            element 0, and then ORs <neginfmask> onto the result.
 *
 * Args:      v          - vector of 32 int16 elements
 *            neginfmask - OR'ed onto the shifted vector. Presumably holds
 *                         the -infinity value in element 0 and zeros
 *                         elsewhere; verify against callers.
 *
 * Returns:   the shifted vector.
 */
static inline __m512i
esl_avx512_rightshift_int16(__m512i v, __m512i neginfmask)
{
  /* Each 128-bit lane receives its lower neighbour; lane 0 is zeroed by the mask. */
  __m512i below   = _mm512_maskz_shuffle_i32x4(0xfff0, v, v, 0x90);
  /* Per lane: take the top 2 bytes of <below>, followed by the low 14 bytes of <v>. */
  __m512i shifted = _mm512_alignr_epi8(v, below, 14);

  return _mm512_or_si512(shifted, neginfmask);
}
/* Function:  esl_avx512_rightshiftz_float()
 * Synopsis:  Shift float vector elements to the right, shifting zero on.
 * Incept:    SRE, Sun Jun  4 17:59:54 2017
 * See:       esl_sse.h::esl_sse_rightshiftz_float()
 *
 * Purpose:   Moves every float of <v> up by one position (element i of the
 *            result is element i-1 of <v>) and places zero bits, i.e.
 *            +0.0f, in element 0. The work is done on the raw bit pattern
 *            with integer lane/byte operations.
 *
 * Returns:   the shifted vector.
 */
static inline __m512
esl_avx512_rightshiftz_float(__m512 v)
{
  __m512i bits    = _mm512_castps_si512(v);            /* reinterpret only; no conversion */
  /* Each 128-bit lane receives its lower neighbour; lane 0 is zeroed by the mask. */
  __m512i below   = _mm512_maskz_shuffle_i32x4(0xfff0, bits, bits, 0x90);
  /* Per lane: take the top float of <below>, followed by the low three floats of <v>. */
  __m512i shifted = _mm512_alignr_epi8(bits, below, 12);

  return _mm512_castsi512_ps(shifted);
}
/* Function:  esl_avx512_leftshiftz_float()
 * Synopsis:  Shift float vector elements to the left, shifting zero on.
 * Incept:    SRE, Sun Jun  4 18:04:34 2017
 * See:       esl_sse.h::esl_sse_leftshiftz_float()
 *
 * Purpose:   Moves every float of <v> down by one position (element i of
 *            the result is element i+1 of <v>) and places zero bits, i.e.
 *            +0.0f, in the last element. The work is done on the raw bit
 *            pattern with integer lane/byte operations.
 *
 * Returns:   the shifted vector.
 */
static inline __m512
esl_avx512_leftshiftz_float(__m512 v)
{
  __m512i bits    = _mm512_castps_si512(v);            /* reinterpret only; no conversion */
  /* Each 128-bit lane receives its upper neighbour; the top lane is zeroed by the mask. */
  __m512i above   = _mm512_maskz_shuffle_i32x4(0x0fff, bits, bits, 0x39);
  /* Per lane: take the upper three floats of <v>, followed by the first float of <above>. */
  __m512i shifted = _mm512_alignr_epi8(above, bits, 4);

  return _mm512_castsi512_ps(shifted);
}
#endif // eslENABLE_AVX512
#endif // eslAVX512_INCLUDED
You can’t perform that action at this time.