Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
691 lines (583 sloc) 26.3 KB
/* Vectorized routines for Intel/AMD processors, using Streaming SIMD Extensions (SSE).
*
* Contents:
* 1. SIMD logf(), expf()
* 2. Utilities for ps vectors (4 floats in a __m128)
* 3. Benchmark
* 4. Unit tests
* 5. Test driver
* 6. Example
*
*****************************************************************
*
* This code is conditionally compiled, only when <eslENABLE_SSE> or
* <eslENABLE_SSE4> was set in <esl_config.h> by the configure script,
* and that will only happen on x86 platforms. When neither
* <eslENABLE_SSE> nor <eslENABLE_SSE4> are set, we include some dummy
* code to silence compiler and ranlib warnings about empty
* translation units and no symbols, and dummy drivers that do nothing
* but declare success.
*
*****************************************************************
* Credits:
*
* The logf() and expf() routines are derivatives of routines by
* Julien Pommier [http://gruntthepeon.free.fr/ssemath/]. Those
* routines were in turn based on serial implementations in the Cephes
* math library by Stephen Moshier [Moshier89;
* http://www.moshier.net/#Cephes]. Thanks and credit to both Moshier
* and Pommier for their clear code. Additional copyright and license
* information is appended at the end of the file.
*/
#include "esl_config.h"
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4)
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <x86intrin.h>
#include "easel.h"
#include "esl_sse.h"
/*****************************************************************
* 1. SSE SIMD logf(), expf()
*****************************************************************/
/* Function: esl_sse_logf()
* Synopsis: <r[z] = log x[z]>
* Incept: SRE, Fri Dec 14 11:32:54 2007 [Janelia]
*
* Purpose: Given a vector <x> containing four floats, returns a
* vector <r> in which each element <r[z] = logf(x[z])>.
*
* Valid in the domain $x_z > 0$ for normalized IEEE754
* $x_z$.
*
* For <x> $< 0$, including -0, returns <NaN>. For <x> $==
* 0$ or subnormal <x>, returns <-inf>. For <x = inf>,
* returns <inf>. For <x = NaN>, returns <NaN>. For
* subnormal <x>, returns <-inf>.
*
* Xref: J2/71.
*
* Note: Derived from an SSE1 implementation by Julian
* Pommier. Converted to SSE2 and added handling
* of IEEE754 specials.
*/
__m128
esl_sse_logf(__m128 x)
{
static float cephes_p[9] = { 7.0376836292E-2f, -1.1514610310E-1f, 1.1676998740E-1f,
-1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f,
2.0000714765E-1f, -2.4999993993E-1f, 3.3333331174E-1f };
__m128 onev = _mm_set1_ps(1.0f); /* all elem = 1.0 */
__m128 v0p5 = _mm_set1_ps(0.5f); /* all elem = 0.5 */
__m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */
__m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */
__m128i ei;
__m128 e;
__m128 invalid_mask, zero_mask, inf_mask; /* masks used to handle special IEEE754 inputs */
__m128 mask;
__m128 origx;
__m128 tmp;
__m128 y;
__m128 z;
/* first, split x apart: x = frexpf(x, &e); */
ei = _mm_srli_epi32( _mm_castps_si128(x), 23); /* shift right 23: IEEE754 floats: ei = biased exponents */
invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg)); /* mask any elem that's negative; these become NaN */
zero_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128())); /* mask any elem zero or subnormal; these become -inf */
inf_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp)); /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN */
origx = x; /* store original x, used for log(inf) = inf, log(NaN) = NaN */
x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand */
x = _mm_or_ps (x, v0p5); /* sets hidden bit b[0] */
ei = _mm_sub_epi32(ei, _mm_set1_epi32(126)); /* -127 (ei now signed base-2 exponent); then +1 */
e = _mm_cvtepi32_ps(ei);
/* now, calculate the log */
mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches. */
tmp = _mm_and_ps(x, mask); /* tmp contains x values < 0.707, else 0 */
x = _mm_sub_ps(x, onev);
e = _mm_sub_ps(e, _mm_and_ps(onev, mask));
x = _mm_add_ps(x, tmp);
z = _mm_mul_ps(x,x);
y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8])); y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, v0p5);
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
/* IEEE754 cleanup: */
x = esl_sse_select_ps(x, origx, inf_mask); /* log(inf)=inf; log(NaN) = NaN */
x = _mm_or_ps(x, invalid_mask); /* log(x<0, including -0,-inf) = NaN */
x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal = -inf */
return x;
}
/* Function: esl_sse_expf()
* Synopsis: <r[z] = exp x[z]>
* Incept: SRE, Fri Dec 14 14:46:27 2007 [Janelia]
*
* Purpose: Given a vector <x> containing four floats, returns a
* vector <r> in which each element <r[z] = expf(x[z])>.
*
* Valid for all IEEE754 floats $x_z$.
*
* Xref: J2/71
* J10/62: bugfix, minlogf/maxlogf range was too wide;
* (k+127) must be >=0 and <=255, so (k+127)<<23
* is a valid IEEE754 float, without touching
* the sign bit. Pommier had this right in the
* first place, and I didn't understand.
*
* Note: Derived from an SSE1 implementation by Julian
* Pommier. Converted to SSE2.
*
* Note on maxlogf/minlogf, which are close to but not
* exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
* k+127 is 0..255, a valid IEEE754 8-bit exponent
* (0..255), so the bit pattern (k+127)<<23 is IEEE754
* single-precision for 2^k. If k=-127, we get IEEE754 0.
* If k=128, we get IEEE754 +inf. If k<-127, k+127 is
* negative and we get screwed up. If k>128, k+127
* overflows the 8-bit exponent and sets the sign bit. So
* for x' (base 2) < -127.5 we must definitely return e^x ~
* 0; for x' < 126.5 we're going to calculate 0 anyway
* (because k=floor(-126.5-epsilon+0.5) = -127). So any
* minlogf between -126.5 log2 ... -127.5 log2 will suffice
* as the cutoff. Ditto for 126.5 log2 .. 127.5log2.
* That's 87.68312 .. 88.3762655. I think Pommier's
* thinking is, you don't want to get to close to the
* edges, lest fp roundoff error screw you (he may have
* consider 1 ulp carefully, I can't tell), but otherwise
* you may as well put your bounds close to the outer edge;
* so
* maxlogf = 127.5 log(2) - epsilon
* minlogf = -127.5 log(2) + epsilon
* for an epsilon that happen to be ~ 3e-6.
*/
__m128
esl_sse_expf(__m128 x)
{
static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f };
static float cephes_c[2] = { 0.693359375f, -2.12194440e-4f };
static float maxlogf = 88.3762626647949f; /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
static float minlogf = -88.3762626647949f; /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
__m128i k;
__m128 mask, tmp, fx, z, y, minmask, maxmask;
/* handle out-of-range and special conditions */
maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));
/* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
fx = _mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R));
fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));
/* floorf() with SSE: */
k = _mm_cvttps_epi32(fx); /* cast to int with truncation */
tmp = _mm_cvtepi32_ps(k); /* cast back to float */
mask = _mm_cmpgt_ps(tmp, fx); /* if it increased (i.e. if it was negative...) */
mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch... */
fx = _mm_sub_ps(tmp, mask); /* then subtract one. */
k = _mm_cvttps_epi32(fx); /* k is now ready for the 2^k part. */
/* polynomial approx for e^f for f in range [-0.5, 0.5] */
tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
z = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
z = _mm_mul_ps(x, x);
y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(1.0f));
/* build 2^k by hand, by creating a IEEE754 float */
k = _mm_add_epi32(k, _mm_set1_epi32(127));
k = _mm_slli_epi32(k, 23);
fx = _mm_castsi128_ps(k);
/* put 2^k e^f together (fx = 2^k, y = e^f) and we're done */
y = _mm_mul_ps(y, fx);
/* special/range cleanup */
y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128) */
y = esl_sse_select_ps(y, _mm_set1_ps(0.0f), minmask); /* exp(x) = 0 for x < log(2^-149) */
return y;
}
/*****************************************************************
* 2. Utilities for ps vectors (4 floats in a __m128)
*****************************************************************/
void
esl_sse_dump_ps(FILE *fp, __m128 v)
{
float *p = (float *)&v;
fprintf(fp, "[%13.8g, %13.8g, %13.8g, %13.8g]", p[0], p[1], p[2], p[3]);
}
/*****************************************************************
* 3. Benchmark
*****************************************************************/
#ifdef eslSSE_BENCHMARK
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_stopwatch.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "benchmark driver for sse module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_STOPWATCH *w = esl_stopwatch_Create();
int N = esl_opt_GetInteger(go, "-N");
float origx = 2.0;
float x = origx;
__m128 xv = _mm_set1_ps(x);
int i;
/* First, serial time. */
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) { x = logf(x); x = expf(x); }
esl_stopwatch_Stop(w);
esl_stopwatch_Display(stdout, w, "# serial CPU time: ");
/* Vector time */
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) { xv = esl_sse_logf(xv); xv = esl_sse_expf(xv); }
esl_stopwatch_Stop(w);
esl_stopwatch_Display(stdout, w, "# vector CPU time: ");
/* If you don't do something with x and xv, the compiler may optimize them away */
printf("%g => many scalar logf,expf cycles => %g\n", origx, x);
printf("%g => many vector logf,expf cycles => ", origx);
esl_sse_dump_ps(stdout, xv); printf("\n");
esl_stopwatch_Destroy(w);
esl_getopts_Destroy(go);
return 0;
}
#endif /*eslSSE_BENCHMARK*/
/*****************************************************************
* 4. Unit tests
*****************************************************************/
#ifdef eslSSE_TESTDRIVE
#include "esl_getopts.h"
#include "esl_random.h"
/* utest_logf(): Test range/domain of logf */
static void
utest_logf(ESL_GETOPTS *go)
{
__m128 x; /* test input */
union { __m128 v; float x[4]; } r; /* test output */
/* Test IEEE754 specials:
* log(-inf) = NaN log(x<0) = NaN log(-0) = NaN
* log(0) = -inf log(inf) = inf log(NaN) = NaN
*/
x = _mm_set_ps(0.0, -0.0, -1.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */
r.v = esl_sse_logf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("logf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (! isnan(r.x[0])) esl_fatal("logf(-inf) should be NaN");
if (! isnan(r.x[1])) esl_fatal("logf(-1) should be NaN");
if (! isnan(r.x[2])) esl_fatal("logf(-0) should be NaN");
if (! (r.x[3] < 0 && isinf(r.x[3]))) esl_fatal("logf(0) should be -inf");
x = _mm_set_ps(FLT_MAX, FLT_MIN, eslNaN, eslINFINITY);
r.v = esl_sse_logf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("logf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (! isinf(r.x[0])) esl_fatal("logf(inf) should be inf");
if (! isnan(r.x[1])) esl_fatal("logf(NaN) should be NaN");
}
/* utest_expf(): Test range/domain of expf */
static void
utest_expf(ESL_GETOPTS *go)
{
__m128 x; /* test input */
union { __m128 v; float x[4]; } r; /* test output */
/* exp(-inf) = 0 exp(-0) = 1 exp(0) = 1 exp(inf) = inf exp(NaN) = NaN */
x = _mm_set_ps(eslINFINITY, 0.0, -0.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */
r.v = esl_sse_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (r.x[0] != 0.0f) esl_fatal("expf(-inf) should be 0");
if (r.x[1] != 1.0f) esl_fatal("expf(-0) should be 1");
if (r.x[2] != 1.0f) esl_fatal("expf(0) should be 1");
if (! isinf(r.x[3])) esl_fatal("expf(inf) should be inf");
/* exp(NaN) = NaN exp(large) = inf exp(-large) = 0 exp(1) = exp(1) */
x = _mm_set_ps(1.0f, -666.0f, 666.0f, eslNaN); /* set_ps() is in order 3 2 1 0 */
r.v = esl_sse_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (! isnan(r.x[0])) esl_fatal("expf(NaN) should be NaN");
if (! isinf(r.x[1])) esl_fatal("expf(large x) should be inf");
if (r.x[2] != 0.0f) esl_fatal("expf(-large x) should be 0");
/* Make sure we are correct around the problematic ~minlogf boundary:
* (1) e^x for x < -127.5 log2 + epsilon is 0, because that's our minlogf barrier.
* (2) e^x for -127.5 log2 < x < -126.5 log2 is 0 too, but is actually calculated
* (3) e^x for -126.5 log2 < x should be finite (and close to FLT_MIN)
*
* minlogf = -127.5 log(2) + epsilon = -88.3762626647949;
* and -126.5 log(2) = -87.68311834
* so for
* (1): expf(-88.3763) => 0
* (2): expf(-88.3762) => 0
* (3): expf(-87.6832) => 0
* (4): expf(-87.6831) => <FLT_MIN (subnormal) : ~8.31e-39 (may become 0 in flush-to-zero mode for subnormals)
*/
x = _mm_set_ps(-88.3763, -88.3762, -87.6832, -87.6831);
r.v = esl_sse_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if ( r.x[0] >= FLT_MIN) esl_fatal("expf( -126.5 log2 + eps) should be around FLT_MIN");
if ( r.x[1] != 0.0f) esl_fatal("expf( -126.5 log2 - eps) should be 0.0 (by calculation)");
if ( r.x[2] != 0.0f) esl_fatal("expf( -127.5 log2 + eps) should be 0.0 (by calculation)");
if ( r.x[3] != 0.0f) esl_fatal("expf( -127.5 log2 - eps) should be 0.0 (by min bound): %g", r.x[0]);
}
/* utest_odds(): test accuracy of logf, expf on odds ratios,
* our main intended use.
*/
static void
utest_odds(ESL_GETOPTS *go, ESL_RANDOMNESS *r)
{
int N = esl_opt_GetInteger(go, "-N");
int verbose = esl_opt_GetBoolean(go, "-v");
int very_verbose = esl_opt_GetBoolean(go, "--vv");
int i;
float p1, p2, odds;
union { __m128 v; float x[4]; } r1;
union { __m128 v; float x[4]; } r2;
float scalar_r1, scalar_r2;
double err1, maxerr1 = 0.0, avgerr1 = 0.0; /* errors on logf() */
double err2, maxerr2 = 0.0, avgerr2 = 0.0; /* errors on expf() */
for (i = 0; i < N; i++)
{
p1 = esl_rnd_UniformPositive(r);
p2 = esl_rnd_UniformPositive(r);
odds = p1 / p2;
if (odds == 0.0) esl_fatal("whoa, odds ratio can't be 0!\n");
r1.v = esl_sse_logf(_mm_set1_ps(odds)); /* r1.x[z] = log(p1/p2) */
scalar_r1 = log(odds);
err1 = (r1.x[0] == 0. && scalar_r1 == 0.) ? 0.0 : 2 * fabs(r1.x[0] - scalar_r1) / fabs(r1.x[0] + scalar_r1);
if (err1 > maxerr1) maxerr1 = err1;
avgerr1 += err1 / (float) N;
if (isnan(avgerr1)) esl_fatal("whoa, what?\n");
r2.v = esl_sse_expf(r1.v); /* and back to odds */
scalar_r2 = exp(r1.x[0]);
err2 = (r2.x[0] == 0. && scalar_r2 == 0.) ? 0.0 : 2 * fabs(r2.x[0] - scalar_r2) / fabs(r2.x[0] + scalar_r2);
if (err2 > maxerr2) maxerr2 = err2;
avgerr2 += err2 / (float) N;
if (very_verbose)
printf("%13.7g %13.7g %13.7g %13.7g %13.7g %13.7g %13.7g\n", odds, scalar_r1, r1.x[0], scalar_r2, r2.x[0], err1, err2);
}
if (verbose) {
printf("Average [max] logf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr1, maxerr1);
printf("Average [max] expf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr2, maxerr2);
printf("(random seed : %" PRIu32 ")\n", esl_randomness_GetSeed(r));
}
if (avgerr1 > 1e-8) esl_fatal("average error on logf() is intolerable\n");
if (maxerr1 > 1e-6) esl_fatal("maximum error on logf() is intolerable\n");
if (avgerr2 > 1e-8) esl_fatal("average error on expf() is intolerable\n");
if (maxerr2 > 1e-6) esl_fatal("maximum error on expf() is intolerable\n");
}
static void
utest_hmax_epu8(ESL_RANDOMNESS *rng)
{
union { __m128i v; uint8_t x[16]; } u;
uint8_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = 0;
for (z = 0; z < 16; z++)
{
u.x[z] = (uint8_t) (esl_rnd_Roll(rng, 256)); // 0..255
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_sse_hmax_epu8(u.v);
if (r1 != r2) esl_fatal("hmax_epu8 utest failed");
}
}
static void
utest_hmax_epi8(ESL_RANDOMNESS *rng)
{
#ifdef eslENABLE_SSE4 // no-op if eslENABLE_SSE only
union { __m128i v; int8_t x[16]; } u;
int8_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = -128;
for (z = 0; z < 16; z++)
{
u.x[z] = (int8_t) (esl_rnd_Roll(rng, 256) - 128); // -128..127
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_sse_hmax_epi8(u.v);
if (r1 != r2) esl_fatal("hmax_epi8 utest failed");
}
#endif // eslENABLE_SSE4
}
static void
utest_hmax_epi16(ESL_RANDOMNESS *rng)
{
union { __m128i v; int16_t x[8]; } u;
int16_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = -32768;
for (z = 0; z < 8; z++)
{
u.x[z] = (int16_t) (esl_rnd_Roll(rng, 65536) - 32768); // -32768..32767
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_sse_hmax_epi16(u.v);
if (r1 != r2) esl_fatal("hmax_epi16 utest failed: %d != %d", r1, r2);
}
}
#endif /*eslSSE_TESTDRIVE*/
/*****************************************************************
* 5. Test driver
*****************************************************************/
#ifdef eslSSE_TESTDRIVE
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_sse.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of random test points", 0 },
{ "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
{ "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be verbose: show test report", 0 },
{ "--vv", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be very verbose: show individual test samples", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "test driver for sse module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));;
fprintf(stderr, "## %s\n", argv[0]);
fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
utest_logf(go);
utest_expf(go);
utest_odds(go, rng);
utest_hmax_epu8(rng);
utest_hmax_epi8(rng);
utest_hmax_epi16(rng);
fprintf(stderr, "# status = ok\n");
esl_randomness_Destroy(rng);
esl_getopts_Destroy(go);
return 0;
}
#endif /* eslSSE_TESTDRIVE*/
/*****************************************************************
* 6. Example
*****************************************************************/
#ifdef eslSSE_EXAMPLE
/*::cexcerpt::sse_example::begin::*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_sse.h"
int
main(int argc, char **argv)
{
float x; /* scalar input */
__m128 xv; /* input vector */
union { __m128 v; float x[4]; } rv; /* result vector*/
x = 2.0;
xv = _mm_set1_ps(x);
rv.v = esl_sse_logf(xv);
printf("logf(%f) = %f\n", x, rv.x[0]);
rv.v = esl_sse_expf(xv);
printf("expf(%f) = %f\n", x, rv.x[0]);
return 0;
}
/*::cexcerpt::sse_example::end::*/
#endif /*eslSSE_EXAMPLE*/
#else // ! (eslENABLE_SSE || eslENABLE_SSE4)
/* If we don't have SSE compiled in, provide some nothingness to:
* a. prevent Mac OS/X ranlib from bitching about .o file that "has no symbols"
* b. prevent compiler from bitching about "empty compilation unit"
* c. compile blank drivers and automatically pass the automated tests.
*/
void esl_sse_silence_hack(void) { return; }
#if defined eslSSE_TESTDRIVE || eslSSE_EXAMPLE || eslSSE_BENCHMARK
int main(void) { return 0; }
#endif
#endif // (eslENABLE_SSE || eslENABLE_SSE4) or not
/*****************************************************************
* additional copyright and license information for this file
*****************************************************************
* In addition to our own copyrights, esl_sse_logf() and esl_sse_expf() are also:
* Copyright (C) 2007 Julien Pommier
* Copyright (C) 1992 Stephen Moshier
*
* These functions derived from zlib-licensed routines by
* Julien Pommier, http://gruntthepeon.free.fr/ssemath/. The
* zlib license:
*
*-------------------------------------------------------------------------
* Copyright (C) 2007 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*-------------------------------------------------------------------------
*
* In turn, Pommier had derived the logf() and expf() functions from
* serial versions in the Cephes math library. According to its
* readme, Cephes is "copyrighted by the author" and "may be used
* freely but it comes with no support or guarantee." Cephes is
* available in NETLIB [http://www.netlib.org/cephes/]. NETLIB is
* widely considered to be a free scientific code repository, though
* the copyright and license status of many parts, including Cephes,
* is ill-defined. We have attached Moshier's copyright,
* to credit his original contribution. Thanks to both Pommier and
* Moshier for their clear code.
*/
You can’t perform that action at this time.