Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
734 lines (617 sloc) 26.5 KB
/* Vectorized routines for ARM processors, using NEON intrinsics.
*
* Table of contents:
* 1. SIMD logf(), expf()
* 2. Utilities for float vectors (4 floats in an esl_neon_128f_t)
* 3. Benchmark
* 4. Unit tests
* 5. Test driver
* 6. Example
*
*****************************************************************
*
* This code is conditionally compiled, only when <eslENABLE_NEON> was
* set in <esl_config.h> by the configure script, and that will only
* happen on ARM platforms. When <eslENABLE_NEON> is not set, we
* include some dummy code to silence compiler and ranlib warnings
* about empty translation units and no symbols, and dummy drivers
* that do nothing but declare success.
*
*****************************************************************
* Credits:
*
* The logf() and expf() routines are derivatives of routines by
* Julien Pommier [http://gruntthepeon.free.fr/ssemath/]. Those
* routines were in turn based on serial implementations in the Cephes
* math library by Stephen Moshier [Moshier89;
* http://www.moshier.net/#Cephes]. Thanks and credit to both Moshier
* and Pommier for their clear code. Additional copyright and license
* information is appended at the end of the file.
*/
#include "esl_config.h"
#ifdef eslENABLE_NEON
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include "easel.h"
#include "esl_neon.h"
/* Definitions for log/exp */
#define c_inv_mant_mask ~0x7f800000u
#define c_cephes_SQRTHF 0.707106781186547524
#define c_cephes_log_p0 7.0376836292E-2
#define c_cephes_log_p1 -1.1514610310E-1
#define c_cephes_log_p2 1.1676998740E-1
#define c_cephes_log_p3 -1.2420140846E-1
#define c_cephes_log_p4 +1.4249322787E-1
#define c_cephes_log_p5 -1.6668057665E-1
#define c_cephes_log_p6 +2.0000714765E-1
#define c_cephes_log_p7 -2.4999993993E-1
#define c_cephes_log_p8 +3.3333331174E-1
#define c_cephes_log_q1 -2.12194440e-4
#define c_cephes_log_q2 0.693359375
#define c_exp_hi 88.3762626647949f
#define c_exp_lo -88.3762626647949f
#define c_cephes_LOG2EF 1.44269504088896341
#define c_cephes_exp_C1 0.693359375
#define c_cephes_exp_C2 -2.12194440e-4
#define c_cephes_exp_p0 1.9875691500E-4
#define c_cephes_exp_p1 1.3981999507E-3
#define c_cephes_exp_p2 8.3334519073E-3
#define c_cephes_exp_p3 4.1665795894E-2
#define c_cephes_exp_p4 1.6666665459E-1
#define c_cephes_exp_p5 5.0000001201E-1
/*****************************************************************
* 1. NEON SIMD logf(), expf()
*****************************************************************/
/* Function: esl_neon_logf()
* Synopsis: <r[z] = log x[z]>
* Incept: Tyler Camp, summer 2016
*
* Purpose: Given a vector <x> containing four floats, returns a
* vector <r> in which each element <r[z] = logf(x[z])>.
*
* Valid in the domain $x_z > 0$ for normalized IEEE754
* $x_z$.
*
* For <x> $< 0$, including -0, returns <NaN>. For <x> $==
* 0$ or subnormal <x>, returns <-inf>. For <x = inf>,
* returns <inf>. For <x = NaN>, returns <NaN>. For
* subnormal <x>, returns <-inf>.
*
* Note: Derived from an ARM implementation by Julian
* Pommier. Added handling of IEEE754 specials.
*/
esl_neon_128f_t
esl_neon_logf(esl_neon_128f_t x)
{
esl_neon_128f_t one, e, tmp, z, y, inf_vector, neginf_vector;
esl_neon_128i_t nan_mask, emm0, ux, mask;
esl_neon_128i_t poszero_mask, inf_mask; /* Special IEEE754 inputs */
uint32_t negzero = (1 << 31);
esl_neon_128f_t zero_vector;
zero_vector.f32x4 = vdupq_n_f32(0.0f);
inf_vector.f32x4 = vdupq_n_f32(eslINFINITY); /* All floats with exponent bits high */
neginf_vector.f32x4 = vdupq_n_f32(-eslINFINITY); /* -inf */
one.f32x4 = vdupq_n_f32(1);
nan_mask.u32x4 = vcltq_f32(x.f32x4, vdupq_n_f32(0.0)); /* log(-x) = NaN */
nan_mask.u32x4 = vorrq_u32(vceqq_u32(vreinterpretq_u32_f32(x.f32x4),
vdupq_n_u32(negzero)),
nan_mask.u32x4); /* log(-0) = NaN */
/* Mask all other NaNs */
esl_neon_128i_t exp_hi_mask, mantissa_mask, mantissa_vector;
mantissa_vector.u32x4 = vdupq_n_u32(0x007FFFFF);
exp_hi_mask.u32x4 = vandq_u32(vreinterpretq_u32_f32(x.f32x4),
vreinterpretq_u32_f32(inf_vector.f32x4));
exp_hi_mask.u32x4 = vceqq_u32(exp_hi_mask.u32x4, vreinterpretq_u32_f32(inf_vector.f32x4));
mantissa_mask.u32x4 = vandq_u32(vreinterpretq_u32_f32(x.f32x4), mantissa_vector.u32x4);
mantissa_mask.u32x4 = vcgtq_u32(mantissa_mask.u32x4, vreinterpretq_u32_f32(zero_vector.f32x4));
nan_mask.u32x4 = vorrq_u32(vandq_u32(mantissa_mask.u32x4, exp_hi_mask.u32x4), nan_mask.u32x4);
ux.s32x4 = vreinterpretq_s32_f32(x.f32x4);
emm0.u32x4 = vshrq_n_u32(ux.u32x4, 23);
/* Mask 0 elements and infinity elements; log(0) = -inf, log(inf) = inf, log(NaN) = NaN */
poszero_mask.u32x4 = vceqq_f32(x.f32x4, zero_vector.f32x4);
/* (x == +0) : !(x < 0) && (x == 0) */
poszero_mask.u32x4 = vandq_u32(vmvnq_u32(nan_mask.u32x4), poszero_mask.u32x4);
/* +inf */
inf_mask.u32x4 = vceqq_f32(x.f32x4, inf_vector.f32x4);
/* keep only the fractional part */
ux.s32x4 = vandq_s32(ux.s32x4, vdupq_n_s32(c_inv_mant_mask));
ux.s32x4 = vorrq_s32(ux.s32x4, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
x.f32x4 = vreinterpretq_f32_s32(ux.s32x4);
emm0.s32x4 = vsubq_s32(emm0.s32x4, vdupq_n_s32(0x7f));
e.f32x4 = vcvtq_f32_s32(emm0.s32x4);
e.f32x4 = vaddq_f32(e.f32x4, one.f32x4);
/* part2:
if( x < SQRTHF ) {
e -= 1;
x = x + x - 1.0;
} else { x = x - 1.0; }
*/
mask.u32x4 = vcltq_f32(x.f32x4, vdupq_n_f32(c_cephes_SQRTHF));
tmp.f32x4 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x.f32x4), mask.u32x4));
x.f32x4 = vsubq_f32(x.f32x4, one.f32x4);
e.f32x4 = vsubq_f32(e.f32x4, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one.f32x4), mask.u32x4)));
x.f32x4 = vaddq_f32(x.f32x4, tmp.f32x4);
z.f32x4 = vmulq_f32(x.f32x4, x.f32x4);
y.f32x4 = vdupq_n_f32(c_cephes_log_p0);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p1), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p2), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p3), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p4), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p5), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p6), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p7), y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(vdupq_n_f32(c_cephes_log_p8), y.f32x4, x.f32x4);
y.f32x4 = vmulq_f32(y.f32x4, x.f32x4);
y.f32x4 = vmulq_f32(y.f32x4, z.f32x4);
tmp.f32x4 = vmulq_f32(e.f32x4, vdupq_n_f32(c_cephes_log_q1));
y.f32x4 = vaddq_f32(y.f32x4, tmp.f32x4);
tmp.f32x4 = vmulq_f32(z.f32x4, vdupq_n_f32(0.5f));
y.f32x4 = vsubq_f32(y.f32x4, tmp.f32x4);
tmp.f32x4 = vmulq_f32(e.f32x4, vdupq_n_f32(c_cephes_log_q2));
x.f32x4 = vaddq_f32(x.f32x4, y.f32x4);
x.f32x4 = vaddq_f32(x.f32x4, tmp.f32x4);
/* IEEE754 cleanup */
esl_neon_128f_t inf_mask_view, poszero_mask_view;
neginf_vector.f32x4 = vdupq_n_f32(-eslINFINITY); /* -inf */
inf_mask_view.f32x4 = vreinterpretq_f32_s32(inf_mask.s32x4);
poszero_mask_view.f32x4 = vreinterpretq_f32_s32(poszero_mask.s32x4);
x.f32x4 = vreinterpretq_f32_s32(vorrq_s32(vreinterpretq_s32_f32(x.f32x4),
nan_mask.s32x4)); /* log(x<0, including -0, -inf)=NaN */
x = esl_neon_select_float(x, neginf_vector, poszero_mask_view); /* mask +0 */
x = esl_neon_select_float(x, inf_vector, inf_mask_view); /* mask +inf */
return x;
}
/* Function: esl_neon_expf()
* Synopsis: <r[z] = exp x[z]>
* Incept: Tyler Camp, summer 2016
*
* Purpose: Given a vector <x> containing four floats, returns a
* vector <r> in which each element <r[z] = expf(x[z])>.
* Valid for all IEEE754 floats $x_z$.
*
* Note: Derived from a NEON implementation by Julian Pommier.
*/
esl_neon_128f_t
esl_neon_expf(esl_neon_128f_t x)
{
esl_neon_128f_t tmp, fx, one, z;
esl_neon_128i_t mask, maxmask, minmask;
/* handle out of range and special conditions */
maxmask.u32x4 = vcgtq_f32(x.f32x4, vdupq_n_f32(c_exp_hi));
minmask.u32x4 = vcleq_f32(x.f32x4, vdupq_n_f32(c_exp_lo));
one.f32x4 = vdupq_n_f32(1);
x.f32x4 = vminq_f32(x.f32x4, vdupq_n_f32(c_exp_hi));
x.f32x4 = vmaxq_f32(x.f32x4, vdupq_n_f32(c_exp_lo));
/* express exp(x) as exp(g + n*log(2)) */
fx.f32x4 = vmlaq_f32(vdupq_n_f32(0.5f), x.f32x4, vdupq_n_f32(c_cephes_LOG2EF));
/* perform a floorf */
tmp.f32x4 = vcvtq_f32_s32(vcvtq_s32_f32(fx.f32x4));
/* if greater, substract 1 */
mask.u32x4 = vcgtq_f32(tmp.f32x4, fx.f32x4);
mask.u32x4 = vandq_u32(mask.u32x4, vreinterpretq_u32_f32(one.f32x4));
fx.f32x4 = vsubq_f32(tmp.f32x4, vreinterpretq_f32_u32(mask.u32x4));
tmp.f32x4 = vmulq_f32(fx.f32x4, vdupq_n_f32(c_cephes_exp_C1));
z.f32x4 = vmulq_f32(fx.f32x4, vdupq_n_f32(c_cephes_exp_C2));
x.f32x4 = vsubq_f32(x.f32x4, tmp.f32x4);
x.f32x4 = vsubq_f32(x.f32x4, z.f32x4);
static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
esl_neon_128f_t y, c1, c2, c3, c4, c5, pow2n;
y.f32x4 = vld1q_dup_f32(cephes_exp_p+0);
c1.f32x4 = vld1q_dup_f32(cephes_exp_p+1);
c2.f32x4 = vld1q_dup_f32(cephes_exp_p+2);
c3.f32x4 = vld1q_dup_f32(cephes_exp_p+3);
c4.f32x4 = vld1q_dup_f32(cephes_exp_p+4);
c5.f32x4 = vld1q_dup_f32(cephes_exp_p+5);
y.f32x4 = vmulq_f32(y.f32x4, x.f32x4);
z.f32x4 = vmulq_f32(x.f32x4,x.f32x4);
y.f32x4 = vaddq_f32(y.f32x4, c1.f32x4);
y.f32x4 = vmlaq_f32(c2.f32x4, y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(c3.f32x4, y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(c4.f32x4, y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(c5.f32x4, y.f32x4, x.f32x4);
y.f32x4 = vmlaq_f32(x.f32x4, y.f32x4, z.f32x4);
y.f32x4 = vaddq_f32(y.f32x4, one.f32x4);
/* build 2^n */
esl_neon_128i_t mm;
mm.s32x4 = vcvtq_s32_f32(fx.f32x4);
mm.s32x4 = vaddq_s32(mm.s32x4, vdupq_n_s32(0x7f));
mm.s32x4 = vshlq_n_s32(mm.s32x4, 23);
pow2n.f32x4 = vreinterpretq_f32_s32(mm.s32x4);
y.f32x4 = vmulq_f32(y.f32x4, pow2n.f32x4);
/* special/range cleanup */
esl_neon_128f_t maxmask_view, minmask_view, zero_vec, inf_vec;
zero_vec.f32x4 = vdupq_n_f32(0.0f);
inf_vec.f32x4 = vdupq_n_f32(-eslINFINITY);
maxmask_view.f32x4 = vreinterpretq_f32_s32(maxmask.s32x4);
minmask_view.f32x4 = vreinterpretq_f32_s32(minmask.s32x4);
y = esl_neon_select_float(y, inf_vec, maxmask_view); /* exp(x) = inf for x > log(2^128) */
y = esl_neon_select_float(y, zero_vec, minmask_view); /* exp(x) = 0 for x < log(2^-149) */
return y;
}
/*****************************************************************
* 2. Utilities for fq vectors (4 floats in an esl_neon_128f_t)
*****************************************************************/
void
esl_neon_dump_float(FILE *fp, esl_neon_128f_t v)
{
float *p = (float *)&v;
fprintf(fp, "[%13.8g, %13.8g, %13.8g, %13.8g]", p[0], p[1], p[2], p[3]);
}
/*****************************************************************
* 3. Benchmark
*****************************************************************/
#ifdef eslNEON_BENCHMARK
/* gcc -mfpu=neon -O3 -o benchmark-neon -I ~/src/hmmer/easel -L ~/src/hmmer/easel -DeslNEON_BENCHMARK esl_neon.c -leasel -lm
*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_stopwatch.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "benchmark driver for neon module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_STOPWATCH *w = esl_stopwatch_Create();
int N = esl_opt_GetInteger(go, "-N");
float origx = 2.0;
float x = origx;
esl_neon_128f_t xv = vdupq_n_f32(x);
int i;
/* First, serial time. */
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) { x = logf(x); x = expf(x); }
esl_stopwatch_Stop(w);
esl_stopwatch_Display(stdout, w, "# serial CPU time: ");
/* Vector time */
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) { xv = esl_neon_logf(xv); xv = esl_neon_expf(xv); }
esl_stopwatch_Stop(w);
esl_stopwatch_Display(stdout, w, "# vector CPU time: ");
/* If you don't do something with x and xv, the compiler may optimize them away */
printf("%g => many scalar logf,expf cycles => %g\n", origx, x);
printf("%g => many vector logf,expf cycles => ", origx);
esl_neon_dump_float(stdout, xv); printf("\n");
esl_stopwatch_Destroy(w);
esl_getopts_Destroy(go);
return 0;
}
#endif /*eslNEON_BENCHMARK*/
/*****************************************************************
* 4. Unit tests
*****************************************************************/
#ifdef eslNEON_TESTDRIVE
#include "esl_getopts.h"
#include "esl_random.h"
/* utest_logf(): Test range/domain of logf */
static void
utest_logf(ESL_GETOPTS *go)
{
esl_neon_128f_t x; /* test input */
union { esl_neon_128f_t v; float x[4]; } r; /* test output */
/* Test IEEE754 specials:
* log(-inf) = NaN log(x<0) = NaN log(-0) = NaN
* log(0) = -inf log(inf) = inf log(NaN) = NaN
*/
float test1[4] = {-eslINFINITY, -1.0, -0.0, 0.0};
x.f32x4 = vld1q_f32(test1);
r.v = esl_neon_logf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("logf");
esl_neon_dump_float(stdout, x); printf(" ==> ");
esl_neon_dump_float(stdout, r.v); printf("\n");
}
if (! isnan(r.x[0])) esl_fatal("logf(-inf) should be NaN");
if (! isnan(r.x[1])) esl_fatal("logf(-1) should be NaN");
if (! isnan(r.x[2])) esl_fatal("logf(-0) should be NaN");
if (! (r.x[3] < 0 && isinf(r.x[3]))) esl_fatal("logf(0) should be -inf");
float test2[4] = {eslINFINITY, eslNaN, FLT_MIN, FLT_MAX};
x.f32x4 = vld1q_f32(test2);
r.v = esl_neon_logf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("logf");
esl_neon_dump_float(stdout, x); printf(" ==> ");
esl_neon_dump_float(stdout, r.v); printf("\n");
}
if (! isinf(r.x[0])) esl_fatal("logf(inf) should be inf");
if (! isnan(r.x[1])) esl_fatal("logf(NaN) should be NaN");
}
/* utest_expf(): Test range/domain of expf */
static void
utest_expf(ESL_GETOPTS *go)
{
esl_neon_128f_t x; /* test input */
union { esl_neon_128f_t v; float x[4]; } r; /* test output */
/* exp(-inf) = 0 exp(-0) = 1 exp(0) = 1 exp(inf) = inf exp(NaN) = NaN */
float test1[4] = {-eslINFINITY, -0.0, 0.0, eslINFINITY};
x.f32x4 = vld1q_f32(test1);
r.v = esl_neon_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_neon_dump_float(stdout, x); printf(" ==> ");
esl_neon_dump_float(stdout, r.v); printf("\n");
}
if (r.x[0] != 0.0f) esl_fatal("expf(-inf) should be 0");
if (r.x[1] != 1.0f) esl_fatal("expf(-0) should be 1");
if (r.x[2] != 1.0f) esl_fatal("expf(0) should be 1");
if (! isinf(r.x[3])) esl_fatal("expf(inf) should be inf");
/* exp(NaN) = NaN exp(large) = inf exp(-large) = 0 exp(1) = exp(1) */
float test2[4] = {eslNaN, 666.0f, -666.0f, 1.0f};
x.f32x4 = vld1q_f32(test2);
r.v = esl_neon_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_neon_dump_float(stdout, x); printf(" ==> ");
esl_neon_dump_float(stdout, r.v); printf("\n");
}
if (! isnan(r.x[0])) esl_fatal("expf(NaN) should be NaN");
if (! isinf(r.x[1])) esl_fatal("expf(large x) should be inf");
if (r.x[2] != 0.0f) esl_fatal("expf(-large x) should be 0");
/* Make sure we are correct around the problematic ~minlogf boundary:
* (1) e^x for x < -127.5 log2 + epsilon is 0, because that's our minlogf barrier.
* (2) e^x for -127.5 log2 < x < -126.5 log2 is 0 too, but is actually calculated
* (3) e^x for -126.5 log2 < x should be finite (and close to FLT_MIN)
*
* minlogf = -127.5 log(2) + epsilon = -88.3762626647949;
* and -126.5 log(2) = -87.68311834
* so for
* (1): expf(-88.3763) => 0
* (2): expf(-88.3762) => 0
* (3): expf(-87.6832) => 0
* (4): expf(-87.6831) => <FLT_MIN (subnormal) : ~8.31e-39 (may become 0 in flush-to-zero mode for subnormals)
*/
float test3[4] = {-87.6831, -87.6832, -88.3762, -88.3763};
x.f32x4 = vld1q_f32(test3);
r.v = esl_neon_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_neon_dump_float(stdout, x); printf(" ==> ");
esl_neon_dump_float(stdout, r.v); printf("\n");
}
if ( r.x[0] >= FLT_MIN) esl_fatal("expf( -126.5 log2 + eps) should be around FLT_MIN");
if ( r.x[1] != 0.0f) esl_fatal("expf( -126.5 log2 - eps) should be 0.0 (by calculation)");
if ( r.x[2] != 0.0f) esl_fatal("expf( -127.5 log2 + eps) should be 0.0 (by calculation)");
if ( r.x[3] != 0.0f) esl_fatal("expf( -127.5 log2 - eps) should be 0.0 (by min bound): %g", r.x[0]);
}
/* utest_odds(): test accuracy of logf, expf on odds ratios,
* our main intended use.
*/
static void
utest_odds(ESL_GETOPTS *go, ESL_RANDOMNESS *r)
{
int N = esl_opt_GetInteger(go, "-N");
int verbose = esl_opt_GetBoolean(go, "-v");
int very_verbose = esl_opt_GetBoolean(go, "--vv");
int i;
float p1, p2, odds;
union { esl_neon_128f_t v; float x[4]; } r1;
union { esl_neon_128f_t v; float x[4]; } r2;
float scalar_r1, scalar_r2;
double err1, maxerr1 = 0.0, avgerr1 = 0.0; /* errors on logf() */
double err2, maxerr2 = 0.0, avgerr2 = 0.0; /* errors on expf() */
for (i = 0; i < N; i++)
{
p1 = esl_rnd_UniformPositive(r);
p2 = esl_rnd_UniformPositive(r);
odds = p1 / p2;
if (odds == 0.0) esl_fatal("whoa, odds ratio can't be 0!\n");
esl_neon_128f_t tmp;
tmp.f32x4 = vdupq_n_f32(odds);
r1.v = esl_neon_logf(tmp); /* r1.x[z] = log(p1/p2) */
scalar_r1 = log(odds);
err1 = (r1.x[0] == 0. && scalar_r1 == 0.) ? 0.0 : 2 * fabs(r1.x[0] - scalar_r1) / fabs(r1.x[0] + scalar_r1);
if (err1 > maxerr1) maxerr1 = err1;
avgerr1 += err1 / (float) N;
if (isnan(avgerr1)) esl_fatal("whoa, what?\n");
r2.v = esl_neon_expf(r1.v); /* and back to odds */
scalar_r2 = exp(r1.x[0]);
err2 = (r2.x[0] == 0. && scalar_r2 == 0.) ? 0.0 : 2 * fabs(r2.x[0] - scalar_r2) / fabs(r2.x[0] + scalar_r2);
if (err2 > maxerr2) maxerr2 = err2;
avgerr2 += err2 / (float) N;
if (very_verbose)
printf("%13.7g %13.7g %13.7g %13.7g %13.7g %13.7g %13.7g\n", odds, scalar_r1, r1.x[0], scalar_r2, r2.x[0], err1, err2);
}
if (verbose) {
printf("Average [max] logf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr1, maxerr1);
printf("Average [max] expf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr2, maxerr2);
printf("(random seed : %" PRIu32 ")\n", esl_randomness_GetSeed(r));
}
if (avgerr1 > 1e-8) esl_fatal("average error on logf() is intolerable\n");
if (maxerr1 > 1e-6) esl_fatal("maximum error on logf() is intolerable\n");
if (avgerr2 > 1e-8) esl_fatal("average error on expf() is intolerable\n");
if (maxerr2 > 1e-6) esl_fatal("maximum error on expf() is intolerable\n");
}
static void
utest_hmax_u8(ESL_RANDOMNESS *rng)
{
union { esl_neon_128i_t v; uint8_t x[16]; } u;
uint8_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = 0;
for (z = 0; z < 16; z++)
{
u.x[z] = (uint8_t) (esl_rnd_Roll(rng, 256)); // 0..255
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_neon_hmax_u8(u.v);
if (r1 != r2) esl_fatal("hmax_u8 utest failed");
}
}
static void
utest_hmax_s8(ESL_RANDOMNESS *rng)
{
union { esl_neon_128i_t v; int8_t x[16]; } u;
int8_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = -128;
for (z = 0; z < 16; z++)
{
u.x[z] = (int8_t) (esl_rnd_Roll(rng, 256) - 128); // -128..127
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_neon_hmax_s8(u.v);
if (r1 != r2) esl_fatal("hmax_s8 utest failed");
}
}
static void
utest_hmax_s16(ESL_RANDOMNESS *rng)
{
union { esl_neon_128i_t v; int16_t x[8]; } u;
int16_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = -32768;
for (z = 0; z < 8; z++)
{
u.x[z] = (int16_t) (esl_rnd_Roll(rng, 65536) - 32768); // -32768..32767
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_neon_hmax_s16(u.v);
if (r1 != r2) esl_fatal("hmax_s16 utest failed: %d != %d", r1, r2);
}
}
#endif /*eslNEON_TESTDRIVE*/
/*****************************************************************
* 5. Test driver
*****************************************************************/
#ifdef eslNEON_TESTDRIVE
/* gcc -mfpu=neon -g -Wall -o test -I. -L. -DeslNEON_TESTDRIVE esl_neon.c -leasel -lm
*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of random test points", 0 },
{ "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
{ "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be verbose: show test report", 0 },
{ "--vv", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be very verbose: show individual test samples", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "test driver for neon module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));;
fprintf(stderr, "## %s\n", argv[0]);
fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
#ifdef eslHAVE_NEON_AARCH64
fprintf(stderr, "# flavor = aarch64\n");
#else
fprintf(stderr, "# flavor = armv7\n");
#endif
utest_logf(go);
utest_expf(go);
utest_odds(go, rng);
utest_hmax_u8(rng);
utest_hmax_s8(rng);
utest_hmax_s16(rng);
fprintf(stderr, "# status = ok\n");
esl_randomness_Destroy(rng);
esl_getopts_Destroy(go);
return 0;
}
#endif /* eslNEON_TESTDRIVE*/
/*****************************************************************
* 6. Example
*****************************************************************/
#ifdef eslNEON_EXAMPLE
/*::cexcerpt::neon_example::begin::*/
/* gcc -mfpu=neon -g -Wall -o example -I. -L. -DeslNEON_EXAMPLE esl_neon.c -leasel -lm
*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
int
main(int argc, char **argv)
{
float x; /* scalar input */
esl_neon_128f_t xv; /* input vector */
union { esl_neon_128f_t v; float x[4]; } rv; /* result vector*/
x = 2.0;
xv.f32x4 = vdupq_n_f32(x);
rv.v = esl_neon_logf(xv);
printf("logf(%f) = %f\n", x, rv.x[0]);
rv.v = esl_neon_expf(xv);
printf("expf(%f) = %f\n", x, rv.x[0]);
return 0;
}
/*::cexcerpt::neon_example::end::*/
#endif /* eslNEON_EXAMPLE */
#else // ! eslENABLE_NEON
/* If we don't have NEON compiled in, provide some nothingness to:
* a. prevent Mac OS/X ranlib from bitching about .o file that "has no symbols"
* b. prevent compiler from bitching about "empty compilation unit"
* c. compile blank drivers and automatically pass the automated tests.
*/
void esl_neon_silence_hack(void) { return; }
#if defined eslNEON_TESTDRIVE || eslNEON_EXAMPLE || eslNEON_BENCHMARK
int main(void) { return 0; }
#endif
#endif // !eslENABLE_NEON or not
/*****************************************************************
* additional copyright and license information for this file
*****************************************************************
* In addition to our own copyrights, esl_neon_logf() and esl_neon_expf() are also:
* Copyright (C) 2007 Julien Pommier
* Copyright (C) 1992 Stephen Moshier
*
* These functions derived from zlib-licensed routines by
* Julien Pommier, http://gruntthepeon.free.fr/ssemath/. The
* zlib license:
*
*-------------------------------------------------------------------------
* Copyright (C) 2007 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*-------------------------------------------------------------------------
*
* In turn, Pommier had derived the logf() and expf() functions from
* serial versions in the Cephes math library. According to its
* readme, Cephes is "copyrighted by the author" and "may be used
* freely but it comes with no support or guarantee." Cephes is
* available in NETLIB [http://www.netlib.org/cephes/]. NETLIB is
* widely considered to be a free scientific code repository, though
* the copyright and license status of many parts, including Cephes,
* is ill-defined. We have attached Moshier's copyright,
* to credit his original contribution. Thanks to both Pommier and
* Moshier for their clear code.
*/
You can’t perform that action at this time.