/
esl_sse.c
690 lines (583 loc) · 26.3 KB
/
esl_sse.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
/* Vectorized routines for Intel/AMD processors, using Streaming SIMD Extensions (SSE).
*
* Contents:
* 1. SIMD logf(), expf()
* 2. Utilities for ps vectors (4 floats in a __m128)
* 3. Benchmark
* 4. Unit tests
* 5. Test driver
* 6. Example
*
*****************************************************************
*
* This code is conditionally compiled, only when <eslENABLE_SSE> or
* <eslENABLE_SSE4> was set in <esl_config.h> by the configure script,
* and that will only happen on x86 platforms. When neither
* <eslENABLE_SSE> nor <eslENABLE_SSE4> are set, we include some dummy
* code to silence compiler and ranlib warnings about empty
* translation units and no symbols, and dummy drivers that do nothing
* but declare success.
*
*****************************************************************
* Credits:
*
* The logf() and expf() routines are derivatives of routines by
* Julien Pommier [http://gruntthepeon.free.fr/ssemath/]. Those
* routines were in turn based on serial implementations in the Cephes
* math library by Stephen Moshier [Moshier89;
* http://www.moshier.net/#Cephes]. Thanks and credit to both Moshier
* and Pommier for their clear code. Additional copyright and license
* information is appended at the end of the file.
*/
#include "esl_config.h"
#if defined(eslENABLE_SSE) || defined(eslENABLE_SSE4)
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <float.h>
#include <x86intrin.h>
#include "easel.h"
#include "esl_sse.h"
/*****************************************************************
* 1. SSE SIMD logf(), expf()
*****************************************************************/
/* Function: esl_sse_logf()
* Synopsis: <r[z] = log x[z]>
* Incept: SRE, Fri Dec 14 11:32:54 2007 [Janelia]
*
* Purpose: Given a vector <x> containing four floats, returns a
* vector <r> in which each element <r[z] = logf(x[z])>.
*
* Valid in the domain $x_z > 0$ for normalized IEEE754
* $x_z$.
*
* For <x> $< 0$, including -0, returns <NaN>. For <x> $==
* 0$ or subnormal <x>, returns <-inf>. For <x = inf>,
* returns <inf>. For <x = NaN>, returns <NaN>. For
* subnormal <x>, returns <-inf>.
*
* Xref: J2/71.
*
* Note: Derived from an SSE1 implementation by Julian
* Pommier. Converted to SSE2 and added handling
* of IEEE754 specials.
*/
__m128
esl_sse_logf(__m128 x)
{
static float cephes_p[9] = { 7.0376836292E-2f, -1.1514610310E-1f, 1.1676998740E-1f,
-1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f,
2.0000714765E-1f, -2.4999993993E-1f, 3.3333331174E-1f };
__m128 onev = _mm_set1_ps(1.0f); /* all elem = 1.0 */
__m128 v0p5 = _mm_set1_ps(0.5f); /* all elem = 0.5 */
__m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */
__m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */
__m128i ei;
__m128 e;
__m128 invalid_mask, zero_mask, inf_mask; /* masks used to handle special IEEE754 inputs */
__m128 mask;
__m128 origx;
__m128 tmp;
__m128 y;
__m128 z;
/* first, split x apart: x = frexpf(x, &e); */
ei = _mm_srli_epi32( _mm_castps_si128(x), 23); /* shift right 23: IEEE754 floats: ei = biased exponents */
invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg)); /* mask any elem that's negative; these become NaN */
zero_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128())); /* mask any elem zero or subnormal; these become -inf */
inf_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp)); /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN */
origx = x; /* store original x, used for log(inf) = inf, log(NaN) = NaN */
x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand */
x = _mm_or_ps (x, v0p5); /* sets hidden bit b[0] */
ei = _mm_sub_epi32(ei, _mm_set1_epi32(126)); /* -127 (ei now signed base-2 exponent); then +1 */
e = _mm_cvtepi32_ps(ei);
/* now, calculate the log */
mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches. */
tmp = _mm_and_ps(x, mask); /* tmp contains x values < 0.707, else 0 */
x = _mm_sub_ps(x, onev);
e = _mm_sub_ps(e, _mm_and_ps(onev, mask));
x = _mm_add_ps(x, tmp);
z = _mm_mul_ps(x,x);
y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8])); y = _mm_mul_ps(y, x);
y = _mm_mul_ps(y, z);
tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
y = _mm_add_ps(y, tmp);
tmp = _mm_mul_ps(z, v0p5);
y = _mm_sub_ps(y, tmp);
tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
x = _mm_add_ps(x, y);
x = _mm_add_ps(x, tmp);
/* IEEE754 cleanup: */
x = esl_sse_select_ps(x, origx, inf_mask); /* log(inf)=inf; log(NaN) = NaN */
x = _mm_or_ps(x, invalid_mask); /* log(x<0, including -0,-inf) = NaN */
x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal = -inf */
return x;
}
/* Function: esl_sse_expf()
* Synopsis: <r[z] = exp x[z]>
* Incept: SRE, Fri Dec 14 14:46:27 2007 [Janelia]
*
* Purpose: Given a vector <x> containing four floats, returns a
* vector <r> in which each element <r[z] = expf(x[z])>.
*
* Valid for all IEEE754 floats $x_z$.
*
* Xref: J2/71
* J10/62: bugfix, minlogf/maxlogf range was too wide;
* (k+127) must be >=0 and <=255, so (k+127)<<23
* is a valid IEEE754 float, without touching
* the sign bit. Pommier had this right in the
* first place, and I didn't understand.
*
* Note: Derived from an SSE1 implementation by Julian
* Pommier. Converted to SSE2.
*
* Note on maxlogf/minlogf, which are close to but not
* exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
* k+127 is 0..255, a valid IEEE754 8-bit exponent
* (0..255), so the bit pattern (k+127)<<23 is IEEE754
* single-precision for 2^k. If k=-127, we get IEEE754 0.
* If k=128, we get IEEE754 +inf. If k<-127, k+127 is
* negative and we get screwed up. If k>128, k+127
* overflows the 8-bit exponent and sets the sign bit. So
* for x' (base 2) < -127.5 we must definitely return e^x ~
* 0; for x' < 126.5 we're going to calculate 0 anyway
* (because k=floor(-126.5-epsilon+0.5) = -127). So any
* minlogf between -126.5 log2 ... -127.5 log2 will suffice
* as the cutoff. Ditto for 126.5 log2 .. 127.5log2.
* That's 87.68312 .. 88.3762655. I think Pommier's
* thinking is, you don't want to get to close to the
* edges, lest fp roundoff error screw you (he may have
* consider 1 ulp carefully, I can't tell), but otherwise
* you may as well put your bounds close to the outer edge;
* so
* maxlogf = 127.5 log(2) - epsilon
* minlogf = -127.5 log(2) + epsilon
* for an epsilon that happen to be ~ 3e-6.
*/
__m128
esl_sse_expf(__m128 x)
{
static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f };
static float cephes_c[2] = { 0.693359375f, -2.12194440e-4f };
static float maxlogf = 88.3762626647949f; /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
static float minlogf = -88.3762626647949f; /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
__m128i k;
__m128 mask, tmp, fx, z, y, minmask, maxmask;
/* handle out-of-range and special conditions */
maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));
/* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
fx = _mm_mul_ps(x, _mm_set1_ps(eslCONST_LOG2R));
fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));
/* floorf() with SSE: */
k = _mm_cvttps_epi32(fx); /* cast to int with truncation */
tmp = _mm_cvtepi32_ps(k); /* cast back to float */
mask = _mm_cmpgt_ps(tmp, fx); /* if it increased (i.e. if it was negative...) */
mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch... */
fx = _mm_sub_ps(tmp, mask); /* then subtract one. */
k = _mm_cvttps_epi32(fx); /* k is now ready for the 2^k part. */
/* polynomial approx for e^f for f in range [-0.5, 0.5] */
tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
z = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
x = _mm_sub_ps(x, tmp);
x = _mm_sub_ps(x, z);
z = _mm_mul_ps(x, x);
y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, z);
y = _mm_add_ps(y, x);
y = _mm_add_ps(y, _mm_set1_ps(1.0f));
/* build 2^k by hand, by creating a IEEE754 float */
k = _mm_add_epi32(k, _mm_set1_epi32(127));
k = _mm_slli_epi32(k, 23);
fx = _mm_castsi128_ps(k);
/* put 2^k e^f together (fx = 2^k, y = e^f) and we're done */
y = _mm_mul_ps(y, fx);
/* special/range cleanup */
y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128) */
y = esl_sse_select_ps(y, _mm_set1_ps(0.0f), minmask); /* exp(x) = 0 for x < log(2^-149) */
return y;
}
/*****************************************************************
* 2. Utilities for ps vectors (4 floats in a __m128)
*****************************************************************/
void
esl_sse_dump_ps(FILE *fp, __m128 v)
{
float *p = (float *)&v;
fprintf(fp, "[%13.8g, %13.8g, %13.8g, %13.8g]", p[0], p[1], p[2], p[3]);
}
/*****************************************************************
* 3. Benchmark
*****************************************************************/
#ifdef eslSSE_BENCHMARK
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_stopwatch.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT,"10000000", NULL, NULL, NULL, NULL, NULL, "number of trials", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "benchmark driver for sse module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_STOPWATCH *w = esl_stopwatch_Create();
int N = esl_opt_GetInteger(go, "-N");
float origx = 2.0;
float x = origx;
__m128 xv = _mm_set1_ps(x);
int i;
/* First, serial time. */
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) { x = logf(x); x = expf(x); }
esl_stopwatch_Stop(w);
esl_stopwatch_Display(stdout, w, "# serial CPU time: ");
/* Vector time */
esl_stopwatch_Start(w);
for (i = 0; i < N; i++) { xv = esl_sse_logf(xv); xv = esl_sse_expf(xv); }
esl_stopwatch_Stop(w);
esl_stopwatch_Display(stdout, w, "# vector CPU time: ");
/* If you don't do something with x and xv, the compiler may optimize them away */
printf("%g => many scalar logf,expf cycles => %g\n", origx, x);
printf("%g => many vector logf,expf cycles => ", origx);
esl_sse_dump_ps(stdout, xv); printf("\n");
esl_stopwatch_Destroy(w);
esl_getopts_Destroy(go);
return 0;
}
#endif /*eslSSE_BENCHMARK*/
/*****************************************************************
* 4. Unit tests
*****************************************************************/
#ifdef eslSSE_TESTDRIVE
#include "esl_getopts.h"
#include "esl_random.h"
/* utest_logf(): Test range/domain of logf */
static void
utest_logf(ESL_GETOPTS *go)
{
__m128 x; /* test input */
union { __m128 v; float x[4]; } r; /* test output */
/* Test IEEE754 specials:
* log(-inf) = NaN log(x<0) = NaN log(-0) = NaN
* log(0) = -inf log(inf) = inf log(NaN) = NaN
*/
x = _mm_set_ps(0.0, -0.0, -1.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */
r.v = esl_sse_logf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("logf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (! isnan(r.x[0])) esl_fatal("logf(-inf) should be NaN");
if (! isnan(r.x[1])) esl_fatal("logf(-1) should be NaN");
if (! isnan(r.x[2])) esl_fatal("logf(-0) should be NaN");
if (! (r.x[3] < 0 && isinf(r.x[3]))) esl_fatal("logf(0) should be -inf");
x = _mm_set_ps(FLT_MAX, FLT_MIN, eslNaN, eslINFINITY);
r.v = esl_sse_logf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("logf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (! isinf(r.x[0])) esl_fatal("logf(inf) should be inf");
if (! isnan(r.x[1])) esl_fatal("logf(NaN) should be NaN");
}
/* utest_expf(): Test range/domain of expf */
static void
utest_expf(ESL_GETOPTS *go)
{
__m128 x; /* test input */
union { __m128 v; float x[4]; } r; /* test output */
/* exp(-inf) = 0 exp(-0) = 1 exp(0) = 1 exp(inf) = inf exp(NaN) = NaN */
x = _mm_set_ps(eslINFINITY, 0.0, -0.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */
r.v = esl_sse_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (r.x[0] != 0.0f) esl_fatal("expf(-inf) should be 0");
if (r.x[1] != 1.0f) esl_fatal("expf(-0) should be 1");
if (r.x[2] != 1.0f) esl_fatal("expf(0) should be 1");
if (! isinf(r.x[3])) esl_fatal("expf(inf) should be inf");
/* exp(NaN) = NaN exp(large) = inf exp(-large) = 0 exp(1) = exp(1) */
x = _mm_set_ps(1.0f, -666.0f, 666.0f, eslNaN); /* set_ps() is in order 3 2 1 0 */
r.v = esl_sse_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if (! isnan(r.x[0])) esl_fatal("expf(NaN) should be NaN");
if (! isinf(r.x[1])) esl_fatal("expf(large x) should be inf");
if (r.x[2] != 0.0f) esl_fatal("expf(-large x) should be 0");
/* Make sure we are correct around the problematic ~minlogf boundary:
* (1) e^x for x < -127.5 log2 + epsilon is 0, because that's our minlogf barrier.
* (2) e^x for -127.5 log2 < x < -126.5 log2 is 0 too, but is actually calculated
* (3) e^x for -126.5 log2 < x should be finite (and close to FLT_MIN)
*
* minlogf = -127.5 log(2) + epsilon = -88.3762626647949;
* and -126.5 log(2) = -87.68311834
* so for
* (1): expf(-88.3763) => 0
* (2): expf(-88.3762) => 0
* (3): expf(-87.6832) => 0
* (4): expf(-87.6831) => <FLT_MIN (subnormal) : ~8.31e-39 (may become 0 in flush-to-zero mode for subnormals)
*/
x = _mm_set_ps(-88.3763, -88.3762, -87.6832, -87.6831);
r.v = esl_sse_expf(x);
if (esl_opt_GetBoolean(go, "-v")) {
printf("expf");
esl_sse_dump_ps(stdout, x); printf(" ==> ");
esl_sse_dump_ps(stdout, r.v); printf("\n");
}
if ( r.x[0] >= FLT_MIN) esl_fatal("expf( -126.5 log2 + eps) should be around FLT_MIN");
if ( r.x[1] != 0.0f) esl_fatal("expf( -126.5 log2 - eps) should be 0.0 (by calculation)");
if ( r.x[2] != 0.0f) esl_fatal("expf( -127.5 log2 + eps) should be 0.0 (by calculation)");
if ( r.x[3] != 0.0f) esl_fatal("expf( -127.5 log2 - eps) should be 0.0 (by min bound): %g", r.x[0]);
}
/* utest_odds(): test accuracy of logf, expf on odds ratios,
* our main intended use.
*/
static void
utest_odds(ESL_GETOPTS *go, ESL_RANDOMNESS *r)
{
int N = esl_opt_GetInteger(go, "-N");
int verbose = esl_opt_GetBoolean(go, "-v");
int very_verbose = esl_opt_GetBoolean(go, "--vv");
int i;
float p1, p2, odds;
union { __m128 v; float x[4]; } r1;
union { __m128 v; float x[4]; } r2;
float scalar_r1, scalar_r2;
double err1, maxerr1 = 0.0, avgerr1 = 0.0; /* errors on logf() */
double err2, maxerr2 = 0.0, avgerr2 = 0.0; /* errors on expf() */
for (i = 0; i < N; i++)
{
p1 = esl_rnd_UniformPositive(r);
p2 = esl_rnd_UniformPositive(r);
odds = p1 / p2;
if (odds == 0.0) esl_fatal("whoa, odds ratio can't be 0!\n");
r1.v = esl_sse_logf(_mm_set1_ps(odds)); /* r1.x[z] = log(p1/p2) */
scalar_r1 = log(odds);
err1 = (r1.x[0] == 0. && scalar_r1 == 0.) ? 0.0 : 2 * fabs(r1.x[0] - scalar_r1) / fabs(r1.x[0] + scalar_r1);
if (err1 > maxerr1) maxerr1 = err1;
avgerr1 += err1 / (float) N;
if (isnan(avgerr1)) esl_fatal("whoa, what?\n");
r2.v = esl_sse_expf(r1.v); /* and back to odds */
scalar_r2 = exp(r1.x[0]);
err2 = (r2.x[0] == 0. && scalar_r2 == 0.) ? 0.0 : 2 * fabs(r2.x[0] - scalar_r2) / fabs(r2.x[0] + scalar_r2);
if (err2 > maxerr2) maxerr2 = err2;
avgerr2 += err2 / (float) N;
if (very_verbose)
printf("%13.7g %13.7g %13.7g %13.7g %13.7g %13.7g %13.7g\n", odds, scalar_r1, r1.x[0], scalar_r2, r2.x[0], err1, err2);
}
if (verbose) {
printf("Average [max] logf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr1, maxerr1);
printf("Average [max] expf() relative error in %d odds trials: %13.8g [%13.8g]\n", N, avgerr2, maxerr2);
printf("(random seed : %" PRIu32 ")\n", esl_randomness_GetSeed(r));
}
if (avgerr1 > 1e-8) esl_fatal("average error on logf() is intolerable\n");
if (maxerr1 > 1e-6) esl_fatal("maximum error on logf() is intolerable\n");
if (avgerr2 > 1e-8) esl_fatal("average error on expf() is intolerable\n");
if (maxerr2 > 1e-6) esl_fatal("maximum error on expf() is intolerable\n");
}
static void
utest_hmax_epu8(ESL_RANDOMNESS *rng)
{
union { __m128i v; uint8_t x[16]; } u;
uint8_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = 0;
for (z = 0; z < 16; z++)
{
u.x[z] = (uint8_t) (esl_rnd_Roll(rng, 256)); // 0..255
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_sse_hmax_epu8(u.v);
if (r1 != r2) esl_fatal("hmax_epu8 utest failed");
}
}
static void
utest_hmax_epi8(ESL_RANDOMNESS *rng)
{
#ifdef eslENABLE_SSE4 // no-op if eslENABLE_SSE only
union { __m128i v; int8_t x[16]; } u;
int8_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = -128;
for (z = 0; z < 16; z++)
{
u.x[z] = (int8_t) (esl_rnd_Roll(rng, 256) - 128); // -128..127
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_sse_hmax_epi8(u.v);
if (r1 != r2) esl_fatal("hmax_epi8 utest failed");
}
#endif // eslENABLE_SSE4
}
static void
utest_hmax_epi16(ESL_RANDOMNESS *rng)
{
union { __m128i v; int16_t x[8]; } u;
int16_t r1, r2;
int i,z;
for (i = 0; i < 100; i++)
{
r1 = -32768;
for (z = 0; z < 8; z++)
{
u.x[z] = (int16_t) (esl_rnd_Roll(rng, 65536) - 32768); // -32768..32767
if (u.x[z] > r1) r1 = u.x[z];
}
r2 = esl_sse_hmax_epi16(u.v);
if (r1 != r2) esl_fatal("hmax_epi16 utest failed: %d != %d", r1, r2);
}
}
#endif /*eslSSE_TESTDRIVE*/
/*****************************************************************
* 5. Test driver
*****************************************************************/
#ifdef eslSSE_TESTDRIVE
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_getopts.h"
#include "esl_random.h"
#include "esl_sse.h"
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 },
{ "-N", eslARG_INT, "10000", NULL, NULL, NULL, NULL, NULL, "number of random test points", 0 },
{ "-s", eslARG_INT, "0", NULL, NULL, NULL, NULL, NULL, "set random number seed to <n>", 0 },
{ "-v", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be verbose: show test report", 0 },
{ "--vv", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "be very verbose: show individual test samples", 0 },
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
static char usage[] = "[-options]";
static char banner[] = "test driver for sse module";
int
main(int argc, char **argv)
{
ESL_GETOPTS *go = esl_getopts_CreateDefaultApp(options, 0, argc, argv, banner, usage);
ESL_RANDOMNESS *rng = esl_randomness_Create(esl_opt_GetInteger(go, "-s"));;
fprintf(stderr, "## %s\n", argv[0]);
fprintf(stderr, "# rng seed = %" PRIu32 "\n", esl_randomness_GetSeed(rng));
utest_logf(go);
utest_expf(go);
utest_odds(go, rng);
utest_hmax_epu8(rng);
utest_hmax_epi8(rng);
utest_hmax_epi16(rng);
fprintf(stderr, "# status = ok\n");
esl_randomness_Destroy(rng);
esl_getopts_Destroy(go);
return 0;
}
#endif /* eslSSE_TESTDRIVE*/
/*****************************************************************
* 6. Example
*****************************************************************/
#ifdef eslSSE_EXAMPLE
/*::cexcerpt::sse_example::begin::*/
#include "esl_config.h"
#include <stdio.h>
#include <math.h>
#include "easel.h"
#include "esl_sse.h"
int
main(int argc, char **argv)
{
float x; /* scalar input */
__m128 xv; /* input vector */
union { __m128 v; float x[4]; } rv; /* result vector*/
x = 2.0;
xv = _mm_set1_ps(x);
rv.v = esl_sse_logf(xv);
printf("logf(%f) = %f\n", x, rv.x[0]);
rv.v = esl_sse_expf(xv);
printf("expf(%f) = %f\n", x, rv.x[0]);
return 0;
}
/*::cexcerpt::sse_example::end::*/
#endif /*eslSSE_EXAMPLE*/
#else // ! (eslENABLE_SSE || eslENABLE_SSE4)
/* If we don't have SSE compiled in, provide some nothingness to:
* a. prevent Mac OS/X ranlib from bitching about .o file that "has no symbols"
* b. prevent compiler from bitching about "empty compilation unit"
* c. compile blank drivers and automatically pass the automated tests.
*/
void esl_sse_silence_hack(void) { return; }
#if defined eslSSE_TESTDRIVE || eslSSE_EXAMPLE || eslSSE_BENCHMARK
int main(void) { return 0; }
#endif
#endif // (eslENABLE_SSE || eslENABLE_SSE4) or not
/*****************************************************************
* additional copyright and license information for this file
*****************************************************************
* In addition to our own copyrights, esl_sse_logf() and esl_sse_expf() are also:
* Copyright (C) 2007 Julien Pommier
* Copyright (C) 1992 Stephen Moshier
*
* These functions derived from zlib-licensed routines by
* Julien Pommier, http://gruntthepeon.free.fr/ssemath/. The
* zlib license:
*
*-------------------------------------------------------------------------
* Copyright (C) 2007 Julien Pommier
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
*-------------------------------------------------------------------------
*
* In turn, Pommier had derived the logf() and expf() functions from
* serial versions in the Cephes math library. According to its
* readme, Cephes is "copyrighted by the author" and "may be used
* freely but it comes with no support or guarantee." Cephes is
* available in NETLIB [http://www.netlib.org/cephes/]. NETLIB is
* widely considered to be a free scientific code repository, though
* the copyright and license status of many parts, including Cephes,
* is ill-defined. We have attached Moshier's copyright,
* to credit his original contribution. Thanks to both Pommier and
* Moshier for their clear code.
*/