-
Notifications
You must be signed in to change notification settings - Fork 10
/
Bench.hs
2037 lines (1787 loc) · 74.8 KB
/
Bench.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
{- |
Module: Test.Tasty.Bench
Copyright: (c) 2021 Andrew Lelechenko
Licence: MIT
Featherlight benchmark framework (only one file!) for performance
measurement with API
mimicking [@criterion@](http://hackage.haskell.org/package/criterion)
and [@gauge@](http://hackage.haskell.org/package/gauge).
A prominent feature is built-in comparison against previous runs
and between benchmarks.
=== How lightweight is it?
There is only one source file "Test.Tasty.Bench" and no non-boot
dependencies except [@tasty@](http://hackage.haskell.org/package/tasty). So
if you already depend on @tasty@ for a test suite, there is nothing else
to install.
Compare this to @criterion@ (10+ modules, 50+ dependencies) and @gauge@
(40+ modules, depends on @basement@ and @vector@). A build on a clean machine is up to 16x
faster than @criterion@ and up to 4x faster than @gauge@. A build without dependencies
is up to 6x faster than @criterion@ and up to 8x faster than @gauge@.
@tasty-bench@ is a native Haskell library and works everywhere, where GHC
does. We support a full range of architectures (@i386@, @amd64@, @armhf@,
@arm64@, @ppc64le@, @s390x@) and operating systems (Linux, Windows, macOS,
FreeBSD, OpenBSD, NetBSD), plus any GHC from 7.0 to 9.6.
=== How is it possible?
Our benchmarks are literally regular @tasty@ tests, so we can leverage
all existing machinery for command-line options, resource management,
structuring, listing and filtering benchmarks, running and reporting
results. It also means that @tasty-bench@ can be used in conjunction
with other @tasty@ ingredients.
Unlike @criterion@ and @gauge@ we use a very simple statistical model
described below. This is arguably a questionable choice, but it works
pretty well in practice. A rare developer is sufficiently well-versed in
probability theory to make sense and use of all numbers generated by
@criterion@.
=== How to switch?
<https://cabal.readthedocs.io/en/3.4/cabal-package.html#pkg-field-mixins Cabal mixins>
allow to taste @tasty-bench@ instead of @criterion@ or @gauge@ without
changing a single line of code:
> cabal-version: 2.0
>
> benchmark foo
> ...
> build-depends:
> tasty-bench
> mixins:
> tasty-bench (Test.Tasty.Bench as Criterion, Test.Tasty.Bench as Criterion.Main, Test.Tasty.Bench as Gauge, Test.Tasty.Bench as Gauge.Main)
This works vice versa as well: if you use @tasty-bench@, but at some
point need a more comprehensive statistical analysis, it is easy to
switch temporarily back to @criterion@.
=== How to write a benchmark?
Benchmarks are declared in a separate section of @cabal@ file:
> cabal-version: 2.0
> name: bench-fibo
> version: 0.0
> build-type: Simple
> synopsis: Example of a benchmark
>
> benchmark bench-fibo
> main-is: BenchFibo.hs
> type: exitcode-stdio-1.0
> build-depends: base, tasty-bench
> ghc-options: "-with-rtsopts=-A32m"
> if impl(ghc >= 8.6)
> ghc-options: -fproc-alignment=64
And here is @BenchFibo.hs@:
> import Test.Tasty.Bench
>
> fibo :: Int -> Integer
> fibo n = if n < 2 then toInteger n else fibo (n - 1) + fibo (n - 2)
>
> main :: IO ()
> main = defaultMain
> [ bgroup "fibonacci numbers"
> [ bench "fifth" $ nf fibo 5
> , bench "tenth" $ nf fibo 10
> , bench "twentieth" $ nf fibo 20
> ]
> ]
Since @tasty-bench@ provides an API compatible with @criterion@, one can
refer to
<http://www.serpentine.com/criterion/tutorial.html#how-to-write-a-benchmark-suite its documentation>
for more examples.
=== How to read results?
Running the example above (@cabal@ @bench@ or @stack@ @bench@) results in
the following output:
> All
> fibonacci numbers
> fifth: OK (2.13s)
> 63 ns ± 3.4 ns
> tenth: OK (1.71s)
> 809 ns ± 73 ns
> twentieth: OK (3.39s)
> 104 μs ± 4.9 μs
>
> All 3 tests passed (7.25s)
The output says that, for instance, the first benchmark was repeatedly
executed for 2.13 seconds (wall-clock time), its predicted mean CPU time was
63 nanoseconds and means of individual samples do not often diverge from it
further than ±3.4 nanoseconds (double standard deviation). Take standard
deviation numbers with a grain of salt; there are lies, damned lies, and
statistics.
=== Wall-clock time vs. CPU time
What time are we talking about?
Both @criterion@ and @gauge@ by default report wall-clock time, which is
affected by any other application which runs concurrently.
Ideally benchmarks are executed on a dedicated server without any other load,
but — let's face the truth — most developers run benchmarks
on a laptop with a hundred other services and a window manager, and
watch videos while waiting for benchmarks to finish. That's the cause
of a notorious "variance introduced by outliers: 88% (severely inflated)" warning.
To alleviate this issue @tasty-bench@ measures CPU time by 'getCPUTime'
instead of wall-clock time by default.
It does not provide a perfect isolation from other processes (e. g.,
if CPU cache is spoiled by others, populating data back from RAM
is your burden), but is a bit more stable.
Caveat: this means that for multithreaded algorithms
@tasty-bench@ reports total elapsed CPU time across all cores, while
@criterion@ and @gauge@ print maximum of core's wall-clock time.
It also means that by default @tasty-bench@ does not measure time spent out of process,
e. g., calls to other executables. To work around this limitation
use @--time-mode@ command-line option or set it locally via 'TimeMode' option.
=== Statistical model
Here is a procedure used by @tasty-bench@ to measure execution time:
1. Set \( n \leftarrow 1 \).
2. Measure execution time \( t_n \) of \( n \) iterations and execution time
\( t_{2n} \) of \( 2n \) iterations.
3. Find \( t \) which minimizes deviation of \( (nt, 2nt) \) from
\( (t_n, t_{2n}) \), namely \( t \leftarrow (t_n + 2t_{2n}) / 5n \).
4. If deviation is small enough (see @--stdev@ below)
or time is running out soon (see @--timeout@ below),
return \( t \) as a mean execution time.
5. Otherwise set \( n \leftarrow 2n \) and jump back to Step 2.
This is roughly similar to the linear regression approach which
@criterion@ takes, but we fit only two last points. This allows us to
simplify away all heavy-weight statistical analysis. More importantly,
earlier measurements, which are presumably shorter and noisier, do not
affect overall result. This is in contrast to @criterion@, which fits
all measurements and is biased to use more data points corresponding to
shorter runs (it employs \( n \leftarrow 1.05n \) progression).
Mean time and its deviation do not say much about the
distribution of individual timings. E. g., imagine a computation which
(according to a coarse system timer) takes either 0 ms or 1 ms with equal
probability. While one would be able to establish that its mean time is 0.5 ms
with a very small deviation, this does not imply that individual measurements
are anywhere near 0.5 ms. Even assuming an infinite precision of a system
timer, the distribution of individual times is not known to be
<https://en.wikipedia.org/wiki/Normal_distribution normal>.
Obligatory disclaimer: statistics is a tricky matter, there is no
one-size-fits-all approach. In the absence of a good theory simplistic
approaches are as (un)sound as obscure ones. Those who seek statistical
soundness should rather collect raw data and process it themselves using
a proper statistical toolbox. Data reported by @tasty-bench@ is only of
indicative and comparative significance.
=== Memory usage
Configuring RTS to collect GC statistics
(e. g., via @cabal@ @bench@ @--benchmark-options@ @\'+RTS@ @-T\'@ or
@stack@ @bench@ @--ba@ @\'+RTS@ @-T\'@) enables @tasty-bench@ to estimate and
report memory usage:
> All
> fibonacci numbers
> fifth: OK (2.13s)
> 63 ns ± 3.4 ns, 223 B allocated, 0 B copied, 2.0 MB peak memory
> tenth: OK (1.71s)
> 809 ns ± 73 ns, 2.3 KB allocated, 0 B copied, 4.0 MB peak memory
> twentieth: OK (3.39s)
> 104 μs ± 4.9 μs, 277 KB allocated, 59 B copied, 5.0 MB peak memory
>
> All 3 tests passed (7.25s)
This data is reported as per 'RTSStats' fields: 'allocated_bytes', 'copied_bytes'
and 'max_mem_in_use_bytes'.
=== Combining tests and benchmarks
When optimizing an existing function, it is important to check that its
observable behavior remains unchanged. One can rebuild both tests and
benchmarks after each change, but it would be more convenient to run
sanity checks within benchmark itself. Since our benchmarks are
compatible with @tasty@ tests, we can easily do so.
Imagine you come up with a faster function @myFibo@ to generate
Fibonacci numbers:
> import Test.Tasty.Bench
> import Test.Tasty.QuickCheck -- from tasty-quickcheck package
>
> fibo :: Int -> Integer
> fibo n = if n < 2 then toInteger n else fibo (n - 1) + fibo (n - 2)
>
> myFibo :: Int -> Integer
> myFibo n = if n < 3 then toInteger n else myFibo (n - 1) + myFibo (n - 2)
>
> main :: IO ()
> main = Test.Tasty.Bench.defaultMain -- not Test.Tasty.defaultMain
> [ bench "fibo 20" $ nf fibo 20
> , bench "myFibo 20" $ nf myFibo 20
> , testProperty "myFibo = fibo" $ \n -> fibo n === myFibo n
> ]
This outputs:
> All
> fibo 20: OK (3.02s)
> 104 μs ± 4.9 μs
> myFibo 20: OK (1.99s)
> 71 μs ± 5.3 μs
> myFibo = fibo: FAIL
> *** Failed! Falsified (after 5 tests and 1 shrink):
> 2
> 1 /= 2
> Use --quickcheck-replay=927711 to reproduce.
>
> 1 out of 3 tests failed (5.03s)
We see that @myFibo@ is indeed significantly faster than @fibo@, but
unfortunately does not do the same thing. One should probably look for
another way to speed up generation of Fibonacci numbers.
=== Troubleshooting
- If benchmarks take too long, set @--timeout@ to limit execution time
of individual benchmarks, and @tasty-bench@ will do its best to fit
into a given time frame. Without @--timeout@ we rerun benchmarks until
achieving a target precision set by @--stdev@, which in a noisy
environment of a modern laptop with GUI may take a lot of time.
While @criterion@ runs each benchmark at least for 5 seconds,
@tasty-bench@ is happy to conclude earlier, if it does not
compromise the quality of results. In our experiments @tasty-bench@
suites tend to finish earlier, even if some individual benchmarks
take longer than with @criterion@.
A common source of noisiness is garbage collection. Setting a larger
allocation area (/nursery/) is often a good idea, either via
@cabal@ @bench@ @--benchmark-options@ @\'+RTS@ @-A32m\'@ or
@stack@ @bench@ @--ba@ @\'+RTS@ @-A32m\'@. Alternatively bake it into @cabal@
file as @ghc-options:@ @\"-with-rtsopts=-A32m\"@.
For GHC ≥ 8.10 consider switching benchmarks to a non-moving garbage collector,
because it decreases GC pauses and corresponding noise: @+RTS@ @--nonmoving-gc@.
- Never compile benchmarks with @-fstatic-argument-transformation@, because it
breaks a trick we use to force GHC into reevaluation of the same function application
over and over again.
- If benchmark results look malformed like below, make sure that you
are invoking 'Test.Tasty.Bench.defaultMain' and not
'Test.Tasty.defaultMain' (the difference is 'consoleBenchReporter'
vs. 'consoleTestReporter'):
> All
> fibo 20: OK (1.46s)
> Response {respEstimate = Estimate {estMean = Measurement {measTime = 87496728, measAllocs = 0, measCopied = 0}, estStdev = 694487}, respIfSlower = FailIfSlower Infinity, respIfFaster = FailIfFaster Infinity}
- If benchmarks fail with an error message
> Unhandled resource. Probably a bug in the runner you're using.
or
> Unexpected state of the resource (NotCreated) in getResource. Report as a tasty bug.
this is likely caused by 'env' or 'envWithCleanup' affecting
benchmarks structure. You can use 'env' to read test data from 'IO',
but not to read benchmark names or affect their hierarchy in other
way. This is a fundamental restriction of @tasty@ to list and filter
benchmarks without launching missiles.
- If benchmarks fail with @Test dependencies form a loop@
or @Test dependencies have cycles@, this is likely
because of 'bcompare', which compares a benchmark with itself.
Locating a benchmark in a global environment may be tricky, please refer to
[@tasty@ documentation](https://github.com/UnkindPartition/tasty#patterns) for details
and consider using 'locateBenchmark'.
=== Isolating interfering benchmarks
One difficulty of benchmarking in Haskell is that it is hard to isolate
benchmarks so that they do not interfere. Changing the order of
benchmarks or skipping some of them has an effect on heap’s layout and
thus affects garbage collection. This issue is well attested in
<https://github.com/haskell/criterion/issues/166 both>
<https://github.com/haskell/criterion/issues/60 criterion> and
<https://github.com/vincenthz/hs-gauge/issues/2 gauge>.
Usually (but not always) skipping some benchmarks speeds up remaining
ones. That’s because once a benchmark allocated heap which for some
reason was not promptly released afterwards (e. g., it forced a
top-level thunk in an underlying library), all further benchmarks are
slowed down by garbage collector processing this additional amount of
live data over and over again.
There are several mitigation strategies. First of all, giving garbage
collector more breathing space by @+RTS@ @-A32m@ (or more) is often good
enough.
Further, avoid using top-level bindings to store large test data. Once
such thunks are forced, they remain allocated forever, which affects
detrimentally subsequent unrelated benchmarks. Treat them as external
data, supplied via 'env': instead of
> largeData :: String
> largeData = replicate 1000000 'a'
>
> main :: IO ()
> main = defaultMain
> [ bench "large" $ nf length largeData, ... ]
use
> import Control.DeepSeq (force)
> import Control.Exception (evaluate)
>
> main :: IO ()
> main = defaultMain
> [ env (evaluate (force (replicate 1000000 'a'))) $ \largeData ->
> bench "large" $ nf length largeData, ... ]
Finally, as an ultimate measure to reduce interference between
benchmarks, one can run each of them in a separate process. We do not
quite recommend this approach, but if you are desperate, here is how:
> cabal run -v0 all:benches -- -l | sed -e 's/[\"]/\\\\\\&/g' | while read -r name; do cabal run -v0 all:benches -- -p '$0 == "'"$name"'"'; done
This assumes that there is a single benchmark suite in the project
and that benchmark names do not contain newlines.
=== Comparison against baseline
One can compare benchmark results against an earlier run in an automatic way.
When using this feature, it's especially important to compile benchmarks with
@ghc-options:@ [@-fproc-alignment@](https://downloads.haskell.org/ghc/latest/docs/users_guide/debugging.html#ghc-flag--fproc-alignment)@=64@, otherwise results could be skewed by
intermittent changes in cache-line alignment.
Firstly, run @tasty-bench@ with
@--csv@ @FILE@ key to dump results to @FILE@ in CSV format
(it could be a good idea to set smaller @--stdev@, if possible):
> Name,Mean (ps),2*Stdev (ps)
> All.fibonacci numbers.fifth,48453,4060
> All.fibonacci numbers.tenth,637152,46744
> All.fibonacci numbers.twentieth,81369531,3342646
Now modify implementation and rerun benchmarks with @--baseline@ @FILE@
key. This produces a report as follows:
> All
> fibonacci numbers
> fifth: OK (0.44s)
> 53 ns ± 2.7 ns, 8% more than baseline
> tenth: OK (0.33s)
> 641 ns ± 59 ns, same as baseline
> twentieth: OK (0.36s)
> 77 μs ± 6.4 μs, 5% less than baseline
>
> All 3 tests passed (1.50s)
You can also fail benchmarks, which deviate too far from baseline, using
@--fail-if-slower@ and @--fail-if-faster@ options. For example, setting
both of them to 6 will fail the first benchmark above (because it is
more than 6% slower), but the last one still succeeds (even while it is
measurably faster than baseline, deviation is less than 6%). Consider
also using @--hide-successes@ to show only problematic benchmarks, or
even [@tasty-rerun@](http://hackage.haskell.org/package/tasty-rerun)
package to focus on rerunning failing items only.
If you wish to compare two CSV reports non-interactively, here is a handy @awk@ incantation:
> awk 'BEGIN{FS=",";OFS=",";print "Name,Old,New,Ratio"}FNR==1{trueNF=NF;next}NF<trueNF{print "Benchmark names should not contain newlines";exit 1}FNR==NR{oldTime=$(NF-trueNF+2);NF-=trueNF-1;a[$0]=oldTime;next}{newTime=$(NF-trueNF+2);NF-=trueNF-1;print $0,a[$0],newTime,newTime/a[$0];gs+=log(newTime/a[$0]);gc++}END{if(gc>0)print "Geometric mean,,",exp(gs/gc)}' old.csv new.csv
Note that columns in CSV report are different from what @criterion@ or @gauge@
would produce. If names do not contain commas, missing columns can be faked this way:
> awk 'BEGIN{FS=",";OFS=",";print "Name,Mean,MeanLB,MeanUB,Stddev,StddevLB,StddevUB"}NR==1{trueNF=NF;next}NF<trueNF{print $0;next}{mean=$(NF-trueNF+2);stddev=$(NF-trueNF+3);NF-=trueNF-1;print $0,mean/1e12,mean/1e12,mean/1e12,stddev/2e12,stddev/2e12,stddev/2e12}'
To fake @gauge@ in @--csvraw@ mode use
> awk 'BEGIN{FS=",";OFS=",";print "name,iters,time,cycles,cpuTime,utime,stime,maxrss,minflt,majflt,nvcsw,nivcsw,allocated,numGcs,bytesCopied,mutatorWallSeconds,mutatorCpuSeconds,gcWallSeconds,gcCpuSeconds"}NR==1{trueNF=NF;next}NF<trueNF{print $0;next}{mean=$(NF-trueNF+2);fourth=$(NF-trueNF+4);fifth=$(NF-trueNF+5);sixth=$(NF-trueNF+6);NF-=trueNF-1;print $0,1,mean/1e12,0,mean/1e12,mean/1e12,0,sixth+0,0,0,0,0,fourth+0,0,fifth+0,0,0,0,0}'
=== Comparison between benchmarks
You can also compare benchmarks to each other without any
external tools, all in the comfort of your terminal.
> import Test.Tasty.Bench
>
> fibo :: Int -> Integer
> fibo n = if n < 2 then toInteger n else fibo (n - 1) + fibo (n - 2)
>
> main :: IO ()
> main = defaultMain
> [ bgroup "fibonacci numbers"
> [ bcompare "tenth" $ bench "fifth" $ nf fibo 5
> , bench "tenth" $ nf fibo 10
> , bcompare "tenth" $ bench "twentieth" $ nf fibo 20
> ]
> ]
This produces a report, comparing mean times of @fifth@ and @twentieth@
to @tenth@:
> All
> fibonacci numbers
> fifth: OK (16.56s)
> 121 ns ± 2.6 ns, 0.08x
> tenth: OK (6.84s)
> 1.6 μs ± 31 ns
> twentieth: OK (6.96s)
> 203 μs ± 4.1 μs, 128.36x
To locate a baseline benchmark in a larger suite use 'locateBenchmark'.
One can leverage comparisons between benchmarks to implement portable performance
tests, expressing properties like "this algorithm must be at least twice faster
than that one" or "this operation should not be more than thrice slower than that".
This can be achieved with 'bcompareWithin', which takes an acceptable interval
of performance as an argument.
=== Plotting results
Users can dump results into CSV with @--csv@ @FILE@ and plot them using
@gnuplot@ or other software. But for convenience there is also a
built-in quick-and-dirty SVG plotting feature, which can be invoked by
passing @--svg@ @FILE@. Here is a sample of its output:
![Plotting](example.svg)
=== Build flags
Build flags are a brittle subject and users do not normally need to touch them.
* If you find yourself in an environment, where @tasty@ is not available and you
have access to boot packages only, you can still use @tasty-bench@! Just copy
@Test\/Tasty\/Bench.hs@ to your project (imagine it like a header-only C library).
It will provide you with functions to build 'Benchmarkable' and run them manually
via 'measureCpuTime'. This mode of operation can be also configured
by disabling Cabal flag @tasty@.
* If results are amiss or oscillate wildly and adjusting @--timeout@ and @--stdev@
does not help, you may be interested to investigate individual timings of
successive runs by enabling Cabal flag @debug@. This will pipe raw data into @stderr@.
=== Command-line options
Use @--help@ to list command-line options.
[@-p@, @--pattern@]:
This is a standard @tasty@ option, which allows filtering benchmarks
by a pattern or @awk@ expression. Please refer
to [@tasty@ documentation](https://github.com/UnkindPartition/tasty#patterns)
for details.
[@-t@, @--timeout@]:
This is a standard @tasty@ option, setting timeout for individual
benchmarks in seconds. Use it when benchmarks tend to take too long:
@tasty-bench@ will make an effort to report results (even if of
subpar quality) before timeout. Setting timeout too tight
(insufficient for at least three iterations) will result in a
benchmark failure. One can adjust it locally for a group
of benchmarks, e. g., 'localOption' ('mkTimeout' 100000000) for 100 seconds.
[@--stdev@]:
Target relative standard deviation of measurements in percents (5%
by default). Large values correspond to fast and loose benchmarks,
and small ones to long and precise.
It can also be adjusted locally for a group of benchmarks,
e. g., 'localOption' ('RelStDev' 0.02).
If benchmarking takes far too long, consider setting @--timeout@,
which will interrupt benchmarks,
potentially before reaching the target deviation.
[@--csv@]:
File to write results in CSV format.
[@--baseline@]:
File to read baseline results in CSV format (as produced by
@--csv@).
[@--fail-if-slower@, @--fail-if-faster@]:
Upper bounds of acceptable slow down \/ speed up in percents. If a
benchmark is unacceptably slower \/ faster than baseline (see
@--baseline@), it will be reported as failed. Can be used in
conjunction with a standard @tasty@ option @--hide-successes@ to
show only problematic benchmarks.
Both options can be adjusted locally for a group of benchmarks,
e. g., 'localOption' ('FailIfSlower' 0.10).
[@--svg@]:
File to plot results in SVG format.
[@--time-mode@]:
Whether to measure CPU time (@cpu@, default) or wall-clock time (@wall@).
[@+RTS@ @-T@]:
Estimate and report memory usage.
=== Custom command-line options
As usual with @tasty@, it is easy to extend benchmarks with custom command-line options.
Here is an example:
> import Data.Proxy
> import Test.Tasty.Bench
> import Test.Tasty.Ingredients.Basic
> import Test.Tasty.Options
> import Test.Tasty.Runners
>
> newtype RandomSeed = RandomSeed Int
>
> instance IsOption RandomSeed where
> defaultValue = RandomSeed 42
> parseValue = fmap RandomSeed . safeRead
> optionName = pure "seed"
> optionHelp = pure "Random seed used in benchmarks"
>
> main :: IO ()
> main = do
> let customOpts = [Option (Proxy :: Proxy RandomSeed)]
> ingredients = includingOptions customOpts : benchIngredients
> opts <- parseOptions ingredients benchmarks
> let RandomSeed seed = lookupOption opts
> defaultMainWithIngredients ingredients benchmarks
>
> benchmarks :: Benchmark
> benchmarks = bgroup "All" []
-}
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE CPP #-}
{-# LANGUAGE DeriveDataTypeable #-}
{-# LANGUAGE DeriveFunctor #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TupleSections #-}
module Test.Tasty.Bench
(
#ifdef MIN_VERSION_tasty
-- * Running 'Benchmark'
defaultMain
, Benchmark
, bench
, bgroup
, bcompare
, bcompareWithin
, env
, envWithCleanup
,
#endif
-- * Creating 'Benchmarkable'
Benchmarkable(..)
, nf
, whnf
, nfIO
, whnfIO
, nfAppIO
, whnfAppIO
, measureCpuTime
#ifdef MIN_VERSION_tasty
-- * Ingredients
, benchIngredients
, consoleBenchReporter
, csvReporter
, svgReporter
, RelStDev(..)
, FailIfSlower(..)
, FailIfFaster(..)
, CsvPath(..)
, BaselinePath(..)
, SvgPath(..)
, TimeMode(..)
-- * Utils
, locateBenchmark
, mapLeafBenchmarks
#else
, Timeout(..)
, RelStDev(..)
#endif
) where
import Prelude hiding (Int, Integer)
import qualified Prelude
import Control.Applicative
import Control.Arrow (first, second)
import Control.DeepSeq (NFData, force)
import Control.Exception (bracket, evaluate)
import Control.Monad (void, unless, guard, (>=>), when)
import Data.Data (Typeable)
import Data.Foldable (foldMap, traverse_)
import Data.Int (Int64)
import Data.IORef
import Data.List (intercalate, stripPrefix, isPrefixOf, genericLength, genericDrop, foldl1')
import Data.Maybe (fromMaybe)
import Data.Monoid (All(..), Any(..))
import Data.Proxy
import Data.Traversable (forM)
import Data.Word (Word64)
import GHC.Conc
#if MIN_VERSION_base(4,5,0)
import GHC.IO.Encoding
#endif
#if MIN_VERSION_base(4,6,0)
import GHC.Stats
#endif
import System.CPUTime
import System.Exit
import System.IO
import System.IO.Unsafe
import System.Mem
import Text.Printf
#ifdef DEBUG
import Debug.Trace
#endif
#ifdef MIN_VERSION_tasty
#if !MIN_VERSION_base(4,8,0)
import Data.Monoid (Monoid(..))
#endif
#if MIN_VERSION_base(4,9,0)
import Data.Semigroup (Semigroup(..))
#endif
#if MIN_VERSION_containers(0,5,0)
import qualified Data.IntMap.Strict as IM
#else
import qualified Data.IntMap as IM
#endif
import Data.IntMap (IntMap)
import Data.Sequence (Seq, (<|))
import qualified Data.Sequence as Seq
import qualified Data.Set as S
import Test.Tasty hiding (defaultMain)
import qualified Test.Tasty
import Test.Tasty.Ingredients
import Test.Tasty.Ingredients.ConsoleReporter
import Test.Tasty.Options
import Test.Tasty.Patterns.Eval (eval, asB, withFields)
import Test.Tasty.Patterns.Types (Expr (And, Field, IntLit, NF, StringLit, Sub))
import qualified Test.Tasty.Patterns.Types as Patterns
import Test.Tasty.Providers
import Test.Tasty.Runners
#endif
#if defined(mingw32_HOST_OS)
import Data.Word (Word32)
#endif
#if MIN_VERSION_ghc_prim(0,3,1)
import GHC.Types (SPEC(..))
#else
import GHC.Exts (SpecConstrAnnotation(..))
-- Fallback for @ghc-prim@ < 0.3.1, which lacks 'GHC.Types.SPEC':
-- a local stand-in annotated with 'ForceSpecConstr' so the SpecConstr
-- optimisation still fires where this type is threaded through loops.
data SPEC = SPEC | SPEC2
{-# ANN type SPEC ForceSpecConstr #-}
#endif
#ifndef MIN_VERSION_tasty
-- | Stand-in for tasty's @Timeout@, compiled only when the @tasty@
-- Cabal flag is disabled and this module is used standalone
-- (see the "Build flags" section of the module documentation).
data Timeout
  = Timeout
      Prelude.Integer -- ^ number of microseconds (e. g., 200000)
      String          -- ^ textual representation (e. g., @"0.2s"@)
  | NoTimeout         -- ^ run without any time limit
  deriving (Show)
#endif
-- | In addition to @--stdev@ command-line option,
-- one can adjust target relative standard deviation
-- for individual benchmarks and groups of benchmarks
-- using 'adjustOption' and 'localOption'.
--
-- E. g., set target relative standard deviation to 2% as follows:
--
-- > import Test.Tasty (localOption)
-- > localOption (RelStDev 0.02) (bgroup [...])
--
-- If you set 'RelStDev' to infinity,
-- a benchmark will be executed
-- only once and its standard deviation will be recorded as zero.
-- This is rather a blunt approach, but it might be a necessary evil
-- for extremely long benchmarks. If you wish to run all benchmarks
-- only once, use command-line option @--stdev@ @Infinity@.
--
-- @since 0.2
newtype RelStDev = RelStDev Double
  deriving (Show, Read, Typeable)
-- | Whether to measure CPU time or wall-clock time.
-- Normally 'CpuTime' is a better option (and default),
-- but consider switching to 'WallTime'
-- to measure multithreaded algorithms or time spent in external processes.
--
-- One can switch the default measurement mode globally
-- using @--time-mode@ command-line option,
-- but it is usually better to adjust the mode locally:
--
-- > import Test.Tasty (localOption)
-- > localOption WallTime (bgroup [...])
--
-- @since 0.3.2
data TimeMode
  = CpuTime
  -- ^ Measure CPU time.
#ifdef MIN_VERSION_tasty
  | WallTime
  -- ^ Measure wall-clock time.
  -- Available only when the @tasty@ Cabal flag is enabled.
#endif
  deriving (Typeable)
#ifdef MIN_VERSION_tasty
-- | Command-line handling for the @--stdev@ option.
instance IsOption RelStDev where
  -- Target 5% relative standard deviation unless overridden.
  defaultValue = RelStDev 0.05
  -- Accepts a positive number of percents; anything else is rejected.
  parseValue s = RelStDev <$> parsePositivePercents s
  optionName = pure "stdev"
  optionHelp = pure "Target relative standard deviation of measurements in percents (5 by default). Large values correspond to fast and loose benchmarks, and small ones to long and precise. If it takes far too long, consider setting --timeout, which will interrupt benchmarks, potentially before reaching the target deviation."
-- | In addition to @--fail-if-slower@ command-line option,
-- one can adjust an upper bound of acceptable slow down
-- in comparison to baseline for
-- individual benchmarks and groups of benchmarks
-- using 'adjustOption' and 'localOption'.
--
-- E. g., set upper bound of acceptable slow down to 10% as follows:
--
-- > import Test.Tasty (localOption)
-- > localOption (FailIfSlower 0.10) (bgroup [...])
--
-- @since 0.2
newtype FailIfSlower = FailIfSlower Double -- ^ acceptable slowdown versus baseline as a fraction (e. g., 0.10 for 10%)
  deriving (Show, Read, Typeable)
-- | Command-line handling for the @--fail-if-slower@ option.
instance IsOption FailIfSlower where
  -- Infinite bound: by default no slowdown is reported as failure.
  defaultValue = FailIfSlower (1.0 / 0.0)
  parseValue s = FailIfSlower <$> parsePositivePercents s
  optionName = pure "fail-if-slower"
  optionHelp = pure "Upper bound of acceptable slow down in percents. If a benchmark is unacceptably slower than baseline (see --baseline), it will be reported as failed."
-- | In addition to @--fail-if-faster@ command-line option,
-- one can adjust an upper bound of acceptable speed up
-- in comparison to baseline for
-- individual benchmarks and groups of benchmarks
-- using 'adjustOption' and 'localOption'.
--
-- E. g., set upper bound of acceptable speed up to 10% as follows:
--
-- > import Test.Tasty (localOption)
-- > localOption (FailIfFaster 0.10) (bgroup [...])
--
-- @since 0.2
newtype FailIfFaster = FailIfFaster Double -- ^ acceptable speedup versus baseline as a fraction (e. g., 0.10 for 10%)
  deriving (Show, Read, Typeable)
-- | Command-line handling for the @--fail-if-faster@ option.
instance IsOption FailIfFaster where
  -- Infinite bound: by default no speedup is reported as failure.
  defaultValue = FailIfFaster (1.0 / 0.0)
  parseValue s = FailIfFaster <$> parsePositivePercents s
  optionName = pure "fail-if-faster"
  optionHelp = pure "Upper bound of acceptable speed up in percents. If a benchmark is unacceptably faster than baseline (see --baseline), it will be reported as failed."
-- | Parse a strictly positive percentage (e. g., @"5"@) into a fraction
-- (0.05). Returns 'Nothing' for unreadable, zero or negative input.
parsePositivePercents :: String -> Maybe Double
parsePositivePercents str =
  case safeRead str of
    Just p | p > 0 -> Just (p / 100)
    _ -> Nothing
-- | Command-line handling for the @--time-mode@ option.
instance IsOption TimeMode where
  defaultValue = CpuTime
  -- Only the exact strings "cpu" and "wall" are recognised.
  parseValue v = case v of
    "cpu" -> Just CpuTime
    "wall" -> Just WallTime
    _ -> Nothing
  optionName = pure "time-mode"
  optionHelp = pure "Whether to measure CPU time (\"cpu\") or wall-clock time (\"wall\")"
#if MIN_VERSION_tasty(1,3,0)
  -- 'showDefaultValue' only exists in tasty >= 1.3, hence the guard.
  showDefaultValue m = Just $ case m of
    CpuTime -> "cpu"
    WallTime -> "wall"
#endif
#endif
-- | Something that can be benchmarked, produced by 'nf', 'whnf', 'nfIO', 'whnfIO',
-- 'nfAppIO', 'whnfAppIO' below.
--
-- Drop-in replacement for @Criterion.@'Criterion.Benchmarkable' and
-- @Gauge.@'Gauge.Benchmarkable'.
--
-- @since 0.1
newtype Benchmarkable =
  -- | @since 0.3
  Benchmarkable
  -- The wrapped action receives the desired number of iterations and is
  -- responsible for running the whole batch itself.
  { unBenchmarkable :: Word64 -> IO () -- ^ Run benchmark given number of times.
  } deriving (Typeable)
#ifdef MIN_VERSION_tasty
-- | 'defaultMain' forces 'setLocaleEncoding' to 'utf8', but users might
-- be running benchmarks outside of it (e. g., via 'defaultMainWithIngredients').
--
-- A locale whose 'textEncodingName' starts with @"UTF"@ is treated as
-- Unicode-capable; on Windows the console output code page must also be
-- 65001 (UTF-8).
supportsUnicode :: Bool
#if MIN_VERSION_base(4,5,0)
supportsUnicode = take 3 (textEncodingName enc) == "UTF"
#if defined(mingw32_HOST_OS)
  && unsafePerformIO getConsoleOutputCP == 65001
#endif
  where
    enc = unsafePerformIO getLocaleEncoding
#else
supportsUnicode = False
#endif
-- NOINLINE so the 'unsafePerformIO' locale lookup is performed at most
-- once and its result is shared.
{-# NOINLINE supportsUnicode #-}
-- | Micro sign for time units when the terminal can render Unicode,
-- plain @u@ otherwise.
mu :: Char
mu
  | supportsUnicode = 'μ'
  | otherwise = 'u'
-- | Plus-minus separator between mean and spread, with an ASCII fallback.
pm :: String
pm
  | supportsUnicode = " ± "
  | otherwise = " +-"
-- | Render a duration given in picoseconds, choosing a unit so that the
-- numeric part fits into 3 characters.
showPicos3 :: Word64 -> String
showPicos3 psWord = render (word64ToDouble psWord)
  where
    -- Thresholds are 995 * 10^k: the largest values that still round
    -- down within the chosen printf width.
    render ps
      | ps < 995 = printf "%3.0f ps" ps
      | ps < 995e1 = printf "%3.1f ns" (ps / 1e3)
      | ps < 995e3 = printf "%3.0f ns" (ps / 1e3)
      | ps < 995e4 = printf "%3.1f %cs" (ps / 1e6) mu
      | ps < 995e6 = printf "%3.0f %cs" (ps / 1e6) mu
      | ps < 995e7 = printf "%3.1f ms" (ps / 1e9)
      | ps < 995e9 = printf "%3.0f ms" (ps / 1e9)
      | otherwise = printf "%4.2f s" (ps / 1e12)
-- | Render a duration given in picoseconds, choosing a unit so that the
-- numeric part fits into 4 characters.
showPicos4 :: Word64 -> String
showPicos4 psWord = render (word64ToDouble psWord)
  where
    -- Thresholds are 995 * 10^k: the largest values that still round
    -- down within the chosen printf width.
    render ps
      | ps < 995 = printf "%3.0f ps" ps
      | ps < 995e1 = printf "%4.2f ns" (ps / 1e3)
      | ps < 995e2 = printf "%4.1f ns" (ps / 1e3)
      | ps < 995e3 = printf "%3.0f ns" (ps / 1e3)
      | ps < 995e4 = printf "%4.2f %cs" (ps / 1e6) mu
      | ps < 995e5 = printf "%4.1f %cs" (ps / 1e6) mu
      | ps < 995e6 = printf "%3.0f %cs" (ps / 1e6) mu
      | ps < 995e7 = printf "%4.2f ms" (ps / 1e9)
      | ps < 995e8 = printf "%4.1f ms" (ps / 1e9)
      | ps < 995e9 = printf "%3.0f ms" (ps / 1e9)
      | otherwise = printf "%4.3f s" (ps / 1e12)
-- | Show a byte count in human-readable form, using binary (1024-based)
-- units and fitting the numeric part into 3 characters.
-- The odd-looking cut-offs (10189, 1023488, ...) are presumably the
-- largest values whose rounded output still fits the printf width
-- (e. g., 10188 / 1024 prints as 9.9, while 10189 would round to 10.0)
-- — TODO confirm.
showBytes :: Word64 -> String
showBytes i
  | t < 1000 = printf "%3.0f B " t
  | t < 10189 = printf "%3.1f KB" (t / 1024)
  | t < 1023488 = printf "%3.0f KB" (t / 1024)
  | t < 10433332 = printf "%3.1f MB" (t / 1048576)
  | t < 1048051712 = printf "%3.0f MB" (t / 1048576)
  | t < 10683731149 = printf "%3.1f GB" (t / 1073741824)
  | t < 1073204953088 = printf "%3.0f GB" (t / 1073741824)
  | t < 10940140696372 = printf "%3.1f TB" (t / 1099511627776)
  | t < 1098961871962112 = printf "%3.0f TB" (t / 1099511627776)
  | t < 11202704073084108 = printf "%3.1f PB" (t / 1125899906842624)
  | t < 1125336956889202624 = printf "%3.0f PB" (t / 1125899906842624)
  | t < 11471568970838126592 = printf "%3.1f EB" (t / 1152921504606846976)
  | otherwise = printf "%3.0f EB" (t / 1152921504606846976)
  where
    t = word64ToDouble i
#endif
-- | Raw results of measuring one batch of benchmark runs.
data Measurement = Measurement
  { measTime :: !Word64 -- ^ time in picoseconds
  , measAllocs :: !Word64 -- ^ allocations in bytes
  , measCopied :: !Word64 -- ^ copied bytes
  , measMaxMem :: !Word64 -- ^ max memory in use
  } deriving (Show, Read)
-- | A measurement together with the estimated spread of its timing.
data Estimate = Estimate
  { estMean :: !Measurement -- ^ estimated mean measurement
  , estStdev :: !Word64 -- ^ stdev in picoseconds
  } deriving (Show, Read)
#ifdef MIN_VERSION_tasty
-- | A payload annotated with lower and upper acceptance bounds;
-- the bound comments suggest these carry the @--fail-if-faster@ /
-- @--fail-if-slower@ limits as ratios versus baseline.
data WithLoHi a = WithLoHi
  !a -- payload
  !Double -- lower bound (e. g., 0.9 for -10% speedup)
  !Double -- upper bound (e. g., 1.2 for +20% slowdown)
  deriving (Show, Read)
-- | Render an estimate as, e. g., @12.3 ms ± 456 μs@; the spread shown
-- is two standard deviations. When stdev is zero, padding keeps the
-- column width stable.
prettyEstimate :: Estimate -> String
prettyEstimate (Estimate m stdev) =
  showPicos4 (measTime m)
  ++ (if stdev == 0 then "         " else pm ++ showPicos3 (2 * stdev))
-- | Like 'prettyEstimate', but additionally reports GC statistics
-- (allocated, copied, peak memory) from the measurement.
prettyEstimateWithGC :: Estimate -> String
prettyEstimateWithGC (Estimate m stdev) =
  showPicos4 (measTime m)
  ++ (if stdev == 0 then ", " else pm ++ showPicos3 (2 * stdev) ++ ", ")
  ++ showBytes (measAllocs m) ++ " allocated, "
  ++ showBytes (measCopied m) ++ " copied, "
  ++ showBytes (measMaxMem m) ++ " peak memory"
-- | Render an estimate as two CSV fields: mean time and twice the
-- standard deviation, both in picoseconds.
csvEstimate :: Estimate -> String
csvEstimate (Estimate m stdev) =
  concat [show (measTime m), ",", show (2 * stdev)]
-- | Like 'csvEstimate', but with three extra CSV fields: allocated
-- bytes, copied bytes and peak memory in use.
csvEstimateWithGC :: Estimate -> String
csvEstimateWithGC (Estimate m stdev) = concat
  [ show (measTime m)
  , ",", show (2 * stdev)
  , ",", show (measAllocs m)
  , ",", show (measCopied m)
  , ",", show (measMaxMem m)
  ]
#endif
-- | Given a measurement of one run and a measurement of two consecutive
-- runs, estimate the cost of a single run (assuming cost is linear in
-- the number of runs) together with a standard deviation of the estimate.
predict
  :: Measurement -- ^ time for one run
  -> Measurement -- ^ time for two runs
  -> Estimate
predict (Measurement t1 a1 c1 m1) (Measurement t2 a2 c2 m2) = Estimate
  { estMean = Measurement t (fit a1 a2) (fit c1 c2) (max m1 m2)
  , estStdev = truncate (sqrt d :: Double)
  }
  where
    -- Least-squares fit of y = k * n through (1, x1) and (2, x2):
    -- k = (x1 + 2 * x2) / 5. Dividing each term by 5 separately
    -- presumably sidesteps Word64 overflow for huge inputs, at the
    -- cost of slightly different rounding — TODO confirm intent.
    fit x1 x2 = x1 `quot` 5 + 2 * (x2 `quot` 5)
    t = fit t1 t2
    sqr x = x * x
    -- Sum of squared residuals of the fit, used as the variance.
    d = sqr (word64ToDouble t1 - word64ToDouble t)
      + sqr (word64ToDouble t2 - 2 * word64ToDouble t)
-- | Like 'predict', but accounts for the finite precision of the clock:
-- each input time is perturbed by the timer precision in both directions
-- and the worst-case standard deviation of the two fits is taken.
predictPerturbed :: Measurement -> Measurement -> Estimate
predictPerturbed t1 t2 = Estimate
  { estMean = estMean (predict t1 t2)
  , estStdev = max
    (estStdev (predict (lo t1) (hi t2)))
    (estStdev (predict (hi t1) (lo t2)))
  }
  where
    -- Clock precision in picoseconds, but never less than 1 ms.
    prec = max (fromInteger cpuTimePrecision) 1000000000 -- 1 ms
    hi meas = meas { measTime = measTime meas + prec }
    -- NOTE(review): when measTime meas < prec this Word64 subtraction
    -- wraps around, yielding an enormous perturbed time and stdev;
    -- presumably benign (it just forces more iterations) — confirm.
    lo meas = meas { measTime = measTime meas - prec }
-- | Whether the RTS provides GC statistics (requires running with
-- @+RTS -T@); queried once via 'unsafePerformIO' using whichever
-- API the @base@ version offers.
hasGCStats :: Bool
#if MIN_VERSION_base(4,10,0)
hasGCStats = unsafePerformIO getRTSStatsEnabled
#elif MIN_VERSION_base(4,6,0)
hasGCStats = unsafePerformIO getGCStatsEnabled
#else
hasGCStats = False
#endif
-- | Read (allocated bytes, copied bytes, max memory in use) from the
-- RTS, or zeros when GC statistics are unavailable (see 'hasGCStats').
getAllocsAndCopied :: IO (Word64, Word64, Word64)
getAllocsAndCopied = do
  if not hasGCStats then pure (0, 0, 0) else
#if MIN_VERSION_base(4,10,0)
    (\s -> (allocated_bytes s, copied_bytes s, max_mem_in_use_bytes s)) <$> getRTSStats
#elif MIN_VERSION_base(4,6,0)
    -- Legacy GCStats API: fields are Int64 and peak memory is reported
    -- in megabytes, hence the conversions below.
    (\s -> (int64ToWord64 $ bytesAllocated s, int64ToWord64 $ bytesCopied s, int64ToWord64 $ peakMegabytesAllocated s * 1024 * 1024)) <$> getGCStats
#else
    pure (0, 0, 0)
#endif