Implement floating point conversion with ryu (haskell#365)

* Implement floating point conversion with ryu * Use manual strictness - Strict extension is from ghc >= 8.0.1 * Use checked shifts - ghc 9.0.1 doesn't shiftL on negative unsafeShiftR and this is more straightforward regardless * Use builtin float-to-word conversion functions * Use builtin conversion to Bool * Remove dependency on array package * Handle non-exhaustive patterns * Try using prim conversions directly * Revert "Try using prim conversions directly" This reverts commit 10809d3. * Dispatch to slow cast when builtin unavailable - GHC.Float exports them starting from base-4.10.0.0 which starts with with ghc 8.2.1 * Try bumping min version to 8.4.x * Fix log10pow5 approximation and add unit test - expose RealFloat.Internal so that tests can import and verify that the approximations match in the expected ranges * Re-export floatDec and doubleDec to maintain public API * Improve documentation and fixes for initial code review - make double-polymorphic functions singly-polymorphic for clarity - use canonical Control.Arrow.first - name boolean values in special string formatting and add explanation - explain magic numbers in logarithm approximations and reciprocal multiplication - other misc comments * Improve table generation documentation and clean-up - add general overview of floating point conversion algorithm and idea behind ryu algorithm - remove unused Num Word128 instance * Improve documentation of f2s and d2s and cleanup - deduplicate shortest and rounding handling - add some comments and explanations of algorithm steps inline * Use stricter integral types and annotate fromIntegral usages - a closer inspection of `fromIntegral` usages shows that a lot of that conversion scaffolding is unnecessary if types are chose correctly - also fixes a delayed to-signed-int conversion that caused unsigned wraparound on subtraction * Add module descriptions and fix typos * Use internal FloatFormat instead of GHC.Float.FFFormat - avoids dependency especially while we don't actually support the full API of GHC.Float.formatRealFloat * Use monomorphic helpers for remaining integral conversions used by RealFloat * Remove usage of TemplateHaskell in RealFloat * Fix LUT usage on big-endian systems - Do a runtime byteswap. bswap64 is ~1 cycle on most architectures - NB: multiline string literals are parsed differently with language CPP * Add header for endianness detection * Fix big-endian word16 packing in fast digit formatting * Fix big-endian word128 read from raw addr * Clean up unused functions - finv and fnorm kept as comments for reference to table computation * Fix incorrect reciprocal function usage - Doesn't actually affect correctness vs show since round-even is not implemented (acceptBounds is always False) - Adds a couple explorative test cases and a comment anyways * Add more test coverage and fix doc example - Fixed-format fixed-precision tests - Re-exports TableGenerator constants to allow sanity checks for computed bit-range constants * Use quickcheck equality property in tests * Format haddock headers more similarly to existing ones * Use simpler reciprocal math for 32-bit words - Clarity and marginal performance improvement * Use boxed arithmetic in logic flow - more portable wrt internal ghc prim and 32- vs 64-bit - more readable (less syntax cruft in hashes and verbose operation names) - not much of a performance difference measured * Support ghc 9.2 prim word changes - Data.Word wraps fixed-sized word types - array operations now use fixed-sized word types * Fix 32-bit support - Removes most of the raw Word# usage to facilitate support of fixed-size sub-word types and 32-bit systems. Benchmarking shows little difference (<5%) - Implements manual multiplication of 64-bit words for 32-bit systems * Skip conversion to Double before fixed Float formatting - otherwise produces unnecessarily long results since imprecise representations get much more significance with Double precision * Tweak doc wording and add examples - per sjakobi suggestions * Rename FExponent to FScientific - More intuitive name for scientific notation since we're moving away from GHC.Float.FFFormat anyway * Use an opaque FloatFormat type for compatibility - while precision handling is not fully implemented * Rename float fixed-format to standard-format and other naming tweaks - `fixed` was confusing terminology adopted from FFFormat - `FormatFloat'` -> `FormatMode` - some doc tweaks * Encourage inlining by removing partial application * Fix some haddock links and accidental monospacing * Add explanation about difference between implementation and reference paper * Clarify default precision * Point to ryu paper for more details * Fix non-exhaustive warning for ghc 9.2 - add redundant pattern matching. now symmetrical with the e >= 0 case
Bodigrim · Dec 4, 2021 · 5db3a78 · 5db3a78
1 parent fac2b09
commit 5db3a78
Show file tree

Hide file tree

Showing 9 changed files with 2,548 additions and 31 deletions.
diff --git a/Data/ByteString/Builder.hs b/Data/ByteString/Builder.hs
@@ -252,6 +252,7 @@ module Data.ByteString.Builder
     , stringUtf8
 
     , module Data.ByteString.Builder.ASCII
+    , module Data.ByteString.Builder.RealFloat
 
     ) where
 
@@ -261,6 +262,7 @@ import           Data.ByteString.Builder.Internal
 import qualified Data.ByteString.Builder.Prim  as P
 import qualified Data.ByteString.Lazy.Internal as L
 import           Data.ByteString.Builder.ASCII
+import           Data.ByteString.Builder.RealFloat
 
 import           Data.String (IsString(..))
 import           System.IO (Handle, IOMode(..), withBinaryFile)

diff --git a/Data/ByteString/Builder/ASCII.hs b/Data/ByteString/Builder/ASCII.hs
@@ -81,6 +81,7 @@ import           Data.ByteString.Lazy                           as L
 import           Data.ByteString.Builder.Internal (Builder)
 import qualified Data.ByteString.Builder.Prim                   as P
 import qualified Data.ByteString.Builder.Prim.Internal          as P
+import           Data.ByteString.Builder.RealFloat (floatDec, doubleDec)
 
 import           Foreign
 import           Foreign.C.Types
@@ -89,16 +90,6 @@ import           Foreign.C.Types
 -- Decimal Encoding
 ------------------------------------------------------------------------------
 
-
--- | Encode a 'String' using 'P.char7'.
-{-# INLINE string7 #-}
-string7 :: String -> Builder
-string7 = P.primMapListFixed P.char7
-
-------------------------------------------------------------------------------
--- Decimal Encoding
-------------------------------------------------------------------------------
-
 -- Signed integers
 ------------------
 
@@ -163,22 +154,6 @@ wordDec :: Word -> Builder
 wordDec = P.primBounded P.wordDec
 
 
--- Floating point numbers
--------------------------
-
--- TODO: Use Bryan O'Sullivan's double-conversion package to speed it up.
-
--- | /Currently slow./ Decimal encoding of an IEEE 'Float'.
-{-# INLINE floatDec #-}
-floatDec :: Float -> Builder
-floatDec = string7 . show
-
--- | /Currently slow./ Decimal encoding of an IEEE 'Double'.
-{-# INLINE doubleDec #-}
-doubleDec :: Double -> Builder
-doubleDec = string7 . show
-
-
 ------------------------------------------------------------------------------
 -- Hexadecimal Encoding
 ------------------------------------------------------------------------------

diff --git a/Data/ByteString/Builder/RealFloat.hs b/Data/ByteString/Builder/RealFloat.hs
@@ -0,0 +1,272 @@
+-- |
+-- Module      : Data.ByteString.Builder.RealFloat
+-- Copyright   : (c) Lawrence Wu 2021
+-- License     : BSD-style
+-- Maintainer  : lawrencejwu@gmail.com
+--
+-- Floating point formatting for @Bytestring.Builder@
+--
+-- This module primarily exposes `floatDec` and `doubleDec` which do the
+-- equivalent of converting through @'Data.ByteString.Builder.string7' . 'show'@.
+--
+-- It also exposes `formatFloat` and `formatDouble` with a similar API as
+-- `GHC.Float.formatRealFloat`.
+--
+-- NB: The float-to-string conversions exposed by this module match `show`'s
+-- output (specifically with respect to default rounding and length). In
+-- particular, there are boundary cases where the closest and \'shortest\'
+-- string representations are not used.  Mentions of \'shortest\' in the docs
+-- below are with this caveat.
+--
+-- For example, for fidelity, we match `show` on the output below.
+--
+-- >>> show (1.0e23 :: Float)
+-- "1.0e23"
+-- >>> show (1.0e23 :: Double)
+-- "9.999999999999999e22"
+-- >>> floatDec 1.0e23
+-- "1.0e23"
+-- >>> doubleDec 1.0e23
+-- "9.999999999999999e22"
+--
+-- Simplifying, we can build a shorter, lossless representation by just using
+-- @"1.0e23"@ since the floating point values that are 1 ULP away are
+--
+-- >>> showHex (castDoubleToWord64 1.0e23) []
+-- "44b52d02c7e14af6"
+-- >>> castWord64ToDouble 0x44b52d02c7e14af5
+-- 9.999999999999997e22
+-- >>> castWord64ToDouble 0x44b52d02c7e14af6
+-- 9.999999999999999e22
+-- >>> castWord64ToDouble 0x44b52d02c7e14af7
+-- 1.0000000000000001e23
+--
+-- In particular, we could use the exact boundary if it is the shortest
+-- representation and the original floating number is even. To experiment with
+-- the shorter rounding, refer to
+-- `Data.ByteString.Builder.RealFloat.Internal.acceptBounds`. This will give us
+--
+-- >>> floatDec 1.0e23
+-- "1.0e23"
+-- >>> doubleDec 1.0e23
+-- "1.0e23"
+--
+-- For more details, please refer to the
+-- <https://dl.acm.org/doi/10.1145/3192366.3192369 Ryu paper>.
+
+
+module Data.ByteString.Builder.RealFloat
+  ( floatDec
+  , doubleDec
+
+  -- * Custom formatting
+  , formatFloat
+  , formatDouble
+  , FloatFormat
+  , standard
+  , standardDefaultPrecision
+  , scientific
+  , generic
+  ) where
+
+import Data.ByteString.Builder.Internal (Builder)
+import qualified Data.ByteString.Builder.RealFloat.Internal as R
+import qualified Data.ByteString.Builder.RealFloat.F2S as RF
+import qualified Data.ByteString.Builder.RealFloat.D2S as RD
+import qualified Data.ByteString.Builder.Prim as BP
+import GHC.Float (roundTo)
+import GHC.Word (Word64)
+import GHC.Show (intToDigit)
+
+-- | Returns a rendered Float. Matches `show` in displaying in standard or
+-- scientific notation
+--
+-- @
+-- floatDec = 'formatFloat' 'generic'
+-- @
+{-# INLINABLE floatDec #-}
+floatDec :: Float -> Builder
+floatDec = formatFloat generic
+
+-- | Returns a rendered Double. Matches `show` in displaying in standard or
+-- scientific notation
+--
+-- @
+-- doubleDec = 'formatDouble' 'generic'
+-- @
+{-# INLINABLE doubleDec #-}
+doubleDec :: Double -> Builder
+doubleDec = formatDouble generic
+
+-- | Format type for use with `formatFloat` and `formatDouble`.
+data FloatFormat = MkFloatFormat FormatMode (Maybe Int)
+
+-- | Standard notation with `n` decimal places
+standard :: Int -> FloatFormat
+standard n = MkFloatFormat FStandard (Just n)
+
+-- | Standard notation with the \'default precision\' (decimal places matching `show`)
+standardDefaultPrecision :: FloatFormat
+standardDefaultPrecision = MkFloatFormat FStandard Nothing
+
+-- | Scientific notation with \'default precision\' (decimal places matching `show`)
+scientific :: FloatFormat
+scientific = MkFloatFormat FScientific Nothing
+
+-- | Standard or scientific notation depending on the exponent. Matches `show`
+generic :: FloatFormat
+generic = MkFloatFormat FGeneric Nothing
+
+-- | ByteString float-to-string format
+data FormatMode
+  = FScientific     -- ^ scientific notation
+  | FStandard       -- ^ standard notation with `Maybe Int` digits after the decimal
+  | FGeneric        -- ^ dispatches to scientific or standard notation based on the exponent
+  deriving Show
+
+-- TODO: support precision argument for FGeneric and FScientific
+-- | Returns a rendered Float. Returns the \'shortest\' representation in
+-- scientific notation and takes an optional precision argument in standard
+-- notation. Also see `floatDec`.
+--
+-- With standard notation, the precision argument is used to truncate (or
+-- extend with 0s) the \'shortest\' rendered Float. The \'default precision\' does
+-- no such modifications and will return as many decimal places as the
+-- representation demands.
+--
+-- e.g
+--
+-- >>> formatFloat (standard 1) 1.2345e-2
+-- "0.0"
+-- >>> formatFloat (standard 10) 1.2345e-2
+-- "0.0123450000"
+-- >>> formatFloat standardDefaultPrecision 1.2345e-2
+-- "0.01234"
+-- >>> formatFloat scientific 12.345
+-- "1.2345e1"
+-- >>> formatFloat generic 12.345
+-- "12.345"
+{-# INLINABLE formatFloat #-}
+formatFloat :: FloatFormat -> Float -> Builder
+formatFloat (MkFloatFormat fmt prec) = \f ->
+  let (RF.FloatingDecimal m e) = RF.f2Intermediate f
+      e' = R.int32ToInt e + R.decimalLength9 m in
+  case fmt of
+    FGeneric ->
+      case specialStr f of
+        Just b -> b
+        Nothing ->
+          if e' >= 0 && e' <= 7
+             then sign f `mappend` showStandard (R.word32ToWord64 m) e' prec
+             else BP.primBounded (R.toCharsScientific (f < 0) m e) ()
+    FScientific -> RF.f2s f
+    FStandard ->
+      case specialStr f of
+        Just b -> b
+        Nothing -> sign f `mappend` showStandard (R.word32ToWord64 m) e' prec
+
+-- TODO: support precision argument for FGeneric and FScientific
+-- | Returns a rendered Double. Returns the \'shortest\' representation in
+-- scientific notation and takes an optional precision argument in standard
+-- notation. Also see `doubleDec`.
+--
+-- With standard notation, the precision argument is used to truncate (or
+-- extend with 0s) the \'shortest\' rendered Float. The \'default precision\'
+-- does no such modifications and will return as many decimal places as the
+-- representation demands.
+--
+-- e.g
+--
+-- >>> formatDouble (standard 1) 1.2345e-2
+-- "0.0"
+-- >>> formatDouble (standard 10) 1.2345e-2
+-- "0.0123450000"
+-- >>> formatDouble standardDefaultPrecision 1.2345e-2
+-- "0.01234"
+-- >>> formatDouble scientific 12.345
+-- "1.2345e1"
+-- >>> formatDouble generic 12.345
+-- "12.345"
+{-# INLINABLE formatDouble #-}
+formatDouble :: FloatFormat -> Double -> Builder
+formatDouble (MkFloatFormat fmt prec) = \f ->
+  let (RD.FloatingDecimal m e) = RD.d2Intermediate f
+      e' = R.int32ToInt e + R.decimalLength17 m in
+  case fmt of
+    FGeneric ->
+      case specialStr f of
+        Just b -> b
+        Nothing ->
+          if e' >= 0 && e' <= 7
+             then sign f `mappend` showStandard m e' prec
+             else BP.primBounded (R.toCharsScientific (f < 0) m e) ()
+    FScientific -> RD.d2s f
+    FStandard ->
+      case specialStr f of
+        Just b -> b
+        Nothing -> sign f `mappend` showStandard m e' prec
+
+-- | Char7 encode a 'Char'.
+{-# INLINE char7 #-}
+char7 :: Char -> Builder
+char7 = BP.primFixed BP.char7
+
+-- | Char7 encode a 'String'.
+{-# INLINE string7 #-}
+string7 :: String -> Builder
+string7 = BP.primMapListFixed BP.char7
+
+-- | Encodes a `-` if input is negative
+sign :: RealFloat a => a -> Builder
+sign f = if f < 0 then char7 '-' else mempty
+
+-- | Special rendering for Nan, Infinity, and 0. See
+-- RealFloat.Internal.NonNumbersAndZero
+specialStr :: RealFloat a => a -> Maybe Builder
+specialStr f
+  | isNaN f          = Just $ string7 "NaN"
+  | isInfinite f     = Just $ sign f `mappend` string7 "Infinity"
+  | isNegativeZero f = Just $ string7 "-0.0"
+  | f == 0           = Just $ string7 "0.0"
+  | otherwise        = Nothing
+
+-- | Returns a list of decimal digits in a Word64
+digits :: Word64 -> [Int]
+digits w = go [] w
+  where go ds 0 = ds
+        go ds c = let (q, r) = R.dquotRem10 c
+                   in go ((R.word64ToInt r) : ds) q
+
+-- | Show a floating point value in standard notation. Based on GHC.Float.showFloat
+showStandard :: Word64 -> Int -> Maybe Int -> Builder
+showStandard m e prec =
+  case prec of
+    Nothing
+      | e <= 0 -> char7 '0'
+               `mappend` char7 '.'
+               `mappend` string7 (replicate (-e) '0')
+               `mappend` mconcat (digitsToBuilder ds)
+      | otherwise ->
+          let f 0 s     rs = mk0 (reverse s) `mappend` char7 '.' `mappend` mk0 rs
+              f n s     [] = f (n-1) (char7 '0':s) []
+              f n s (r:rs) = f (n-1) (r:s) rs
+           in f e [] (digitsToBuilder ds)
+    Just p
+      | e >= 0 ->
+          let (ei, is') = roundTo 10 (p' + e) ds
+              (ls, rs) = splitAt (e + ei) (digitsToBuilder is')
+           in mk0 ls `mappend` mkDot rs
+      | otherwise ->
+          let (ei, is') = roundTo 10 p' (replicate (-e) 0 ++ ds)
+              -- ds' should always be non-empty but use redundant pattern
+              -- matching to silence warning
+              ds' = if ei > 0 then is' else 0:is'
+              (ls, rs) = splitAt 1 $ digitsToBuilder ds'
+           in mk0 ls `mappend` mkDot rs
+          where p' = max p 0
+  where
+    mk0 ls = case ls of [] -> char7 '0'; _ -> mconcat ls
+    mkDot rs = if null rs then mempty else char7 '.' `mappend` mconcat rs
+    ds = digits m
+    digitsToBuilder = fmap (char7 . intToDigit)
+