From 8afaa68c7b856114c01330f9a66d74c1372fd641 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Thu, 20 Oct 2022 11:38:34 +0100 Subject: [PATCH 01/11] partial work supporting GHC 9.4 --- TODO.md | 1 + flatparse.cabal | 20 +- src/FlatParse/Basic.hs | 1230 +++------------ src/FlatParse/Basic/Chars.hs | 171 +++ src/FlatParse/Basic/Integers.hs | 340 ++++ src/FlatParse/Basic/Internal.hs | 197 +++ src/FlatParse/Basic/Parser.hs | 135 ++ src/FlatParse/Basic/Position.hs | 71 + src/FlatParse/Common/Assorted.hs | 135 ++ src/FlatParse/Common/Numbers.hs | 70 + src/FlatParse/Common/Position.hs | 42 + src/FlatParse/Common/Trie.hs | 85 + src/FlatParse/Examples/BasicLambda/Lexer.hs | 10 +- src/FlatParse/Examples/BasicLambda/Parser.hs | 2 +- src/FlatParse/Internal.hs | 277 ---- src/FlatParse/Internal/UnboxedNumerics.hs | 115 -- src/FlatParse/Stateful.hs | 1453 ------------------ test/Test.hs | 272 ++-- 18 files changed, 1636 insertions(+), 2990 deletions(-) create mode 100644 TODO.md create mode 100644 src/FlatParse/Basic/Chars.hs create mode 100644 src/FlatParse/Basic/Integers.hs create mode 100644 src/FlatParse/Basic/Internal.hs create mode 100644 src/FlatParse/Basic/Parser.hs create mode 100644 src/FlatParse/Basic/Position.hs create mode 100644 src/FlatParse/Common/Assorted.hs create mode 100644 src/FlatParse/Common/Numbers.hs create mode 100644 src/FlatParse/Common/Position.hs create mode 100644 src/FlatParse/Common/Trie.hs delete mode 100644 src/FlatParse/Internal.hs delete mode 100644 src/FlatParse/Internal/UnboxedNumerics.hs delete mode 100644 src/FlatParse/Stateful.hs diff --git a/TODO.md b/TODO.md new file mode 100644 index 0000000..4501b18 --- /dev/null +++ b/TODO.md @@ -0,0 +1 @@ + * add unsafeTakeBs# which doesn't assert >=0. can cause hangs so bad diff --git a/flatparse.cabal b/flatparse.cabal index b7f0032..4ea5335 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -1,6 +1,6 @@ cabal-version: 1.12 --- This file has been generated from package.yaml by hpack version 0.34.4. +-- This file has been generated from package.yaml by hpack version 0.35.0. -- -- see: https://github.com/sol/hpack @@ -44,11 +44,17 @@ flag llvm library exposed-modules: FlatParse.Basic + FlatParse.Basic.Chars + FlatParse.Basic.Integers + FlatParse.Basic.Internal + FlatParse.Basic.Parser + FlatParse.Basic.Position + FlatParse.Common.Assorted + FlatParse.Common.Numbers + FlatParse.Common.Position + FlatParse.Common.Trie FlatParse.Examples.BasicLambda.Lexer FlatParse.Examples.BasicLambda.Parser - FlatParse.Internal - FlatParse.Internal.UnboxedNumerics - FlatParse.Stateful other-modules: Paths_flatparse hs-source-dirs: @@ -71,11 +77,11 @@ library , containers , integer-gmp , template-haskell + default-language: Haskell2010 if flag(dump) ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file if flag(llvm) ghc-options: -fllvm - default-language: Haskell2010 test-suite spec type: exitcode-stdio-1.0 @@ -105,11 +111,11 @@ test-suite spec , flatparse , hspec , quickcheck-instances + default-language: Haskell2010 if flag(dump) ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file if flag(llvm) ghc-options: -fllvm - default-language: Haskell2010 benchmark bench type: exitcode-stdio-1.0 @@ -146,8 +152,8 @@ benchmark bench , megaparsec , parsec , primitive + default-language: Haskell2010 if flag(dump) ghc-options: -ddump-simpl -ddump-stg-final -ddump-cmm -dsuppress-all -dno-suppress-type-signatures -ddump-to-file if flag(llvm) ghc-options: -fllvm - default-language: Haskell2010 diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index cda9867..14858ce 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -24,12 +24,12 @@ module FlatParse.Basic ( -- * Errors and failures , failed - , Base.empty + , Control.Applicative.empty , err , lookahead , fails , try - , optional + , Control.Applicative.optional , optional_ , withOption , cut @@ -37,72 +37,46 @@ module FlatParse.Basic ( -- * Basic lexing and parsing , eof - , takeBs - , takeRestBs + , take + , takeRest , skip - , char - , byte - , bytes - , byteString - , string , switch , switchWithPost , rawSwitchWithPost + + , getCharOf + , getBytesOf + , getByteStringOf + , getStringOf + + , getChar + , getChar_ + , getCharASCII + , getCharASCII_ + , getAsciiDecimalInt + , getAsciiDecimalInteger + , getAsciiHexInt + , getCString + + , Common.isDigit + , Common.isGreekLetter + , Common.isLatinLetter + , satisfy , satisfy_ , satisfyASCII , satisfyASCII_ , fusedSatisfy , fusedSatisfy_ - , anyWord8 - , anyWord8_ - , anyWord16 - , anyWord16_ - , anyWord32 - , anyWord32_ - , anyWord64 - , anyWord64_ - , anyWord - , anyWord_ - , anyInt8 - , anyInt16 - , anyInt32 - , anyInt64 - , anyInt - , anyChar - , anyChar_ - , anyCharASCII - , anyCharASCII_ - , FlatParse.Internal.isDigit - , FlatParse.Internal.isGreekLetter - , FlatParse.Internal.isLatinLetter - , FlatParse.Basic.readInt - , FlatParse.Basic.readIntHex - , FlatParse.Basic.readInteger - , anyCString - - -- ** Explicit-endianness machine integers - , anyWord16le - , anyWord16be - , anyWord32le - , anyWord32be - , anyWord64le - , anyWord64be - , anyInt16le - , anyInt16be - , anyInt32le - , anyInt32be - , anyInt64le - , anyInt64be -- * Combinators , (<|>) , branch , chainl , chainr - , many + , Control.Applicative.many , many_ - , some + , Control.Applicative.some , some_ , notFollowedBy , isolate @@ -130,8 +104,8 @@ module FlatParse.Basic ( -- * Getting the rest of the input as a 'String' , takeLine , traceLine - , takeRest - , traceRest + , takeRestString + , traceRestString -- * `String` conversions , packUTF8 @@ -144,15 +118,6 @@ module FlatParse.Basic ( , takeBs# , atSkip# - -- *** Machine integer continuation parsers - , withAnyWord8# - , withAnyWord16# - , withAnyWord32# - , withAnyWord64# - , withAnyInt8# - , withAnyInt16# - , withAnyInt32# - , withAnyInt64# -- ** Location & address primitives , setBack# @@ -162,27 +127,24 @@ module FlatParse.Basic ( , atAddr# -- ** Unsafe - , anyCStringUnsafe - , scan8# - , scan16# - , scan32# - , scan64# - , scanAny8# - , scanBytes# + , getCStringUnsafe + + , module FlatParse.Basic.Integers ) where -import qualified Control.Applicative as Base +import Prelude hiding ( take, getChar ) + +import Control.Applicative + import Control.Monad import Data.Foldable import Data.List (sortBy) import Data.Map (Map) import Data.Ord (comparing) -import Data.Word import GHC.Exts import GHC.Word -import GHC.Int -import GHC.ForeignPtr +import GHC.ForeignPtr ( ForeignPtr(..) ) import Language.Haskell.TH import System.IO.Unsafe @@ -191,76 +153,17 @@ import qualified Data.ByteString.Unsafe as B import qualified Data.ByteString.Internal as B import qualified Data.Map.Strict as M -import FlatParse.Internal -import FlatParse.Internal.UnboxedNumerics - --------------------------------------------------------------------------------- - --- | Primitive result of a parser. Possible results are given by `OK#`, `Err#` and `Fail#` --- pattern synonyms. -type Res# e a = - (# - (# a, Addr# #) - | (# #) - | (# e #) - #) - --- | Contains return value and a pointer to the rest of the input buffer. -pattern OK# :: a -> Addr# -> Res# e a -pattern OK# a s = (# (# a, s #) | | #) - --- | Constructor for errors which are by default non-recoverable. -pattern Err# :: e -> Res# e a -pattern Err# e = (# | | (# e #) #) - --- | Constructor for recoverable failure. -pattern Fail# :: Res# e a -pattern Fail# = (# | (# #) | #) -{-# complete OK#, Err#, Fail# #-} - --- | @Parser e a@ has an error type @e@ and a return type @a@. -newtype Parser e a = Parser {runParser# :: ForeignPtrContents -> Addr# -> Addr# -> Res# e a} - -instance Functor (Parser e) where - fmap f (Parser g) = Parser \fp eob s -> case g fp eob s of - OK# a s -> let !b = f a in OK# b s - x -> unsafeCoerce# x - {-# inline fmap #-} - - (<$) a' (Parser g) = Parser \fp eob s -> case g fp eob s of - OK# a s -> OK# a' s - x -> unsafeCoerce# x - {-# inline (<$) #-} +import qualified FlatParse.Common.Numbers as Common +import qualified FlatParse.Common.Assorted as Common +import FlatParse.Common.Position +import FlatParse.Common.Trie +import FlatParse.Common.Assorted ( packBytes, splitBytes, strToBytes, packUTF8 ) -instance Applicative (Parser e) where - pure a = Parser \fp eob s -> OK# a s - {-# inline pure #-} - Parser ff <*> Parser fa = Parser \fp eob s -> case ff fp eob s of - OK# f s -> case fa fp eob s of - OK# a s -> let !b = f a in OK# b s - x -> unsafeCoerce# x - x -> unsafeCoerce# x - {-# inline (<*>) #-} - Parser fa <* Parser fb = Parser \fp eob s -> case fa fp eob s of - OK# a s -> case fb fp eob s of - OK# b s -> OK# a s - x -> unsafeCoerce# x - x -> unsafeCoerce# x - {-# inline (<*) #-} - Parser fa *> Parser fb = Parser \fp eob s -> case fa fp eob s of - OK# a s -> fb fp eob s - x -> unsafeCoerce# x - {-# inline (*>) #-} - -instance Monad (Parser e) where - return = pure - {-# inline return #-} - Parser fa >>= f = Parser \fp eob s -> case fa fp eob s of - OK# a s -> runParser# (f a) fp eob s - x -> unsafeCoerce# x - {-# inline (>>=) #-} - (>>) = (*>) - {-# inline (>>) #-} +import FlatParse.Basic.Parser +import FlatParse.Basic.Integers +import FlatParse.Basic.Internal +import FlatParse.Basic.Chars +import FlatParse.Basic.Position -- | Higher-level boxed data type for parsing results. data Result e a = @@ -295,35 +198,15 @@ runParser (Parser f) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = unsafeDupablePerfor pure Fail {-# inlinable runParser #-} --- | Run a parser on a `String` input. Reminder: @OverloadedStrings@ for `B.ByteString` does not --- yield a valid UTF-8 encoding! For non-ASCII `B.ByteString` literal input, use `runParserS` or --- `packUTF8` for testing. -runParserS :: Parser e a -> String -> Result e a -runParserS pa s = runParser pa (packUTF8 s) - -------------------------------------------------------------------------------- --- | The failing parser. By default, parser choice `(<|>)` arbitrarily backtracks --- on parser failure. -failed :: Parser e a -failed = Parser \fp eob s -> Fail# -{-# inline failed #-} - -- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack -- on parser error. Use `try` to convert an error to a recoverable failure. err :: e -> Parser e a err e = Parser \fp eob s -> Err# e {-# inline err #-} --- | Save the parsing state, then run a parser, then restore the state. -lookahead :: Parser e a -> Parser e a -lookahead (Parser f) = Parser \fp eob s -> - case f fp eob s of - OK# a _ -> OK# a s - x -> x -{-# inline lookahead #-} - -- | Convert a parsing failure to a success. fails :: Parser e a -> Parser e () fails (Parser f) = Parser \fp eob s -> @@ -340,19 +223,14 @@ try (Parser f) = Parser \fp eob s -> case f fp eob s of x -> x {-# inline try #-} --- | Convert a parsing failure to a `Maybe`. If possible, use `withOption` instead. -optional :: Parser e a -> Parser e (Maybe a) -optional p = (Just <$> p) <|> pure Nothing -{-# inline optional #-} - -- | Convert a parsing failure to a `()`. optional_ :: Parser e a -> Parser e () optional_ p = (() <$ p) <|> pure () {-# inline optional_ #-} --- | CPS'd version of `optional`. This is usually more efficient, since it gets rid of the --- extra `Maybe` allocation. -withOption :: Parser e a -> (a -> Parser e b) -> Parser e b -> Parser e b +-- | CPS'd version of `optional`. This is usually more efficient, since it gets +-- rid of the extra `Maybe` allocation. +withOption :: Parser e a -> (a -> Parser e r) -> Parser e r -> Parser e r withOption (Parser f) just (Parser nothing) = Parser \fp eob s -> case f fp eob s of OK# a s -> runParser# (just a) fp eob s Fail# -> nothing fp eob s @@ -389,16 +267,16 @@ eof = Parser \fp eob s -> case eqAddr# eob s of -- | Read the given number of bytes as a 'ByteString'. -- -- Throws a runtime error if given a negative integer. -takeBs :: Int -> Parser e B.ByteString -takeBs (I# n#) = takeBs# n# -{-# inline takeBs #-} +take :: Int -> Parser e B.ByteString +take (I# n#) = takeBs# n# +{-# inline take #-} -- | Consume the rest of the input. May return the empty bytestring. -takeRestBs :: Parser e B.ByteString -takeRestBs = Parser \fp eob s -> +takeRest :: Parser e B.ByteString +takeRest = Parser \fp eob s -> let n# = minusAddr# eob s in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) eob -{-# inline takeRestBs #-} +{-# inline takeRest #-} -- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. -- @@ -409,52 +287,13 @@ skip (I# os#) = atSkip# os# (pure ()) -- | Parse a UTF-8 character literal. This is a template function, you can use it as -- @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@. -char :: Char -> Q Exp -char c = string [c] - --- | Read a `Word8`. -byte :: Word8 -> Parser e () -byte w = ensureBytes# 1 >> scan8# w -{-# inline byte #-} - --- | Read a sequence of bytes. This is a template function, you can use it as @$(bytes [3, 4, 5])@, --- for example, and the splice has type @Parser e ()@. For a non-TH variant see 'byteString'. -bytes :: [Word] -> Q Exp -bytes bytes = do - let !len = length bytes - [| ensureBytes# len >> $(scanBytes# bytes) |] - --- | Parse a given `B.ByteString`. If the bytestring is statically known, consider using 'bytes' instead. -byteString :: B.ByteString -> Parser e () -byteString (B.PS (ForeignPtr bs fcontent) _ (I# len)) = - - let go64 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go64 bs bsend s w = - let bs' = plusAddr# bs 8# in - case gtAddr# bs' bsend of - 1# -> go8 bs bsend s w - _ -> case eqWord# (indexWord64OffAddr# bs 0#) (indexWord64OffAddr# s 0#) of - 1# -> go64 bs' bsend (plusAddr# s 8#) w - _ -> (# Fail#, w #) - - go8 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go8 bs bsend s w = case ltAddr# bs bsend of - 1# -> case eqWord8'# (indexWord8OffAddr# bs 0#) (indexWord8OffAddr# s 0#) of - 1# -> go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) w - _ -> (# Fail#, w #) - _ -> (# OK# () s, w #) - - in Parser \fp eob s -> case len <=# minusAddr# eob s of - 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s w of - (# res, w #) -> case touch# fcontent w of - w -> res - _ -> Fail# -{-# inline byteString #-} +getCharOf :: Char -> Q Exp +getCharOf c = getStringOf [c] -- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@, -- for example, and the splice has type @Parser e ()@. -string :: String -> Q Exp -string str = bytes (strToBytes str) +getStringOf :: String -> Q Exp +getStringOf str = getBytesOf (strToBytes str) {-| This is a template function which makes it possible to branch on a collection of string literals in @@ -523,233 +362,32 @@ rawSwitchWithPost postAction cases fallback = do !fallback <- sequence fallback genTrie $! genSwitchTrie' postAction cases fallback --- | Parse a UTF-8 `Char` for which a predicate holds. -satisfy :: (Char -> Bool) -> Parser e Char -satisfy f = Parser \fp eob s -> case runParser# anyChar fp eob s of - OK# c s | f c -> OK# c s - _ -> Fail# -{-# inline satisfy #-} - --- | Skip a UTF-8 `Char` for which a predicate holds. -satisfy_ :: (Char -> Bool) -> Parser e () -satisfy_ f = Parser \fp eob s -> case runParser# anyChar fp eob s of - OK# c s | f c -> OK# () s - _ -> Fail# -{-# inline satisfy_ #-} - --- | Parse an ASCII `Char` for which a predicate holds. Assumption: the predicate must only return --- `True` for ASCII-range characters. Otherwise this function might read a 128-255 range byte, --- thereby breaking UTF-8 decoding. -satisfyASCII :: (Char -> Bool) -> Parser e Char -satisfyASCII f = Parser \fp eob s -> case eqAddr# eob s of - 1# -> Fail# - _ -> case derefChar8# s of - c1 | f (C# c1) -> OK# (C# c1) (plusAddr# s 1#) - | otherwise -> Fail# -{-# inline satisfyASCII #-} - --- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate --- must only return `True` for ASCII-range characters. -satisfyASCII_ :: (Char -> Bool) -> Parser e () -satisfyASCII_ f = Parser \fp eob s -> case eqAddr# eob s of - 1# -> Fail# - _ -> case derefChar8# s of - c1 | f (C# c1) -> OK# () (plusAddr# s 1#) - | otherwise -> Fail# -{-# inline satisfyASCII_ #-} - --- | This is a variant of `satisfy` which allows more optimization. We can pick four testing --- functions for the four cases for the possible number of bytes in the UTF-8 character. So in --- @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with --- @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups. --- --- For example, if we want to accept any letter, the naive solution would be to use --- `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We --- can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the --- `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of --- all cases without accessing the character table. -fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e Char -fusedSatisfy f1 f2 f3 f4 = Parser \fp eob buf -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# | f1 (C# c1) -> OK# (C# c1) (plusAddr# buf 1#) - | otherwise -> Fail# - _ -> case eqAddr# eob (plusAddr# buf 1#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 1# of - c2 -> case c1 `leChar#` '\xDF'# of - 1# -> - let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c2 -# 0x80#))) - in case f2 resc of - True -> OK# resc (plusAddr# buf 2#) - _ -> Fail# - _ -> case eqAddr# eob (plusAddr# buf 2#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 2# of - c3 -> case c1 `leChar#` '\xEF'# of - 1# -> - let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c3 -# 0x80#))) - in case f3 resc of - True -> OK# resc (plusAddr# buf 3#) - _ -> Fail# - _ -> case eqAddr# eob (plusAddr# buf 3#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 3# of - c4 -> - let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c4 -# 0x80#))) - in case f4 resc of - True -> OK# resc (plusAddr# buf 4#) - _ -> Fail# -{-# inline fusedSatisfy #-} - --- | Skipping variant of `fusedSatisfy`. -fusedSatisfy_ :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e () -fusedSatisfy_ f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4 -{-# inline fusedSatisfy_ #-} - --- | Parse any UTF-8-encoded `Char`. -anyChar :: Parser e Char -anyChar = Parser \fp eob buf -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# -> OK# (C# c1) (plusAddr# buf 1#) - _ -> case eqAddr# eob (plusAddr# buf 1#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 1# of - c2 -> case c1 `leChar#` '\xDF'# of - 1# -> - let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c2 -# 0x80#) - in OK# (C# (chr# resc)) (plusAddr# buf 2#) - _ -> case eqAddr# eob (plusAddr# buf 2#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 2# of - c3 -> case c1 `leChar#` '\xEF'# of - 1# -> - let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c3 -# 0x80#) - in OK# (C# (chr# resc)) (plusAddr# buf 3#) - _ -> case eqAddr# eob (plusAddr# buf 3#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 3# of - c4 -> - let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c4 -# 0x80#) - in OK# (C# (chr# resc)) (plusAddr# buf 4#) -{-# inline anyChar #-} - --- | Skip any UTF-8-encoded `Char`. -anyChar_ :: Parser e () -anyChar_ = Parser \fp eob buf -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# -> OK# () (plusAddr# buf 1#) - _ -> - let buf' = - case c1 `leChar#` '\xDF'# of - 1# -> plusAddr# buf 2# - _ -> case c1 `leChar#` '\xEF'# of - 1# -> plusAddr# buf 3# - _ -> plusAddr# buf 4# - in case leAddr# buf' eob of - 1# -> OK# () buf' - _ -> Fail# -{-# inline anyChar_ #-} - - --- | Parse any `Char` in the ASCII range, fail if the next input character is not in the range. --- This is more efficient than `anyChar` if we are only working with ASCII. -anyCharASCII :: Parser e Char -anyCharASCII = Parser \fp eob buf -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# -> OK# (C# c1) (plusAddr# buf 1#) - _ -> Fail# -{-# inline anyCharASCII #-} - --- | Skip any `Char` in the ASCII range. More efficient than `anyChar_` if we're working only with --- ASCII. -anyCharASCII_ :: Parser e () -anyCharASCII_ = () <$ anyCharASCII -{-# inline anyCharASCII_ #-} - -- | Read a non-negative `Int` from the input, as a non-empty digit sequence. -- The `Int` may overflow in the result. -readInt :: Parser e Int -readInt = Parser \fp eob s -> case FlatParse.Internal.readInt eob s of +getAsciiDecimalInt :: Parser e Int +getAsciiDecimalInt = Parser \fp eob s -> case Common.readInt eob s of (# (##) | #) -> Fail# (# | (# n, s' #) #) -> OK# (I# n) s' -{-# inline readInt #-} +{-# inline getAsciiDecimalInt #-} -- | Read an `Int` from the input, as a non-empty case-insensitive ASCII -- hexadecimal digit sequence. The `Int` may overflow in the result. -readIntHex :: Parser e Int -readIntHex = Parser \fp eob s -> case FlatParse.Internal.readIntHex eob s of +getAsciiHexInt :: Parser e Int +getAsciiHexInt = Parser \fp eob s -> case Common.readIntHex eob s of (# (##) | #) -> Fail# (# | (# n, s' #) #) -> OK# (I# n) s' -{-# inline readIntHex #-} +{-# inline getAsciiHexInt #-} -- | Read a non-negative `Integer` from the input, as a non-empty digit -- sequence. -readInteger :: Parser e Integer -readInteger = Parser \fp eob s -> case FlatParse.Internal.readInteger fp eob s of +getAsciiDecimalInteger :: Parser e Integer +getAsciiDecimalInteger = Parser \fp eob s -> case Common.readInteger fp eob s of (# (##) | #) -> Fail# (# | (# i, s' #) #) -> OK# i s' -{-# inline readInteger #-} +{-# inline getAsciiDecimalInteger #-} -------------------------------------------------------------------------------- --- | Choose between two parsers. If the first parser fails, try the second one, but if the first one --- throws an error, propagate the error. -infixr 6 <|> -(<|>) :: Parser e a -> Parser e a -> Parser e a -(<|>) (Parser f) (Parser g) = Parser \fp eob s -> - case f fp eob s of - Fail# -> g fp eob s - x -> x -{-# inline[1] (<|>) #-} - -instance Base.Alternative (Parser e) where - empty = failed - {-# inline empty #-} - (<|>) = (<|>) - {-# inline (Base.<|>) #-} - -instance MonadPlus (Parser e) where - mzero = failed - {-# inline mzero #-} - mplus = (<|>) - {-# inline mplus #-} - -{-# RULES - -"flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r) - -#-} - --- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. --- This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not --- backtrack from the true/false cases. -branch :: Parser e a -> Parser e b -> Parser e b -> Parser e b -branch pa pt pf = Parser \fp eob s -> case runParser# pa fp eob s of - OK# _ s -> runParser# pt fp eob s - Fail# -> runParser# pf fp eob s - Err# e -> Err# e -{-# inline branch #-} - -- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, -- and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not -- the usual `chainl` function from the parsec libraries! @@ -771,19 +409,6 @@ chainr f (Parser elem) (Parser end) = Parser go where Err# e -> Err# e {-# inline chainr #-} --- | Run a parser zero or more times, collect the results in a list. Note: for optimal performance, --- try to avoid this. Often it is possible to get rid of the intermediate list by using a --- combinator or a custom parser. -many :: Parser e a -> Parser e [a] -many (Parser f) = Parser go where - go fp eob s = case f fp eob s of - OK# a s -> case go fp eob s of - OK# as s -> OK# (a:as) s - x -> x - Fail# -> OK# [] s - Err# e -> Err# e -{-# inline many #-} - -- | Skip a parser zero or more times. many_ :: Parser e a -> Parser e () many_ (Parser f) = Parser go where @@ -793,13 +418,6 @@ many_ (Parser f) = Parser go where Err# e -> Err# e {-# inline many_ #-} --- | Run a parser one or more times, collect the results in a list. Note: for optimal performance, --- try to avoid this. Often it is possible to get rid of the intermediate list by using a --- combinator or a custom parser. -some :: Parser e a -> Parser e [a] -some p = (:) <$> p <*> many p -{-# inline some #-} - -- | Skip a parser one or more times. some_ :: Parser e a -> Parser e () some_ pa = pa >> many_ pa @@ -810,260 +428,44 @@ notFollowedBy :: Parser e a -> Parser e b -> Parser e a notFollowedBy p1 p2 = p1 <* fails p2 {-# inline notFollowedBy #-} --- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All --- isolated bytes must be consumed. --- --- Throws a runtime error if given a negative integer. -isolate :: Int -> Parser e a -> Parser e a -isolate (I# n#) p = Parser \fp eob s -> - let s' = plusAddr# s n# - in case n# <=# minusAddr# eob s of - 1# -> case n# >=# 0# of - 1# -> case runParser# p fp s' s of - OK# a s'' -> case eqAddr# s' s'' of - 1# -> OK# a s'' - _ -> Fail# -- isolated segment wasn't fully consumed - Fail# -> Fail# - Err# e -> Err# e - _ -> error "FlatParse.Basic.isolate: negative integer" - _ -> Fail# -- you tried to isolate more than we have left -{-# inline isolate #-} - - --------------------------------------------------------------------------------- - --- | Get the current position in the input. -getPos :: Parser e Pos -getPos = Parser \fp eob s -> OK# (addrToPos# eob s) s -{-# inline getPos #-} - --- | Set the input position. Warning: this can result in crashes if the position points outside the --- current buffer. It is always safe to `setPos` values which came from `getPos` with the current --- input. -setPos :: Pos -> Parser e () -setPos s = Parser \fp eob _ -> OK# () (posToAddr# eob s) -{-# inline setPos #-} - --- | The end of the input. -endPos :: Pos -endPos = Pos 0 -{-# inline endPos #-} - --- | Return the consumed span of a parser. -spanOf :: Parser e a -> Parser e Span -spanOf (Parser f) = Parser \fp eob s -> case f fp eob s of - OK# a s' -> OK# (Span (addrToPos# eob s) (addrToPos# eob s')) s' - x -> unsafeCoerce# x -{-# inline spanOf #-} - --- | Bind the result together with the span of the result. CPS'd version of `spanOf` --- for better unboxing. -withSpan :: Parser e a -> (a -> Span -> Parser e b) -> Parser e b -withSpan (Parser f) g = Parser \fp eob s -> case f fp eob s of - OK# a s' -> runParser# (g a (Span (addrToPos# eob s) (addrToPos# eob s'))) fp eob s' - x -> unsafeCoerce# x -{-# inline withSpan #-} - --- | Return the `B.ByteString` consumed by a parser. Note: it's more efficient to use `spanOf` and --- `withSpan` instead. -byteStringOf :: Parser e a -> Parser e B.ByteString -byteStringOf (Parser f) = Parser \fp eob s -> case f fp eob s of - OK# a s' -> OK# (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) s' - x -> unsafeCoerce# x -{-# inline byteStringOf #-} - --- | CPS'd version of `byteStringOf`. Can be more efficient, because the result is more eagerly unboxed --- by GHC. It's more efficient to use `spanOf` or `withSpan` instead. -withByteString :: Parser e a -> (a -> B.ByteString -> Parser e b) -> Parser e b -withByteString (Parser f) g = Parser \fp eob s -> case f fp eob s of - OK# a s' -> runParser# (g a (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s)))) fp eob s' - x -> unsafeCoerce# x -{-# inline withByteString #-} - --- | Run a parser in a given input span. The input position and the `Int` state is restored after --- the parser is finished, so `inSpan` does not consume input and has no side effect. Warning: --- this operation may crash if the given span points outside the current parsing buffer. It's --- always safe to use `inSpan` if the span comes from a previous `withSpan` or `spanOf` call on --- the current input. -inSpan :: Span -> Parser e a -> Parser e a -inSpan (Span s eob) (Parser f) = Parser \fp eob' s' -> - case f fp (posToAddr# eob' eob) (posToAddr# eob' s) of - OK# a _ -> OK# a s' - x -> unsafeCoerce# x -{-# inline inSpan #-} - --------------------------------------------------------------------------------- - --- | Check whether a `Pos` points into a `B.ByteString`. -validPos :: B.ByteString -> Pos -> Bool -validPos str pos = - let go = do - start <- getPos - pure (start <= pos && pos <= endPos) - in case runParser go str of - OK b _ -> b - _ -> error "impossible" -{-# inline validPos #-} - --- | Compute corresponding line and column numbers for each `Pos` in a list. Throw an error --- on invalid positions. Note: computing lines and columns may traverse the `B.ByteString`, --- but it traverses it only once regardless of the length of the position list. -posLineCols :: B.ByteString -> [Pos] -> [(Int, Int)] -posLineCols str poss = - let go !line !col [] = pure [] - go line col ((i, pos):poss) = do - p <- getPos - if pos == p then - ((i, (line, col)):) <$> go line col poss - else do - c <- anyChar - if '\n' == c then - go (line + 1) 0 ((i, pos):poss) - else - go line (col + 1) ((i, pos):poss) - - sorted :: [(Int, Pos)] - sorted = sortBy (comparing snd) (zip [0..] poss) - - in case runParser (go 0 0 sorted) str of - OK res _ -> snd <$> sortBy (comparing fst) res - _ -> error "invalid position" - --- | Create a `B.ByteString` from a `Span`. The result is invalid if the `Span` points --- outside the current buffer, or if the `Span` start is greater than the end position. -unsafeSpanToByteString :: Span -> Parser e B.ByteString -unsafeSpanToByteString (Span l r) = - lookahead (setPos l >> byteStringOf (setPos r)) -{-# inline unsafeSpanToByteString #-} - --- | Create a `Pos` from a line and column number. Throws an error on out-of-bounds --- line and column numbers. -mkPos :: B.ByteString -> (Int, Int) -> Pos -mkPos str (line', col') = - let go line col | line == line' && col == col' = getPos - go line col = (do - c <- anyChar - if c == '\n' then go (line + 1) 0 - else go line (col + 1)) <|> error "mkPos: invalid position" - in case runParser (go 0 0) str of - OK res _ -> res - _ -> error "impossible" - --- | Break an UTF-8-coded `B.ByteString` to lines. Throws an error on invalid input. --- This is mostly useful for grabbing specific source lines for displaying error --- messages. -lines :: B.ByteString -> [String] -lines str = - let go = ([] <$ eof) <|> ((:) <$> takeLine <*> go) - in case runParser go str of - OK ls _ -> ls - _ -> error "linesUTF8: invalid input" - -------------------------------------------------------------------------------- --- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, --- throws an error if the encoding is invalid. -takeLine :: Parser e String -takeLine = branch eof (pure "") do - c <- anyChar - case c of - '\n' -> pure "" - _ -> (c:) <$> takeLine - --- | Parse the rest of the current line as a `String`, but restore the parsing state. --- Assumes UTF-8 encoding. This can be used for debugging. -traceLine :: Parser e String -traceLine = lookahead takeLine - --- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. -takeRest :: Parser e String -takeRest = branch eof (pure "") do - c <- anyChar - cs <- takeRest - pure (c:cs) +-- | Parse a given `B.ByteString`. If the bytestring is statically known, +-- consider using 'bytes' instead. +getByteStringOf :: B.ByteString -> Parser e () +getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = --- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. --- This can be used for debugging. -traceRest :: Parser e String -traceRest = lookahead takeRest + let go64 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) + go64 bs bsend s w = + let bs' = plusAddr# bs 8# in + case gtAddr# bs' bsend of + 1# -> go8 bs bsend s w + _ -> if W64# (indexWord64OffAddr# bs 0#) == W64# (indexWord64OffAddr# s 0#) + then go64 bs' bsend (plusAddr# s 8#) w + else (# Fail#, w #) --------------------------------------------------------------------------------- + go8 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) + go8 bs bsend s w = + case ltAddr# bs bsend of + 1# -> if W8# (indexWord8OffAddr# bs 0#) == W8# (indexWord8OffAddr# s 0#) + then go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) w + else (# Fail#, w #) + _ -> (# OK# () s, w #) --- | Convert an UTF-8-coded `B.ByteString` to a `String`. -unpackUTF8 :: B.ByteString -> String -unpackUTF8 str = case runParser takeRest str of - OK a _ -> a - _ -> error "unpackUTF8: invalid encoding" + in Parser \fp eob s -> case len <=# minusAddr# eob s of + 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s w of + (# res, w #) -> case touch# fcontent w of + w -> res + _ -> Fail# +{-# inline getByteStringOf #-} --- | Check that the input has at least the given number of bytes. -ensureBytes# :: Int -> Parser e () -ensureBytes# (I# len) = Parser \fp eob s -> - case len <=# minusAddr# eob s of - 1# -> OK# () s - _ -> Fail# -{-# inline ensureBytes# #-} - --- | Unsafely read a concrete byte from the input. It's not checked that the input has --- enough bytes. -scan8# :: Word8 -> Parser e () -scan8# (W8# c) = Parser \fp eob s -> - case indexWord8OffAddr# s 0# of - c' -> case eqWord8'# c c' of - 1# -> OK# () (plusAddr# s 1#) - _ -> Fail# -{-# inline scan8# #-} - --- | Unsafely read two concrete bytes from the input. It's not checked that the input has --- enough bytes. -scan16# :: Word16 -> Parser e () -scan16# (W16# c) = Parser \fp eob s -> - case indexWord16OffAddr# s 0# of - c' -> case eqWord16'# c c' of - 1# -> OK# () (plusAddr# s 2#) - _ -> Fail# -{-# inline scan16# #-} - --- | Unsafely read four concrete bytes from the input. It's not checked that the input has --- enough bytes. -scan32# :: Word32 -> Parser e () -scan32# (W32# c) = Parser \fp eob s -> - case indexWord32OffAddr# s 0# of - c' -> case eqWord32'# c c' of - 1# -> OK# () (plusAddr# s 4#) - _ -> Fail# -{-# inline scan32# #-} - --- | Unsafely read eight concrete bytes from the input. It's not checked that the input has --- enough bytes. -scan64# :: Word -> Parser e () -scan64# (W# c) = Parser \fp eob s -> - case indexWord64OffAddr# s 0# of - c' -> case eqWord# c c' of - 1# -> OK# () (plusAddr# s 8#) - _ -> Fail# -{-# inline scan64# #-} - --- | Unsafely read and return a byte from the input. It's not checked that the input is non-empty. -scanAny8# :: Parser e Word8 -scanAny8# = Parser \fp eob s -> OK# (W8# (indexWord8OffAddr# s 0#)) (plusAddr# s 1#) -{-# inline scanAny8# #-} - -scanPartial64# :: Int -> Word -> Parser e () -scanPartial64# (I# len) (W# w) = Parser \fp eob s -> - case indexWordOffAddr# s 0# of - w' -> case uncheckedIShiftL# (8# -# len) 3# of - sh -> case uncheckedShiftL# w' sh of - w' -> case uncheckedShiftRL# w' sh of - w' -> case eqWord# w w' of - 1# -> OK# () (plusAddr# s len) - _ -> Fail# -{-# inline scanPartial64# #-} - --- | Decrease the current input position by the given number of bytes. -setBack# :: Int -> Parser e () -setBack# (I# i) = Parser \fp eob s -> - OK# () (plusAddr# s (negateInt# i)) -{-# inline setBack# #-} +-- | Read a sequence of bytes. This is a template function, you can use it as +-- @$(getBytesOf [3, 4, 5])@, for example, and the splice has type @Parser e +-- ()@. +getBytesOf :: [Word] -> Q Exp +getBytesOf bytes = do + let !len = length bytes + [| ensureBytes# len >> $(scanBytes# bytes) |] -- | Template function, creates a @Parser e ()@ which unsafely scans a given -- sequence of bytes. @@ -1071,24 +473,24 @@ scanBytes# :: [Word] -> Q Exp scanBytes# bytes = do let !(leading, w8s) = splitBytes bytes !scanw8s = go w8s where - go (w8:[] ) = [| scan64# w8 |] - go (w8:w8s) = [| scan64# w8 >> $(go w8s) |] + go (w8:[] ) = [| getWord64OfUnsafe w8 |] + go (w8:w8s) = [| getWord64OfUnsafe w8 >> $(go w8s) |] go [] = [| pure () |] case w8s of [] -> go leading where - go (a:b:c:d:[]) = let !w = packBytes [a, b, c, d] in [| scan32# w |] - go (a:b:c:d:ws) = let !w = packBytes [a, b, c, d] in [| scan32# w >> $(go ws) |] - go (a:b:[]) = let !w = packBytes [a, b] in [| scan16# w |] - go (a:b:ws) = let !w = packBytes [a, b] in [| scan16# w >> $(go ws) |] - go (a:[]) = [| scan8# a |] + go (a:b:c:d:[]) = let !w = packBytes [a, b, c, d] in [| getWord32OfUnsafe w |] + go (a:b:c:d:ws) = let !w = packBytes [a, b, c, d] in [| getWord32OfUnsafe w >> $(go ws) |] + go (a:b:[]) = let !w = packBytes [a, b] in [| getWord16OfUnsafe w |] + go (a:b:ws) = let !w = packBytes [a, b] in [| getWord16OfUnsafe w >> $(go ws) |] + go (a:[]) = [| getWord8OfUnsafe a |] go [] = [| pure () |] _ -> case leading of [] -> scanw8s - [a] -> [| scan8# a >> $scanw8s |] - ws@[a, b] -> let !w = packBytes ws in [| scan16# w >> $scanw8s |] - ws@[a, b, c, d] -> let !w = packBytes ws in [| scan32# w >> $scanw8s |] + [a] -> [| getWord8OfUnsafe a >> $scanw8s |] + ws@[a, b] -> let !w = packBytes ws in [| getWord16OfUnsafe w >> $scanw8s |] + ws@[a, b, c, d] -> let !w = packBytes ws in [| getWord32OfUnsafe w >> $scanw8s |] ws -> let !w = packBytes ws !l = length ws in [| scanPartial64# l w >> $scanw8s |] @@ -1129,7 +531,7 @@ genTrie (rules, t) = do !defaultCase <- fallback r (n + 1) let cases = mkDoE $ - [BindS (VarP (mkName "c")) (VarE 'scanAny8#), + [BindS (VarP (mkName "c")) (VarE 'getWord8Unsafe), NoBindS (CaseE (VarE (mkName "c")) (map (\(w, t) -> Match (LitP (IntegerL (fromIntegral w))) @@ -1182,313 +584,145 @@ genSwitchTrie' postAction cases fallback = -------------------------------------------------------------------------------- -withAnyWord8# :: (Word8'# -> Parser e a) -> Parser e a -withAnyWord8# p = Parser \fp eob buf -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case indexWord8OffAddr# buf 0# of - w# -> runParser# (p w#) fp eob (plusAddr# buf 1#) -{-# inline withAnyWord8# #-} - -withAnyWord16# :: (Word16'# -> Parser e a) -> Parser e a -withAnyWord16# p = Parser \fp eob buf -> case 2# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexWord16OffAddr# buf 0# of - w# -> runParser# (p w#) fp eob (plusAddr# buf 2#) -{-# inline withAnyWord16# #-} - -withAnyWord32# :: (Word32'# -> Parser e a) -> Parser e a -withAnyWord32# p = Parser \fp eob buf -> case 4# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexWord32OffAddr# buf 0# of - w# -> runParser# (p w#) fp eob (plusAddr# buf 4#) -{-# inline withAnyWord32# #-} - -withAnyWord64# :: (Word# -> Parser e a) -> Parser e a -withAnyWord64# p = Parser \fp eob buf -> case 8# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexWordOffAddr# buf 0# of - w# -> runParser# (p w#) fp eob (plusAddr# buf 8#) -{-# inline withAnyWord64# #-} - -withAnyInt8# :: (Int8'# -> Parser e a) -> Parser e a -withAnyInt8# p = Parser \fp eob buf -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case indexInt8OffAddr# buf 0# of - i# -> runParser# (p i#) fp eob (plusAddr# buf 1#) -{-# inline withAnyInt8# #-} - -withAnyInt16# :: (Int16'# -> Parser e a) -> Parser e a -withAnyInt16# p = Parser \fp eob buf -> case 2# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexInt16OffAddr# buf 0# of - i# -> runParser# (p i#) fp eob (plusAddr# buf 2#) -{-# inline withAnyInt16# #-} - -withAnyInt32# :: (Int32'# -> Parser e a) -> Parser e a -withAnyInt32# p = Parser \fp eob buf -> case 4# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexInt32OffAddr# buf 0# of - i# -> runParser# (p i#) fp eob (plusAddr# buf 4#) -{-# inline withAnyInt32# #-} - -withAnyInt64# :: (Int# -> Parser e a) -> Parser e a -withAnyInt64# p = Parser \fp eob buf -> case 8# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexInt64OffAddr# buf 0# of - i# -> runParser# (p i#) fp eob (plusAddr# buf 8#) -{-# inline withAnyInt64# #-} - --------------------------------------------------------------------------------- - --- | Parse any 'Word8' (byte). -anyWord8 :: Parser e Word8 -anyWord8 = withAnyWord8# (\w# -> pure (W8# w#)) -{-# inline anyWord8 #-} - --- | Skip any 'Word8' (byte). -anyWord8_ :: Parser e () -anyWord8_ = () <$ anyWord8 -{-# inline anyWord8_ #-} - --- | Parse any 'Word16'. -anyWord16 :: Parser e Word16 -anyWord16 = withAnyWord16# (\w# -> pure (W16# w#)) -{-# inline anyWord16 #-} - --- | Skip any 'Word16'. -anyWord16_ :: Parser e () -anyWord16_ = () <$ anyWord16 -{-# inline anyWord16_ #-} - --- | Parse any 'Word32'. -anyWord32 :: Parser e Word32 -anyWord32 = withAnyWord32# (\w# -> pure (W32# w#)) -{-# inline anyWord32 #-} - --- | Skip any 'Word32'. -anyWord32_ :: Parser e () -anyWord32_ = () <$ anyWord32 -{-# inline anyWord32_ #-} - --- | Parse any 'Word64'. -anyWord64 :: Parser e Word64 -anyWord64 = withAnyWord64# (\w# -> pure (W64# w#)) -{-# inline anyWord64 #-} - --- | Skip any 'Word64'. -anyWord64_ :: Parser e () -anyWord64_ = () <$ anyWord64 -{-# inline anyWord64_ #-} - --- | Parse any 'Word'. -anyWord :: Parser e Word -anyWord = withAnyWord64# (\w# -> pure (W# w#)) -{-# inline anyWord #-} - --- | Skip any 'Word'. -anyWord_ :: Parser e () -anyWord_ = () <$ anyWord -{-# inline anyWord_ #-} - --------------------------------------------------------------------------------- - --- | Parse any 'Int8'. -anyInt8 :: Parser e Int8 -anyInt8 = withAnyInt8# (\i# -> pure (I8# i#)) -{-# inline anyInt8 #-} - --- | Parse any 'Int16'. -anyInt16 :: Parser e Int16 -anyInt16 = withAnyInt16# (\i# -> pure (I16# i#)) -{-# inline anyInt16 #-} - --- | Parse any 'Int32'. -anyInt32 :: Parser e Int32 -anyInt32 = withAnyInt32# (\i# -> pure (I32# i#)) -{-# inline anyInt32 #-} +-- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. +-- This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not +-- backtrack from the true/false cases. +branch :: Parser e a -> Parser e b -> Parser e b -> Parser e b +branch pa pt pf = Parser \fp eob s -> case runParser# pa fp eob s of + OK# _ s -> runParser# pt fp eob s + Fail# -> runParser# pf fp eob s + Err# e -> Err# e +{-# inline branch #-} --- | Parse any 'Int64'. -anyInt64 :: Parser e Int64 -anyInt64 = withAnyInt64# (\i# -> pure (I64# i#)) -{-# inline anyInt64 #-} +-- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All +-- isolated bytes must be consumed. +-- +-- Throws a runtime error if given a negative integer. +isolate :: Int -> Parser e a -> Parser e a +isolate (I# n#) p = withPosInt# n# (\n'# -> isolateUnsafe# n'# p) +{-# inline isolate #-} --- | Parse any 'Int'. -anyInt :: Parser e Int -anyInt = withAnyInt64# (\i# -> pure (I# i#)) -{-# inline anyInt #-} +-- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All +-- isolated bytes must be consumed. +-- +-- Undefined behaviour if given a negative integer. +isolateUnsafe# :: Int# -> Parser e a -> Parser e a +isolateUnsafe# n# p = Parser \fp eob s -> + let s' = plusAddr# s n# + in case n# <=# minusAddr# eob s of + 1# -> case runParser# p fp s' s of + OK# a s'' -> case eqAddr# s' s'' of + 1# -> OK# a s'' + _ -> Fail# -- isolated segment wasn't fully consumed + Fail# -> Fail# + Err# e -> Err# e + _ -> Fail# -- you tried to isolate more than we have left +{-# inline isolateUnsafe# #-} -------------------------------------------------------------------------------- --- | Parse any 'Word16' (little-endian). -anyWord16le :: Parser e Word16 -anyWord16le = anyWord16 -{-# inline anyWord16le #-} - --- | Parse any 'Word16' (big-endian). -anyWord16be :: Parser e Word16 -anyWord16be = withAnyWord16# (\w# -> pure (W16# (byteSwap16'# w#))) -{-# inline anyWord16be #-} - --- | Parse any 'Word32' (little-endian). -anyWord32le :: Parser e Word32 -anyWord32le = anyWord32 -{-# inline anyWord32le #-} - --- | Parse any 'Word32' (big-endian). -anyWord32be :: Parser e Word32 -anyWord32be = withAnyWord32# (\w# -> pure (W32# (byteSwap32'# w#))) -{-# inline anyWord32be #-} +-- | Convert an UTF-8-coded `B.ByteString` to a `String`. +unpackUTF8 :: B.ByteString -> String +unpackUTF8 str = case runParser takeRestString str of + OK a _ -> a + _ -> error "unpackUTF8: invalid encoding" --- | Parse any 'Word64' (little-endian). -anyWord64le :: Parser e Word64 -anyWord64le = anyWord64 -{-# inline anyWord64le #-} +-- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. +takeRestString :: Parser e String +takeRestString = branch eof (pure "") do + c <- getChar + cs <- takeRestString + pure (c:cs) --- | Parse any 'Word64' (big-endian). -anyWord64be :: Parser e Word64 -anyWord64be = withAnyWord64# (\w# -> pure (W64# (byteSwap# w#))) -{-# inline anyWord64be #-} +-- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. +-- This can be used for debugging. +traceRestString :: Parser e String +traceRestString = lookahead takeRestString -------------------------------------------------------------------------------- --- | Parse any 'Int16' (little-endian). -anyInt16le :: Parser e Int16 -anyInt16le = anyInt16 -{-# inline anyInt16le #-} - --- | Parse any 'Int16' (big-endian). -anyInt16be :: Parser e Int16 -anyInt16be = withAnyWord16# (\w# -> pure (I16# (word16ToInt16# (byteSwap16'# w#)))) -{-# inline anyInt16be #-} - --- | Parse any 'Int32' (little-endian). -anyInt32le :: Parser e Int32 -anyInt32le = anyInt32 -{-# inline anyInt32le #-} - --- | Parse any 'Int32' (big-endian). -anyInt32be :: Parser e Int32 -anyInt32be = withAnyWord32# (\w# -> pure (I32# (word32ToInt32# (byteSwap32'# w#)))) -{-# inline anyInt32be #-} +-- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, +-- throws an error if the encoding is invalid. +takeLine :: Parser e String +takeLine = branch eof (pure "") do + c <- getChar + case c of + '\n' -> pure "" + _ -> (c:) <$> takeLine --- | Parse any 'Int64' (little-endian). -anyInt64le :: Parser e Int64 -anyInt64le = anyInt64 -{-# inline anyInt64le #-} +-- | Parse the rest of the current line as a `String`, but restore the parsing state. +-- Assumes UTF-8 encoding. This can be used for debugging. +traceLine :: Parser e String +traceLine = lookahead takeLine --- | Parse any 'Int64' (big-endian). -anyInt64be :: Parser e Int64 -anyInt64be = withAnyWord64# (\w# -> pure (I64# (word2Int# (byteSwap# w#)))) -{-# inline anyInt64be #-} +-- | Run a parser on a `String` input. Reminder: @OverloadedStrings@ for `B.ByteString` does not +-- yield a valid UTF-8 encoding! For non-ASCII `B.ByteString` literal input, use `runParserS` or +-- `packUTF8` for testing. +runParserS :: Parser e a -> String -> Result e a +runParserS pa s = runParser pa (packUTF8 s) --------------------------------------------------------------------------------- +-- | Create a `Pos` from a line and column number. Throws an error on out-of-bounds +-- line and column numbers. +mkPos :: B.ByteString -> (Int, Int) -> Pos +mkPos str (line', col') = + let go line col | line == line' && col == col' = getPos + go line col = (do + c <- getChar + if c == '\n' then go (line + 1) 0 + else go line (col + 1)) <|> error "mkPos: invalid position" + in case runParser (go 0 0) str of + OK res _ -> res + _ -> error "impossible" --- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ --- bytes are available. --- --- Throws a runtime error if given a negative integer. -atSkip# :: Int# -> Parser e a -> Parser e a -atSkip# os# (Parser p) = Parser \fp eob s -> case os# <=# minusAddr# eob s of - 1# -> case os# >=# 0# of - 1# -> p fp eob (plusAddr# s os#) - _ -> error "FlatParse.Basic.atSkip#: negative integer" - _ -> Fail# -{-# inline atSkip# #-} +-- | Break an UTF-8-coded `B.ByteString` to lines. Throws an error on invalid +-- input. This is mostly useful for grabbing specific source lines for +-- displaying error messages. +lines :: B.ByteString -> [String] +lines str = + let go = ([] <$ eof) <|> ((:) <$> takeLine <*> go) + in case runParser go str of + OK ls _ -> ls + _ -> error "linesUTF8: invalid input" --- | Read the given number of bytes as a 'ByteString'. --- --- Throws a runtime error if given a negative integer. -takeBs# :: Int# -> Parser e B.ByteString -takeBs# n# = Parser \fp eob s -> case n# <=# minusAddr# eob s of - 1# -> -- have to runtime check for negative values, because they cause a hang - case n# >=# 0# of - 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) - _ -> error "FlatParse.Basic.takeBs: negative integer" - _ -> Fail# -{-# inline takeBs# #-} --------------------------------------------------------------------------------- +-- | Check whether a `Pos` points into a `B.ByteString`. +validPos :: B.ByteString -> Pos -> Bool +validPos str pos = + let go = do + start <- getPos + pure (start <= pos && pos <= endPos) + in case runParser go str of + OK b _ -> b + _ -> error "impossible" +{-# inline validPos #-} --- | Run a parser, passing it the current address the parser is at. --- --- Useful for parsing offset-based data tables. For example, you may use this to --- save the base address to use together with various 0-indexed offsets. -withAddr# :: (Addr# -> Parser e a) -> Parser e a -withAddr# p = Parser \fp eob s -> runParser# (p s) fp eob s -{-# inline withAddr# #-} - --- | @takeBsOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ --- bytes, reads @len#@ bytes into a 'ByteString', and restores the original --- address. --- --- The 'Addr#' should be from 'withAddr#'. --- --- Useful for parsing offset-based data tables. For example, you may use this --- together with 'withAddr#' to jump to an offset in your input and read some --- data. -takeBsOffAddr# :: Addr# -> Int# -> Int# -> Parser e B.ByteString -takeBsOffAddr# addr# offset# len# = - lookaheadFromAddr# addr# $ atSkip# offset# $ takeBs# len# -{-# inline takeBsOffAddr# #-} - --- | 'lookahead', but specify the address to lookahead from. --- --- The 'Addr#' should be from 'withAddr#'. -lookaheadFromAddr# :: Addr# -> Parser e a -> Parser e a -lookaheadFromAddr# s = lookahead . atAddr# s -{-# inline lookaheadFromAddr# #-} +-- | Compute corresponding line and column numbers for each `Pos` in a list. Throw an error +-- on invalid positions. Note: computing lines and columns may traverse the `B.ByteString`, +-- but it traverses it only once regardless of the length of the position list. +posLineCols :: B.ByteString -> [Pos] -> [(Int, Int)] +posLineCols str poss = + let go !line !col [] = pure [] + go line col ((i, pos):poss) = do + p <- getPos + if pos == p then + ((i, (line, col)):) <$> go line col poss + else do + c <- getChar + if '\n' == c then + go (line + 1) 0 ((i, pos):poss) + else + go line (col + 1) ((i, pos):poss) --- | Run a parser at the given address. --- --- The 'Addr#' should be from 'withAddr#'. --- --- This is a highly internal function -- you likely want 'lookaheadFromAddr#', --- which will reset the address after running the parser. -atAddr# :: Addr# -> Parser e a -> Parser e a -atAddr# s (Parser p) = Parser \fp eob _ -> p fp eob s -{-# inline atAddr# #-} + sorted :: [(Int, Pos)] + sorted = sortBy (comparing snd) (zip [0..] poss) --------------------------------------------------------------------------------- + in case runParser (go 0 0 sorted) str of + OK res _ -> snd <$> sortBy (comparing fst) res + _ -> error "invalid position" --- | Read a null-terminated bytestring (a C-style string). --- --- Consumes the null terminator. -anyCString :: Parser e B.ByteString -anyCString = Parser \fp eob s -> go' fp eob s - where - go' fp eob s0 = go 0# s0 - where - go n# s = case eqAddr# eob s of - 1# -> Fail# - _ -> - let s' = plusAddr# s 1# - -- TODO below is a candidate for improving with ExtendedLiterals! - in case eqWord8# (indexWord8OffAddr''# s 0#) (wordToWord8''# 0##) of - 1# -> OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' - _ -> go (n# +# 1#) s' -{-# inline anyCString #-} - --- | Read a null-terminated bytestring (a C-style string), where the bytestring --- is known to be null-terminated somewhere in the input. --- --- Highly unsafe. Unless you have a guarantee that the string will be null --- terminated before the input ends, use 'anyCString' instead. Honestly, I'm not --- sure if this is a good function to define. But here it is. --- --- Fails on GHC versions older than 9.0, since we make use of the --- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful --- without it. --- --- Consumes the null terminator. -anyCStringUnsafe :: Parser e B.ByteString -{-# inline anyCStringUnsafe #-} -#if MIN_VERSION_base(4,15,0) -anyCStringUnsafe = Parser \fp eob s -> - case eqAddr# eob s of - 1# -> Fail# - _ -> let n# = cstringLength# s - s'# = plusAddr# s (n# +# 1#) - in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# -#else -anyCStringUnsafe = error "Flatparse.Basic.anyCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" -#endif +-- | Create a `B.ByteString` from a `Span`. The result is invalid if the `Span` points +-- outside the current buffer, or if the `Span` start is greater than the end position. +unsafeSpanToByteString :: Span -> Parser e B.ByteString +unsafeSpanToByteString (Span l r) = + lookahead (setPos l >> byteStringOf (setPos r)) +{-# inline unsafeSpanToByteString #-} diff --git a/src/FlatParse/Basic/Chars.hs b/src/FlatParse/Basic/Chars.hs new file mode 100644 index 0000000..de44048 --- /dev/null +++ b/src/FlatParse/Basic/Chars.hs @@ -0,0 +1,171 @@ +module FlatParse.Basic.Chars where + +import Prelude hiding ( getChar ) + +import FlatParse.Basic.Parser + +import FlatParse.Common.Assorted ( derefChar8# ) + +import GHC.Exts + +-- | Parse a UTF-8 `Char` for which a predicate holds. +satisfy :: (Char -> Bool) -> Parser e Char +satisfy f = Parser \fp eob s -> case runParser# getChar fp eob s of + OK# c s | f c -> OK# c s + _ -> Fail# +{-# inline satisfy #-} + +-- | Skip a UTF-8 `Char` for which a predicate holds. +satisfy_ :: (Char -> Bool) -> Parser e () +satisfy_ f = Parser \fp eob s -> case runParser# getChar fp eob s of + OK# c s | f c -> OK# () s + _ -> Fail# +{-# inline satisfy_ #-} + +-- | Parse an ASCII `Char` for which a predicate holds. Assumption: the predicate must only return +-- `True` for ASCII-range characters. Otherwise this function might read a 128-255 range byte, +-- thereby breaking UTF-8 decoding. +satisfyASCII :: (Char -> Bool) -> Parser e Char +satisfyASCII f = Parser \fp eob s -> case eqAddr# eob s of + 1# -> Fail# + _ -> case derefChar8# s of + c1 | f (C# c1) -> OK# (C# c1) (plusAddr# s 1#) + | otherwise -> Fail# +{-# inline satisfyASCII #-} + +-- | Skip an ASCII `Char` for which a predicate holds. Assumption: the predicate +-- must only return `True` for ASCII-range characters. +satisfyASCII_ :: (Char -> Bool) -> Parser e () +satisfyASCII_ f = Parser \fp eob s -> case eqAddr# eob s of + 1# -> Fail# + _ -> case derefChar8# s of + c1 | f (C# c1) -> OK# () (plusAddr# s 1#) + | otherwise -> Fail# +{-# inline satisfyASCII_ #-} + +-- | This is a variant of `satisfy` which allows more optimization. We can pick four testing +-- functions for the four cases for the possible number of bytes in the UTF-8 character. So in +-- @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with +-- @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups. +-- +-- For example, if we want to accept any letter, the naive solution would be to use +-- `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We +-- can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the +-- `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of +-- all cases without accessing the character table. +fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e Char +fusedSatisfy f1 f2 f3 f4 = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> case derefChar8# buf of + c1 -> case c1 `leChar#` '\x7F'# of + 1# | f1 (C# c1) -> OK# (C# c1) (plusAddr# buf 1#) + | otherwise -> Fail# + _ -> case eqAddr# eob (plusAddr# buf 1#) of + 1# -> Fail# + _ -> case indexCharOffAddr# buf 1# of + c2 -> case c1 `leChar#` '\xDF'# of + 1# -> + let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` + (ord# c2 -# 0x80#))) + in case f2 resc of + True -> OK# resc (plusAddr# buf 2#) + _ -> Fail# + _ -> case eqAddr# eob (plusAddr# buf 2#) of + 1# -> Fail# + _ -> case indexCharOffAddr# buf 2# of + c3 -> case c1 `leChar#` '\xEF'# of + 1# -> + let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` + ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` + (ord# c3 -# 0x80#))) + in case f3 resc of + True -> OK# resc (plusAddr# buf 3#) + _ -> Fail# + _ -> case eqAddr# eob (plusAddr# buf 3#) of + 1# -> Fail# + _ -> case indexCharOffAddr# buf 3# of + c4 -> + let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` + ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` + ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` + (ord# c4 -# 0x80#))) + in case f4 resc of + True -> OK# resc (plusAddr# buf 4#) + _ -> Fail# +{-# inline fusedSatisfy #-} + +-- | Skipping variant of `fusedSatisfy`. +fusedSatisfy_ :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser e () +fusedSatisfy_ f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4 +{-# inline fusedSatisfy_ #-} + +-- | Parse any UTF-8-encoded `Char`. +getChar :: Parser e Char +getChar = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> case derefChar8# buf of + c1 -> case c1 `leChar#` '\x7F'# of + 1# -> OK# (C# c1) (plusAddr# buf 1#) + _ -> case eqAddr# eob (plusAddr# buf 1#) of + 1# -> Fail# + _ -> case indexCharOffAddr# buf 1# of + c2 -> case c1 `leChar#` '\xDF'# of + 1# -> + let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` + (ord# c2 -# 0x80#) + in OK# (C# (chr# resc)) (plusAddr# buf 2#) + _ -> case eqAddr# eob (plusAddr# buf 2#) of + 1# -> Fail# + _ -> case indexCharOffAddr# buf 2# of + c3 -> case c1 `leChar#` '\xEF'# of + 1# -> + let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` + ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` + (ord# c3 -# 0x80#) + in OK# (C# (chr# resc)) (plusAddr# buf 3#) + _ -> case eqAddr# eob (plusAddr# buf 3#) of + 1# -> Fail# + _ -> case indexCharOffAddr# buf 3# of + c4 -> + let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` + ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` + ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` + (ord# c4 -# 0x80#) + in OK# (C# (chr# resc)) (plusAddr# buf 4#) +{-# inline getChar #-} + +-- | Skip any UTF-8-encoded `Char`. +getChar_ :: Parser e () +getChar_ = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> case derefChar8# buf of + c1 -> case c1 `leChar#` '\x7F'# of + 1# -> OK# () (plusAddr# buf 1#) + _ -> + let buf' = + case c1 `leChar#` '\xDF'# of + 1# -> plusAddr# buf 2# + _ -> case c1 `leChar#` '\xEF'# of + 1# -> plusAddr# buf 3# + _ -> plusAddr# buf 4# + in case leAddr# buf' eob of + 1# -> OK# () buf' + _ -> Fail# +{-# inline getChar_ #-} + +-- | Parse any `Char` in the ASCII range, fail if the next input character is not in the range. +-- This is more efficient than `getChar` if we are only working with ASCII. +getCharASCII :: Parser e Char +getCharASCII = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> case derefChar8# buf of + c1 -> case c1 `leChar#` '\x7F'# of + 1# -> OK# (C# c1) (plusAddr# buf 1#) + _ -> Fail# +{-# inline getCharASCII #-} + +-- | Skip any `Char` in the ASCII range. More efficient than `getChar_` if we're working only with +-- ASCII. +getCharASCII_ :: Parser e () +getCharASCII_ = () <$ getCharASCII +{-# inline getCharASCII_ #-} diff --git a/src/FlatParse/Basic/Integers.hs b/src/FlatParse/Basic/Integers.hs new file mode 100644 index 0000000..f5d7516 --- /dev/null +++ b/src/FlatParse/Basic/Integers.hs @@ -0,0 +1,340 @@ +{- | Machine integer parsers. + +The following naming conventions are followed: + + * @withX@ are continuation passing style (CPS) parsers. + * These are sprinkled throughout to enable better reasoning about + performance. + * @getX@ are regular monadic parsers. + * @getXOf@ parse and check equality with a provided value. + * Definitions ending with @#@ are called with unboxed values. + * Definitions ending with @Unsafe@ are unsafe. Read their documentation before + using. + +Other points: + + * TODO: The endianness code is currently lying. We blindly assume that our + host system is little-endian, and parse in big-endian by parsing normally + then "reversing" the resulting integer. + * Unless otherwise mentioned, native byte order is utilized. +-} + +module FlatParse.Basic.Integers + ( + -- * Machine integer parsers (native byte order) + getWord8, getWord16, getWord32, getWord64 + , getInt8, getInt16, getInt32, getInt64 + , getWord, getInt + + -- * Machine integer parsers (explicit endianness) + , getWord16le, getWord16be + , getWord32le, getWord32be + , getWord64le, getWord64be + , getInt16le, getInt16be + , getInt32le, getInt32be + , getInt64le, getInt64be + + -- * Machine integer parsers which assert value + , getWord8Of + + -- * Machine integer CPS parsers + , withWord8, withWord16, withWord32, withWord64 + , withInt8, withInt16, withInt32, withInt64 + , withWord, withInt + + -- * Unsafe machine integer parsers + , getWord8Unsafe + + -- * Unsafe machine integer parsers which assert value + , getWord8OfUnsafe, getWord16OfUnsafe, getWord32OfUnsafe, getWord64OfUnsafe + + -- * Helper definitions + , withSized#, withSizedUnsafe# + , getSizedOfUnsafe# + ) where + +import GHC.Exts +import GHC.Word +import GHC.Int + +import FlatParse.Basic.Parser +import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 ) + +-------------------------------------------------------------------------------- + +-- | Helper for defining CPS parsers for types of a constant byte size (i.e. +-- machine integers). +-- +-- Call this with an @indexXYZOffAddr@ primop (e.g. +-- 'GHC.Exts.indexWord8OffAddr') and the size in bytes of the type you're +-- parsing. +withSized# + :: Int# -> (Addr# -> Int# -> a) -> (a -> Parser e r) -> Parser e r +withSized# size# indexOffAddr p = Parser \fp eob buf -> + case size# <=# minusAddr# eob buf of + 0# -> Fail# + _ -> runParser# (withSizedUnsafe# size# indexOffAddr p) fp eob buf +{-# inline withSized# #-} + +-- | Unsafe helper for defining CPS parsers for types of a constant byte size +-- (i.e. machine integers). +-- +-- The caller must guarantee that the input has enough bytes. +withSizedUnsafe# + :: Int# -> (Addr# -> Int# -> a) -> (a -> Parser e r) -> Parser e r +withSizedUnsafe# size# indexOffAddr p = Parser \fp eob buf -> + let a = indexOffAddr buf 0# + buf' = plusAddr# buf size# + in runParser# (p a) fp eob buf' +{-# inline withSizedUnsafe# #-} + +-- | Parse any 'Word8' (CPS). +withWord8 :: (Word8 -> Parser e r) -> Parser e r +withWord8 p = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> let w# = indexWord8OffAddr# buf 0# + in runParser# (p (W8# w#)) fp eob (plusAddr# buf 1#) +{-# inline withWord8 #-} + +-- | Parse any 'Word16' (native byte order) (CPS). +withWord16 :: (Word16 -> Parser e r) -> Parser e r +withWord16 = withSized# 2# (\a i -> W16# (indexWord16OffAddr# a i)) +{-# inline withWord16 #-} + +-- | Parse any 'Word32' (native byte order) (CPS). +withWord32 :: (Word32 -> Parser e r) -> Parser e r +withWord32 = withSized# 4# (\a i -> W32# (indexWord32OffAddr# a i)) +{-# inline withWord32 #-} + +-- | Parse any 'Word64' (native byte order) (CPS). +withWord64 :: (Word64 -> Parser e r) -> Parser e r +withWord64 = withSized# 8# (\a i -> W64# (indexWord64OffAddr# a i)) +{-# inline withWord64 #-} + +-- | Parse any 'Int8' (CPS). +withInt8 :: (Int8 -> Parser e r) -> Parser e r +withInt8 p = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> let i# = indexInt8OffAddr# buf 0# + in runParser# (p (I8# i#)) fp eob (plusAddr# buf 1#) +{-# inline withInt8 #-} + +-- | Parse any 'Int16' (native byte order) (CPS). +withInt16 :: (Int16 -> Parser e r) -> Parser e r +withInt16 = withSized# 2# (\a i -> I16# (indexInt16OffAddr# a i)) +{-# inline withInt16 #-} + +-- | Parse any 'Int32' (native byte order) (CPS). +withInt32 :: (Int32 -> Parser e r) -> Parser e r +withInt32 = withSized# 4# (\a i -> I32# (indexInt32OffAddr# a i)) +{-# inline withInt32 #-} + +-- | Parse any 'Int64' (native byte order) (CPS). +withInt64 :: (Int64 -> Parser e r) -> Parser e r +withInt64 = withSized# 8# (\a i -> I64# (indexInt64OffAddr# a i)) +{-# inline withInt64 #-} + +-- | Parse any 'Word' (CPS). +-- +-- TODO assumes 64-bit platform +withWord :: (Word -> Parser e r) -> Parser e r +withWord p = Parser \fp eob buf -> case 8# <=# minusAddr# eob buf of + 0# -> Fail# + _ -> let w# = indexWordOffAddr# buf 0# + in runParser# (p (W# w#)) fp eob (plusAddr# buf 8#) +{-# inline withWord #-} + +-- | Parse any 'Int' (CPS). +-- +-- TODO assumes 64-bit platform +withInt :: (Int -> Parser e r) -> Parser e r +withInt p = Parser \fp eob buf -> case 8# <=# minusAddr# eob buf of + 0# -> Fail# + _ -> let i# = indexIntOffAddr# buf 0# + in runParser# (p (I# i#)) fp eob (plusAddr# buf 8#) +{-# inline withInt #-} + +-------------------------------------------------------------------------------- + +-- | Parse any 'Word8'. +getWord8 :: Parser e Word8 +getWord8 = withWord8 pure +{-# inline getWord8 #-} + +-- | Parse any 'Word16' (native byte order). +getWord16 :: Parser e Word16 +getWord16 = withWord16 pure +{-# inline getWord16 #-} + +-- | Parse any 'Word32' (native byte order). +getWord32 :: Parser e Word32 +getWord32 = withWord32 pure +{-# inline getWord32 #-} + +-- | Parse any 'Word64' (native byte order). +getWord64 :: Parser e Word64 +getWord64 = withWord64 pure +{-# inline getWord64 #-} + +-- | Parse any 'Int8'. +getInt8 :: Parser e Int8 +getInt8 = withInt8 pure +{-# inline getInt8 #-} + +-- | Parse any 'Int16' (native byte order). +getInt16 :: Parser e Int16 +getInt16 = withInt16 pure +{-# inline getInt16 #-} + +-- | Parse any 'Int32' (native byte order). +getInt32 :: Parser e Int32 +getInt32 = withInt32 pure +{-# inline getInt32 #-} + +-- | Parse any 'Int64' (native byte order). +getInt64 :: Parser e Int64 +getInt64 = withInt64 pure +{-# inline getInt64 #-} + +-- | Parse any 'Word'. +-- +-- TODO 'withWord' assumes 64-bit platform +getWord :: Parser e Word +getWord = withWord pure +{-# inline getWord #-} + +-- | Parse any 'Int'. +-- +-- TODO 'withInt' assumes 64-bit platform +getInt :: Parser e Int +getInt = withInt pure +{-# inline getInt #-} + +-------------------------------------------------------------------------------- + +-- | Parse any 'Word16' (little-endian). +getWord16le :: Parser e Word16 +getWord16le = getWord16 +{-# inline getWord16le #-} + +-- | Parse any 'Word16' (big-endian). +getWord16be :: Parser e Word16 +getWord16be = withWord16 (pure . byteSwap16) +{-# inline getWord16be #-} + +-- | Parse any 'Word32' (little-endian). +getWord32le :: Parser e Word32 +getWord32le = getWord32 +{-# inline getWord32le #-} + +-- | Parse any 'Word32' (big-endian). +getWord32be :: Parser e Word32 +getWord32be = withWord32 (pure . byteSwap32) +{-# inline getWord32be #-} + +-- | Parse any 'Word64' (little-endian). +getWord64le :: Parser e Word64 +getWord64le = getWord64 +{-# inline getWord64le #-} + +-- | Parse any 'Word64' (big-endian). +getWord64be :: Parser e Word64 +getWord64be = withWord64 (pure . byteSwap64) +{-# inline getWord64be #-} + +-- | Parse any 'Int16' (little-endian). +getInt16le :: Parser e Int16 +getInt16le = getInt16 +{-# inline getInt16le #-} + +-- | Parse any 'Int16' (big-endian). +getInt16be :: Parser e Int16 +getInt16be = withWord16 (pure . word16ToInt16 . byteSwap16) +{-# inline getInt16be #-} + +-- | Parse any 'Int32' (little-endian). +getInt32le :: Parser e Int32 +getInt32le = getInt32 +{-# inline getInt32le #-} + +-- | Parse any 'Int32' (big-endian). +getInt32be :: Parser e Int32 +getInt32be = withWord32 (pure . word32ToInt32 . byteSwap32) +{-# inline getInt32be #-} + +-- | Parse any 'Int64' (little-endian). +getInt64le :: Parser e Int64 +getInt64le = getInt64 +{-# inline getInt64le #-} + +-- | Parse any 'Int64' (big-endian). +getInt64be :: Parser e Int64 +getInt64be = withWord64 (pure . word64ToInt64 . byteSwap64) +{-# inline getInt64be #-} + +-------------------------------------------------------------------------------- + +-- | Unsafe helper for defining parsers for types of a constant byte size (i.e. +-- machine integers) which assert the parsed value's... value. +-- +-- Call this with an @indexXYZOffAddr@ primop (e.g. +-- 'GHC.Exts.indexWord8OffAddr'), the size in bytes of the type you're parsing, +-- and the expected value to test the parsed value against. +-- +-- The caller must guarantee that the input has enough bytes. +getSizedOfUnsafe# :: Eq a => Int# -> (Addr# -> Int# -> a) -> a -> Parser e () +getSizedOfUnsafe# size# indexOffAddr aExpected = + withSizedUnsafe# size# indexOffAddr go + where + go aParsed = + if aParsed == aExpected + then pure () + else failed +{-# inline getSizedOfUnsafe# #-} + +-- | Read the next 1 byte and assert its value as a 'Word8'. +getWord8Of :: Word8 -> Parser e () +getWord8Of wExpected = Parser \fp eob buf -> case eqAddr# eob buf of + 1# -> Fail# + _ -> let w# = indexWord8OffAddr# buf 0# + in if W8# w# == wExpected + then OK# () (plusAddr# buf 1#) + else Fail# +{-# inline getWord8Of #-} + +-- | Unsafely read the next 1 byte and assert its value as a 'Word8'. +-- +-- The caller must guarantee that the input has enough bytes. +getWord8OfUnsafe :: Word8 -> Parser e () +getWord8OfUnsafe = getSizedOfUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) +{-# inline getWord8OfUnsafe #-} + +-- | Unsafely read the next 2 bytes and assert their value as a 'Word16'. +-- +-- The caller must guarantee that the input has enough bytes. +getWord16OfUnsafe :: Word16 -> Parser e () +getWord16OfUnsafe = getSizedOfUnsafe# 2# (\a i -> W16# (indexWord16OffAddr# a i)) +{-# inline getWord16OfUnsafe #-} + +-- | Unsafely read the next 4 bytes and assert their value as a 'Word32'. +-- +-- The caller must guarantee that the input has enough bytes. +getWord32OfUnsafe :: Word32 -> Parser e () +getWord32OfUnsafe = getSizedOfUnsafe# 4# (\a i -> W32# (indexWord32OffAddr# a i)) +{-# inline getWord32OfUnsafe #-} + +-- | Unsafely read the next 8 bytes and assert their value as a 'Word64'. +-- +-- The caller must guarantee that the input has enough bytes. +getWord64OfUnsafe :: Word64 -> Parser e () +getWord64OfUnsafe = getSizedOfUnsafe# 8# (\a i -> W64# (indexWord64OffAddr# a i)) +{-# inline getWord64OfUnsafe #-} + +-------------------------------------------------------------------------------- + +-- | Unsafely parse any 'Word8', without asserting the input is non-empty. +-- +-- The caller must guarantee that the input has enough bytes. +getWord8Unsafe :: Parser e Word8 +getWord8Unsafe = withSizedUnsafe# 1# (\a i -> W8# (indexWord8OffAddr# a i)) pure +{-# inline getWord8Unsafe #-} diff --git a/src/FlatParse/Basic/Internal.hs b/src/FlatParse/Basic/Internal.hs new file mode 100644 index 0000000..c8fee49 --- /dev/null +++ b/src/FlatParse/Basic/Internal.hs @@ -0,0 +1,197 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Basic.Internal where + +import FlatParse.Basic.Parser + +import GHC.Exts +import GHC.Word +import GHC.ForeignPtr + +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B + +-------------------------------------------------------------------------------- + +-- | Save the parsing state, then run a parser, then restore the state. +lookahead :: Parser e a -> Parser e a +lookahead (Parser f) = Parser \fp eob s -> + case f fp eob s of + OK# a _ -> OK# a s + x -> x +{-# inline lookahead #-} + +-------------------------------------------------------------------------------- + +-- | Check that the input has at least the given number of bytes. +ensureBytes# :: Int -> Parser e () +ensureBytes# (I# len) = Parser \fp eob s -> + case len <=# minusAddr# eob s of + 1# -> OK# () s + _ -> Fail# +{-# inline ensureBytes# #-} + +scanPartial64# :: Int -> Word -> Parser e () +scanPartial64# (I# len) (W# w) = Parser \fp eob s -> + case indexWordOffAddr# s 0# of + w' -> case uncheckedIShiftL# (8# -# len) 3# of + sh -> case uncheckedShiftL# w' sh of + w' -> case uncheckedShiftRL# w' sh of + w' -> case eqWord# w w' of + 1# -> OK# () (plusAddr# s len) + _ -> Fail# +{-# inline scanPartial64# #-} + +-- | Decrease the current input position by the given number of bytes. +setBack# :: Int -> Parser e () +setBack# (I# i) = Parser \fp eob s -> + OK# () (plusAddr# s (negateInt# i)) +{-# inline setBack# #-} + +-------------------------------------------------------------------------------- +-- Helpers for common internal operations + +-- | Assert for the given 'Int#' that @n >= 0@, and pass it on to the given +-- function. +-- +-- Throws a runtime error if given a negative integer. +withPosInt# :: Int# -> (Int# -> a) -> a +withPosInt# n# f = case n# >=# 0# of + 1# -> f n# + _ -> error "FlatParse.Basic.Internal.withPosInt#: negative integer" +{-# inline withPosInt# #-} + +-- | Run the given parser only if we have not yet reached the end of the buffer. +withNotEob :: Parser e a -> Parser e a +withNotEob (Parser p) = Parser \fp eob s -> case eqAddr# eob s of + 1# -> Fail# + _ -> p fp eob s +{-# inline withNotEob #-} + +-------------------------------------------------------------------------------- +-- Low level unboxed combinators + +-- | Read the given number of bytes as a 'ByteString'. +-- +-- Throws a runtime error if given a negative integer. +takeBs# :: Int# -> Parser e B.ByteString +takeBs# n# = withPosInt# n# takeBsUnsafe# +{-# inline takeBs# #-} + +-- | Read the given number of bytes as a 'ByteString'. +-- +-- Undefined behaviour if given a negative integer. +takeBsUnsafe# :: Int# -> Parser e B.ByteString +takeBsUnsafe# n# = Parser \fp eob s -> case n# <=# minusAddr# eob s of + 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) + _ -> Fail# +{-# inline takeBsUnsafe# #-} + +-- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ +-- bytes are available. +-- +-- Throws a runtime error if given a negative integer. +atSkip# :: Int# -> Parser e a -> Parser e a +atSkip# os# p = withPosInt# os# (\n# -> atSkipUnsafe# n# p) +{-# inline atSkip# #-} + +-- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ +-- bytes are available. +-- +-- Undefined behaviour if given a negative integer. +atSkipUnsafe# :: Int# -> Parser e a -> Parser e a +atSkipUnsafe# os# (Parser p) = Parser \fp eob s -> case os# <=# minusAddr# eob s of + 1# -> p fp eob (plusAddr# s os#) + _ -> Fail# +{-# inline atSkipUnsafe# #-} + +-- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. +-- +-- Throws a runtime error if given a negative integer. +skip# :: Int# -> Parser e () +skip# os# = atSkip# os# (pure ()) +{-# inline skip# #-} + +-- | Run a parser, passing it the current address the parser is at. +-- +-- Useful for parsing offset-based data tables. For example, you may use this to +-- save the base address to use together with various 0-indexed offsets. +withAddr# :: (Addr# -> Parser e a) -> Parser e a +withAddr# p = Parser \fp eob s -> runParser# (p s) fp eob s +{-# inline withAddr# #-} + +-- | @takeBsOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ +-- bytes, reads @len#@ bytes into a 'ByteString', and restores the original +-- address. +-- +-- The 'Addr#' should be from 'withAddr#'. +-- +-- Useful for parsing offset-based data tables. For example, you may use this +-- together with 'withAddr#' to jump to an offset in your input and read some +-- data. +takeBsOffAddr# :: Addr# -> Int# -> Int# -> Parser e B.ByteString +takeBsOffAddr# addr# offset# len# = + lookaheadFromAddr# addr# $ atSkip# offset# $ takeBs# len# +{-# inline takeBsOffAddr# #-} + +-- | 'lookahead', but specify the address to lookahead from. +-- +-- The 'Addr#' should be from 'withAddr#'. +lookaheadFromAddr# :: Addr# -> Parser e a -> Parser e a +lookaheadFromAddr# s = lookahead . atAddr# s +{-# inline lookaheadFromAddr# #-} + +-- | Run a parser at the given address. +-- +-- The 'Addr#' should be from 'withAddr#'. +-- +-- This is a highly internal function -- you likely want 'lookaheadFromAddr#', +-- which will reset the address after running the parser. +atAddr# :: Addr# -> Parser e a -> Parser e a +atAddr# s (Parser p) = Parser \fp eob _ -> p fp eob s +{-# inline atAddr# #-} + +-------------------------------------------------------------------------------- +-- Low-level boxed combinators + +-- | Read a null-terminated bytestring (a C-style string). +-- +-- Consumes the null terminator. +getCString :: Parser e B.ByteString +getCString = Parser \fp eob s -> go' fp eob s + where + go' fp eob s0 = go 0# s0 + where + go n# s = case eqAddr# eob s of + 1# -> Fail# + _ -> + let s' = plusAddr# s 1# + w# = indexWord8OffAddr# s 0# + in if W8# w# == 0x00 + then OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' + else go (n# +# 1#) s' +{-# inline getCString #-} + +-- | Read a null-terminated bytestring (a C-style string), where the bytestring +-- is known to be null-terminated somewhere in the input. +-- +-- Undefined behaviour if your bytestring isn't null-terminated somewhere. +-- You almost certainly want 'getCString' instead. +-- +-- Fails on GHC versions older than 9.0, since we make use of the +-- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful +-- without it. +-- +-- Consumes the null terminator. +getCStringUnsafe :: Parser e B.ByteString +{-# inline getCStringUnsafe #-} +#if MIN_VERSION_base(4,15,0) +getCStringUnsafe = Parser \fp eob s -> + case eqAddr# eob s of + 1# -> Fail# + _ -> let n# = cstringLength# s + s'# = plusAddr# s (n# +# 1#) + in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# +#else +getCStringUnsafe = error "Flatparse.Basic.getCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" +#endif diff --git a/src/FlatParse/Basic/Parser.hs b/src/FlatParse/Basic/Parser.hs new file mode 100644 index 0000000..a3117cd --- /dev/null +++ b/src/FlatParse/Basic/Parser.hs @@ -0,0 +1,135 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Basic.Parser + ( + -- * Parser & result types + Parser(..) + , Res# + , pattern OK#, pattern Err#, pattern Fail# + + -- * Primitive combinators + , failed + ) where + +import GHC.Exts ( Addr#, unsafeCoerce# ) +import GHC.ForeignPtr ( ForeignPtrContents ) + +import qualified Control.Applicative as Base +import Control.Monad ( MonadPlus(..) ) + +-- | @Parser e a@ has an error type @e@ and a return type @a@. +newtype Parser e a = Parser {runParser# :: ForeignPtrContents -> Addr# -> Addr# -> Res# e a} + +instance Functor (Parser e) where + fmap f (Parser g) = Parser \fp eob s -> case g fp eob s of + OK# a s -> let !b = f a in OK# b s + x -> unsafeCoerce# x + {-# inline fmap #-} + + (<$) a' (Parser g) = Parser \fp eob s -> case g fp eob s of + OK# a s -> OK# a' s + x -> unsafeCoerce# x + {-# inline (<$) #-} + +instance Applicative (Parser e) where + pure a = Parser \fp eob s -> OK# a s + {-# inline pure #-} + Parser ff <*> Parser fa = Parser \fp eob s -> case ff fp eob s of + OK# f s -> case fa fp eob s of + OK# a s -> let !b = f a in OK# b s + x -> unsafeCoerce# x + x -> unsafeCoerce# x + {-# inline (<*>) #-} + Parser fa <* Parser fb = Parser \fp eob s -> case fa fp eob s of + OK# a s -> case fb fp eob s of + OK# b s -> OK# a s + x -> unsafeCoerce# x + x -> unsafeCoerce# x + {-# inline (<*) #-} + Parser fa *> Parser fb = Parser \fp eob s -> case fa fp eob s of + OK# a s -> fb fp eob s + x -> unsafeCoerce# x + {-# inline (*>) #-} + +instance Monad (Parser e) where + return = pure + {-# inline return #-} + Parser fa >>= f = Parser \fp eob s -> case fa fp eob s of + OK# a s -> runParser# (f a) fp eob s + x -> unsafeCoerce# x + {-# inline (>>=) #-} + (>>) = (*>) + {-# inline (>>) #-} + +instance Base.Alternative (Parser e) where + empty = failed + {-# inline empty #-} + + (<|>) = (<|>) + {-# inline (Base.<|>) #-} + + -- TODO more efficient than default? (we do want to inline, so) + many (Parser f) = Parser go where + go fp eob s = case f fp eob s of + OK# a s -> case go fp eob s of + OK# as s -> OK# (a:as) s + x -> x + Fail# -> OK# [] s + Err# e -> Err# e + {-# inline many #-} + + some p = (:) <$> p <*> Base.many p + {-# inline some #-} + +-- | The failing parser. By default, parser choice `(<|>)` arbitrarily backtracks +-- on parser failure. +failed :: Parser e a +failed = Parser \fp eob s -> Fail# +{-# inline failed #-} + +infixr 6 <|> +(<|>) :: Parser e a -> Parser e a -> Parser e a +(<|>) (Parser f) (Parser g) = Parser \fp eob s -> + case f fp eob s of + Fail# -> g fp eob s + x -> x +{-# inline[1] (<|>) #-} + +{-# RULES + +"flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r) + +#-} + +instance MonadPlus (Parser e) where + mzero = failed + {-# inline mzero #-} + mplus = (<|>) + {-# inline mplus #-} + +-------------------------------------------------------------------------------- + +-- | Primitive result of a parser, stored using an unboxed sum. +-- +-- Possible results are given by the pattern synonyms 'OK#', 'Fail#' and +-- 'Error#' (in that order). +type Res# e a = + (# + (# a, Addr# #) + | (# #) + | (# e #) + #) + +-- | 'Res#' constructor for a successful parse. Contains return value and a +-- pointer to the rest of the input buffer. +pattern OK# :: a -> Addr# -> Res# e a +pattern OK# a s = (# (# a, s #) | | #) + +-- | 'Res#' constructor for recoverable failure. +pattern Fail# :: Res# e a +pattern Fail# = (# | (# #) | #) + +-- | 'Res#' constructor for errors which are by default non-recoverable. +pattern Err# :: e -> Res# e a +pattern Err# e = (# | | (# e #) #) +{-# complete OK#, Fail#, Err# #-} diff --git a/src/FlatParse/Basic/Position.hs b/src/FlatParse/Basic/Position.hs new file mode 100644 index 0000000..f566610 --- /dev/null +++ b/src/FlatParse/Basic/Position.hs @@ -0,0 +1,71 @@ +module FlatParse.Basic.Position where + +import FlatParse.Basic.Parser +import FlatParse.Common.Position + +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B + +import GHC.Exts +import GHC.ForeignPtr + +-- | Get the current position in the input. +getPos :: Parser e Pos +getPos = Parser \fp eob s -> OK# (addrToPos# eob s) s +{-# inline getPos #-} + +-- | Set the input position. Warning: this can result in crashes if the position points outside the +-- current buffer. It is always safe to `setPos` values which came from `getPos` with the current +-- input. +setPos :: Pos -> Parser e () +setPos s = Parser \fp eob _ -> OK# () (posToAddr# eob s) +{-# inline setPos #-} + +-- | The end of the input. +endPos :: Pos +endPos = Pos 0 +{-# inline endPos #-} + +-- | Return the consumed span of a parser. +spanOf :: Parser e a -> Parser e Span +spanOf (Parser f) = Parser \fp eob s -> case f fp eob s of + OK# a s' -> OK# (Span (addrToPos# eob s) (addrToPos# eob s')) s' + x -> unsafeCoerce# x +{-# inline spanOf #-} + +-- | Bind the result together with the span of the result. CPS'd version of +-- `spanOf` for better unboxing. +withSpan :: Parser e a -> (a -> Span -> Parser e r) -> Parser e r +withSpan (Parser f) g = Parser \fp eob s -> case f fp eob s of + OK# a s' -> runParser# (g a (Span (addrToPos# eob s) (addrToPos# eob s'))) fp eob s' + x -> unsafeCoerce# x +{-# inline withSpan #-} + +-- | Return the `B.ByteString` consumed by a parser. Note: it's more efficient +-- to use `spanOf` and `withSpan` instead. +byteStringOf :: Parser e a -> Parser e B.ByteString +byteStringOf (Parser f) = Parser \fp eob s -> case f fp eob s of + OK# a s' -> OK# (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) s' + x -> unsafeCoerce# x +{-# inline byteStringOf #-} + +-- | CPS'd version of `byteStringOf`. Can be more efficient, because the result +-- is more eagerly unboxed by GHC. It's more efficient to use `spanOf` or +-- `withSpan` instead. +withByteString :: Parser e a -> (a -> B.ByteString -> Parser e r) -> Parser e r +withByteString (Parser f) g = Parser \fp eob s -> case f fp eob s of + OK# a s' -> runParser# (g a (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s)))) fp eob s' + x -> unsafeCoerce# x +{-# inline withByteString #-} + +-- | Run a parser in a given input span. The input position and the `Int` state is restored after +-- the parser is finished, so `inSpan` does not consume input and has no side effect. Warning: +-- this operation may crash if the given span points outside the current parsing buffer. It's +-- always safe to use `inSpan` if the span comes from a previous `withSpan` or `spanOf` call on +-- the current input. +inSpan :: Span -> Parser e a -> Parser e a +inSpan (Span s eob) (Parser f) = Parser \fp eob' s' -> + case f fp (posToAddr# eob' eob) (posToAddr# eob' s) of + OK# a _ -> OK# a s' + x -> unsafeCoerce# x +{-# inline inSpan #-} diff --git a/src/FlatParse/Common/Assorted.hs b/src/FlatParse/Common/Assorted.hs new file mode 100644 index 0000000..140351b --- /dev/null +++ b/src/FlatParse/Common/Assorted.hs @@ -0,0 +1,135 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Common.Assorted + ( + -- * Compatibility + shortInteger + + -- * 'Char' predicates + , isDigit, isLatinLetter, isGreekLetter + + -- * UTF conversions + , packUTF8, charToBytes, strToBytes, packBytes, splitBytes, derefChar8# + + -- * Boxed integer coercions + -- $boxed-integer-coercion + , word16ToInt16 + , word32ToInt32 + , word64ToInt64 + ) where + +import Data.Bits +import Data.Char ( ord ) +import Data.Foldable (foldl') +import GHC.Exts + +import qualified Data.ByteString as B + +import Data.Word +import Data.Int + +#if MIN_VERSION_base(4,15,0) +import GHC.Num.Integer (Integer(..)) +#else +import GHC.Integer.GMP.Internals (Integer(..)) +#endif + +-- Compatibility +-------------------------------------------------------------------------------- + +shortInteger :: Int# -> Integer +#if MIN_VERSION_base(4,15,0) +shortInteger = IS +#else +shortInteger = S# +#endif +{-# inline shortInteger #-} + + +-- Char predicates +-------------------------------------------------------------------------------- + +-- | @isDigit c = \'0\' <= c && c <= \'9\'@ +isDigit :: Char -> Bool +isDigit c = '0' <= c && c <= '9' +{-# inline isDigit #-} + +-- | @isLatinLetter c = (\'A\' <= c && c <= \'Z\') || (\'a\' <= c && c <= \'z\')@ +isLatinLetter :: Char -> Bool +isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') +{-# inline isLatinLetter #-} + +-- | @isGreekLetter c = (\'Α\' <= c && c <= \'Ω\') || (\'α\' <= c && c <= \'ω\')@ +isGreekLetter :: Char -> Bool +isGreekLetter c = ('Α' <= c && c <= 'Ω') || ('α' <= c && c <= 'ω') +{-# inline isGreekLetter #-} + +-- UTF conversions +-------------------------------------------------------------------------------- + +-- | Convert a `String` to an UTF-8-coded `B.ByteString`. +packUTF8 :: String -> B.ByteString +packUTF8 str = B.pack $ do + c <- str + w <- charToBytes c + pure (fromIntegral w) + +charToBytes :: Char -> [Word] +charToBytes c' + | c <= 0x7f = [fromIntegral c] + | c <= 0x7ff = [0xc0 .|. y, 0x80 .|. z] + | c <= 0xffff = [0xe0 .|. x, 0x80 .|. y, 0x80 .|. z] + | c <= 0x10ffff = [0xf0 .|. w, 0x80 .|. x, 0x80 .|. y, 0x80 .|. z] + | otherwise = error "Not a valid Unicode code point" + where + c = ord c' + z = fromIntegral (c .&. 0x3f) + y = fromIntegral (unsafeShiftR c 6 .&. 0x3f) + x = fromIntegral (unsafeShiftR c 12 .&. 0x3f) + w = fromIntegral (unsafeShiftR c 18 .&. 0x7) + +strToBytes :: String -> [Word] +strToBytes = concatMap charToBytes +{-# inline strToBytes #-} + +packBytes :: [Word] -> Word +packBytes = fst . foldl' go (0, 0) where + go (acc, shift) w | shift == 64 = error "packWords: too many bytes" + go (acc, shift) w = (unsafeShiftL (fromIntegral w) shift .|. acc, shift+8) + +-- TODO chunks into 8-bytes for 64-bit performance +splitBytes :: [Word] -> ([Word], [Word]) +splitBytes ws = case quotRem (length ws) 8 of + (0, _) -> (ws, []) + (_, r) -> (as, chunk8s bs) where + (as, bs) = splitAt r ws + chunk8s [] = [] + chunk8s ws = let (as, bs) = splitAt 8 ws in + packBytes as : chunk8s bs + +derefChar8# :: Addr# -> Char# +derefChar8# addr = indexCharOffAddr# addr 0# +{-# inline derefChar8# #-} + +-------------------------------------------------------------------------------- + +{- $boxed-integer-coercion + +These functions should be no-ops. They correspond to the similarly-named GHC 9.4 +primops which work on unboxed integers. +-} + +-- | Coerce a 'Word16' to 'Int16'. +word16ToInt16 :: Word16 -> Int16 +word16ToInt16 = fromIntegral +{-# inline word16ToInt16 #-} + +-- | Coerce a 'Word32' to 'Int32'. +word32ToInt32 :: Word32 -> Int32 +word32ToInt32 = fromIntegral +{-# inline word32ToInt32 #-} + +-- | Coerce a 'Word64' to 'Int64'. +word64ToInt64 :: Word64 -> Int64 +word64ToInt64 = fromIntegral +{-# inline word64ToInt64 #-} diff --git a/src/FlatParse/Common/Numbers.hs b/src/FlatParse/Common/Numbers.hs new file mode 100644 index 0000000..fbf6826 --- /dev/null +++ b/src/FlatParse/Common/Numbers.hs @@ -0,0 +1,70 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Common.Numbers where + +import FlatParse.Common.Assorted ( shortInteger ) + +import GHC.Exts +import GHC.Word +import GHC.ForeignPtr + +import qualified Data.ByteString.Char8 as BC8 +import qualified Data.ByteString.Internal as B + +mul10 :: Int# -> Int# +mul10 n = uncheckedIShiftL# n 3# +# uncheckedIShiftL# n 1# +{-# inline mul10 #-} + +readInt' :: Int# -> Addr# -> Addr# -> (# Int#, Addr# #) +readInt' acc s end = case eqAddr# s end of + 1# -> (# acc, s #) + _ -> + let w# = indexWord8OffAddr# s 0# + in if 0x30 <= W8# w# && W8# w# <= 0x39 + then + let !(I# w'#) = fromIntegral (W8# w#) + in readInt' (mul10 acc +# (w'# -# 0x30#)) (plusAddr# s 1#) end + else (# acc, s #) + +-- | Read an `Int` from the input, as a non-empty digit sequence. The `Int` may +-- overflow in the result. +readInt :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) +readInt eob s = case readInt' 0# s eob of + (# n, s' #) | 1# <- eqAddr# s s' -> (# (##) | #) + | otherwise -> (# | (# n, s' #) #) +{-# inline readInt #-} + +-- | Read an `Integer` from the input, as a non-empty digit sequence. +readInteger :: ForeignPtrContents -> Addr# -> Addr# -> (# (##) | (# Integer, Addr# #) #) +readInteger fp eob s = case readInt' 0# s eob of + (# n, s' #) + | 1# <- eqAddr# s s' -> (# (##) | #) + | 1# <- minusAddr# s' s <=# 18# -> (# | (# shortInteger n, s' #) #) + | otherwise -> case BC8.readInteger (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) of + Nothing -> (# (##) | #) + Just (i, _) -> (# | (# i, s' #) #) +{-# inline readInteger #-} + +readIntHex' :: Int# -> Addr# -> Addr# -> (# Int#, Addr# #) +readIntHex' acc s end = case eqAddr# s end of + 1# -> (# acc, s #) + _ -> + let w# = indexWord8OffAddr# s 0# + in if 0x30 <= W8# w# && W8# w# <= 0x39 + then let !(I# w'#) = fromIntegral (W8# w#) + in readIntHex' (uncheckedIShiftL# acc 4# +# (w'# -# 0x30#)) (plusAddr# s 1#) end + else if 0x41 <= W8# w# && W8# w# <= 0x46 + then let !(I# w'#) = fromIntegral (W8# w#) + in readIntHex' (uncheckedIShiftL# acc 4# +# (w'# -# 0x37#)) (plusAddr# s 1#) end + else if 0x61 <= W8# w# && W8# w# <= 0x66 + then let !(I# w'#) = fromIntegral (W8# w#) + in readIntHex' (uncheckedIShiftL# acc 4# +# (w'# -# 0x57#)) (plusAddr# s 1#) end + else (# acc, s #) + +-- | Read an `Int` from the input, as a non-empty case-insensitive ASCII +-- hexadecimal digit sequence. The `Int` may overflow in the result. +{-# INLINE readIntHex #-} +readIntHex :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) +readIntHex eob s = case readIntHex' 0# s eob of + (# n, s' #) | 1# <- eqAddr# s s' -> (# (##) | #) + | otherwise -> (# | (# n, s' #) #) diff --git a/src/FlatParse/Common/Position.hs b/src/FlatParse/Common/Position.hs new file mode 100644 index 0000000..14f87a8 --- /dev/null +++ b/src/FlatParse/Common/Position.hs @@ -0,0 +1,42 @@ +module FlatParse.Common.Position where + +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B +import GHC.Int +import GHC.ForeignPtr ( ForeignPtr(..) ) +import GHC.Exts + +-- | Byte offset counted backwards from the end of the buffer. +-- +-- TODO can't use unlifted 'Int#' because of type classes. bugger! some day +newtype Pos = Pos Int deriving (Eq, Show) + +-- | A pair of positions. +data Span = Span !Pos !Pos deriving (Eq, Show) + +instance Ord Pos where + Pos p <= Pos p' = p' <= p + Pos p < Pos p' = p' < p + Pos p > Pos p' = p' > p + Pos p >= Pos p' = p' >= p + {-# inline (<=) #-} + {-# inline (<) #-} + {-# inline (>) #-} + {-# inline (>=) #-} + +addrToPos# :: Addr# -> Addr# -> Pos +addrToPos# eob s = Pos (I# (minusAddr# eob s)) +{-# inline addrToPos# #-} + +posToAddr# :: Addr# -> Pos -> Addr# +posToAddr# eob (Pos (I# n)) = unsafeCoerce# (minusAddr# eob (unsafeCoerce# n)) +{-# inline posToAddr# #-} + +-- | Slice into a `B.ByteString` using a `Span`. The result is invalid if the `Span` +-- is not a valid slice of the first argument. +unsafeSlice :: B.ByteString -> Span -> B.ByteString +unsafeSlice (B.PS (ForeignPtr addr fp) (I# start) (I# len)) + (Span (Pos (I# o1)) (Pos (I# o2))) = + let end = addr `plusAddr#` start `plusAddr#` len + in B.PS (ForeignPtr (plusAddr# end (negateInt# o1)) fp) (I# 0#) (I# (o1 -# o2)) +{-# inline unsafeSlice #-} diff --git a/src/FlatParse/Common/Trie.hs b/src/FlatParse/Common/Trie.hs new file mode 100644 index 0000000..49bdbed --- /dev/null +++ b/src/FlatParse/Common/Trie.hs @@ -0,0 +1,85 @@ +module FlatParse.Common.Trie where + +import FlatParse.Common.Assorted ( charToBytes ) + +import qualified Data.Map.Strict as M +import Data.Map.Strict ( Map ) +import Data.Foldable ( foldl' ) + +data Trie a = Branch !a !(Map Word (Trie a)) + deriving Show + +type Rule = Maybe Int + +nilTrie :: Trie Rule +nilTrie = Branch Nothing mempty + +updRule :: Int -> Maybe Int -> Maybe Int +updRule rule = Just . maybe rule (min rule) + +insert :: Int -> [Word] -> Trie Rule -> Trie Rule +insert rule = go where + go [] (Branch rule' ts) = + Branch (updRule rule rule') ts + go (c:cs) (Branch rule' ts) = + Branch rule' (M.alter (Just . maybe (go cs nilTrie) (go cs)) c ts) + +listToTrie :: [(Int, String)] -> Trie Rule +listToTrie = foldl' (\t (!r, !s) -> insert r (charToBytes =<< s) t) nilTrie + +-- | Decorate a trie with the minimum lengths of non-empty paths. This +-- is used later to place `ensureBytes#`. +mindepths :: Trie Rule -> Trie (Rule, Int) +mindepths (Branch rule ts) = + if M.null ts then + Branch (rule, 0) mempty + else + let !ts' = M.map mindepths ts in + Branch ( + rule, + minimum (M.map (\(Branch (rule,d) _) -> maybe (d + 1) (\_ -> 1) rule) ts')) + ts' + +data Trie' a + = Branch' !a !(Map Word (Trie' a)) + | Path !a ![Word] !(Trie' a) + deriving Show + +-- | Compress linear paths. +pathify :: Trie (Rule, Int) -> Trie' (Rule, Int) +pathify (Branch a ts) = case M.toList ts of + [] -> Branch' a mempty + [(w, t)] -> case pathify t of + Path (Nothing, _) ws t -> Path a (w:ws) t + t -> Path a [w] t + _ -> Branch' a (M.map pathify ts) + +-- | Compute where to fall back after we exhausted a branch. If the branch is +-- empty, that means we've succeded at reading and we jump to the rhs rule. +fallbacks :: Trie' (Rule, Int) -> Trie' (Rule, Int, Int) +fallbacks = go Nothing 0 where + go :: Rule -> Int -> Trie' (Rule, Int) -> Trie' (Rule, Int, Int) + go !rule !n (Branch' (rule', d) ts) + | M.null ts = Branch' (rule', 0, d) mempty + | Nothing <- rule' = Branch' (rule, n, d) (go rule (n + 1) <$> ts) + | otherwise = Branch' (rule', 0, d) (go rule' 1 <$> ts) + go rule n (Path (rule', d) ws t) + | Nothing <- rule' = Path (rule, n, d) ws (go rule (n + length ws) t) + | otherwise = Path (rule', 0, d) ws (go rule' (length ws) t) + +-- | Decorate with `ensureBytes#` invocations, represented as +-- `Maybe Int`. +ensureBytes :: Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int) +ensureBytes = go 0 where + go :: Int -> Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int) + go !res = \case + Branch' (r, n, d) ts + | M.null ts -> Branch' (r, n, Nothing) mempty + | res < 1 -> Branch' (r, n, Just d ) (go (d - 1) <$> ts) + | otherwise -> Branch' (r, n, Nothing) (go (res - 1) <$> ts) + Path (r, n, d) ws t -> case length ws of + l | res < l -> Path (r, n, Just $! d - res) ws (go (d - l) t) + | otherwise -> Path (r, n, Nothing ) ws (go (res - l) t) + +compileTrie :: [(Int, String)] -> Trie' (Rule, Int, Maybe Int) +compileTrie = ensureBytes . fallbacks . pathify . mindepths . listToTrie diff --git a/src/FlatParse/Examples/BasicLambda/Lexer.hs b/src/FlatParse/Examples/BasicLambda/Lexer.hs index a80355a..7a3935a 100644 --- a/src/FlatParse/Examples/BasicLambda/Lexer.hs +++ b/src/FlatParse/Examples/BasicLambda/Lexer.hs @@ -8,7 +8,7 @@ demonstrates a simple but decently informative implementation of error message p module FlatParse.Examples.BasicLambda.Lexer where -import FlatParse.Basic hiding (Parser, runParser, string, char, cut) +import FlatParse.Basic hiding ( Parser, runParser ) import qualified FlatParse.Basic as FP import qualified Data.ByteString as B @@ -117,7 +117,7 @@ testParser p str = case packUTF8 str of -- | Parse a line comment. lineComment :: Parser () lineComment = - withOption anyWord8 + withOption getWord8 (\case 10 -> ws _ -> lineComment) (pure ()) @@ -129,7 +129,7 @@ multilineComment = go (1 :: Int) where go n = $(switch [| case _ of "-}" -> go (n - 1) "{-" -> go (n + 1) - _ -> branch anyWord8 (go n) (pure ()) |]) + _ -> branch getWord8 (go n) (pure ()) |]) -- | Consume whitespace. ws :: Parser () @@ -173,7 +173,7 @@ isKeyword span = inSpan span do -- | Parse a non-keyword string. symbol :: String -> Q Exp -symbol str = [| token $(FP.string str) |] +symbol str = [| token $(FP.getStringOf str) |] -- | Parser a non-keyword string, throw precise error on failure. symbol' :: String -> Q Exp @@ -181,7 +181,7 @@ symbol' str = [| $(symbol str) `cut'` Lit str |] -- | Parse a keyword string. keyword :: String -> Q Exp -keyword str = [| token ($(FP.string str) `notFollowedBy` identChar) |] +keyword str = [| token ($(FP.getStringOf str) `notFollowedBy` identChar) |] -- | Parse a keyword string, throw precise error on failure. keyword' :: String -> Q Exp diff --git a/src/FlatParse/Examples/BasicLambda/Parser.hs b/src/FlatParse/Examples/BasicLambda/Parser.hs index 12d84bd..f1db3af 100644 --- a/src/FlatParse/Examples/BasicLambda/Parser.hs +++ b/src/FlatParse/Examples/BasicLambda/Parser.hs @@ -11,7 +11,7 @@ module FlatParse.Examples.BasicLambda.Parser where import Data.Char (ord) import qualified Data.ByteString as B -import FlatParse.Basic hiding (Parser, runParser, string, char, cut) +import FlatParse.Basic hiding ( Parser, cut ) import FlatParse.Examples.BasicLambda.Lexer -------------------------------------------------------------------------------- diff --git a/src/FlatParse/Internal.hs b/src/FlatParse/Internal.hs deleted file mode 100644 index b2a8117..0000000 --- a/src/FlatParse/Internal.hs +++ /dev/null @@ -1,277 +0,0 @@ -{-# language UnboxedTuples #-} - -module FlatParse.Internal where - -import FlatParse.Internal.UnboxedNumerics - -import Data.Bits -import Data.Char -import Data.Foldable (foldl') -import Data.Map (Map) -import GHC.Exts -import GHC.ForeignPtr - -import qualified Data.ByteString as B -import qualified Data.ByteString.Char8 as BC8 -import qualified Data.ByteString.Internal as B -import qualified Data.Map.Strict as M - -#if MIN_VERSION_base(4,15,0) -import GHC.Num.Integer (Integer(..)) -#else -import GHC.Integer.GMP.Internals (Integer(..)) -#endif - --- Compatibility --------------------------------------------------------------------------------- - -shortInteger :: Int# -> Integer -#if MIN_VERSION_base(4,15,0) -shortInteger = IS -#else -shortInteger = S# -#endif -{-# inline shortInteger #-} - - --- Char predicates --------------------------------------------------------------------------------- - --- | @isDigit c = \'0\' <= c && c <= \'9\'@ -isDigit :: Char -> Bool -isDigit c = '0' <= c && c <= '9' -{-# inline isDigit #-} - --- | @isLatinLetter c = (\'A\' <= c && c <= \'Z\') || (\'a\' <= c && c <= \'z\')@ -isLatinLetter :: Char -> Bool -isLatinLetter c = ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') -{-# inline isLatinLetter #-} - --- | @isGreekLetter c = (\'Α\' <= c && c <= \'Ω\') || (\'α\' <= c && c <= \'ω\')@ -isGreekLetter :: Char -> Bool -isGreekLetter c = ('Α' <= c && c <= 'Ω') || ('α' <= c && c <= 'ω') -{-# inline isGreekLetter #-} - --- Int(eger) reading --------------------------------------------------------------------------------- - -mul10 :: Int# -> Int# -mul10 n = uncheckedIShiftL# n 3# +# uncheckedIShiftL# n 1# -{-# inline mul10 #-} - -readInt' :: Int# -> Addr# -> Addr# -> (# Int#, Addr# #) -readInt' acc s end = case eqAddr# s end of - 1# -> (# acc, s #) - _ -> case indexWord8OffAddr''# s 0# of - w | 1# <- leWord8# (wordToWord8''# 0x30##) w, 1# <- leWord8# w (wordToWord8''# 0x39##) -> - readInt' (mul10 acc +# (word2Int# (word8ToWord''# w) -# 0x30#)) (plusAddr# s 1#) end - _ -> (# acc, s #) - - --- | Read an `Int` from the input, as a non-empty digit sequence. The `Int` may --- overflow in the result. -readInt :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) -readInt eob s = case readInt' 0# s eob of - (# n, s' #) | 1# <- eqAddr# s s' -> (# (##) | #) - | otherwise -> (# | (# n, s' #) #) -{-# inline readInt #-} - -readIntHex' :: Int# -> Addr# -> Addr# -> (# Int#, Addr# #) -readIntHex' acc s end = case eqAddr# s end of - 1# -> (# acc, s #) - _ -> case indexWord8OffAddr''# s 0# of - w | 1# <- leWord8# (wordToWord8''# 0x30##) w - , 1# <- leWord8# w (wordToWord8''# 0x39##) - -> readIntHex' (uncheckedIShiftL# acc 4# +# (word2Int# (word8ToWord''# w) -# 0x30#)) (plusAddr# s 1#) end - - | 1# <- leWord8# (wordToWord8''# 0x41##) w - , 1# <- leWord8# w (wordToWord8''# 0x46##) - -> readIntHex' (uncheckedIShiftL# acc 4# +# (word2Int# (word8ToWord''# w) -# 0x37#)) (plusAddr# s 1#) end - - | 1# <- leWord8# (wordToWord8''# 0x61##) w - , 1# <- leWord8# w (wordToWord8''# 0x66##) - -> readIntHex' (uncheckedIShiftL# acc 4# +# (word2Int# (word8ToWord''# w) -# 0x57#)) (plusAddr# s 1#) end - _ -> (# acc, s #) - --- | Read an `Int` from the input, as a non-empty case-insensitive ASCII --- hexadecimal digit sequence. The `Int` may overflow in the result. -{-# INLINE readIntHex #-} -readIntHex :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) -readIntHex eob s = case readIntHex' 0# s eob of - (# n, s' #) | 1# <- eqAddr# s s' -> (# (##) | #) - | otherwise -> (# | (# n, s' #) #) - --- | Read an `Integer` from the input, as a non-empty digit sequence. -readInteger :: ForeignPtrContents -> Addr# -> Addr# -> (# (##) | (# Integer, Addr# #) #) -readInteger fp eob s = case readInt' 0# s eob of - (# n, s' #) - | 1# <- eqAddr# s s' -> (# (##) | #) - | 1# <- minusAddr# s' s <=# 18# -> (# | (# shortInteger n, s' #) #) - | otherwise -> case BC8.readInteger (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) of - Nothing -> (# (##) | #) - Just (i, _) -> (# | (# i, s' #) #) -{-# inline readInteger #-} - - --- Positions and spans --------------------------------------------------------------------------------- - --- | Byte offset counted backwards from the end of the buffer. -newtype Pos = Pos Int deriving (Eq, Show) - --- | A pair of positions. -data Span = Span !Pos !Pos deriving (Eq, Show) - -instance Ord Pos where - Pos p <= Pos p' = p' <= p - Pos p < Pos p' = p' < p - Pos p > Pos p' = p' > p - Pos p >= Pos p' = p' >= p - {-# inline (<=) #-} - {-# inline (<) #-} - {-# inline (>) #-} - {-# inline (>=) #-} - -addrToPos# :: Addr# -> Addr# -> Pos -addrToPos# eob s = Pos (I# (minusAddr# eob s)) -{-# inline addrToPos# #-} - -posToAddr# :: Addr# -> Pos -> Addr# -posToAddr# eob (Pos (I# n)) = unsafeCoerce# (minusAddr# eob (unsafeCoerce# n)) -{-# inline posToAddr# #-} - --- | Slice into a `B.ByteString` using a `Span`. The result is invalid if the `Span` --- is not a valid slice of the first argument. -unsafeSlice :: B.ByteString -> Span -> B.ByteString -unsafeSlice (B.PS (ForeignPtr addr fp) (I# start) (I# len)) - (Span (Pos (I# o1)) (Pos (I# o2))) = - let end = addr `plusAddr#` start `plusAddr#` len - in B.PS (ForeignPtr (plusAddr# end (negateInt# o1)) fp) (I# 0#) (I# (o1 -# o2)) -{-# inline unsafeSlice #-} - --- UTF conversions --------------------------------------------------------------------------------- - --- | Convert a `String` to an UTF-8-coded `B.ByteString`. -packUTF8 :: String -> B.ByteString -packUTF8 str = B.pack $ do - c <- str - w <- charToBytes c - pure (fromIntegral w) - -charToBytes :: Char -> [Word] -charToBytes c' - | c <= 0x7f = [fromIntegral c] - | c <= 0x7ff = [0xc0 .|. y, 0x80 .|. z] - | c <= 0xffff = [0xe0 .|. x, 0x80 .|. y, 0x80 .|. z] - | c <= 0x10ffff = [0xf0 .|. w, 0x80 .|. x, 0x80 .|. y, 0x80 .|. z] - | otherwise = error "Not a valid Unicode code point" - where - c = ord c' - z = fromIntegral (c .&. 0x3f) - y = fromIntegral (unsafeShiftR c 6 .&. 0x3f) - x = fromIntegral (unsafeShiftR c 12 .&. 0x3f) - w = fromIntegral (unsafeShiftR c 18 .&. 0x7) - -strToBytes :: String -> [Word] -strToBytes = concatMap charToBytes -{-# inline strToBytes #-} - -packBytes :: [Word] -> Word -packBytes = fst . foldl' go (0, 0) where - go (acc, shift) w | shift == 64 = error "packWords: too many bytes" - go (acc, shift) w = (unsafeShiftL (fromIntegral w) shift .|. acc, shift+8) - -splitBytes :: [Word] -> ([Word], [Word]) -splitBytes ws = case quotRem (length ws) 8 of - (0, _) -> (ws, []) - (_, r) -> (as, chunk8s bs) where - (as, bs) = splitAt r ws - chunk8s [] = [] - chunk8s ws = let (as, bs) = splitAt 8 ws in - packBytes as : chunk8s bs - -derefChar8# :: Addr# -> Char# -derefChar8# addr = indexCharOffAddr# addr 0# -{-# inline derefChar8# #-} - --- Switch trie compilation --------------------------------------------------------------------------------- - -data Trie a = Branch !a !(Map Word (Trie a)) - deriving Show - -type Rule = Maybe Int - -nilTrie :: Trie Rule -nilTrie = Branch Nothing mempty - -updRule :: Int -> Maybe Int -> Maybe Int -updRule rule = Just . maybe rule (min rule) - -insert :: Int -> [Word] -> Trie Rule -> Trie Rule -insert rule = go where - go [] (Branch rule' ts) = - Branch (updRule rule rule') ts - go (c:cs) (Branch rule' ts) = - Branch rule' (M.alter (Just . maybe (go cs nilTrie) (go cs)) c ts) - -listToTrie :: [(Int, String)] -> Trie Rule -listToTrie = foldl' (\t (!r, !s) -> insert r (charToBytes =<< s) t) nilTrie - --- | Decorate a trie with the minimum lengths of non-empty paths. This --- is used later to place `ensureBytes#`. -mindepths :: Trie Rule -> Trie (Rule, Int) -mindepths (Branch rule ts) = - if M.null ts then - Branch (rule, 0) mempty - else - let !ts' = M.map mindepths ts in - Branch ( - rule, - minimum (M.map (\(Branch (rule,d) _) -> maybe (d + 1) (\_ -> 1) rule) ts')) - ts' - -data Trie' a - = Branch' !a !(Map Word (Trie' a)) - | Path !a ![Word] !(Trie' a) - deriving Show - --- | Compress linear paths. -pathify :: Trie (Rule, Int) -> Trie' (Rule, Int) -pathify (Branch a ts) = case M.toList ts of - [] -> Branch' a mempty - [(w, t)] -> case pathify t of - Path (Nothing, _) ws t -> Path a (w:ws) t - t -> Path a [w] t - _ -> Branch' a (M.map pathify ts) - --- | Compute where to fall back after we exhausted a branch. If the branch is --- empty, that means we've succeded at reading and we jump to the rhs rule. -fallbacks :: Trie' (Rule, Int) -> Trie' (Rule, Int, Int) -fallbacks = go Nothing 0 where - go :: Rule -> Int -> Trie' (Rule, Int) -> Trie' (Rule, Int, Int) - go !rule !n (Branch' (rule', d) ts) - | M.null ts = Branch' (rule', 0, d) mempty - | Nothing <- rule' = Branch' (rule, n, d) (go rule (n + 1) <$> ts) - | otherwise = Branch' (rule', 0, d) (go rule' 1 <$> ts) - go rule n (Path (rule', d) ws t) - | Nothing <- rule' = Path (rule, n, d) ws (go rule (n + length ws) t) - | otherwise = Path (rule', 0, d) ws (go rule' (length ws) t) - --- | Decorate with `ensureBytes#` invocations, represented as --- `Maybe Int`. -ensureBytes :: Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int) -ensureBytes = go 0 where - go :: Int -> Trie' (Rule, Int, Int) -> Trie' (Rule, Int, Maybe Int) - go !res = \case - Branch' (r, n, d) ts - | M.null ts -> Branch' (r, n, Nothing) mempty - | res < 1 -> Branch' (r, n, Just d ) (go (d - 1) <$> ts) - | otherwise -> Branch' (r, n, Nothing) (go (res - 1) <$> ts) - Path (r, n, d) ws t -> case length ws of - l | res < l -> Path (r, n, Just $! d - res) ws (go (d - l) t) - | otherwise -> Path (r, n, Nothing ) ws (go (res - l) t) - -compileTrie :: [(Int, String)] -> Trie' (Rule, Int, Maybe Int) -compileTrie = ensureBytes . fallbacks . pathify . mindepths . listToTrie diff --git a/src/FlatParse/Internal/UnboxedNumerics.hs b/src/FlatParse/Internal/UnboxedNumerics.hs deleted file mode 100644 index a152943..0000000 --- a/src/FlatParse/Internal/UnboxedNumerics.hs +++ /dev/null @@ -1,115 +0,0 @@ -{- | Compatibility layer for numeric primops. - -GHC 9.2 standardized unboxed numeric primops. Prior, it was quite asymmetric. -Many primop functions used the native unboxed numerics 'Word#' and 'Int#' even -if a sized unboxed numeric was in the name, e.g. 'indexWord8OffAddr#' returning -'Word#' pre-9.2. All boxed machine integers only stored 'Word#' internally! - -We target GHC 9.2's better handling. In order to maintain compatibility with -older GHCs, we define missing primops and wrap ones that changed type. Usually, -we can write a few wrappers so that 9.2 uses sized unboxed numerics everywhere, -and pre-9.2 uses native unboxed numerics everywhere. Sometimes we really want to -work with sized unboxed numerics on both, in which case we have to do more -involved primop wrapping. - -The general pattern is as follows: - - * A ticked primop means it's sized on >=9.2, native on <9.2 - * A double-ticked primop means it's sized on all - * An unticked primop should mean the same as a ticked primop (no guarantees) - -Also see: https://gitlab.haskell.org/ghc/ghc/-/wikis/Unboxed-Numerics --} - -module FlatParse.Internal.UnboxedNumerics where - -import GHC.Exts - --- "Switch" wrappers: sized on >=9.2, native on <9.2 -byteSwap16'# :: Word16'# -> Word16'# -byteSwap32'# :: Word32'# -> Word32'# -eqWord8'# :: Word8'# -> Word8'# -> Int# -eqWord16'# :: Word16'# -> Word16'# -> Int# -eqWord32'# :: Word32'# -> Word32'# -> Int# -{-# inline byteSwap16'# #-} -{-# inline byteSwap32'# #-} -{-# inline eqWord8'# #-} -{-# inline eqWord16'# #-} -{-# inline eqWord32'# #-} - --- "Sized" wrappers: sized on all -indexWord8OffAddr''# :: Addr# -> Int# -> Word8# -wordToWord8''# :: Word# -> Word8# -word8ToWord''# :: Word8# -> Word# -{-# inline indexWord8OffAddr''# #-} -{-# inline wordToWord8''# #-} -{-# inline word8ToWord''# #-} - -#if MIN_VERSION_base(4,16,0) --- GHC >=9.2 - -type Word8'# = Word8# -type Word16'# = Word16# -type Word32'# = Word32# -type Int8'# = Int8# -type Int16'# = Int16# -type Int32'# = Int32# - --- "Switch" wrappers: sized on >=9.2, native on <9.2 -byteSwap16'# w# = wordToWord16# (byteSwap16# (word16ToWord# w#)) -byteSwap32'# w# = wordToWord32# (byteSwap32# (word32ToWord# w#)) -eqWord8'# = eqWord8# -eqWord16'# = eqWord16# -eqWord32'# = eqWord32# - --- "Sized" wrappers: sized on all -indexWord8OffAddr''# = indexWord8OffAddr# -wordToWord8''# = wordToWord8# -word8ToWord''# = word8ToWord# - -#else --- GHC <9.2 - -type Word8'# = Word# -type Word16'# = Word# -type Word32'# = Word# -type Int8'# = Int# -type Int16'# = Int# -type Int32'# = Int# - --- "Switch" wrappers: sized on >=9.2, native on <9.2 -byteSwap16'# = byteSwap16# -byteSwap32'# = byteSwap32# -eqWord8'# = eqWord# -eqWord16'# = eqWord# -eqWord32'# = eqWord# - --- No need to tick wrap these, they didn't exist <9.2 -word16ToInt16# :: Word16'# -> Int# -word16ToInt16# w = narrow16Int# (word2Int# w) -word32ToInt32# :: Word32'# -> Int# -word32ToInt32# w = narrow32Int# (word2Int# w) -{-# inline word16ToInt16# #-} -{-# inline word32ToInt32# #-} - --- "Sized" wrappers: sized on all -indexWord8OffAddr''# a# i# = narrowWord8# (indexWord8OffAddr# a# i#) -wordToWord8''# = narrowWord8# -word8ToWord''# = extendWord8# - -#endif - -#if !MIN_VERSION_base(4,13,0) --- GHC <8.8 - -type Word8# = Word# -narrowWord8# :: Word# -> Word8# -narrowWord8# = narrow8Word# -extendWord8# :: Word# -> Word8# -extendWord8# w# = w# -leWord8# :: Word8# -> Word8# -> Int# -leWord8# w1# w2# = leWord# w1# w2# -eqWord8# :: Word8# -> Word8# -> Int# -eqWord8# w1# w2# = eqWord# w1# w2# - -#endif diff --git a/src/FlatParse/Stateful.hs b/src/FlatParse/Stateful.hs deleted file mode 100644 index 82a7f40..0000000 --- a/src/FlatParse/Stateful.hs +++ /dev/null @@ -1,1453 +0,0 @@ -{-# language UnboxedTuples #-} - -{-| -This module implements a `Parser` supporting a custom reader environment, custom -error types and an `Int` state. --} - -module FlatParse.Stateful ( - -- * Parser types and constructors - type Parser(..) - , type Res# - , pattern OK# - , pattern Fail# - , pattern Err# - , Result(..) - - -- * Running parsers - , runParser - , runParserS - - -- * Actions on the state and the environment - , get - , put - , modify - , ask - , local - - -- * Errors and failures - , failed - , Base.empty - , err - , lookahead - , fails - , try - , optional - , optional_ - , withOption - , cut - , cutting - - -- * Basic lexing and parsing - , eof - , takeBs - , takeRestBs - , char - , byte - , bytes - , byteString - , string - , switch - , switchWithPost - , rawSwitchWithPost - , satisfy - , satisfy_ - , satisfyASCII - , satisfyASCII_ - , fusedSatisfy - , fusedSatisfy_ - , anyWord8 - , anyWord8_ - , anyWord16 - , anyWord16_ - , anyWord32 - , anyWord32_ - , anyWord64 - , anyWord64_ - , anyWord - , anyWord_ - , anyInt8 - , anyInt16 - , anyInt32 - , anyInt64 - , anyInt - , anyChar - , anyChar_ - , anyCharASCII - , anyCharASCII_ - , isDigit - , isGreekLetter - , isLatinLetter - , FlatParse.Stateful.readInt - , FlatParse.Stateful.readIntHex - , FlatParse.Stateful.readInteger - , anyCString - - -- ** Explicit-endianness machine integers - , anyWord16le - , anyWord16be - , anyWord32le - , anyWord32be - , anyWord64le - , anyWord64be - , anyInt16le - , anyInt16be - , anyInt32le - , anyInt32be - , anyInt64le - , anyInt64be - - -- * Combinators - , (<|>) - , branch - , chainl - , chainr - , many - , many_ - , some - , some_ - , notFollowedBy - , isolate - - -- * Positions and spans - , Pos(..) - , Span(..) - , getPos - , setPos - , endPos - , spanOf - , withSpan - , byteStringOf - , withByteString - , inSpan - - -- ** Position and span conversions - , Basic.validPos - , Basic.posLineCols - , unsafeSpanToByteString - , Basic.unsafeSlice - , Basic.mkPos - , Basic.lines - - -- * Getting the rest of the input as a 'String' - , takeLine - , traceLine - , takeRest - , traceRest - - -- * `String` conversions - , packUTF8 - , Basic.unpackUTF8 - - -- * Internal functions - , ensureBytes# - - -- ** Unboxed arguments - , takeBs# - , atSkip# - - -- *** Machine integer continuation parsers - , withAnyWord8# - , withAnyWord16# - , withAnyWord32# - , withAnyWord64# - , withAnyInt8# - , withAnyInt16# - , withAnyInt32# - , withAnyInt64# - - -- ** Location & address primitives - , setBack# - , withAddr# - , takeBsOffAddr# - , lookaheadFromAddr# - , atAddr# - - -- ** Unsafe - , anyCStringUnsafe - , scan8# - , scan16# - , scan32# - , scan64# - , scanAny8# - , scanBytes# - - ) where - -import qualified Control.Applicative as Base -import Control.Monad -import Data.Foldable -import Data.Map (Map) -import GHC.Exts -import GHC.Word -import GHC.Int -import Language.Haskell.TH -import System.IO.Unsafe -import GHC.ForeignPtr - -import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B -import qualified Data.ByteString.Unsafe as B -import qualified Data.Map.Strict as M - -import FlatParse.Internal -import FlatParse.Internal.UnboxedNumerics - -import qualified FlatParse.Basic as Basic - --------------------------------------------------------------------------------- - --- | Primitive result of a parser. Possible results are given by `OK#`, `Err#` and `Fail#` --- pattern synonyms. -type Res# e a = - (# - (# a, Addr#, Int# #) - | (# #) - | (# e #) - #) - --- | Contains return value, pointer to the rest of the input buffer and the nex `Int` --- state. -pattern OK# :: a -> Addr# -> Int# -> Res# e a -pattern OK# a s n = (# (# a, s, n #) | | #) - --- | Constructor for errors which are by default non-recoverable. -pattern Err# :: e -> Res# e a -pattern Err# e = (# | | (# e #) #) - --- | Constructor for recoverable failure. -pattern Fail# :: Res# e a -pattern Fail# = (# | (# #) | #) -{-# complete OK#, Err#, Fail# #-} - --- | @Parser r e a@ has a reader environment @r@, error type @e@ and a return type @a@. -newtype Parser r e a = Parser {runParser# :: ForeignPtrContents -> r -> Addr# -> Addr# -> Int# -> Res# e a} - -instance Functor (Parser r e) where - fmap f (Parser g) = Parser \fp !r eob s n -> case g fp r eob s n of - OK# a s n -> let !b = f a in OK# b s n - x -> unsafeCoerce# x - {-# inline fmap #-} - - (<$) a' (Parser g) = Parser \fp !r eob s n -> case g fp r eob s n of - OK# a s n -> OK# a' s n - x -> unsafeCoerce# x - {-# inline (<$) #-} - -instance Applicative (Parser r e) where - pure a = Parser \fp !r eob s n -> OK# a s n - {-# inline pure #-} - Parser ff <*> Parser fa = Parser \fp !r eob s n -> case ff fp r eob s n of - OK# f s n -> case fa fp r eob s n of - OK# a s n -> let !b = f a in OK# b s n - x -> unsafeCoerce# x - x -> unsafeCoerce# x - {-# inline (<*>) #-} - Parser fa <* Parser fb = Parser \fp !r eob s n -> case fa fp r eob s n of - OK# a s n -> case fb fp r eob s n of - OK# b s n -> OK# a s n - x -> unsafeCoerce# x - x -> unsafeCoerce# x - {-# inline (<*) #-} - Parser fa *> Parser fb = Parser \fp !r eob s n -> case fa fp r eob s n of - OK# a s n -> fb fp r eob s n - x -> unsafeCoerce# x - {-# inline (*>) #-} - -instance Monad (Parser r e) where - return = pure - {-# inline return #-} - Parser fa >>= f = Parser \fp !r eob s n -> case fa fp r eob s n of - OK# a s n -> runParser# (f a) fp r eob s n - x -> unsafeCoerce# x - {-# inline (>>=) #-} - (>>) = (*>) - {-# inline (>>) #-} - --- | Higher-level boxed data type for parsing results. -data Result e a = - OK a Int !(B.ByteString) -- ^ Contains return value, last `Int` state, unconsumed input. - | Fail -- ^ Recoverable-by-default failure. - | Err !e -- ^ Unrecoverble-by-default error. - deriving Show - -instance Functor (Result e) where - fmap f (OK a s n) = let !b = f a in OK b s n - fmap f r = unsafeCoerce# r - {-# inline fmap #-} - (<$) a (OK _ s n) = OK a s n - (<$) _ r = unsafeCoerce# r - {-# inline (<$) #-} - --------------------------------------------------------------------------------- - --- | Run a parser. The `Int` argument is the initial state. -runParser :: Parser r e a -> r -> Int -> B.ByteString -> Result e a -runParser (Parser f) !r (I# n) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = unsafeDupablePerformIO do - B.unsafeUseAsCString b \(Ptr buf) -> do - let end = plusAddr# buf len - case f fp r end buf n of - Err# e -> - pure (Err e) - OK# a s n -> do - let offset = minusAddr# s buf - pure (OK a (I# n) (B.drop (I# offset) b)) - Fail# -> - pure Fail -{-# inlinable runParser #-} - --- | Run a parser on a `String` input. Reminder: @OverloadedStrings@ for `B.ByteString` does not --- yield a valid UTF-8 encoding! For non-ASCII `B.ByteString` literal input, use `runParserS` or --- `packUTF8` for testing. -runParserS :: Parser r e a -> r -> Int -> String -> Result e a -runParserS pa r !n s = runParser pa r n (packUTF8 s) - --------------------------------------------------------------------------------- - --- | Query the `Int` state. -get :: Parser r e Int -get = Parser \fp !r eob s n -> OK# (I# n) s n -{-# inline get #-} - --- | Write the `Int` state. -put :: Int -> Parser r e () -put (I# n) = Parser \fp !r eob s _ -> OK# () s n -{-# inline put #-} - --- | Modify the `Int` state. -modify :: (Int -> Int) -> Parser r e () -modify f = Parser \fp !r eob s n -> - case f (I# n) of - I# n -> OK# () s n -{-# inline modify #-} - --- | Query the environment. -ask :: Parser r e r -ask = Parser \fp !r eob s n -> OK# r s n -{-# inline ask #-} - --- | Run a parser in a modified environment. -local :: (r -> r) -> Parser r e a -> Parser r e a -local f (Parser g) = Parser \fp !r eob s n -> let !r' = f r in g fp r' eob s n -{-# inline local #-} - --------------------------------------------------------------------------------- - --- | The failing parser. By default, parser choice `(<|>)` arbitrarily backtracks --- on parser failure. -failed :: Parser r e a -failed = Parser \fp !r eob s n -> Fail# -{-# inline failed #-} - --- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack --- on parser error. Use `try` to convert an error to a recoverable failure. -err :: e -> Parser r e a -err e = Parser \fp !r eob s n -> Err# e -{-# inline err #-} - --- | Save the parsing state, then run a parser, then restore the state. -lookahead :: Parser r e a -> Parser r e a -lookahead (Parser f) = Parser \fp !r eob s n -> - case f fp r eob s n of - OK# a _ _ -> OK# a s n - x -> x -{-# inline lookahead #-} - --- | Convert a parsing failure to a success. -fails :: Parser r e a -> Parser r e () -fails (Parser f) = Parser \fp !r eob s n -> - case f fp r eob s n of - OK# _ _ _ -> Fail# - Fail# -> OK# () s n - Err# e -> Err# e -{-# inline fails #-} - --- | Convert a parsing error into failure. -try :: Parser r e a -> Parser r e a -try (Parser f) = Parser \fp !r eob s n -> case f fp r eob s n of - Err# _ -> Fail# - x -> x -{-# inline try #-} - --- | Convert a parsing failure to a `Maybe`. If possible, use `withOption` instead. -optional :: Parser r e a -> Parser r e (Maybe a) -optional p = (Just <$> p) <|> pure Nothing -{-# inline optional #-} - --- | Convert a parsing failure to a `()`. -optional_ :: Parser r e a -> Parser r e () -optional_ p = (() <$ p) <|> pure () -{-# inline optional_ #-} - --- | CPS'd version of `optional`. This is usually more efficient, since it gets rid of the --- extra `Maybe` allocation. -withOption :: Parser r e a -> (a -> Parser r e b) -> Parser r e b -> Parser r e b -withOption (Parser f) just (Parser nothing) = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s n -> runParser# (just a) fp r eob s n - Fail# -> nothing fp r eob s n - Err# e -> Err# e -{-# inline withOption #-} - --- | Convert a parsing failure to an error. -cut :: Parser r e a -> e -> Parser r e a -cut (Parser f) e = Parser \fp !r eob s n -> case f fp r eob s n of - Fail# -> Err# e - x -> x -{-# inline cut #-} - --- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the --- inner and the newly given errors using the @e -> e -> e@ function. This can be useful for --- implementing parsing errors which may propagate hints or accummulate contextual information. -cutting :: Parser r e a -> e -> (e -> e -> e) -> Parser r e a -cutting (Parser f) e merge = Parser \fp !r eob s n -> case f fp r eob s n of - Fail# -> Err# e - Err# e' -> let !e'' = merge e' e in Err# e'' - x -> x -{-# inline cutting #-} - --------------------------------------------------------------------------------- - - --- | Succeed if the input is empty. -eof :: Parser r e () -eof = Parser \fp !r eob s n -> case eqAddr# eob s of - 1# -> OK# () s n - _ -> Fail# -{-# inline eof #-} - --- | Read the given number of bytes as a 'ByteString'. --- --- Throws a runtime error if given a negative integer. -takeBs :: Int -> Parser r e B.ByteString -takeBs (I# n#) = Parser \fp !r eob s n -> case n# <=# minusAddr# eob s of - 1# -> -- have to runtime check for negative values, because they cause a hang - case n# >=# 0# of - 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) n - _ -> error "FlatParse.Basic.take: negative integer" - _ -> Fail# -{-# inline takeBs #-} - --- | Consume the rest of the input. May return the empty bytestring. -takeRestBs :: Parser r e B.ByteString -takeRestBs = Parser \fp !r eob s n -> - let n# = minusAddr# eob s - in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) eob n -{-# inline takeRestBs #-} - --- | Parse a UTF-8 character literal. This is a template function, you can use it as --- @$(char \'x\')@, for example, and the splice in this case has type @Parser r e ()@. -char :: Char -> Q Exp -char c = string [c] - --- | Read a byte. -byte :: Word8 -> Parser r e () -byte w = ensureBytes# 1 >> scan8# w -{-# inline byte #-} - --- | Read a sequence of bytes. This is a template function, you can use it as @$(bytes [3, 4, 5])@, --- for example, and the splice has type @Parser r e ()@. For a non-TH variant see 'byteString'. -bytes :: [Word] -> Q Exp -bytes bytes = do - let !len = length bytes - [| ensureBytes# len >> $(scanBytes# bytes) |] - --- | Parse a given `B.ByteString`. If the bytestring is statically known, consider using 'bytes' instead. -byteString :: B.ByteString -> Parser r e () -byteString (B.PS (ForeignPtr bs fcontents) _ (I# len)) = - - let go64 :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go64 bs bsend s n w = - let bs' = plusAddr# bs 8# in - case gtAddr# bs' bsend of - 1# -> go8 bs bsend s n w - _ -> case eqWord# (indexWord64OffAddr# bs 0#) (indexWord64OffAddr# s 0#) of - 1# -> go64 bs' bsend (plusAddr# s 8#) n w - _ -> (# Fail#, w #) - - go8 :: Addr# -> Addr# -> Addr# -> Int# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go8 bs bsend s n w = case ltAddr# bs bsend of - 1# -> case eqWord8'# (indexWord8OffAddr# bs 0#) (indexWord8OffAddr# s 0#) of - 1# -> go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) n w - _ -> (# Fail#, w #) - _ -> (# OK# () s n, w #) - - in Parser \fp !r eob s n -> case len <=# minusAddr# eob s of - 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s n w of - (# res, w #) -> case touch# fcontents w of - w -> res - _ -> Fail# -{-# inline byteString #-} - --- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@, --- for example, and the splice has type @Parser r e ()@. -string :: String -> Q Exp -string str = bytes (strToBytes str) - -{-| -This is a template function which makes it possible to branch on a collection of string literals in -an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing -operations, which has optimized control flow, vectorized reads and grouped checking for needed input -bytes. - -The syntax is slightly magical, it overloads the usual @case@ expression. An example: - -@ - $(switch [| case _ of - "foo" -> pure True - "bar" -> pure False |]) -@ - -The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally -we may have a default case, like in - -@ - $(switch [| case _ of - "foo" -> pure 10 - "bar" -> pure 20 - _ -> pure 30 |]) -@ - -All case right hand sides must be parsers with the same type. That type is also the type -of the whole `switch` expression. - -A `switch` has longest match semantics, and the order of cases does not matter, except for -the default case, which may only appear as the last case. - -If a `switch` does not have a default case, and no case matches the input, then it returns with -failure, \without\ having consumed any input. A fallthrough to the default case also does not -consume any input. --} -switch :: Q Exp -> Q Exp -switch = switchWithPost Nothing - -{-| -Switch expression with an optional first argument for performing a post-processing action after -every successful branch matching. For example, if we have @ws :: Parser r e ()@ for a -whitespace parser, we might want to consume whitespace after matching on any of the switch -cases. For that case, we can define a "lexeme" version of `switch` as follows. - -@ - switch' :: Q Exp -> Q Exp - switch' = switchWithPost (Just [| ws |]) -@ - -Note that this @switch'@ function cannot be used in the same module it's defined in, because of the -stage restriction of Template Haskell. --} -switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp -switchWithPost postAction exp = do - !postAction <- sequence postAction - (!cases, !fallback) <- parseSwitch exp - genTrie $! genSwitchTrie' postAction cases fallback - --- | Version of `switchWithPost` without syntactic sugar. The second argument is the --- list of cases, the third is the default case. -rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp -rawSwitchWithPost postAction cases fallback = do - !postAction <- sequence postAction - !cases <- forM cases \(str, rhs) -> (str,) <$> rhs - !fallback <- sequence fallback - genTrie $! genSwitchTrie' postAction cases fallback - --- | Parse a UTF-8 `Char` for which a predicate holds. -satisfy :: (Char -> Bool) -> Parser r e Char -satisfy f = Parser \fp !r eob s n -> case runParser# anyChar fp r eob s n of - OK# c s n | f c -> OK# c s n - _ -> Fail# -{-# inline satisfy #-} - --- | Skip a UTF-8 `Char` for which a predicate holds. -satisfy_ :: (Char -> Bool) -> Parser r e () -satisfy_ f = Parser \fp !r eob s n -> case runParser# anyChar fp r eob s n of - OK# c s n | f c -> OK# () s n - _ -> Fail# -{-# inline satisfy_ #-} - --- | Parse an ASCII `Char` for which a predicate holds. Assumption: the predicate must only return --- `True` for ASCII-range characters. Otherwise this function might read a 128-255 range byte, --- thereby breaking UTF-8 decoding. -satisfyASCII :: (Char -> Bool) -> Parser r e Char -satisfyASCII f = Parser \fp !r eob s n -> case eqAddr# eob s of - 1# -> Fail# - _ -> case derefChar8# s of - c1 | f (C# c1) -> OK# (C# c1) (plusAddr# s 1#) n - | otherwise -> Fail# -{-# inline satisfyASCII #-} - --- | Skip an ASCII `Char` for which a predicate holds. Assumption: the --- predicate must only return `True` for ASCII-range characters. -satisfyASCII_ :: (Char -> Bool) -> Parser r e () -satisfyASCII_ f = () <$ satisfyASCII f -{-# inline satisfyASCII_ #-} - --- | This is a variant of `satisfy` which allows more optimization. We can pick four testing --- functions for the four cases for the possible number of bytes in the UTF-8 character. So in --- @fusedSatisfy f1 f2 f3 f4@, if we read a one-byte character, the result is scrutinized with --- @f1@, for two-bytes, with @f2@, and so on. This can result in dramatic lexing speedups. --- --- For example, if we want to accept any letter, the naive solution would be to use --- `Data.Char.isLetter`, but this accesses a large lookup table of Unicode character classes. We --- can do better with @fusedSatisfy isLatinLetter isLetter isLetter isLetter@, since here the --- `isLatinLetter` is inlined into the UTF-8 decoding, and it probably handles a great majority of --- all cases without accessing the character table. -fusedSatisfy :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser r e Char -fusedSatisfy f1 f2 f3 f4 = Parser \fp !r eob buf n -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# | f1 (C# c1) -> OK# (C# c1) (plusAddr# buf 1#) n - | otherwise -> Fail# - _ -> case eqAddr# eob (plusAddr# buf 1#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 1# of - c2 -> case c1 `leChar#` '\xDF'# of - 1# -> - let resc = C# (chr# (((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c2 -# 0x80#))) - in case f2 resc of - True -> OK# resc (plusAddr# buf 2#) n - _ -> Fail# - _ -> case eqAddr# eob (plusAddr# buf 2#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 2# of - c3 -> case c1 `leChar#` '\xEF'# of - 1# -> - let resc = C# (chr# (((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c3 -# 0x80#))) - in case f3 resc of - True -> OK# resc (plusAddr# buf 3#) n - _ -> Fail# - _ -> case eqAddr# eob (plusAddr# buf 3#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 3# of - c4 -> - let resc = C# (chr# (((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c4 -# 0x80#))) - in case f4 resc of - True -> OK# resc (plusAddr# buf 4#) n - _ -> Fail# -{-# inline fusedSatisfy #-} - --- | Skipping variant of `fusedSatisfy`. -fusedSatisfy_ :: (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> (Char -> Bool) -> Parser r e () -fusedSatisfy_ f1 f2 f3 f4 = () <$ fusedSatisfy f1 f2 f3 f4 -{-# inline fusedSatisfy_ #-} - --- | Parse any UTF-8-encoded `Char`. -anyChar :: Parser r e Char -anyChar = Parser \fp !r eob buf n -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# -> OK# (C# c1) (plusAddr# buf 1#) n - _ -> case eqAddr# eob (plusAddr# buf 1#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 1# of - c2 -> case c1 `leChar#` '\xDF'# of - 1# -> - let resc = ((ord# c1 -# 0xC0#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c2 -# 0x80#) - in OK# (C# (chr# resc)) (plusAddr# buf 2#) n - _ -> case eqAddr# eob (plusAddr# buf 2#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 2# of - c3 -> case c1 `leChar#` '\xEF'# of - 1# -> - let resc = ((ord# c1 -# 0xE0#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c3 -# 0x80#) - in OK# (C# (chr# resc)) (plusAddr# buf 3#) n - _ -> case eqAddr# eob (plusAddr# buf 3#) of - 1# -> Fail# - _ -> case indexCharOffAddr# buf 3# of - c4 -> - let resc = ((ord# c1 -# 0xF0#) `uncheckedIShiftL#` 18#) `orI#` - ((ord# c2 -# 0x80#) `uncheckedIShiftL#` 12#) `orI#` - ((ord# c3 -# 0x80#) `uncheckedIShiftL#` 6#) `orI#` - (ord# c4 -# 0x80#) - in OK# (C# (chr# resc)) (plusAddr# buf 4#) n -{-# inline anyChar #-} - --- | Skip any UTF-8-encoded `Char`. -anyChar_ :: Parser r e () -anyChar_ = Parser \fp !r eob buf n -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# -> OK# () (plusAddr# buf 1#) n - _ -> - let buf' = - case c1 `leChar#` '\xDF'# of - 1# -> plusAddr# buf 2# - _ -> case c1 `leChar#` '\xEF'# of - 1# -> plusAddr# buf 3# - _ -> plusAddr# buf 4# - in case leAddr# buf' eob of - 1# -> OK# () buf' n - _ -> Fail# -{-# inline anyChar_ #-} - - --- | Parse any `Char` in the ASCII range, fail if the next input character is not in the range. --- This is more efficient than `anyChar` if we are only working with ASCII. -anyCharASCII :: Parser r e Char -anyCharASCII = Parser \fp !r eob buf n -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case derefChar8# buf of - c1 -> case c1 `leChar#` '\x7F'# of - 1# -> OK# (C# c1) (plusAddr# buf 1#) n - _ -> Fail# -{-# inline anyCharASCII #-} - --- | Skip any `Char` in the ASCII range. More efficient than `anyChar_` if we're working only with --- ASCII. -anyCharASCII_ :: Parser r e () -anyCharASCII_ = () <$ anyCharASCII -{-# inline anyCharASCII_ #-} - --- | Read an `Int` from the input, as a non-empty digit sequence. The `Int` may --- overflow in the result. -readInt :: Parser r e Int -readInt = Parser \fp r eob s n -> case FlatParse.Internal.readInt eob s of - (# (##) | #) -> Fail# - (# | (# i, s' #) #) -> OK# (I# i) s' n -{-# inline readInt #-} - --- | Read an `Int` from the input, as a non-empty case-insensitive ASCII --- hexadecimal digit sequence. The `Int` may overflow in the result. -readIntHex :: Parser r e Int -readIntHex = Parser \fp r eob s n -> case FlatParse.Internal.readIntHex eob s of - (# (##) | #) -> Fail# - (# | (# i, s' #) #) -> OK# (I# i) s' n -{-# inline readIntHex #-} - --- | Read an `Integer` from the input, as a non-empty digit sequence. -readInteger :: Parser r e Integer -readInteger = Parser \fp r eob s n -> case FlatParse.Internal.readInteger fp eob s of - (# (##) | #) -> Fail# - (# | (# i, s' #) #) -> OK# i s' n -{-# inline readInteger #-} - - --------------------------------------------------------------------------------- - --- | Choose between two parsers. If the first parser fails, try the second one, but if the first one --- throws an error, propagate the error. -infixr 6 <|> -(<|>) :: Parser r e a -> Parser r e a -> Parser r e a -(<|>) (Parser f) (Parser g) = Parser \fp !r eob s n -> - case f fp r eob s n of - Fail# -> g fp r eob s n - x -> x -{-# inline[1] (<|>) #-} - -instance Base.Alternative (Parser r e) where - empty = failed - {-# inline empty #-} - (<|>) = (<|>) - {-# inline (Base.<|>) #-} - -instance MonadPlus (Parser r e) where - mzero = failed - {-# inline mzero #-} - mplus = (<|>) - {-# inline mplus #-} - -{-# RULES - -"flatparse/reassoc-alt" forall l m r. (l <|> m) <|> r = l <|> (m <|> r) - -#-} - --- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. --- This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not --- backtrack from the true/false cases. -branch :: Parser r e a -> Parser r e b -> Parser r e b -> Parser r e b -branch pa pt pf = Parser \fp !r eob s n -> case runParser# pa fp r eob s n of - OK# _ s n -> runParser# pt fp r eob s n - Fail# -> runParser# pf fp r eob s n - Err# e -> Err# e -{-# inline branch #-} - --- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, --- and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not --- the usual `chainl` function from the parsec libraries! -chainl :: (b -> a -> b) -> Parser r e b -> Parser r e a -> Parser r e b -chainl f start elem = start >>= go where - go b = do {!a <- elem; go $! f b a} <|> pure b -{-# inline chainl #-} - --- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and --- combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not --- the usual `chainr` function from the parsec libraries! -chainr :: (a -> b -> b) -> Parser r e a -> Parser r e b -> Parser r e b -chainr f (Parser elem) (Parser end) = go where - go = Parser \fp !r eob s n -> case elem fp r eob s n of - OK# a s n -> case runParser# go fp r eob s n of - OK# b s n -> let !b' = f a b in OK# b' s n - x -> x - Fail# -> end fp r eob s n - Err# e -> Err# e -{-# inline chainr #-} - --- | Run a parser zero or more times, collect the results in a list. Note: for optimal performance, --- try to avoid this. Often it is possible to get rid of the intermediate list by using a --- combinator or a custom parser. -many :: Parser r e a -> Parser r e [a] -many (Parser f) = go where - go = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s n -> case runParser# go fp r eob s n of - OK# as s n -> OK# (a:as) s n - x -> x - Fail# -> OK# [] s n - Err# e -> Err# e -{-# inline many #-} - --- | Skip a parser zero or more times. -many_ :: Parser r e a -> Parser r e () -many_ (Parser f) = go where - go = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s n -> runParser# go fp r eob s n - Fail# -> OK# () s n - Err# e -> Err# e -{-# inline many_ #-} - --- | Run a parser one or more times, collect the results in a list. Note: for optimal performance, --- try to avoid this. Often it is possible to get rid of the intermediate list by using a --- combinator or a custom parser. -some :: Parser r e a -> Parser r e [a] -some p = (:) <$> p <*> many p -{-# inline some #-} - --- | Skip a parser one or more times. -some_ :: Parser r e a -> Parser r e () -some_ pa = pa >> many_ pa -{-# inline some_ #-} - --- | Succeed if the first parser succeeds and the second one fails. The parsing --- state is restored to the point of the first argument's success. -notFollowedBy :: Parser r e a -> Parser r e b -> Parser r e a -notFollowedBy p1 p2 = p1 <* lookahead (fails p2) -{-# inline notFollowedBy #-} - --- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All --- isolated bytes must be consumed. --- --- Throws a runtime error if given a negative integer. -isolate :: Int -> Parser r e a -> Parser r e a -isolate (I# n#) p = Parser \fp !r eob s n -> - let s' = plusAddr# s n# - in case n# <=# minusAddr# eob s of - 1# -> case n# >=# 0# of - 1# -> case runParser# p fp r s' s n of - OK# a s'' n' -> case eqAddr# s' s'' of - 1# -> OK# a s'' n' - _ -> Fail# -- isolated segment wasn't fully consumed - Fail# -> Fail# - Err# e -> Err# e - _ -> error "FlatParse.Basic.isolate: negative integer" - _ -> Fail# -- you tried to isolate more than we have left -{-# inline isolate #-} - --------------------------------------------------------------------------------- - --- | Get the current position in the input. -getPos :: Parser r e Pos -getPos = Parser \fp !r eob s n -> OK# (addrToPos# eob s) s n -{-# inline getPos #-} - --- | Set the input position. Warning: this can result in crashes if the position points outside the --- current buffer. It is always safe to `setPos` values which came from `getPos` with the current --- input. -setPos :: Pos -> Parser r e () -setPos s = Parser \fp !r eob _ n -> OK# () (posToAddr# eob s) n -{-# inline setPos #-} - --- | The end of the input. -endPos :: Pos -endPos = Pos 0 -{-# inline endPos #-} - - --- | Return the consumed span of a parser. Use `withSpan` if possible for better efficiency. -spanOf :: Parser r e a -> Parser r e Span -spanOf (Parser f) = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s' n -> OK# (Span (addrToPos# eob s) (addrToPos# eob s')) s' n - x -> unsafeCoerce# x -{-# inline spanOf #-} - --- | Bind the result together with the span of the result. CPS'd version of `spanOf` --- for better unboxing. -withSpan :: Parser r e a -> (a -> Span -> Parser r e b) -> Parser r e b -withSpan (Parser f) g = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s' n -> runParser# (g a (Span (addrToPos# eob s) (addrToPos# eob s'))) fp r eob s' n - x -> unsafeCoerce# x -{-# inline withSpan #-} - --- | Return the `B.ByteString` consumed by a parser. Note: it's more efficient to use `spanOf` and --- `withSpan` instead. -byteStringOf :: Parser r e a -> Parser r e B.ByteString -byteStringOf (Parser f) = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s' n -> OK# (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s))) s' n - x -> unsafeCoerce# x -{-# inline byteStringOf #-} - --- | CPS'd version of `byteStringOf`. Can be more efficient, because the result is more eagerly unboxed --- by GHC. It's more efficient to use `spanOf` or `withSpan` instead. -withByteString :: Parser r e a -> (a -> B.ByteString -> Parser r e b) -> Parser r e b -withByteString (Parser f) g = Parser \fp !r eob s n -> case f fp r eob s n of - OK# a s' n -> runParser# (g a (B.PS (ForeignPtr s fp) 0 (I# (minusAddr# s' s)))) fp r eob s' n - x -> unsafeCoerce# x -{-# inline withByteString #-} - --- | Create a `B.ByteString` from a `Span`. The result is invalid is the `Span` points --- outside the current buffer, or if the `Span` start is greater than the end position. -unsafeSpanToByteString :: Span -> Parser r e B.ByteString -unsafeSpanToByteString (Span l r) = - lookahead (setPos l >> byteStringOf (setPos r)) -{-# inline unsafeSpanToByteString #-} - - --- | Run a parser in a given input span. The input position and the `Int` state is restored after --- the parser is finished, so `inSpan` does not consume input and has no side effect. Warning: --- this operation may crash if the given span points outside the current parsing buffer. It's --- always safe to use `inSpan` if the span comes from a previous `withSpan` or `spanOf` call on --- the current input. -inSpan :: Span -> Parser r e a -> Parser r e a -inSpan (Span s eob) (Parser f) = Parser \fp !r eob' s' n' -> - case f fp r (posToAddr# eob' eob) (posToAddr# eob' s) n' of - OK# a _ _ -> OK# a s' n' - x -> unsafeCoerce# x -{-# inline inSpan #-} - - --------------------------------------------------------------------------------- - --- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, --- throws an error if the encoding is invalid. -takeLine :: Parser r e String -takeLine = branch eof (pure "") do - c <- anyChar - case c of - '\n' -> pure "" - _ -> (c:) <$> takeLine - --- | Parse the rest of the current line as a `String`, but restore the parsing state. --- Assumes UTF-8 encoding. This can be used for debugging. -traceLine :: Parser r e String -traceLine = lookahead takeLine - --- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. -takeRest :: Parser r e String -takeRest = branch eof (pure "") do - c <- anyChar - cs <- takeRest - pure (c:cs) - --- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. --- This can be used for debugging. -traceRest :: Parser r e String -traceRest = lookahead takeRest - --------------------------------------------------------------------------------- - --- | Check that the input has at least the given number of bytes. -ensureBytes# :: Int -> Parser r e () -ensureBytes# (I# len) = Parser \fp !r eob s n -> - case len <=# minusAddr# eob s of - 1# -> OK# () s n - _ -> Fail# -{-# inline ensureBytes# #-} - --- | Unsafely read a concrete byte from the input. It's not checked that the input has --- enough bytes. -scan8# :: Word8 -> Parser r e () -scan8# (W8# c) = Parser \fp !r eob s n -> - case indexWord8OffAddr# s 0# of - c' -> case eqWord8'# c c' of - 1# -> OK# () (plusAddr# s 1#) n - _ -> Fail# -{-# inline scan8# #-} - --- | Unsafely read two concrete bytes from the input. It's not checked that the input has --- enough bytes. -scan16# :: Word16 -> Parser r e () -scan16# (W16# c) = Parser \fp !r eob s n -> - case indexWord16OffAddr# s 0# of - c' -> case eqWord16'# c c' of - 1# -> OK# () (plusAddr# s 2#) n - _ -> Fail# -{-# inline scan16# #-} - --- | Unsafely read four concrete bytes from the input. It's not checked that the input has --- enough bytes. -scan32# :: Word32 -> Parser r e () -scan32# (W32# c) = Parser \fp !r eob s n -> - case indexWord32OffAddr# s 0# of - c' -> case eqWord32'# c c' of - 1# -> OK# () (plusAddr# s 4#) n - _ -> Fail# -{-# inline scan32# #-} - --- | Unsafely read eight concrete bytes from the input. It's not checked that the input has --- enough bytes. -scan64# :: Word -> Parser r e () -scan64# (W# c) = Parser \fp !r eob s n -> - case indexWord64OffAddr# s 0# of - c' -> case eqWord# c c' of - 1# -> OK# () (plusAddr# s 8#) n - _ -> Fail# -{-# inline scan64# #-} - --- | Unsafely read and return a byte from the input. It's not checked that the input is non-empty. -scanAny8# :: Parser r e Word8 -scanAny8# = Parser \fp !r eob s n -> OK# (W8# (indexWord8OffAddr# s 0#)) (plusAddr# s 1#) n -{-# inline scanAny8# #-} - -scanPartial64# :: Int -> Word -> Parser r e () -scanPartial64# (I# len) (W# w) = Parser \fp !r eob s n -> - case indexWordOffAddr# s 0# of - w' -> case uncheckedIShiftL# (8# -# len) 3# of - sh -> case uncheckedShiftL# w' sh of - w' -> case uncheckedShiftRL# w' sh of - w' -> case eqWord# w w' of - 1# -> OK# () (plusAddr# s len) n - _ -> Fail# -{-# inline scanPartial64# #-} - --- | Decrease the current input position by the given number of bytes. -setBack# :: Int -> Parser r e () -setBack# (I# i) = Parser \fp !r eob s n -> - OK# () (plusAddr# s (negateInt# i)) n -{-# inline setBack# #-} - --- | Template function, creates a @Parser r e ()@ which unsafely scans a given --- sequence of bytes. -scanBytes# :: [Word] -> Q Exp -scanBytes# bytes = do - let !(leading, w8s) = splitBytes bytes - !scanw8s = go w8s where - go (w8:[] ) = [| scan64# w8 |] - go (w8:w8s) = [| scan64# w8 >> $(go w8s) |] - go [] = [| pure () |] - case w8s of - [] -> go leading - where - go (a:b:c:d:[]) = let !w = packBytes [a, b, c, d] in [| scan32# w |] - go (a:b:c:d:ws) = let !w = packBytes [a, b, c, d] in [| scan32# w >> $(go ws) |] - go (a:b:[]) = let !w = packBytes [a, b] in [| scan16# w |] - go (a:b:ws) = let !w = packBytes [a, b] in [| scan16# w >> $(go ws) |] - go (a:[]) = [| scan8# a |] - go [] = [| pure () |] - _ -> case leading of - - [] -> scanw8s - [a] -> [| scan8# a >> $scanw8s |] - ws@[a, b] -> let !w = packBytes ws in [| scan16# w >> $scanw8s |] - ws@[a, b, c, d] -> let !w = packBytes ws in [| scan32# w >> $scanw8s |] - ws -> let !w = packBytes ws - !l = length ws - in [| scanPartial64# l w >> $scanw8s |] - - --- Switching code generation --------------------------------------------------------------------------------- - -#if MIN_VERSION_base(4,15,0) -mkDoE = DoE Nothing -{-# inline mkDoE #-} -#else -mkDoE = DoE -{-# inline mkDoE #-} -#endif - -genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp -genTrie (rules, t) = do - branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules - - let ix m k = case M.lookup k m of - Nothing -> error ("key not in map: " ++ show k) - Just a -> a - - let ensure :: Maybe Int -> Maybe (Q Exp) - ensure = fmap (\n -> [| ensureBytes# n |]) - - fallback :: Rule -> Int -> Q Exp - fallback rule 0 = pure $ VarE $ fst $ ix branches rule - fallback rule n = [| setBack# n >> $(pure $ VarE $ fst $ ix branches rule) |] - - let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp - go = \case - Branch' (r, n, alloc) ts - | M.null ts -> pure $ VarE $ fst $ branches M.! r - | otherwise -> do - !next <- (traverse . traverse) go (M.toList ts) - !defaultCase <- fallback r (n + 1) - - let cases = mkDoE $ - [BindS (VarP (mkName "c")) (VarE 'scanAny8#), - NoBindS (CaseE (VarE (mkName "c")) - (map (\(w, t) -> - Match (LitP (IntegerL (fromIntegral w))) - (NormalB t) - []) - next - ++ [Match WildP (NormalB defaultCase) []]))] - - case ensure alloc of - Nothing -> pure cases - Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |] - - Path (r, n, alloc) ws t -> - case ensure alloc of - Nothing -> [| branch $(scanBytes# ws) $(go t) $(fallback r n)|] - Just alloc -> [| branch ($alloc >> $(scanBytes# ws)) $(go t) $(fallback r n) |] - - letE - (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches)) - (go t) - -parseSwitch :: Q Exp -> Q ([(String, Exp)], Maybe Exp) -parseSwitch exp = exp >>= \case - CaseE (UnboundVarE _) [] -> error "switch: empty clause list" - CaseE (UnboundVarE _) cases -> do - (!cases, !last) <- pure (init cases, last cases) - !cases <- forM cases \case - Match (LitP (StringL str)) (NormalB rhs) [] -> pure (str, rhs) - _ -> error "switch: expected a match clause on a string literal" - (!cases, !last) <- case last of - Match (LitP (StringL str)) (NormalB rhs) [] -> pure (cases ++ [(str, rhs)], Nothing) - Match WildP (NormalB rhs) [] -> pure (cases, Just rhs) - _ -> error "switch: expected a match clause on a string literal or a wildcard" - pure (cases, last) - _ -> error "switch: expected a \"case _ of\" expression" - -genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp - -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -genSwitchTrie' postAction cases fallback = - - let (!branches, !strings) = unzip do - (!i, (!str, !rhs)) <- zip [0..] cases - case postAction of - Nothing -> pure ((Just i, rhs), (i, str)) - Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str)) - - !m = M.fromList ((Nothing, maybe (VarE 'failed) id fallback) : branches) - !trie = compileTrie strings - in (m , trie) - --------------------------------------------------------------------------------- - -withAnyWord8# :: (Word8'# -> Parser r e a) -> Parser r e a -withAnyWord8# p = Parser \fp !r eob buf n -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case indexWord8OffAddr# buf 0# of - w# -> runParser# (p w#) fp r eob (plusAddr# buf 1#) n -{-# inline withAnyWord8# #-} - -withAnyWord16# :: (Word16'# -> Parser r e a) -> Parser r e a -withAnyWord16# p = Parser \fp !r eob buf n -> case 2# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexWord16OffAddr# buf 0# of - w# -> runParser# (p w#) fp r eob (plusAddr# buf 2#) n -{-# inline withAnyWord16# #-} - -withAnyWord32# :: (Word32'# -> Parser r e a) -> Parser r e a -withAnyWord32# p = Parser \fp !r eob buf n -> case 4# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexWord32OffAddr# buf 0# of - w# -> runParser# (p w#) fp r eob (plusAddr# buf 4#) n -{-# inline withAnyWord32# #-} - -withAnyWord64# :: (Word# -> Parser r e a) -> Parser r e a -withAnyWord64# p = Parser \fp !r eob buf n -> case 8# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexWordOffAddr# buf 0# of - w# -> runParser# (p w#) fp r eob (plusAddr# buf 8#) n -{-# inline withAnyWord64# #-} - -withAnyInt8# :: (Int8'# -> Parser r e a) -> Parser r e a -withAnyInt8# p = Parser \fp !r eob buf n -> case eqAddr# eob buf of - 1# -> Fail# - _ -> case indexInt8OffAddr# buf 0# of - i# -> runParser# (p i#) fp r eob (plusAddr# buf 1#) n -{-# inline withAnyInt8# #-} - -withAnyInt16# :: (Int16'# -> Parser r e a) -> Parser r e a -withAnyInt16# p = Parser \fp !r eob buf n -> case 2# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexInt16OffAddr# buf 0# of - i# -> runParser# (p i#) fp r eob (plusAddr# buf 2#) n -{-# inline withAnyInt16# #-} - -withAnyInt32# :: (Int32'# -> Parser r e a) -> Parser r e a -withAnyInt32# p = Parser \fp !r eob buf n -> case 4# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexInt32OffAddr# buf 0# of - i# -> runParser# (p i#) fp r eob (plusAddr# buf 4#) n -{-# inline withAnyInt32# #-} - -withAnyInt64# :: (Int# -> Parser r e a) -> Parser r e a -withAnyInt64# p = Parser \fp !r eob buf n -> case 8# <=# minusAddr# eob buf of - 0# -> Fail# - _ -> case indexInt64OffAddr# buf 0# of - i# -> runParser# (p i#) fp r eob (plusAddr# buf 8#) n -{-# inline withAnyInt64# #-} - --------------------------------------------------------------------------------- - --- | Parse any 'Word8' (byte). -anyWord8 :: Parser r e Word8 -anyWord8 = withAnyWord8# (\w# -> pure (W8# w#)) -{-# inline anyWord8 #-} - --- | Skip any 'Word8' (byte). -anyWord8_ :: Parser r e () -anyWord8_ = () <$ anyWord8 -{-# inline anyWord8_ #-} - --- | Parse any 'Word16'. -anyWord16 :: Parser r e Word16 -anyWord16 = withAnyWord16# (\w# -> pure (W16# w#)) -{-# inline anyWord16 #-} - --- | Skip any 'Word16'. -anyWord16_ :: Parser r e () -anyWord16_ = () <$ anyWord16 -{-# inline anyWord16_ #-} - --- | Parse any 'Word32'. -anyWord32 :: Parser r e Word32 -anyWord32 = withAnyWord32# (\w# -> pure (W32# w#)) -{-# inline anyWord32 #-} - --- | Skip any 'Word32'. -anyWord32_ :: Parser r e () -anyWord32_ = () <$ anyWord32 -{-# inline anyWord32_ #-} - --- | Parse any 'Word64'. -anyWord64 :: Parser r e Word64 -anyWord64 = withAnyWord64# (\w# -> pure (W64# w#)) -{-# inline anyWord64 #-} - --- | Skip any 'Word64'. -anyWord64_ :: Parser r e () -anyWord64_ = () <$ anyWord64 -{-# inline anyWord64_ #-} - --- | Parse any 'Word'. -anyWord :: Parser r e Word -anyWord = withAnyWord64# (\w# -> pure (W# w#)) -{-# inline anyWord #-} - --- | Skip any 'Word'. -anyWord_ :: Parser r e () -anyWord_ = () <$ anyWord -{-# inline anyWord_ #-} - --------------------------------------------------------------------------------- - --- | Parse any 'Int8'. -anyInt8 :: Parser r e Int8 -anyInt8 = withAnyInt8# (\i# -> pure (I8# i#)) -{-# inline anyInt8 #-} - --- | Parse any 'Int16'. -anyInt16 :: Parser r e Int16 -anyInt16 = withAnyInt16# (\i# -> pure (I16# i#)) -{-# inline anyInt16 #-} - --- | Parse any 'Int32'. -anyInt32 :: Parser r e Int32 -anyInt32 = withAnyInt32# (\i# -> pure (I32# i#)) -{-# inline anyInt32 #-} - --- | Parse any 'Int64'. -anyInt64 :: Parser r e Int64 -anyInt64 = withAnyInt64# (\i# -> pure (I64# i#)) -{-# inline anyInt64 #-} - --- | Parse any 'Int'. -anyInt :: Parser r e Int -anyInt = withAnyInt64# (\i# -> pure (I# i#)) -{-# inline anyInt #-} - --------------------------------------------------------------------------------- - --- | Parse any 'Word16' (little-endian). -anyWord16le :: Parser r e Word16 -anyWord16le = anyWord16 -{-# inline anyWord16le #-} - --- | Parse any 'Word16' (big-endian). -anyWord16be :: Parser r e Word16 -anyWord16be = withAnyWord16# (\w# -> pure (W16# (byteSwap16'# w#))) -{-# inline anyWord16be #-} - --- | Parse any 'Word32' (little-endian). -anyWord32le :: Parser r e Word32 -anyWord32le = anyWord32 -{-# inline anyWord32le #-} - --- | Parse any 'Word32' (big-endian). -anyWord32be :: Parser r e Word32 -anyWord32be = withAnyWord32# (\w# -> pure (W32# (byteSwap32'# w#))) -{-# inline anyWord32be #-} - --- | Parse any 'Word64' (little-endian). -anyWord64le :: Parser r e Word64 -anyWord64le = anyWord64 -{-# inline anyWord64le #-} - --- | Parse any 'Word64' (big-endian). -anyWord64be :: Parser r e Word64 -anyWord64be = withAnyWord64# (\w# -> pure (W64# (byteSwap# w#))) -{-# inline anyWord64be #-} - --------------------------------------------------------------------------------- - --- | Parse any 'Int16' (little-endian). -anyInt16le :: Parser r e Int16 -anyInt16le = anyInt16 -{-# inline anyInt16le #-} - --- | Parse any 'Int16' (big-endian). -anyInt16be :: Parser r e Int16 -anyInt16be = withAnyWord16# (\w# -> pure (I16# (word16ToInt16# (byteSwap16'# w#)))) -{-# inline anyInt16be #-} - --- | Parse any 'Int32' (little-endian). -anyInt32le :: Parser r e Int32 -anyInt32le = anyInt32 -{-# inline anyInt32le #-} - --- | Parse any 'Int32' (big-endian). -anyInt32be :: Parser r e Int32 -anyInt32be = withAnyWord32# (\w# -> pure (I32# (word32ToInt32# (byteSwap32'# w#)))) -{-# inline anyInt32be #-} - --- | Parse any 'Int64' (little-endian). -anyInt64le :: Parser r e Int64 -anyInt64le = anyInt64 -{-# inline anyInt64le #-} - --- | Parse any 'Int64' (big-endian). -anyInt64be :: Parser r e Int64 -anyInt64be = withAnyWord64# (\w# -> pure (I64# (word2Int# (byteSwap# w#)))) -{-# inline anyInt64be #-} - --------------------------------------------------------------------------------- - --- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ --- bytes are available. --- --- Throws a runtime error if given a negative integer. -atSkip# :: Int# -> Parser r e a -> Parser r e a -atSkip# os# (Parser p) = Parser \fp !r eob s n -> case os# <=# minusAddr# eob s of - 1# -> case os# >=# 0# of - 1# -> p fp r eob (plusAddr# s os#) n - _ -> error "FlatParse.Basic.atSkip#: negative integer" - _ -> Fail# -{-# inline atSkip# #-} - --- | Read the given number of bytes as a 'ByteString'. --- --- Throws a runtime error if given a negative integer. -takeBs# :: Int# -> Parser r e B.ByteString -takeBs# n# = Parser \fp !r eob s n -> case n# <=# minusAddr# eob s of - 1# -> -- have to runtime check for negative values, because they cause a hang - case n# >=# 0# of - 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) n - _ -> error "FlatParse.Basic.takeBs: negative integer" - _ -> Fail# -{-# inline takeBs# #-} - --------------------------------------------------------------------------------- - --- | Run a parser, passing it the current address the parser is at. --- --- Useful for parsing offset-based data tables. For example, you may use this to --- save the base address to use together with various 0-indexed offsets. -withAddr# :: (Addr# -> Parser r e a) -> Parser r e a -withAddr# p = Parser \fp !r eob s n -> runParser# (p s) fp r eob s n -{-# inline withAddr# #-} - --- | @takeBsOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ --- bytes, reads @len#@ bytes into a 'ByteString', and restores the original --- address. --- --- The 'Addr#' should be from 'withAddr#'. --- --- Useful for parsing offset-based data tables. For example, you may use this --- together with 'withAddr#' to jump to an offset in your input and read some --- data. -takeBsOffAddr# :: Addr# -> Int# -> Int# -> Parser r e B.ByteString -takeBsOffAddr# addr# offset# len# = - lookaheadFromAddr# addr# $ atSkip# offset# $ takeBs# len# -{-# inline takeBsOffAddr# #-} - --- | 'lookahead', but specify the address to lookahead from. --- --- The 'Addr#' should be from 'withAddr#'. -lookaheadFromAddr# :: Addr# -> Parser r e a -> Parser r e a -lookaheadFromAddr# s = lookahead . atAddr# s -{-# inline lookaheadFromAddr# #-} - --- | Run a parser at the given address. --- --- The 'Addr#' should be from 'withAddr#'. --- --- This is a highly internal function -- you likely want 'lookaheadFromAddr#', --- which will reset the address after running the parser. -atAddr# :: Addr# -> Parser r e a -> Parser r e a -atAddr# s (Parser p) = Parser \fp !r eob _ n -> p fp r eob s n -{-# inline atAddr# #-} - --------------------------------------------------------------------------------- - --- | Read a null-terminated bytestring (a C-style string). --- --- Consumes the null terminator. -anyCString :: Parser r e B.ByteString -anyCString = Parser \fp !r eob s n -> go' fp eob s n - where - go' fp eob s0 n = go 0# s0 n - where - go n# s n = case eqAddr# eob s of - 1# -> Fail# - _ -> - let s' = plusAddr# s 1# - -- TODO below is a candidate for improving with ExtendedLiterals! - in case eqWord8# (indexWord8OffAddr''# s 0#) (wordToWord8''# 0##) of - 1# -> OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' n - _ -> go (n# +# 1#) s' n -{-# inline anyCString #-} - --- | Read a null-terminated bytestring (a C-style string), where the bytestring --- is known to be null-terminated somewhere in the input. --- --- Highly unsafe. Unless you have a guarantee that the string will be null --- terminated before the input ends, use 'anyCString' instead. Honestly, I'm not --- sure if this is a good function to define. But here it is. --- --- Fails on GHC versions older than 9.0, since we make use of the --- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful --- without it. --- --- Consumes the null terminator. -anyCStringUnsafe :: Parser r e B.ByteString -{-# inline anyCStringUnsafe #-} -#if MIN_VERSION_base(4,15,0) -anyCStringUnsafe = Parser \fp !r eob s n -> - case eqAddr# eob s of - 1# -> Fail# - _ -> let n# = cstringLength# s - s'# = plusAddr# s (n# +# 1#) - in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# n -#else -anyCStringUnsafe = error "Flatparse.Basic.anyCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" -#endif diff --git a/test/Test.hs b/test/Test.hs index 9db3ba8..da4641d 100644 --- a/test/Test.hs +++ b/test/Test.hs @@ -3,6 +3,7 @@ module Main where import Numeric (showHex) + import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.Char @@ -14,9 +15,12 @@ import Test.Hspec.QuickCheck import Test.QuickCheck hiding ( (.&.) ) import Data.Word import Data.Int +import GHC.Int import Data.Bits import Test.QuickCheck.Instances.ByteString() +import Control.Applicative + main :: IO () main = hspec $ do basicSpec @@ -33,7 +37,7 @@ basicSpec = describe "FlatParse.Basic" $ do describe "lookahead" $ it "restores state" $ do - let p = FB.lookahead $(FB.string "fun") *> $(FB.string "function") + let p = FB.lookahead $(FB.getStringOf "fun") *> $(FB.getStringOf "function") p `shouldParse` "function" describe "fails" $ do @@ -85,45 +89,44 @@ basicSpec = describe "FlatParse.Basic" $ do \(bs :: ByteString) -> (FB.skip (B.length bs) >> FB.eof) `shouldParse` bs - describe "char" $ do - it "succeeds on that char" $ $(FB.char 'a') `shouldParse` "a" - it "succeeds on multibyte char" $ $(FB.char 'ȩ') `shouldParse` FB.packUTF8 "ȩ" - it "fails on the wrong char" $ $(FB.char 'a') `shouldParseFail` "b" - it "fails at end of file" $ $(FB.char 'a') `shouldParseFail` "" + describe "getCharOf" $ do + it "succeeds on that char" $ $(FB.getCharOf 'a') `shouldParse` "a" + it "succeeds on multibyte char" $ $(FB.getCharOf 'ȩ') `shouldParse` FB.packUTF8 "ȩ" + it "fails on the wrong char" $ $(FB.getCharOf 'a') `shouldParseFail` "b" + it "fails at end of file" $ $(FB.getCharOf 'a') `shouldParseFail` "" - describe "byte" $ do - it "succeeds on that byte" $ FB.byte 0x61 `shouldParse` "\x61" - it "succeeds on high bytes" $ FB.byte 0xfe `shouldParse` "\xfe" - it "fails on the wrong byte" $ FB.byte 0x61 `shouldParseFail` "\x62" - it "fails on end of file" $ FB.byte 0x61 `shouldParseFail` "" + describe "getWord8Of" $ do + it "succeeds on that byte" $ FB.getWord8Of 0x61 `shouldParse` "\x61" + it "succeeds on high bytes" $ FB.getWord8Of 0xfe `shouldParse` "\xfe" + it "fails on the wrong byte" $ FB.getWord8Of 0x61 `shouldParseFail` "\x62" + it "fails on end of file" $ FB.getWord8Of 0x61 `shouldParseFail` "" - describe "bytes" $ do + describe "getBytesOf" $ do it "succeeds on those bytes" $ - $(FB.bytes [1, 2, 3, 4]) `shouldParse` "\x01\x02\x03\x04" + $(FB.getBytesOf [1, 2, 3, 4]) `shouldParse` "\x01\x02\x03\x04" it "succeeds on high bytes" $ - $(FB.bytes [0xf1, 0xf2, 0xf3, 0xf4]) `shouldParse` "\xf1\xf2\xf3\xf4" + $(FB.getBytesOf [0xf1, 0xf2, 0xf3, 0xf4]) `shouldParse` "\xf1\xf2\xf3\xf4" it "fails on wrong bytes" $ - $(FB.bytes [1, 2, 5, 4]) `shouldParseFail` "\x01\x02\x03\x04" + $(FB.getBytesOf [1, 2, 5, 4]) `shouldParseFail` "\x01\x02\x03\x04" it "fails when out of space" $ - $(FB.bytes [1, 2, 3, 4]) `shouldParseFail` "\x01\x02\x03" + $(FB.getBytesOf [1, 2, 3, 4]) `shouldParseFail` "\x01\x02\x03" describe "byteString" $ do it "succeeds on those bytes" $ - FB.byteString (B.pack [1, 2, 3, 4]) `shouldParse` "\x01\x02\x03\x04" + FB.getByteStringOf (B.pack [1, 2, 3, 4]) `shouldParse` "\x01\x02\x03\x04" it "succeeds on high bytestring" $ - FB.byteString (B.pack [0xf1, 0xf2, 0xf3, 0xf4]) `shouldParse` "\xf1\xf2\xf3\xf4" + FB.getByteStringOf (B.pack [0xf1, 0xf2, 0xf3, 0xf4]) `shouldParse` "\xf1\xf2\xf3\xf4" it "fails on wrong bytestring" $ - FB.byteString (B.pack [1, 2, 5, 4]) `shouldParseFail` "\x01\x02\x03\x04" + FB.getByteStringOf (B.pack [1, 2, 5, 4]) `shouldParseFail` "\x01\x02\x03\x04" it "fails when out of space" $ - FB.byteString (B.pack [1, 2, 3, 4]) `shouldParseFail` "\x01\x02\x03" - + FB.getByteStringOf (B.pack [1, 2, 3, 4]) `shouldParseFail` "\x01\x02\x03" - describe "string" $ do - it "succeeds on the right string" $ $(FB.string "foo") `shouldParse` "foo" + describe "getStringOf" $ do + it "succeeds on the right string" $ $(FB.getStringOf "foo") `shouldParse` "foo" it "succeeds with multibyte chars" $ - $(FB.string "foȩ") `shouldParse` FB.packUTF8 "foȩ" - it "fails on the wrong string" $ $(FB.string "foo") `shouldParseFail` "bar" - it "fails when out of space" $ $(FB.string "foo") `shouldParseFail` "fo" + $(FB.getStringOf "foȩ") `shouldParse` FB.packUTF8 "foȩ" + it "fails on the wrong string" $ $(FB.getStringOf "foo") `shouldParseFail` "bar" + it "fails when out of space" $ $(FB.getStringOf "foo") `shouldParseFail` "fo" describe "switch" $ do it "parses simple words" $ @@ -181,7 +184,7 @@ basicSpec = describe "FlatParse.Basic" $ do describe "switchWithPost" $ do it "applies post after match" $ $( FB.switchWithPost - (Just [|$(FB.string "bar")|]) + (Just [|$(FB.getStringOf "bar")|]) [| case _ of "foo" -> pure () @@ -191,7 +194,7 @@ basicSpec = describe "FlatParse.Basic" $ do it "doesn't apply post after default" $ $( FB.switchWithPost - (Just [|$(FB.string "bar")|]) + (Just [|$(FB.getStringOf "bar")|]) [| case _ of "foo" -> pure () @@ -202,7 +205,7 @@ basicSpec = describe "FlatParse.Basic" $ do it "requires the post must match" $ $( FB.switchWithPost - (Just [|$(FB.string "bar")|]) + (Just [|$(FB.getStringOf "bar")|]) [| case _ of "foo" -> pure () @@ -253,7 +256,7 @@ basicSpec = describe "FlatParse.Basic" $ do it "applies post after match" $ $( FB.rawSwitchWithPost - (Just [|$(FB.string "bar")|]) + (Just [|$(FB.getStringOf "bar")|]) [("foo", [|pure ()|])] Nothing ) @@ -261,7 +264,7 @@ basicSpec = describe "FlatParse.Basic" $ do it "doesn't apply post after default" $ $( FB.rawSwitchWithPost - (Just [|$(FB.string "bar")|]) + (Just [|$(FB.getStringOf "bar")|]) [("foo", [|pure ()|])] (Just [|pure ()|]) ) @@ -269,7 +272,7 @@ basicSpec = describe "FlatParse.Basic" $ do it "requires the post must match" $ $( FB.rawSwitchWithPost - (Just [|$(FB.string "bar")|]) + (Just [|$(FB.getStringOf "bar")|]) [("foo", [|pure ()|])] Nothing ) @@ -334,54 +337,54 @@ basicSpec = describe "FlatParse.Basic" $ do FB.fusedSatisfy (const True) (const True) (const True) (const True) `shouldParseFail` "" - describe "anyWord8" $ do - it "reads a byte" $ FB.anyWord8 `shouldParseWith` ("\xef", 0xef) - it "fails on FB.empty input" $ FB.anyWord8 `shouldParseFail` "" + describe "getWord8" $ do + it "reads a byte" $ FB.getWord8 `shouldParseWith` ("\xef", 0xef) + it "fails on empty input" $ FB.getWord8 `shouldParseFail` "" - describe "anyWord16" $ do + describe "getWord16" $ do -- Byte order is unspecified, so just assert that it succeeds. - it "succeeds" $ FB.anyWord16 `shouldParse` "\xef\xbe" + it "succeeds" $ FB.getWord16 `shouldParse` "\xef\xbe" - it "fails on FB.empty input" $ FB.anyWord16 `shouldParseFail` "" - it "fails on insufficient input" $ FB.anyWord16 `shouldParseFail` "\xff" + it "fails on empty input" $ FB.getWord16 `shouldParseFail` "" + it "fails on insufficient input" $ FB.getWord16 `shouldParseFail` "\xff" - describe "anyWord32" $ do + describe "getWord32" $ do -- Byte order is unspecified, so just assert that it succeeds. - it "succeeds" $ FB.anyWord32 `shouldParse` "\xef\xbe\xae\x7e" + it "succeeds" $ FB.getWord32 `shouldParse` "\xef\xbe\xae\x7e" - it "fails on empty input" $ FB.anyWord32 `shouldParseFail` "" + it "fails on empty input" $ FB.getWord32 `shouldParseFail` "" it "fails on insufficient input" $ - FB.anyWord32 `shouldParseFail` "\xff\xff\xff" + FB.getWord32 `shouldParseFail` "\xff\xff\xff" - describe "anyWord" $ do + describe "getWord" $ do -- This combinator is inherently non-portable, but we know a Word is at -- least some bytes. - it "fails on FB.empty input" $ FB.anyWord `shouldParseFail` "" - - describe "anyChar" $ do - it "reads 1-byte char" $ FB.anyChar `shouldParseWith` (FB.packUTF8 "$", '$') - it "reads 2-byte char" $ FB.anyChar `shouldParseWith` (FB.packUTF8 "¢", '¢') - it "reads 3-byte char" $ FB.anyChar `shouldParseWith` (FB.packUTF8 "€", '€') - it "reads 4-byte char" $ FB.anyChar `shouldParseWith` (FB.packUTF8 "𐍈", '𐍈') - it "fails on FB.empty input" $ FB.anyChar `shouldParseFail` "" - - describe "anyChar_" $ do - it "reads 1-byte char" $ FB.anyChar_ `shouldParseWith` (FB.packUTF8 "$", ()) - it "reads 2-byte char" $ FB.anyChar_ `shouldParseWith` (FB.packUTF8 "¢", ()) - it "reads 3-byte char" $ FB.anyChar_ `shouldParseWith` (FB.packUTF8 "€", ()) - it "reads 4-byte char" $ FB.anyChar_ `shouldParseWith` (FB.packUTF8 "𐍈", ()) - it "fails on empty input" $ FB.anyChar_ `shouldParseFail` "" - - describe "anyCharASCII" $ do - it "reads ASCII char" $ FB.anyCharASCII `shouldParseWith` (FB.packUTF8 "$", '$') - it "fails on non-ASCII char" $ FB.anyCharASCII `shouldParseFail` FB.packUTF8 "¢" - it "fails on empty input" $ FB.anyCharASCII `shouldParseFail` "" - - describe "anyCharASCII_" $ do - it "reads ASCII char" $ FB.anyCharASCII_ `shouldParseWith` (FB.packUTF8 "$", ()) + it "fails on empty input" $ FB.getWord `shouldParseFail` "" + + describe "getChar" $ do + it "reads 1-byte char" $ FB.getChar `shouldParseWith` (FB.packUTF8 "$", '$') + it "reads 2-byte char" $ FB.getChar `shouldParseWith` (FB.packUTF8 "¢", '¢') + it "reads 3-byte char" $ FB.getChar `shouldParseWith` (FB.packUTF8 "€", '€') + it "reads 4-byte char" $ FB.getChar `shouldParseWith` (FB.packUTF8 "𐍈", '𐍈') + it "fails on empty input" $ FB.getChar `shouldParseFail` "" + + describe "getChar_" $ do + it "reads 1-byte char" $ FB.getChar_ `shouldParseWith` (FB.packUTF8 "$", ()) + it "reads 2-byte char" $ FB.getChar_ `shouldParseWith` (FB.packUTF8 "¢", ()) + it "reads 3-byte char" $ FB.getChar_ `shouldParseWith` (FB.packUTF8 "€", ()) + it "reads 4-byte char" $ FB.getChar_ `shouldParseWith` (FB.packUTF8 "𐍈", ()) + it "fails on empty input" $ FB.getChar_ `shouldParseFail` "" + + describe "getCharASCII" $ do + it "reads ASCII char" $ FB.getCharASCII `shouldParseWith` (FB.packUTF8 "$", '$') + it "fails on non-ASCII char" $ FB.getCharASCII `shouldParseFail` FB.packUTF8 "¢" + it "fails on empty input" $ FB.getCharASCII `shouldParseFail` "" + + describe "getCharASCII_" $ do + it "reads ASCII char" $ FB.getCharASCII_ `shouldParseWith` (FB.packUTF8 "$", ()) it "fails on non-ASCII char" $ - FB.anyCharASCII_ `shouldParseFail` FB.packUTF8 "¢" - it "fails on empty input" $ FB.anyCharASCII_ `shouldParseFail` "" + FB.getCharASCII_ `shouldParseFail` FB.packUTF8 "¢" + it "fails on empty input" $ FB.getCharASCII_ `shouldParseFail` "" describe "isDigit" $ do it "agrees with Data.Char" $ @@ -395,104 +398,104 @@ basicSpec = describe "FlatParse.Basic" $ do FB.isLatinLetter c === (Data.Char.isAsciiUpper c || Data.Char.isAsciiLower c) - describe "readInt" $ do + describe "getAsciiDecimalInt" $ do it "round-trips on non-negative Ints" $ property $ - \(NonNegative i) -> FB.readInt `shouldParseWith` (FB.packUTF8 (show i), i) + \(NonNegative i) -> FB.getAsciiDecimalInt `shouldParseWith` (FB.packUTF8 (show i), i) - it "fails on non-integers" $ FB.readInt `shouldParseFail` "foo" - it "fails on negative integers" $ FB.readInt `shouldParseFail` "-5" - it "fails on FB.empty input" $ FB.readInt `shouldParseFail` "" - - describe "readIntHex" $ do - it "round-trips on non-negative Ints, lowercase" $ - property $ - \(NonNegative i) -> FB.readIntHex `shouldParseWith` (FB.packUTF8 (showHex i ""), i) - - it "round-trips on non-negative Ints, uppercase" $ - property $ - \(NonNegative i) -> FB.readIntHex `shouldParseWith` (FB.packUTF8 (Data.Char.toUpper <$> showHex i ""), i) + it "fails on non-integers" $ FB.getAsciiDecimalInt `shouldParseFail` "foo" + it "fails on negative integers" $ FB.getAsciiDecimalInt `shouldParseFail` "-5" + it "fails on empty input" $ FB.getAsciiDecimalInt `shouldParseFail` "" - it "fails on non-integers" $ FB.readIntHex `shouldParseFail` "quux" - it "fails on negative integers" $ FB.readIntHex `shouldParseFail` "-5" - it "fails on FB.empty input" $ FB.readIntHex `shouldParseFail` "" - - describe "readInteger" $ do + describe "getAsciiDecimalInteger" $ do it "round-trips on non-negative Integers" $ property $ \(NonNegative i) -> - FB.readInteger `shouldParseWith` (FB.packUTF8 (show i), i) + FB.getAsciiDecimalInteger `shouldParseWith` (FB.packUTF8 (show i), i) - it "fails on non-integers" $ FB.readInteger `shouldParseFail` "foo" - it "fails on negative integers" $ FB.readInteger `shouldParseFail` "-5" - it "fails on FB.empty input" $ FB.readInteger `shouldParseFail` "" + it "fails on non-integers" $ FB.getAsciiDecimalInteger `shouldParseFail` "foo" + it "fails on negative integers" $ FB.getAsciiDecimalInteger `shouldParseFail` "-5" + it "fails on empty input" $ FB.getAsciiDecimalInteger `shouldParseFail` "" - describe "anyCString" $ do + describe "getCString" $ do prop "parses arbitrary null-terminated bytestrings" $ \(bs :: ByteString) -> let bs' = B.snoc bs 0x00 expected = B.takeWhile (/= 0x00) bs' - in FB.anyCString `shouldParsePartialWith` (bs', expected) + in FB.getCString `shouldParsePartialWith` (bs', expected) + + describe "getAsciiHexInt" $ do + it "round-trips on non-negative Ints, lowercase" $ + property $ + \(NonNegative i) -> FB.getAsciiHexInt `shouldParseWith` (FB.packUTF8 (showHex i ""), i) + + it "round-trips on non-negative Ints, uppercase" $ + property $ + \(NonNegative i) -> FB.getAsciiHexInt `shouldParseWith` (FB.packUTF8 (Data.Char.toUpper <$> showHex i ""), i) + + it "fails on non-integers" $ FB.getAsciiHexInt `shouldParseFail` "quux" + it "fails on negative integers" $ FB.getAsciiHexInt `shouldParseFail` "-5" + it "fails on FB.empty input" $ FB.getAsciiHexInt `shouldParseFail` "" describe "Explicit-endianness machine integers" $ do describe "Unsigned" $ do prop "parses Word8s" $ do - \(w :: Word8) -> FB.anyWord8 `shouldParseWith` (w8AsByteString w, w) + \(w :: Word8) -> FB.getWord8 `shouldParseWith` (w8AsByteString w, w) prop "parses Word16s (LE)" $ do - \(w :: Word16) -> FB.anyWord16le `shouldParseWith` (w16leAsByteString w, w) + \(w :: Word16) -> FB.getWord16le `shouldParseWith` (w16leAsByteString w, w) prop "parses Word16s (BE)" $ do - \(w :: Word16) -> FB.anyWord16be `shouldParseWith` (B.reverse (w16leAsByteString w), w) + \(w :: Word16) -> FB.getWord16be `shouldParseWith` (B.reverse (w16leAsByteString w), w) prop "parses Word32s (LE)" $ do - \(w :: Word32) -> FB.anyWord32le `shouldParseWith` (w32leAsByteString w, w) + \(w :: Word32) -> FB.getWord32le `shouldParseWith` (w32leAsByteString w, w) prop "parses Word32s (BE)" $ do - \(w :: Word32) -> FB.anyWord32be `shouldParseWith` (B.reverse (w32leAsByteString w), w) + \(w :: Word32) -> FB.getWord32be `shouldParseWith` (B.reverse (w32leAsByteString w), w) prop "parses Word64s (LE)" $ do - \(w :: Word64) -> FB.anyWord64le `shouldParseWith` (w64leAsByteString w, w) + \(w :: Word64) -> FB.getWord64le `shouldParseWith` (w64leAsByteString w, w) prop "parses Word64s (BE)" $ do - \(w :: Word64) -> FB.anyWord64be `shouldParseWith` (B.reverse (w64leAsByteString w), w) + \(w :: Word64) -> FB.getWord64be `shouldParseWith` (B.reverse (w64leAsByteString w), w) describe "Signed" $ do prop "parses Int8s" $ do - \(i :: Int8) -> FB.anyInt8 `shouldParseWith` (w8AsByteString i, i) + \(i :: Int8) -> FB.getInt8 `shouldParseWith` (w8AsByteString i, i) prop "parses Int16s (LE)" $ do - \(i :: Int16) -> FB.anyInt16le `shouldParseWith` (w16leAsByteString i, i) + \(i :: Int16) -> FB.getInt16le `shouldParseWith` (w16leAsByteString i, i) prop "parses Int16s (BE)" $ do - \(i :: Int16) -> FB.anyInt16be `shouldParseWith` (B.reverse (w16leAsByteString i), i) + \(i :: Int16) -> FB.getInt16be `shouldParseWith` (B.reverse (w16leAsByteString i), i) prop "parses Int32s (LE)" $ do - \(i :: Int32) -> FB.anyInt32le `shouldParseWith` (w32leAsByteString i, i) + \(i :: Int32) -> FB.getInt32le `shouldParseWith` (w32leAsByteString i, i) prop "parses Int32s (BE)" $ do - \(i :: Int32) -> FB.anyInt32be `shouldParseWith` (B.reverse (w32leAsByteString i), i) + \(i :: Int32) -> FB.getInt32be `shouldParseWith` (B.reverse (w32leAsByteString i), i) prop "parses Int64s (LE)" $ do - \(i :: Int64) -> FB.anyInt64le `shouldParseWith` (w64leAsByteString i, i) + \(i :: Int64) -> FB.getInt64le `shouldParseWith` (w64leAsByteString i, i) prop "parses Int64s (BE)" $ do - \(i :: Int64) -> FB.anyInt64be `shouldParseWith` (B.reverse (w64leAsByteString i), i) + \(i :: Int64) -> FB.getInt64be `shouldParseWith` (B.reverse (w64leAsByteString i), i) describe "Combinators" $ do describe "Functor instance" $ do it "fmaps over the result" $ - ((+ 2) <$> FB.readInt) `shouldParseWith` ("2", 4) + ((+ 2) <$> FB.getAsciiDecimalInt) `shouldParseWith` ("2", 4) describe "Applicative instance" $ do it "combines using <*>" $ - ((+) <$> FB.readInt <* $(FB.string "+") <*> FB.readInt) + ((+) <$> FB.getAsciiDecimalInt <* $(FB.getStringOf "+") <*> FB.getAsciiDecimalInt) `shouldParseWith` ("2+3", 5) describe "Monad instance" $ do it "combines with a do block" $ do let parser = do - i <- FB.readInt - $(FB.string "+") - j <- FB.readInt + i <- FB.getAsciiDecimalInt + $(FB.getStringOf "+") + j <- FB.getAsciiDecimalInt pure (i + j) parser `shouldParseWith` ("2+3", 5) describe "(<|>)" $ do it "chooses first option on success" $ - (("A" <$ $(FB.string "foo")) FB.<|> ("B" <$ $(FB.string "foo"))) + (("A" <$ $(FB.getStringOf "foo")) <|> ("B" <$ $(FB.getStringOf "foo"))) `shouldParseWith` ("foo", "A") it "chooses second option when first fails" $ - (("A" <$ $(FB.string "bar")) FB.<|> ("B" <$ $(FB.string "foo"))) + (("A" <$ $(FB.getStringOf "bar")) <|> ("B" <$ $(FB.getStringOf "foo"))) `shouldParseWith` ("foo", "B") describe "branch" $ do @@ -505,28 +508,28 @@ basicSpec = describe "FlatParse.Basic" $ do describe "chainl" $ do it "parses a chain of numbers" $ - FB.chainl (+) FB.readInt ($(FB.char '+') *> FB.readInt) + FB.chainl (+) FB.getAsciiDecimalInt ($(FB.getCharOf '+') *> FB.getAsciiDecimalInt) `shouldParseWith` ("1+2+3", 6) - it "allows the right chain to be FB.empty" $ - FB.chainl (+) FB.readInt ($(FB.char '+') *> FB.readInt) + it "allows the right chain to be empty" $ + FB.chainl (+) FB.getAsciiDecimalInt ($(FB.getCharOf '+') *> FB.getAsciiDecimalInt) `shouldParseWith` ("1", 1) it "requires at least the leftmost parser to match" $ - FB.chainl (+) FB.readInt ($(FB.char '+') *> FB.readInt) + FB.chainl (+) FB.getAsciiDecimalInt ($(FB.getCharOf '+') *> FB.getAsciiDecimalInt) `shouldParseFail` "" describe "chainr" $ do it "parses a chain of numbers" $ - FB.chainr (+) (FB.readInt <* $(FB.char '+')) FB.readInt + FB.chainr (+) (FB.getAsciiDecimalInt <* $(FB.getCharOf '+')) FB.getAsciiDecimalInt `shouldParseWith` ("1+2+3", 6) - it "allows the left chain to be FB.empty" $ - FB.chainr (+) (FB.readInt <* $(FB.char '+')) FB.readInt + it "allows the left chain to be empty" $ + FB.chainr (+) (FB.getAsciiDecimalInt <* $(FB.getCharOf '+')) FB.getAsciiDecimalInt `shouldParseWith` ("1", 1) it "requires at least the rightmost parser to match" $ - FB.chainr (+) (FB.readInt <* $(FB.char '+')) FB.readInt + FB.chainr (+) (FB.getAsciiDecimalInt <* $(FB.getCharOf '+')) FB.getAsciiDecimalInt `shouldParseFail` "" describe "many" $ do @@ -563,19 +566,19 @@ basicSpec = describe "FlatParse.Basic" $ do describe "notFollowedBy" $ do it "succeeds when it should" $ - FB.readInt `FB.notFollowedBy` $(FB.char '.') `shouldParsePartial` "123+5" + FB.getAsciiDecimalInt `FB.notFollowedBy` $(FB.getCharOf '.') `shouldParsePartial` "123+5" it "fails when first parser doesn't match" $ - FB.readInt `FB.notFollowedBy` $(FB.char '.') `shouldParseFail` "a" + FB.getAsciiDecimalInt `FB.notFollowedBy` $(FB.getCharOf '.') `shouldParseFail` "a" it "fails when followed by the wrong thing" $ - FB.readInt `FB.notFollowedBy` $(FB.char '.') `shouldParseFail` "123.0" + FB.getAsciiDecimalInt `FB.notFollowedBy` $(FB.getCharOf '.') `shouldParseFail` "123.0" describe "isolate" $ do - prop "isolate takeRestBs is identity" $ do + prop "isolate takeRest is identity" $ do \(bs :: ByteString) -> - FB.isolate (B.length bs) FB.takeRestBs `shouldParseWith` (bs, bs) - prop "isolate takeBs length is identity" $ do + FB.isolate (B.length bs) FB.takeRest `shouldParseWith` (bs, bs) + prop "isolate take length is identity" $ do \(bs :: ByteString) -> - FB.isolate (B.length bs) (FB.takeBs (B.length bs)) `shouldParseWith` (bs, bs) + FB.isolate (B.length bs) (FB.take (B.length bs)) `shouldParseWith` (bs, bs) describe "Positions and spans" $ do describe "Pos Ord instance" $ do @@ -631,10 +634,10 @@ basicSpec = describe "FlatParse.Basic" $ do describe "traceLine" $ do pure () - describe "takeRest" $ do + describe "takeRestString" $ do pure () - describe "traceRest" $ do + describe "traceRestString" $ do pure () describe "String conversions" $ do @@ -649,8 +652,9 @@ basicSpec = describe "FlatParse.Basic" $ do -- use Int#/Int64# directly because Word8# -> Int# is annoying on old GHCs let bs = B.pack [ 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 , 0xFF, 0x31, 0x32, 0x33, 0x00, 0xFF] - p = FB.withAddr# $ \addr# -> FB.withAnyInt64# $ \os# -> - FB.lookaheadFromAddr# addr# $ FB.atSkip# os# $ FB.anyCString + p = FB.withAddr# $ \addr# -> FB.withInt64 $ \os -> + let !(I# os#) = fromIntegral os + in FB.lookaheadFromAddr# addr# $ FB.atSkip# os# $ FB.getCString p `shouldParsePartialWith` (bs, "123") -------------------------------------------------------------------------------- From a634c0d36e89b18001473f3ba4d4b9d99ae7d48f Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Wed, 26 Oct 2022 13:28:32 +0100 Subject: [PATCH 02/11] fix benchmarks --- bench/Bench.hs | 8 ++++---- bench/FPBasic.hs | 8 ++++---- bench/FPStateful.hs | 5 ++++- bench/ReadInteger.hs | 4 ++-- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/bench/Bench.hs b/bench/Bench.hs index 7ec34ea..bf38a1f 100644 --- a/bench/Bench.hs +++ b/bench/Bench.hs @@ -10,7 +10,7 @@ import qualified Data.ByteString.Char8 as B import qualified Attoparsec import qualified Megaparsec import qualified Parsec -import qualified FPStateful +--import qualified FPStateful import qualified FPBasic import qualified ReadInteger @@ -31,7 +31,7 @@ main :: IO () main = defaultMain [ bgroup "sexp" [ bench "fpbasic" $ whnf FPBasic.runSexp sexpInp, - bench "fpstateful" $ whnf FPStateful.runSexp sexpInp, + --bench "fpstateful" $ whnf FPStateful.runSexp sexpInp, bench "attoparsec" $ whnf Attoparsec.runSexp sexpInp, bench "megaparsec" $ whnf Megaparsec.runSexp sexpInp, bench "parsec" $ whnf Parsec.runSexp sexpInp @@ -39,7 +39,7 @@ main = defaultMain [ bgroup "long keyword" [ bench "fpbasic" $ whnf FPBasic.runLongws longwsInp, - bench "fpstateful" $ whnf FPStateful.runLongws longwsInp, + --bench "fpstateful" $ whnf FPStateful.runLongws longwsInp, bench "attoparsec" $ whnf Attoparsec.runLongws longwsInp, bench "megaparsec" $ whnf Megaparsec.runLongws longwsInp, bench "parsec" $ whnf Parsec.runLongws longwsInp @@ -47,7 +47,7 @@ main = defaultMain [ bgroup "numeral csv" [ bench "fpbasic" $ whnf FPBasic.runNumcsv numcsvInp, - bench "fpstateful" $ whnf FPStateful.runNumcsv numcsvInp, + --bench "fpstateful" $ whnf FPStateful.runNumcsv numcsvInp, bench "attoparsec" $ whnf Attoparsec.runNumcsv numcsvInp, bench "megaparsec" $ whnf Megaparsec.runNumcsv numcsvInp, bench "parsec" $ whnf Parsec.runNumcsv numcsvInp diff --git a/bench/FPBasic.hs b/bench/FPBasic.hs index d555b44..b837822 100644 --- a/bench/FPBasic.hs +++ b/bench/FPBasic.hs @@ -7,18 +7,18 @@ module FPBasic ( import FlatParse.Basic ws = many_ $(switch [| case _ of " " -> pure (); "\n" -> pure () |]) -open = $(char '(') >> ws -close = $(char ')') >> ws +open = $(getCharOf '(') >> ws +close = $(getCharOf ')') >> ws ident = some_ (satisfyASCII_ isLatinLetter) >> ws sexp = branch open (some_ sexp >> close) ident src = sexp >> eof runSexp = runParser src -longw = $(string "thisisalongkeyword") +longw = $(getStringOf "thisisalongkeyword") longws = some_ (longw >> ws) >> eof runLongws = runParser longws numeral = some_ (satisfyASCII_ isDigit) >> ws -comma = $(char ',') >> ws +comma = $(getCharOf ',') >> ws numcsv = numeral >> many_ (comma >> numeral) >> eof runNumcsv = runParser numcsv diff --git a/bench/FPStateful.hs b/bench/FPStateful.hs index 79e80b3..2632946 100644 --- a/bench/FPStateful.hs +++ b/bench/FPStateful.hs @@ -1,4 +1,4 @@ - +{- module FPStateful ( runSexp , runLongws @@ -25,3 +25,6 @@ numeral = some_ (satisfyASCII_ isDigit) >> ws comma = $(char ',') >> ws numcsv = numeral >> many_ (comma >> numeral) >> eof runNumcsv = runParser numcsv () 0 +-} + +module FPStateful where diff --git a/bench/ReadInteger.hs b/bench/ReadInteger.hs index 7109ff8..5f0988c 100644 --- a/bench/ReadInteger.hs +++ b/bench/ReadInteger.hs @@ -3,5 +3,5 @@ module ReadInteger where import FlatParse.Basic as FPBasic -readInt = runParser FPBasic.readInt -readInteger = runParser FPBasic.readInteger +readInt = runParser FPBasic.getAsciiDecimalInt +readInteger = runParser FPBasic.getAsciiDecimalInteger From 3a00c4f31288e2ce3491895496f03f060db0dbd6 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Wed, 26 Oct 2022 13:53:16 +0100 Subject: [PATCH 03/11] CI: +GHC 9.4 --- .github/workflows/haskell.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/haskell.yml b/.github/workflows/haskell.yml index 77dfefd..df8aa0e 100644 --- a/.github/workflows/haskell.yml +++ b/.github/workflows/haskell.yml @@ -13,7 +13,7 @@ jobs: strategy: matrix: cabal: ["3.6"] - ghc: ["8.6.5", "8.8.4", "8.10.7", "9.0.2", "9.2.3"] + ghc: ["8.6.5", "8.8.4", "8.10.7", "9.0.2", "9.2.3", "9.4.2"] env: CONFIG: "--enable-tests --enable-benchmarks" steps: From 5aef3551cc303db2383f76616ad7e6f2e2a7ce21 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Wed, 26 Oct 2022 13:53:29 +0100 Subject: [PATCH 04/11] CI: build with LLVM --- .github/workflows/haskell.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/haskell.yml b/.github/workflows/haskell.yml index df8aa0e..3cb84ad 100644 --- a/.github/workflows/haskell.yml +++ b/.github/workflows/haskell.yml @@ -15,7 +15,7 @@ jobs: cabal: ["3.6"] ghc: ["8.6.5", "8.8.4", "8.10.7", "9.0.2", "9.2.3", "9.4.2"] env: - CONFIG: "--enable-tests --enable-benchmarks" + CONFIG: "--enable-tests --enable-benchmarks --flags llvm" steps: - uses: actions/checkout@v2.3.4 - uses: actions/cache@v2 From 64e1b0427ea1610a17cebca79c30a9724c4d9eb6 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Thu, 27 Oct 2022 18:16:42 +0100 Subject: [PATCH 05/11] split Addr# primitives into module --- flatparse.cabal | 1 + src/FlatParse/Basic.hs | 8 ++-- src/FlatParse/Basic/Addr.hs | 83 +++++++++++++++++++++++++++++++++ src/FlatParse/Basic/Internal.hs | 57 ++++------------------ 4 files changed, 99 insertions(+), 50 deletions(-) create mode 100644 src/FlatParse/Basic/Addr.hs diff --git a/flatparse.cabal b/flatparse.cabal index 4ea5335..84d133f 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -44,6 +44,7 @@ flag llvm library exposed-modules: FlatParse.Basic + FlatParse.Basic.Addr FlatParse.Basic.Chars FlatParse.Basic.Integers FlatParse.Basic.Internal diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index 14858ce..08df697 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -115,14 +115,15 @@ module FlatParse.Basic ( , ensureBytes# -- ** Unboxed arguments - , takeBs# + , take# , atSkip# -- ** Location & address primitives , setBack# , withAddr# - , takeBsOffAddr# + , takeOffAddr# + , withOffAddr# , lookaheadFromAddr# , atAddr# @@ -164,6 +165,7 @@ import FlatParse.Basic.Integers import FlatParse.Basic.Internal import FlatParse.Basic.Chars import FlatParse.Basic.Position +import FlatParse.Basic.Addr -- | Higher-level boxed data type for parsing results. data Result e a = @@ -268,7 +270,7 @@ eof = Parser \fp eob s -> case eqAddr# eob s of -- -- Throws a runtime error if given a negative integer. take :: Int -> Parser e B.ByteString -take (I# n#) = takeBs# n# +take (I# n#) = take# n# {-# inline take #-} -- | Consume the rest of the input. May return the empty bytestring. diff --git a/src/FlatParse/Basic/Addr.hs b/src/FlatParse/Basic/Addr.hs new file mode 100644 index 0000000..a8e0138 --- /dev/null +++ b/src/FlatParse/Basic/Addr.hs @@ -0,0 +1,83 @@ +{- | Highly dangerous parsing primitives using 'Addr#'. + +Ensure to read the documentation before using any definitions from this module. + +This module exports primitives useful for efficiently parsing binary files that +store data using an internal index. + +Often, such indices describes records using a starting offset and a length. +Offsets are often relative to the file start, or some dynamic address in the +file. This way, individual records can be read out efficiently (much faster than +opening lots of small files!). + +We may parse these in-place extremely efficiently by adding record offsets to a +base memory address somewhere in the parsing data. This is also extremely +unsafe, and easy to get catastrophically wrong. Thus, we provide as much utility +as reasonable to enable performing such parsing safely. (That means CPS +functions.) +-} + +module FlatParse.Basic.Addr where + +import FlatParse.Basic.Parser +import FlatParse.Basic.Internal + +import GHC.Exts + +import qualified Data.ByteString as B + +-- | Run a parser, passing it the current address the parser is at. +-- +-- Useful for parsing offset-based data tables. For example, you may use this to +-- save the base address to use together with various relative offsets. +withAddr# :: (Addr# -> Parser e a) -> Parser e a +withAddr# p = Parser \fp eob s -> runParser# (p s) fp eob s +{-# inline withAddr# #-} + +-- | @takeOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ +-- bytes, reads @len#@ bytes into a 'ByteString', and restores the original +-- address. +-- +-- The 'Addr#' should be from 'withAddr#'. +-- +-- Useful for parsing offset-based data tables. Ex: Your file contains an index +-- storing @(OFFSET, LENGTH)@ entries where the offset is the byte position in +-- the file. Begin with @'withAddr#' $ \tableBase# -> ...@, then read each entry +-- like @'takeOffAddr#' tableBase# OFFSET LENGTH@. +-- +-- Fails if you attempt to read outside the parsing data. +-- +-- Name adopted from the similar-ish @indexXOffAddr#@ primops. +takeOffAddr# :: Addr# -> Int# -> Int# -> Parser e B.ByteString +takeOffAddr# addr# offset# len# = withOffAddr# addr# offset# (take# len#) +{-# inline takeOffAddr# #-} + +-- | @withOffAddr# addr# offset# p@ moves to @addr#@, skips @offset#@ +-- bytes, then runs the given parser @p@. +-- +-- The 'Addr#' should be from 'withAddr#'. +-- +-- Fails if you attempt to read outside the parsing data. +-- +-- Name adopted from the similar-ish @indexXOffAddr#@ primops. +withOffAddr# :: Addr# -> Int# -> Parser e a -> Parser e a +withOffAddr# addr# offset# = + lookaheadFromAddr# addr# . atSkip# offset# +{-# inline withOffAddr# #-} + +-- | 'lookahead', but specify the address to lookahead from. +-- +-- The 'Addr#' should be from 'withAddr#'. +lookaheadFromAddr# :: Addr# -> Parser e a -> Parser e a +lookaheadFromAddr# s = lookahead . atAddr# s +{-# inline lookaheadFromAddr# #-} + +-- | Run a parser at the given address. +-- +-- The 'Addr#' should be from 'withAddr#'. +-- +-- This is a highly internal function -- you likely want 'lookaheadFromAddr#', +-- which will reset the address after running the parser. +atAddr# :: Addr# -> Parser e a -> Parser e a +atAddr# s (Parser p) = Parser \fp eob _ -> p fp eob s +{-# inline atAddr# #-} diff --git a/src/FlatParse/Basic/Internal.hs b/src/FlatParse/Basic/Internal.hs index c8fee49..02992b4 100644 --- a/src/FlatParse/Basic/Internal.hs +++ b/src/FlatParse/Basic/Internal.hs @@ -71,21 +71,23 @@ withNotEob (Parser p) = Parser \fp eob s -> case eqAddr# eob s of -------------------------------------------------------------------------------- -- Low level unboxed combinators --- | Read the given number of bytes as a 'ByteString'. +-- | Read @n@ bytes as a 'ByteString'. Fails if newer than @n@ bytes are +-- available. -- -- Throws a runtime error if given a negative integer. -takeBs# :: Int# -> Parser e B.ByteString -takeBs# n# = withPosInt# n# takeBsUnsafe# -{-# inline takeBs# #-} +take# :: Int# -> Parser e B.ByteString +take# n# = withPosInt# n# takeUnsafe# +{-# inline take# #-} --- | Read the given number of bytes as a 'ByteString'. +-- | Read @n@ bytes as a 'ByteString'. Fails if newer than @n@ bytes are +-- available. -- -- Undefined behaviour if given a negative integer. -takeBsUnsafe# :: Int# -> Parser e B.ByteString -takeBsUnsafe# n# = Parser \fp eob s -> case n# <=# minusAddr# eob s of +takeUnsafe# :: Int# -> Parser e B.ByteString +takeUnsafe# n# = Parser \fp eob s -> case n# <=# minusAddr# eob s of 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) _ -> Fail# -{-# inline takeBsUnsafe# #-} +{-# inline takeUnsafe# #-} -- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ -- bytes are available. @@ -112,45 +114,6 @@ skip# :: Int# -> Parser e () skip# os# = atSkip# os# (pure ()) {-# inline skip# #-} --- | Run a parser, passing it the current address the parser is at. --- --- Useful for parsing offset-based data tables. For example, you may use this to --- save the base address to use together with various 0-indexed offsets. -withAddr# :: (Addr# -> Parser e a) -> Parser e a -withAddr# p = Parser \fp eob s -> runParser# (p s) fp eob s -{-# inline withAddr# #-} - --- | @takeBsOffAddr# addr# offset# len#@ moves to @addr#@, skips @offset#@ --- bytes, reads @len#@ bytes into a 'ByteString', and restores the original --- address. --- --- The 'Addr#' should be from 'withAddr#'. --- --- Useful for parsing offset-based data tables. For example, you may use this --- together with 'withAddr#' to jump to an offset in your input and read some --- data. -takeBsOffAddr# :: Addr# -> Int# -> Int# -> Parser e B.ByteString -takeBsOffAddr# addr# offset# len# = - lookaheadFromAddr# addr# $ atSkip# offset# $ takeBs# len# -{-# inline takeBsOffAddr# #-} - --- | 'lookahead', but specify the address to lookahead from. --- --- The 'Addr#' should be from 'withAddr#'. -lookaheadFromAddr# :: Addr# -> Parser e a -> Parser e a -lookaheadFromAddr# s = lookahead . atAddr# s -{-# inline lookaheadFromAddr# #-} - --- | Run a parser at the given address. --- --- The 'Addr#' should be from 'withAddr#'. --- --- This is a highly internal function -- you likely want 'lookaheadFromAddr#', --- which will reset the address after running the parser. -atAddr# :: Addr# -> Parser e a -> Parser e a -atAddr# s (Parser p) = Parser \fp eob _ -> p fp eob s -{-# inline atAddr# #-} - -------------------------------------------------------------------------------- -- Low-level boxed combinators From 82e58011739e93664539c21650ee6619a3a41df1 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Thu, 27 Oct 2022 19:13:24 +0100 Subject: [PATCH 06/11] cleanup --- src/FlatParse/Basic.hs | 130 +++++++++++++++++++------------- src/FlatParse/Basic/Integers.hs | 4 +- src/FlatParse/Basic/Internal.hs | 46 ----------- src/FlatParse/Basic/Parser.hs | 15 +--- src/FlatParse/Basic/Position.hs | 5 +- 5 files changed, 89 insertions(+), 111 deletions(-) diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index 08df697..d9bef66 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -10,45 +10,57 @@ denoted by a @#@ hash suffix. module FlatParse.Basic ( - -- * Parser types and constructors - type Parser(..) - , type Res# - , pattern OK# - , pattern Fail# - , pattern Err# - , Result(..) + -- * Parser monad + type Parser - -- * Running parsers + -- ** Executing parsers + , Result(..) , runParser , runParserS -- * Errors and failures - , failed - , Control.Applicative.empty , err , lookahead , fails , try - , Control.Applicative.optional , optional_ , withOption , cut , cutting - -- * Basic lexing and parsing + -- * Combinators + , (Control.Applicative.<|>) + , Control.Applicative.empty + , branch + , chainl + , chainr + , Control.Applicative.many + , many_ + , Control.Applicative.some + , some_ + , notFollowedBy + , isolate + + -- * Primitive parsers , eof - , take - , takeRest - , skip , switch , switchWithPost , rawSwitchWithPost - , getCharOf + -- ** Byte-wise + , take + , takeRest + , skip , getBytesOf , getByteStringOf - , getStringOf + , getCString + -- ** Machine integers + , module FlatParse.Basic.Integers + + -- ** 'Char', 'String' + , getCharOf + , getStringOf , getChar , getChar_ , getCharASCII @@ -56,12 +68,9 @@ module FlatParse.Basic ( , getAsciiDecimalInt , getAsciiDecimalInteger , getAsciiHexInt - , getCString - , Common.isDigit , Common.isGreekLetter , Common.isLatinLetter - , satisfy , satisfy_ , satisfyASCII @@ -69,35 +78,13 @@ module FlatParse.Basic ( , fusedSatisfy , fusedSatisfy_ - -- * Combinators - , (<|>) - , branch - , chainl - , chainr - , Control.Applicative.many - , many_ - , Control.Applicative.some - , some_ - , notFollowedBy - , isolate - - -- * Positions and spans - , Pos(..) - , Span(..) - , getPos - , setPos - , endPos - , spanOf - , withSpan - , byteStringOf - , withByteString - , inSpan + -- ** Positions and spans + , module FlatParse.Basic.Position -- ** Position and span conversions , validPos , posLineCols , unsafeSpanToByteString - , unsafeSlice , mkPos , FlatParse.Basic.lines @@ -121,17 +108,11 @@ module FlatParse.Basic ( -- ** Location & address primitives , setBack# - , withAddr# - , takeOffAddr# - , withOffAddr# - , lookaheadFromAddr# - , atAddr# + , module FlatParse.Basic.Addr -- ** Unsafe , getCStringUnsafe - , module FlatParse.Basic.Integers - ) where import Prelude hiding ( take, getChar ) @@ -580,7 +561,7 @@ genSwitchTrie' postAction cases fallback = Nothing -> pure ((Just i, rhs), (i, str)) Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str)) - !m = M.fromList ((Nothing, maybe (VarE 'failed) id fallback) : branches) + !m = M.fromList ((Nothing, maybe (VarE 'empty) id fallback) : branches) !trie = compileTrie strings in (m , trie) @@ -728,3 +709,48 @@ unsafeSpanToByteString :: Span -> Parser e B.ByteString unsafeSpanToByteString (Span l r) = lookahead (setPos l >> byteStringOf (setPos r)) {-# inline unsafeSpanToByteString #-} + +-------------------------------------------------------------------------------- +-- Low-level boxed combinators + +-- | Read a null-terminated bytestring (a C-style string). +-- +-- Consumes the null terminator. +getCString :: Parser e B.ByteString +getCString = Parser \fp eob s -> go' fp eob s + where + go' fp eob s0 = go 0# s0 + where + go n# s = case eqAddr# eob s of + 1# -> Fail# + _ -> + let s' = plusAddr# s 1# + w# = indexWord8OffAddr# s 0# + in if W8# w# == 0x00 + then OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' + else go (n# +# 1#) s' +{-# inline getCString #-} + +-- | Read a null-terminated bytestring (a C-style string), where the bytestring +-- is known to be null-terminated somewhere in the input. +-- +-- Undefined behaviour if your bytestring isn't null-terminated somewhere. +-- You almost certainly want 'getCString' instead. +-- +-- Fails on GHC versions older than 9.0, since we make use of the +-- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful +-- without it. +-- +-- Consumes the null terminator. +getCStringUnsafe :: Parser e B.ByteString +{-# inline getCStringUnsafe #-} +#if MIN_VERSION_base(4,15,0) +getCStringUnsafe = Parser \fp eob s -> + case eqAddr# eob s of + 1# -> Fail# + _ -> let n# = cstringLength# s + s'# = plusAddr# s (n# +# 1#) + in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# +#else +getCStringUnsafe = error "Flatparse.Basic.getCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" +#endif diff --git a/src/FlatParse/Basic/Integers.hs b/src/FlatParse/Basic/Integers.hs index f5d7516..6e52487 100644 --- a/src/FlatParse/Basic/Integers.hs +++ b/src/FlatParse/Basic/Integers.hs @@ -60,6 +60,8 @@ import GHC.Int import FlatParse.Basic.Parser import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 ) +import Control.Applicative ( Alternative(empty) ) + -------------------------------------------------------------------------------- -- | Helper for defining CPS parsers for types of a constant byte size (i.e. @@ -289,7 +291,7 @@ getSizedOfUnsafe# size# indexOffAddr aExpected = go aParsed = if aParsed == aExpected then pure () - else failed + else empty {-# inline getSizedOfUnsafe# #-} -- | Read the next 1 byte and assert its value as a 'Word8'. diff --git a/src/FlatParse/Basic/Internal.hs b/src/FlatParse/Basic/Internal.hs index 02992b4..ec8691f 100644 --- a/src/FlatParse/Basic/Internal.hs +++ b/src/FlatParse/Basic/Internal.hs @@ -5,7 +5,6 @@ module FlatParse.Basic.Internal where import FlatParse.Basic.Parser import GHC.Exts -import GHC.Word import GHC.ForeignPtr import qualified Data.ByteString as B @@ -113,48 +112,3 @@ atSkipUnsafe# os# (Parser p) = Parser \fp eob s -> case os# <=# minusAddr# eob s skip# :: Int# -> Parser e () skip# os# = atSkip# os# (pure ()) {-# inline skip# #-} - --------------------------------------------------------------------------------- --- Low-level boxed combinators - --- | Read a null-terminated bytestring (a C-style string). --- --- Consumes the null terminator. -getCString :: Parser e B.ByteString -getCString = Parser \fp eob s -> go' fp eob s - where - go' fp eob s0 = go 0# s0 - where - go n# s = case eqAddr# eob s of - 1# -> Fail# - _ -> - let s' = plusAddr# s 1# - w# = indexWord8OffAddr# s 0# - in if W8# w# == 0x00 - then OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' - else go (n# +# 1#) s' -{-# inline getCString #-} - --- | Read a null-terminated bytestring (a C-style string), where the bytestring --- is known to be null-terminated somewhere in the input. --- --- Undefined behaviour if your bytestring isn't null-terminated somewhere. --- You almost certainly want 'getCString' instead. --- --- Fails on GHC versions older than 9.0, since we make use of the --- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful --- without it. --- --- Consumes the null terminator. -getCStringUnsafe :: Parser e B.ByteString -{-# inline getCStringUnsafe #-} -#if MIN_VERSION_base(4,15,0) -getCStringUnsafe = Parser \fp eob s -> - case eqAddr# eob s of - 1# -> Fail# - _ -> let n# = cstringLength# s - s'# = plusAddr# s (n# +# 1#) - in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# -#else -getCStringUnsafe = error "Flatparse.Basic.getCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" -#endif diff --git a/src/FlatParse/Basic/Parser.hs b/src/FlatParse/Basic/Parser.hs index a3117cd..4c08aef 100644 --- a/src/FlatParse/Basic/Parser.hs +++ b/src/FlatParse/Basic/Parser.hs @@ -6,9 +6,6 @@ module FlatParse.Basic.Parser Parser(..) , Res# , pattern OK#, pattern Err#, pattern Fail# - - -- * Primitive combinators - , failed ) where import GHC.Exts ( Addr#, unsafeCoerce# ) @@ -61,8 +58,9 @@ instance Monad (Parser e) where (>>) = (*>) {-# inline (>>) #-} +-- | By default, parser choice `(<|>)` arbitrarily backtracks on parser failure. instance Base.Alternative (Parser e) where - empty = failed + empty = Parser \fp eob s -> Fail# {-# inline empty #-} (<|>) = (<|>) @@ -81,12 +79,7 @@ instance Base.Alternative (Parser e) where some p = (:) <$> p <*> Base.many p {-# inline some #-} --- | The failing parser. By default, parser choice `(<|>)` arbitrarily backtracks --- on parser failure. -failed :: Parser e a -failed = Parser \fp eob s -> Fail# -{-# inline failed #-} - +-- TODO don't get how to handle this, whether I can inline it safely infixr 6 <|> (<|>) :: Parser e a -> Parser e a -> Parser e a (<|>) (Parser f) (Parser g) = Parser \fp eob s -> @@ -102,7 +95,7 @@ infixr 6 <|> #-} instance MonadPlus (Parser e) where - mzero = failed + mzero = Base.empty {-# inline mzero #-} mplus = (<|>) {-# inline mplus #-} diff --git a/src/FlatParse/Basic/Position.hs b/src/FlatParse/Basic/Position.hs index f566610..3b1c473 100644 --- a/src/FlatParse/Basic/Position.hs +++ b/src/FlatParse/Basic/Position.hs @@ -1,4 +1,7 @@ -module FlatParse.Basic.Position where +module FlatParse.Basic.Position + ( module FlatParse.Common.Position + , module FlatParse.Basic.Position + ) where import FlatParse.Basic.Parser import FlatParse.Common.Position From 9b9bc66053b896c4d40bdc7f635afd2307f8f6df Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Thu, 27 Oct 2022 20:18:58 +0100 Subject: [PATCH 07/11] more cleanup --- flatparse.cabal | 4 +- src/FlatParse/Basic.hs | 243 ++----------------- src/FlatParse/Basic/Bytes.hs | 54 +++++ src/FlatParse/Basic/Internal.hs | 21 +- src/FlatParse/Basic/{Chars.hs => Strings.hs} | 43 +++- src/FlatParse/BasicString.hs | 137 +++++++++++ src/FlatParse/Examples/BasicLambda/Lexer.hs | 5 +- test/Test.hs | 13 +- 8 files changed, 270 insertions(+), 250 deletions(-) create mode 100644 src/FlatParse/Basic/Bytes.hs rename src/FlatParse/Basic/{Chars.hs => Strings.hs} (81%) create mode 100644 src/FlatParse/BasicString.hs diff --git a/flatparse.cabal b/flatparse.cabal index 84d133f..11fd3ae 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -45,11 +45,13 @@ library exposed-modules: FlatParse.Basic FlatParse.Basic.Addr - FlatParse.Basic.Chars + FlatParse.Basic.Bytes FlatParse.Basic.Integers FlatParse.Basic.Internal FlatParse.Basic.Parser FlatParse.Basic.Position + FlatParse.Basic.Strings + FlatParse.BasicString FlatParse.Common.Assorted FlatParse.Common.Numbers FlatParse.Common.Position diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index d9bef66..1e1ff4d 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -16,13 +16,13 @@ module FlatParse.Basic ( -- ** Executing parsers , Result(..) , runParser - , runParserS -- * Errors and failures , err , lookahead , fails , try + , Control.Applicative.optional , optional_ , withOption , cut @@ -40,6 +40,7 @@ module FlatParse.Basic ( , some_ , notFollowedBy , isolate + , isolateUnsafe# -- * Primitive parsers , eof @@ -49,11 +50,14 @@ module FlatParse.Basic ( -- ** Byte-wise , take + , take# , takeRest , skip + , atSkip# , getBytesOf , getByteStringOf , getCString + , getCStringUnsafe -- ** Machine integers , module FlatParse.Basic.Integers @@ -81,49 +85,30 @@ module FlatParse.Basic ( -- ** Positions and spans , module FlatParse.Basic.Position - -- ** Position and span conversions - , validPos - , posLineCols - , unsafeSpanToByteString - , mkPos - , FlatParse.Basic.lines - - -- * Getting the rest of the input as a 'String' - , takeLine - , traceLine - , takeRestString - , traceRestString - - -- * `String` conversions - , packUTF8 - , unpackUTF8 - - -- * Internal functions - , ensureBytes# - - -- ** Unboxed arguments - , take# - , atSkip# - - -- ** Location & address primitives - , setBack# , module FlatParse.Basic.Addr - -- ** Unsafe - , getCStringUnsafe - ) where import Prelude hiding ( take, getChar ) +import qualified FlatParse.Common.Assorted as Common +import FlatParse.Common.Position +import FlatParse.Common.Trie + +import FlatParse.Basic.Parser +import FlatParse.Basic.Integers +import FlatParse.Basic.Internal +import FlatParse.Basic.Bytes +import FlatParse.Basic.Strings +import FlatParse.Basic.Position +import FlatParse.Basic.Addr + import Control.Applicative import Control.Monad import Data.Foldable -import Data.List (sortBy) import Data.Map (Map) -import Data.Ord (comparing) import GHC.Exts import GHC.Word import GHC.ForeignPtr ( ForeignPtr(..) ) @@ -135,19 +120,6 @@ import qualified Data.ByteString.Unsafe as B import qualified Data.ByteString.Internal as B import qualified Data.Map.Strict as M -import qualified FlatParse.Common.Numbers as Common -import qualified FlatParse.Common.Assorted as Common -import FlatParse.Common.Position -import FlatParse.Common.Trie -import FlatParse.Common.Assorted ( packBytes, splitBytes, strToBytes, packUTF8 ) - -import FlatParse.Basic.Parser -import FlatParse.Basic.Integers -import FlatParse.Basic.Internal -import FlatParse.Basic.Chars -import FlatParse.Basic.Position -import FlatParse.Basic.Addr - -- | Higher-level boxed data type for parsing results. data Result e a = OK a !(B.ByteString) -- ^ Contains return value and unconsumed input. @@ -268,16 +240,6 @@ skip :: Int -> Parser e () skip (I# os#) = atSkip# os# (pure ()) {-# inline skip #-} --- | Parse a UTF-8 character literal. This is a template function, you can use it as --- @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@. -getCharOf :: Char -> Q Exp -getCharOf c = getStringOf [c] - --- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@, --- for example, and the splice has type @Parser e ()@. -getStringOf :: String -> Q Exp -getStringOf str = getBytesOf (strToBytes str) - {-| This is a template function which makes it possible to branch on a collection of string literals in an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing @@ -345,30 +307,6 @@ rawSwitchWithPost postAction cases fallback = do !fallback <- sequence fallback genTrie $! genSwitchTrie' postAction cases fallback --- | Read a non-negative `Int` from the input, as a non-empty digit sequence. --- The `Int` may overflow in the result. -getAsciiDecimalInt :: Parser e Int -getAsciiDecimalInt = Parser \fp eob s -> case Common.readInt eob s of - (# (##) | #) -> Fail# - (# | (# n, s' #) #) -> OK# (I# n) s' -{-# inline getAsciiDecimalInt #-} - --- | Read an `Int` from the input, as a non-empty case-insensitive ASCII --- hexadecimal digit sequence. The `Int` may overflow in the result. -getAsciiHexInt :: Parser e Int -getAsciiHexInt = Parser \fp eob s -> case Common.readIntHex eob s of - (# (##) | #) -> Fail# - (# | (# n, s' #) #) -> OK# (I# n) s' -{-# inline getAsciiHexInt #-} - --- | Read a non-negative `Integer` from the input, as a non-empty digit --- sequence. -getAsciiDecimalInteger :: Parser e Integer -getAsciiDecimalInteger = Parser \fp eob s -> case Common.readInteger fp eob s of - (# (##) | #) -> Fail# - (# | (# i, s' #) #) -> OK# i s' -{-# inline getAsciiDecimalInteger #-} - -------------------------------------------------------------------------------- -- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, @@ -442,43 +380,6 @@ getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = _ -> Fail# {-# inline getByteStringOf #-} --- | Read a sequence of bytes. This is a template function, you can use it as --- @$(getBytesOf [3, 4, 5])@, for example, and the splice has type @Parser e --- ()@. -getBytesOf :: [Word] -> Q Exp -getBytesOf bytes = do - let !len = length bytes - [| ensureBytes# len >> $(scanBytes# bytes) |] - --- | Template function, creates a @Parser e ()@ which unsafely scans a given --- sequence of bytes. -scanBytes# :: [Word] -> Q Exp -scanBytes# bytes = do - let !(leading, w8s) = splitBytes bytes - !scanw8s = go w8s where - go (w8:[] ) = [| getWord64OfUnsafe w8 |] - go (w8:w8s) = [| getWord64OfUnsafe w8 >> $(go w8s) |] - go [] = [| pure () |] - case w8s of - [] -> go leading - where - go (a:b:c:d:[]) = let !w = packBytes [a, b, c, d] in [| getWord32OfUnsafe w |] - go (a:b:c:d:ws) = let !w = packBytes [a, b, c, d] in [| getWord32OfUnsafe w >> $(go ws) |] - go (a:b:[]) = let !w = packBytes [a, b] in [| getWord16OfUnsafe w |] - go (a:b:ws) = let !w = packBytes [a, b] in [| getWord16OfUnsafe w >> $(go ws) |] - go (a:[]) = [| getWord8OfUnsafe a |] - go [] = [| pure () |] - _ -> case leading of - - [] -> scanw8s - [a] -> [| getWord8OfUnsafe a >> $scanw8s |] - ws@[a, b] -> let !w = packBytes ws in [| getWord16OfUnsafe w >> $scanw8s |] - ws@[a, b, c, d] -> let !w = packBytes ws in [| getWord32OfUnsafe w >> $scanw8s |] - ws -> let !w = packBytes ws - !l = length ws - in [| scanPartial64# l w >> $scanw8s |] - - -- Switching code generation -------------------------------------------------------------------------------- @@ -503,7 +404,7 @@ genTrie (rules, t) = do fallback :: Rule -> Int -> Q Exp fallback rule 0 = pure $ VarE $ fst $ ix branches rule - fallback rule n = [| setBack# n >> $(pure $ VarE $ fst $ ix branches rule) |] + fallback rule n = [| skipBack# n >> $(pure $ VarE $ fst $ ix branches rule) |] let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp go = \case @@ -602,114 +503,6 @@ isolateUnsafe# n# p = Parser \fp eob s -> _ -> Fail# -- you tried to isolate more than we have left {-# inline isolateUnsafe# #-} --------------------------------------------------------------------------------- - --- | Convert an UTF-8-coded `B.ByteString` to a `String`. -unpackUTF8 :: B.ByteString -> String -unpackUTF8 str = case runParser takeRestString str of - OK a _ -> a - _ -> error "unpackUTF8: invalid encoding" - --- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. -takeRestString :: Parser e String -takeRestString = branch eof (pure "") do - c <- getChar - cs <- takeRestString - pure (c:cs) - --- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. --- This can be used for debugging. -traceRestString :: Parser e String -traceRestString = lookahead takeRestString - --------------------------------------------------------------------------------- - --- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, --- throws an error if the encoding is invalid. -takeLine :: Parser e String -takeLine = branch eof (pure "") do - c <- getChar - case c of - '\n' -> pure "" - _ -> (c:) <$> takeLine - --- | Parse the rest of the current line as a `String`, but restore the parsing state. --- Assumes UTF-8 encoding. This can be used for debugging. -traceLine :: Parser e String -traceLine = lookahead takeLine - --- | Run a parser on a `String` input. Reminder: @OverloadedStrings@ for `B.ByteString` does not --- yield a valid UTF-8 encoding! For non-ASCII `B.ByteString` literal input, use `runParserS` or --- `packUTF8` for testing. -runParserS :: Parser e a -> String -> Result e a -runParserS pa s = runParser pa (packUTF8 s) - --- | Create a `Pos` from a line and column number. Throws an error on out-of-bounds --- line and column numbers. -mkPos :: B.ByteString -> (Int, Int) -> Pos -mkPos str (line', col') = - let go line col | line == line' && col == col' = getPos - go line col = (do - c <- getChar - if c == '\n' then go (line + 1) 0 - else go line (col + 1)) <|> error "mkPos: invalid position" - in case runParser (go 0 0) str of - OK res _ -> res - _ -> error "impossible" - --- | Break an UTF-8-coded `B.ByteString` to lines. Throws an error on invalid --- input. This is mostly useful for grabbing specific source lines for --- displaying error messages. -lines :: B.ByteString -> [String] -lines str = - let go = ([] <$ eof) <|> ((:) <$> takeLine <*> go) - in case runParser go str of - OK ls _ -> ls - _ -> error "linesUTF8: invalid input" - - --- | Check whether a `Pos` points into a `B.ByteString`. -validPos :: B.ByteString -> Pos -> Bool -validPos str pos = - let go = do - start <- getPos - pure (start <= pos && pos <= endPos) - in case runParser go str of - OK b _ -> b - _ -> error "impossible" -{-# inline validPos #-} - --- | Compute corresponding line and column numbers for each `Pos` in a list. Throw an error --- on invalid positions. Note: computing lines and columns may traverse the `B.ByteString`, --- but it traverses it only once regardless of the length of the position list. -posLineCols :: B.ByteString -> [Pos] -> [(Int, Int)] -posLineCols str poss = - let go !line !col [] = pure [] - go line col ((i, pos):poss) = do - p <- getPos - if pos == p then - ((i, (line, col)):) <$> go line col poss - else do - c <- getChar - if '\n' == c then - go (line + 1) 0 ((i, pos):poss) - else - go line (col + 1) ((i, pos):poss) - - sorted :: [(Int, Pos)] - sorted = sortBy (comparing snd) (zip [0..] poss) - - in case runParser (go 0 0 sorted) str of - OK res _ -> snd <$> sortBy (comparing fst) res - _ -> error "invalid position" - --- | Create a `B.ByteString` from a `Span`. The result is invalid if the `Span` points --- outside the current buffer, or if the `Span` start is greater than the end position. -unsafeSpanToByteString :: Span -> Parser e B.ByteString -unsafeSpanToByteString (Span l r) = - lookahead (setPos l >> byteStringOf (setPos r)) -{-# inline unsafeSpanToByteString #-} - -------------------------------------------------------------------------------- -- Low-level boxed combinators diff --git a/src/FlatParse/Basic/Bytes.hs b/src/FlatParse/Basic/Bytes.hs new file mode 100644 index 0000000..91eafc4 --- /dev/null +++ b/src/FlatParse/Basic/Bytes.hs @@ -0,0 +1,54 @@ +module FlatParse.Basic.Bytes where + +import Language.Haskell.TH +import FlatParse.Basic.Parser +import qualified FlatParse.Common.Assorted as Common +import FlatParse.Basic.Integers ( getWord64OfUnsafe + , getWord32OfUnsafe + , getWord16OfUnsafe + , getWord8OfUnsafe ) +import GHC.Exts + +-- | Check that the input has at least the given number of bytes. +ensureBytes# :: Int -> Parser e () +ensureBytes# (I# len) = Parser \fp eob s -> + case len <=# minusAddr# eob s of + 1# -> OK# () s + _ -> Fail# +{-# inline ensureBytes# #-} + +-- | Read a sequence of bytes. This is a template function, you can use it as +-- @$(getBytesOf [3, 4, 5])@, for example, and the splice has type @Parser e +-- ()@. +getBytesOf :: [Word] -> Q Exp +getBytesOf bytes = do + let !len = length bytes + [| ensureBytes# len >> $(scanBytes# bytes) |] + +-- | Template function, creates a @Parser e ()@ which unsafely scans a given +-- sequence of bytes. +scanBytes# :: [Word] -> Q Exp +scanBytes# bytes = do + let !(leading, w8s) = Common.splitBytes bytes + !scanw8s = go w8s where + go (w8:[] ) = [| getWord64OfUnsafe w8 |] + go (w8:w8s) = [| getWord64OfUnsafe w8 >> $(go w8s) |] + go [] = [| pure () |] + case w8s of + [] -> go leading + where + go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| getWord32OfUnsafe w |] + go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| getWord32OfUnsafe w >> $(go ws) |] + go (a:b:[]) = let !w = Common.packBytes [a, b] in [| getWord16OfUnsafe w |] + go (a:b:ws) = let !w = Common.packBytes [a, b] in [| getWord16OfUnsafe w >> $(go ws) |] + go (a:[]) = [| getWord8OfUnsafe a |] + go [] = [| pure () |] + _ -> case leading of + + [] -> scanw8s + [a] -> [| getWord8OfUnsafe a >> $scanw8s |] + ws@[a, b] -> let !w = Common.packBytes ws in [| getWord16OfUnsafe w >> $scanw8s |] + ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| getWord32OfUnsafe w >> $scanw8s |] + ws -> let !w = Common.packBytes ws + !l = length ws + in [| scanPartial64# l w >> $scanw8s |] diff --git a/src/FlatParse/Basic/Internal.hs b/src/FlatParse/Basic/Internal.hs index ec8691f..7f3c4f5 100644 --- a/src/FlatParse/Basic/Internal.hs +++ b/src/FlatParse/Basic/Internal.hs @@ -22,14 +22,6 @@ lookahead (Parser f) = Parser \fp eob s -> -------------------------------------------------------------------------------- --- | Check that the input has at least the given number of bytes. -ensureBytes# :: Int -> Parser e () -ensureBytes# (I# len) = Parser \fp eob s -> - case len <=# minusAddr# eob s of - 1# -> OK# () s - _ -> Fail# -{-# inline ensureBytes# #-} - scanPartial64# :: Int -> Word -> Parser e () scanPartial64# (I# len) (W# w) = Parser \fp eob s -> case indexWordOffAddr# s 0# of @@ -41,12 +33,6 @@ scanPartial64# (I# len) (W# w) = Parser \fp eob s -> _ -> Fail# {-# inline scanPartial64# #-} --- | Decrease the current input position by the given number of bytes. -setBack# :: Int -> Parser e () -setBack# (I# i) = Parser \fp eob s -> - OK# () (plusAddr# s (negateInt# i)) -{-# inline setBack# #-} - -------------------------------------------------------------------------------- -- Helpers for common internal operations @@ -112,3 +98,10 @@ atSkipUnsafe# os# (Parser p) = Parser \fp eob s -> case os# <=# minusAddr# eob s skip# :: Int# -> Parser e () skip# os# = atSkip# os# (pure ()) {-# inline skip# #-} + +-- | Go back @n@ bytes. +-- +-- Highly unsafe. Makes no checks. +skipBack# :: Int -> Parser e () +skipBack# (I# i) = Parser \fp eob s -> OK# () (plusAddr# s (negateInt# i)) +{-# inline skipBack# #-} diff --git a/src/FlatParse/Basic/Chars.hs b/src/FlatParse/Basic/Strings.hs similarity index 81% rename from src/FlatParse/Basic/Chars.hs rename to src/FlatParse/Basic/Strings.hs index de44048..2a3bdf0 100644 --- a/src/FlatParse/Basic/Chars.hs +++ b/src/FlatParse/Basic/Strings.hs @@ -1,13 +1,20 @@ -module FlatParse.Basic.Chars where +{-# language UnboxedTuples #-} + +module FlatParse.Basic.Strings where import Prelude hiding ( getChar ) import FlatParse.Basic.Parser +import FlatParse.Basic.Bytes ( getBytesOf ) import FlatParse.Common.Assorted ( derefChar8# ) import GHC.Exts +import Language.Haskell.TH +import qualified FlatParse.Common.Numbers as Common +import qualified FlatParse.Common.Assorted as Common + -- | Parse a UTF-8 `Char` for which a predicate holds. satisfy :: (Char -> Bool) -> Parser e Char satisfy f = Parser \fp eob s -> case runParser# getChar fp eob s of @@ -169,3 +176,37 @@ getCharASCII = Parser \fp eob buf -> case eqAddr# eob buf of getCharASCII_ :: Parser e () getCharASCII_ = () <$ getCharASCII {-# inline getCharASCII_ #-} + +-- | Parse a UTF-8 character literal. This is a template function, you can use it as +-- @$(char \'x\')@, for example, and the splice in this case has type @Parser e ()@. +getCharOf :: Char -> Q Exp +getCharOf c = getStringOf [c] + +-- | Parse a UTF-8 string literal. This is a template function, you can use it as @$(string "foo")@, +-- for example, and the splice has type @Parser e ()@. +getStringOf :: String -> Q Exp +getStringOf str = getBytesOf (Common.strToBytes str) + +-- | Read a non-negative `Int` from the input, as a non-empty digit sequence. +-- The `Int` may overflow in the result. +getAsciiDecimalInt :: Parser e Int +getAsciiDecimalInt = Parser \fp eob s -> case Common.readInt eob s of + (# (##) | #) -> Fail# + (# | (# n, s' #) #) -> OK# (I# n) s' +{-# inline getAsciiDecimalInt #-} + +-- | Read an `Int` from the input, as a non-empty case-insensitive ASCII +-- hexadecimal digit sequence. The `Int` may overflow in the result. +getAsciiHexInt :: Parser e Int +getAsciiHexInt = Parser \fp eob s -> case Common.readIntHex eob s of + (# (##) | #) -> Fail# + (# | (# n, s' #) #) -> OK# (I# n) s' +{-# inline getAsciiHexInt #-} + +-- | Read a non-negative `Integer` from the input, as a non-empty digit +-- sequence. +getAsciiDecimalInteger :: Parser e Integer +getAsciiDecimalInteger = Parser \fp eob s -> case Common.readInteger fp eob s of + (# (##) | #) -> Fail# + (# | (# i, s' #) #) -> OK# i s' +{-# inline getAsciiDecimalInteger #-} diff --git a/src/FlatParse/BasicString.hs b/src/FlatParse/BasicString.hs new file mode 100644 index 0000000..5595c50 --- /dev/null +++ b/src/FlatParse/BasicString.hs @@ -0,0 +1,137 @@ +module FlatParse.BasicString + ( + runParserS + + , takeLine + , traceLine + , takeRestString + , traceRestString + + , packUTF8 + , unpackUTF8 + + , validPos + , posLineCols + , unsafeSpanToByteString + , mkPos + , lines + + ) where + +import Prelude hiding ( getChar, lines ) + +import FlatParse.Basic + +import Data.Ord ( comparing ) +import Data.List ( sortBy ) +import FlatParse.Common.Assorted ( packUTF8 ) + +import qualified Data.ByteString as B + +-- | Run a parser on a `String` input. Reminder: @OverloadedStrings@ for `B.ByteString` does not +-- yield a valid UTF-8 encoding! For non-ASCII `B.ByteString` literal input, use `runParserS` or +-- `packUTF8` for testing. +runParserS :: Parser e a -> String -> Result e a +runParserS pa s = runParser pa (packUTF8 s) + +-- | Create a `Pos` from a line and column number. Throws an error on out-of-bounds +-- line and column numbers. +mkPos :: B.ByteString -> (Int, Int) -> Pos +mkPos str (line', col') = + let go line col | line == line' && col == col' = getPos + go line col = (do + c <- getChar + if c == '\n' then go (line + 1) 0 + else go line (col + 1)) <|> error "mkPos: invalid position" + in case runParser (go 0 0) str of + OK res _ -> res + _ -> error "impossible" + +-- | Break an UTF-8-coded `B.ByteString` to lines. Throws an error on invalid +-- input. This is mostly useful for grabbing specific source lines for +-- displaying error messages. +lines :: B.ByteString -> [String] +lines str = + let go = ([] <$ eof) <|> ((:) <$> takeLine <*> go) + in case runParser go str of + OK ls _ -> ls + _ -> error "linesUTF8: invalid input" + + +-- | Check whether a `Pos` points into a `B.ByteString`. +validPos :: B.ByteString -> Pos -> Bool +validPos str pos = + let go = do + start <- getPos + pure (start <= pos && pos <= endPos) + in case runParser go str of + OK b _ -> b + _ -> error "impossible" +{-# inline validPos #-} + +-- | Compute corresponding line and column numbers for each `Pos` in a list. Throw an error +-- on invalid positions. Note: computing lines and columns may traverse the `B.ByteString`, +-- but it traverses it only once regardless of the length of the position list. +posLineCols :: B.ByteString -> [Pos] -> [(Int, Int)] +posLineCols str poss = + let go !line !col [] = pure [] + go line col ((i, pos):poss) = do + p <- getPos + if pos == p then + ((i, (line, col)):) <$> go line col poss + else do + c <- getChar + if '\n' == c then + go (line + 1) 0 ((i, pos):poss) + else + go line (col + 1) ((i, pos):poss) + + sorted :: [(Int, Pos)] + sorted = sortBy (comparing snd) (zip [0..] poss) + + in case runParser (go 0 0 sorted) str of + OK res _ -> snd <$> sortBy (comparing fst) res + _ -> error "invalid position" + +-- | Create a `B.ByteString` from a `Span`. The result is invalid if the `Span` points +-- outside the current buffer, or if the `Span` start is greater than the end position. +unsafeSpanToByteString :: Span -> Parser e B.ByteString +unsafeSpanToByteString (Span l r) = + lookahead (setPos l >> byteStringOf (setPos r)) +{-# inline unsafeSpanToByteString #-} + +-------------------------------------------------------------------------------- + +-- | Parse the rest of the current line as a `String`. Assumes UTF-8 encoding, +-- throws an error if the encoding is invalid. +takeLine :: Parser e String +takeLine = branch eof (pure "") do + c <- getChar + case c of + '\n' -> pure "" + _ -> (c:) <$> takeLine + +-- | Parse the rest of the current line as a `String`, but restore the parsing state. +-- Assumes UTF-8 encoding. This can be used for debugging. +traceLine :: Parser e String +traceLine = lookahead takeLine + +-------------------------------------------------------------------------------- + +-- | Convert an UTF-8-coded `B.ByteString` to a `String`. +unpackUTF8 :: B.ByteString -> String +unpackUTF8 str = case runParser takeRestString str of + OK a _ -> a + _ -> error "unpackUTF8: invalid encoding" + +-- | Take the rest of the input as a `String`. Assumes UTF-8 encoding. +takeRestString :: Parser e String +takeRestString = branch eof (pure "") do + c <- getChar + cs <- takeRestString + pure (c:cs) + +-- | Get the rest of the input as a `String`, but restore the parsing state. Assumes UTF-8 encoding. +-- This can be used for debugging. +traceRestString :: Parser e String +traceRestString = lookahead takeRestString diff --git a/src/FlatParse/Examples/BasicLambda/Lexer.hs b/src/FlatParse/Examples/BasicLambda/Lexer.hs index 7a3935a..75d58cf 100644 --- a/src/FlatParse/Examples/BasicLambda/Lexer.hs +++ b/src/FlatParse/Examples/BasicLambda/Lexer.hs @@ -11,6 +11,7 @@ module FlatParse.Examples.BasicLambda.Lexer where import FlatParse.Basic hiding ( Parser, runParser ) import qualified FlatParse.Basic as FP +import qualified FlatParse.BasicString as FP import qualified Data.ByteString as B import Language.Haskell.TH @@ -65,7 +66,7 @@ prettyError b e = pos = case e of Imprecise pos e -> pos Precise pos e -> pos ls = FP.lines b - (l, c) = head $ posLineCols b [pos] + (l, c) = head $ FP.posLineCols b [pos] line = if l < length ls then ls !! l else "" linum = show l lpad = map (const ' ') linum @@ -108,7 +109,7 @@ runParser = FP.runParser -- | Run parser, print pretty error on failure. testParser :: Show a => Parser a -> String -> IO () -testParser p str = case packUTF8 str of +testParser p str = case FP.packUTF8 str of b -> case runParser p b of Err e -> putStrLn $ prettyError b e OK a _ -> print a diff --git a/test/Test.hs b/test/Test.hs index da4641d..a88bf33 100644 --- a/test/Test.hs +++ b/test/Test.hs @@ -8,6 +8,7 @@ import Data.ByteString (ByteString) import qualified Data.ByteString as B import qualified Data.Char import qualified FlatParse.Basic as FB +import qualified FlatParse.BasicString as FB -- import qualified FlatParse.Stateful as FS import Test.HUnit import Test.Hspec @@ -19,8 +20,6 @@ import GHC.Int import Data.Bits import Test.QuickCheck.Instances.ByteString() -import Control.Applicative - main :: IO () main = hspec $ do basicSpec @@ -55,9 +54,9 @@ basicSpec = describe "FlatParse.Basic" $ do it "propagates errors" $ FB.optional (FB.err "nope") `shouldParseErr` "" describe "optional_" $ do - it "can succeed" $ FB.optional (pure ()) `shouldParse` "" - it "can succeed when argument missing" $ FB.optional FB.empty `shouldParse` "" - it "propagates errors" $ FB.optional (FB.err "nope") `shouldParseErr` "" + it "can succeed" $ FB.optional_ (pure ()) `shouldParse` "" + it "can succeed when argument missing" $ FB.optional_ FB.empty `shouldParse` "" + it "propagates errors" $ FB.optional_ (FB.err "nope") `shouldParseErr` "" describe "withOption" $ do let opt p = FB.withOption p (pure . reverse) (pure "bar") @@ -491,11 +490,11 @@ basicSpec = describe "FlatParse.Basic" $ do describe "(<|>)" $ do it "chooses first option on success" $ - (("A" <$ $(FB.getStringOf "foo")) <|> ("B" <$ $(FB.getStringOf "foo"))) + (("A" <$ $(FB.getStringOf "foo")) FB.<|> ("B" <$ $(FB.getStringOf "foo"))) `shouldParseWith` ("foo", "A") it "chooses second option when first fails" $ - (("A" <$ $(FB.getStringOf "bar")) <|> ("B" <$ $(FB.getStringOf "foo"))) + (("A" <$ $(FB.getStringOf "bar")) FB.<|> ("B" <$ $(FB.getStringOf "foo"))) `shouldParseWith` ("foo", "B") describe "branch" $ do From f6e8091a2e7a7183d17bcca15a434926640bf7a0 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Thu, 27 Oct 2022 21:03:47 +0100 Subject: [PATCH 08/11] more fiddling --- TODO.md | 1 - flatparse.cabal | 3 +- src/FlatParse/Basic.hs | 416 +---------------------------- src/FlatParse/Basic/Addr.hs | 2 +- src/FlatParse/Basic/Bytes.hs | 11 + src/FlatParse/Basic/Combinators.hs | 255 ++++++++++++++++++ src/FlatParse/Basic/Internal.hs | 107 -------- src/FlatParse/Basic/Other.hs | 251 +++++++++++++++++ 8 files changed, 526 insertions(+), 520 deletions(-) delete mode 100644 TODO.md create mode 100644 src/FlatParse/Basic/Combinators.hs delete mode 100644 src/FlatParse/Basic/Internal.hs create mode 100644 src/FlatParse/Basic/Other.hs diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 4501b18..0000000 --- a/TODO.md +++ /dev/null @@ -1 +0,0 @@ - * add unsafeTakeBs# which doesn't assert >=0. can cause hangs so bad diff --git a/flatparse.cabal b/flatparse.cabal index 11fd3ae..06147b5 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -46,8 +46,9 @@ library FlatParse.Basic FlatParse.Basic.Addr FlatParse.Basic.Bytes + FlatParse.Basic.Combinators FlatParse.Basic.Integers - FlatParse.Basic.Internal + FlatParse.Basic.Other FlatParse.Basic.Parser FlatParse.Basic.Position FlatParse.Basic.Strings diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index 1e1ff4d..82b316f 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -1,11 +1,8 @@ {-# language UnboxedTuples #-} {-| -This module implements a `Parser` supporting custom error types. If you need efficient indentation -parsing, use "FlatParse.Stateful" instead. - -Many internals are exposed for hacking on and extending. These are generally -denoted by a @#@ hash suffix. +This module implements a `Parser` supporting custom error types. If you need +efficient indentation parsing, use "FlatParse.Stateful" instead. -} module FlatParse.Basic ( @@ -54,6 +51,7 @@ module FlatParse.Basic ( , takeRest , skip , atSkip# + , skipBack# , getBytesOf , getByteStringOf , getCString @@ -94,11 +92,11 @@ import Prelude hiding ( take, getChar ) import qualified FlatParse.Common.Assorted as Common import FlatParse.Common.Position -import FlatParse.Common.Trie import FlatParse.Basic.Parser import FlatParse.Basic.Integers -import FlatParse.Basic.Internal +import FlatParse.Basic.Combinators +import FlatParse.Basic.Other import FlatParse.Basic.Bytes import FlatParse.Basic.Strings import FlatParse.Basic.Position @@ -106,19 +104,13 @@ import FlatParse.Basic.Addr import Control.Applicative -import Control.Monad -import Data.Foldable -import Data.Map (Map) import GHC.Exts -import GHC.Word -import GHC.ForeignPtr ( ForeignPtr(..) ) -import Language.Haskell.TH import System.IO.Unsafe import qualified Data.ByteString as B import qualified Data.ByteString.Unsafe as B import qualified Data.ByteString.Internal as B -import qualified Data.Map.Strict as M +import GHC.ForeignPtr ( ForeignPtr(..) ) -- | Higher-level boxed data type for parsing results. data Result e a = @@ -135,7 +127,6 @@ instance Functor (Result e) where (<$) _ r = unsafeCoerce# r {-# inline (<$) #-} - -------------------------------------------------------------------------------- -- | Run a parser. @@ -152,398 +143,3 @@ runParser (Parser f) b@(B.PS (ForeignPtr _ fp) _ (I# len)) = unsafeDupablePerfor Fail# -> pure Fail {-# inlinable runParser #-} - - --------------------------------------------------------------------------------- - --- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack --- on parser error. Use `try` to convert an error to a recoverable failure. -err :: e -> Parser e a -err e = Parser \fp eob s -> Err# e -{-# inline err #-} - --- | Convert a parsing failure to a success. -fails :: Parser e a -> Parser e () -fails (Parser f) = Parser \fp eob s -> - case f fp eob s of - OK# _ _ -> Fail# - Fail# -> OK# () s - Err# e -> Err# e -{-# inline fails #-} - --- | Convert a parsing error into failure. -try :: Parser e a -> Parser e a -try (Parser f) = Parser \fp eob s -> case f fp eob s of - Err# _ -> Fail# - x -> x -{-# inline try #-} - --- | Convert a parsing failure to a `()`. -optional_ :: Parser e a -> Parser e () -optional_ p = (() <$ p) <|> pure () -{-# inline optional_ #-} - --- | CPS'd version of `optional`. This is usually more efficient, since it gets --- rid of the extra `Maybe` allocation. -withOption :: Parser e a -> (a -> Parser e r) -> Parser e r -> Parser e r -withOption (Parser f) just (Parser nothing) = Parser \fp eob s -> case f fp eob s of - OK# a s -> runParser# (just a) fp eob s - Fail# -> nothing fp eob s - Err# e -> Err# e -{-# inline withOption #-} - --- | Convert a parsing failure to an error. -cut :: Parser e a -> e -> Parser e a -cut (Parser f) e = Parser \fp eob s -> case f fp eob s of - Fail# -> Err# e - x -> x -{-# inline cut #-} - --- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the --- inner and the newly given errors using the @e -> e -> e@ function. This can be useful for --- implementing parsing errors which may propagate hints or accummulate contextual information. -cutting :: Parser e a -> e -> (e -> e -> e) -> Parser e a -cutting (Parser f) e merge = Parser \fp eob s -> case f fp eob s of - Fail# -> Err# e - Err# e' -> let !e'' = merge e' e in Err# e'' - x -> x -{-# inline cutting #-} - --------------------------------------------------------------------------------- - - --- | Succeed if the input is empty. -eof :: Parser e () -eof = Parser \fp eob s -> case eqAddr# eob s of - 1# -> OK# () s - _ -> Fail# -{-# inline eof #-} - --- | Read the given number of bytes as a 'ByteString'. --- --- Throws a runtime error if given a negative integer. -take :: Int -> Parser e B.ByteString -take (I# n#) = take# n# -{-# inline take #-} - --- | Consume the rest of the input. May return the empty bytestring. -takeRest :: Parser e B.ByteString -takeRest = Parser \fp eob s -> - let n# = minusAddr# eob s - in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) eob -{-# inline takeRest #-} - --- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. --- --- Throws a runtime error if given a negative integer. -skip :: Int -> Parser e () -skip (I# os#) = atSkip# os# (pure ()) -{-# inline skip #-} - -{-| -This is a template function which makes it possible to branch on a collection of string literals in -an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing -operations, which has optimized control flow, vectorized reads and grouped checking for needed input -bytes. - -The syntax is slightly magical, it overloads the usual @case@ expression. An example: - -@ - $(switch [| case _ of - "foo" -> pure True - "bar" -> pure False |]) -@ - -The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally -we may have a default case, like in - -@ - $(switch [| case _ of - "foo" -> pure 10 - "bar" -> pure 20 - _ -> pure 30 |]) -@ - -All case right hand sides must be parsers with the same type. That type is also the type -of the whole `switch` expression. - -A `switch` has longest match semantics, and the order of cases does not matter, except for -the default case, which may only appear as the last case. - -If a `switch` does not have a default case, and no case matches the input, then it returns with -failure, \without\ having consumed any input. A fallthrough to the default case also does not -consume any input. --} -switch :: Q Exp -> Q Exp -switch = switchWithPost Nothing - -{-| -Switch expression with an optional first argument for performing a post-processing action after -every successful branch matching, not including the default branch. For example, if we have -@ws :: Parser e ()@ for a whitespace parser, we might want to consume whitespace after matching -on any of the switch cases. For that case, we can define a "lexeme" version of `switch` as -follows. - -@ - switch' :: Q Exp -> Q Exp - switch' = switchWithPost (Just [| ws |]) -@ - -Note that this @switch'@ function cannot be used in the same module it's defined in, because of the -stage restriction of Template Haskell. --} -switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp -switchWithPost postAction exp = do - !postAction <- sequence postAction - (!cases, !fallback) <- parseSwitch exp - genTrie $! genSwitchTrie' postAction cases fallback - --- | Version of `switchWithPost` without syntactic sugar. The second argument is the --- list of cases, the third is the default case. -rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp -rawSwitchWithPost postAction cases fallback = do - !postAction <- sequence postAction - !cases <- forM cases \(str, rhs) -> (str,) <$> rhs - !fallback <- sequence fallback - genTrie $! genSwitchTrie' postAction cases fallback - --------------------------------------------------------------------------------- - --- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, --- and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not --- the usual `chainl` function from the parsec libraries! -chainl :: (b -> a -> b) -> Parser e b -> Parser e a -> Parser e b -chainl f start elem = start >>= go where - go b = do {!a <- elem; go $! f b a} <|> pure b -{-# inline chainl #-} - --- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and --- combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not --- the usual `chainr` function from the parsec libraries! -chainr :: (a -> b -> b) -> Parser e a -> Parser e b -> Parser e b -chainr f (Parser elem) (Parser end) = Parser go where - go fp eob s = case elem fp eob s of - OK# a s -> case go fp eob s of - OK# b s -> let !b' = f a b in OK# b' s - x -> x - Fail# -> end fp eob s - Err# e -> Err# e -{-# inline chainr #-} - --- | Skip a parser zero or more times. -many_ :: Parser e a -> Parser e () -many_ (Parser f) = Parser go where - go fp eob s = case f fp eob s of - OK# a s -> go fp eob s - Fail# -> OK# () s - Err# e -> Err# e -{-# inline many_ #-} - --- | Skip a parser one or more times. -some_ :: Parser e a -> Parser e () -some_ pa = pa >> many_ pa -{-# inline some_ #-} - --- | Succeed if the first parser succeeds and the second one fails. -notFollowedBy :: Parser e a -> Parser e b -> Parser e a -notFollowedBy p1 p2 = p1 <* fails p2 -{-# inline notFollowedBy #-} - --------------------------------------------------------------------------------- - --- | Parse a given `B.ByteString`. If the bytestring is statically known, --- consider using 'bytes' instead. -getByteStringOf :: B.ByteString -> Parser e () -getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = - - let go64 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go64 bs bsend s w = - let bs' = plusAddr# bs 8# in - case gtAddr# bs' bsend of - 1# -> go8 bs bsend s w - _ -> if W64# (indexWord64OffAddr# bs 0#) == W64# (indexWord64OffAddr# s 0#) - then go64 bs' bsend (plusAddr# s 8#) w - else (# Fail#, w #) - - go8 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go8 bs bsend s w = - case ltAddr# bs bsend of - 1# -> if W8# (indexWord8OffAddr# bs 0#) == W8# (indexWord8OffAddr# s 0#) - then go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) w - else (# Fail#, w #) - _ -> (# OK# () s, w #) - - in Parser \fp eob s -> case len <=# minusAddr# eob s of - 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s w of - (# res, w #) -> case touch# fcontent w of - w -> res - _ -> Fail# -{-# inline getByteStringOf #-} - --- Switching code generation --------------------------------------------------------------------------------- - -#if MIN_VERSION_base(4,15,0) -mkDoE = DoE Nothing -{-# inline mkDoE #-} -#else -mkDoE = DoE -{-# inline mkDoE #-} -#endif - -genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp -genTrie (rules, t) = do - branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules - - let ix m k = case M.lookup k m of - Nothing -> error ("key not in map: " ++ show k) - Just a -> a - - let ensure :: Maybe Int -> Maybe (Q Exp) - ensure = fmap (\n -> [| ensureBytes# n |]) - - fallback :: Rule -> Int -> Q Exp - fallback rule 0 = pure $ VarE $ fst $ ix branches rule - fallback rule n = [| skipBack# n >> $(pure $ VarE $ fst $ ix branches rule) |] - - let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp - go = \case - Branch' (r, n, alloc) ts - | M.null ts -> pure $ VarE $ fst $ branches M.! r - | otherwise -> do - !next <- (traverse . traverse) go (M.toList ts) - !defaultCase <- fallback r (n + 1) - - let cases = mkDoE $ - [BindS (VarP (mkName "c")) (VarE 'getWord8Unsafe), - NoBindS (CaseE (VarE (mkName "c")) - (map (\(w, t) -> - Match (LitP (IntegerL (fromIntegral w))) - (NormalB t) - []) - next - ++ [Match WildP (NormalB defaultCase) []]))] - - case ensure alloc of - Nothing -> pure cases - Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |] - - Path (r, n, alloc) ws t -> - case ensure alloc of - Nothing -> [| branch $(scanBytes# ws) $(go t) $(fallback r n)|] - Just alloc -> [| branch ($alloc >> $(scanBytes# ws)) $(go t) $(fallback r n) |] - - letE - (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches)) - (go t) - -parseSwitch :: Q Exp -> Q ([(String, Exp)], Maybe Exp) -parseSwitch exp = exp >>= \case - CaseE (UnboundVarE _) [] -> error "switch: empty clause list" - CaseE (UnboundVarE _) cases -> do - (!cases, !last) <- pure (init cases, last cases) - !cases <- forM cases \case - Match (LitP (StringL str)) (NormalB rhs) [] -> pure (str, rhs) - _ -> error "switch: expected a match clause on a string literal" - (!cases, !last) <- case last of - Match (LitP (StringL str)) (NormalB rhs) [] -> pure (cases ++ [(str, rhs)], Nothing) - Match WildP (NormalB rhs) [] -> pure (cases, Just rhs) - _ -> error "switch: expected a match clause on a string literal or a wildcard" - pure (cases, last) - _ -> error "switch: expected a \"case _ of\" expression" - -genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp - -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -genSwitchTrie' postAction cases fallback = - - let (!branches, !strings) = unzip do - (!i, (!str, !rhs)) <- zip [0..] cases - case postAction of - Nothing -> pure ((Just i, rhs), (i, str)) - Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str)) - - !m = M.fromList ((Nothing, maybe (VarE 'empty) id fallback) : branches) - !trie = compileTrie strings - in (m , trie) - --------------------------------------------------------------------------------- - --- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. --- This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not --- backtrack from the true/false cases. -branch :: Parser e a -> Parser e b -> Parser e b -> Parser e b -branch pa pt pf = Parser \fp eob s -> case runParser# pa fp eob s of - OK# _ s -> runParser# pt fp eob s - Fail# -> runParser# pf fp eob s - Err# e -> Err# e -{-# inline branch #-} - --- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All --- isolated bytes must be consumed. --- --- Throws a runtime error if given a negative integer. -isolate :: Int -> Parser e a -> Parser e a -isolate (I# n#) p = withPosInt# n# (\n'# -> isolateUnsafe# n'# p) -{-# inline isolate #-} - --- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All --- isolated bytes must be consumed. --- --- Undefined behaviour if given a negative integer. -isolateUnsafe# :: Int# -> Parser e a -> Parser e a -isolateUnsafe# n# p = Parser \fp eob s -> - let s' = plusAddr# s n# - in case n# <=# minusAddr# eob s of - 1# -> case runParser# p fp s' s of - OK# a s'' -> case eqAddr# s' s'' of - 1# -> OK# a s'' - _ -> Fail# -- isolated segment wasn't fully consumed - Fail# -> Fail# - Err# e -> Err# e - _ -> Fail# -- you tried to isolate more than we have left -{-# inline isolateUnsafe# #-} - --------------------------------------------------------------------------------- --- Low-level boxed combinators - --- | Read a null-terminated bytestring (a C-style string). --- --- Consumes the null terminator. -getCString :: Parser e B.ByteString -getCString = Parser \fp eob s -> go' fp eob s - where - go' fp eob s0 = go 0# s0 - where - go n# s = case eqAddr# eob s of - 1# -> Fail# - _ -> - let s' = plusAddr# s 1# - w# = indexWord8OffAddr# s 0# - in if W8# w# == 0x00 - then OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' - else go (n# +# 1#) s' -{-# inline getCString #-} - --- | Read a null-terminated bytestring (a C-style string), where the bytestring --- is known to be null-terminated somewhere in the input. --- --- Undefined behaviour if your bytestring isn't null-terminated somewhere. --- You almost certainly want 'getCString' instead. --- --- Fails on GHC versions older than 9.0, since we make use of the --- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful --- without it. --- --- Consumes the null terminator. -getCStringUnsafe :: Parser e B.ByteString -{-# inline getCStringUnsafe #-} -#if MIN_VERSION_base(4,15,0) -getCStringUnsafe = Parser \fp eob s -> - case eqAddr# eob s of - 1# -> Fail# - _ -> let n# = cstringLength# s - s'# = plusAddr# s (n# +# 1#) - in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# -#else -getCStringUnsafe = error "Flatparse.Basic.getCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" -#endif diff --git a/src/FlatParse/Basic/Addr.hs b/src/FlatParse/Basic/Addr.hs index a8e0138..0d76c55 100644 --- a/src/FlatParse/Basic/Addr.hs +++ b/src/FlatParse/Basic/Addr.hs @@ -20,7 +20,7 @@ functions.) module FlatParse.Basic.Addr where import FlatParse.Basic.Parser -import FlatParse.Basic.Internal +import FlatParse.Basic.Combinators ( take#, atSkip#, lookahead ) import GHC.Exts diff --git a/src/FlatParse/Basic/Bytes.hs b/src/FlatParse/Basic/Bytes.hs index 91eafc4..7c47195 100644 --- a/src/FlatParse/Basic/Bytes.hs +++ b/src/FlatParse/Basic/Bytes.hs @@ -52,3 +52,14 @@ scanBytes# bytes = do ws -> let !w = Common.packBytes ws !l = length ws in [| scanPartial64# l w >> $scanw8s |] + +scanPartial64# :: Int -> Word -> Parser e () +scanPartial64# (I# len) (W# w) = Parser \fp eob s -> + case indexWordOffAddr# s 0# of + w' -> case uncheckedIShiftL# (8# -# len) 3# of + sh -> case uncheckedShiftL# w' sh of + w' -> case uncheckedShiftRL# w' sh of + w' -> case eqWord# w w' of + 1# -> OK# () (plusAddr# s len) + _ -> Fail# +{-# inline scanPartial64# #-} diff --git a/src/FlatParse/Basic/Combinators.hs b/src/FlatParse/Basic/Combinators.hs new file mode 100644 index 0000000..16a4e5d --- /dev/null +++ b/src/FlatParse/Basic/Combinators.hs @@ -0,0 +1,255 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Basic.Combinators where + +import FlatParse.Basic.Parser +import FlatParse.Basic.Integers +import FlatParse.Basic.Bytes + +import FlatParse.Common.Trie + +import GHC.Exts +import GHC.Word +import Control.Applicative ( empty ) +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B +import GHC.ForeignPtr ( ForeignPtr(..) ) +import qualified Data.Foldable +import Control.Monad ( forM ) +import Control.Applicative ( (<|>) ) + +import Language.Haskell.TH +import qualified Data.Map.Strict as M +import Data.Map.Strict ( Map ) + +-- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack +-- on parser error. Use `try` to convert an error to a recoverable failure. +err :: e -> Parser e a +err e = Parser \fp eob s -> Err# e +{-# inline err #-} + +-- | Convert a parsing failure to a success. +fails :: Parser e a -> Parser e () +fails (Parser f) = Parser \fp eob s -> + case f fp eob s of + OK# _ _ -> Fail# + Fail# -> OK# () s + Err# e -> Err# e +{-# inline fails #-} + +-- | Convert a parsing error into failure. +try :: Parser e a -> Parser e a +try (Parser f) = Parser \fp eob s -> case f fp eob s of + Err# _ -> Fail# + x -> x +{-# inline try #-} + +-- | Convert a parsing failure to a `()`. +optional_ :: Parser e a -> Parser e () +optional_ p = (() <$ p) <|> pure () +{-# inline optional_ #-} + +-- | CPS'd version of `optional`. This is usually more efficient, since it gets +-- rid of the extra `Maybe` allocation. +withOption :: Parser e a -> (a -> Parser e r) -> Parser e r -> Parser e r +withOption (Parser f) just (Parser nothing) = Parser \fp eob s -> case f fp eob s of + OK# a s -> runParser# (just a) fp eob s + Fail# -> nothing fp eob s + Err# e -> Err# e +{-# inline withOption #-} + +-- | Convert a parsing failure to an error. +cut :: Parser e a -> e -> Parser e a +cut (Parser f) e = Parser \fp eob s -> case f fp eob s of + Fail# -> Err# e + x -> x +{-# inline cut #-} + +-- | Run the parser, if we get a failure, throw the given error, but if we get an error, merge the +-- inner and the newly given errors using the @e -> e -> e@ function. This can be useful for +-- implementing parsing errors which may propagate hints or accummulate contextual information. +cutting :: Parser e a -> e -> (e -> e -> e) -> Parser e a +cutting (Parser f) e merge = Parser \fp eob s -> case f fp eob s of + Fail# -> Err# e + Err# e' -> let !e'' = merge e' e in Err# e'' + x -> x +{-# inline cutting #-} + +-------------------------------------------------------------------------------- + +-- | Save the parsing state, then run a parser, then restore the state. +lookahead :: Parser e a -> Parser e a +lookahead (Parser f) = Parser \fp eob s -> + case f fp eob s of + OK# a _ -> OK# a s + x -> x +{-# inline lookahead #-} + +-- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All +-- isolated bytes must be consumed. +-- +-- Throws a runtime error if given a negative integer. +isolate :: Int -> Parser e a -> Parser e a +isolate (I# n#) p = withPosInt# n# (\n'# -> isolateUnsafe# n'# p) +{-# inline isolate #-} + +-- | @isolate n p@ runs the parser @p@ isolated to the next @n@ bytes. All +-- isolated bytes must be consumed. +-- +-- Undefined behaviour if given a negative integer. +isolateUnsafe# :: Int# -> Parser e a -> Parser e a +isolateUnsafe# n# p = Parser \fp eob s -> + let s' = plusAddr# s n# + in case n# <=# minusAddr# eob s of + 1# -> case runParser# p fp s' s of + OK# a s'' -> case eqAddr# s' s'' of + 1# -> OK# a s'' + _ -> Fail# -- isolated segment wasn't fully consumed + Fail# -> Fail# + Err# e -> Err# e + _ -> Fail# -- you tried to isolate more than we have left +{-# inline isolateUnsafe# #-} + +-- | An analogue of the list `foldl` function: first parse a @b@, then parse zero or more @a@-s, +-- and combine the results in a left-nested way by the @b -> a -> b@ function. Note: this is not +-- the usual `chainl` function from the parsec libraries! +chainl :: (b -> a -> b) -> Parser e b -> Parser e a -> Parser e b +chainl f start elem = start >>= go where + go b = do {!a <- elem; go $! f b a} <|> pure b +{-# inline chainl #-} + +-- | An analogue of the list `foldr` function: parse zero or more @a@-s, terminated by a @b@, and +-- combine the results in a right-nested way using the @a -> b -> b@ function. Note: this is not +-- the usual `chainr` function from the parsec libraries! +chainr :: (a -> b -> b) -> Parser e a -> Parser e b -> Parser e b +chainr f (Parser elem) (Parser end) = Parser go where + go fp eob s = case elem fp eob s of + OK# a s -> case go fp eob s of + OK# b s -> let !b' = f a b in OK# b' s + x -> x + Fail# -> end fp eob s + Err# e -> Err# e +{-# inline chainr #-} + +-- | Branch on a parser: if the first argument succeeds, continue with the second, else with the third. +-- This can produce slightly more efficient code than `(<|>)`. Moreover, `ḃranch` does not +-- backtrack from the true/false cases. +branch :: Parser e a -> Parser e b -> Parser e b -> Parser e b +branch pa pt pf = Parser \fp eob s -> case runParser# pa fp eob s of + OK# _ s -> runParser# pt fp eob s + Fail# -> runParser# pf fp eob s + Err# e -> Err# e +{-# inline branch #-} + +-- | Skip a parser zero or more times. +many_ :: Parser e a -> Parser e () +many_ (Parser f) = Parser go where + go fp eob s = case f fp eob s of + OK# a s -> go fp eob s + Fail# -> OK# () s + Err# e -> Err# e +{-# inline many_ #-} + +-- | Skip a parser one or more times. +some_ :: Parser e a -> Parser e () +some_ pa = pa >> many_ pa +{-# inline some_ #-} + +-- | Succeed if the first parser succeeds and the second one fails. +notFollowedBy :: Parser e a -> Parser e b -> Parser e a +notFollowedBy p1 p2 = p1 <* fails p2 +{-# inline notFollowedBy #-} + +-------------------------------------------------------------------------------- + +-- | Succeed if the input is empty. +eof :: Parser e () +eof = Parser \fp eob s -> case eqAddr# eob s of + 1# -> OK# () s + _ -> Fail# +{-# inline eof #-} + +-- | Read the given number of bytes as a 'ByteString'. +-- +-- Throws a runtime error if given a negative integer. +take :: Int -> Parser e B.ByteString +take (I# n#) = take# n# +{-# inline take #-} + +-- | Read @n@ bytes as a 'ByteString'. Fails if newer than @n@ bytes are +-- available. +-- +-- Throws a runtime error if given a negative integer. +take# :: Int# -> Parser e B.ByteString +take# n# = withPosInt# n# takeUnsafe# +{-# inline take# #-} + +-- | Read @n@ bytes as a 'ByteString'. Fails if newer than @n@ bytes are +-- available. +-- +-- Undefined behaviour if given a negative integer. +takeUnsafe# :: Int# -> Parser e B.ByteString +takeUnsafe# n# = Parser \fp eob s -> case n# <=# minusAddr# eob s of + 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) + _ -> Fail# +{-# inline takeUnsafe# #-} + +-- | Consume the rest of the input. May return the empty bytestring. +takeRest :: Parser e B.ByteString +takeRest = Parser \fp eob s -> + let n# = minusAddr# eob s + in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) eob +{-# inline takeRest #-} + +-- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. +-- +-- Throws a runtime error if given a negative integer. +skip :: Int -> Parser e () +skip (I# os#) = skip# os# +{-# inline skip #-} + +-- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. +-- +-- Throws a runtime error if given a negative integer. +skip# :: Int# -> Parser e () +skip# os# = atSkip# os# (pure ()) +{-# inline skip# #-} + +-- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ +-- bytes are available. +-- +-- Throws a runtime error if given a negative integer. +atSkip# :: Int# -> Parser e a -> Parser e a +atSkip# os# p = withPosInt# os# (\n# -> atSkipUnsafe# n# p) +{-# inline atSkip# #-} + +-- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ +-- bytes are available. +-- +-- Undefined behaviour if given a negative integer. +atSkipUnsafe# :: Int# -> Parser e a -> Parser e a +atSkipUnsafe# os# (Parser p) = Parser \fp eob s -> + case os# <=# minusAddr# eob s of + 1# -> p fp eob (plusAddr# s os#) + _ -> Fail# +{-# inline atSkipUnsafe# #-} + +-- | Go back @n@ bytes. +-- +-- Highly unsafe. Makes no checks. +skipBack# :: Int -> Parser e () +skipBack# (I# i) = Parser \fp eob s -> OK# () (plusAddr# s (negateInt# i)) +{-# inline skipBack# #-} + +-------------------------------------------------------------------------------- +-- Helpers for common internal operations + +-- | Assert for the given 'Int#' that @n >= 0@, and pass it on to the given +-- function. +-- +-- Throws a runtime error if given a negative integer. +withPosInt# :: Int# -> (Int# -> a) -> a +withPosInt# n# f = case n# >=# 0# of + 1# -> f n# + _ -> error "FlatParse.Basic.Internal.withPosInt#: negative integer" +{-# inline withPosInt# #-} diff --git a/src/FlatParse/Basic/Internal.hs b/src/FlatParse/Basic/Internal.hs deleted file mode 100644 index 7f3c4f5..0000000 --- a/src/FlatParse/Basic/Internal.hs +++ /dev/null @@ -1,107 +0,0 @@ -{-# language UnboxedTuples #-} - -module FlatParse.Basic.Internal where - -import FlatParse.Basic.Parser - -import GHC.Exts -import GHC.ForeignPtr - -import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as B - --------------------------------------------------------------------------------- - --- | Save the parsing state, then run a parser, then restore the state. -lookahead :: Parser e a -> Parser e a -lookahead (Parser f) = Parser \fp eob s -> - case f fp eob s of - OK# a _ -> OK# a s - x -> x -{-# inline lookahead #-} - --------------------------------------------------------------------------------- - -scanPartial64# :: Int -> Word -> Parser e () -scanPartial64# (I# len) (W# w) = Parser \fp eob s -> - case indexWordOffAddr# s 0# of - w' -> case uncheckedIShiftL# (8# -# len) 3# of - sh -> case uncheckedShiftL# w' sh of - w' -> case uncheckedShiftRL# w' sh of - w' -> case eqWord# w w' of - 1# -> OK# () (plusAddr# s len) - _ -> Fail# -{-# inline scanPartial64# #-} - --------------------------------------------------------------------------------- --- Helpers for common internal operations - --- | Assert for the given 'Int#' that @n >= 0@, and pass it on to the given --- function. --- --- Throws a runtime error if given a negative integer. -withPosInt# :: Int# -> (Int# -> a) -> a -withPosInt# n# f = case n# >=# 0# of - 1# -> f n# - _ -> error "FlatParse.Basic.Internal.withPosInt#: negative integer" -{-# inline withPosInt# #-} - --- | Run the given parser only if we have not yet reached the end of the buffer. -withNotEob :: Parser e a -> Parser e a -withNotEob (Parser p) = Parser \fp eob s -> case eqAddr# eob s of - 1# -> Fail# - _ -> p fp eob s -{-# inline withNotEob #-} - --------------------------------------------------------------------------------- --- Low level unboxed combinators - --- | Read @n@ bytes as a 'ByteString'. Fails if newer than @n@ bytes are --- available. --- --- Throws a runtime error if given a negative integer. -take# :: Int# -> Parser e B.ByteString -take# n# = withPosInt# n# takeUnsafe# -{-# inline take# #-} - --- | Read @n@ bytes as a 'ByteString'. Fails if newer than @n@ bytes are --- available. --- --- Undefined behaviour if given a negative integer. -takeUnsafe# :: Int# -> Parser e B.ByteString -takeUnsafe# n# = Parser \fp eob s -> case n# <=# minusAddr# eob s of - 1# -> OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) (plusAddr# s n#) - _ -> Fail# -{-# inline takeUnsafe# #-} - --- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ --- bytes are available. --- --- Throws a runtime error if given a negative integer. -atSkip# :: Int# -> Parser e a -> Parser e a -atSkip# os# p = withPosInt# os# (\n# -> atSkipUnsafe# n# p) -{-# inline atSkip# #-} - --- | Skip forward @n@ bytes and run the given parser. Fails if fewer than @n@ --- bytes are available. --- --- Undefined behaviour if given a negative integer. -atSkipUnsafe# :: Int# -> Parser e a -> Parser e a -atSkipUnsafe# os# (Parser p) = Parser \fp eob s -> case os# <=# minusAddr# eob s of - 1# -> p fp eob (plusAddr# s os#) - _ -> Fail# -{-# inline atSkipUnsafe# #-} - --- | Skip forward @n@ bytes. Fails if fewer than @n@ bytes are available. --- --- Throws a runtime error if given a negative integer. -skip# :: Int# -> Parser e () -skip# os# = atSkip# os# (pure ()) -{-# inline skip# #-} - --- | Go back @n@ bytes. --- --- Highly unsafe. Makes no checks. -skipBack# :: Int -> Parser e () -skipBack# (I# i) = Parser \fp eob s -> OK# () (plusAddr# s (negateInt# i)) -{-# inline skipBack# #-} diff --git a/src/FlatParse/Basic/Other.hs b/src/FlatParse/Basic/Other.hs new file mode 100644 index 0000000..4cb108c --- /dev/null +++ b/src/FlatParse/Basic/Other.hs @@ -0,0 +1,251 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Basic.Other where + +import FlatParse.Basic.Parser + +-- TODO TH!! +import FlatParse.Basic.Combinators ( branch, skipBack# ) +import FlatParse.Basic.Bytes ( scanBytes#, ensureBytes# ) +import FlatParse.Basic.Integers ( getWord8Unsafe ) + +import FlatParse.Common.Trie + +import GHC.Exts +import GHC.Word +import qualified Data.ByteString as B +import qualified Data.ByteString.Internal as B +import GHC.ForeignPtr ( ForeignPtr(..) ) +import Control.Applicative ( empty ) +import qualified Data.Foldable +import Control.Monad ( forM ) + +import Language.Haskell.TH +import qualified Data.Map.Strict as M +import Data.Map.Strict ( Map ) + +-- | Parse a given `B.ByteString`. If the bytestring is statically known, +-- consider using 'bytes' instead. +getByteStringOf :: B.ByteString -> Parser e () +getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = + + let go64 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) + go64 bs bsend s w = + let bs' = plusAddr# bs 8# in + case gtAddr# bs' bsend of + 1# -> go8 bs bsend s w + _ -> if W64# (indexWord64OffAddr# bs 0#) == W64# (indexWord64OffAddr# s 0#) + then go64 bs' bsend (plusAddr# s 8#) w + else (# Fail#, w #) + + go8 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) + go8 bs bsend s w = + case ltAddr# bs bsend of + 1# -> if W8# (indexWord8OffAddr# bs 0#) == W8# (indexWord8OffAddr# s 0#) + then go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) w + else (# Fail#, w #) + _ -> (# OK# () s, w #) + + in Parser \fp eob s -> case len <=# minusAddr# eob s of + 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s w of + (# res, w #) -> case touch# fcontent w of + w -> res + _ -> Fail# +{-# inline getByteStringOf #-} + +-- | Read a null-terminated bytestring (a C-style string). +-- +-- Consumes the null terminator. +getCString :: Parser e B.ByteString +getCString = Parser \fp eob s -> go' fp eob s + where + go' fp eob s0 = go 0# s0 + where + go n# s = case eqAddr# eob s of + 1# -> Fail# + _ -> + let s' = plusAddr# s 1# + w# = indexWord8OffAddr# s 0# + in if W8# w# == 0x00 + then OK# (B.PS (ForeignPtr s0 fp) 0 (I# n#)) s' + else go (n# +# 1#) s' +{-# inline getCString #-} + +-- | Read a null-terminated bytestring (a C-style string), where the bytestring +-- is known to be null-terminated somewhere in the input. +-- +-- Undefined behaviour if your bytestring isn't null-terminated somewhere. +-- You almost certainly want 'getCString' instead. +-- +-- Fails on GHC versions older than 9.0, since we make use of the +-- 'cstringLength#' primop introduced in GHC 9.0, and we aren't very useful +-- without it. +-- +-- Consumes the null terminator. +getCStringUnsafe :: Parser e B.ByteString +{-# inline getCStringUnsafe #-} +#if MIN_VERSION_base(4,15,0) +getCStringUnsafe = Parser \fp eob s -> + case eqAddr# eob s of + 1# -> Fail# + _ -> let n# = cstringLength# s + s'# = plusAddr# s (n# +# 1#) + in OK# (B.PS (ForeignPtr s fp) 0 (I# n#)) s'# +#else +getCStringUnsafe = error "Flatparse.Basic.Combinators.getCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" +#endif + +-- Switching code generation +-------------------------------------------------------------------------------- + +#if MIN_VERSION_base(4,15,0) +mkDoE = DoE Nothing +{-# inline mkDoE #-} +#else +mkDoE = DoE +{-# inline mkDoE #-} +#endif + +genTrie :: (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) -> Q Exp +genTrie (rules, t) = do + branches <- traverse (\e -> (,) <$> (newName "rule") <*> pure e) rules + + let ix m k = case M.lookup k m of + Nothing -> error ("key not in map: " ++ show k) + Just a -> a + + let ensure :: Maybe Int -> Maybe (Q Exp) + ensure = fmap (\n -> [| ensureBytes# n |]) + + fallback :: Rule -> Int -> Q Exp + fallback rule 0 = pure $ VarE $ fst $ ix branches rule + fallback rule n = [| skipBack# n >> $(pure $ VarE $ fst $ ix branches rule) |] + + let go :: Trie' (Rule, Int, Maybe Int) -> Q Exp + go = \case + Branch' (r, n, alloc) ts + | M.null ts -> pure $ VarE $ fst $ branches M.! r + | otherwise -> do + !next <- (traverse . traverse) go (M.toList ts) + !defaultCase <- fallback r (n + 1) + + let cases = mkDoE $ + [BindS (VarP (mkName "c")) (VarE 'getWord8Unsafe), + NoBindS (CaseE (VarE (mkName "c")) + (map (\(w, t) -> + Match (LitP (IntegerL (fromIntegral w))) + (NormalB t) + []) + next + ++ [Match WildP (NormalB defaultCase) []]))] + + case ensure alloc of + Nothing -> pure cases + Just alloc -> [| branch $alloc $(pure cases) $(fallback r n) |] + + Path (r, n, alloc) ws t -> + case ensure alloc of + Nothing -> [| branch $(scanBytes# ws) $(go t) $(fallback r n)|] + Just alloc -> [| branch ($alloc >> $(scanBytes# ws)) $(go t) $(fallback r n) |] + + letE + (map (\(x, rhs) -> valD (varP x) (normalB (pure rhs)) []) (Data.Foldable.toList branches)) + (go t) + +parseSwitch :: Q Exp -> Q ([(String, Exp)], Maybe Exp) +parseSwitch exp = exp >>= \case + CaseE (UnboundVarE _) [] -> error "switch: empty clause list" + CaseE (UnboundVarE _) cases -> do + (!cases, !last) <- pure (init cases, last cases) + !cases <- forM cases \case + Match (LitP (StringL str)) (NormalB rhs) [] -> pure (str, rhs) + _ -> error "switch: expected a match clause on a string literal" + (!cases, !last) <- case last of + Match (LitP (StringL str)) (NormalB rhs) [] -> pure (cases ++ [(str, rhs)], Nothing) + Match WildP (NormalB rhs) [] -> pure (cases, Just rhs) + _ -> error "switch: expected a match clause on a string literal or a wildcard" + pure (cases, last) + _ -> error "switch: expected a \"case _ of\" expression" + +genSwitchTrie' :: Maybe Exp -> [(String, Exp)] -> Maybe Exp + -> (Map (Maybe Int) Exp, Trie' (Rule, Int, Maybe Int)) +genSwitchTrie' postAction cases fallback = + + let (!branches, !strings) = unzip do + (!i, (!str, !rhs)) <- zip [0..] cases + case postAction of + Nothing -> pure ((Just i, rhs), (i, str)) + Just !post -> pure ((Just i, (VarE '(>>)) `AppE` post `AppE` rhs), (i, str)) + + !m = M.fromList ((Nothing, maybe (VarE 'empty) id fallback) : branches) + !trie = compileTrie strings + in (m , trie) + +-------------------------------------------------------------------------------- + +{-| +This is a template function which makes it possible to branch on a collection of string literals in +an efficient way. By using `switch`, such branching is compiled to a trie of primitive parsing +operations, which has optimized control flow, vectorized reads and grouped checking for needed input +bytes. + +The syntax is slightly magical, it overloads the usual @case@ expression. An example: + +@ + $(switch [| case _ of + "foo" -> pure True + "bar" -> pure False |]) +@ + +The underscore is mandatory in @case _ of@. Each branch must be a string literal, but optionally +we may have a default case, like in + +@ + $(switch [| case _ of + "foo" -> pure 10 + "bar" -> pure 20 + _ -> pure 30 |]) +@ + +All case right hand sides must be parsers with the same type. That type is also the type +of the whole `switch` expression. + +A `switch` has longest match semantics, and the order of cases does not matter, except for +the default case, which may only appear as the last case. + +If a `switch` does not have a default case, and no case matches the input, then it returns with +failure, \without\ having consumed any input. A fallthrough to the default case also does not +consume any input. +-} +switch :: Q Exp -> Q Exp +switch = switchWithPost Nothing + +{-| +Switch expression with an optional first argument for performing a post-processing action after +every successful branch matching, not including the default branch. For example, if we have +@ws :: Parser e ()@ for a whitespace parser, we might want to consume whitespace after matching +on any of the switch cases. For that case, we can define a "lexeme" version of `switch` as +follows. + +@ + switch' :: Q Exp -> Q Exp + switch' = switchWithPost (Just [| ws |]) +@ + +Note that this @switch'@ function cannot be used in the same module it's defined in, because of the +stage restriction of Template Haskell. +-} +switchWithPost :: Maybe (Q Exp) -> Q Exp -> Q Exp +switchWithPost postAction exp = do + !postAction <- sequence postAction + (!cases, !fallback) <- parseSwitch exp + genTrie $! genSwitchTrie' postAction cases fallback + +-- | Version of `switchWithPost` without syntactic sugar. The second argument is the +-- list of cases, the third is the default case. +rawSwitchWithPost :: Maybe (Q Exp) -> [(String, Q Exp)] -> Maybe (Q Exp) -> Q Exp +rawSwitchWithPost postAction cases fallback = do + !postAction <- sequence postAction + !cases <- forM cases \(str, rhs) -> (str,) <$> rhs + !fallback <- sequence fallback + genTrie $! genSwitchTrie' postAction cases fallback From 19bd2081dccea5a4e3fba4102a0bb471383f39b6 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Thu, 27 Oct 2022 21:38:22 +0100 Subject: [PATCH 09/11] last tweaking --- flatparse.cabal | 1 - src/FlatParse/Basic.hs | 1 - src/FlatParse/Basic/Bytes.hs | 65 --------------- src/FlatParse/Basic/Combinators.hs | 14 +--- src/FlatParse/Basic/Integers.hs | 7 +- src/FlatParse/Basic/Other.hs | 124 +++++++++++++++++++++-------- src/FlatParse/Basic/Strings.hs | 4 +- 7 files changed, 102 insertions(+), 114 deletions(-) delete mode 100644 src/FlatParse/Basic/Bytes.hs diff --git a/flatparse.cabal b/flatparse.cabal index 06147b5..5f20196 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -45,7 +45,6 @@ library exposed-modules: FlatParse.Basic FlatParse.Basic.Addr - FlatParse.Basic.Bytes FlatParse.Basic.Combinators FlatParse.Basic.Integers FlatParse.Basic.Other diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index 82b316f..954359c 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -97,7 +97,6 @@ import FlatParse.Basic.Parser import FlatParse.Basic.Integers import FlatParse.Basic.Combinators import FlatParse.Basic.Other -import FlatParse.Basic.Bytes import FlatParse.Basic.Strings import FlatParse.Basic.Position import FlatParse.Basic.Addr diff --git a/src/FlatParse/Basic/Bytes.hs b/src/FlatParse/Basic/Bytes.hs deleted file mode 100644 index 7c47195..0000000 --- a/src/FlatParse/Basic/Bytes.hs +++ /dev/null @@ -1,65 +0,0 @@ -module FlatParse.Basic.Bytes where - -import Language.Haskell.TH -import FlatParse.Basic.Parser -import qualified FlatParse.Common.Assorted as Common -import FlatParse.Basic.Integers ( getWord64OfUnsafe - , getWord32OfUnsafe - , getWord16OfUnsafe - , getWord8OfUnsafe ) -import GHC.Exts - --- | Check that the input has at least the given number of bytes. -ensureBytes# :: Int -> Parser e () -ensureBytes# (I# len) = Parser \fp eob s -> - case len <=# minusAddr# eob s of - 1# -> OK# () s - _ -> Fail# -{-# inline ensureBytes# #-} - --- | Read a sequence of bytes. This is a template function, you can use it as --- @$(getBytesOf [3, 4, 5])@, for example, and the splice has type @Parser e --- ()@. -getBytesOf :: [Word] -> Q Exp -getBytesOf bytes = do - let !len = length bytes - [| ensureBytes# len >> $(scanBytes# bytes) |] - --- | Template function, creates a @Parser e ()@ which unsafely scans a given --- sequence of bytes. -scanBytes# :: [Word] -> Q Exp -scanBytes# bytes = do - let !(leading, w8s) = Common.splitBytes bytes - !scanw8s = go w8s where - go (w8:[] ) = [| getWord64OfUnsafe w8 |] - go (w8:w8s) = [| getWord64OfUnsafe w8 >> $(go w8s) |] - go [] = [| pure () |] - case w8s of - [] -> go leading - where - go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| getWord32OfUnsafe w |] - go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| getWord32OfUnsafe w >> $(go ws) |] - go (a:b:[]) = let !w = Common.packBytes [a, b] in [| getWord16OfUnsafe w |] - go (a:b:ws) = let !w = Common.packBytes [a, b] in [| getWord16OfUnsafe w >> $(go ws) |] - go (a:[]) = [| getWord8OfUnsafe a |] - go [] = [| pure () |] - _ -> case leading of - - [] -> scanw8s - [a] -> [| getWord8OfUnsafe a >> $scanw8s |] - ws@[a, b] -> let !w = Common.packBytes ws in [| getWord16OfUnsafe w >> $scanw8s |] - ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| getWord32OfUnsafe w >> $scanw8s |] - ws -> let !w = Common.packBytes ws - !l = length ws - in [| scanPartial64# l w >> $scanw8s |] - -scanPartial64# :: Int -> Word -> Parser e () -scanPartial64# (I# len) (W# w) = Parser \fp eob s -> - case indexWordOffAddr# s 0# of - w' -> case uncheckedIShiftL# (8# -# len) 3# of - sh -> case uncheckedShiftL# w' sh of - w' -> case uncheckedShiftRL# w' sh of - w' -> case eqWord# w w' of - 1# -> OK# () (plusAddr# s len) - _ -> Fail# -{-# inline scanPartial64# #-} diff --git a/src/FlatParse/Basic/Combinators.hs b/src/FlatParse/Basic/Combinators.hs index 16a4e5d..3da3c1b 100644 --- a/src/FlatParse/Basic/Combinators.hs +++ b/src/FlatParse/Basic/Combinators.hs @@ -1,27 +1,17 @@ {-# language UnboxedTuples #-} +-- | Basic parser building blocks. + module FlatParse.Basic.Combinators where import FlatParse.Basic.Parser -import FlatParse.Basic.Integers -import FlatParse.Basic.Bytes - -import FlatParse.Common.Trie import GHC.Exts -import GHC.Word -import Control.Applicative ( empty ) import qualified Data.ByteString as B import qualified Data.ByteString.Internal as B import GHC.ForeignPtr ( ForeignPtr(..) ) -import qualified Data.Foldable -import Control.Monad ( forM ) import Control.Applicative ( (<|>) ) -import Language.Haskell.TH -import qualified Data.Map.Strict as M -import Data.Map.Strict ( Map ) - -- | Throw a parsing error. By default, parser choice `(<|>)` can't backtrack -- on parser error. Use `try` to convert an error to a recoverable failure. err :: e -> Parser e a diff --git a/src/FlatParse/Basic/Integers.hs b/src/FlatParse/Basic/Integers.hs index 6e52487..50240b9 100644 --- a/src/FlatParse/Basic/Integers.hs +++ b/src/FlatParse/Basic/Integers.hs @@ -53,13 +53,14 @@ module FlatParse.Basic.Integers , getSizedOfUnsafe# ) where +import FlatParse.Basic.Parser + +import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 ) + import GHC.Exts import GHC.Word import GHC.Int -import FlatParse.Basic.Parser -import FlatParse.Common.Assorted ( word16ToInt16, word32ToInt32, word64ToInt64 ) - import Control.Applicative ( Alternative(empty) ) -------------------------------------------------------------------------------- diff --git a/src/FlatParse/Basic/Other.hs b/src/FlatParse/Basic/Other.hs index 4cb108c..f6c79a6 100644 --- a/src/FlatParse/Basic/Other.hs +++ b/src/FlatParse/Basic/Other.hs @@ -1,15 +1,21 @@ {-# language UnboxedTuples #-} +-- | Parser building blocks. + module FlatParse.Basic.Other where import FlatParse.Basic.Parser -- TODO TH!! import FlatParse.Basic.Combinators ( branch, skipBack# ) -import FlatParse.Basic.Bytes ( scanBytes#, ensureBytes# ) -import FlatParse.Basic.Integers ( getWord8Unsafe ) +import FlatParse.Basic.Integers ( getWord64OfUnsafe + , getWord32OfUnsafe + , getWord16OfUnsafe + , getWord8OfUnsafe + , getWord8Unsafe ) import FlatParse.Common.Trie +import qualified FlatParse.Common.Assorted as Common import GHC.Exts import GHC.Word @@ -24,35 +30,6 @@ import Language.Haskell.TH import qualified Data.Map.Strict as M import Data.Map.Strict ( Map ) --- | Parse a given `B.ByteString`. If the bytestring is statically known, --- consider using 'bytes' instead. -getByteStringOf :: B.ByteString -> Parser e () -getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = - - let go64 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go64 bs bsend s w = - let bs' = plusAddr# bs 8# in - case gtAddr# bs' bsend of - 1# -> go8 bs bsend s w - _ -> if W64# (indexWord64OffAddr# bs 0#) == W64# (indexWord64OffAddr# s 0#) - then go64 bs' bsend (plusAddr# s 8#) w - else (# Fail#, w #) - - go8 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) - go8 bs bsend s w = - case ltAddr# bs bsend of - 1# -> if W8# (indexWord8OffAddr# bs 0#) == W8# (indexWord8OffAddr# s 0#) - then go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) w - else (# Fail#, w #) - _ -> (# OK# () s, w #) - - in Parser \fp eob s -> case len <=# minusAddr# eob s of - 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s w of - (# res, w #) -> case touch# fcontent w of - w -> res - _ -> Fail# -{-# inline getByteStringOf #-} - -- | Read a null-terminated bytestring (a C-style string). -- -- Consumes the null terminator. @@ -95,6 +72,91 @@ getCStringUnsafe = Parser \fp eob s -> getCStringUnsafe = error "Flatparse.Basic.Combinators.getCStringUnsafe: requires GHC 9.0 / base-4.15, not available on this compiler" #endif +-- | Parse a given `B.ByteString`. If the bytestring is statically known, +-- consider using 'bytes' instead. +getByteStringOf :: B.ByteString -> Parser e () +getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = + + let go64 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) + go64 bs bsend s w = + let bs' = plusAddr# bs 8# in + case gtAddr# bs' bsend of + 1# -> go8 bs bsend s w + _ -> if W64# (indexWord64OffAddr# bs 0#) == W64# (indexWord64OffAddr# s 0#) + then go64 bs' bsend (plusAddr# s 8#) w + else (# Fail#, w #) + + go8 :: Addr# -> Addr# -> Addr# -> State# RealWorld -> (# Res# e (), State# RealWorld #) + go8 bs bsend s w = + case ltAddr# bs bsend of + 1# -> if W8# (indexWord8OffAddr# bs 0#) == W8# (indexWord8OffAddr# s 0#) + then go8 (plusAddr# bs 1#) bsend (plusAddr# s 1#) w + else (# Fail#, w #) + _ -> (# OK# () s, w #) + + in Parser \fp eob s -> case len <=# minusAddr# eob s of + 1# -> runRW# \w -> case go64 bs (plusAddr# bs len) s w of + (# res, w #) -> case touch# fcontent w of + w -> res + _ -> Fail# +{-# inline getByteStringOf #-} + +-- | Check that the input has at least the given number of bytes. +ensureBytes# :: Int -> Parser e () +ensureBytes# (I# len) = Parser \fp eob s -> + case len <=# minusAddr# eob s of + 1# -> OK# () s + _ -> Fail# +{-# inline ensureBytes# #-} + +-- | Read a sequence of bytes. This is a template function, you can use it as +-- @$(getBytesOf [3, 4, 5])@, for example, and the splice has type @Parser e +-- ()@. +getBytesOf :: [Word] -> Q Exp +getBytesOf bytes = do + let !len = length bytes + [| ensureBytes# len >> $(scanBytes# bytes) |] + +-- | Template function, creates a @Parser e ()@ which unsafely scans a given +-- sequence of bytes. +scanBytes# :: [Word] -> Q Exp +scanBytes# bytes = do + let !(leading, w8s) = Common.splitBytes bytes + !scanw8s = go w8s where + go (w8:[] ) = [| getWord64OfUnsafe w8 |] + go (w8:w8s) = [| getWord64OfUnsafe w8 >> $(go w8s) |] + go [] = [| pure () |] + case w8s of + [] -> go leading + where + go (a:b:c:d:[]) = let !w = Common.packBytes [a, b, c, d] in [| getWord32OfUnsafe w |] + go (a:b:c:d:ws) = let !w = Common.packBytes [a, b, c, d] in [| getWord32OfUnsafe w >> $(go ws) |] + go (a:b:[]) = let !w = Common.packBytes [a, b] in [| getWord16OfUnsafe w |] + go (a:b:ws) = let !w = Common.packBytes [a, b] in [| getWord16OfUnsafe w >> $(go ws) |] + go (a:[]) = [| getWord8OfUnsafe a |] + go [] = [| pure () |] + _ -> case leading of + + [] -> scanw8s + [a] -> [| getWord8OfUnsafe a >> $scanw8s |] + ws@[a, b] -> let !w = Common.packBytes ws in [| getWord16OfUnsafe w >> $scanw8s |] + ws@[a, b, c, d] -> let !w = Common.packBytes ws in [| getWord32OfUnsafe w >> $scanw8s |] + ws -> let !w = Common.packBytes ws + !l = length ws + in [| scanPartial64# l w >> $scanw8s |] + +scanPartial64# :: Int -> Word -> Parser e () +scanPartial64# (I# len) (W# w) = Parser \fp eob s -> + case indexWordOffAddr# s 0# of + w' -> case uncheckedIShiftL# (8# -# len) 3# of + sh -> case uncheckedShiftL# w' sh of + w' -> case uncheckedShiftRL# w' sh of + w' -> case eqWord# w w' of + 1# -> OK# () (plusAddr# s len) + _ -> Fail# +{-# inline scanPartial64# #-} + + -- Switching code generation -------------------------------------------------------------------------------- diff --git a/src/FlatParse/Basic/Strings.hs b/src/FlatParse/Basic/Strings.hs index 2a3bdf0..6953a44 100644 --- a/src/FlatParse/Basic/Strings.hs +++ b/src/FlatParse/Basic/Strings.hs @@ -1,11 +1,13 @@ {-# language UnboxedTuples #-} +-- | Parsers and utilities for parsing (UTF-8) 'Char's and strings. + module FlatParse.Basic.Strings where import Prelude hiding ( getChar ) import FlatParse.Basic.Parser -import FlatParse.Basic.Bytes ( getBytesOf ) +import FlatParse.Basic.Other ( getBytesOf ) import FlatParse.Common.Assorted ( derefChar8# ) From dde03d7647f8656ffe9764ac49bc85904e22d12b Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Fri, 28 Oct 2022 13:52:56 +0100 Subject: [PATCH 10/11] more fiddling --- flatparse.cabal | 2 +- src/FlatParse/Basic.hs | 99 ++++++------------- src/FlatParse/Basic/Addr.hs | 2 +- .../Basic/{Combinators.hs => Base.hs} | 93 +++++++++++------ src/FlatParse/Basic/Integers.hs | 22 +---- src/FlatParse/Basic/Other.hs | 36 ++++--- src/FlatParse/Basic/Position.hs | 2 + src/FlatParse/Basic/Strings.hs | 21 +++- 8 files changed, 144 insertions(+), 133 deletions(-) rename src/FlatParse/Basic/{Combinators.hs => Base.hs} (94%) diff --git a/flatparse.cabal b/flatparse.cabal index 5f20196..7eb01c4 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -45,7 +45,7 @@ library exposed-modules: FlatParse.Basic FlatParse.Basic.Addr - FlatParse.Basic.Combinators + FlatParse.Basic.Base FlatParse.Basic.Integers FlatParse.Basic.Other FlatParse.Basic.Parser diff --git a/src/FlatParse/Basic.hs b/src/FlatParse/Basic.hs index 954359c..0a1c026 100644 --- a/src/FlatParse/Basic.hs +++ b/src/FlatParse/Basic.hs @@ -3,10 +3,21 @@ {-| This module implements a `Parser` supporting custom error types. If you need efficient indentation parsing, use "FlatParse.Stateful" instead. --} -module FlatParse.Basic ( +The following naming conventions are followed: + + * @withX@ are continuation passing style (CPS) parsers. + * These are sprinkled throughout to enable better reasoning about + performance. + * @getX@ are regular monadic parsers. + * @getXOf@ parse and check equality with a provided value. + * Definitions ending with @#@ are called with unboxed values. + * Definitions ending with @Unsafe@ are unsafe. Read their documentation before + using. +-} +module FlatParse.Basic + ( -- * Parser monad type Parser @@ -14,88 +25,37 @@ module FlatParse.Basic ( , Result(..) , runParser - -- * Errors and failures - , err - , lookahead - , fails - , try - , Control.Applicative.optional - , optional_ - , withOption - , cut - , cutting - - -- * Combinators - , (Control.Applicative.<|>) - , Control.Applicative.empty - , branch - , chainl - , chainr - , Control.Applicative.many - , many_ - , Control.Applicative.some - , some_ - , notFollowedBy - , isolate - , isolateUnsafe# - - -- * Primitive parsers - , eof - , switch - , switchWithPost - , rawSwitchWithPost - - -- ** Byte-wise - , take - , take# - , takeRest - , skip - , atSkip# - , skipBack# - , getBytesOf - , getByteStringOf - , getCString - , getCStringUnsafe + -- * Parsers + -- ** Base combinators, byte-wise + , module FlatParse.Basic.Base + , module FlatParse.Basic.Other -- ** Machine integers , module FlatParse.Basic.Integers -- ** 'Char', 'String' - , getCharOf - , getStringOf - , getChar - , getChar_ - , getCharASCII - , getCharASCII_ - , getAsciiDecimalInt - , getAsciiDecimalInteger - , getAsciiHexInt - , Common.isDigit - , Common.isGreekLetter - , Common.isLatinLetter - , satisfy - , satisfy_ - , satisfyASCII - , satisfyASCII_ - , fusedSatisfy - , fusedSatisfy_ + , module FlatParse.Basic.Strings -- ** Positions and spans , module FlatParse.Basic.Position - -- ** Location & address primitives + -- ** Address primitives , module FlatParse.Basic.Addr + -- ** Re-exports + , (Control.Applicative.<|>) + , Control.Applicative.empty + , Control.Applicative.many + , Control.Applicative.some + , Control.Applicative.optional + ) where import Prelude hiding ( take, getChar ) -import qualified FlatParse.Common.Assorted as Common -import FlatParse.Common.Position - import FlatParse.Basic.Parser import FlatParse.Basic.Integers -import FlatParse.Basic.Combinators +import FlatParse.Basic.Base import FlatParse.Basic.Other import FlatParse.Basic.Strings import FlatParse.Basic.Position @@ -113,9 +73,10 @@ import GHC.ForeignPtr ( ForeignPtr(..) ) -- | Higher-level boxed data type for parsing results. data Result e a = - OK a !(B.ByteString) -- ^ Contains return value and unconsumed input. + OK a !(B.ByteString) -- ^ Contains return value and unconsumed input + -- (possibly empty, if fully consumed). | Fail -- ^ Recoverable-by-default failure. - | Err !e -- ^ Unrecoverble-by-default error. + | Err !e -- ^ Unrecoverable-by-default error. deriving Show instance Functor (Result e) where diff --git a/src/FlatParse/Basic/Addr.hs b/src/FlatParse/Basic/Addr.hs index 0d76c55..72bba80 100644 --- a/src/FlatParse/Basic/Addr.hs +++ b/src/FlatParse/Basic/Addr.hs @@ -20,7 +20,7 @@ functions.) module FlatParse.Basic.Addr where import FlatParse.Basic.Parser -import FlatParse.Basic.Combinators ( take#, atSkip#, lookahead ) +import FlatParse.Basic.Base ( take#, atSkip#, lookahead ) import GHC.Exts diff --git a/src/FlatParse/Basic/Combinators.hs b/src/FlatParse/Basic/Base.hs similarity index 94% rename from src/FlatParse/Basic/Combinators.hs rename to src/FlatParse/Basic/Base.hs index 3da3c1b..da88aac 100644 --- a/src/FlatParse/Basic/Combinators.hs +++ b/src/FlatParse/Basic/Base.hs @@ -2,7 +2,42 @@ -- | Basic parser building blocks. -module FlatParse.Basic.Combinators where +module FlatParse.Basic.Base + ( + -- * Errors and failures + err + , try + , fails + , cut + , cutting + , optional_ + , withOption + + -- * Primitive combinators + , lookahead + , branch + , chainl + , chainr + , many_ + , some_ + , notFollowedBy + , isolate + , isolateUnsafe# + + -- * Primitive byte-wise parsers + , eof + , take + , take# + , takeUnsafe# + , takeRest + , skip + , atSkip# + , atSkipUnsafe# + , skipBack# + + ) where + +import Prelude hiding ( take ) import FlatParse.Basic.Parser @@ -18,6 +53,13 @@ err :: e -> Parser e a err e = Parser \fp eob s -> Err# e {-# inline err #-} +-- | Convert a parsing error into failure. +try :: Parser e a -> Parser e a +try (Parser f) = Parser \fp eob s -> case f fp eob s of + Err# _ -> Fail# + x -> x +{-# inline try #-} + -- | Convert a parsing failure to a success. fails :: Parser e a -> Parser e () fails (Parser f) = Parser \fp eob s -> @@ -27,27 +69,6 @@ fails (Parser f) = Parser \fp eob s -> Err# e -> Err# e {-# inline fails #-} --- | Convert a parsing error into failure. -try :: Parser e a -> Parser e a -try (Parser f) = Parser \fp eob s -> case f fp eob s of - Err# _ -> Fail# - x -> x -{-# inline try #-} - --- | Convert a parsing failure to a `()`. -optional_ :: Parser e a -> Parser e () -optional_ p = (() <$ p) <|> pure () -{-# inline optional_ #-} - --- | CPS'd version of `optional`. This is usually more efficient, since it gets --- rid of the extra `Maybe` allocation. -withOption :: Parser e a -> (a -> Parser e r) -> Parser e r -> Parser e r -withOption (Parser f) just (Parser nothing) = Parser \fp eob s -> case f fp eob s of - OK# a s -> runParser# (just a) fp eob s - Fail# -> nothing fp eob s - Err# e -> Err# e -{-# inline withOption #-} - -- | Convert a parsing failure to an error. cut :: Parser e a -> e -> Parser e a cut (Parser f) e = Parser \fp eob s -> case f fp eob s of @@ -65,8 +86,29 @@ cutting (Parser f) e merge = Parser \fp eob s -> case f fp eob s of x -> x {-# inline cutting #-} +-- | Convert a parsing failure to a `()`. +optional_ :: Parser e a -> Parser e () +optional_ p = (() <$ p) <|> pure () +{-# inline optional_ #-} + +-- | CPS'd version of `optional`. This is usually more efficient, since it gets +-- rid of the extra `Maybe` allocation. +withOption :: Parser e a -> (a -> Parser e r) -> Parser e r -> Parser e r +withOption (Parser f) just (Parser nothing) = Parser \fp eob s -> case f fp eob s of + OK# a s -> runParser# (just a) fp eob s + Fail# -> nothing fp eob s + Err# e -> Err# e +{-# inline withOption #-} + -------------------------------------------------------------------------------- +-- | Succeed if the input is empty. +eof :: Parser e () +eof = Parser \fp eob s -> case eqAddr# eob s of + 1# -> OK# () s + _ -> Fail# +{-# inline eof #-} + -- | Save the parsing state, then run a parser, then restore the state. lookahead :: Parser e a -> Parser e a lookahead (Parser f) = Parser \fp eob s -> @@ -152,13 +194,6 @@ notFollowedBy p1 p2 = p1 <* fails p2 -------------------------------------------------------------------------------- --- | Succeed if the input is empty. -eof :: Parser e () -eof = Parser \fp eob s -> case eqAddr# eob s of - 1# -> OK# () s - _ -> Fail# -{-# inline eof #-} - -- | Read the given number of bytes as a 'ByteString'. -- -- Throws a runtime error if given a negative integer. diff --git a/src/FlatParse/Basic/Integers.hs b/src/FlatParse/Basic/Integers.hs index 50240b9..eb51ba6 100644 --- a/src/FlatParse/Basic/Integers.hs +++ b/src/FlatParse/Basic/Integers.hs @@ -1,22 +1,10 @@ {- | Machine integer parsers. -The following naming conventions are followed: - - * @withX@ are continuation passing style (CPS) parsers. - * These are sprinkled throughout to enable better reasoning about - performance. - * @getX@ are regular monadic parsers. - * @getXOf@ parse and check equality with a provided value. - * Definitions ending with @#@ are called with unboxed values. - * Definitions ending with @Unsafe@ are unsafe. Read their documentation before - using. - -Other points: - - * TODO: The endianness code is currently lying. We blindly assume that our - host system is little-endian, and parse in big-endian by parsing normally - then "reversing" the resulting integer. - * Unless otherwise mentioned, native byte order is utilized. +Parsers use native byte order, unless stated otherwise. + +TODO: The endianness code is currently lying. We blindly assume that our host +system is little-endian, and parse in big-endian by parsing normally then +"reversing" the resulting integer. -} module FlatParse.Basic.Integers diff --git a/src/FlatParse/Basic/Other.hs b/src/FlatParse/Basic/Other.hs index f6c79a6..98a751d 100644 --- a/src/FlatParse/Basic/Other.hs +++ b/src/FlatParse/Basic/Other.hs @@ -1,20 +1,25 @@ {-# language UnboxedTuples #-} --- | Parser building blocks. +-- | Assorted parser building blocks which may use the more basic combinators, +-- and don't belong elsewhere. -module FlatParse.Basic.Other where +module FlatParse.Basic.Other + ( getCString, getCStringUnsafe + , getByteStringOf, getBytesOf + , switch, switchWithPost, rawSwitchWithPost + ) where import FlatParse.Basic.Parser -- TODO TH!! -import FlatParse.Basic.Combinators ( branch, skipBack# ) +import FlatParse.Basic.Base ( branch, skipBack# ) import FlatParse.Basic.Integers ( getWord64OfUnsafe , getWord32OfUnsafe , getWord16OfUnsafe , getWord8OfUnsafe , getWord8Unsafe ) -import FlatParse.Common.Trie +import FlatParse.Common.Trie hiding ( ensureBytes ) import qualified FlatParse.Common.Assorted as Common import GHC.Exts @@ -101,21 +106,13 @@ getByteStringOf (B.PS (ForeignPtr bs fcontent) _ (I# len)) = _ -> Fail# {-# inline getByteStringOf #-} --- | Check that the input has at least the given number of bytes. -ensureBytes# :: Int -> Parser e () -ensureBytes# (I# len) = Parser \fp eob s -> - case len <=# minusAddr# eob s of - 1# -> OK# () s - _ -> Fail# -{-# inline ensureBytes# #-} - -- | Read a sequence of bytes. This is a template function, you can use it as -- @$(getBytesOf [3, 4, 5])@, for example, and the splice has type @Parser e -- ()@. getBytesOf :: [Word] -> Q Exp getBytesOf bytes = do let !len = length bytes - [| ensureBytes# len >> $(scanBytes# bytes) |] + [| ensureBytes len >> $(scanBytes# bytes) |] -- | Template function, creates a @Parser e ()@ which unsafely scans a given -- sequence of bytes. @@ -156,7 +153,6 @@ scanPartial64# (I# len) (W# w) = Parser \fp eob s -> _ -> Fail# {-# inline scanPartial64# #-} - -- Switching code generation -------------------------------------------------------------------------------- @@ -177,7 +173,7 @@ genTrie (rules, t) = do Just a -> a let ensure :: Maybe Int -> Maybe (Q Exp) - ensure = fmap (\n -> [| ensureBytes# n |]) + ensure = fmap (\n -> [| ensureBytes n |]) fallback :: Rule -> Int -> Q Exp fallback rule 0 = pure $ VarE $ fst $ ix branches rule @@ -311,3 +307,13 @@ rawSwitchWithPost postAction cases fallback = do !cases <- forM cases \(str, rhs) -> (str,) <$> rhs !fallback <- sequence fallback genTrie $! genSwitchTrie' postAction cases fallback + +-------------------------------------------------------------------------------- + +-- | Assert that the input has at least the given number of bytes. +ensureBytes :: Int -> Parser e () +ensureBytes (I# len) = Parser \fp eob s -> + case len <=# minusAddr# eob s of + 1# -> OK# () s + _ -> Fail# +{-# inline ensureBytes #-} diff --git a/src/FlatParse/Basic/Position.hs b/src/FlatParse/Basic/Position.hs index 3b1c473..3527811 100644 --- a/src/FlatParse/Basic/Position.hs +++ b/src/FlatParse/Basic/Position.hs @@ -1,3 +1,5 @@ +-- | Parsers and utilities concerning position in input. + module FlatParse.Basic.Position ( module FlatParse.Common.Position , module FlatParse.Basic.Position diff --git a/src/FlatParse/Basic/Strings.hs b/src/FlatParse/Basic/Strings.hs index 6953a44..c908f82 100644 --- a/src/FlatParse/Basic/Strings.hs +++ b/src/FlatParse/Basic/Strings.hs @@ -2,7 +2,26 @@ -- | Parsers and utilities for parsing (UTF-8) 'Char's and strings. -module FlatParse.Basic.Strings where +module FlatParse.Basic.Strings + ( getCharOf + , getStringOf + , getChar + , getChar_ + , getCharASCII + , getCharASCII_ + , getAsciiDecimalInt + , getAsciiDecimalInteger + , getAsciiHexInt + , Common.isDigit + , Common.isGreekLetter + , Common.isLatinLetter + , satisfy + , satisfy_ + , satisfyASCII + , satisfyASCII_ + , fusedSatisfy + , fusedSatisfy_ + ) where import Prelude hiding ( getChar ) From e29f98b2ea0d0a190a182770e7d8281113d23781 Mon Sep 17 00:00:00 2001 From: Ben Orchard Date: Fri, 28 Oct 2022 18:11:25 +0100 Subject: [PATCH 11/11] add efficient varint (protobuf) parser --- flatparse.cabal | 1 + src/FlatParse/Basic/Integers/Varints.hs | 27 +++++++++++++++++++++++++ src/FlatParse/Common/Numbers.hs | 22 +++++++++++++++++++- 3 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 src/FlatParse/Basic/Integers/Varints.hs diff --git a/flatparse.cabal b/flatparse.cabal index 7eb01c4..79e64d8 100644 --- a/flatparse.cabal +++ b/flatparse.cabal @@ -47,6 +47,7 @@ library FlatParse.Basic.Addr FlatParse.Basic.Base FlatParse.Basic.Integers + FlatParse.Basic.Integers.Varints FlatParse.Basic.Other FlatParse.Basic.Parser FlatParse.Basic.Position diff --git a/src/FlatParse/Basic/Integers/Varints.hs b/src/FlatParse/Basic/Integers/Varints.hs new file mode 100644 index 0000000..495bca0 --- /dev/null +++ b/src/FlatParse/Basic/Integers/Varints.hs @@ -0,0 +1,27 @@ +{-# language UnboxedTuples #-} + +module FlatParse.Basic.Integers.Varints where + +import FlatParse.Basic.Parser + +import FlatParse.Common.Assorted ( shortInteger ) +import FlatParse.Common.Numbers ( getVarintProtobuf# ) + +import GHC.Exts + +getVarintProtobuf :: Parser e Int +getVarintProtobuf = Parser \fp eob s -> + case getVarintProtobuf# eob s of + (# (##) | #) -> Fail# + (# | (# i#, s#, _n# #) #) -> OK# (I# i#) s# +{-# inline getVarintProtobuf #-} + +getVarintProtobufInteger :: Parser e Integer +getVarintProtobufInteger = Parser \fp eob s -> + case getVarintProtobuf# eob s of + (# (##) | #) -> Fail# + (# | (# i#, s#, n# #) #) -> + case n# >=# 62# of -- TODO unsure if 62 or 63 + 1# -> error "TODO overflow" + _ -> OK# (shortInteger i#) s# +{-# inline getVarintProtobufInteger #-} diff --git a/src/FlatParse/Common/Numbers.hs b/src/FlatParse/Common/Numbers.hs index fbf6826..22c61db 100644 --- a/src/FlatParse/Common/Numbers.hs +++ b/src/FlatParse/Common/Numbers.hs @@ -1,4 +1,4 @@ -{-# language UnboxedTuples #-} +{-# language UnboxedTuples, BinaryLiterals #-} module FlatParse.Common.Numbers where @@ -68,3 +68,23 @@ readIntHex :: Addr# -> Addr# -> (# (##) | (# Int#, Addr# #) #) readIntHex eob s = case readIntHex' 0# s eob of (# n, s' #) | 1# <- eqAddr# s s' -> (# (##) | #) | otherwise -> (# | (# n, s' #) #) + +-- | protobuf style (LE, redundant, on continues) +getVarintProtobuf# :: Addr# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #) +-- v TODO not working?? +#if !MIN_VERSION_base(4,17,0) +getVarintProtobuf# = error "TODO not supported on this GHC bud" +#else +getVarintProtobuf# end# = go 0# 0# + where + go :: Int# -> Int# -> Addr# -> (# (##) | (# Int#, Addr#, Int# #) #) + go i# n# s# = case eqAddr# s# end# of + 1# -> (# (##) | #) + _ -> + let w# = indexWord8OffAddr# s# 0# + w'# = word2Int# (word8ToWord# (w# `andWord8#` (wordToWord8# 0b01111111##))) + i'# = i# `orI#` (w'# `uncheckedIShiftL#` n#) + in case w# `geWord8#` wordToWord8# 0b10000000## of + 1# -> go i'# (n# +# 7#) (s# `plusAddr#` 1#) + _ -> (# | (# i'#, s#, n# #) #) +#endif