Skip to content
This repository

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse code

Add a CFG parser.

  • Loading branch information...
commit 3bd8f142e3625ca1d2df8ee2ab6c2ee23225dff4 1 parent 3d8af0c
Mikhail Glushenkov authored
38 src/CFG/Examples.hs
... ... @@ -1,33 +1,23 @@
1 1 module CFG.Examples (balancedParentheses, sillyGrammar, sillyGrammar2)
2 2 where
3 3
4   -import CFG.Helpers.CNF
5   -import CFG.Types
  4 +import CFG.Types
6 5
7   --- | Example grammar: balanced parenthese.
  6 +import qualified CFG.Helpers.CNF as CNF
  7 +import CFG.Read
  8 +
  9 +fromRight :: Either String a -> a
  10 +fromRight = either error id
  11 +
  12 +-- | Example grammar: balanced parentheses.
8 13 balancedParentheses :: CompiledCNFGrammar
9   -balancedParentheses =
10   - -- S -> SS | LH | LR
11   - -- H -> SR
12   - -- L -> '('
13   - -- R -> ')'
14   - listToGrammar
15   - [ ruleNonTerminal "S" [ ("S","S"), ("L","H"), ("L","R")]
16   - , ruleNonTerminal "H" [("S","R")]
17   - , ruleTerminal "L" '('
18   - , ruleTerminal "R" ')'
19   - ]
  14 +balancedParentheses = CNF.compileGrammar . fromRight . readCNFGrammar $
  15 + "S -> SS, S -> LH, S -> LR, H -> SR, L -> (, R -> )"
20 16
21 17 sillyGrammar :: CompiledCNFGrammar
22   -sillyGrammar =
23   - listToGrammar
24   - [ ruleNonTerminal "S" [("S1", "S2")]
25   - , ruleTerminal "S1" 'o'
26   - , ruleTerminal "S2" 'o' ]
  18 +sillyGrammar = CNF.compileGrammar . fromRight . readCNFGrammar $
  19 + "S -> S1S2, S1 -> o, S2 -> o"
27 20
28 21 sillyGrammar2 :: CompiledCNFGrammar
29   -sillyGrammar2 =
30   - listToGrammar
31   - [ ruleNonTerminal "S" [("S", "S"), ("S1", "S1")]
32   - , ruleTerminal "S1" '1'
33   - , ruleTerminal "S1" '0' ]
  22 +sillyGrammar2 = CNF.compileGrammar . fromRight . readCNFGrammar $
  23 + "S -> SS, S -> S1S1, S1 -> 1, S1 -> 0"
67 src/CFG/Helpers/CFG.hs
... ... @@ -1,14 +1,69 @@
1 1 -- | Helpers for working with general context-free grammars.
2 2 module CFG.Helpers.CFG (
3   - -- * Helpers for constructing the grammar.
4   - ruleTerminal, ruleNonTerminal
  3 + -- * Basic helpers.
  4 + ruleName, ruleNumber, isNonTerminalRule, nonTerminalRuleProductions
  5 + -- * Monad for fresh names.
  6 + ,NameMonad, runNameMonad, freshName, rememberName
5 7 )
6 8 where
7 9
8 10 import CFG.Types
9 11
10   -ruleTerminal :: RuleName -> Char -> NamedCFGRule
11   -ruleTerminal name prod = CFGTerminalRule name (charToSymbol prod)
  12 +import Control.Monad.State
  13 +import qualified Data.Set as S
12 14
13   -ruleNonTerminal :: RuleName -> [[RuleName]] -> NamedCFGRule
14   -ruleNonTerminal name prods = CFGNonTerminalRule name prods
  15 +-- Basic helpers
  16 +ruleName :: NamedCFGRule -> RuleName
  17 +ruleName (CFGTerminalRule name _) = name
  18 +ruleName (CFGNonTerminalRule name _) = name
  19 +
  20 +ruleNumber :: NumberedCFGRule -> RuleNumber
  21 +ruleNumber (CFGTerminalRule num _) = num
  22 +ruleNumber (CFGNonTerminalRule num _) = num
  23 +
  24 +isNonTerminalRule :: CFGRule a -> Bool
  25 +isNonTerminalRule (CFGNonTerminalRule _ _) = True
  26 +isNonTerminalRule _ = False
  27 +
  28 +nonTerminalRuleProductions :: CFGRule a -> [[a]]
  29 +nonTerminalRuleProductions (CFGNonTerminalRule _ prods) = prods
  30 +nonTerminalRuleProductions _ = error "Non-terminal rule expected!"
  31 +
  32 +
  33 +-- Monad for generating fresh names.
  34 +data NameState = NameState { nameStateCounter :: !Int
  35 + , nameStateSet :: !(S.Set RuleName) }
  36 + deriving Show
  37 +type NameMonad = State NameState
  38 +
  39 +-- | Initial state, for using in conjunction with 'runState'.
  40 +runNameMonad :: S.Set RuleName -> NameMonad a -> a
  41 +runNameMonad s act = fst $ runState act (NameState 0 s)
  42 +
  43 +-- | Generate a fresh name.
  44 +freshName :: NameMonad RuleName
  45 +freshName = do
  46 + c <- getCounter
  47 + incCounter
  48 + let n = "Z" ++ (show c)
  49 + hasSeen <- nameSetContains n
  50 + if hasSeen
  51 + then freshName
  52 + else do rememberName n
  53 + return n
  54 + where
  55 + getCounter :: NameMonad Int
  56 + getCounter = gets nameStateCounter
  57 +
  58 + incCounter :: NameMonad ()
  59 + incCounter = modify
  60 + (\s -> s { nameStateCounter = (+1) . nameStateCounter $ s })
  61 +
  62 + nameSetContains :: RuleName -> NameMonad Bool
  63 + nameSetContains n = S.member n `fmap` gets nameStateSet
  64 +
  65 +-- | Remember a given name. A remembered name will never be produced by
  66 +-- 'freshName'.
  67 +rememberName :: RuleName -> NameMonad ()
  68 +rememberName n = modify
  69 + (\s -> s { nameStateSet = S.insert n . nameStateSet $ s})
14 src/CFG/Helpers/CNF.hs
... ... @@ -1,8 +1,7 @@
1 1 -- | Helpers for working with context-free grammars in Chomsky normal form.
2 2 module CFG.Helpers.CNF (
3 3 -- * Helpers for constructing the grammar.
4   - ruleTerminal, ruleNonTerminal
5   - ,compileGrammar, listToGrammar
  4 + compileGrammar
6 5
7 6 -- * Misc. helper functions.
8 7 ,ruleName, ruleNumber
@@ -16,13 +15,6 @@ import CFG.Types
16 15 import qualified Data.Map as M
17 16 import Data.Maybe (fromJust)
18 17
19   -
20   -ruleTerminal :: RuleName -> Char -> NamedCNFRule
21   -ruleTerminal name prod = CNFTerminalRule name (charToSymbol prod)
22   -
23   -ruleNonTerminal :: RuleName -> [(RuleName, RuleName)] -> NamedCNFRule
24   -ruleNonTerminal name prods = CNFNonTerminalRule name prods
25   -
26 18 compileGrammar :: NamedCNFGrammar -> CompiledCNFGrammar
27 19 compileGrammar (CNFGrammar rules start e) =
28 20 CNFGrammar (map compileRule rules) (lookupName start) e
@@ -40,10 +32,6 @@ compileGrammar (CNFGrammar rules start e) =
40 32 CNFNonTerminalRule (lookupName name)
41 33 [(lookupName a, lookupName b) | (a,b) <- prods]
42 34
43   --- NB: this assumes that the grammar doesn't generate the empty string.
44   -listToGrammar :: [NamedCNFRule] -> CompiledCNFGrammar
45   -listToGrammar rules = compileGrammar $ CNFGrammar rules "S" False
46   -
47 35 -- Misc. helper functions.
48 36
49 37 ruleName :: NamedCNFRule -> RuleName
157 src/CFG/Read.hs
... ... @@ -1,42 +1,134 @@
1 1 module CFG.Read (readCFGrammar, readCNFGrammar)
2 2 where
3 3
4   -import CFG.Helpers.CNF
5   -import CFG.Types
6   -
7   -import Control.Applicative hiding (many, (<|>))
8   -import Control.Monad (unless)
9   -import Data.Bifunctor (first)
10   -import Data.Char (isPunctuation)
11   -import Data.Either ()
12   -import Data.List (find, groupBy, sortBy)
13   -import Data.Maybe (isJust)
14   -import Data.Ord (comparing)
15   -import Text.Parsec
  4 +import qualified CFG.Helpers.CFG as CFG
  5 +import qualified CFG.Helpers.CNF as CNF
  6 +import CFG.Types
16 7
  8 +import Control.Applicative hiding (many, (<|>))
  9 +import Control.Monad (foldM, unless)
  10 +import Data.Bifunctor (first)
  11 +import Data.Either (partitionEithers)
  12 +import Data.List (find, groupBy, partition, sortBy)
  13 +import qualified Data.Map as M
  14 +import Data.Maybe (fromJust, isJust)
  15 +import Data.Ord (comparing)
  16 +import qualified Data.Set as S
  17 +import Data.Tuple (swap)
  18 +import Text.Parsec
  19 +
  20 +type SymbolMap = M.Map Symbol RuleName
  21 +type SymOrName = Either Symbol RuleName
  22 +
  23 +-- | Parse a comma-delimited string representing a context-free grammar
  24 +-- in. Uppercase letters followed by zero or more digits act as nonterminals and
  25 +-- lowercase letters are terminals. The initial nonterminal is always called
  26 +-- S. Empty productions are allowed.
17 27 readCFGrammar :: String -> Either String NamedCFGrammar
18   -readCFGrammar _input = undefined
  28 +readCFGrammar input = do
  29 + rules <- first show $ parse cfgRulesP "<stdin>" input
  30 + validateCFGrammar rules
  31 +
  32 +-- | A list of CFG rule names separated by commas.
  33 +cfgRulesP :: Parsec String () [(RuleName, [SymOrName])]
  34 +cfgRulesP = sepBy cfgRuleP (char ',' >> spaces)
  35 +
  36 +-- | A single CFG rule. Sum type representation makes it easier to extract
  37 +-- non-terminals (see 'processRules').
  38 +cfgRuleP :: Parsec String () (RuleName, [SymOrName])
  39 +cfgRuleP = do
  40 + name <- ruleNameP
  41 + _ <- spaces >> string "->" >> spaces
  42 + rhs <- many ruleNameOrTerminalP
  43 + return $ (name, rhs)
  44 + where
  45 + ruleNameOrTerminalP :: Parsec String () (SymOrName)
  46 + ruleNameOrTerminalP = (Left <$> terminalP) <|> (Right <$> ruleNameP)
  47 +
  48 +-- | Given a bunch of rules, perform various checks and produce a CFGrammar.
  49 +validateCFGrammar :: [(RuleName, [SymOrName])] ->
  50 + Either String NamedCFGrammar
  51 +validateCFGrammar g = do
  52 + -- Replace all Lefts with Rights in non-terminal productions. We don't want to
  53 + -- mix symbols and rule names in non-terminal rules.
  54 + let (terms, nonterms) = partition isTerm g
  55 + allNames = S.fromList . map fst $ g
  56 + allSyms = map snd namedSyms ++ concatMap extractSyms nonterms
  57 + namedSyms = map (\(nam, [Left sym]) -> (nam, sym)) terms
  58 +
  59 + allSymsMap = CFG.runNameMonad allNames $
  60 + foldM bindSym M.empty allSyms
  61 +
  62 + termRules = map toTermRule namedSyms
  63 + ++ map (toTermRule . swap) (M.toList allSymsMap)
  64 + nontermRules = map (toNonTermRule allSymsMap) nonterms
  65 + allRules = termRules ++ nontermRules
  66 +
  67 + -- Merge all non-terminal productions with the same name. TODO: remove
  68 + -- duplication.
  69 + sorted = sortBy (comparing CFG.ruleName) allRules
  70 + grouped = groupBy
  71 + (\r0 r1 ->
  72 + CFG.isNonTerminalRule r0
  73 + && CFG.isNonTerminalRule r1
  74 + && CFG.ruleName r0 == CFG.ruleName r1) sorted
  75 + merged = foldr mergeProductions [] grouped
  76 +
  77 + -- Check that the grammar contains the start rule.
  78 + unless (S.member "S" allNames) $ (Left "No start rule found!")
  79 +
  80 + return (CFGrammar merged "S")
  81 + where
  82 + isTerm :: (RuleName, [SymOrName]) -> Bool
  83 + isTerm (_,[Left _]) = True
  84 + isTerm _ = False
  85 +
  86 + extractSyms :: (RuleName, [SymOrName]) -> [Symbol]
  87 + extractSyms (_, l) = fst . partitionEithers $ l
  88 +
  89 + bindSym :: SymbolMap -> Symbol -> CFG.NameMonad SymbolMap
  90 + bindSym m sym = do n <- CFG.freshName
  91 + return $ M.insert sym n m
  92 +
  93 + toTermRule :: (RuleName, Symbol) -> NamedCFGRule
  94 + toTermRule (nam, sym) = CFGTerminalRule nam sym
  95 +
  96 + toNonTermRule :: SymbolMap -> (RuleName, [SymOrName]) -> NamedCFGRule
  97 + toNonTermRule symMap (nam, prods) = CFGNonTerminalRule nam
  98 + [(map toName prods)]
  99 + where
  100 + toName :: SymOrName -> RuleName
  101 + toName (Left sym) = fromJust . M.lookup sym $ symMap
  102 + toName (Right n) = n
  103 +
  104 + mergeProductions :: [NamedCFGRule] -> [NamedCFGRule] -> [NamedCFGRule]
  105 + mergeProductions [] rest = rest
  106 + mergeProductions [rule] rest = rule:rest
  107 + mergeProductions rules rest =
  108 + let name = CFG.ruleName . head $ rules
  109 + prods = concatMap CFG.nonTerminalRuleProductions rules
  110 + in (CFGNonTerminalRule name prods) : rest
  111 +
19 112
20 113 -- | Parse a comma-delimited string representing a context-free grammar in
21 114 -- CNF. Uppercase letters followed by zero or more digits act as nonterminals
22 115 -- and lowercase letters are terminals. The initial nonterminal is always called
23 116 -- S.
24 117 readCNFGrammar :: String -> Either String NamedCNFGrammar
25   -readCNFGrammar input = do g <- first show $ parse rulesP "<stdin>" input
26   - validateCNFGrammar g
  118 +readCNFGrammar input = do rules <- first show $ parse cnfRulesP "<stdin>" input
  119 + validateCNFGrammar rules
27 120
28   --- | A list of rule names separated by commas.
29   -rulesP :: Parsec String () [NamedCNFRule]
30   -rulesP = sepBy ruleP (char ',' >> spaces)
  121 +-- | A list of CNF rule names separated by commas.
  122 +cnfRulesP :: Parsec String () [NamedCNFRule]
  123 +cnfRulesP = sepBy cnfRuleP (char ',' >> spaces)
31 124
32 125 -- | A rule name, followed by '<-', followed either by a terminal (lowercase
33 126 -- letter, digit or a punctuation symbol) or by two names of non-terminals.
34   -ruleP :: Parsec String () NamedCNFRule
35   -ruleP = do
  127 +cnfRuleP :: Parsec String () NamedCNFRule
  128 +cnfRuleP = do
36 129 name <- ruleNameP
37 130 _ <- spaces >> string "->" >> spaces
38   - mTerm <- optionMaybe (charToSymbol <$>
39   - (lower <|> satisfy isPunctuation <|> digit))
  131 + mTerm <- optionMaybe terminalP
40 132 case mTerm of
41 133 Just t -> return $ CNFTerminalRule name t
42 134 Nothing -> do
@@ -47,21 +139,28 @@ ruleP = do
47 139 ruleNameP :: Parsec String () String
48 140 ruleNameP = (:) <$> upper <*> (many digit)
49 141
  142 +-- | A terminal: a digit, lowercase letter or a special character.
  143 +terminalP :: Parsec String () Symbol
  144 +terminalP = charToSymbol <$>
  145 + (lower <|> digit <|> satisfy (flip elem "()+-."))
  146 +
  147 +-- | Given a bunch of CNF rules, perform various checks on them.
50 148 validateCNFGrammar :: [NamedCNFRule] -> Either String NamedCNFGrammar
51 149 validateCNFGrammar g = do
52 150 -- Group productions of non-terminals together (not actually necessary).
53   - let sorted = sortBy (comparing ruleName) g
54   - grouped = groupBy (\r0 r1 -> isNonTerminalRule r0
55   - && isNonTerminalRule r1
56   - && ruleName r0 == ruleName r1) sorted
  151 + let sorted = sortBy (comparing CNF.ruleName) g
  152 + grouped = groupBy (\r0 r1 -> CNF.isNonTerminalRule r0
  153 + && CNF.isNonTerminalRule r1
  154 + && CNF.ruleName r0 == CNF.ruleName r1) sorted
57 155 merged = foldr mergeProductions [] grouped
58 156
59 157 -- Check that the start rule exists.
60   - unless (isJust $ find (\r -> ruleName r == "S") merged) $
  158 + unless (isJust $ find (\r -> CNF.ruleName r == "S") merged) $
61 159 (Left "No start rule found!")
62 160
63 161 -- TODO: Add more validity checks. Check whether the grammar produces an empty
64 162 -- string.
  163 + -- Check that terminal rules are unique and don't intersect with nonterminals.
65 164 return (CNFGrammar merged "S" False)
66 165
67 166 where
@@ -69,6 +168,6 @@ validateCNFGrammar g = do
69 168 mergeProductions [] rest = rest
70 169 mergeProductions [rule] rest = rule:rest
71 170 mergeProductions rules rest =
72   - let (CNFNonTerminalRule name _) = head rules
73   - prods = concatMap nonTerminalRuleProductions rules
  171 + let name = CNF.ruleName . head $ rules
  172 + prods = concatMap CNF.nonTerminalRuleProductions rules
74 173 in (CNFNonTerminalRule name prods) : rest
2  src/CFG/Types.hs
@@ -20,7 +20,7 @@ module CFG.Types (
20 20 -- Basic types.
21 21
22 22 newtype Symbol = SymChar Char
23   - deriving (Show, Eq)
  23 + deriving (Show, Eq, Ord)
24 24 type Symbols = [Symbol]
25 25 type RuleName = String
26 26 type RuleNumber = Int
1  src/cfg.cabal
@@ -25,6 +25,7 @@ library
25 25 CFG.Examples
26 26 -- other-modules:
27 27 build-depends: base < 5,
  28 + mtl >= 2 && < 3,
28 29 array < 0.5,
29 30 containers < 0.6,
30 31 parsec >= 3 && < 3.2,

0 comments on commit 3bd8f14

Please sign in to comment.
Something went wrong with that request. Please try again.