jgm / markdown-peg

A Haskell implementation of markdown using a PEG grammar

This URL has Read+Write access

markdown-peg / Markdown.hs
100644 381 lines (333 sloc) 20.873 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
{-# OPTIONS_GHC -fglasgow-exts #-}
 
{- Markdown.hs - Haskell implementation of markdown using PEG grammar.
(c) 2008 John MacFarlane
Released under the GPL
-}
 
import Text.Parsers.Frisby
import Text.Parsers.Frisby.Char
import Text.Pandoc.XML
import Data.Char (toUpper)
import Text.PrettyPrint.HughesPJ hiding (text, char, (<>), empty)
import qualified Text.PrettyPrint.HughesPJ as P (text, char, (<>), empty)
import System.Environment
 
-- Uncomment the following two lines for UTF8 support (requires utf8-string library):
-- import System.IO.UTF8
-- import Prelude hiding (getContents, putStrLn, readFile)
 
main :: IO ()
main = do
  argv <- getArgs
  c <- if null argv
          then getContents
          else mapM readFile argv >>= return . unlines
  let (blocks, remaining) = runPeg doc $ tabFilter tabStop (c ++ "\n")
  if (not . null) remaining
     then error $ "Parse failed at: " ++ take 35 remaining
     else do
       -- extract link references first, then convert to HTML
       let refs = map (\(Reference l (Src (s,t))) -> (l, (s,t))) $ filter isReference blocks
       putStrLn $ render $ vcat $ map (blockToHtml refs) blocks
 
-- | A link target: a URL and title, or a reference, or nothing.
data Target = Src (String, String) -- ^ (URL, title)
            | Ref [Inline] String -- ^ the string contains spaces before the reference: e.g. " " in "[label] [ref]"
            | Null -- ^ this is used for shortcut references: "[label]"
            deriving (Show, Read, Eq)
 
-- | An inline (span-level) element.
data Inline =
    Text String
  | Entity String -- ^ an entity reference without the '&' and ';'
  | Space
  | LineBreak
  | Emph [Inline]
  | Strong [Inline]
  | Code String
  | Link [Inline] Target
  | Image [Inline] Target
  | Html String -- ^ raw HTML
  deriving (Show, Read, Eq)
 
-- | A block element.
data Block =
    Para [Inline]
  | Plain [Inline]
  | Heading Int [Inline]
  | BlockQuote [Block]
  | BulletList [[Block]]
  | OrderedList [[Block]]
  | HtmlBlock String
  | Verbatim String
  | HorizontalRule
  | Reference [Inline] Target -- ^ a link reference: e.g. [label]: /url "title"
  | Markdown String -- ^ unprocessed markdown, to be parsed into blocks
  deriving (Show, Read, Eq)
 
{- Reading the grammar:
 
Standard PEG Frisby
------------ -----------------------
A* many A
A+ many1 A
A? optional A (returns () if no match)
option B A (same thing but returns B if no match)
!A doesNotMatch A
&A peek A
A / B A // B
[abc] oneOf "abc"
[^abc] noneOf "abc"
'a' char 'a'
"abc" text "abc"
A B A <> B -- parses A, then B, and returns a pair (A, B)
A ->> B -- parses A, then B, and returns A
A <<- B -- parses A, then B, and returns B
A <++> B -- parses A, then B, and returns concatenation of A and B
eof eof
A <- B A <- newRule $ B
 
Other Frisby peculiarities:
 
X ## f -- parses X and passes the result through function f
X ##> a -- parses X and returns constant a
 
-}
 
-- | The PEG grammar definition for Markdown.
doc :: forall s . PM s (P s ([Block], String))
doc = mdo
 
  -- characters and tokens
  spaceChar <- newRule $ oneOf " \t"
  newline <- newRule $ text "\n"
  sp <- newRule $ many spaceChar
  spnl <- newRule $ sp <++> option "" (newline <++> sp)
  specialChar <- newRule $ oneOf "*_`&[]<!\\"
  normalChar <- newRule $ escapedChar // (doesNotMatch (specialChar // spaceChar) ->> doesNotMatch newline ->> anyChar)
  escapedChar <- newRule $ char '\\' ->> anyChar
 
  -- strings
  nonindentSpace <- newRule $ option "" (text " " // text " " // text " ")
  indent <- newRule $ text "\t" // text " "
  indentedLine <- newRule $ indent ->> anyline
  optionallyIndentedLine <- newRule $ indent ->> anyline
  anyline <- newRule $ many (doesNotMatch newline ->> doesNotMatch eof ->> anyChar) <++> option "" newline
  blankline <- newRule $ sp <++> newline
  blockquoteLine <- newRule $ nonindentSpace ->> char '>' ->> optional (char ' ') ->> anyline
  quoted <- newRule $ (text "\"" <++> many (noneOf "\"") <++> text "\"") //
                              (text "'" <++> many (noneOf "'") <++> text "'")
  htmlAttribute <- newRule $ many1 alphaNum <++> spnl <++> option "" (text "=" <++> spnl <++>
                                (quoted // many1 (doesNotMatch spaces ->> anyChar))) <++> spnl
  htmlComment <- newRule $ text "<!--" <++> many (doesNotMatch (text "-->") ->> anyChar) <++> text "-->"
  htmlTag <- newRule $ text "<" <++> spnl <++> option "" (text "/") <++> many1 alphaNum <++>
                              spnl <++> (many htmlAttribute ## concat) <++> option "" (text "/") <++> text ">"
 
  -- inlines
  inline <- newRule $ strong // emph // code // endline // spaces // link // image // autolink //
                      rawHtml // str // entity // special
 
  emph <- newRule $ emphStar // emphUl
  oneStar <- newRule $ char '*' <<- doesNotMatch oneStar
  emphStar <- newRule $ oneStar ->> doesNotMatch spaceChar ->> doesNotMatch newline ->>
                           many1 (strong // (doesNotMatch (spnl ->> oneStar) ->> inline)) <<- oneStar ## Emph
  oneUl <- newRule $ char '_' <<- doesNotMatch oneUl
  emphUl <- newRule $ oneUl ->> doesNotMatch spaceChar ->> doesNotMatch newline ->>
                           many1 (strong // (doesNotMatch (spnl ->> oneUl) ->> inline)) <<- oneUl <<-
                           doesNotMatch alphaNum ## Emph
 
  strong <- newRule $ strongStar // strongUl
  twoStar <- newRule $ text "**" <<- doesNotMatch twoStar
  twoUl <- newRule $ text "__" <<- doesNotMatch twoStar
  strongStar <- newRule $ twoStar ->> doesNotMatch spaceChar ->> doesNotMatch newline ->>
                           many1 (doesNotMatch (spnl ->> twoStar) ->> inline) <<- twoStar ## Strong
  strongUl <- newRule $ twoUl ->> doesNotMatch spaceChar ->> doesNotMatch newline ->>
                           many1 (doesNotMatch (spnl ->> twoUl) ->> inline) <<- twoUl <<-
                           doesNotMatch alphaNum ## Strong
  
  let ticks n = text (replicate n '`') <<- doesNotMatch (char '`')
  let betweenTicks n = ticks n ->> many1 (many1 (noneOf "`") // doesNotMatch (ticks n) ->> many1 (char '`')) <<-
                         ticks n ## lrstrip ' ' . concat
  code <- newRule $ peek (char '`') ->> choice (map betweenTicks $ reverse [1..10]) ## Code
 
  rawHtml <- newRule $ (htmlComment // htmlTag) ## Html
 
  str <- newRule $ many1 normalChar ## Text
  special <- newRule $ specialChar ## Text . (: [])
  spaces <- newRule $ many1 spaceChar ##> Space
  endline <- newRule $ optional (text " ") <> newline <> doesNotMatch blankline <> doesNotMatch eof ##> Space
  linebreak <- newRule $ text " " ->> sp ->> endline ##> LineBreak
  entity <- newRule $ char '&' ->> (text "#" <++> (text "x" // text "X") <++> many1 hexDigit //
                             text "#" <++> many1 digit // many1 alphaNum) <<- char ';' ## Entity
 
  label <- newRule $ char '[' ->> many (doesNotMatch (char ']') ->> inline) <<- char ']'
  title <- newRule $ char '"' ->> many (doesNotMatch (char '"' <> sp <> (text ")" // newline)) ->>
                               doesNotMatch newline ->> anyChar) <<- char '"' //
                               char '\'' ->> many (doesNotMatch (char '\'' <> sp <> (text ")" // newline)) ->>
                               doesNotMatch newline ->> anyChar) <<- char '\''
  source <- newRule $ char '<' ->> source' <<- char '>' // source'
  source' <- newRule $ many (many1 (noneOf "()> \n\t") // (text "(" <++> source' <++> text ")") //
                                      (text "<" <++> source' <++> text ">")) ## concat
  sourceAndTitle <- newRule $ char '(' ->> sp ->> source <<- spnl <> option "" title <<- sp <<- char ')'
  explicitLink <- newRule $ label <> spnl ->> sourceAndTitle ## (\(l, s) -> Link l (Src s))
  autolinkUrl <- newRule $ char '<' ->> many1 alpha <++> text "://" <++>
                              many1 (doesNotMatch newline ->> doesNotMatch (char '>') ->> anyChar) <<- char '>' ##
                              (\s -> Link [Text s] (Src (s, "")))
  autolinkEmail <- newRule $ char '<' ->> many1 alpha <++> text "@" <++>
                              many1 (doesNotMatch newline ->> doesNotMatch (char '>') ->> anyChar) <<- char '>' ##
                              (\s -> Link [Text s] (Src ("mailto:" ++ s, "")))
  autolink <- newRule $ autolinkUrl // autolinkEmail
  referenceLink <- newRule $ label <> spnl <> label ## (\((l1,s), l2) -> Link l1 (Ref l2 s)) //
                             label ## (\l -> Link l Null)
 
  link <- newRule $ explicitLink // referenceLink
  image <- newRule $ char '!' ->> link ## (\(Link x y) -> Image x y)
 
  -- blocks
  block <- newRule $ blockquote // verbatim // reference // htmlBlock // heading // list // horizontalRule //
                            para // plain
  para <- newRule $ many1 inline <<- newline <<- many1 blankline ## Para
  plain <- newRule $ many1 inline <<- optional blankline ## Plain
  blockquote <- newRule $ many1 (many1 blockquoteLine <++> many (doesNotMatch blankline ->> anyline) <++>
                                 many blankline ## concat) ## (\ls -> BlockQuote [Markdown $ concat ls ++ "\n"])
  
  let setextHeadingWith lev c = many1 (doesNotMatch endline ->> inline) <<- newline <<-
                                text (replicate 3 c) <<- (many (char c)) <<- newline ## Heading lev
  setextHeading <- newRule $ setextHeadingWith 1 '=' // setextHeadingWith 2 '-'
 
  let atxHeadingFor lev = text (replicate lev '#') ->>
                          many1 (doesNotMatch endline ->> doesNotMatch (char '#') ->> inline) <<-
                          many (oneOf "# \t") <<- newline ## Heading lev
  atxHeading <- newRule $ choice $ map atxHeadingFor [6, 5..1]
 
  heading <- newRule $ (atxHeading // setextHeading) <<- many blankline
 
  let horizontalRuleWith c = nonindentSpace ->> char c ->> sp ->> char c ->> sp ->> char c ->>
                             many (sp ->> char c) ->> sp ->> newline ->> many1 blankline ##> HorizontalRule
  horizontalRule <- newRule $ choice $ map horizontalRuleWith ['*', '_', '-']
 
  verbatim <- newRule $ many1 (doesNotMatch blankline ->> indentedLine) <++>
                              (many (many1 (optional indent ->> blankline) <++>
                              many1 (doesNotMatch blankline ->> indentedLine)) ## concat) <<-
                              many blankline ## Verbatim . concat
 
  list <- newRule $ bulletList // orderedList
 
  bullet <- newRule $ nonindentSpace ->> oneOf "+*-" <<- many1 spaceChar ## (: [])
  bulletList <- newRule $ bulletListTight // bulletListLoose
  bulletListTight <- newRule $ many1 (bulletListItem ## (\s -> [Markdown s])) <<- many blankline <<-
                                 doesNotMatch bulletListLoose ## BulletList
  bulletListLoose <- newRule $ many1 ((bulletListItem <<- many blankline) ## (\s -> [Markdown $ s ++ "\n\n"])) ##
                                 BulletList
  bulletListItem <- newRule $ doesNotMatch horizontalRule ->> bullet ->>
                                  listBlock <++> (many listContinuationBlock ## concat)
  listBlock <- newRule $ anyline <++> (many (doesNotMatch (optional indent ->> (bulletListItem // orderedListItem))
                                    ->> doesNotMatch blankline ->> doesNotMatch (indent ->> (bullet // enumerator)) ->>
                                    optionallyIndentedLine) ## concat)
  listContinuationBlock <- newRule $ ((many1 blankline ## concat) // unit "\0") <++>
                                     (many1 (indent ->> listBlock) ## concat)
 
  enumerator <- newRule $ nonindentSpace ->> many1 digit <<- char '.' <<- many1 spaceChar
  orderedList <- newRule $ orderedListTight // orderedListLoose
  orderedListTight <- newRule $ many1 (orderedListItem ## (\s -> [Markdown s])) <<- many blankline <<-
                                doesNotMatch orderedListLoose ## OrderedList
  orderedListLoose <- newRule $ many1 ((orderedListItem <<- many blankline) ## (\s -> [Markdown $ s ++ "\n\n"])) ##
                                OrderedList
  orderedListItem <- newRule $ enumerator ->> listBlock <++> (many listContinuationBlock ## concat)
 
 
  let htmlBlockOpening tag = text "<" <++> spnl <++> text tag <++> spnl <++> (many htmlAttribute ## concat)
  let htmlBlockSolo tag = htmlBlockOpening tag <++> text "/" <++> spnl <++> text ">"
  let htmlBlockWithEnd tag = htmlBlockOpening tag <++> text ">" <++>
                             (many (doesNotMatch (htmlBlockEndFor tag) ->>
                                   (htmlBlockAny // many1 (noneOf "<") // text "<")) ## concat) <++>
                             htmlBlockEndFor tag
  let htmlBlockEndFor tag = text "<" <++> spnl <++> text "/" <++> text tag <++> spnl <++> text ">"
  let htmlBlockFor tag = htmlBlockSolo tag // htmlBlockWithEnd tag
  let blockTags = ["address", "blockquote", "center", "dir", "div", "dl", "fieldset", "form", "h1", "h2", "h3",
                              "h4", "h5", "h6", "hr", "isindex", "menu", "noframes", "noscript", "ol", "p", "pre", "table",
                              "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script"]
 
  htmlBlockAny <- newRule $ choice $ map htmlBlockFor (blockTags ++ map (map toUpper) blockTags)
  htmlBlock <- newRule $ nonindentSpace <++> (htmlComment // htmlBlockAny) ## HtmlBlock
 
  -- references
  reference <- newRule $ nonindentSpace ->> label <> char ':' ->> spnl ->>
                                many1 (doesNotMatch spaceChar ->> doesNotMatch newline ->> anyChar) <>
                                spnl ->> option "" title <<- many blankline ## (\((l,s),t) -> Reference l (Src (s,t)))
 
  -- document - returns (block list, unparsed text)
  document <- newRule $ (many (many blankline ->> block) <<- many blankline) <> rest <<- eof
  return document
 
--
-- Convert inlines and blocks to HTML
--
 
-- | Convert inline element to HTML.
inlineToHtml :: [([Inline],(String, String))] -- ^ list of link references
             -> Inline -- ^ inline element
             -> Doc
inlineToHtml refs i =
  case i of
    Text s -> P.text (escapeStringForXML s)
    Entity s -> P.char '&' P.<> P.text s P.<> P.char ';'
    Space -> P.char ' '
    LineBreak -> selfClosingTag "br" []
    Code s -> inTagsSimple "code" $ P.text $ escapeStringForXML s
    Emph xs -> inTagsSimple "em" $ hcat $ map (inlineToHtml refs) xs
    Strong xs -> inTagsSimple "strong" $ hcat $ map (inlineToHtml refs) xs
    Html s -> P.text s
    -- an autolink <http://google.com> or [explicit link](google.com) with no title
    Link l (Src (u,"")) -> inTags False "a" [("href", u)] $ hcat $ map (inlineToHtml refs) l
    -- an explicit link with a title: [like this](google.com "title")
    Link l (Src (u,t)) -> inTags False "a" [("href", u), ("title", t)] $ hcat $ map (inlineToHtml refs) l
    -- a shortcut-style reference link: [like this]
    Link l Null -> case lookup l refs of
                                Just (u, "") -> inTags False "a" [("href", u)] $ hcat $ map (inlineToHtml refs) l
                                Just (u, t) -> inTags False "a" [("href", u), ("title", t)] $ hcat $
                                                 map (inlineToHtml refs) l
                                Nothing -> hcat $ map (inlineToHtml refs) $ [Text "["] ++ l ++ [Text "]"]
    -- a regular reference link: [like][this] or [like] [this]
    Link l (Ref r s) -> let r' = if null r then l else r
                           in case lookup r' refs of
                                 Just (u, "") -> inTags False "a" [("href", u)] $ hcat $ map (inlineToHtml refs) l
                                 Just (u, t) -> inTags False "a" [("href", u), ("title", t)] $ hcat $
                                                  map (inlineToHtml refs) l
                                 Nothing -> hcat $ map (inlineToHtml refs) $ [Text "["] ++ l ++
                                                          [Text $ "]" ++ s ++ "["] ++ r ++ [Text "]"]
    Image l (Src (u,t)) -> selfClosingTag "img" [("src", u), ("title", t),
                                  ("alt", render $ hcat $ map (inlineToHtml refs) l)]
    -- a shortcut-style reference link: ![like this]
    Image l Null -> case lookup l refs of
                                Just (u, t) -> inlineToHtml refs $ Image l (Src (u,t))
                                Nothing -> hcat $ map (inlineToHtml refs) $ [Text "!["] ++ l ++ [Text "]"]
    -- a regular reference link: ![like][this] or ![like] [this]
    Image l (Ref r s) -> let r' = if null r then l else r
                           in case lookup r' refs of
                                 Just (u, t) -> inlineToHtml refs $ Image l (Src (u, t))
                                 Nothing -> hcat $ map (inlineToHtml refs) $ [Text "!["] ++ l ++
                                                          [Text $ "]" ++ s ++ "["] ++ r ++ [Text "]"]
 
-- | Convert block element to HTML.
blockToHtml :: [([Inline],(String, String))] -- ^ list of link references
            -> Block -- ^ block element to convert
            -> Doc
blockToHtml refs block =
  case block of
    Para xs -> inTagsSimple "p" $ wrap refs $ lrstrip Space xs
    Plain xs -> wrap refs $ lrstrip Space xs
    Heading lev xs -> inTagsSimple ("h" ++ show lev) $ hcat $ map (inlineToHtml refs) $ lrstrip Space xs
    HorizontalRule -> selfClosingTag "hr" []
    BlockQuote xs -> inTagsIndented "blockquote" $ vcat $ map (blockToHtml refs) xs
    Verbatim s -> inTagsSimple "pre" $ inTagsSimple "code" $ P.text $ escapeStringForXML s
    BulletList items -> inTagsIndented "ul" $ vcat $
                           map (\item -> inTagsSimple "li" $ vcat $ map (blockToHtml refs) item) items
    OrderedList items -> inTagsIndented "ol" $ vcat $
                           map (\item -> inTagsSimple "li" $ vcat $ map (blockToHtml refs) item) items
    Markdown s -> -- handle a raw chunk of markdown, e.g. a list item or block quote contents
                         -- if the chunk contains \0, it is split at that point into two chunks that
                         -- are parsed separately. This allows correct handling of lists like:
                         -- - item
                         -- - sub
                         -- - sub
                         let (a, b) = break (=='\0') s
                             (parsed, remaining) = runPeg doc a
                             first = vcat $ map (blockToHtml refs) parsed
                         in if null remaining
                                then if null b
                                        then first
                                        else first $$ blockToHtml refs (Markdown $ tail b)
                                else error $ "Parse failed at: " ++ take 35 remaining
    Reference _ _ -> P.empty
    HtmlBlock s -> P.text s
 
--
-- Wrapping code and other utilities from pandoc
--
 
-- | Take list of inline elements and return wrapped doc.
wrap :: [([Inline],(String, String))] -> [Inline] -> Doc
wrap refs lst = fsep $ map (hcat . map (inlineToHtml refs)) (splitBy Space lst)
 
-- | Split list by groups of one or more sep.
splitBy :: (Eq a) => a -> [a] -> [[a]]
splitBy _ [] = []
splitBy s lst =
  let (x, xs) = break (== s) lst
      xs' = dropWhile (== s) xs
  in x:(splitBy s xs')
 
-- | Strip leading and trailing elements.
lrstrip :: (Eq a) => a -> [a] -> [a]
lrstrip x = reverse . dropWhile (== x) . reverse . dropWhile (== x)
 
tabStop :: Int
tabStop = 4
 
tabFilter :: Int -> String -> String
tabFilter _ [] = ""
tabFilter _ ('\r':'\n':xs) = '\n' : tabFilter tabStop xs
tabFilter _ ('\r':xs) = '\n' : tabFilter tabStop xs
tabFilter _ ('\n':xs) = '\n' : tabFilter tabStop xs
tabFilter spsToNextStop ('\t':xs) = replicate spsToNextStop ' ' ++ tabFilter tabStop xs
tabFilter 1 (x:xs) = x:(tabFilter tabStop xs)
tabFilter spsToNextStop (x:xs) = x:(tabFilter (spsToNextStop - 1) xs)
 
isReference :: Block -> Bool
isReference (Reference _ _) = True
isReference _ = False