0
# BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
0
# * Put the StringScanner in the render state for thread-safety.
0
-### BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
0
-class BlueCloth < String
0
- ### Exception class for formatting errors.
0
- class FormatError < RuntimeError
0
- ### Create a new FormatError with the given source +str+ and an optional
0
- ### message about the +specific+ error.
0
- def initialize( str, specific=nil )
0
- msg = "Bad markdown format near %p: %s" % [ str, specific ]
0
- msg = "Bad markdown format near %p" % str
0
+# BlueCloth is a Ruby implementation of Markdown, a text-to-HTML conversion
0
- # Rendering state struct. Keeps track of URLs, titles, and HTML blocks
0
- # midway through a render. I prefer this to the globals of the Perl version
0
- # because globals make me break out in hives. Or something.
0
- RenderState = Struct::new( "RenderState", :urls, :titles, :html_blocks, :log )
0
- # Tab width for #detab! if none is specified
0
- # The tag-closing string -- set to '>' for HTML
0
- EmptyElementSuffix = "/>";
0
- # Table of MD5 sums for escaped characters
0
- '\\`*_{}[]()#.!'.split(//).each {|char|
0
- hash = Digest::MD5::hexdigest( char )
0
- EscapeTable[ char ] = {
0
- :md5re => Regexp::new( hash ),
0
- :re => Regexp::new( '\\\\' + Regexp::escape(char) ),
0
- #################################################################
0
- ### I N S T A N C E M E T H O D S
0
- #################################################################
0
- ### Create a new BlueCloth string.
0
- def initialize( content="", *restrictions )
0
+ # Create a new BlueCloth string.
0
+ def initialize(content = "", *restrictions)
0
@log = Logger::new( $deferr )
0
@@ -137,16 +73,11 @@ class BlueCloth < String
0
restrictions.flatten.each {|r| __send__("#{r}=", true) }
0
@log.debug "String is: %p" % self
0
# Filters for controlling what gets output for untrusted input. (But really,
0
# you're filtering bad stuff out of untrusted input at submission-time via
0
# untainting, aren't you?)
0
@@ -156,989 +87,7 @@ class BlueCloth < String
0
# so this isn't used by anything.
0
attr_accessor :fold_lines
0
- ### Render Markdown-formatted text in this string object as HTML and return
0
- ### it. The parameter is for compatibility with RedCloth, and is currently
0
- ### unused, though that may change in the future.
0
- def to_html( lite=false )
0
- # Create a StringScanner we can reuse for various lexing tasks
0
- @scanner = StringScanner::new( '' )
0
- # Make a structure to carry around stuff that gets placeholdered out of
0
- rs = RenderState::new( {}, {}, {} )
0
- # Make a copy of the string with normalized line endings, tabs turned to
0
- # spaces, and a couple of guaranteed newlines at the end
0
- text = self.gsub( /\r\n?/, "\n" ).detab
0
- @log.debug "Normalized line-endings: %p" % text
0
- # Filter HTML if we're asked to do so
0
- text.gsub!( "<", "&lt;" )
0
- text.gsub!( ">", "&gt;" )
0
- @log.debug "Filtered HTML: %p" % text
0
- # Simplify blank lines
0
- text.gsub!( /^ +$/, '' )
0
- @log.debug "Tabs -> spaces/blank lines stripped: %p" % text
0
- # Replace HTML blocks with placeholders
0
- text = hide_html_blocks( text, rs )
0
- @log.debug "Hid HTML blocks: %p" % text
0
- @log.debug "Render state: %p" % rs
0
- # Strip link definitions, store in render state
0
- text = strip_link_definitions( text, rs )
0
- @log.debug "Stripped link definitions: %p" % text
0
- @log.debug "Render state: %p" % rs
0
- # Escape meta-characters
0
- text = escape_special_chars( text )
0
- @log.debug "Escaped special characters: %p" % text
0
- # Transform block-level constructs
0
- text = apply_block_transforms( text, rs )
0
- @log.debug "After block-level transforms: %p" % text
0
- # Now swap back in all the escaped characters
0
- text = unescape_special_chars( text )
0
- @log.debug "After unescaping special characters: %p" % text
0
- ### Convert tabs in +str+ to spaces.
0
- def detab( tabwidth=TabWidth )
0
- copy.detab!( tabwidth )
0
- ### Convert tabs to spaces in place and return self if any were converted.
0
- def detab!( tabwidth=TabWidth )
0
- newstr = self.split( /\n/ ).collect {|line|
0
- line.gsub( /(.*?)\t/ ) do
0
- $1 + ' ' * (tabwidth - $1.length % tabwidth)
0
- self.replace( newstr )
0
- ### Do block-level transforms on a copy of +str+ using the specified render
0
- ### state +rs+ and return the results.
0
- def apply_block_transforms( str, rs )
0
- # Port: This was called '_runBlockGamut' in the original
0
- @log.debug "Applying block transforms to:\n %p" % str
0
- text = transform_headers( str, rs )
0
- text = transform_hrules( text, rs )
0
- text = transform_lists( text, rs )
0
- text = transform_code_blocks( text, rs )
0
- text = transform_block_quotes( text, rs )
0
- text = transform_auto_links( text, rs )
0
- text = hide_html_blocks( text, rs )
0
- text = form_paragraphs( text, rs )
0
- @log.debug "Done with block transforms:\n %p" % text
0
- ### Apply Markdown span transforms to a copy of the specified +str+ with the
0
- ### given render state +rs+ and return it.
0
- def apply_span_transforms( str, rs )
0
- @log.debug "Applying span transforms to:\n %p" % str
0
- str = transform_code_spans( str, rs )
0
- str = encode_html( str )
0
- str = transform_images( str, rs )
0
- str = transform_anchors( str, rs )
0
- str = transform_italic_and_bold( str, rs )
0
- str.gsub!( / {2,}\n/, "<br#{EmptyElementSuffix}\n" )
0
- @log.debug "Done with span transforms:\n %p" % str
0
- # The list of tags which are considered block-level constructs and an
0
- # alternation pattern suitable for use in regexps made from the list
0
- StrictBlockTags = %w[ p div h[1-6] blockquote pre table dl ol ul script noscript
0
- form fieldset iframe math ins del ]
0
- StrictTagPattern = StrictBlockTags.join('|')
0
- LooseBlockTags = StrictBlockTags - %w[ins del]
0
- LooseTagPattern = LooseBlockTags.join('|')
0
- # tags for inner block must be indented.
0
- StrictBlockRegex = %r{
0
- <(#{StrictTagPattern}) # Start tag: \1
0
- (.*\n)*? # Any number of lines, minimal match
0
- </\1> # Matching end tag
0
- [ ]* # trailing spaces
0
- $ # End of line or document
0
- # More-liberal block-matching
0
- <(#{LooseTagPattern}) # start tag: \1
0
- (.*\n)*? # Any number of lines, minimal match
0
- .*</\1> # Anything + Matching end tag
0
- [ ]* # trailing spaces
0
- $ # End of line or document
0
- # Special case for <hr />.
0
- \A\n? # Start of doc + optional \n
0
- .*\n\n # anything + blank line
0
- ([^<>])*? # Attributes
0
- $ # followed by a blank line or end of document
0
- ### Replace all blocks of HTML in +str+ that start in the left margin with
0
- def hide_html_blocks( str, rs )
0
- @log.debug "Hiding HTML blocks in %p" % str
0
- # Tokenizer proc to pass to gsub
0
- tokenize = lambda {|match|
0
- key = Digest::MD5::hexdigest( match )
0
- rs.html_blocks[ key ] = match
0
- @log.debug "Replacing %p with %p" % [ match, key ]
0
- @log.debug "Finding blocks with the strict regex..."
0
- rval.gsub!( StrictBlockRegex, &tokenize )
0
- @log.debug "Finding blocks with the loose regex..."
0
- rval.gsub!( LooseBlockRegex, &tokenize )
0
- @log.debug "Finding hrules..."
0
- rval.gsub!( HruleBlockRegex ) {|match| $1 + tokenize[$2] }
0
- # Link defs are in the form: ^[id]: url "optional title"
0
- ^[ ]*\[(.+)\]: # id = $1
0
- \n? # maybe *one* newline
0
- \n? # maybe one newline
0
- # Titles are delimited by "quotes" or (parens).
0
- [")] # Matching ) or "
0
- )? # title is optional
0
- ### Strip link definitions from +str+, storing them in the given RenderState
0
- def strip_link_definitions( str, rs )
0
- str.gsub( LinkRegex ) {|match|
0
- id, url, title = $1, $2, $3
0
- rs.urls[ id.downcase ] = encode_html( url )
0
- rs.titles[ id.downcase ] = title.gsub( /"/, "&quot;" )
0
- ### Escape special characters in the given +str+
0
- def escape_special_chars( str )
0
- @log.debug " Escaping special characters"
0
- # The original Markdown source has something called '$tags_to_skip'
0
- # declared here, but it's never used, so I don't define it.
0
- tokenize_html( str ) {|token, str|
0
- @log.debug " Adding %p token %p" % [ token, str ]
0
- # Within tags, encode * and _
0
- gsub( /\*/, EscapeTable['*'][:md5] ).
0
- gsub( /_/, EscapeTable['_'][:md5] )
0
- # Encode backslashed stuff in regular text
0
- text += encode_backslash_escapes( str )
0
- raise TypeError, "Unknown token type %p" % token
0
- @log.debug " Text with escapes is now: %p" % text
0
- ### Swap escaped special characters in a copy of the given +str+ and return
0
- def unescape_special_chars( str )
0
- EscapeTable.each {|char, hash|
0
- @log.debug "Unescaping escaped %p with %p" % [ char, hash[:md5re] ]
0
- str.gsub!( hash[:md5re], char )
0
- ### Return a copy of the given +str+ with any backslashed special character
0
- ### in it replaced with MD5 placeholders.
0
- def encode_backslash_escapes( str )
0
- # Make a copy with any double-escaped backslashes encoded
0
- text = str.gsub( /\\\\/, EscapeTable['\\'][:md5] )
0
- EscapeTable.each_pair {|char, esc|
0
- text.gsub!( esc[:re], esc[:md5] )
0
- ### Transform any Markdown-style horizontal rules in a copy of the specified
0
- ### +str+ and return it.
0
- def transform_hrules( str, rs )
0
- @log.debug " Transforming horizontal rules"
0
- str.gsub( /^( ?[\-\*_] ?){3,}$/, "\n<hr#{EmptyElementSuffix}\n" )
0
- # Patterns to match and transform lists
0
- ListMarkerOl = %r{\d+\.}
0
- ListMarkerUl = %r{[*+-]}
0
- ListMarkerAny = Regexp::union( ListMarkerOl, ListMarkerUl )
0
- ^[ ]{0,#{TabWidth - 1}} # Indent < tab width
0
- (#{ListMarkerAny}) # unordered or ordered ($1)
0
- [ ]+ # At least one space
0
- (?m:.+?) # item content (include newlines)
0
- \n{2,} # Blank line...
0
- (?=\S) # ...followed by non-space
0
- (?![ ]* # ...but not another item
0
- ### Transform Markdown-style lists in a copy of the specified +str+ and
0
- def transform_lists( str, rs )
0
- @log.debug " Transforming lists at %p" % (str[0,100] + '...')
0
- str.gsub( ListRegexp ) {|list|
0
- @log.debug " Found list %p" % list
0
- list_type = (ListMarkerUl.match(bullet) ? "ul" : "ol")
0
- list.gsub!( /\n{2,}/, "\n\n\n" )
0
- %{<%s>\n%s</%s>\n} % [
0
- transform_list_items( list, rs ),
0
- # Pattern for transforming list items
0
- (\n)? # leading line = $1
0
- (^[ ]*) # leading whitespace = $2
0
- (#{ListMarkerAny}) [ ]+ # list marker = $3
0
- ((?m:.+?) # list item text = $4
0
- (?= \n* (\z | \2 (#{ListMarkerAny}) [ ]+))
0
- ### Transform list items in a copy of the given +str+ and return it.
0
- def transform_list_items( str, rs )
0
- @log.debug " Transforming list items"
0
- # Trim trailing blank lines
0
- str = str.sub( /\n{2,}\z/, "\n" )
0
- str.gsub( ListItemRegexp ) {|line|
0
- @log.debug " Found item line %p" % line
0
- leading_line, item = $1, $4
0
- if leading_line or /\n{2,}/.match( item )
0
- @log.debug " Found leading line or item has a blank"
0
- item = apply_block_transforms( outdent(item), rs )
0
- # Recursion for sub-lists
0
- @log.debug " Recursing for sublist"
0
- item = transform_lists( outdent(item), rs ).chomp
0
- item = apply_span_transforms( item, rs )
0
- %{<li>%s</li>\n} % item
0
- # Pattern for matching codeblocks
0
- ( # $1 = the code block
0
- (?:[ ]{#{TabWidth}} | \t) # a tab or tab-width of spaces
0
- (^[ ]{0,#{TabWidth - 1}}\S|\Z) # Lookahead for non-space at
0
- # line-start, or end of doc
0
- ### Transform Markdown-style codeblocks in a copy of the specified +str+ and
0
- def transform_code_blocks( str, rs )
0
- @log.debug " Transforming code blocks"
0
- str.gsub( CodeBlockRegexp ) {|block|
0
- # Generate the codeblock
0
- %{\n\n<pre><code>%s\n</code></pre>\n\n%s} %
0
- [ encode_code( outdent(codeblock), rs ).rstrip, remainder ]
0
- # Pattern for matching Markdown blockquote blocks
0
- BlockQuoteRegexp = %r{
0
- ^[ ]*>[ ]? # '>' at the start of a line
0
- .+\n # rest of the first line
0
- (?:.+\n)* # subsequent consecutive lines
0
- PreChunk = %r{ ( ^ \s* <pre> .+? </pre> ) }xm
0
- ### Transform Markdown-style blockquotes in a copy of the specified +str+
0
- def transform_block_quotes( str, rs )
0
- @log.debug " Transforming block quotes"
0
- str.gsub( BlockQuoteRegexp ) {|quote|
0
- @log.debug "Making blockquote from %p" % quote
0
- quote.gsub!( /^ *> ?/, '' ) # Trim one level of quoting
0
- quote.gsub!( /^ +$/, '' ) # Trim whitespace-only lines
0
- indent = " " * TabWidth
0
- quoted = %{<blockquote>\n%s\n</blockquote>\n\n} %
0
- apply_block_transforms( quote, rs ).
0
- gsub( PreChunk ) {|m| m.gsub(/^#{indent}/o, '') }
0
- @log.debug "Blockquoted chunk is: %p" % quoted
0
- AutoAnchorURLRegexp = /<((https?|ftp):[^'">\s]+)>/
0
- AutoAnchorEmailRegexp = %r{
0
- [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
0
- ### Transform URLs in a copy of the specified +str+ into links and return
0
- def transform_auto_links( str, rs )