require 'set'

module Ludy
  # 2008-05-09 godfat
  #
  # Text-to-XHTML helpers: escaping of tags that are not explicitly allowed,
  # turning URLs / e-mail addresses into links, and turning newlines into
  # <br /> elements.
  #
  # The URL patterns are translated from drupal-6.2/modules/filter/filter.module.
  # Hpricot (a third-party gem) is only required lazily by the methods that
  # parse HTML, so the purely regexp-based helpers work without it.
  module XhtmlFormatter
    module_function

    # Formats a whole article.
    #
    # html         - the raw article markup (String).
    # allowed_tags - tag names (Symbols, e.g. :pre, :a) that are emitted as
    #                real markup; every other tag has its '<' escaped.
    #
    # Returns the formatted markup as a String.
    def format_article(html, *allowed_tags)
      require 'rubygems'
      require 'hpricot'
      allowed_tags = allowed_tags.to_set
      escaped = XhtmlFormatter.escape_all_inside_pre(html, allowed_tags)
      XhtmlFormatter.format_article_elems(Hpricot.parse(escaped), allowed_tags)
    end

    # Links URLs found in the top-level text nodes of +html+.
    # Only direct text children of the parsed document are rewritten; text
    # nested inside elements is left as it is.
    #
    # attrs - extra attributes (name => value) added to every generated <a>.
    def format_autolink(html, attrs = {})
      require 'rubygems'
      require 'hpricot'
      doc = Hpricot.parse(html)
      doc.each_child do |child|
        next unless child.kind_of?(Hpricot::Text)
        child.content = format_url(child.content, attrs)
      end
      doc.to_html
    end

    # Regexp-only autolinker, translated from
    # drupal-6.2/modules/filter/filter.module (_filter_url).
    #
    # A URL / address is only linked when it is preceded by whitespace, '(' or
    # an opening <p>/<li>/<br> and (for e-mail and www addresses) followed by
    # whitespace, ')' or the matching closing markup.
    #
    # attrs - extra attributes (name => value) added to the generated <a> tags
    #         of absolute and www links (mailto links carry none, as in Drupal).
    def format_autolink_regexp(text, attrs = {})
      attrs = XhtmlFormatter.attrs_to_s(attrs)

      # Pad both sides like the Drupal original so that matches touching the
      # start or the end of the text still satisfy the boundary groups.
      padded = " #{text} "

      # Match absolute URLs.
      linked = padded.gsub(%r{(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)])?)}i) do
        m = Regexp.last_match
        url = m[2]
        caption = XhtmlFormatter.trim(url)
        "#{m[1]}<a href=\"#{url}\" title=\"#{url}\"#{attrs}>#{caption}</a>#{m[5]}"
      end

      # Match e-mail addresses.
      linked = linked.gsub(%r{(<p>|<li>|<br\s*/?>|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))}i,
                          '\1<a href="mailto:\2">\2</a>\3')

      # Match www domains/addresses.
      linked = linked.gsub(%r{(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))}i) do
        m = Regexp.last_match
        url = m[2]
        caption = XhtmlFormatter.trim(url)
        "#{m[1]}<a href=\"http://#{url}\" title=\"http://#{url}\"#{attrs}>#{caption}</a>#{m[3]}"
      end

      # Drop the padding added above.
      linked[1..-2]
    end

    # Context-free autolinker: links every absolute URL, www address and
    # e-mail address in +text+ regardless of the surrounding characters.
    # Translated from drupal-6.2/modules/filter/filter.module.
    #
    # attrs - extra attributes (name => value) added to URL links.
    #
    # NOTE(review): the e-mail pass runs over the output of the URL pass, so an
    # address embedded in an already linked URL (e.g. mailto:...) is wrapped
    # again; this mirrors the original flow and is left untouched.
    def format_url(text, attrs = {})
      # Build the attribute string once; doing it inside the block would try
      # to re-render an already rendered String on the second match.
      attr_text = XhtmlFormatter.attrs_to_s(attrs)

      # Match absolute URLs and www domains/addresses.
      linked = text.gsub(%r{((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)}i) do
        url = Regexp.last_match(1)
        caption = XhtmlFormatter.trim(url)
        # Only scheme-less "www." matches need a scheme; https://, ftp://,
        # mailto: etc. must keep the scheme they already have.
        url = "http://#{url}" if url =~ /\Awww\./i
        "<a href=\"#{url}\" title=\"#{url}\"#{attr_text}>#{caption}</a>"
      end

      # Match e-mail addresses.
      linked.gsub(%r{([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)}i,
                  '<a href="mailto:\1">\1</a>')
    end

    # Replaces every line break with <br />.
    #   windows:  \r\n
    #   mac os 9: \r
    def format_newline(text)
      text.gsub("\r\n", "\n").tr("\r", "\n").gsub("\n", '<br />')
    end

    # --- internal helpers ---------------------------------------------------
    # These are singleton methods called with an explicit receiver
    # (XhtmlFormatter.xxx), so they have to stay publicly callable.

    # Shortens +text+ to at most +length+ characters, the last three being
    # '...' when something was cut. Text that already fits is returned as is.
    def self.trim(text, length = 50)
      return text if text.size <= length
      # No room for anything but the ellipsis itself.
      return '...' if length <= 3
      "#{text[0...length - 3]}..."
    end

    # Renders a name => value collection as ' name="value"' pairs.
    def self.attrs_to_s(attrs)
      attrs.map { |name, value| " #{name}=\"#{value}\"" }.join
    end

    # Escapes '&' and '<' in everything between <pre> and </pre> so that the
    # content is shown literally. Does nothing unless :pre is an allowed tag.
    def self.escape_all_inside_pre(html, allowed_tags)
      return html unless allowed_tags.member?(:pre)
      # don't bother nested pre, because we escape all tags in pre
      html = html + '</pre>' unless html =~ %r{</pre>}i
      html.gsub(%r{<pre>(.*)</pre>}mi) do
        inner = Regexp.last_match(1)
        # stop escaping for '>' because drupal's url filter would make
        # &gt; into url...
        "<pre>#{XhtmlFormatter.escape_lt(XhtmlFormatter.escape_amp(inner))}</pre>"
      end
    end

    # Recursively serialises the children of +elems+ (a parsed Hpricot node).
    #
    # Text nodes are autolinked and, unless +no_format_newline+ is set (used
    # inside <pre>), get their newlines turned into <br />. Allowed elements
    # are emitted as markup; other elements have the '<' of their tags escaped
    # while their children are still processed. Nodes that are neither text
    # nor element contribute nothing to the output.
    def self.format_article_elems(elems, allowed_tags = Set.new, no_format_newline = false)
      elems.children.map { |node|
        if node.kind_of?(Hpricot::Text)
          linked = format_url(node.content)
          no_format_newline ? linked : format_newline(linked)
        elsif node.kind_of?(Hpricot::Elem)
          if allowed_tags.member?(node.name.to_sym)
            if node.empty? || node.name == 'a'
              # Existing links and childless elements are passed through.
              node.to_html
            else
              node.stag.inspect +
                XhtmlFormatter.format_article_elems(node, allowed_tags, node.stag.name == 'pre') +
                (node.etag || Hpricot::ETag.new(node.stag.name)).inspect
            end
          elsif node.empty?
            XhtmlFormatter.escape_lt(node.stag.inspect)
          else
            XhtmlFormatter.escape_lt(node.stag.inspect) +
              XhtmlFormatter.format_article_elems(node, allowed_tags) +
              XhtmlFormatter.escape_lt((node.etag || Hpricot::ETag.new(node.stag.name)).inspect)
          end
        end
      }.join
    end

    # Replaces every '&' with the &amp; entity.
    def self.escape_amp(text)
      text.gsub('&', '&amp;')
    end

    # Replaces every '<' with the &lt; entity.
    def self.escape_lt(text)
      text.gsub('<', '&lt;')
    end
  end
end # of Ludy