public
Description: Aims to extend the Ruby standard library by providing useful tools that do not exist in it, especially for functional programming.
Homepage: http://rubyforge.org/projects/ludy
Clone URL: git://github.com/godfat/ludy.git
ludy / lib / ludy / xhtml_formatter.rb
100644 138 lines (123 sloc) 5.118 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
require 'set'
 
module Ludy

  # 2008-05-09 godfat
  #
  # Helpers for turning user-written text/HTML into XHTML fragments:
  # auto-linking URLs and e-mail addresses, converting newlines to
  # <tt><br /></tt>, and escaping tags that are not explicitly allowed.
  #
  # format_article and format_autolink load the Hpricot gem lazily; the other
  # functions only need the standard library.
  module XhtmlFormatter
    # The three patterns below are translated from
    # drupal-6.2/modules/filter/filter.module and are used by
    # format_autolink_regexp. Each one requires a leading delimiter
    # (<p>, <li>, <br>, whitespace or "(") in group 1.

    # Absolute URL. Groups: 1 leading delimiter, 2 whole URL, 3 scheme,
    # 4 remainder, 5 trailing punctuation, 6 (optional) trailing delimiter.
    ABSOLUTE_URL_REGEXP = %r{(<p>|<li>|<br\s*/?>|[ \n\r\t\(])((http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://)([a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-]))([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)])?)}i

    # E-mail address. Groups: 1 leading delimiter, 2 address,
    # 3 trailing punctuation; a trailing delimiter is required (lookahead).
    EMAIL_REGEXP = %r{(<p>|<li>|<br\s*/?>|[ \n\r\t\(])([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))}i

    # Scheme-less "www." address. Groups: 1 leading delimiter, 2 address,
    # 3 trailing punctuation; a trailing delimiter is required (lookahead).
    WWW_URL_REGEXP = %r{(<p>|<li>|[ \n\r\t\(])(www\.[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+~#\&=/;-])([.,?!]*?)(?=(</p>|</li>|<br\s*/?>|[ \n\r\t\)]))}i

    # Delimiter-free pattern used by format_url on plain text.
    # Group 1 is a URL (with scheme or starting with "www."),
    # group 2 is an e-mail address; exactly one of them is set per match.
    INLINE_LINK_REGEXP = %r{((?:http://|https://|ftp://|mailto:|smb://|afp://|file://|gopher://|news://|ssl://|sslv2://|sslv3://|tls://|tcp://|udp://|www\.)[a-zA-Z0-9@:%_+*~#?&=.,/;-]*[a-zA-Z0-9@:%_+*~#&=/;-])|([A-Za-z0-9._-]+@[A-Za-z0-9._+-]+\.[A-Za-z]{2,4})}i

    module_function

    # Formats +html+ as an article body and returns the resulting string.
    #
    # html::         source markup
    # allowed_tags:: tag names (symbols) that are emitted as real tags; every
    #                other element has the "<" of its tags escaped to "&lt;".
    #
    # When :pre is allowed, everything inside <pre>...</pre> is escaped first.
    # Text nodes are auto-linked and their newlines become <br /> (newlines
    # are left alone directly inside <pre>). Requires the hpricot gem.
    def format_article html, *allowed_tags
      require 'rubygems'
      require 'hpricot'

      allowed_tags = allowed_tags.to_set
      escaped = XhtmlFormatter.escape_all_inside_pre(html, allowed_tags)
      XhtmlFormatter.format_article_elems Hpricot.parse(escaped), allowed_tags
    end

    # Parses +html+ with Hpricot, runs format_url (with +attrs+) over every
    # text node that is a direct child of the parsed document, and returns
    # the document serialised back to HTML. Requires the hpricot gem.
    def format_autolink html, attrs = {}
      require 'rubygems'
      require 'hpricot'

      doc = Hpricot.parse html
      doc.each_child{ |c|
        next unless c.kind_of?(Hpricot::Text)
        c.content = format_url c.content, attrs
      }
      doc.to_html
    end

    # translated from drupal-6.2/modules/filter/filter.module
    #
    # Regexp-only auto-linker for text that may already contain markup.
    # A candidate is only linked when it follows <p>, <li>, <br>, whitespace
    # or "(" (www addresses: no <br>).
    #
    # text::  input string
    # attrs:: extra attributes (name => value) added to URL anchors; e-mail
    #         anchors do not receive them. Values are inserted verbatim.
    #
    # Returns the linked string.
    def format_autolink_regexp text, attrs = {}
      attr_html = XhtmlFormatter.attrs_to_html attrs

      # Pad BOTH ends so that a link at the very start or very end of the
      # input still has the delimiter the patterns look for; the padding is
      # removed again on return.
      padded = " #{text} "

      # Match absolute URLs.
      with_urls = padded.gsub(ABSOLUTE_URL_REGEXP){
        m   = Regexp.last_match
        url = m[2]
        "#{m[1]}<a href=\"#{url}\" title=\"#{url}\"#{attr_html}>" \
          "#{XhtmlFormatter.trim(url)}</a>#{m[5]}"
      }

      # Match e-mail addresses.
      with_mails = with_urls.gsub(EMAIL_REGEXP, '\1<a href="mailto:\2">\2</a>\3')

      # Match www domains/addresses.
      with_www = with_mails.gsub(WWW_URL_REGEXP){
        m   = Regexp.last_match
        url = m[2]
        "#{m[1]}<a href=\"http://#{url}\" title=\"http://#{url}\"#{attr_html}>" \
          "#{XhtmlFormatter.trim(url)}</a>#{m[3]}"
      }

      with_www[1..-2]
    end

    # Auto-links every URL and e-mail address found in plain +text+.
    #
    # text::  input string (expected to be text, not markup)
    # attrs:: extra attributes (name => value) added to URL anchors; e-mail
    #         anchors do not receive them. Values are inserted verbatim.
    #
    # URLs and addresses are found in a single pass, so an address that is
    # part of a URL (e.g. ftp://user@host.com/) is not linked a second time
    # inside the generated anchor. "http://" is prepended only to scheme-less
    # "www." addresses. The visible caption is shortened with trim.
    def format_url text, attrs = {}
      # Rendered once, outside the block, so every match sees the same value.
      attr_html = XhtmlFormatter.attrs_to_html attrs

      text.gsub(INLINE_LINK_REGEXP){
        found = Regexp.last_match
        if found[2]
          "<a href=\"mailto:#{found[2]}\">#{found[2]}</a>"
        else
          url  = found[1]
          href = url =~ /\Awww\./i ? "http://#{url}" : url
          "<a href=\"#{href}\" title=\"#{href}\"#{attr_html}>" \
            "#{XhtmlFormatter.trim(url)}</a>"
        end
      }
    end

    # Replaces every line break in +text+ with <br />.
    def format_newline text
      # windows: \r\n
      # mac os 9: \r
      text.gsub("\r\n", "\n").tr("\r", "\n").gsub("\n", '<br />')
    end

    # NOTE: this marker does not hide the helpers below -- they are defined
    # with `def self.` and are called as XhtmlFormatter.helper by the
    # functions above, so they stay publicly callable.
    private

    # Renders +attrs+ (name => value) as ' name="value"' pairs, each with a
    # leading space, joined into one string. Values are not escaped.
    def self.attrs_to_html attrs
      attrs.map{ |k, v| " #{k}=\"#{v}\"" }.join
    end

    # Shortens +text+ to at most +length+ characters, the last three being
    # '...'. Text that already fits is returned unchanged; when +length+
    # leaves no room for anything but the ellipsis, '...' is returned.
    def self.trim text, length = 50
      return text  if text.size <= length
      return '...' if length <= 3
      "#{text[0...length - 3]}..."
    end

    # Escapes "&" and "<" inside <pre>...</pre> so that the content is shown
    # literally. Returns +html+ untouched unless :pre is in +allowed_tags+.
    #
    # The match is greedy: it runs from the first <pre> to the last </pre>,
    # and only a bare <pre> (no attributes) is recognised by the substitution.
    def self.escape_all_inside_pre html, allowed_tags
      return html unless allowed_tags.member? :pre
      # don't bother nested pre, because we escape all tags in pre
      # Close an unterminated <pre>; leave input without any <pre> alone so
      # no stray closing tag is introduced.
      html = html + '</pre>' if html =~ /<pre\b/i && html !~ %r{</pre>}i
      html.gsub(%r{<pre>(.*)</pre>}mi){
        # stop escaping for '>' because drupal's url filter would make &gt; into url...
        inner = Regexp.last_match(1)
        "<pre>#{XhtmlFormatter.escape_lt(XhtmlFormatter.escape_amp(inner))}</pre>"
      }
    end

    # Recursively serialises the children of the Hpricot node +elems+.
    #
    # * Text nodes are auto-linked; newlines become <br /> unless
    #   +no_format_newline+ is true (set for the direct children of <pre>).
    # * Allowed elements are emitted as tags; empty elements and <a> are
    #   emitted verbatim, others have their children processed recursively.
    # * Other elements have the "<" of their tags escaped, children are
    #   still processed.
    # * Any other node kind contributes nothing to the output.
    def self.format_article_elems elems, allowed_tags = Set.new, no_format_newline = false
      elems.children.map{ |e|
        if e.kind_of?(Hpricot::Text)
          linked = format_url(e.content)
          no_format_newline ? linked : format_newline(linked)
        elsif e.kind_of?(Hpricot::Elem)
          if allowed_tags.member? e.name.to_sym
            if e.empty? || e.name == 'a'
              e.to_html
            else
              e.stag.inspect +
                XhtmlFormatter.format_article_elems(e, allowed_tags, e.stag.name == 'pre') +
                (e.etag || Hpricot::ETag.new(e.stag.name)).inspect
            end
          elsif e.empty?
            XhtmlFormatter.escape_lt(e.stag.inspect)
          else
            XhtmlFormatter.escape_lt(e.stag.inspect) +
              XhtmlFormatter.format_article_elems(e, allowed_tags) +
              # the parser may not have seen a closing tag; synthesise one
              XhtmlFormatter.escape_lt((e.etag || Hpricot::ETag.new(e.stag.name)).inspect)
          end
        end
      }.join
    end

    # Replaces every "&" with "&amp;".
    def self.escape_amp text
      text.gsub('&', '&amp;')
    end

    # Replaces every "<" with "&lt;".
    def self.escape_lt text
      text.gsub('<', '&lt;')
    end
  end
end # of Ludy