public
Description: Yet Another Planet Refactoring
Homepage: http://intertwingly.net/blog/2007/12/19/Yet-Another-Planet-Refactoring
Clone URL: git://github.com/rubys/mars.git
Search Repo:
Scott Bronson (author)
Mon Mar 31 23:29:15 -0700 2008
commit  775bc2a397c7812ae67b9979f288c3c835aab059
tree    b847bf45fa3471cc91ecd89b2524fceb081f6f69
parent  567e2f3f459d446f0530bbd4c8acb00dde378420
mars / planet / sift.rb
100644 209 lines (186 sloc) 6.696 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
require 'planet/fido'
require 'planet/log'
require 'html5'
require 'html5/sanitizer'
 
module Planet
  def Planet.sift node, fido
    unique = {}
 
    node.elements.each do |child|
      next unless child.namespace == 'http://www.w3.org/2005/Atom'
      child.name = child.name # remove prefix
 
      # remove, merge, or allow through duplicate children
      if unique.has_key? child.name
        case child.name
        when 'author'
          unique['author'].elements.each {|prevnode|
            next unless prevnode.text
            curnode = child.elements[prevnode.name]
            if not curnode
              child.add prevnode
            elsif not curnode.text
              curnode.text = prevnode.texts.map {|t| t.value}.join
            end
          }
          unique[child.name].remove
        when 'entry', 'category', 'contributor', 'link'
        else
          unique[child.name].remove
        end
      end
 
      unique[child.name] = child
 
      # node specific canonicalization
      case child.name
      when 'content', 'rights', 'subtitle', 'summary', 'title'
        make_absolute child, 'src'
 
        if child.attributes['type'] == 'html'
          text = child.texts.map {|t| t.value}.join.strip
          child.children.each {|text_node| text_node.remove}
          div = child.add_element('div')
          div.add_namespace 'http://www.w3.org/1999/xhtml'
          HTML5.parse_fragment(text, :encoding => 'UTF-8').each do |frag|
            div.add(frag)
          end
          child.attributes['type'] = 'xhtml'
        end
 
        if child.attributes['type'] == 'xhtml'
          child.elements.each {|xhtml_element| sanitize xhtml_element, fido}
        end
 
      when 'category'
        make_absolute child, 'scheme'
      when 'link'
        make_absolute child, 'href'
        child.attributes['rel'] = 'alternate' unless child.attribute('rel')
      when 'icon', 'logo', 'uri'
        value = child.texts.map {|t| t.value}.join
        if !value.empty? and value != 'http://'
          value = uri_norm(child.xmlbase, value)
          child.children {|text_node| text_node.remove}
          child.text = value
        else
          child.remove
        end
      when 'generator'
        make_absolute child, 'uri'
      when 'published', 'updated'
        if child.text
          text = child.texts.map {|t| t.value}.join
          child.children.each {|text_node| text_node.remove}
          child.text = DateTime.parse(text).to_s
        end
      when 'author', 'email', 'entry', 'feed', 'id', 'name', 'source'
      else
        child.add_namespace('http://planet.intertwingly.net/unknown')
      end
 
      sift child, fido
 
    end
 
    # ensure required elements are present
    if %w(entry feed source).include? node.name
      if !unique.has_key? 'title'
        node << REXML::Element.new('title')
      end
 
      if !unique.has_key? 'id'
        link = node.elements['link[@rel="alternate"]/@href']
        if link
          id = node.add_element('id')
          id.text = link.value
        end
      end
    end
  end
 
  # resolve a relative URI attribute
  def Planet.make_absolute node, attr_name
    value = node.attributes[attr_name]
    return unless value
    value = uri_norm(node.xmlbase, value) rescue value
    node.attributes[attr_name] = value
  end
 
  # remove suspect markup, styles, uris
  #
  # Sanitizes +node+ in place, depth first: children are sanitized before the
  # node itself is checked. Elements whose name is not in the list chosen for
  # their namespace (XHTML, SVG or MathML; anything else gets an empty list)
  # are removed; their children are hoisted to follow the node unless the
  # element is script, applet or style. On elements that are kept, attributes
  # not on the matching list are removed (style is run through the CSS
  # sanitizer instead, and xmlns is left alone), and listed URI-valued
  # attributes are resolved against xml:base and dropped unless their scheme
  # is in ACCEPTABLE_PROTOCOLS.
  #
  # node:: REXML element to sanitize (mutated, possibly removed from its parent)
  # fido:: object used to fetch and cache externally referenced SVG; this
  #        method calls fetch, read_from_cache and write_to_cache on it
  #
  # The include below is presumably what brings ACCEPTABLE_ELEMENTS,
  # SVG_ATTRIBUTES, ATTR_VAL_IS_URI, etc. into scope -- TODO confirm.
  include HTML5::HTMLSanitizeModule
  # instance variable on the Planet module itself; read inside Planet.sanitize,
  # where self is Planet
  @sanitizer = HTML5::HTMLSanitizer.new ''
  def Planet.sanitize node, fido
    # cull empty formatting elements. They can cause FF & Konq to nest badly.
    # For instance, <i/> causes everything after it to be italicized, including
    # other entries.
    if node.elements.size == 0 && node.text == nil
      if %w{abbr acronym b big cite code del dfn em i ins kbd s
samp small strike strong sub sup tt u var}.include? node.name
        # If the node has no children and no text, it can only cause trouble.
        node.remove
        return
      end
    end

    # sanitize descendants first so that hoisted children are already clean
    node.elements.each {|child| sanitize child, fido}

    # pick the allowed element/attribute lists for this node's namespace;
    # unknown namespaces get empty lists, so the node will be rejected below
    if node.namespace == 'http://www.w3.org/1999/xhtml'
      elist = ACCEPTABLE_ELEMENTS
      alist = ACCEPTABLE_ATTRIBUTES
    elsif node.namespace == 'http://www.w3.org/2000/svg'
      elist = SVG_ELEMENTS
      alist = SVG_ATTRIBUTES
    elsif node.namespace == 'http://www.w3.org/1998/Math/MathML'
      elist = MATHML_ELEMENTS
      alist = MATHML_ATTRIBUTES
    else
      elist = []
      alist = []
    end

    if !elist.include? node.name

      # inline svg objects
      if node.name=='object' and node.attributes['type']=='image/svg+xml'
        begin
          # NOTE(review): uri_norm is called with a single argument here but
          # with (base, value) elsewhere in this file -- confirm the signature
          # and whether the data URI should be resolved against xml:base.
          uri = Planet::uri_norm(node.attributes['data'])
          response = fido.fetch(uri)
          # '304' response code: use the cached copy instead
          response = fido.read_from_cache(uri) if response.code == '304'
          svg = REXML::Document.new(response.body).root
          node.parent.insert_after node, svg
          svg.elements.each {|child| sanitize child, fido}
          # NOTE(review): the cache is read with the normalized uri but written
          # with the raw data attribute -- confirm these keys are meant to differ.
          fido.write_to_cache node.attributes['data'], response
          node.name = 'script' # make sure that children are eaten
        rescue Exception => e
          # NOTE(review): rescuing Exception also catches non-StandardError
          # exceptions; the failure is only logged, and the <object> then
          # falls through to the generic removal below.
          Planet.log.error e.inspect
          Planet.log.error uri
          e.backtrace.each {|line| Planet.log.error line}
        end
      end

      # retain children from bogus elements, except for truly evil ones
      if !%w[script applet style].include? node.name
        # reversed so that repeated insertion right after node keeps the
        # children in their original order
        node.children.reverse.each {|child| node.next_sibling=child}
      end

      node.remove
    else
      node.attributes.each_value do |attribute|
        if !alist.include? attribute.expanded_name
          if attribute.expanded_name == 'style'
            # keep style, but replace its value with the sanitized CSS
            node.add_attribute attribute.expanded_name,
              @sanitizer.sanitize_css(attribute.value)
          elsif attribute.name != 'xmlns'
            # namespace declarations are kept; everything else unknown goes
            attribute.remove
          end
        elsif ATTR_VAL_IS_URI.include? attribute.expanded_name
          begin
            # resolve against xml:base, then keep only acceptable schemes
            value = Addressable::URI.join(node.xmlbase, attribute.value)
            if ACCEPTABLE_PROTOCOLS.include? value.scheme
              node.add_attribute attribute.expanded_name, value.normalize.to_s
            else
              attribute.remove
            end
          rescue
            # any failure while resolving the URI: drop the attribute
            attribute.remove
          end
        end
      end
    end
  end
 
  # add a convenience method for computing the xml:base for any given Element
  if not REXML::Element.public_instance_methods.include? "xmlbase"
    class REXML::Element
      def xmlbase
        if not attribute('xml:base')
          parent.xmlbase
        elsif parent
          Planet::uri_norm(parent.xmlbase, attribute('xml:base').value)
        else
          attribute('xml:base').value || ''
        end
      end
    end
  end
 
end