# public
# Description: Yet Another Planet Refactoring
# Homepage: http://intertwingly.net/blog/2007/12/19/Yet-Another-Planet-Refactoring
# Clone URL: git://github.com/rubys/mars.git
# Search Repo:
# Sam Ruby (author)
# Thu Apr 03 17:55:30 -0700 2008
# commit  594cd30192668c6310b3236b978d5d4a6d706fb7
# tree    fb2c98375a3f69e66dc22da049a8c7e26438d717
# parent  775bc2a397c7812ae67b9979f288c3c835aab059
# mars / planet / transmogrify.rb
# 100644 263 lines (229 sloc) 7.218 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
require 'planet/xmlparser'
 
module Planet
  # Rewrites a parsed feed document in place so that RSS elements (plus a
  # handful of Dublin Core, Creative Commons, content and XHTML extension
  # elements) take on their Atom names and attributes.
  #
  # Transmogrify.parse is the entry point.  Every public instance method
  # below is a handler named "<prefix>_<element name>", where the prefix is
  # looked up in NAMESPACES from the element's namespace URI.
  # Transmogrify.process walks the tree and dispatches each node to the
  # matching handler, if there is one.
  class Transmogrify
    # ensure that feed elements can't cause arbitrary methods to be called
    instance_methods.each do |name|
      # instance_methods yields Strings on Ruby 1.8 and Symbols on 1.9+;
      # normalize so that the object_id exemption applies on both
      name = name.to_s
      undef_method name unless name =~ /\A__/ || name == 'object_id'
    end

    # Namespace URI => handler method prefix.  Elements in any other
    # namespace are dispatched with the prefix '?' and so match no handler.
    NAMESPACES = {
      '' => 'rss',
      'http://www.w3.org/1999/xhtml' => 'xhtml',
      'http://www.w3.org/2005/Atom' => 'atom',
      'http://purl.org/dc/elements/1.1/' => 'dc',
      'http://purl.org/rss/1.0/modules/content/' => 'content',
      'http://web.resource.org/cc/' => 'cc',
      'http://search.yahoo.com/mrss/' => 'media',
      'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    }.freeze

    # Parses +source+ with XmlParser.parse, records the detected feed
    # flavour and converts the tree to Atom element names.
    #
    # source:: passed straight through to XmlParser.parse
    #
    # Returns the parsed document.  A +version+ accessor is added to that
    # one object and set to 'atom10', 'atom', 'rss20', 'rss092'..'rss094',
    # 'rss091n', 'rss091u', 'rss' or 'unknown'.
    def self.parse(source)
      doc = XmlParser.parse(source)

      # give this particular document object a version attribute
      class << doc
        attr_accessor :version
      end

      # determine the version
      root = doc.root || doc
      doc.version = 'unknown'
      if root.name == 'feed'
        if root.namespace == 'http://www.w3.org/2005/Atom'
          doc.version = 'atom10'
        else
          doc.version = 'atom'
        end
      elsif root.name == 'rss'
        case root.attributes['version']
        when /^2\./
          doc.version = 'rss20'
        when /^0\.9([234])/
          doc.version = "rss09#{$1}"
        when /^0\.91/
          # the two 0.91 variants are told apart by the doctype
          if doc.doctype.to_s.index('netscape')
            doc.version = "rss091n"
          else
            doc.version = "rss091u"
          end
        else
          doc.version = 'rss'
        end

        root.delete_attribute('version')
        # blank out an existing default namespace declaration; the ''
        # namespace is the one NAMESPACES maps to the rss_* handlers
        root.attributes['xmlns'] = '' if root.attributes['xmlns']
      end

      process(doc, Transmogrify.new)
      root.attributes['xmlns'] = 'http://www.w3.org/2005/Atom'
      doc
    end

    # Dispatches +node+ to the handler on +catalyst+ that matches its
    # namespace and name, then recurses into the node's child elements
    # (as they stand after the handler has run).
    #
    # Nodes without a handler are left untouched.  NOTE: the rescue also
    # silences a NoMethodError raised from inside a handler, so conversion
    # of a malformed element is abandoned quietly rather than aborting the
    # whole feed.
    def self.process(node, catalyst)
      method = "#{NAMESPACES[node.namespace] || '?'}_#{node.name}".to_sym
      begin
        catalyst.__send__ method, node
      rescue NoMethodError
      end
      node.elements.each { |child| process(child, catalyst) }
    end

    # Renames the element to 'feed'.  When it has a 'channel' child, the
    # element's own children are removed and the channel's children are
    # moved up in their place.
    def rss_rss(node)
      node.name = 'feed'
      channel = node.elements['channel']
      if channel
        node.children.each { |child| node.delete(child) }
        channel.children.each { |child| node.add(child) }
      end
    end
    alias :rss_channel :rss_rss

    # item => entry
    def rss_item(node)
      node.name = 'entry'
    end

    # Directly under a 'feed' a description becomes 'subtitle'.  Elsewhere
    # it becomes 'summary' with type 'html', or 'content' when the parent
    # already has a 'summary'.  If the description contains child elements
    # it is marked as 'xhtml' and its children are wrapped in an XHTML div.
    def rss_description(node)
      if node.parent.name == 'feed'
        node.name = 'subtitle'
      else
        if node.parent.elements['summary']
          node.name = 'content'
        else
          node.name = 'summary'
        end
        node.attributes['type'] = 'html'
      end

      unless node.elements.empty?
        node.attributes['type'] = 'xhtml'
        wrap_in_xhtml_div(node)
      end
    end
    alias :dc_description :rss_description

    # content:encoded => content type="html"
    def content_encoded(node)
      node.name = 'content'
      node.attributes['type'] = 'html'
    end

    # fullitem => content type="html"
    def rss_fullitem(node)
      node.name = 'content'
      node.attributes['type'] = 'html'
    end

    # guid => id.  Unless an isPermaLink attribute (matched without regard
    # to case) says 'false', the guid text is also used as the href of a
    # new 'link' element, provided the parent does not have a link yet.
    # The isPermaLink attribute itself is dropped.
    def rss_guid(node)
      node.name = 'id'

      # a guid is a permalink unless stated otherwise
      permalink = 'true'
      node.attributes.each do |name, value|
        permalink = value if name.downcase == 'ispermalink'
      end

      if permalink.downcase != 'false' && !node.parent.elements['link']
        link = node.parent.add_element('link')
        link.attributes['href'] = text_value(node)
      end

      node.attributes.delete_if { |name, _value| name.downcase == 'ispermalink' }
    end

    # Moves the text of a link element into an href attribute and removes
    # the children.  Elements that have no text, or that already carry an
    # href, are only renamed.
    def rss_link(node)
      node.name = 'link'
      if node.text && !node.attributes['href']
        node.attributes['href'] = text_value(node)
        node.children.each { |child| node.delete(child) }
      end
    end

    # comments => link rel="replies" type="text/html"
    def rss_comments(node)
      rss_link node
      node.attributes['rel'] = 'replies'
      node.attributes['type'] = 'text/html'
    end

    # enclosure => link rel="enclosure"; a url attribute becomes href
    def rss_enclosure(node)
      node.name = 'link'
      node.attributes['rel'] = 'enclosure'
      if node.attributes['url']
        node.attributes['href'] = node.attributes['url']
        node.delete_attribute('url')
      end
    end

    # creativeCommons:license => link rel="license"
    def creativeCommons_license(node)
      rss_link node
      node.attributes['rel'] = 'license'
    end

    # cc:license => link rel="license"; an rdf:resource attribute, when
    # present, replaces the href
    def cc_license(node)
      creativeCommons_license node
      if node.attributes['rdf:resource']
        node.attributes['href'] = node.attributes['rdf:resource']
        node.delete_attribute('rdf:resource')
      end
    end

    # The element text becomes the term attribute, a domain attribute
    # becomes scheme, and all children are removed.
    def rss_category(node)
      node.name = 'category'
      node.attributes['term'] = text_value(node)
      if node.attributes['domain']
        node.attributes['scheme'] = node.attributes['domain']
        node.delete_attribute('domain')
      end
      node.children.each { |child| child.remove }
    end
    alias :dc_subject :rss_category

    # copyright => rights
    def rss_copyright(node)
      node.name = 'rights'
    end
    alias :dc_rights :rss_copyright

    # pubDate => published
    def rss_pubDate(node)
      node.name = 'published'
    end

    # dc:date => updated
    def dc_date(node)
      node.name = 'updated'
    end
    alias :rss_lastBuildDate :dc_date

    # dc:title => title
    def dc_title(node)
      node.name = 'title'
    end

    # xhtml:body => content type="xhtml", with the former children wrapped
    # in an XHTML div
    def xhtml_body(node)
      node.name = 'content'
      node.delete_attribute('xmlns') if node.attributes['xmlns']
      node.attributes['type'] = 'xhtml'
      wrap_in_xhtml_div(node)
    end

    # Replaces the text of an author-like element with a 'name' child and,
    # when an address is found, an 'email' child.  Recognised layouts:
    #
    #   email (name)    name (email)    email <name>    name <email>
    #
    # A bare address anywhere in the text is also picked up; it is cut out
    # of the name and whatever remains is used as the name.
    def rss_author(node)
      node.name = 'author'
      name = text_value(node).strip
      email = nil
      if /([\w._%+-]+@[A-Za-z][\w.-]+)\s+\((.*)\)/ =~ name
        email, name = $1, $2
      elsif /(.*?)\s+\(([\w._%+-]+@[A-Za-z][\w.-]+)\)/ =~ name
        name, email = $1, $2
      elsif /([\w._%+-]+@[A-Za-z][\w.-]+)\s+<(.*)>/ =~ name
        email, name = $1, $2
      elsif /(.*?)\s+<([\w._%+-]+@[A-Za-z][\w.-]+)>/ =~ name
        name, email = $1, $2
      elsif /([\w._%+-]+@[A-Za-z][\w.-]+)/ =~ name
        email = $1
        name.sub!($1, '')
      end
      node.children.each { |child| node.delete(child) }
      node.add_element('name').add_text(name)
      node.add_element('email').add_text(email) if email
    end
    alias :dc_author :rss_author
    alias :dc_creator :rss_author
    alias :dc_publisher :rss_author
    alias :rss_managingEditor :rss_author
    alias :rss_webMaster :rss_author

    # Same treatment as rss_author, but the element ends up named
    # 'contributor'.
    def dc_contributor(node)
      rss_author node
      node.name = 'contributor'
    end

    # atom:url => uri
    def atom_url(node)
      node.name = 'uri'
    end

    # Repairs content that is labelled type="html" yet contains child
    # elements.  A lone div without element children is unwrapped so its
    # children sit directly in the content element; anything else is real
    # markup and is relabelled as type="xhtml".
    def atom_content(node)
      # fixup miscoded 'html' text constructs
      return unless node.attributes['type'] == 'html'
      return if node.elements.empty?

      if node.elements.map { |child| child.name } == ['div'] &&
         node.elements[1].elements.empty?

        # hoist HTML content outside of div
        node.elements[1].children.each { |child| node.add(child) }
        node.delete_element 1
      else
        # this used to be a no-op '==' comparison, leaving the type as html
        node.attributes['type'] = 'xhtml'
      end
    end

    private

    # Joins the values of everything node.texts returns into one string.
    def text_value(node)
      node.texts.map { |t| t.value }.join
    end

    # Moves all children of +node+ into a new div element in the XHTML
    # namespace and appends that div to +node+.
    def wrap_in_xhtml_div(node)
      div = REXML::Element.new('div')
      div.add_namespace('http://www.w3.org/1999/xhtml')
      node.children.each { |child| div << child }
      node << div
    end
  end
end