require 'planet/xmlparser'
module Planet
  # Rewrites a parsed feed tree in place so that RSS (and mis-coded Atom)
  # elements use Atom 1.0 element and attribute names.
  #
  # Dispatch is by name: for every element, Transmogrify.process builds the
  # method name "<prefix>_<element name>", where <prefix> comes from
  # NAMESPACES (or '?' for an unknown namespace), and invokes it on a
  # Transmogrify instance.  Every public instance method defined below is
  # therefore a handler for one kind of feed element.
  class Transmogrify
    # ensure that feed elements can't cause arbitrary methods to be called:
    # strip every inherited public method except the __xxx__ ones and
    # object_id.  The name is normalised with to_s because instance_methods
    # yields Strings on old Rubies and Symbols on newer ones.
    instance_methods.each do |name|
      name_s = name.to_s
      undef_method name unless name_s =~ /^__/ || name_s == 'object_id'
    end

    # Maps a namespace URI to the prefix used when building handler names.
    NAMESPACES = {
      '' => 'rss',
      'http://www.w3.org/1999/xhtml' => 'xhtml',
      'http://www.w3.org/2005/Atom' => 'atom',
      'http://purl.org/dc/elements/1.1/' => 'dc',
      'http://purl.org/rss/1.0/modules/content/' => 'content',
      'http://web.resource.org/cc/' => 'cc',
      'http://search.yahoo.com/mrss/' => 'media',
      'http://backend.userland.com/creativeCommonsRssModule' => 'creativeCommons',
    }

    # Parses +source+ with XmlParser, records the detected feed flavour in a
    # +version+ accessor added to the returned document, and converts the
    # tree to Atom vocabulary.
    #
    # source:: whatever XmlParser.parse accepts
    # Returns the (mutated) document produced by XmlParser.parse.
    def self.parse(source)
      doc = XmlParser.parse(source)
      # Drop the local reference to the input; presumably so the raw feed
      # text can be garbage collected while the tree is processed.
      source = nil

      class << doc
        attr_accessor :version
      end

      # determine the version
      root = doc.root || doc
      doc.version = 'unknown'
      if root.name == 'feed'
        doc.version =
          root.namespace == 'http://www.w3.org/2005/Atom' ? 'atom10' : 'atom'
      elsif root.name == 'rss'
        doc.version =
          case root.attributes['version']
          when /^2\./
            'rss20'
          when /^0\.9([234])/
            "rss09#{$1}"
          when /^0\.91/
            doc.doctype.to_s.index('netscape') ? 'rss091n' : 'rss091u'
          else
            'rss'
          end
        root.delete_attribute('version')
        # Blank out any default namespace so the elements resolve to the ''
        # entry of NAMESPACES (the rss_* handlers) during processing.
        root.attributes['xmlns'] = '' if root.attributes['xmlns']
      end

      process(doc, Transmogrify.new)
      root.attributes['xmlns'] = 'http://www.w3.org/2005/Atom'
      doc
    end

    # Invokes the handler matching +node+ on +catalyst+ (if there is one) and
    # then recurses into the node's child elements.
    #
    # node::     element (or document) to convert
    # catalyst:: object providing the "<prefix>_<name>" handler methods
    def self.process(node, catalyst)
      method = "#{NAMESPACES[node.namespace] || '?'}_#{node.name}".to_sym
      begin
        catalyst.__send__ method, node
      rescue NoMethodError
        # No handler for this element: leave it untouched.
        # NOTE(review): this also hides a NoMethodError raised *inside* a
        # handler; kept as-is because conversion is best-effort.
      end
      node.elements.each { |child| process(child, catalyst) }
    end

    # <rss> / <channel>: becomes <feed>.  When a <channel> child is present,
    # every existing child is dropped and the channel's children are hoisted
    # directly under the node.
    def rss_rss(node)
      node.name = 'feed'
      channel = node.elements['channel']
      return unless channel

      node.children.each { |child| node.delete(child) }
      channel.children.each { |child| node.add(child) }
    end
    alias :rss_channel :rss_rss

    # <item> becomes <entry>.
    def rss_item(node)
      node.name = 'entry'
    end

    # <description>: <subtitle> directly under a feed; otherwise <summary>,
    # or <content> when the parent already has a <summary>, typed as html.
    # If the node holds child elements it is typed xhtml and its children are
    # wrapped in an XHTML <div>.
    def rss_description(node)
      if node.parent.name == 'feed'
        node.name = 'subtitle'
      else
        node.name = node.parent.elements['summary'] ? 'content' : 'summary'
        node.attributes['type'] = 'html'
      end

      unless node.elements.empty?
        node.attributes['type'] = 'xhtml'
        wrap_in_xhtml_div(node)
      end
    end
    alias :dc_description :rss_description

    # <content:encoded> becomes <content type="html">.
    def content_encoded(node)
      node.name = 'content'
      node.attributes['type'] = 'html'
    end

    # <fullitem> becomes <content type="html">.
    def rss_fullitem(node)
      node.name = 'content'
      node.attributes['type'] = 'html'
    end

    # <guid> becomes <id>.  Unless an isPermaLink attribute (any letter case)
    # says 'false', the guid text is also used as the href of a new <link>
    # when the parent has no <link> yet.  The isPermaLink attribute itself is
    # removed.
    def rss_guid(node)
      node.name = 'id'

      permalink = 'true'
      node.attributes.each do |name, value|
        permalink = value if name.downcase == 'ispermalink'
      end

      if permalink.downcase != 'false' && !node.parent.elements['link']
        link = node.parent.add_element('link')
        link.attributes['href'] = joined_text(node)
      end

      node.attributes.delete_if { |name, _value| name.downcase == 'ispermalink' }
    end

    # <link>: moves the text content into an href attribute (unless one is
    # already present) and removes the children.
    def rss_link(node)
      node.name = 'link'
      if node.text && !node.attributes['href']
        node.attributes['href'] = joined_text(node)
        node.children.each { |child| node.delete(child) }
      end
    end

    # <comments> becomes <link rel="replies" type="text/html">.
    def rss_comments(node)
      rss_link node
      node.attributes['rel'] = 'replies'
      node.attributes['type'] = 'text/html'
    end

    # <enclosure> becomes <link rel="enclosure">, with url renamed to href.
    def rss_enclosure(node)
      node.name = 'link'
      node.attributes['rel'] = 'enclosure'
      if node.attributes['url']
        node.attributes['href'] = node.attributes['url']
        node.delete_attribute('url')
      end
    end

    # <creativeCommons:license> becomes <link rel="license">.
    def creativeCommons_license(node)
      rss_link node
      node.attributes['rel'] = 'license'
    end

    # <cc:license>: as creativeCommons_license, additionally moving an
    # rdf:resource attribute into href.
    def cc_license(node)
      creativeCommons_license node
      if node.attributes['rdf:resource']
        node.attributes['href'] = node.attributes['rdf:resource']
        node.delete_attribute('rdf:resource')
      end
    end

    # <category>: the text becomes the term attribute, domain is renamed to
    # scheme, and all children are removed.
    def rss_category(node)
      node.name = 'category'
      node.attributes['term'] = joined_text(node)
      if node.attributes['domain']
        node.attributes['scheme'] = node.attributes['domain']
        node.delete_attribute('domain')
      end
      node.children.each { |child| child.remove }
    end
    alias :dc_subject :rss_category

    # <copyright> becomes <rights>.
    def rss_copyright(node)
      node.name = 'rights'
    end
    alias :dc_rights :rss_copyright

    # <pubDate> becomes <published>.
    def rss_pubDate(node)
      node.name = 'published'
    end

    # <dc:date> becomes <updated>.
    def dc_date(node)
      node.name = 'updated'
    end
    alias :rss_lastBuildDate :dc_date

    # <dc:title> becomes <title>.
    def dc_title(node)
      node.name = 'title'
    end

    # XHTML <body> becomes <content type="xhtml"> with its children wrapped
    # in an XHTML <div>.
    def xhtml_body(node)
      node.name = 'content'
      node.delete_attribute('xmlns') if node.attributes['xmlns']
      node.attributes['type'] = 'xhtml'
      wrap_in_xhtml_div(node)
    end

    # <author> and friends: splits the text into <name> and (when an address
    # is found) <email> children.  Recognised layouts, tried in order:
    #   email (name)   name (email)   email <name>   name <email>   bare email
    def rss_author(node)
      node.name = 'author'
      name = joined_text(node).strip
      email = nil

      if /([\w._%+-]+@[A-Za-z][\w.-]+)\s+\((.*)\)/ =~ name
        email, name = $1, $2
      elsif /(.*?)\s+\(([\w._%+-]+@[A-Za-z][\w.-]+)\)/ =~ name
        name, email = $1, $2
      elsif /([\w._%+-]+@[A-Za-z][\w.-]+)\s+<(.*)>/ =~ name
        email, name = $1, $2
      elsif /(.*?)\s+<([\w._%+-]+@[A-Za-z][\w.-]+)>/ =~ name
        name, email = $1, $2
      elsif /([\w._%+-]+@[A-Za-z][\w.-]+)/ =~ name
        email = $1
        # Cut the address out and tidy the whitespace left around the rest.
        name = name.sub(email, '').strip
      end

      node.children.each { |child| node.delete(child) }
      node.add_element('name').add_text(name)
      node.add_element('email').add_text(email) if email
    end
    alias :dc_author :rss_author
    alias :dc_creator :rss_author
    alias :dc_publisher :rss_author
    alias :rss_managingEditor :rss_author
    alias :rss_webMaster :rss_author

    # <dc:contributor>: same person handling as rss_author, named
    # <contributor>.
    def dc_contributor(node)
      rss_author node
      node.name = 'contributor'
    end

    # Atom <url> becomes <uri>.
    def atom_url(node)
      node.name = 'uri'
    end

    # fixup miscoded 'html' text constructs: content declared as html that
    # nevertheless contains child elements.  A lone <div> without element
    # children is unwrapped; anything else is relabelled as xhtml.
    def atom_content(node)
      return unless node.attributes['type'] == 'html'
      return if node.elements.empty?

      if node.elements.map { |child| child.name } == ['div'] &&
         node.elements[1].elements.empty?
        # hoist HTML content outside of div
        node.elements[1].children.each { |child| node.add(child) }
        node.delete_element 1
      else
        node.attributes['type'] = 'xhtml'
      end
    end

    private

    # The helpers below cannot be reached through Transmogrify.process: their
    # names do not start with any prefix listed in NAMESPACES.

    # Concatenates the values of the text nodes directly under +node+.
    def joined_text(node)
      node.texts.map { |t| t.value }.join
    end

    # Moves every child of +node+ into a new XHTML-namespaced <div> and
    # appends that div to +node+.
    def wrap_in_xhtml_div(node)
      div = REXML::Element.new('div')
      div.add_namespace('http://www.w3.org/1999/xhtml')
      node.children.each { |child| div << child }
      node << div
    end
  end
end