require 'planet/fido'
require 'planet/transmogrify'
require 'planet/sift'
require 'fileutils'
module Planet
# Fetch a set of feeds, normalize, and write each as a set of entries into a
# cache directory.
def Planet.spider
config = Planet.config['Planet']
cache = config['cache_directory']
http_cache = File.join(cache,'http')
entry_cache = File.join(cache,'entry')
source_cache = File.join(cache,'source')
# make output directories
FileUtils.mkdir_p http_cache, :mode => 0700
FileUtils.mkdir_p entry_cache, :mode => 0700
FileUtils.mkdir_p source_cache, :mode => 0700
# prep fetcher
fido = Planet::Fido.new(http_cache)
fido.threads = config['spider_threads'].to_i if config['spider_threads']
fido.timeout = config['feed_timeout'].to_f if config['feed_timeout']
# process subscriptions: for each updated feed, updated the cache with
# the set of canonicalized entries augmented with source information.
subs = Planet.config.keys.grep(/^https?:\/\//)
fido.each(subs) do |sub, resp|
next unless resp.code == '200'
uri = resp.header['Content-Location'] || sub
# first set of filters: xml parsing and element names
doc = Planet::Transmogrify.parse(resp.body)
feed = doc.root || doc
# add in self information
if not feed.elements['link[@rel="self"]']
link = feed.add_element('link',{'rel'=>'self', 'href'=>uri})
if doc.version[0..2] == 'rss'
link.attributes['type'] == 'application/rss+xml'
elsif doc.version[0..3] == 'atom'
link.attributes['type'] == 'application/atom+xml'
else
Planet.log.error "Not a feed - #{uri}"
next
end
feed.add_text("\n ")
end
# second set of filters: cardinality, sanitization, dates, and uris
doc.attributes['xml:base'] = uri
Planet.sift feed, fido
# process feed attributes: xml* (xml:lang, xml:base, xmlns) will need
# need to be transplanted to each entry. The rest will simply be
# placed on the source element
root_attrs = {}
source = REXML::Element.new('source')
feed.attributes.each_attribute do |attrib|
if attrib.expanded_name[0..2] == 'xml'
root_attrs[attrib.expanded_name] = attrib.value
else
source.attributes[attrib.expanded_name] = attrib.value
end
end
# add in configuration information (names, hackergotchi icons...)
source.add_namespace 'planet', 'http://planet.intertwingly.net/'
Planet.source(sub, source)
# process feed elements: entries will be captured for later processing,
# other elements will be transplanted to the source element.
entries = []
feed.elements.each do |element|
if element.name == 'entry'
entries << element
else
source.add_element(element)
end
end
entries.each do |entry|
# try to find a unique id (TODO: try harder)
id = entry.elements['id'].text rescue nil
id ||= entry.elements['link[@rel="alternate"]/@href'] rescue nil
next unless id
# determine output file name for this entry
entry_file = File.join(entry_cache, Planet.filename(id))
# determine updated date
updated = entry.elements['updated']
if not updated
updated = entry.add_element('updated')
if entry.elements['published']
updated.text = entry.elements['published'].text
elsif File.exist? entry_file
updated.text=File.stat(entry_file).mtime.iso8601
else
updated.text=DateTime.now.to_s
end
end
# augment with feed xml* attributes and source information
root_attrs.each_pair {|name,value| entry.attributes[name]=value}
entry.add(source) if not entry.elements['source']
# output the entry, with a timestamp reflecting the update time
File.open(entry_file, 'w') { |file| file.write(entry.to_s) }
updated = Time.parse(updated.text)
File.utime updated, updated, entry_file
end
# write source information out to the cache
if feed.name == 'feed'
source.name = 'planet:source'
root_attrs.each_pair {|name,value| source.attributes[name]=value}
source_file = File.join(source_cache, Planet.filename(sub))
File.open(source_file, 'w') { |file| file.write(source.to_s) }
end
end
end
# add configuration information to a source element
def Planet.source sub, element
Planet.config[sub].each do |name,value|
next if name[0..1] == '__'
child = element.add_element("planet:#{name}")
child.text = value
end
end
end