public
Description: Yet Another Planet Refactoring
Homepage: http://intertwingly.net/blog/2007/12/19/Yet-Another-Planet-Refactoring
Clone URL: git://github.com/rubys/mars.git
Search Repo:
Scott Bronson (author)
Mon Mar 31 23:29:15 -0700 2008
commit  775bc2a397c7812ae67b9979f288c3c835aab059
tree    b847bf45fa3471cc91ecd89b2524fceb081f6f69
parent  567e2f3f459d446f0530bbd4c8acb00dde378420
mars / planet / spider.rb
100644 134 lines (116 sloc) 4.77 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
require 'planet/fido'
require 'planet/transmogrify'
require 'planet/sift'
require 'fileutils'
require 'time'
 
module Planet

  # Fetch a set of feeds, normalize, and write each as a set of entries into a
  # cache directory.
  #
  # Settings are read from the 'Planet' section of Planet.config:
  # 'cache_directory' (the 'http', 'entry' and 'source' subdirectories are
  # created beneath it) plus the optional 'spider_threads' and 'feed_timeout'
  # values, which are handed to the fetcher.  Every top-level config key that
  # starts with http:// or https:// is treated as a subscription.
  #
  # For each subscription answered with a '200' response, one file per entry
  # is written to the entry cache (its file time is set to the entry's
  # updated date) and, when the root element is named 'feed', one
  # planet:source file is written to the source cache.
  def Planet.spider
    config = Planet.config['Planet']
    cache = config['cache_directory']
    http_cache = File.join(cache, 'http')
    entry_cache = File.join(cache, 'entry')
    source_cache = File.join(cache, 'source')

    # make output directories (mode 0700: accessible to the owner only)
    [http_cache, entry_cache, source_cache].each do |dir|
      FileUtils.mkdir_p dir, :mode => 0700
    end

    # prep fetcher
    fido = Planet::Fido.new(http_cache)
    fido.threads = config['spider_threads'].to_i if config['spider_threads']
    fido.timeout = config['feed_timeout'].to_f if config['feed_timeout']

    # process subscriptions: for each updated feed, update the cache with
    # the set of canonicalized entries augmented with source information.
    subs = Planet.config.keys.grep(%r{^https?://})
    fido.each(subs) do |sub, resp|
      next unless resp.code == '200'
      uri = resp.header['Content-Location'] || sub

      # first set of filters: xml parsing and element names
      doc = Planet::Transmogrify.parse(resp.body)
      feed = doc.root || doc

      # add in self information; the feed type is only checked on this path,
      # i.e. when the feed did not already carry a rel="self" link
      if not feed.elements['link[@rel="self"]']
        type = spider_feed_type(doc)
        if not type
          Planet.log.error "Not a feed - #{uri}"
          next
        end
        feed.add_element('link', {'rel'=>'self', 'href'=>uri, 'type'=>type})
        feed.add_text("\n ")
      end

      # second set of filters: cardinality, sanitization, dates, and uris
      doc.attributes['xml:base'] = uri
      Planet.sift feed, fido

      # process feed attributes: xml* (xml:lang, xml:base, xmlns) will need
      # to be transplanted to each entry. The rest will simply be placed on
      # the source element
      root_attrs = {}
      source = REXML::Element.new('source')
      feed.attributes.each_attribute do |attrib|
        name = attrib.expanded_name
        if name[0..2] == 'xml'
          root_attrs[name] = attrib.value
        else
          source.attributes[name] = attrib.value
        end
      end

      # add in configuration information (names, hackergotchi icons...)
      source.add_namespace 'planet', 'http://planet.intertwingly.net/'
      Planet.source(sub, source)

      # process feed elements: entries are captured for later processing,
      # other elements are transplanted to the source element.  The split is
      # completed before anything is moved, so the feed is never modified
      # while it is being walked.
      entries, others = feed.elements.partition do |element|
        element.name == 'entry'
      end
      others.each { |element| source.add_element(element) }

      entries.each do |entry|
        # entries without any usable id are skipped
        id = spider_entry_id(entry)
        next unless id

        # determine output file name for this entry
        entry_file = File.join(entry_cache, Planet.filename(id))
        spider_write_entry(entry, entry_file, root_attrs, source)
      end

      # write source information out to the cache
      if feed.name == 'feed'
        source.name = 'planet:source'
        root_attrs.each_pair { |name, value| source.attributes[name] = value }
        source_file = File.join(source_cache, Planet.filename(sub))
        File.open(source_file, 'w') { |file| file.write(source.to_s) }
      end
    end
  end

  # add configuration information to a source element: every name/value pair
  # configured for the subscription +sub+ becomes a planet:<name> child of
  # +element+ whose text is the value.  Names starting with '__' are skipped.
  def Planet.source(sub, element)
    Planet.config[sub].each do |name, value|
      next if name[0..1] == '__'
      child = element.add_element("planet:#{name}")
      child.text = value
    end
  end

  # Map the format reported by doc.version to the media type used on the
  # generated rel="self" link.  Returns nil when the version starts with
  # neither 'rss' nor 'atom' (including when there is no version at all).
  # NOTE(review): presumably Transmogrify sets version to a feed-format name
  # such as 'rss20' / 'atom10' - confirm against planet/transmogrify.
  def Planet.spider_feed_type(doc)
    version = doc.version.to_s
    if version[0..2] == 'rss'
      'application/rss+xml'
    elsif version[0..3] == 'atom'
      'application/atom+xml'
    end
  end

  # Find a unique id for an entry (TODO: try harder): the text of its id
  # element, else the href of its rel="alternate" link, else nil.  The
  # result is always a String (or nil), never an XML node.
  def Planet.spider_entry_id(entry)
    id_element = entry.elements['id']
    id = id_element && id_element.text
    return id if id

    alternate = entry.elements['link[@rel="alternate"]']
    alternate && alternate.attributes['href']
  end

  # Complete one entry (updated date, feed xml* attributes, source element),
  # write it to entry_file and stamp the file with the entry's updated time.
  def Planet.spider_write_entry(entry, entry_file, root_attrs, source)
    # determine updated date: fall back to the published date, then to the
    # time stamp of a previously cached copy, then to the current time
    updated = entry.elements['updated']
    if not updated
      updated = entry.add_element('updated')
      published = entry.elements['published']
      if published
        updated.text = published.text
      elsif File.exist? entry_file
        updated.text = File.mtime(entry_file).iso8601
      else
        updated.text = Time.now.iso8601
      end
    end

    # augment with feed xml* attributes and source information; an entry
    # that already has a source element keeps its own.
    # NOTE(review): the same source element is handed to every entry; this
    # relies on each entry being serialized right away, while it holds it.
    root_attrs.each_pair { |name, value| entry.attributes[name] = value }
    entry.add(source) if not entry.elements['source']

    # output the entry, with a timestamp reflecting the update time
    File.open(entry_file, 'w') { |file| file.write(entry.to_s) }
    stamp = begin
      Time.parse(updated.text.to_s)
    rescue ArgumentError
      nil
    end
    if stamp
      File.utime stamp, stamp, entry_file
    else
      # one bad date must not abort the whole run; keep the file's own time
      Planet.log.error "Unparseable updated date #{updated.text.inspect} - #{entry_file}"
    end
  end

  private_class_method :spider_feed_type, :spider_entry_id, :spider_write_entry
end