public
Description: Yet Another Planet Refactoring
Homepage: http://intertwingly.net/blog/2007/12/19/Yet-Another-Planet-Refactoring
Clone URL: git://github.com/rubys/mars.git
mars / planet / spider.rb
100644 178 lines (156 sloc) 6.396 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
require 'planet/fido'
require 'planet/transmogrify'
require 'planet/sift'
require 'fileutils'
require 'rexml/formatters/default'
 
module Planet

  # Fetch a set of feeds, normalize, and write each as a set of entries into a
  # cache directory.
  #
  # Settings are read from the 'Planet' section of Planet.config:
  # 'cache_directory' (required), plus the optional 'spider_threads' and
  # 'feed_timeout'.  Every other top-level config key that starts with
  # http:// or https:// is treated as a subscription.
  #
  # For each subscription fetched with a '200' response:
  # * the body is parsed and a rel="self" link is added if the feed has none;
  # * per-subscription *_type options override the type attribute of entry
  #   titles, summaries, contents and author names;
  # * each entry is written to <cache>/entry, with the file's timestamps set
  #   to the entry's updated time;
  # * if the root element is named 'feed', the accumulated source information
  #   is written to <cache>/source.
  #
  # Relies on Planet.config, Planet.log, Planet.sift, Planet.filename,
  # Planet::Fido and Planet::Transmogrify, none of which are defined here.
  def Planet.spider
    config = Planet.config['Planet']
    cache = config['cache_directory']
    http_cache = File.join(cache,'http')
    entry_cache = File.join(cache,'entry')
    source_cache = File.join(cache,'source')

    # make output directories
    FileUtils.mkdir_p http_cache, :mode => 0700
    FileUtils.mkdir_p entry_cache, :mode => 0700
    FileUtils.mkdir_p source_cache, :mode => 0700

    # prep fetcher
    fido = Planet::Fido.new(http_cache)
    fido.threads = config['spider_threads'].to_i if config['spider_threads']
    fido.timeout = config['feed_timeout'].to_f if config['feed_timeout']

    # per-subscription "*_type" options and the elements each one applies to
    typed_elements = {
      'title_type'   => '//entry/title',
      'summary_type' => '//entry/summary',
      'content_type' => '//entry/content',
      'name_type'    => '//entry/author/name'
    }

    # accepted spellings of each type attribute value
    type_names = {
      'text/html'             => 'html',
      'html'                  => 'html',
      'text/plain'            => 'text',
      'text'                  => 'text',
      'application/xhtml+xml' => 'xhtml',
      'xhtml'                 => 'xhtml'
    }

    # process subscriptions: for each updated feed, update the cache with
    # the set of canonicalized entries augmented with source information.
    subs = Planet.config.keys.grep(/^https?:\/\//)
    fido.each(subs) do |sub, resp|
      next unless resp.code == '200'
      uri = resp.header['Content-Location'] || sub

      # first set of filters: xml parsing and element names
      doc = Planet::Transmogrify.parse(resp.body)
      feed = doc.root || doc

      # add in self information
      unless feed.elements['link[@rel="self"]']
        # to_s so that a missing version is reported as "Not a feed" below
        # rather than raising on nil
        version = doc.version.to_s
        if version[0..2] == 'rss'
          self_type = 'application/rss+xml'
        elsif version[0..3] == 'atom'
          self_type = 'application/atom+xml'
        else
          Planet.log.error "Not a feed - #{uri}"
          next
        end
        feed.add_element('link',
          {'rel'=>'self', 'href'=>uri, 'type'=>self_type})
      end

      # fix title_type, name_type, summary_type, and content_type
      # per config file
      Planet.config[sub].each do |name,value|
        xpath = typed_elements[name]
        type = type_names[value]
        next unless xpath and type

        # assigning the attribute creates it when absent, replaces it otherwise
        feed.each_element(xpath) do |element|
          element.attributes['type'] = type
        end
      end

      # second set of filters: cardinality, sanitization, dates, and uris
      doc.attributes['xml:base'] = Planet.config[sub]['xml_base'] || uri
      Planet.sift feed, fido

      # process feed attributes: xml* (xml:lang, xml:base, xmlns) will need
      # to be transplanted to each entry. The rest will simply be
      # placed on the source element
      root_attrs = {}
      source = REXML::Element.new('source')
      feed.attributes.each_attribute do |attrib|
        if attrib.expanded_name[0..2] == 'xml'
          root_attrs[attrib.expanded_name] = attrib.value
        else
          source.attributes[attrib.expanded_name] = attrib.value
        end
      end

      # add in configuration information (names, hackergotchi icons...)
      source.add_namespace 'planet', 'http://planet.intertwingly.net/'
      Planet.source(sub, source)

      # process feed elements: entries will be captured for later processing,
      # other elements will be transplanted to the source element.
      entries = []
      feed.elements.each do |element|
        if element.name == 'entry'
          entries << element
        else
          source.add_element(element)
        end
      end

      entries.each do |entry|
        # try to find a unique id (TODO: try harder): prefer the text of the
        # id element, fall back to the href of the alternate link
        id_element = entry.elements['id']
        id = id_element.text if id_element
        unless id
          alternate = entry.elements['link[@rel="alternate"]']
          # read the attribute value so that id is always a String
          id = alternate.attributes['href'] if alternate
        end
        next if id.nil? or id.empty?

        # ids without a scheme are turned into urns
        unless /^\w+\:/ =~ id
          id = 'urn:feed-entry-id:' + id
          # there is no id element to rewrite when the id came from a link
          id_element.text = id if id_element
        end

        # determine output file name for this entry
        entry_file = File.join(entry_cache, Planet.filename(id))

        # determine updated date: published date, else the timestamp of a
        # previously cached copy, else now
        updated = entry.elements['updated']
        if not updated
          updated = entry.add_element('updated')
          if entry.elements['published']
            updated.text = entry.elements['published'].text
          elsif File.exist? entry_file
            updated.text=File.stat(entry_file).mtime.iso8601
          else
            updated.text=DateTime.now.to_s
          end
        end

        # augment with feed xml* attributes and source information
        root_attrs.each_pair {|name,value| entry.attributes[name]=value}
        entry.add(source) if not entry.elements['source']

        # output the entry, with a timestamp reflecting the update time
        File.open(entry_file, 'w') { |file| REXML::Formatters::Default.new.write(entry, file) }
        updated = Time.parse(updated.text)
        File.utime updated, updated, entry_file
      end

      # write source information out to the cache
      if feed.name == 'feed'
        source.name = 'planet:source'
        root_attrs.each_pair {|name,value| source.attributes[name]=value}
        source_file = File.join(source_cache, Planet.filename(sub))
        File.open(source_file, 'w') { |file| REXML::Formatters::Default.new.write(source, file) }
      end
    end
  end

  # add configuration information to a source element
  #
  # sub:     key of the subscription's section in Planet.config
  # element: element that receives one planet:<name> child per setting
  #
  # Settings whose name begins with two underscores are skipped.
  def Planet.source(sub, element)
    Planet.config[sub].each do |name,value|
      next if name[0..1] == '__'
      child = element.add_element("planet:#{name}")
      child.text = value
    end
  end
end