public
Description: RSS/Atom feed aggregator powered by Ramaze, using Sequel as the ORM.
Homepage: http://planet.zhekov.net/
Clone URL: git://github.com/zh/tamanegi.git
Click here to lend your support to tamanegi and make a donation at www.pledgie.com!
tamanegi / model / feed.rb
100644 149 lines (130 sloc) 4.219 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# Pick the timeout implementation used for feed downloads: the
# system_timer gem when it can be loaded, otherwise the standard
# library's Timeout module. Both are called as MyTimer.timeout(seconds) { ... }.
MyTimer =
  begin
    require 'system_timer'
    SystemTimer
  rescue LoadError
    require 'timeout'
    Timeout
  end
require 'open-uri'
require 'feed-normalizer'
 
# Sequel model backed by the +feeds+ table. Each row describes one
# subscribed RSS/Atom feed: where to fetch it from (+url+), a unique short
# name (+handle+), cached channel metadata (+title+, +link+, +description+)
# and the outcome of the last synchronisation (+status+, +synced+, +etag+).
class Feed < Sequel::Model(:feeds)

  set_schema do
    primary_key :id
    varchar :url
    varchar :title
    varchar :link
    varchar :handle, :size => 64, :unique => true
    integer :status
    time :created
    time :updated
    time :synced
    varchar :etag
    text :description
    boolean :always, :default => false
    index [:handle], :unique => true
    index [:synced]
  end

  #one_to_many :items, :key => :feed_id, :order => :id.DESC
  has_many :items

  include Validatable

  validates do
    presence_of :url, :handle
    # FIXME: check the uniqueness of :handle
    # uniqueness_of :handle, :event => :create
    # \A and \z anchor the whole string; ^ and $ only anchor a single line,
    # which would let "ok\nanything else" pass the check.
    format_of :handle, :with => /\A\w+\z/, :message => "cannot contain whitespace"
  end

  after_create do
    update_values(:created => Time.now, :updated => Time.now)
  end

  after_update do
    update_values(:updated => Time.now)
  end

  # Create and return a new feed with the given handle and URL.
  def self.add(handle, url)
    create :handle => handle, :url => url
  end

  # Assign the given attributes and save the record when it is valid.
  # Every argument defaults to the record's current value, so callers may
  # pass only the leading attributes they want to change.
  #
  # The defaults are spelled +self.handle+ (not +handle+) on purpose: a
  # default of the form +handle = handle+ refers to the still-unset
  # parameter itself on Ruby 1.9+ and would silently reset the attribute
  # to nil.
  def update(handle = self.handle, url = self.url, title = self.title, description = self.description)
    self.handle, self.url, self.title, self.description = handle, url, title, description
    save if valid?
  end

  #
  # Download the feed, fill in missing channel metadata, store entries
  # that are not in the database yet and record the sync result.
  #
  # forceUpdate:: when true, no conditional-GET headers
  #               (If-Modified-Since / If-None-Match) are sent.
  # giveup::      timeout in seconds for the download.
  #
  # return status code
  # or nil on failure
  # (timeout, HTTP error other than 304, unparsable feed, any other
  # download error)
  #
  def sync!(forceUpdate = false, giveup = Configuration.for('app').giveup)
    response = nil
    begin
      MyTimer.timeout(giveup) do
        # URI#open only exists for HTTP(S)/FTP URIs, so unlike Kernel#open
        # a stored URL can never be interpreted as a local file or as a
        # "| command" pipe.
        response = URI.parse(self.url).open(conditional_headers(forceUpdate))
      end
    rescue OpenURI::HTTPError => e
      code = http_error_code(e)
      if code == 304
        # Not modified since the last sync: nothing to import.
        update_values(:synced => Time.now, :status => 304)
        save if valid?
        return 304
      end
      # A real HTTP failure (404, 500, ...): remember the code but leave
      # +synced+ alone so the next attempt uses the last good timestamp.
      Ramaze::Log.error "[E] #{self.url} HTTP error: #{e.message}"
      update_values(:status => code) if code > 0
      save if valid?
      return nil
    rescue Timeout::Error
      Ramaze::Log.error "[E] #{self.url} timeout error"
      return nil
    rescue => e
      Ramaze::Log.error "[E] #{e}"
      return nil
    end

    rss = FeedNormalizer::FeedNormalizer.parse(response)
    unless rss
      Ramaze::Log.error "[E] #{self.url} could not be parsed as a feed"
      return nil
    end

    fill_missing_metadata(rss)
    import_entries(rss)

    status = response.status[0].to_i
    update_values(:synced => Time.now, :status => status, :etag => response.meta['etag'])
    save if valid?
    status
  end

private

  # Build the conditional-GET request headers from the previous sync,
  # or an empty hash when a full download is forced.
  def conditional_headers(force)
    headers = {}
    return headers if force
    headers['If-Modified-Since'] = self.synced.to_formatted_s(:rfc822) if self.synced
    headers['If-None-Match'] = self.etag if self.etag
    headers
  end

  # Numeric HTTP status carried by an OpenURI::HTTPError (0 if unknown).
  # The error message starts with the status code ("404 Not Found"), which
  # serves as a fallback when the response object is unavailable.
  def http_error_code(error)
    io = error.io
    return io.status[0].to_i if io.respond_to?(:status) && io.status
    error.message.to_i
  end

  # set the title, description and link ONLY IF EMPTY
  def fill_missing_metadata(rss)
    update_values(:title => rss.channel.title.to_s) unless self.title
    unless self.description
      update_values(:description => rss.channel.description ? rss.channel.description.to_s : self.title)
    end
    unless self.link
      update_values(:link => rss.channel.urls.first) if rss.channel.urls
    end
  end

  # add only the uniq items (uniq GUID), oldest entry first
  def import_entries(rss)
    rss.entries.reverse.each do |i|
      DB.transaction do
        guid = guid_for(i)
        next if Item[:guid=>guid]
        title = i.title.to_s.gsub(/<[a-zA-Z\/][^>]*>/,'')
        item = Item.create(
          :title => title,
          :link => i.urls.first,
          :description => fix_content(i.content||i.description||i.summary, self.link),
          :guid => guid
        )
        item.feed = self
        # NOTE(review): +rollback+ is not defined in this file; presumably
        # it is meant to abort the surrounding transaction - confirm against
        # the Sequel version in use.
        item.valid? ? item.save : rollback
      end
    end
  end

  # Stable identifier for a feed entry: SHA1 of the entry id when present,
  # otherwise of its first URL, wrapped in a fixed salt.
  def guid_for(rss_entry)
    guid = rss_entry.urls.first
    guid = rss_entry.id.to_s if rss_entry.id
    return Digest::SHA1.hexdigest("--#{guid}--myBIGsecret")
  end

  # Prepare entry content for storage: content without any "<" is assumed
  # to be entity-escaped HTML and is unescaped first, then relative links
  # are made absolute. A nil content is treated as an empty string.
  def fix_content(content, site_link)
    content = content.to_s
    content = CGI.unescapeHTML(content) unless /</ =~ content
    correct_urls(content, site_link)
  end

  # Prefix relative +src+/+href+ attribute values in +text+ with +site_link+.
  # Values that already carry a URI scheme (http:, https:, mailto:, ...),
  # protocol-relative values (//host/...) and fragment-only values (#top)
  # are left untouched. Returns +text+ unchanged when no site link is known.
  def correct_urls(text, site_link)
    base = site_link.to_s
    return text if base.empty?
    base += '/' unless base[-1..-1] == '/'
    # The trailing group must be greedy: a lazy group at the very end of a
    # pattern always matches the empty string, so the leading "/" of the
    # path could never be stripped and "//" ended up in the result.
    text.gsub(%r{(src|href)=(['"])(?![a-zA-Z][a-zA-Z0-9+.\-]*:|//|\#)([^'"]*)}) do
      attribute, quote, path = $1, $2, $3
      path = path[1..-1] if path[0..0] == '/'
      "#{attribute}=#{quote}#{base}#{path}"
    end
  end

end
 
# Make sure the backing table exists before the model is used.
Feed.create_table unless Feed.table_exists?

# Seed an empty table with a few default feeds when the application
# configuration has bootstrapping switched on.
if Feed.empty? && Configuration.for('app').bootstrap
  [
    ['CNNTop',           'http://rss.cnn.com/rss/cnn_topstories.rss'],
    ['Slashdot',         'http://rss.slashdot.org/Slashdot/slashdot'],
    ['JoelOnSoftware',   'http://www.joelonsoftware.com/rss.xml'],
    ['SchneierSecurity', 'http://www.schneier.com/blog/index.rdf']
  ].each do |handle, url|
    Feed.add(handle, url)
  end
end