public
Description: A small application that lets you generate an RSS feed for any page
Clone URL: git://github.com/jduff/rssanything.git
commit  3c21f9b2c72a3331e266567746278014b906c6b2
tree    9a9b2a1c26ff8cf1ca70eeb1e1ba8ca8d11d7e60
parent  7b239ced367d9f90d11b4a9d1331af2e8719c844
rssanything / app / models / feed.rb
100644 105 lines (80 sloc) 2.955 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
require "net/http"
require "uri"
require "hpricot"
 
# A Feed describes how to scrape a web page into feed items.
#
# The methods below read these attributes on the record (presumably database
# columns -- confirm against the schema):
#   link            - URL of the page to scrape
#   link_regexp     - expression locating the element that wraps each item's link
#   title_regexp    - expression locating each item's title
#   content_regexp  - expression locating each item's content
#   more_regexp     - optional expression locating a block of links to further pages
#   more            - maximum number of those further pages to fetch
#   last_published  - timestamp set by Feed.refresh
#
# NOTE(review): despite the "_regexp" names, these values are handed straight
# to Hpricot's +search+, so they look like selector expressions rather than
# regular expressions -- confirm.
class Feed < ActiveRecord::Base
  # Request headers sent with every page fetch.
  HEADERS = {
    'Accept' => '*/*',
    'Accept-Language' => 'en-ca',
    'User-Agent' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322)'
  }.freeze

  # Title used when no usable text can be extracted for an item.
  NO_TITLE = "NO TITLE FOUND".freeze

  has_many :items

  # Scrapes the feed's page(s) and stores every item whose guid is not already
  # recorded for this feed.
  #
  # feed - a Feed instance, or the id of one.
  #
  # Returns the result of +feed.save+ when new items were added, nil when there
  # was nothing new or the feed could not be found.
  def self.refresh(feed)
    # The original looked up +id+ here instead of the argument that was passed in.
    feed = Feed.find_by_id(feed) unless feed.is_a?(Feed)
    return nil if feed.nil?

    parsed_items = feed.execute

    existing_items = Item.find(:all, :select => "guid",
      :conditions => ["feed_id =? and guid in (?)", feed.id, parsed_items.map(&:guid)])
    existing_guids = existing_items.map(&:guid)

    new_items = parsed_items.reject { |item| existing_guids.include?(item.guid) }

    feed.items << new_items unless new_items.empty?

    # The timestamp is always set on the in-memory record but only persisted
    # when something new was found.
    feed.last_published = Time.now

    feed.save unless new_items.empty?
  end

  # Fetches the feed's page, plus up to +more+ additional pages linked from the
  # first element matching +more_regexp+, and parses all of them.
  #
  # Returns an Array of unsaved Item objects.
  def execute
    doc = Hpricot(fetch_page(link))
    pages = []

    unless more_regexp.blank?
      # Nothing matching more_regexp simply means there are no extra pages.
      container = doc.search(more_regexp).first
      if container
        more_links = container.search("a[@href]").map do |anchor|
          fix_relative_url(anchor.attributes["href"])
        end.uniq

        # A nil or negative +more+ means "no extra pages".
        limit = [more.to_i, 0].max
        more_links.first(limit).each { |url| pages << Hpricot(fetch_page(url)) }
      end
    end
    pages << doc

    # NOTE(review): +uniq+ on freshly built records depends on how the Item
    # model defines equality; it may not collapse items that share a guid.
    pages.map { |page| parse_page(page) }.flatten.uniq
  end

  # Extracts items from one parsed page. Links, titles and contents are
  # collected separately and paired up by position.
  #
  # doc - a parsed Hpricot document.
  #
  # Returns an Array of unsaved Item objects. Matches of +link_regexp+ that
  # contain no anchor with an href are skipped.
  def parse_page(doc)
    links = doc.search(link_regexp).map do |element|
      anchor = element.search("a[@href]").first
      anchor && fix_relative_url(anchor.attributes["href"])
    end

    titles = doc.search(title_regexp).map { |title| clean(title.to_s) }

    # make all links inside the content absolute
    contents = doc.search(content_regexp).map { |content| absolutize_links(content).to_s }

    items = []
    links.each_with_index do |url, i|
      next if url.nil? # keep positions aligned, but never build an item without a link

      # NOTE(review): String#hash is not guaranteed to be stable across Ruby
      # processes, so these guids may not match ones stored by an earlier run.
      # A digest of the link would be stable, but switching would change every
      # stored guid, so it is left as is here.
      items << Item.new(:link => url, :title => titles[i], :content => contents[i], :guid => url.hash)
    end

    items
  end

  private

  # Downloads a page and returns the response body.
  #
  # link - absolute http(s) URL as a String.
  def fetch_page(link)
    uri = URI.parse(link)
    http = Net::HTTP.new(uri.host, uri.port)
    http.use_ssl = true if uri.scheme == "https"

    # request_uri is the path plus query string, and "/" when the path is empty.
    # get2 returns only the response object, so the body is read from it.
    http.get2(uri.request_uri, HEADERS).body
  end

  # Walks the anchors inside +node+ and rewrites relative hrefs as absolute ones.
  #
  # Returns +node+ itself so callers can serialise the whole fragment.
  def absolutize_links(node)
    node.search("a") do |anchor|
      href = anchor.attributes["href"]
      anchor.attributes["href"] = fix_relative_url(href) if href
    end
    node
  end

  # Strips HTML tags, turns newlines and tabs into spaces and collapses runs of
  # spaces.
  #
  # Returns the cleaned text, or NO_TITLE when nothing is left.
  def clean(string)
    return NO_TITLE if string.nil?

    text = string.gsub(/<\/?[^>]*>/, "").gsub(/[\n\t]/, " ").squeeze(" ").strip
    text.blank? ? NO_TITLE : text
  end

  # Some sites use relative URLs; these won't work when followed from the RSS
  # feed, so resolve them against the feed's own link.
  #
  # Returns the absolute URL as a String, or +url+ unchanged when it already has
  # a host, cannot be resolved to one, or cannot be parsed at all.
  def fix_relative_url(url)
    return url unless URI.parse(url).host.nil?

    absolute = URI.join(link, url)
    absolute.host.nil? ? url : absolute.to_s
  rescue StandardError
    # Deliberately best-effort: an unparseable URL is passed through untouched.
    url
  end
end