0
-Tidy.path = "/usr/lib/libtidy.dylib"
0
-# class ScrapePosts < Scraper::Base
0
-# #cattr_reader :title_matcher
0
-# def initialize(title, source, options=nil)
0
-# @@title_matcher = title
0
-# super source, options
0
-# process @@title_matcher, :titles => :text
0
class Feed < ActiveRecord::Base
0
- #t.string :title, :description, :link, :link_regexp, :title_regexp, :content_regexp, :more_regexp
0
- #grab the contents of the page to scrape
0
- html = Net::HTTP.get(URI.parse(self.link))
0
+ items = parse_page(link)
0
+ doc = Hpricot(fetch_page(link))
0
- #scrape the data we want from the page
0
- result = page_scraper_for(self.title_regexp, self.link_regexp, self.content_regexp).scrape(html)
0
+ links = doc.search(link_regexp).collect do |link|
0
+ fix_relative_url(link.search("a[@href]").first.attributes["href"])
0
+ titles = doc.search(title_regexp).collect do |title|
0
+ contents = doc.search(content_regexp).collect do |content|
0
+ absolutize_links(content).to_s #make all links absolute
0
- result.link_array.each_index do |i|
0
- link = fix_relative_url(result.link_array[i])
0
- content = absolutize_links(result.content_array[i]).to_s #make all links absolute
0
- title = clean(result.title_array[i]) #remove html, newlines and extra spaces
0
- items << {:link => link, :title => title, :content => content}
0
+ links.each_index do |i|
0
+ items << Item.new({:link=>links[i], :title=>titles[i], :content=>contents[i], :guid=>links[i].hash})
0
- # returns the scraper for the patterns passed in
0
- def page_scraper_for(title_matcher, link_matcher, content_matcher)
0
- selector :select_link, "a"
0
- process title_matcher, :title_array => :text
0
- :link_array => Scraper.define {process "a[href]",:link =>"@href";result :link}
0
- process content_matcher do |element|
0
- @content_array << element
0
- attr_accessor :content_array
0
+ path_with_query = uri.query.blank? ? uri.path : (uri.path + "?" + uri.query)
0
+ response, html = Net::HTTP.new(uri.host, uri.port).get2(path_with_query, HEADERS)
0
# walks the HTML nodes, finds the links, and fixes any that are relative
0
def absolutize_links(node)
0
- while(sibling = sibling.next_sibling)
0
- absolutize_links(sibling) if sibling.tag? && sibling.name!="a"
0
+ node.search("a") do |item|
0
+ item.attributes["href"] = fix_relative_url(item.attributes["href"])
0
- node.children.each do |node|
0
- node.attributes["href"] = fix_relative_url(node.attributes["href"]) if node.tag? && node.name=="a"
0
- absolutize_links(node) if node.tag? && node.name!="a"
0
# pull out html tags, get rid of new lines, remove extra spaces
0
- string.gsub(/<\/?[^>]*>/, "").gsub("\n", " ").squeeze(" ").strip
0
+ return "NO TITLE FOUND" if string == nil
0
+ string = string.gsub(/<\/?[^>]*>/, "").gsub("\n", " ").gsub("\t", " ").squeeze(" ").strip
0
+ return string.blank? ? "NO TITLE FOUND" : string
0
# some sites use relative URLs; these won't work when clicked from the RSS feed, so
0
+ 'Accept-Language' => 'en-ca',
0
+ 'User-Agent' => 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322)'