wiktor / pudelek

Tworzenie kanału wiadomości (RSS) z serwisu pudelek.pl

This URL has Read+Write access

pudelek / pudelek_rss.rb
100755 108 lines (91 sloc) 3.163 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
require "rubygems"
gem 'actionpack', '~> 2.2'
require 'builder'
require 'action_view/helpers/atom_feed_helper'
require 'hpricot'
require 'net/http'
require 'uri'
require 'iconv'
require 'sqlite3'
 
include ActionView::Helpers::AtomFeedHelper
 
PUDELEK_DB_FILE = "#{ENV['CORE_PATH']}pudelek.db"
BLOG_URL = "http://blog.mocna-kawa.com"
 
# Downloads a page over plain HTTP and returns its body converted to UTF-8.
#
# The body is transcoded from ISO-8859-2 with Iconv, so the remote page is
# assumed to be served in that encoding. The HTTP status is not inspected.
#
# @param address [String] absolute http:// URL to fetch
# @return [String] the response body converted to UTF-8
def fetch_remote_file(address)
  url = URI.parse(address)
  response = Net::HTTP.start(url.host, url.port) do |http|
    # request_uri keeps the "?query" part and yields "/" for an empty path;
    # url.path alone would drop the query and send an empty request target.
    http.get(url.request_uri)
  end
  Iconv.conv('UTF-8', 'ISO-8859-2', response.body)
end
 
# Fetches an article page and extracts the HTML of its main container.
#
# Only the first "#article_container" element is used. Elements matching the
# selector list below (scripts, h2, tags, date, ...) are removed from it before
# its inner HTML is taken. The page's own ".date" element is stripped rather
# than parsed, so the returned date is always the time of the fetch.
#
# @param article_url [String] absolute URL of the article page
# @return [Array] a two-element array [body_html, Time]; body_html is nil when
#   the page has no "#article_container" element
def get_article_body_and_date(article_url)
  article_page = Hpricot(fetch_remote_file(article_url))
  article = article_page.search("#article_container").first

  # Without this guard the method would fall through and return the empty
  # search result instead of a [body, date] pair.
  return [nil, Time.now] unless article

  (article/"script, h2, .tags, #goto_sg, #boom, .spacer, .date").remove
  [article.inner_html, Time.now]
end
 
# Small wrapper around a database handle that stores fetched articles in the
# "items" table. Rows are written positionally as (id, date, body).
class DB
  # @param db [#execute] handle used to run SQL statements with bind values
  def initialize(db)
    @db = db
  end

  # Looks up a stored article by id.
  #
  # Despite the predicate-style name, this returns the first matching row
  # (truthy) when the article is stored, and nil otherwise.
  def article_exists?(id)
    rows = @db.execute('select * from items where id = ?', id)
    return nil if rows.empty?

    rows[0]
  end

  # Stores one article row; the values are bound positionally in the order
  # id, date, body.
  def insert_article(article_id, article_date, article_body)
    @db.execute('insert into items values(?, ?, ?)', article_id, article_date, article_body)
  end
end
 
# Builds one feed item from a link element, using the database as a cache.
#
# The article id is the fifth "/"-separated segment of the link's href. When
# the id is already stored, body and date come from the stored row (column 2
# and column 1, the latter read as epoch seconds); otherwise the article page
# is fetched and the result is stored.
#
# @param db           object responding to article_exists? and insert_article
# @param item_url     link element responding to inner_html and attributes
# @param title_prefix [String] text prepended to the link text
# @return [Hash] item with :title, :url, :id, :body and :date keys
def extract_item(db, item_url, title_prefix = '')
  title = title_prefix + item_url.inner_html
  href = item_url.attributes['href']
  id = href.split('/')[4]

  cached_row = db.article_exists?(id)
  if cached_row
    body = cached_row[2]
    published_at = Time.at(cached_row[1].to_i)
  else
    body, published_at = get_article_body_and_date(href)
    db.insert_article(id, published_at.to_i, body)
  end

  {
    :title => title,
    :url => href,
    :id => id,
    :body => body,
    :date => published_at
  }
end
 
# Scrapes the pudelek.pl front page and returns one feed item per linked
# article, newest first.
#
# Links in the left column become regular items; teaser links in the middle
# column become items whose title is prefixed with "FOTO: ".
#
# @param db object passed through to extract_item for article caching
# @return [Array<Hash>] items as built by extract_item, sorted by id descending
def fetch_items(db)
  main_page = Hpricot(fetch_remote_file('http://www.pudelek.pl/'))

  articles = main_page.search("#left_column_container ul li h4 a").map do |item_url|
    extract_item(db, item_url)
  end
  photos = main_page.search("#middle_column_container ul li.teaser_box span.link a").map do |item_url|
    extract_item(db, item_url, 'FOTO: ')
  end

  # Ids are strings cut out of the article URL; comparing them as strings
  # would rank "9999" above "100000". Sort on the integer value instead
  # (assumes ids are numeric — the string form is only a tie-breaker).
  (articles + photos).sort_by { |item| [item[:id].to_i, item[:id].to_s] }.reverse
end
 
# Renders the given items as an Atom feed and returns whatever atom_feed
# returns (the script writes that value to the output file).
#
# @param items [Array<Hash>] items with :title, :url, :body and :date keys,
#   newest first; may be empty
# @return the result of the atom_feed helper
def render_feed(items)
  # NOTE(review): this local is not referenced below, yet atom_feed appears to
  # look up a builder named `xml` in the calling scope — confirm against the
  # helper before renaming or removing it.
  xml = Builder::XmlMarkup.new
  atom_feed({ :id => BLOG_URL + "/pudelek", :root_url => BLOG_URL, :language => 'pl_PL', :url => BLOG_URL }) do |feed|
    feed.title("Pudelek Nieoficjalnie")
    feed.url(BLOG_URL)

    # The feed's update time is the date of the first (newest) item. With no
    # items nil is passed instead of raising on items[0][:date].
    # NOTE(review): the feed builder is expected to fall back to the current
    # time when given nil — confirm.
    newest = items.first
    feed.updated(newest && newest[:date])

    items.each do |item|
      feed.entry(nil, { :id => item[:url], :published => item[:date], :url => item[:url]}) do |entry|
        entry.title(item[:title])
        entry.content(item[:body], :type => 'html')
      end
    end
  end
end
 
# Writes content to filename, replacing any existing file.
#
# @param filename [String] path of the file to (over)write
# @param content  [#to_s] data appended to the freshly truncated file
# @return [nil]
def to_file(filename, content)
  # Block form closes the handle even if the write raises.
  File.open(filename, 'w') { |file| file << content }
  nil
end
 
# Entry point: expects the output file path as the first argument and an
# already-initialised database file at PUDELEK_DB_FILE.
if ARGV.empty?
  puts "Usage:\n\truby pudelek_rss.rb [output_file]"
elsif File.exist?(PUDELEK_DB_FILE)
  # Scrape the front page, render the feed and write it to the given file.
  database = DB.new(SQLite3::Database.new(PUDELEK_DB_FILE))
  to_file(ARGV[0], render_feed(fetch_items(database)))
else
  puts "Error:\n\tDatabase has not been created (file: #{PUDELEK_DB_FILE}). Run 'ruby sqlite_init.rb' to init DB."
end