public
Description: generate code_swarm data from Wikipedia page histories & user contributions
Homepage: http://jamiedubs.com
Clone URL: git://github.com/jamiew/wikiswarm.git
wikiswarm / wikipedia.rb
100755 100 lines (82 sloc) 3.25 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env ruby
# Wikipedia API -> code_swarm event log
# by Jamie Wilkinson <http://jamiedubs.com>
require 'rubygems'
require 'time'
require 'uri'
require 'mechanize'
 
# Throttle helper: pause for two seconds so consecutive requests
# go easy on the remote server.
def snooze
  sleep 2
end
 
# Fetch the revision history of a Wikipedia page through the MediaWiki API
# (action=query, prop=revisions, XML format) and turn every revision into a
# code_swarm event hash. When the response carries a `query-continue`
# cursor, the next batch is fetched recursively and appended.
#
# page   - page title as typed by the user (escaped here for the query string)
# offset - revision id to resume from (sent as rvstartid); '' or nil for the
#          first request
#
# Returns an Array of Hashes with the keys :filename (the page title),
# :date (revision timestamp in milliseconds since the epoch), :author and
# :weight (revision size / 100, rounded up, never below 1).
# Any StandardError is logged to STDERR and whatever was collected so far
# (possibly an empty Array, never nil) is returned.
def page_history(page, offset = '')
  # Assigned first so the rescue clause always has an Array to return.
  revisions = []
  STDERR.print "#{offset}.. "; STDERR.flush

  rvlimit = 500 # revisions per page
  # encode_www_form_component also escapes '&', '+', '=' and '#', which
  # URI.escape left alone (and URI.escape no longer exists in Ruby 3).
  title = URI.encode_www_form_component(page)
  url = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&titles=#{title}&rvprop=timestamp|user|size&rvlimit=#{rvlimit}&format=xml"
  url += "&rvstartid=#{offset}" unless offset.to_s.empty?
  snooze

  sleep 0.5 # easy
  agent = WWW::Mechanize.new # FIXME, don't always need to reinitialize
  agent.user_agent = "WikiSwarm <http://github.com/jamiew/wikiswarm/>"
  doc = Hpricot.XML(agent.get(url).body)
  revisions = (doc / 'rev').map { |rev|
    # A missing size becomes 0.0 via nil.to_f, so the floor of 1 covers it.
    weight = (rev['size'].to_f / 100).ceil
    weight = 1 if weight < 1
    STDERR.puts "#{rev['timestamp']}: #{rev['size']} => #{weight}"
    { :filename => page, :date => Time.parse(rev['timestamp']).to_i * 1000, :author => rev['user'], :weight => weight }
  }

  # Follow the continuation cursor, if the API returned one.
  continuation = (doc / 'query-continue' / 'revisions').first
  rvstartid = continuation && continuation['rvstartid']
  revisions += page_history(page, rvstartid) if rvstartid && !rvstartid.to_s.empty?
  revisions
rescue => e
  STDERR.puts "Exception: #{e}\n\t#{(e.backtrace || []).join("\n\t")}"
  revisions || []
end
 
# Scrape a user's Special:Contributions listing and turn every entry into a
# code_swarm event hash. When the page has a "next" link (.mw-nextlink), its
# offset is extracted and the following page is fetched recursively.
#
# username - the account name, with or without a leading "User:" prefix
# offset   - value for the listing's `offset` parameter; '' or nil for the
#            first page
#
# Returns an Array of Hashes with the keys :filename (inner HTML of the third
# link in the list item), :date (milliseconds since the epoch), :author
# (the username without the "User:" prefix) and :weight (always 1).
# Each page's entries are sorted by date before being appended.
def user_history(username, offset = '')
  rvlimit = 500 # contributions per page
  # Strip the prefix once, up front; the bare name is used for both the
  # request and the :author field.
  username = username.sub(/\AUser:/, '')
  target = URI.encode_www_form_component(username)
  url = "http://en.wikipedia.org/w/index.php?title=Special:Contributions&limit=#{rvlimit}&target=#{target}"
  url += "&offset=#{offset}" unless offset.to_s.empty?
  agent = WWW::Mechanize.new
  agent.user_agent = "WikiSwarm <http://github.com/jamiew/wikiswarm/>"
  snooze
  doc = agent.get(url)
  revisions = (doc / '#bodyContent li').map { |li|
    # Removing the links and spans leaves the date text at the start of the
    # item's remaining HTML.
    # NOTE(review): relies on the contributions markup keeping the page link
    # as the third anchor and the date before the first "(" — confirm.
    filename = li.search('a').remove[2].innerHTML
    (li / 'span').remove
    date = li.innerHTML.split('(')[0][0..-2]
    weight = 1

    { :filename => filename, :date => Time.parse(date).to_i * 1000, :author => username, :weight => weight }
  }.sort_by { |f| f[:date] }

  next_link = (doc / '.mw-nextlink').first
  href = next_link && next_link['href']
  # Capture only the offset value; stop at the next parameter separator.
  rvstartid = href && href[/offset=([^&]*)/, 1]
  revisions += user_history(username, rvstartid) if rvstartid && !rvstartid.empty?
  revisions
end
 
 
 
 
# Script entry point: every command-line argument is a page title, or a
# "User:Name" for a user's contributions. All collected events are written to
# stdout as a code_swarm <file_events> document; progress goes to STDERR.

# parse inputs
if ARGV.empty? || ARGV.first.empty?
  # Usage goes to STDERR so it never ends up in redirected XML output.
  STDERR.puts "#{$0}: specify page(s) as parameters (remember to quote 'Barack Obama')"
  exit 1
end
pages = [*ARGV]

# Escape a value for use inside a double-quoted XML attribute. An ampersand
# that already starts an entity is left alone because filenames coming from
# user_history are taken from innerHTML and may already be entity-encoded.
xml_attr = lambda { |value|
  value.to_s.
    gsub(/&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+|#x[0-9a-fA-F]+);)/, '&amp;').
    gsub('<', '&lt;').
    gsub('>', '&gt;').
    gsub('"', '&quot;')
}

STDERR.puts "Building revhistory for #{pages.inspect}..."
puts '<?xml version="1.0"?>'
puts '<file_events>'
revisions = []
pages.each { |page|
  STDERR.puts "\n#{page}"
  history = page =~ /\AUser:/ ? user_history(page) : page_history(page)
  # Guard against a history function that returns nil after a failure.
  revisions += history || []
}

revisions.sort_by { |r| r[:date] }.each { |rev|
  # code_swarm wants unixtime in milliseconds
  puts %{<event date="#{rev[:date]}" filename="#{xml_attr.call(rev[:filename])}" author="#{xml_attr.call(rev[:author])}" weight="#{rev[:weight]}" />}
}
puts '</file_events>'
exit 0