public
Fork of bruno/openaustralia-parser
Description: Parser component for Open Australia
Homepage: http://openaustralia.org
Clone URL: git://github.com/mlandauer/openaustralia-parser.git
openaustralia-parser / parse-speeches.rb
100755 108 lines (89 sloc) 3.044 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/env ruby
 
$:.unshift "#{File.dirname(__FILE__)}/lib"
 
require 'people'
require 'hansard_parser'
require 'configuration'
require 'optparse'
require 'progressbar'
 
# Turns a date given on the command line into a Date.
#
# text  - one of the keywords "today", "yesterday" or "previous-working-day";
#         any other value is handed to Date.parse
# today - the day the keywords are resolved against. Defaults to the current
#         date; pass an explicit Date to make the result reproducible.
#
# Returns a Date. Text that is neither a keyword nor parseable as a date makes
# Date.parse raise.
def parse_date(text, today = Date.today)
  case text
  when "today"
    today
  when "yesterday"
    today - 1
  when "previous-working-day"
    # Weekends are skipped: on Sunday (wday 0) and Monday (wday 1) the previous
    # working day is the preceding Friday, otherwise it is simply the day before
    case today.wday
    when 0
      today - 2
    when 1
      today - 3
    else
      today - 1
    end
  else
    Date.parse(text)
  end
end
 
# Defaults: load the database after generating the XML, parse every requested
# date (not only those at proof stage) and do not force deletion of records
options = {:load_database => true, :proof => false, :force => false}

option_parser = OptionParser.new do |opts|
  # The banner lists every keyword the date arguments accept
  opts.banner = <<EOF
Usage: parse-speeches.rb [options] <from-date> [<to-date>]
formatting of date:
year.month.day or today or yesterday or previous-working-day
EOF
  # Declared as a "--no-" switch, so OptionParser passes false to the block
  opts.on("--no-load", "Just generate XML and don't load up database") do |l|
    options[:load_database] = l
  end
  opts.on("--proof", "Only parse dates that are at proof stage. Will redownload and populate html cache for those dates.") do |l|
    options[:proof] = l
  end
  opts.on("--force", "On loading data into database delete records that are not in the XML") do |l|
    options[:force] = l
  end
end

# parse! removes the recognised switches from ARGV, leaving the positional arguments
option_parser.parse!
 
# Exactly one or two date arguments are accepted. Anything else is a usage
# error: report it on stderr and exit with a non-zero status so that a calling
# script or cron job can tell the run did not happen.
unless [1, 2].include?(ARGV.size)
  warn "Need to supply one or two dates"
  exit 1
end

from_date = parse_date(ARGV[0])

# A single date argument means a one-day range
to_date = ARGV.size == 1 ? from_date : parse_date(ARGV[1])
 
conf = Configuration.new

# Make sure the output directory for each house exists before any XML is written
%w(representatives_debates senate_debates).each do |directory|
  FileUtils.mkdir_p "#{conf.xml_path}/scrapedxml/#{directory}"
end

# The parser is given the members read back in so that it can look up member id's
parser = HansardParser.new(PeopleCSVReader.read_members)

# Two progress steps per day: one for each house
number_of_days = to_date - from_date + 1
progress = ProgressBar.new("parse-speeches", (number_of_days * 2).to_i)

# With "--proof" the proof-only variant of the parse method is called for every date
parse_method = options[:proof] ? :parse_date_house_only_in_proof : :parse_date_house

# Kind of helpful to start at the end date and go backwards when using the "--proof" option,
# so the dates are always walked from the newest to the oldest.
to_date.downto(from_date) do |day|
  parser.__send__(parse_method, day, "#{conf.xml_path}/scrapedxml/representatives_debates/#{day}.xml", House.representatives)
  progress.inc
  parser.__send__(parse_method, day, "#{conf.xml_path}/scrapedxml/senate_debates/#{day}.xml", House.senate)
  progress.inc
end

progress.finish
 
# And load up the database
if options[:load_database]
  # The command is built as an argument list rather than one interpolated string:
  # given several arguments, system runs the program directly instead of through
  # a shell, so a web_root containing spaces or shell metacharacters is passed intact.
  # Starts with 'perl' to be friendly with Windows
  command = ["perl", "#{conf.web_root}/twfy/scripts/xml2db.pl",
             "--from=#{from_date}", "--to=#{to_date}",
             "--debates", "--lordsdebates"]
  command << "--force" if options[:force]

  # system gives a non-true result when the loader could not be run or exited
  # with an error; report that instead of finishing as if the load succeeded
  unless system(*command)
    warn "Loading the database failed: #{command.join(' ')}"
    exit 1
  end
end