mlandauer / openaustralia-parser forked from bruno/openaustralia-parser

Parser component for Open Australia

This URL has Read+Write access

openaustralia-parser / parse-speeches.rb
b0f82070 » rsms 2007-11-18 [svn] The crudest of crudes... 1 #!/usr/bin/env ruby
2
7788529a » mlandauer 2008-02-03 Moved ruby files that are n... 3 $:.unshift "#{File.dirname(__FILE__)}/lib"
4
0d638f2d » mlandauer 2008-02-14 Put functionality that was ... 5 require 'people'
58b170fd » mlandauer 2008-03-02 Moved class Speeches to its... 6 require 'hansard_parser'
31eb9bc1 » mlandauer 2008-03-04 Merge 7 require 'configuration'
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 8 require 'optparse'
2a9bb808 » mlandauer 2009-04-16 Added 3rd party lib for dis... 9 require 'progressbar'
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 10
11 def parse_date(text)
ae4ad090 » mlandauer 2008-06-22 Added "Previous working day... 12 today = Date.today
13
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 14 if text == "today"
ae4ad090 » mlandauer 2008-06-22 Added "Previous working day... 15 today
8e2d229e » mlandauer 2008-05-11 Can now use yesterday for a... 16 elsif text == "yesterday"
ae4ad090 » mlandauer 2008-06-22 Added "Previous working day... 17 today - 1
18 elsif text == "previous-working-day"
19 # For Sunday (wday 0) and Monday (wday 1) the previous working day is last Friday otherwise it's
20 # just the previous day
21 if today.wday == 0
22 today - 2
23 elsif today.wday == 1
24 today - 3
25 else
26 today - 1
27 end
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 28 else
29 Date.parse(text)
30 end
31 end
32
33 # Defaults
b3ce9b76 » mlandauer 2009-04-21 Adding force option 34 options = {:load_database => true, :proof => false, :force => false}
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 35
36 OptionParser.new do |opts|
37 opts.banner = <<EOF
38 Usage: parse-speeches.rb [options] <from-date> [<to-date>]
8e2d229e » mlandauer 2008-05-11 Can now use yesterday for a... 39 formatting of date:
40 year.month.day or today or yesterday
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 41 EOF
42 opts.on("--no-load", "Just generate XML and don't load up database") do |l|
43 options[:load_database] = l
44 end
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 45 opts.on("--proof", "Only parse dates that are at proof stage. Will redownload and populate html cache for those dates.") do |l|
46 options[:proof] = l
47 end
b3ce9b76 » mlandauer 2009-04-21 Adding force option 48 opts.on("--force", "On loading data into database delete records that are not in the XML") do |l|
49 options[:force] = l
50 end
f94b909e » mlandauer 2008-05-11 Added command-line parsing ... 51 end.parse!
52
53 if ARGV.size != 1 && ARGV.size != 2
54 puts "Need to supply one or two dates"
55 exit
56 end
57
58 from_date = parse_date(ARGV[0])
59
60 if ARGV.size == 1
61 to_date = from_date
62 else
63 to_date = parse_date(ARGV[1])
64 end
7dd83a8a » mlandauer 2008-02-27 Extracted function 65
458b0a8e » mlandauer 2008-03-28 Puts xml directly into open... 66 conf = Configuration.new
67
2fd9727b » mlandauer 2009-05-25 Don't generate empty XML fi... 68 FileUtils.mkdir_p "#{conf.xml_path}/scrapedxml/representatives_debates"
69 FileUtils.mkdir_p "#{conf.xml_path}/scrapedxml/senate_debates"
458b0a8e » mlandauer 2008-03-28 Puts xml directly into open... 70
7dd83a8a » mlandauer 2008-02-27 Extracted function 71 # First load people back in so that we can look up member id's
bdc6a568 » mlandauer 2008-07-15 Got rid of duplication of p... 72 people = PeopleCSVReader.read_members
7dd83a8a » mlandauer 2008-02-27 Extracted function 73
78d8f703 » mlandauer 2008-05-01 Now people are not endlessl... 74 parser = HansardParser.new(people)
75
49b844ee » mlandauer 2009-04-20 Workaround for bug in Progr... 76 progress = ProgressBar.new("parse-speeches", ((to_date - from_date + 1) * 2).to_i)
2a9bb808 » mlandauer 2009-04-16 Added 3rd party lib for dis... 77
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 78 # Kind of helpful to start at the end date and go backwards when using the "--proof" option. So, always going to do this now.
79 date = to_date
80 while date >= from_date
91eaa7f7 » mlandauer 2008-07-12 Can now disable writing of ... 81 if conf.write_xml_representatives
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 82 if options[:proof]
2fd9727b » mlandauer 2009-05-25 Don't generate empty XML fi... 83 parser.parse_date_house_only_in_proof(date, "#{conf.xml_path}/scrapedxml/representatives_debates/#{date}.xml", House.representatives)
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 84 else
2fd9727b » mlandauer 2009-05-25 Don't generate empty XML fi... 85 parser.parse_date_house(date, "#{conf.xml_path}/scrapedxml/representatives_debates/#{date}.xml", House.representatives)
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 86 end
91eaa7f7 » mlandauer 2008-07-12 Can now disable writing of ... 87 end
2a9bb808 » mlandauer 2009-04-16 Added 3rd party lib for dis... 88 progress.inc
91eaa7f7 » mlandauer 2008-07-12 Can now disable writing of ... 89 if conf.write_xml_senators
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 90 if options[:proof]
2fd9727b » mlandauer 2009-05-25 Don't generate empty XML fi... 91 parser.parse_date_house_only_in_proof(date, "#{conf.xml_path}/scrapedxml/senate_debates/#{date}.xml", House.senate)
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 92 else
2fd9727b » mlandauer 2009-05-25 Don't generate empty XML fi... 93 parser.parse_date_house(date, "#{conf.xml_path}/scrapedxml/senate_debates/#{date}.xml", House.senate)
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 94 end
91eaa7f7 » mlandauer 2008-07-12 Can now disable writing of ... 95 end
2a9bb808 » mlandauer 2009-04-16 Added 3rd party lib for dis... 96 progress.inc
d0730573 » mlandauer 2008-08-22 Added option to parse-speec... 97 date = date - 1
6b883cee » mlandauer 2008-04-11 Run the speech parser over ... 98 end
26f7bc93 » rsms 2007-11-20 [svn] Now adds correct head... 99
2a9bb808 » mlandauer 2009-04-16 Added 3rd party lib for dis... 100 progress.finish
101
26f7bc93 » rsms 2007-11-20 [svn] Now adds correct head... 102 # And load up the database
b3ce9b76 » mlandauer 2009-04-21 Adding force option 103 if options[:load_database]
104 command_options = " --from=#{from_date} --to=#{to_date}"
105 command_options << " --debates" if conf.write_xml_representatives
106 command_options << " --lordsdebates" if conf.write_xml_senators
107 command_options << " --force" if options[:force]
108
109 # Starts with 'perl' to be friendly with Windows
110 system("perl #{conf.web_root}/twfy/scripts/xml2db.pl #{command_options}")
111 end