mlandauer / openaustralia-parser forked from bruno/openaustralia-parser
- Source
- Commits
- Network (2)
- Issues (0)
- Downloads (0)
- Wiki (1)
- Graphs
-
Tree:
1ced12c
openaustralia-parser / parse-speeches.rb
| b0f82070 » | rsms | 2007-11-18 | 1 | #!/usr/bin/env ruby | |
| 2 | |||||
| 7788529a » | mlandauer | 2008-02-03 | 3 | $:.unshift "#{File.dirname(__FILE__)}/lib" | |
| 4 | |||||
| 0d638f2d » | mlandauer | 2008-02-14 | 5 | require 'people' | |
| 58b170fd » | mlandauer | 2008-03-02 | 6 | require 'hansard_parser' | |
| 31eb9bc1 » | mlandauer | 2008-03-04 | 7 | require 'configuration' | |
| f94b909e » | mlandauer | 2008-05-11 | 8 | require 'optparse' | |
| 2a9bb808 » | mlandauer | 2009-04-16 | 9 | require 'progressbar' | |
| f94b909e » | mlandauer | 2008-05-11 | 10 | ||
# Converts a command-line date argument into a Date.
#
# text  - one of the keywords "today", "yesterday" or "previous-working-day",
#         or any other string, which is handed to Date.parse.
# today - the reference date the keywords are resolved against. Defaults to
#         the current date; pass an explicit Date to get repeatable results.
#
# Returns a Date. Text that is not a keyword goes through Date.parse, which
# raises ArgumentError when it cannot make sense of it.
def parse_date(text, today = Date.today)
  case text
  when "today"
    today
  when "yesterday"
    today - 1
  when "previous-working-day"
    # For Sunday (wday 0) and Monday (wday 1) the previous working day is the
    # preceding Friday; on every other day it is just the day before.
    case today.wday
    when 0 then today - 2
    when 1 then today - 3
    else today - 1
    end
  else
    Date.parse(text)
  end
end
| 32 | |||||
# Defaults for the command-line switches handled below
options = {:load_database => true, :proof => false, :force => false}

OptionParser.new do |opts|
  opts.banner = <<EOF
Usage: parse-speeches.rb [options] <from-date> [<to-date>]
    formatting of date:
      year.month.day or today or yesterday or previous-working-day
EOF
  # Each block stores the value OptionParser yields for its switch. For a
  # switch spelled "--no-..." that value is false, which turns loading off.
  opts.on("--no-load", "Just generate XML and don't load up database") do |load|
    options[:load_database] = load
  end
  opts.on("--proof", "Only parse dates that are at proof stage. Will redownload and populate html cache for those dates.") do |proof|
    options[:proof] = proof
  end
  opts.on("--force", "On loading data into database delete records that are not in the XML") do |force|
    options[:force] = force
  end
end.parse!

# parse! strips the switches it recognised out of ARGV, so what is left should
# be just the date arguments.
unless [1, 2].include?(ARGV.size)
  # Report the mistake on stderr and exit with a failure status so that calling
  # scripts can tell the run did not happen.
  abort "Need to supply one or two dates"
end

from_date = parse_date(ARGV[0])
# A single date argument means a range covering just that one day
to_date = ARGV.size == 1 ? from_date : parse_date(ARGV[1])
| 7dd83a8a » | mlandauer | 2008-02-27 | 65 | ||
conf = Configuration.new

# Make sure the output directory for each house exists before any XML is written
%w[representatives_debates senate_debates].each do |subdir|
  FileUtils.mkdir_p "#{conf.xml_path}/scrapedxml/#{subdir}"
end

# The member list has to be read back in first so that member ids can be looked up
people = PeopleCSVReader.read_members
parser = HansardParser.new(people)

# Two progress steps per day in the range: one for each house
total_steps = ((to_date - from_date + 1) * 2).to_i
progress = ProgressBar.new("parse-speeches", total_steps)

# Always work from the end date back to the start date, which is kind of
# helpful when using the "--proof" option.
to_date.downto(from_date) do |day|
  if conf.write_xml_representatives
    reps_xml = "#{conf.xml_path}/scrapedxml/representatives_debates/#{day}.xml"
    if options[:proof]
      parser.parse_date_house_only_in_proof(day, reps_xml, House.representatives)
    else
      parser.parse_date_house(day, reps_xml, House.representatives)
    end
  end
  # The bar advances for each house whether or not that house is written
  progress.inc

  if conf.write_xml_senators
    senate_xml = "#{conf.xml_path}/scrapedxml/senate_debates/#{day}.xml"
    if options[:proof]
      parser.parse_date_house_only_in_proof(day, senate_xml, House.senate)
    else
      parser.parse_date_house(day, senate_xml, House.senate)
    end
  end
  progress.inc
end

progress.finish
| 101 | |||||
# And load up the database
if options[:load_database]
  # Keep the loader's arguments as a list rather than one interpolated string.
  # Passed to system as separate arguments they reach perl exactly as written,
  # so a web_root containing spaces or shell characters cannot split or alter
  # the command.
  loader_args = ["--from=#{from_date}", "--to=#{to_date}"]
  loader_args << "--debates" if conf.write_xml_representatives
  loader_args << "--lordsdebates" if conf.write_xml_senators
  loader_args << "--force" if options[:force]

  # Starts with 'perl' to be friendly with Windows
  loaded = system("perl", "#{conf.web_root}/twfy/scripts/xml2db.pl", *loader_args)

  # system gives false for a non-zero exit status and nil when the command
  # could not be run at all; either way the load did not succeed, so say so and
  # exit with a failure status instead of finishing silently.
  abort "Loading the database with xml2db.pl failed" unless loaded
end
