public
Fork of bruno/openaustralia-parser
Description: Parser component for Open Australia
Homepage: http://openaustralia.org
Clone URL: git://github.com/mlandauer/openaustralia-parser.git
openaustralia-parser / register-split.rb
100755 86 lines (74 sloc) 3.549 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env ruby
#
# This splits and combines several large pdfs containing the Register of Members' Interests into one pdf per Senator/Member
#
# Requirement: pdftk (http://www.accesspdf.com/pdftk/)
#
# On Mac OS X 10.5 the latest MacPorts build of pdftk kept segfaulting. Installing the pre-compiled
# version from http://www.pdfhacks.com/pdftk/OSX-10.3/pdftk1.12_OSX10.3.dmg.gz worked.
 
$:.unshift "#{File.dirname(__FILE__)}/lib"
 
require 'csv'
require 'name'
require 'people'
require 'configuration'
 
# Full path to pdftk executable. Defaults to /usr/local/bin/pdftk; set the
# PDFTK environment variable to point at a copy installed somewhere else.
pdftk = ENV.fetch("PDFTK", "/usr/local/bin/pdftk")

# Member data; used further down to match the names in the split files to people
people = PeopleCSVReader.read_members

# Supplies base_dir and regmem_pdf_path, which locate the directory the per-person pdfs are written to
conf = Configuration.new
# A run of pages inside one source pdf. `end` holds either a page number or the string 'end'.
PageRange = Struct.new(:filename, :start, :end)
 
# Reads one ".split" index file and records, for every person listed in it,
# the page range that person occupies in the matching pdf.
#
# Each row of the split file is "start_page,last_name,first_name". A row's range
# runs up to the page before the next row's start page; the final row runs to 'end'.
#
# p               - Hash from person to array of PageRange; updated in place
# filename_prefix - path below data/register_of_interests/ without extension;
#                   "<prefix>.split" is read and "<prefix>.pdf" is stored in the ranges
# date, house     - handed to people.find_member_by_name_current_on_date
# people          - object answering find_member_by_name_current_on_date(name, date, house)
#
# Raises RuntimeError when a name in the split file matches no member.
def read_in_ranges(p, filename_prefix, date, house, people)
  pdf_filename = "data/register_of_interests/#{filename_prefix}.pdf"
  split_filename = "data/register_of_interests/#{filename_prefix}.split"

  rows = CSV.readlines(split_filename)
  # Throw away first line (comment)
  rows.shift
  # Empty lines parse to [] and would corrupt the end page of the previous entry
  rows.reject! { |row| row.compact.empty? }

  rows.each_with_index do |(start_page, last_name, first_name), i|
    following = rows[i + 1]
    end_page = following ? following[0].to_i - 1 : 'end'

    # Blank pages still delimit the previous range but belong to nobody
    next if last_name.downcase == "** blank page **"

    name = Name.last_title_first(last_name + " " + first_name)
    member = people.find_member_by_name_current_on_date(name, date, house)
    # raise, not throw: throw is for catch/throw control flow and would hide the message
    raise "Couldn't find #{name.full_name}" if member.nil?

    (p[member.person] ||= []) << PageRange.new(pdf_filename, start_page.to_i, end_page)
  end
end
 
# Accumulator filled by read_in_ranges: person => array of page ranges
p = {}

# One entry per split file: [filename prefix, date used for the member lookup, house]
# NOTE(review): "representatives/2008_jun" is looked up with a March date - confirm this is intended
[
  ["senate/2008_sep_vol_1", Date.new(2008, 9, 1), House.senate],
  ["senate/2008_sep_vol_2", Date.new(2008, 9, 1), House.senate],
  ["senate/2008_dec", Date.new(2008, 12, 1), House.senate],
  ["senate/2009_jun", Date.new(2009, 6, 22), House.senate],
  ["representatives/2008_mar_vol_1", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_2", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_3", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_4", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_5", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_6", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_7", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_mar_vol_8", Date.new(2008, 3, 1), House.representatives],
  ["representatives/2008_jun", Date.new(2008, 3, 1), House.representatives]
].each do |filename_prefix, date, house|
  read_in_ranges(p, filename_prefix, date, house, people)
end
 
# Now step through all the people and create the pdfs.
# Each person gets one pdftk run that cuts their page ranges out of the source
# pdfs and concatenates them into register_interests_<id_count>.pdf.
p.each do |person, ranges|
  filenames = []
  pages = []
  # pdftk addresses every input file through an upper-case handle ("A=file.pdf", then "A1-4").
  # String#succ steps 'A', 'B', ... 'Z', 'AA', so handles stay letters-only however many
  # ranges a person has (multi-letter handles presumably need a pdftk version that accepts them).
  handle = 'A'
  ranges.each do |range|
    filenames << "#{handle}=#{range.filename}"
    pages << "#{handle}#{range.start}-#{range.end}"
    handle = handle.succ
  end
  filenames = filenames.join(' ')
  pages = pages.join(' ')
  command = "#{pdftk} #{filenames} cat #{pages} output #{conf.base_dir}#{conf.regmem_pdf_path}/register_interests_#{person.id_count}.pdf"
  puts "Splitting and combining pdfs for #{person.name.full_name}..."
  # system returns false/nil when the command fails or cannot be started; report it
  # but carry on with the remaining people
  warn "pdftk failed for #{person.name.full_name}: #{command}" unless system(command)
end