public
Fork of bruno/openaustralia-parser
Description: Parser component for Open Australia
Homepage: http://openaustralia.org
Clone URL: git://github.com/mlandauer/openaustralia-parser.git
mlandauer (author)
Tue Jun 16 13:56:21 -0700 2009
commit  1ced12c3ee351c2bff7c998a92d107b2c253248b
tree    95b7adebce42c5cfd68525ea5c77ea841147b037
parent  ecd5caa93c20072ee6acc231e7d4757982dc0ca7
openaustralia-parser / parse-postcodes.rb
100644 65 lines (49 sloc) 1.577 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env ruby
 
$:.unshift "#{File.dirname(__FILE__)}/lib"
 
require 'mechanize_proxy'
require 'configuration'
require 'people'
 
# Script setup: build the caching HTTP agent and load the list of postcodes
# to look up.
#
# NOTE(review): CSV is not required anywhere in this file; presumably one of
# the lib requires above loads it - confirm.
# NOTE(review): conf is not referenced again in this script; kept in case
# constructing Configuration has side effects - confirm.
conf = Configuration.new

agent = MechanizeProxy.new
agent.cache_subdirectory = "parse-postcodes"

puts "Reading Australia post office data..."
data = CSV.readlines("data/pc-full_20080529.csv")
# Ignore header
data.shift

# Distinct first-column values in sorted order. Rows with no first field
# (e.g. blank lines in the CSV) give nil, which cannot be sorted alongside
# strings, so they are dropped before sorting.
valid_postcodes = data.map { |row| row.first }.compact.uniq.sort
 
# Collects the link text found in the fourth cell of every row of the first
# table on the page. The caller writes these values out as electoral division
# names.
#
# page - an object responding to search(selector), as returned by agent.get.
#
# Returns an Array of Strings in row order (duplicates are kept). Returns an
# empty Array when the page contains no table.
def extract_divisions_from_page(page)
  table_tag = page.search('table').first
  return [] if table_tag.nil?

  divisions = []
  table_tag.search('> tr').each do |row_tag|
    # Rows with fewer than four cells carry no value to extract.
    td_tag = row_tag.search('> td')[3]
    next unless td_tag

    division = td_tag.search('a').inner_text
    if division.nil?
      # Skip the row: a nil entry cannot be sorted together with strings by
      # the caller.
      puts "Nil division name in search results row"
      next
    end
    divisions << division
  end
  divisions
end
 
# Reports whether the second table on the page contains any links in its
# cells. The caller treats this as a sign that the results span multiple
# pages.
#
# page - an object responding to search(selector), as returned by agent.get.
#
# Returns true when at least one link is present, false otherwise (including
# when the page has no second table).
def other_pages?(page)
  table_tag = page.search('table')[1]
  return false if table_tag.nil?

  !table_tag.search('> tr > td > a').to_a.empty?
end
 
# Look up every valid postcode on the AEC locality search and write one
# "postcode,division" line per distinct division to data/postcodes.csv.
# The block form of File.open closes (and flushes) the file even if a lookup
# raises part-way through.
File.open("data/postcodes.csv", "w") do |file|
  file.puts("Postcode,Electoral division name")
  # NOTE(review): a line containing only "," is written after the header;
  # presumably a consumer of this file expects it - confirm.
  file.puts(",")

  valid_postcodes.each do |postcode|
    page = agent.get("http://apps.aec.gov.au/esearch/LocalitySearchResults.aspx?filter=#{postcode}&filterby=Postcode")

    divisions = extract_divisions_from_page(page)

    # Only the first page of results is read, so flag postcodes whose results
    # continue on further pages for manual checking.
    if other_pages?(page)
      puts "WARNING: Multiple pages of data for postcode #{postcode}"
      file.puts("*** Double check data for postcode #{postcode} by hand ***")
    end

    if divisions.empty?
      puts "No divisions for postcode #{postcode}"
    else
      divisions.uniq.sort.each do |division|
        file.puts "#{postcode},#{division}"
      end
    end
  end
end