lachie / openaustralia-parser forked from mlandauer/openaustralia-parser

Parser component for Open Australia

This URL has Read+Write access

openaustralia-parser / parse-member-links.rb
100755 154 lines (132 sloc) 4.793 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env ruby
 
$:.unshift "#{File.dirname(__FILE__)}/lib"
 
require 'environment'
require 'mechanize'
require 'name'
require 'people'
require 'configuration'
require 'json'
 
# Project settings object; the sections below read the XML output path, the
# source URLs and the write_xml_* switches from it.
conf = Configuration.new
 
# Not using caching proxy since we will be running this script once a day and we
# always want to get the new data
agent = WWW::Mechanize.new
 
# Member records; used below to match scraped ids and names to person ids.
puts "Reading member data..."
people = PeopleCSVReader.read_members
 
puts "Twitter information (from tweetmp.org.au)..."

# Write twitter.xml: one <personinfo> element per MP returned by the
# tweetmp.org.au API, carrying either the MP's Twitter screen name or the URL
# for inviting them to Twitter.
# The block form of File.open closes the file even when the download or the
# JSON parsing raises part-way through.
File.open("#{conf.members_xml_path}/twitter.xml", 'w') do |xml|
  x = Builder::XmlMarkup.new(:target => xml, :indent => 1)
  x.instruct!
  x.peopleinfo do
    JSON.parse(agent.get("http://tweetmp.org.au/api/mps.json").body).each do |person|
      aph_id = person["GovernmentId"].upcase
      twitter = person["TwitterScreenName"]
      # Lookup the person based on their government id.
      # Temporary workaround until we figure out what's going on with the aph_id's that start with '00'
      found = people.find_person_by_aph_id(aph_id) || people.find_person_by_aph_id("00" + aph_id)
      if found.nil?
        puts "WARNING: Couldn't find person with aph id: #{aph_id}"
        # Nothing to attach the Twitter details to, so skip this record
        # instead of dereferencing nil below.
        next
      end
      if twitter
        x.personinfo(:id => found.id, :mp_twitter_screen_name => twitter)
      else
        # Give the URL for inviting this person to Twitter using tweetmp.org.au
        x.personinfo(:id => found.id, :mp_twitter_invite_tweetmp => "http://tweetmp.org.au/mps/invite/#{person["Id"]}")
      end
    end
  end
end
 
puts "Personal home page & Contact Details (Gov website)..."

# Follows +link+ to a member's page and records its address as the contact
# details URL for the person currently known by +name+. If that page has a
# link whose text matches "personal home page" (case-insensitive), its target
# is recorded as the member's website too.
#
# name   - name object handed to the people lookup; must respond to full_name
# people - lookup responding to find_person_by_name_current_on_date
# agent  - browser object used to follow +link+
# link   - the link to follow
# x      - XML builder that receives the personinfo element
#
# Prints a warning and writes nothing when no current person matches +name+.
def extract_links(name, people, agent, link, x)
  person = people.find_person_by_name_current_on_date(name, Date.today)
  unless person
    puts "WARNING: Could not find person with name #{name.full_name}"
    return
  end

  contact_page = agent.click(link)
  home_link = contact_page.links.find { |candidate| candidate.text =~ /personal home page/i }

  attributes = {:id => person.id, :mp_contactdetails => contact_page.uri}
  attributes[:mp_website] = home_link.uri if home_link
  x.personinfo(attributes)
end
 
# Write websites.xml: for every current member linked from the configured
# house and senate listing pages, record the contact-details page and, when
# present, the personal home page (see extract_links).
# The block form of File.open closes the file even when one of the page
# fetches raises part-way through the scrape.
File.open("#{conf.members_xml_path}/websites.xml", 'w') do |xml|
  x = Builder::XmlMarkup.new(:target => xml, :indent => 1)
  x.instruct!
  x.peopleinfo do
    if conf.write_xml_representatives
      agent.get(conf.alternative_current_house_members_url).links.each do |link|
        next unless link.to_s =~ /Member for/
        # Only the first two comma-separated fields of the link text are the
        # name (presumably "Last, Title First" -- confirm against the page).
        name = Name.last_title_first(link.text.split(',')[0..1].join(','))
        extract_links(name, people, agent, link, x)
      end
    end
    if conf.write_xml_senators
      agent.get(conf.alternative_current_senate_members_url).links.each do |link|
        next unless link.to_s =~ /Senator/
        # Everything before the last '-' in the link text is treated as the name.
        name = Name.last_title_first(link.to_s.split('-')[0..-2].join('-'))
        extract_links(name, people, agent, link, x)
      end
    end
  end
end
 
puts "Q&A Links..."

# Maps person id => Q&A profile URL. Filled in per house below and written out
# to links-abc-qanda.xml afterwards.
data = {}

if conf.write_xml_representatives

  # First get mapping between constituency name and web page
  page = agent.get(conf.qanda_electorate_url)
  map = {}

  # NOTE(review): the electorate links are selected purely by their position
  # in the page's link list, so this depends on the page layout staying put.
  page.links[35..184].each do |link|
    map[link.text.downcase] = (page.uri + link.uri).to_s
  end
  # Hack to deal with "Flynn" constituency incorrectly spelled as "Flyn"
  map["flynn"] = "http://www.abc.net.au/tv/qanda/mp-profiles/flyn.htm"

  # Check that the links point to valid pages and drop the ones that do not
  map.delete_if do |division, url|
    begin
      agent.get(url)
      false
    rescue WWW::Mechanize::ResponseCodeError
      puts "ERROR: Invalid url #{url} for division #{division}"
      true
    end
  end

  people.find_current_members(House.representatives).each do |member|
    link = map[member.division.downcase]
    # NOTE(review): a nil link is still recorded when the lookup fails, so the
    # person is written out with an empty value -- confirm this is intended.
    data[member.person.id] = link
    puts "ERROR: Couldn't lookup division #{member.division}" if link.nil?
  end
end
 
# Senators: every link on the all-senators page whose target starts with
# /tv/qanda/senators/ is matched by its link text to a current senator, and
# the absolute profile URL is stored under that senator's person id.
if conf.write_xml_senators
  page = agent.get(conf.qanda_all_senators_url)
  page.links.each do |link|
    next unless link.uri.to_s =~ /^\/tv\/qanda\/senators\//

    # HACK to handle Unicode in Kerry O'Brien's name on Q&A site
    name_text = link.to_s == "Kerry O\222Brien" ? "Kerry O'Brien" : link.to_s
    senator_name = Name.title_first_last(name_text)
    member = people.find_member_by_name_current_on_date(senator_name, Date.today, House.senate)
    if member.nil?
      puts "WARNING: Can't find Senator #{link}"
      next
    end

    data[member.person.id] = page.uri + link.uri
  end
end
 
# Emit links-abc-qanda.xml: one <personinfo> element for each person id
# collected in data, carrying that person's Q&A profile link.
xml = File.open("#{conf.members_xml_path}/links-abc-qanda.xml", 'w')
x = Builder::XmlMarkup.new(:target => xml, :indent => 1)
x.instruct!
x.peopleinfo do
  data.each_pair do |person_id, qanda_url|
    x.personinfo(:id => person_id, :mp_biography_qanda => qanda_url)
  end
end
xml.close
 
# Run the mpinfoin.pl script found under the configured web root, passing it
# the "links" argument.
# NOTE(review): presumably this loads the XML files written above into the
# site -- confirm. The command's exit status is not checked here.
system(conf.web_root + "/twfy/scripts/mpinfoin.pl links")