public
Description: Scripts for processing and analyzing federal lobbyist disclosure data reporting contributions to political campaigns
Homepage: http://www.palewire.com
Clone URL: git://github.com/palewire/sopr-contribs.git
sopr-contribs / fetch.py
2e8d1a13 » palewire 2008-08-17 first commit 1 #!/usr/bin/env python
2 """
3 A script that fetches, parses and archives the XML data dumps of lobbyists'
4 political contributions published by The Senate Office of Public Records.
5
6 Zip files containing the XML are:
7 1. Downloaded and unzipped.
8 2. Parsed out into flat text files and stored in a timestamped folder structure.
9 3. Imported to a SQLite database.
10
11 The ultimate goal is for a series of SQL statements to scrub and cut the data
12 to account for flaws in the reporting system first uncovered by Bill Allison
13 and Anupama Narayanswamy of The Sunlight Foundation.
14
15 Sunlight study:
16 http://realtime.sunlightprojects.org/2008/08/14/mark-warner-biggest-recipient-of-lobbyist-dough-new-disclosures-show-so-far/
17
ab2a4c1f » palewire 2008-08-17 Minor fixes 18 Simple analysis tasks could then be scripted to output scheduled XLS dumps,
19 email alerts or maybe even Django-ifed HTML.
2e8d1a13 » palewire 2008-08-17 first commit 20
21 Source URL:
22 http://www.senate.gov/legislative/Public_Disclosure/contributions_download.htm
23
24 Dependencies: BeautifulSoup, Pysqlite2
25
26 The MIT License
27
28 Copyright (c) 2008 Ben Welsh
29
30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
31 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
32 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
33 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
34 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
35 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
36 THE SOFTWARE.
37
38 """
39 __author__ = "Ben Welsh <ben.welsh@gmail.com>"
40 __date__ = "$Date: 2008/08/17 $"
41 __version__ = "$Revision: 0.1 $"
42
43 import datetime
44 import os
45 import urllib
46 import re
47 import codecs
48 import zipfile
49 import string
50 try:
51 from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
52 except ImportError:
53 print """
54 IMPORT ERROR: Required module not found: Beautiful Soup.
55 Installation instructions:
56 If you have easy_install, enter
57 "sudo easy_install BeautifulSoup"
58 via your shell.
59 Otherwise, the source can be downloaded from
60 http://www.crummy.com/software/BeautifulSoup/
61 """
62 raise SystemExit
63 try:
64 from pysqlite2 import dbapi2 as sqlite
65 except ImportError:
66 print """
67 IMPORT ERROR: Required module not found: Pysqlite.
68 Visit http://pysqlite.org and download the latest module.
69 """
70 raise SystemExit
71
## Create an archive folder structure named after the current date and time.

def _ensure_directory(path, label):
    """Create *path* unless it already exists, and report which happened.

    *label* is the lowercase human-readable name used in the printed message.
    """
    if os.path.isdir(path):
        print("%s already exists at %s" % (label[:1].upper() + label[1:], path))
    else:
        os.mkdir(path)
        print("Creating %s at %s" % (label, path))


###Setting timestamps
now = datetime.datetime.now()
# Folder names keep their original, unpadded layout (e.g. data/2008-8-17/9h5m3s)
# so archives written by earlier runs stay where they are expected.
datestamp = "%s-%s-%s" % (now.year, now.month, now.day)
timestamp = "%sh%sm%ss" % (now.hour, now.minute, now.second)
# SQLite's date/time functions only understand zero-padded
# "YYYY-MM-DD HH:MM:SS" strings, so this stamp must be padded.
sqlitestamp = now.strftime("%Y-%m-%d %H:%M:%S")

###Setting directory variables, creating archive folder structure
working_directory = "."
data_directory = os.path.join(working_directory, 'data')
_ensure_directory(data_directory, "data directory")

todays_data_subdirectory = os.path.join(data_directory, datestamp)
_ensure_directory(todays_data_subdirectory, "today's data subdirectory")

this_scripts_data_subdirectory = os.path.join(todays_data_subdirectory, timestamp)
_ensure_directory(this_scripts_data_subdirectory, "this script's data subdirectory")
105
## Flat-file outputs: one pipe-delimited UTF-8 text file per record type,
## all written into this run's data subdirectory.
filings_path, lobbyists_path, contribs_path = (
    os.path.join(this_scripts_data_subdirectory, 'filings.txt'),
    os.path.join(this_scripts_data_subdirectory, 'lobbyists.txt'),
    os.path.join(this_scripts_data_subdirectory, 'contribs.txt'),
)

# Opened in the same order as before: filings, lobbyists, contribs.
filings_file = codecs.open(filings_path, "w", "utf-8")
lobbyists_file = codecs.open(lobbyists_path, "w", "utf-8")
contribs_file = codecs.open(contribs_path, "w", "utf-8")
114
##Visiting SOPR to grab the zip downloads
url = 'http://www.senate.gov/legislative/Public_Disclosure/contributions_download.htm'
http = urllib.urlopen(url)
try:
    # Read the page up front so the connection can be released before parsing.
    download_page = http.read()
finally:
    http.close()
soup = BeautifulSoup(download_page)
anchor_tags = soup.findAll('a')

# Only hrefs that really end in ".zip" count. The dot is escaped and the
# pattern anchored so that e.g. "/unzip-help.htm" is not mistaken for a download.
zip_href_pattern = re.compile(r'\.zip$', re.IGNORECASE)

zip_links = []
for a in anchor_tags:
    # Anchors without an href (named anchors, for instance) yield None here
    # instead of raising KeyError and aborting the whole run.
    href = a.get('href')
    if href and zip_href_pattern.search(href):
        zip_links.append(href)
125
## Download every zip archive into this run's folder and unpack it there.
for zip_link in zip_links:
    zip_name = zip_link.split('/')[-1]
    zip_path = os.path.join(this_scripts_data_subdirectory, zip_name)
    urllib.urlretrieve(zip_link, zip_path)
    print("Downloaded %s " % zip_name)

    ##Unzip file
    # Unpacking stays best-effort: a bad archive is reported and the loop
    # moves on. "except Exception" (rather than a bare except) lets
    # Ctrl-C and SystemExit through.
    try:
        archive = zipfile.ZipFile(zip_path)
        try:
            for member in archive.namelist():
                # Flatten member paths: this keeps "../" entries from escaping
                # the data folder and puts nested files where the XML scan
                # (a plain directory listing) can find them.
                member_name = os.path.basename(member)
                if not member_name:
                    continue  # directory entry, nothing to write
                print("Unzipping %s" % member)
                extracted = open(os.path.join(this_scripts_data_subdirectory, member_name), 'wb')
                try:
                    extracted.write(archive.read(member))
                finally:
                    extracted.close()
        finally:
            archive.close()
    except Exception:
        print("Failed to unzip %s" % zip_name)
142
##Snatching XML files for parsing
this_scripts_downloads = os.listdir(this_scripts_data_subdirectory)

# Match on the real ".xml" extension (the old unanchored, unescaped regex also
# accepted names such as "notxml.txt"). The list is sorted because os.listdir
# returns entries in arbitrary order, and filing ids are assigned in
# processing order.
this_scripts_xml_files = sorted(
    name for name in this_scripts_downloads
    if name.lower().endswith('.xml')
)

# Running counter used to give each filing an artificial id across all files.
filing_id = 0
152
##Parsing XML files
##
## Each filing becomes one line in filings.txt, each lobbyist one line in
## lobbyists.txt and each contribution one line in contribs.txt. Fields are
## joined with '|' and missing attributes are written as the text 'null'.
## NOTE(review): attribute values are written as-is, so a value containing
## '|' would yield a line with too many fields -- confirm the data never does.

# Attribute values are expected to carry carriage-return/line-feed pairs as
# this literal entity text; it is replaced with a single space.
_CRLF_ENTITY = '&#x0D;&#x0A;'


def _attribute_or_null(tag, name):
    """Return attribute *name* of *tag*, or 'null' when it cannot be read.

    *tag* may be None (for example a filing with no registrant element);
    that is treated the same as a missing attribute.
    """
    try:
        value = tag[name]
    except (KeyError, TypeError):
        return 'null'
    if value is None:
        return 'null'
    return value


def _clean_attribute(tag, name):
    """Like _attribute_or_null, with embedded CRLF entities turned into spaces."""
    return _attribute_or_null(tag, name).replace(_CRLF_ENTITY, " ")


for xml_file_name in this_scripts_xml_files:
    print("Processing %s" % xml_file_name)

    xml_file = os.path.join(this_scripts_data_subdirectory, xml_file_name)
    xml = open(xml_file, "r")
    try:
        # Read eagerly so the handle can be closed before parsing starts.
        xml_markup = xml.read()
    finally:
        xml.close()

    soup = BeautifulStoneSoup(xml_markup, selfClosingTags=['lobbyist', 'contribution', 'registrant'])

    public_filings = soup.publicfilings
    if public_filings is None:
        print("No publicfilings element found in %s" % xml_file_name)
        continue

    ##Parsing filing data
    for f in public_filings.findAll('filing'):

        filing_id = filing_id + 1
        sopr_filing_id = _attribute_or_null(f, 'id')
        registrant = f.registrant

        filing = ["%s" % filing_id, xml_file_name, sopr_filing_id]
        for attribute in ('year', 'received', 'type', 'period'):
            filing.append(_attribute_or_null(f, attribute))
        filing.append(_attribute_or_null(registrant, 'registrantid'))
        for attribute in ('registrantname', 'address', 'registrantcountry'):
            filing.append(_clean_attribute(registrant, attribute))

        filings_file.write('|'.join(filing) + '\n')

        ##Parsing lobbyist names
        # Best-effort per filing: a failure is reported and parsing continues.
        try:
            for l in f.findAll('lobbyist'):
                lobbyist = [
                    "%s" % filing_id,
                    xml_file_name,
                    sopr_filing_id,
                    _clean_attribute(l, 'lobbyistname'),
                ]
                lobbyists_file.write('|'.join(lobbyist) + '\n')
        except Exception:
            print("Failed parsing lobbyist record for filing %s" % sopr_filing_id)

        ##Parsing contributions data
        try:
            contributions = f.contributions
            if contributions is not None:
                # Ask for the contribution tags explicitly. Iterating the
                # container directly also yields the text nodes between the
                # tags, and each of those was written out as an all-'null' row.
                for c in contributions.findAll('contribution'):
                    contrib = [
                        "%s" % filing_id,
                        xml_file_name,
                        sopr_filing_id,
                        _attribute_or_null(c, 'contributor'),
                        _attribute_or_null(c, 'contributiontype'),
                        _clean_attribute(c, 'payee'),
                        _clean_attribute(c, 'honoree'),
                        _attribute_or_null(c, 'amount'),
                        _attribute_or_null(c, 'contributiondate'),
                    ]
                    contribs_file.write('|'.join(contrib) + '\n')
        except Exception:
            print("Failed parsing contribution record for filing %s" % sopr_filing_id)
270
## Finish writing the flat files before they are read back for loading.
for written_file in (filings_file, lobbyists_file, contribs_file):
    written_file.close()

## The SQLite database lives next to the flat files, in a file named "contribs".
database_path = os.path.join(this_scripts_data_subdirectory, "contribs")
con = sqlite.connect(database_path)
cur = con.cursor()

## Creating Sqlite tables
## Date fields are stored as varchar for now; they will ultimately need to be
## converted to datetime.
create_tables = """
create table if not exists
filing(
    artificial_filing_id integer,
    xml_file_name varchar(100),
    sopr_filing_id varchar(100),
    year integer,
    received varchar(100),
    type varchar(100),
    period varchar(100),
    registrant_id integer,
    registrant_name varchar(100),
    registrant_address varchar(500),
    registrant_country varchar(100),
    insert_datetime datetime
);

create table if not exists
contrib(
    artificial_filing_id integer,
    xml_file_name varchar(100),
    sopr_filing_id varchar(100),
    contributor varchar(100),
    contribution_type varchar(100),
    payee varchar(200),
    honoree varchar(200),
    amount integer,
    contribution_date varchar(100),
    insert_datetime datetime
);

create table if not exists
lobbyist(
    artificial_filing_id integer,
    xml_file_name varchar(100),
    sopr_filing_id varchar(100),
    lobbyist_name varchar(200),
    insert_datetime datetime
);"""

# All three tables are created in a single script run.
cur.executescript(create_tables)
322
##Reopening flat files for reading so they can be inserted into the db

def _insert_flat_file(cursor, flat_file, table, columns, stamp):
    """Insert every pipe-delimited line of *flat_file* into *table*.

    *columns* names the table columns in the order the fields appear on each
    line, followed by the column that receives *stamp*, which is appended to
    every record. *table* and *columns* are fixed names from this script,
    never external input, so building the statement text from them is safe;
    the values themselves are always bound as parameters.
    """
    # Built once per table instead of once per line.
    insert_record = "insert into %s(%s) values (%s);" % (
        table,
        ", ".join(columns),
        ", ".join(["?"] * len(columns)),
    )
    for line in flat_file:
        # Drop the line terminator first; otherwise it ends up stored as
        # part of the last data field of every row.
        record = line.rstrip('\r\n').split('|')
        record.append(stamp)
        cursor.execute(insert_record, record)


filings_file = codecs.open(filings_path, "r", "utf-8")
lobbyists_file = codecs.open(lobbyists_path, "r", "utf-8")
contribs_file = codecs.open(contribs_path, "r", "utf-8")

try:
    print("Inserting filings")
    _insert_flat_file(cur, filings_file, "filing", (
        "artificial_filing_id",
        "xml_file_name",
        "sopr_filing_id",
        "year",
        "received",
        "type",
        "period",
        "registrant_id",
        "registrant_name",
        "registrant_address",
        "registrant_country",
        "insert_datetime",
    ), sqlitestamp)
    con.commit()

    print("Inserting contribs")
    _insert_flat_file(cur, contribs_file, "contrib", (
        "artificial_filing_id",
        "xml_file_name",
        "sopr_filing_id",
        "contributor",
        "contribution_type",
        "payee",
        "honoree",
        "amount",
        "contribution_date",
        "insert_datetime",
    ), sqlitestamp)
    con.commit()

    print("Inserting lobbyists")
    _insert_flat_file(cur, lobbyists_file, "lobbyist", (
        "artificial_filing_id",
        "xml_file_name",
        "sopr_filing_id",
        "lobbyist_name",
        "insert_datetime",
    ), sqlitestamp)
    con.commit()
finally:
    # Release the connection and the files even when an insert fails.
    con.close()

    ##Closing out files
    filings_file.close()
    lobbyists_file.close()
    contribs_file.close()
433