palewire / sopr-contribs

Scripts for processing and analyzing federal lobbyist disclosure data reporting contributions to political campaigns

Ben Welsh (author)
Sun Aug 17 20:24:12 -0700 2008
sopr-contribs / fetch.py
2e8d1a13 » Ben Welsh 2008-08-17 first commit 1 #!/usr/bin/env python
2 """
3 A script that fetches, parses and archives the XML data dumps of lobbyist's
4 political contributions published by The Senate Office of Public Records.
5
6 Zips files containing the XML are:
7 1. Downloaded and unzipped.
8 2. Parsed out into flat text files and stored in a timestamped folder structure.
9 3. Imported to a SQLite database.
10
11 The ultimate goal is for a series of SQL statements to scrub and cut the data
12 to account for flaws in the reporting system first uncovered by Bill Allison
13 and Anupama Narayanswamy of The Sunlight Foundation.
14
15 Sunlight study:
16 http://realtime.sunlightprojects.org/2008/08/14/mark-warner-biggest-recipient-of-lobbyist-dough-new-disclosures-show-so-far/
17
18 Simple analysis tasks
19 could then be scripted to output as schedule XLS dumps, email alerts or maybe
20 even Django-ifed HTML.
21
22 Source URL:
23 http://www.senate.gov/legislative/Public_Disclosure/contributions_download.htm
24
25 Dependencies: BeautifulSoup, Pysqlite2
26
27 The MIT License
28
29 Copyright (c) 2008 Ben Welsh
30
31 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
32 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
33 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
34 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
35 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
36 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
37 THE SOFTWARE.
38
39 """
40 __author__ = "Ben Welsh <ben.welsh@gmail.com>"
41 __date__ = "$Date: 2008/08/17 $"
42 __version__ = "$Revision: 0.1 $"
43
44 import datetime
45 import os
46 import urllib
47 import re
48 import codecs
49 import zipfile
50 import string
51 try:
52 from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup
53 except ImportError:
54 print """
55 IMPORT ERROR: Required module not found: Beautiful Soup.
56 Installation instructions:
57 If you have easy_install, enter
58 "sudo easy_install BeautifulSoup"
59 via your shell.
60 Otherwise, the source can be downloaded from
61 http://www.crummy.com/software/BeautifulSoup/
62 """
63 raise SystemExit
64 try:
65 from pysqlite2 import dbapi2 as sqlite
66 except ImportError:
67 print """
68 IMPORT ERROR: Required module not found: Pysqlite.
69 Visit http://pysqlite.org and download the latest module.
70 """
71 raise SystemExit
72
73 #"""Create an archive folder structure using the current datetime. Returns path."""
74 ###Setting timestamps
75 now = datetime.datetime.now()
76 datestamp = "%s-%s-%s" % (now.year, now.month, now.day)
77 timestamp = "%sh%sm%ss" % (now.hour, now.minute, now.second)
78 sqlitestamp = "%s-%s-%s %s:%s:%s" % (now.year, now.month, now.day,
79 now.hour, now.minute, now.second)
80
81 ###Setting directory variables, creating archive folder structure
82 working_directory = "."
83 data_directory = os.path.join(working_directory, 'data')
84
85 if os.path.isdir(data_directory):
86 print "Data directory already exists at %s" % data_directory
87 else:
88 os.mkdir(data_directory)
89 print "Creating data directory at %s" % data_directory
90
91 todays_data_subdirectory = os.path.join(data_directory, datestamp)
92
93 if os.path.isdir(todays_data_subdirectory):
94 print "Today's data subdirectory already exists at %s" % todays_data_subdirectory
95 else:
96 os.mkdir(todays_data_subdirectory)
97 print "Creating today's data subdirectory at %s" % todays_data_subdirectory
98
99 this_scripts_data_subdirectory = os.path.join(todays_data_subdirectory, timestamp)
100
101 if os.path.isdir(this_scripts_data_subdirectory):
102 print "This script's data subdirectory already exists at %s" % this_scripts_data_subdirectory
103 else:
104 os.mkdir(this_scripts_data_subdirectory)
105 print "Creating this script's data subdirectory at %s" % this_scripts_data_subdirectory
106
107 ##Open files for writing out.
108 filings_path = os.path.join(this_scripts_data_subdirectory, 'filings.txt')
109 lobbyists_path = os.path.join(this_scripts_data_subdirectory, 'lobbyists.txt')
110 contribs_path = os.path.join(this_scripts_data_subdirectory, 'contribs.txt')
111
112 filings_file = codecs.open(filings_path, "w", "utf-8")
113 lobbyists_file = codecs.open(lobbyists_path, "w", "utf-8")
114 contribs_file = codecs.open(contribs_path, "w", "utf-8")
115
116 ##Visiting SOPR to grab the zip downloads
117 url = 'http://www.senate.gov/legislative/Public_Disclosure/contributions_download.htm'
118 http = urllib.urlopen(url)
119 soup = BeautifulSoup(http)
120 anchor_tags = soup.findAll('a')
121 zip_links = []
122 for a in anchor_tags:
123 href = a['href']
124 if re.search('(.*).zip', href):
125 zip_links.append(href)
126
127 for zip_link in zip_links:
128 zip_name = zip_link.split('/')[-1]
129 zip_path = os.path.join(this_scripts_data_subdirectory, zip_name)
130 urllib.urlretrieve(zip_link, zip_path)
131 print "Downloaded %s " % zip_name
132
133 ##Unzip file
134 try:
135 zip = zipfile.ZipFile(zip_path)
136 for file in zip.namelist():
137 print "Unzipping %s" % file
138 f = open(os.path.join(this_scripts_data_subdirectory, file), 'wb')
139 f.write(zip.read(file))
140 f.close()
141 except:
142 print "Failed to unzip %s" % zip_name
143
144 ##Snatching XML files for parsing
145 this_scripts_downloads = os.listdir(this_scripts_data_subdirectory)
146 this_scripts_xml_files = []
147
148 for file in this_scripts_downloads:
149 if re.search(".xml", file):
150 this_scripts_xml_files.append(file)
151
152 filing_id = 0
153
154 ##Parsing XML files
155 for xml_file_name in this_scripts_xml_files:
156 print "Processing %s" % xml_file_name
157
158 xml_file = os.path.join(this_scripts_data_subdirectory, xml_file_name)
159 xml = open(xml_file, "r")
160
161 soup = BeautifulStoneSoup(xml, selfClosingTags=['lobbyist', 'contribution', 'registrant'])
162
163 ##Parsing filing data
164 for f in soup.publicfilings.findAll('filing'):
165
166 filing_id = filing_id + 1
167
168 filing = []
169
170 filing.append("%s" % filing_id)
171 filing.append(xml_file_name)
172
173 try: filing.append(f['id'])
174 except: filing.append('null')
175
176 try: filing.append(f['year'])
177 except: filing.append('null')
178
179 try: filing.append(f['received'])
180 except: filing.append('null')
181
182 try: filing.append(f['type'])
183 except: filing.append('null')
184
185 try: filing.append(f['period'])
186 except: filing.append('null')
187
188 try: filing.append(f.registrant['registrantid'])
189 except: filing.append('null')
190
191 try:
192 raw_registrant = f.registrant['registrantname']
193 split_registrant = raw_registrant.split('&#x0D;&#x0A;')
194 clean_registrant = " ".join(split_registrant)
195 filing.append(clean_registrant)
196 except: filing.append('null')
197
198 try:
199 raw_address = f.registrant['address']
200 split_address = raw_address.split('&#x0D;&#x0A;')
201 clean_address = " ".join(split_address)
202 filing.append(clean_address)
203 except: filing.append('null')
204
205 try:
206 raw_country = f.registrant['registrantcountry']
207 split_country = raw_country.split('&#x0D;&#x0A;')
208 clean_country = " ".join(split_country)
209 filing.append(clean_country)
210 except: filing.append('null')
211
212 print >> filings_file, '|'.join(filing)
213
214 try:
215 ##Parsing lobbyist names
216 for l in f.findAll('lobbyist'):
217 lobbyist = []
218 lobbyist.append("%s" % filing_id)
219 lobbyist.append(xml_file_name)
220 lobbyist.append(f['id'])
221 try:
222 raw_name = l['lobbyistname']
223 split_name = raw_name.split('&#x0D;&#x0A;')
224 clean_name = " ".join(split_name)
225 lobbyist.append(clean_name)
226 except: lobbyist.append('null')
227
228 print >> lobbyists_file, '|'.join(lobbyist)
229 except:
230 print "Failed parsing lobbyist record for filing %s" % f['id']
231
232 try:
233 ##Parsing contributions data
234 for c in f.contributions:
235
236 contrib = []
237 contrib.append("%s" % filing_id)
238 contrib.append(xml_file_name)
239 contrib.append(f['id'])
240
241 try: contrib.append(c['contributor'])
242 except: contrib.append('null')
243
244 try: contrib.append(c['contributiontype'])
245 except: contrib.append('null')
246
247 try:
248 raw_payee = c['payee']
249 split_payee = raw_payee.split('&#x0D;&#x0A;')
250 clean_payee = " ".join(split_payee)
251 contrib.append(clean_payee)
252 except: contrib.append('null')
253
254 try:
255 raw_honoree = c['honoree']
256 split_honoree = raw_honoree.split('&#x0D;&#x0A;')
257 clean_honoree = " ".join(split_honoree)
258 contrib.append(clean_honoree)
259 except: contrib.append('null')
260
261 try: contrib.append(c['amount'])
262 except: contrib.append('null')
263
264 try: contrib.append(c['contributiondate'])
265 except: contrib.append('null')
266
267 print >> contribs_file, '|'.join(contrib)
268
269 except:
270 pass
271
272 ##Closing out files
273 filings_file.close()
274 lobbyists_file.close()
275 contribs_file.close()
276
277 con = sqlite.connect(os.path.join(this_scripts_data_subdirectory, "contribs"))
278 cur = con.cursor()
279
280 ## Creating Sqlite tables
281 ## Will ultimately need to convert date fields from varchar to datetime.
282 create_tables = """
283 create table if not exists
284 filing(
285 artificial_filing_id integer,
286 xml_file_name varchar(100),
287 sopr_filing_id varchar(100),
288 year integer,
289 received varchar(100),
290 type varchar(100),
291 period varchar(100),
292 registrant_id integer,
293 registrant_name varchar(100),
294 registrant_address varchar(500),
295 registrant_country varchar(100),
296 insert_datetime datetime
297 );
298
299 create table if not exists
300 contrib(
301 artificial_filing_id integer,
302 xml_file_name varchar(100),
303 sopr_filing_id varchar(100),
304 contributor varchar(100),
305 contribution_type varchar(100),
306 payee varchar(200),
307 honoree varchar(200),
308 amount integer,
309 contribution_date varchar(100),
310 insert_datetime datetime
311 );
312
313 create table if not exists
314 lobbyist(
315 artificial_filing_id integer,
316 xml_file_name varchar(100),
317 sopr_filing_id varchar(100),
318 lobbyist_name varchar(200),
319 insert_datetime datetime
320 );"""
321
322 cur.executescript(create_tables)
323
324 ##Reopening flat files for reading so they can be inserted into the db
325 filings_file = codecs.open(filings_path, "r", "utf-8")
326 lobbyists_file = codecs.open(lobbyists_path, "r", "utf-8")
327 contribs_file = codecs.open(contribs_path, "r", "utf-8")
328
329 print "Inserting filings"
330 for line in filings_file:
331 record = line.split('|')
332 record.append(sqlitestamp)
333
334 insert_record = """
335 insert into filing(
336 artificial_filing_id,
337 xml_file_name,
338 sopr_filing_id,
339 year,
340 received,
341 type,
342 period,
343 registrant_id,
344 registrant_name,
345 registrant_address,
346 registrant_country,
347 insert_datetime
348 )
349 values (
350 ?,
351 ?,
352 ?,
353 ?,
354 ?,
355 ?,
356 ?,
357 ?,
358 ?,
359 ?,
360 ?,
361 ?
362 );"""
363
364 cur.execute(insert_record, record)
365
366 con.commit()
367
368 print "Inserting contribs"
369 for line in contribs_file:
370 record = line.split('|')
371 record.append(sqlitestamp)
372
373 insert_record = """
374 insert into contrib(
375 artificial_filing_id,
376 xml_file_name,
377 sopr_filing_id,
378 contributor,
379 contribution_type,
380 payee,
381 honoree,
382 amount,
383 contribution_date,
384 insert_datetime
385 )
386 values (
387 ?,
388 ?,
389 ?,
390 ?,
391 ?,
392 ?,
393 ?,
394 ?,
395 ?,
396 ?
397 );"""
398
399 cur.execute(insert_record, record)
400
401 con.commit()
402
403 print "Inserting lobbyists"
404 for line in lobbyists_file:
405 record = line.split('|')
406 record.append(sqlitestamp)
407
408 insert_record = """
409 insert into lobbyist(
410 artificial_filing_id,
411 xml_file_name,
412 sopr_filing_id,
413 lobbyist_name,
414 insert_datetime
415 )
416 values (
417 ?,
418 ?,
419 ?,
420 ?,
421 ?
422 );"""
423
424 cur.execute(insert_record, record)
425
426 con.commit()
427
428 con.close()
429
430 ##Closing out files
431 filings_file.close()
432 lobbyists_file.close()
433 contribs_file.close()
434