palewire / sopr-contribs
- Source
- Commits
- Network (1)
- Issues (0)
- Downloads (0)
- Wiki (1)
- Graphs
-
Tree:
2e8d1a1
Ben Welsh (author)
Sun Aug 17 20:24:12 -0700 2008
sopr-contribs / fetch.py
| 2e8d1a13 » | Ben Welsh | 2008-08-17 | 1 | #!/usr/bin/env python | |
| 2 | """ | ||||
| 3 | A script that fetches, parses and archives the XML data dumps of lobbyist's | ||||
| 4 | political contributions published by The Senate Office of Public Records. | ||||
| 5 | |||||
| 6 | Zips files containing the XML are: | ||||
| 7 | 1. Downloaded and unzipped. | ||||
| 8 | 2. Parsed out into flat text files and stored in a timestamped folder structure. | ||||
| 9 | 3. Imported to a SQLite database. | ||||
| 10 | |||||
| 11 | The ultimate goal is for a series of SQL statements to scrub and cut the data | ||||
| 12 | to account for flaws in the reporting system first uncovered by Bill Allison | ||||
| 13 | and Anupama Narayanswamy of The Sunlight Foundation. | ||||
| 14 | |||||
| 15 | Sunlight study: | ||||
| 16 | http://realtime.sunlightprojects.org/2008/08/14/mark-warner-biggest-recipient-of-lobbyist-dough-new-disclosures-show-so-far/ | ||||
| 17 | |||||
| 18 | Simple analysis tasks | ||||
| 19 | could then be scripted to output as schedule XLS dumps, email alerts or maybe | ||||
| 20 | even Django-ifed HTML. | ||||
| 21 | |||||
| 22 | Source URL: | ||||
| 23 | http://www.senate.gov/legislative/Public_Disclosure/contributions_download.htm | ||||
| 24 | |||||
| 25 | Dependencies: BeautifulSoup, Pysqlite2 | ||||
| 26 | |||||
| 27 | The MIT License | ||||
| 28 | |||||
| 29 | Copyright (c) 2008 Ben Welsh | ||||
| 30 | |||||
| 31 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| 32 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| 33 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||
| 34 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| 35 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||
| 36 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||||
| 37 | THE SOFTWARE. | ||||
| 38 | |||||
| 39 | """ | ||||
| 40 | __author__ = "Ben Welsh <ben.welsh@gmail.com>" | ||||
| 41 | __date__ = "$Date: 2008/08/17 $" | ||||
| 42 | __version__ = "$Revision: 0.1 $" | ||||
| 43 | |||||
| 44 | import datetime | ||||
| 45 | import os | ||||
| 46 | import urllib | ||||
| 47 | import re | ||||
| 48 | import codecs | ||||
| 49 | import zipfile | ||||
| 50 | import string | ||||
| 51 | try: | ||||
| 52 | from BeautifulSoup import BeautifulStoneSoup, BeautifulSoup | ||||
| 53 | except ImportError: | ||||
| 54 | print """ | ||||
| 55 | IMPORT ERROR: Required module not found: Beautiful Soup. | ||||
| 56 | Installation instructions: | ||||
| 57 | If you have easy_install, enter | ||||
| 58 | "sudo easy_install BeautifulSoup" | ||||
| 59 | via your shell. | ||||
| 60 | Otherwise, the source can be downloaded from | ||||
| 61 | http://www.crummy.com/software/BeautifulSoup/ | ||||
| 62 | """ | ||||
| 63 | raise SystemExit | ||||
| 64 | try: | ||||
| 65 | from pysqlite2 import dbapi2 as sqlite | ||||
| 66 | except ImportError: | ||||
| 67 | print """ | ||||
| 68 | IMPORT ERROR: Required module not found: Pysqlite. | ||||
| 69 | Visit http://pysqlite.org and download the latest module. | ||||
| 70 | """ | ||||
| 71 | raise SystemExit | ||||
| 72 | |||||
| 73 | #"""Create an archive folder structure using the current datetime. Returns path.""" | ||||
| 74 | ###Setting timestamps | ||||
| 75 | now = datetime.datetime.now() | ||||
| 76 | datestamp = "%s-%s-%s" % (now.year, now.month, now.day) | ||||
| 77 | timestamp = "%sh%sm%ss" % (now.hour, now.minute, now.second) | ||||
| 78 | sqlitestamp = "%s-%s-%s %s:%s:%s" % (now.year, now.month, now.day, | ||||
| 79 | now.hour, now.minute, now.second) | ||||
| 80 | |||||
| 81 | ###Setting directory variables, creating archive folder structure | ||||
| 82 | working_directory = "." | ||||
| 83 | data_directory = os.path.join(working_directory, 'data') | ||||
| 84 | |||||
| 85 | if os.path.isdir(data_directory): | ||||
| 86 | print "Data directory already exists at %s" % data_directory | ||||
| 87 | else: | ||||
| 88 | os.mkdir(data_directory) | ||||
| 89 | print "Creating data directory at %s" % data_directory | ||||
| 90 | |||||
| 91 | todays_data_subdirectory = os.path.join(data_directory, datestamp) | ||||
| 92 | |||||
| 93 | if os.path.isdir(todays_data_subdirectory): | ||||
| 94 | print "Today's data subdirectory already exists at %s" % todays_data_subdirectory | ||||
| 95 | else: | ||||
| 96 | os.mkdir(todays_data_subdirectory) | ||||
| 97 | print "Creating today's data subdirectory at %s" % todays_data_subdirectory | ||||
| 98 | |||||
| 99 | this_scripts_data_subdirectory = os.path.join(todays_data_subdirectory, timestamp) | ||||
| 100 | |||||
| 101 | if os.path.isdir(this_scripts_data_subdirectory): | ||||
| 102 | print "This script's data subdirectory already exists at %s" % this_scripts_data_subdirectory | ||||
| 103 | else: | ||||
| 104 | os.mkdir(this_scripts_data_subdirectory) | ||||
| 105 | print "Creating this script's data subdirectory at %s" % this_scripts_data_subdirectory | ||||
| 106 | |||||
| 107 | ##Open files for writing out. | ||||
| 108 | filings_path = os.path.join(this_scripts_data_subdirectory, 'filings.txt') | ||||
| 109 | lobbyists_path = os.path.join(this_scripts_data_subdirectory, 'lobbyists.txt') | ||||
| 110 | contribs_path = os.path.join(this_scripts_data_subdirectory, 'contribs.txt') | ||||
| 111 | |||||
| 112 | filings_file = codecs.open(filings_path, "w", "utf-8") | ||||
| 113 | lobbyists_file = codecs.open(lobbyists_path, "w", "utf-8") | ||||
| 114 | contribs_file = codecs.open(contribs_path, "w", "utf-8") | ||||
| 115 | |||||
| 116 | ##Visiting SOPR to grab the zip downloads | ||||
| 117 | url = 'http://www.senate.gov/legislative/Public_Disclosure/contributions_download.htm' | ||||
| 118 | http = urllib.urlopen(url) | ||||
| 119 | soup = BeautifulSoup(http) | ||||
| 120 | anchor_tags = soup.findAll('a') | ||||
| 121 | zip_links = [] | ||||
| 122 | for a in anchor_tags: | ||||
| 123 | href = a['href'] | ||||
| 124 | if re.search('(.*).zip', href): | ||||
| 125 | zip_links.append(href) | ||||
| 126 | |||||
| 127 | for zip_link in zip_links: | ||||
| 128 | zip_name = zip_link.split('/')[-1] | ||||
| 129 | zip_path = os.path.join(this_scripts_data_subdirectory, zip_name) | ||||
| 130 | urllib.urlretrieve(zip_link, zip_path) | ||||
| 131 | print "Downloaded %s " % zip_name | ||||
| 132 | |||||
| 133 | ##Unzip file | ||||
| 134 | try: | ||||
| 135 | zip = zipfile.ZipFile(zip_path) | ||||
| 136 | for file in zip.namelist(): | ||||
| 137 | print "Unzipping %s" % file | ||||
| 138 | f = open(os.path.join(this_scripts_data_subdirectory, file), 'wb') | ||||
| 139 | f.write(zip.read(file)) | ||||
| 140 | f.close() | ||||
| 141 | except: | ||||
| 142 | print "Failed to unzip %s" % zip_name | ||||
| 143 | |||||
| 144 | ##Snatching XML files for parsing | ||||
| 145 | this_scripts_downloads = os.listdir(this_scripts_data_subdirectory) | ||||
| 146 | this_scripts_xml_files = [] | ||||
| 147 | |||||
| 148 | for file in this_scripts_downloads: | ||||
| 149 | if re.search(".xml", file): | ||||
| 150 | this_scripts_xml_files.append(file) | ||||
| 151 | |||||
| 152 | filing_id = 0 | ||||
| 153 | |||||
| 154 | ##Parsing XML files | ||||
| 155 | for xml_file_name in this_scripts_xml_files: | ||||
| 156 | print "Processing %s" % xml_file_name | ||||
| 157 | |||||
| 158 | xml_file = os.path.join(this_scripts_data_subdirectory, xml_file_name) | ||||
| 159 | xml = open(xml_file, "r") | ||||
| 160 | |||||
| 161 | soup = BeautifulStoneSoup(xml, selfClosingTags=['lobbyist', 'contribution', 'registrant']) | ||||
| 162 | |||||
| 163 | ##Parsing filing data | ||||
| 164 | for f in soup.publicfilings.findAll('filing'): | ||||
| 165 | |||||
| 166 | filing_id = filing_id + 1 | ||||
| 167 | |||||
| 168 | filing = [] | ||||
| 169 | |||||
| 170 | filing.append("%s" % filing_id) | ||||
| 171 | filing.append(xml_file_name) | ||||
| 172 | |||||
| 173 | try: filing.append(f['id']) | ||||
| 174 | except: filing.append('null') | ||||
| 175 | |||||
| 176 | try: filing.append(f['year']) | ||||
| 177 | except: filing.append('null') | ||||
| 178 | |||||
| 179 | try: filing.append(f['received']) | ||||
| 180 | except: filing.append('null') | ||||
| 181 | |||||
| 182 | try: filing.append(f['type']) | ||||
| 183 | except: filing.append('null') | ||||
| 184 | |||||
| 185 | try: filing.append(f['period']) | ||||
| 186 | except: filing.append('null') | ||||
| 187 | |||||
| 188 | try: filing.append(f.registrant['registrantid']) | ||||
| 189 | except: filing.append('null') | ||||
| 190 | |||||
| 191 | try: | ||||
| 192 | raw_registrant = f.registrant['registrantname'] | ||||
| 193 | split_registrant = raw_registrant.split('
') | ||||
| 194 | clean_registrant = " ".join(split_registrant) | ||||
| 195 | filing.append(clean_registrant) | ||||
| 196 | except: filing.append('null') | ||||
| 197 | |||||
| 198 | try: | ||||
| 199 | raw_address = f.registrant['address'] | ||||
| 200 | split_address = raw_address.split('
') | ||||
| 201 | clean_address = " ".join(split_address) | ||||
| 202 | filing.append(clean_address) | ||||
| 203 | except: filing.append('null') | ||||
| 204 | |||||
| 205 | try: | ||||
| 206 | raw_country = f.registrant['registrantcountry'] | ||||
| 207 | split_country = raw_country.split('
') | ||||
| 208 | clean_country = " ".join(split_country) | ||||
| 209 | filing.append(clean_country) | ||||
| 210 | except: filing.append('null') | ||||
| 211 | |||||
| 212 | print >> filings_file, '|'.join(filing) | ||||
| 213 | |||||
| 214 | try: | ||||
| 215 | ##Parsing lobbyist names | ||||
| 216 | for l in f.findAll('lobbyist'): | ||||
| 217 | lobbyist = [] | ||||
| 218 | lobbyist.append("%s" % filing_id) | ||||
| 219 | lobbyist.append(xml_file_name) | ||||
| 220 | lobbyist.append(f['id']) | ||||
| 221 | try: | ||||
| 222 | raw_name = l['lobbyistname'] | ||||
| 223 | split_name = raw_name.split('
') | ||||
| 224 | clean_name = " ".join(split_name) | ||||
| 225 | lobbyist.append(clean_name) | ||||
| 226 | except: lobbyist.append('null') | ||||
| 227 | |||||
| 228 | print >> lobbyists_file, '|'.join(lobbyist) | ||||
| 229 | except: | ||||
| 230 | print "Failed parsing lobbyist record for filing %s" % f['id'] | ||||
| 231 | |||||
| 232 | try: | ||||
| 233 | ##Parsing contributions data | ||||
| 234 | for c in f.contributions: | ||||
| 235 | |||||
| 236 | contrib = [] | ||||
| 237 | contrib.append("%s" % filing_id) | ||||
| 238 | contrib.append(xml_file_name) | ||||
| 239 | contrib.append(f['id']) | ||||
| 240 | |||||
| 241 | try: contrib.append(c['contributor']) | ||||
| 242 | except: contrib.append('null') | ||||
| 243 | |||||
| 244 | try: contrib.append(c['contributiontype']) | ||||
| 245 | except: contrib.append('null') | ||||
| 246 | |||||
| 247 | try: | ||||
| 248 | raw_payee = c['payee'] | ||||
| 249 | split_payee = raw_payee.split('
') | ||||
| 250 | clean_payee = " ".join(split_payee) | ||||
| 251 | contrib.append(clean_payee) | ||||
| 252 | except: contrib.append('null') | ||||
| 253 | |||||
| 254 | try: | ||||
| 255 | raw_honoree = c['honoree'] | ||||
| 256 | split_honoree = raw_honoree.split('
') | ||||
| 257 | clean_honoree = " ".join(split_honoree) | ||||
| 258 | contrib.append(clean_honoree) | ||||
| 259 | except: contrib.append('null') | ||||
| 260 | |||||
| 261 | try: contrib.append(c['amount']) | ||||
| 262 | except: contrib.append('null') | ||||
| 263 | |||||
| 264 | try: contrib.append(c['contributiondate']) | ||||
| 265 | except: contrib.append('null') | ||||
| 266 | |||||
| 267 | print >> contribs_file, '|'.join(contrib) | ||||
| 268 | |||||
| 269 | except: | ||||
| 270 | pass | ||||
| 271 | |||||
| 272 | ##Closing out files | ||||
| 273 | filings_file.close() | ||||
| 274 | lobbyists_file.close() | ||||
| 275 | contribs_file.close() | ||||
| 276 | |||||
| 277 | con = sqlite.connect(os.path.join(this_scripts_data_subdirectory, "contribs")) | ||||
| 278 | cur = con.cursor() | ||||
| 279 | |||||
| 280 | ## Creating Sqlite tables | ||||
| 281 | ## Will ultimately need to convert date fields from varchar to datetime. | ||||
| 282 | create_tables = """ | ||||
| 283 | create table if not exists | ||||
| 284 | filing( | ||||
| 285 | artificial_filing_id integer, | ||||
| 286 | xml_file_name varchar(100), | ||||
| 287 | sopr_filing_id varchar(100), | ||||
| 288 | year integer, | ||||
| 289 | received varchar(100), | ||||
| 290 | type varchar(100), | ||||
| 291 | period varchar(100), | ||||
| 292 | registrant_id integer, | ||||
| 293 | registrant_name varchar(100), | ||||
| 294 | registrant_address varchar(500), | ||||
| 295 | registrant_country varchar(100), | ||||
| 296 | insert_datetime datetime | ||||
| 297 | ); | ||||
| 298 | |||||
| 299 | create table if not exists | ||||
| 300 | contrib( | ||||
| 301 | artificial_filing_id integer, | ||||
| 302 | xml_file_name varchar(100), | ||||
| 303 | sopr_filing_id varchar(100), | ||||
| 304 | contributor varchar(100), | ||||
| 305 | contribution_type varchar(100), | ||||
| 306 | payee varchar(200), | ||||
| 307 | honoree varchar(200), | ||||
| 308 | amount integer, | ||||
| 309 | contribution_date varchar(100), | ||||
| 310 | insert_datetime datetime | ||||
| 311 | ); | ||||
| 312 | |||||
| 313 | create table if not exists | ||||
| 314 | lobbyist( | ||||
| 315 | artificial_filing_id integer, | ||||
| 316 | xml_file_name varchar(100), | ||||
| 317 | sopr_filing_id varchar(100), | ||||
| 318 | lobbyist_name varchar(200), | ||||
| 319 | insert_datetime datetime | ||||
| 320 | );""" | ||||
| 321 | |||||
| 322 | cur.executescript(create_tables) | ||||
| 323 | |||||
| 324 | ##Reopening flat files for reading so they can be inserted into the db | ||||
| 325 | filings_file = codecs.open(filings_path, "r", "utf-8") | ||||
| 326 | lobbyists_file = codecs.open(lobbyists_path, "r", "utf-8") | ||||
| 327 | contribs_file = codecs.open(contribs_path, "r", "utf-8") | ||||
| 328 | |||||
| 329 | print "Inserting filings" | ||||
| 330 | for line in filings_file: | ||||
| 331 | record = line.split('|') | ||||
| 332 | record.append(sqlitestamp) | ||||
| 333 | |||||
| 334 | insert_record = """ | ||||
| 335 | insert into filing( | ||||
| 336 | artificial_filing_id, | ||||
| 337 | xml_file_name, | ||||
| 338 | sopr_filing_id, | ||||
| 339 | year, | ||||
| 340 | received, | ||||
| 341 | type, | ||||
| 342 | period, | ||||
| 343 | registrant_id, | ||||
| 344 | registrant_name, | ||||
| 345 | registrant_address, | ||||
| 346 | registrant_country, | ||||
| 347 | insert_datetime | ||||
| 348 | ) | ||||
| 349 | values ( | ||||
| 350 | ?, | ||||
| 351 | ?, | ||||
| 352 | ?, | ||||
| 353 | ?, | ||||
| 354 | ?, | ||||
| 355 | ?, | ||||
| 356 | ?, | ||||
| 357 | ?, | ||||
| 358 | ?, | ||||
| 359 | ?, | ||||
| 360 | ?, | ||||
| 361 | ? | ||||
| 362 | );""" | ||||
| 363 | |||||
| 364 | cur.execute(insert_record, record) | ||||
| 365 | |||||
| 366 | con.commit() | ||||
| 367 | |||||
| 368 | print "Inserting contribs" | ||||
| 369 | for line in contribs_file: | ||||
| 370 | record = line.split('|') | ||||
| 371 | record.append(sqlitestamp) | ||||
| 372 | |||||
| 373 | insert_record = """ | ||||
| 374 | insert into contrib( | ||||
| 375 | artificial_filing_id, | ||||
| 376 | xml_file_name, | ||||
| 377 | sopr_filing_id, | ||||
| 378 | contributor, | ||||
| 379 | contribution_type, | ||||
| 380 | payee, | ||||
| 381 | honoree, | ||||
| 382 | amount, | ||||
| 383 | contribution_date, | ||||
| 384 | insert_datetime | ||||
| 385 | ) | ||||
| 386 | values ( | ||||
| 387 | ?, | ||||
| 388 | ?, | ||||
| 389 | ?, | ||||
| 390 | ?, | ||||
| 391 | ?, | ||||
| 392 | ?, | ||||
| 393 | ?, | ||||
| 394 | ?, | ||||
| 395 | ?, | ||||
| 396 | ? | ||||
| 397 | );""" | ||||
| 398 | |||||
| 399 | cur.execute(insert_record, record) | ||||
| 400 | |||||
| 401 | con.commit() | ||||
| 402 | |||||
| 403 | print "Inserting lobbyists" | ||||
| 404 | for line in lobbyists_file: | ||||
| 405 | record = line.split('|') | ||||
| 406 | record.append(sqlitestamp) | ||||
| 407 | |||||
| 408 | insert_record = """ | ||||
| 409 | insert into lobbyist( | ||||
| 410 | artificial_filing_id, | ||||
| 411 | xml_file_name, | ||||
| 412 | sopr_filing_id, | ||||
| 413 | lobbyist_name, | ||||
| 414 | insert_datetime | ||||
| 415 | ) | ||||
| 416 | values ( | ||||
| 417 | ?, | ||||
| 418 | ?, | ||||
| 419 | ?, | ||||
| 420 | ?, | ||||
| 421 | ? | ||||
| 422 | );""" | ||||
| 423 | |||||
| 424 | cur.execute(insert_record, record) | ||||
| 425 | |||||
| 426 | con.commit() | ||||
| 427 | |||||
| 428 | con.close() | ||||
| 429 | |||||
| 430 | ##Closing out files | ||||
| 431 | filings_file.close() | ||||
| 432 | lobbyists_file.close() | ||||
| 433 | contribs_file.close() | ||||
| 434 | |||||
