This repository is private.
All pages are served over SSL and all pushing and pulling is done over SSH.
No one may fork, clone, or view it unless they are added as a member.
Every repository with this icon (
) is private.
Every repository with this icon (
This repository is public.
Anyone may fork, clone, or view it.
Every repository with this icon (
) is public.
Every repository with this icon (
Derek Willis (author)
Sun Jun 22 19:34:57 -0700 2008
fec-utilities / fec.py
| 145c8a14 » | Derek Willis | 2008-05-21 | 1 | #!/usr/bin/python | |
| 2 | """ | ||||
| c03f8998 » | Ben Welsh | 2008-05-28 | 3 | Library of functions to process and handle Federal Election Commission data | |
| 4 | available from http://www.fec.gov and its FTP site, ftp://ftp.fec.gov. | ||||
| 145c8a14 » | Derek Willis | 2008-05-21 | 5 | ||
| 6 | The MIT License | ||||
| 7 | |||||
| 8 | Copyright (c) 2008 Derek Willis | ||||
| 9 | |||||
| 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||||
| 11 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||||
| 12 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||||
| 13 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||||
| 14 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||||
| 15 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||||
| 16 | THE SOFTWARE. | ||||
| 17 | |||||
| c03f8998 » | Ben Welsh | 2008-05-28 | 18 | Modified by: | |
| 19 | Ben Welsh | ||||
| 20 | Los Angeles | ||||
| 21 | May 28, 2008 | ||||
| 22 | |||||
| 145c8a14 » | Derek Willis | 2008-05-21 | 23 | """ | |
| 24 | __author__ = "Derek Willis <dwillis@gmail.com>" | ||||
| 96ce2d43 » | Derek Willis | 2008-06-22 | 25 | __date__ = "$Date: 2008/06/22 $" | |
| 145c8a14 » | Derek Willis | 2008-05-21 | 26 | __version__ = "$Revision: 2.2 $" | |
| 27 | |||||
| 28 | import re | ||||
| 29 | import urllib | ||||
| 30 | import sys | ||||
| 31 | import string | ||||
| 32 | import urlparse | ||||
| c03f8998 » | Ben Welsh | 2008-05-28 | 33 | import time | |
| 145c8a14 » | Derek Willis | 2008-05-21 | 34 | import datetime | |
| cf4bd065 » | Derek Willis | 2008-06-22 | 35 | import csv | |
| 145c8a14 » | Derek Willis | 2008-05-21 | 36 | ||
def latest_news():
    """
    Scrape the FEC's news-release index and write it out as an RSS 2.0 feed.

    The index page for the current calendar year is downloaded, its
    whitespace is collapsed, and a regular expression pulls the date, link
    and title of each release.  Relative links are resolved against the
    press directory for the year.  The first ten releases are handed to
    make_rss_20, which writes them to 'latest_news.xml'.

    If the page cannot be fetched (IOError or AssertionError) the feed is
    written with no items.

    Running this script using versions of Python before 2.3 requires
    importing sre as re and resetting the maximum recursion limit.

    Based on a script by Sam Ruby.

    Usage from within Python shell:
        from fec import latest_news
        latest_news()
    """
    # The press pages live in a per-year directory.
    this_year = datetime.date.today().year
    base_url = 'http://www.fec.gov/press/press%s/' % this_year
    url = base_url + '%sNewsReleases.shtml' % this_year

    # Fetch the index page; fall back to an empty document on failure.
    try:
        html = urllib.urlopen(url).read()
    except (IOError, AssertionError):
        html = ''

    # Collapse line breaks, tabs and runs of spaces into single spaces.
    html = ' '.join(html.split())

    # Captures (date, link, title) for each table row.  DOTALL lets '.'
    # span whatever is left between the cells.
    release_re = re.compile("""valign=.top.>(.*?\d\d\d\d).?</td>.*?<td.valign=.top.>.*?<a href=.(.*?).>(.*?)</a>.*?</td>.*?</tr>""", re.DOTALL)

    data = []
    for date_string, link, title in release_re.findall(html)[:10]:
        # Links on the page are relative to the press directory.
        absolute_link = urlparse.urljoin(base_url, link)
        # RSS wants RFC 2822 dates.
        # See: http://feedvalidator.org/docs/error/InvalidRFC2822Date.html
        parsed_date = time.strptime(date_string, '%B %d, %Y')
        pubDate = time.strftime('%a, %d %b %Y 00:00:00 GMT', parsed_date)
        # The description field is left empty for the time being.
        data.append((title, absolute_link, '', pubDate))

    make_rss_20('Latest FEC News', 'Press releases and announcements.', data, 'latest_news.xml')
| c03f8998 » | Ben Welsh | 2008-05-28 | 96 | ||
def latest_filings():
    """
    Fetch today's electronic filings, write them out as RSS, and return them.

    The FEC query page is requested for today's date and every <dt> tag
    (one per committee) is examined.  Two things are produced:

      * an RSS 2.0 feed, written to 'latest_filings.xml' via make_rss_20,
        holding the first listed filing of each committee;
      * the return value: a list of strings, one per filing, each the
        committee name concatenated with the filing title.

    Raises SystemExit (after printing install instructions) when
    BeautifulSoup is not installed.

    Dependency: BeautifulSoup for HTML parsing
    (http://www.crummy.com/software/BeautifulSoup/)

    Usage from within Python shell:
        from fec import latest_filings
        latest_filings()
    """
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print("""
IMPORT ERROR: Required Beautiful Soup module not found.

Installation instructions:

If you have easy_install, enter
"sudo easy_install BeautifulSoup"
via your shell.

Otherwise, the source can be downloaded from
http://www.crummy.com/software/BeautifulSoup/
""")
        raise SystemExit

    # The query form expects the date as zero-padded MM/DD/YYYY.
    stringdate = datetime.date.today().strftime('%m/%d/%Y')
    params = {'date': stringdate}
    base_url = 'http://query.nictusa.com/cgi-bin/dcdev/forms/'

    # Passing data to urlopen makes this a POST of the form fields.
    txt = urllib.urlopen(base_url, urllib.urlencode(params)).read()
    soup = BeautifulSoup(txt)
    # One <dt> per committee that filed today.
    filings = soup.findAll('dt')

    # Plain list of "name + title" strings: the function's return value.
    today = []
    for cmte in filings:
        name = cmte('h4')[0]('a')[0].contents[0]
        # Each filing occupies six child nodes of the <dt>; the node at
        # offset 5 within each group is the filing's title.
        number_of_filings = len(cmte.contents) // 6
        for n in range(number_of_filings):
            today.append(name + cmte.contents[5 + 6 * n])

    # Records for the RSS feed.  This used to sit after an early
    # 'return today' and therefore never ran.
    data = []
    for filing in filings:
        # Pull the committee name and cut off the committee id.
        committee = filing('h4')[0]('a')[0].contents[0]
        title = re.split(' - ', committee)[0]
        # Pull the hyperlink to the filing.
        link = filing.contents[2]['href']
        # Grab the field with form and date information.  Only
        # non-breaking spaces and newlines are removed: ordinary spaces
        # must survive because the splits below rely on ' - ' and 'filed '.
        form = filing.contents[5]
        form = form.replace('&nbsp;', '').replace(u'\xa0', '').replace('\n', '')
        pieces = re.split(' - ', form)
        description = pieces[0].strip()
        # RSS wants RFC 2822 dates.
        # See: http://feedvalidator.org/docs/error/InvalidRFC2822Date.html
        date_string = pieces[1].split('filed ')[1]
        date_date = time.strptime(date_string.strip(), '%m/%d/%Y')
        pubDate = time.strftime('%a, %d %b %Y 00:00:00 GMT', date_date)
        data.append((title, urlparse.urljoin(base_url, link), description, pubDate))

    # Transform the records to RSS 2.0, then hand back the plain list.
    make_rss_20('Latest FEC Filings', 'Committee finance reports.', data, 'latest_filings.xml')
    return today
| 4da37cca » | Derek Willis | 2008-05-28 | 170 | ||
| 171 | |||||
def cmte_filings(cmte):
    """
    Write an RSS 2.0 feed of the electronic filings of one committee.

    cmte is the committee's C-number (for example 'C00260547').  The FEC
    query page for that committee is fetched; BeautifulSoup is used only
    to read the committee name from the first <dt> tag, and the filings
    themselves are extracted from the raw HTML with a regular expression.
    The result is passed to make_rss_20, which writes
    'latest_cmte_filings.xml'.

    Raises SystemExit (after printing install instructions) when
    BeautifulSoup is not installed.

    Dependency: BeautifulSoup for HTML parsing
    (http://www.crummy.com/software/BeautifulSoup/)

    Usage from within Python shell:
        from fec import cmte_filings
        cmte_filings('C00260547')
    """
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print("""
IMPORT ERROR: Required Beautiful Soup module not found.

Installation instructions:

If you have easy_install, enter
"sudo easy_install BeautifulSoup"
via your shell.

Otherwise, the source can be downloaded from
http://www.crummy.com/software/BeautifulSoup/
""")
        raise SystemExit

    # The committee id is the only form field sent with the request.
    base_url = 'http://query.nictusa.com/cgi-bin/dcdev/forms/'
    query = urllib.urlencode({'comid': cmte})
    txt = urllib.urlopen(base_url, query).read()

    # The first <dt> carries "<committee name> - <committee id>"; keep
    # only the name.
    soup = BeautifulSoup(txt)
    first_dt = soup.findAll('dt')[0]
    title = re.split(' - ', first_dt.a.contents[0])[0]

    # Regex to match filings - no soup here!  The seven groups are, in
    # order: link path, form name, period start, period end, filing date,
    # trailing text of the line, and the id of an amending filing.
    results = re.compile("""[<BR>]?.*?<A HREF=\'/cgi-bin/dcdev/forms/(C[0-9]*?/[0-9]*?)/\'>(?:<FONT COLOR ="#990000">)?View(?:</FONT>)?</A> <A HREF=\'/cgi-bin/dcdev/forms/DL/[0-9].*/\'>(?:<FONT COLOR ="#990000">)?Download(?:</FONT>)?</A> .*?(Form F.*?)..-.(?:period.(.*?)-(.*?),.)?filed.(.*?).-.(.*?)(?:<BR> <B>Amended</B> by <A HREF=\'/cgi-bin/dcdev/forms/C[0-9]*?/([0-9]*?)/\'>.*?</A>)?\n""")

    data = []
    for match in results.findall(txt):
        link, form, periodstart, periodend, filedate, filing, amend = match
        # RSS wants RFC 2822 dates.
        filed_on = time.strptime(filedate, '%m/%d/%Y')
        pubDate = time.strftime('%a, %d %b %Y 00:00:00 GMT', filed_on)
        data.append((title, urlparse.urljoin(base_url, link), filing, pubDate))

    # The committee id goes into the channel title.
    intro = 'Latest FEC Filings from %s' % cmte
    make_rss_20(intro, 'Committee finance reports.', data, 'latest_cmte_filings.xml')
| 225 | |||||
| 226 | |||||
def latest_electioneering_filings():
    """
    Write an RSS 2.0 feed of electioneering communications filings.

    The CSV file ftp://ftp.fec.gov/FEC/electioneering.csv is read row by
    row; each row becomes a feed item whose title is the filer name, whose
    link points at the filing image, and whose description gives the
    amount spent and the date of first airing.  Only the first ten rows
    are kept, and the feed is written to 'elect_comm.xml' via make_rss_20.

    Dates in the file use three-letter upper-case month names, so they are
    translated with a slightly altered version of the MONTHS_3 table used
    by Django before being parsed.

    Prints a message and raises SystemExit if the file cannot be read.
    """
    try:
        url = "ftp://ftp.fec.gov/FEC/electioneering.csv"
        ec = urllib.urlopen(url)
        reader = csv.DictReader(ec)
        # NOTE(review): DictReader already consumes the header line as
        # field names, so this discards the first row after the header -
        # confirm that row is not a real filing.
        reader.next()
    except IOError:
        print("Network Error: File cannot be accessed.")
        raise SystemExit

    # Three-letter month -> month number, for building parseable dates.
    MONTHS_3 = { 'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6, 'JUL': 7, 'AUG':8, 'SEP':9, 'OCT':10, 'NOV':11, 'DEC':12}

    def _to_struct_time(raw):
        # Turn 'DD-MON-YY' into a time.struct_time.
        parts = raw.strip().split('-')
        parts[1] = str(MONTHS_3[parts[1]])
        return time.strptime("-".join(parts), '%d-%m-%y')

    base_url = 'http://images.nictusa.com/cgi-bin/fecimg/?_'
    data = []
    for row in reader:
        # The filing (receipt) date.
        filed = _to_struct_time(row[' RECEIPT_DT'])
        # The date the ad(s) aired; fall back to the filing date when the
        # column is empty.
        if row[' PUBLIC_DISTRIBUTION_DT '] == '':
            aired = filed
        else:
            aired = _to_struct_time(row[' PUBLIC_DISTRIBUTION_DT '])
        dist_date = time.strftime("%b %d, %Y", aired)
        pubDate = time.strftime('%a, %d %b %Y 00:00:00 GMT', filed)

        item_title = row[' FILER_NAME ']
        item_link = base_url + row[' BEGIN_IMAGE_NUM '] + '+0'
        item_text = "Amount Spent: $" + row[' TOTAL_DISBURSEMENTS_THIS_STMT '] + " first airing on " + dist_date
        data.append((item_title, item_link, item_text, pubDate))

    # Limit the feed to the first ten rows of the file.
    data = data[:10]
    intro = "Latest Electioneering Communications filings"
    make_rss_20(intro, 'Electioneering communications', data, 'elect_comm.xml')
| 265 | |||||
| 266 | |||||
def cand_summary_by_state(year, state):
    """
    Return summary numbers for a state's candidates in an election year.

    Partially working: bombs out on the first House candidate.

    year  -- election year; only its fourth character is sent to the
             server as 'dbyear' (2008 -> 8)
    state -- state value passed through unchanged as the 'state' field

    Returns a list of tuples
    (name, office, receipts, spent, cash, debt, date), one per table row
    after the first three children of the page's first table.

    Raises SystemExit (after printing install instructions) when
    BeautifulSoup is not installed.

    Dependency: BeautifulSoup for HTML parsing
    (http://www.crummy.com/software/BeautifulSoup/)
    """
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print("""
IMPORT ERROR: Required Beautiful Soup module not found.

Installation instructions:

If you have easy_install, enter
"sudo easy_install BeautifulSoup"
via your shell.

Otherwise, the source can be downloaded from
http://www.crummy.com/software/BeautifulSoup/
""")
        raise SystemExit

    params = { 'dbyear': int(str(year)[3]), 'state': state }
    base_url = 'http://herndon1.sdrdc.com/cgi-bin/cancomsrs/'
    txt = urllib.urlopen(base_url, urllib.urlencode(params)).read()
    table_nodes = BeautifulSoup(txt).table.contents

    data = []
    # The first three children of the table are skipped.
    for row in table_nodes[3:]:
        cells = row.contents
        # Name and office are wrapped in links; the rest are plain cells.
        name = cells[0].a.contents[0]
        office = cells[1].a.contents[0]
        receipts, spent, cash, debt, date = [
            cells[i].contents[0] for i in range(2, 7)
        ]
        data.append((name, office, receipts, spent, cash, debt, date))
    return data
| 310 | |||||
| 311 | |||||
def make_rss_20(title, description, data, file_name):
    """
    Write an RSS 2.0 feed to a file.

    title       -- channel title
    description -- channel description
    data        -- sequence of (title, link, description, pubDate) tuples,
                   one per feed item; only the first ten are written
    file_name   -- path of the file to create or overwrite

    All values are inserted into the XML verbatim (nothing is escaped), so
    callers are responsible for passing markup-safe text.  The channel link
    is always http://www.fec.gov/.  Returns None.

    http://en.wikipedia.org/wiki/RSS_(file_format)#RSS_2.0
    """
    # Channel header.  A stray double quote used to follow the description.
    parts = ["""<?xml version="1.0" encoding="ISO-8859-1"?>
<rss version="2.0">
<channel>
<title>""" + title + """</title>
<link>http://www.fec.gov/</link>
<description>""" + description + """</description>
<language>en-us</language>
"""]

    # One <item> per record, limited to the first ten.  Distinct loop names
    # keep the channel-level parameters from being shadowed.
    for (item_title, item_link, item_description, pubDate) in data[:10]:
        parts.append("""
<item>
<title>%s</title>
<link>%s</link>
<description>%s</description>
<pubDate>%s</pubDate>
</item>\n""" % (item_title, item_link, item_description, pubDate))

    # Close the feed.
    parts.append("""
</channel>
</rss>
""")

    # try/finally rather than 'with' so the file is closed even when the
    # write fails, without requiring a newer Python than the rest of the
    # module.
    fh = open(file_name, 'w')
    try:
        fh.write(''.join(parts))
    finally:
        fh.close()
| 2c892b6c » | Derek Willis | 2008-05-28 | 351 | ||
| 352 | |||||







