public
Description: Various tools for downloading and processing Federal Election Commission data
Homepage:
Clone URL: git://github.com/dwillis/fec-utilities.git
Derek Willis (author)
Sun Jun 22 19:34:57 -0700 2008
fec-utilities / fec.py
145c8a14 » Derek Willis 2008-05-21 moved news script into main... 1 #!/usr/bin/python
2 """
c03f8998 » Ben Welsh 2008-05-28 Expanded filings scrape, br... 3 Library of functions to process and handle Federal Election Commission data
4 available from http://www.fec.gov and its FTP site, ftp://ftp.fec.gov.
145c8a14 » Derek Willis 2008-05-21 moved news script into main... 5
6 The MIT License
7
8 Copyright (c) 2008 Derek Willis
9
10 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
11 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
13 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
14 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
15 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
16 THE SOFTWARE.
17
c03f8998 » Ben Welsh 2008-05-28 Expanded filings scrape, br... 18 Modified by:
19 Ben Welsh
20 Los Angeles
21 May 28, 2008
22
145c8a14 » Derek Willis 2008-05-21 moved news script into main... 23 """
24 __author__ = "Derek Willis <dwillis@gmail.com>"
96ce2d43 » Derek Willis 2008-06-22 updated date 25 __date__ = "$Date: 2008/06/22 $"
145c8a14 » Derek Willis 2008-05-21 moved news script into main... 26 __version__ = "$Revision: 2.2 $"
27
28 import re
29 import urllib
30 import sys
31 import string
32 import urlparse
c03f8998 » Ben Welsh 2008-05-28 Expanded filings scrape, br... 33 import time
145c8a14 » Derek Willis 2008-05-21 moved news script into main... 34 import datetime
cf4bd065 » Derek Willis 2008-06-22 added latest electioneering... 35 import csv
145c8a14 » Derek Willis 2008-05-21 moved news script into main... 36
def latest_news():
    """
    Scrape this year's FEC press-release index and write the ten most
    recent releases to ``latest_news.xml`` as an RSS 2.0 feed.

    Links on the press page are relative, so each one is resolved against
    the press directory URL before it goes into the feed. Whitespace in the
    downloaded HTML is collapsed to single spaces before matching.

    If the page cannot be fetched (IOError or AssertionError from urllib),
    the page is treated as empty and a feed with no items is written.

    Running this script with versions of Python before 2.3 requires
    importing sre as re and resetting the maximum recursion limit.

    Based on a script by Sam Ruby.

    Usage from within Python shell:
        from fec import latest_news
        latest_news()
    """
    # The press pages live in a per-year directory.
    this_year = datetime.date.today().year
    base_url = 'http://www.fec.gov/press/press%s/' % this_year
    releases_url = base_url + '%sNewsReleases.shtml' % this_year

    # Fetch the index page; fall back to an empty page on failure.
    try:
        html = urllib.urlopen(releases_url).read()
    except (IOError, AssertionError):
        html = ''

    # One table row per release: date cell, then a cell holding the link.
    # DOTALL lets '.' span line breaks inside a row.
    release_row = re.compile("""valign=.top.>(.*?\d\d\d\d).?</td>.*?<td.valign=.top.>.*?<a href=.(.*?).>(.*?)</a>.*?</td>.*?</tr>""", re.DOTALL)

    # Collapse line breaks, tabs and runs of spaces.
    html = ' '.join(html.split())

    items = []
    for release_date, href, headline in release_row.findall(html)[:10]:
        absolute_link = urlparse.urljoin(base_url, href)
        # RSS wants RFC 2822 dates.
        # See: http://feedvalidator.org/docs/error/InvalidRFC2822Date.html
        parsed = time.strptime(release_date, '%B %d, %Y')
        published = time.strftime('%a, %d %b %Y 00:00:00 GMT', parsed)
        # The description is left empty for the time being.
        items.append((headline, absolute_link, '', published))

    make_rss_20('Latest FEC News', 'Press releases and announcements.', items, 'latest_news.xml')
c03f8998 » Ben Welsh 2008-05-28 Expanded filings scrape, br... 96
def latest_filings():
    """
    Fetch today's electronic filings, write them to ``latest_filings.xml``
    as an RSS 2.0 feed, and return them as a list of strings.

    Earlier revisions returned the list before the RSS section was reached,
    so the feed was never written. The feed is now produced, and the list
    (committee name followed by filing title, one string per filing) is
    still returned so existing callers keep working.

    Dependency: BeautifulSoup for HTML parsing
    (http://www.crummy.com/software/BeautifulSoup/)

    Raises SystemExit if BeautifulSoup cannot be imported. Errors from the
    network request or from unexpected page markup propagate to the caller.

    Usage from within Python shell:
        from fec import latest_filings
        latest_filings()
    """
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print("""
IMPORT ERROR: Required Beautiful Soup module not found.

Installation instructions:

If you have easy_install, enter
"sudo easy_install BeautifulSoup"
via your shell.

Otherwise, the source can be downloaded from
http://www.crummy.com/software/BeautifulSoup/
""")
        raise SystemExit

    # The form expects today's date as MM/DD/YYYY.
    stringdate = datetime.date.today().strftime('%m/%d/%Y')
    params = {'date': stringdate}
    base_url = 'http://query.nictusa.com/cgi-bin/dcdev/forms/'

    # Passing encoded params as the second argument makes this a POST.
    txt = urllib.urlopen(base_url, urllib.urlencode(params)).read()
    soup = BeautifulSoup(txt)

    # Each committee's filings sit inside one <dt> tag.
    filings = soup.findAll('dt')

    # Legacy return value: "<committee name><filing title>" per filing.
    today = []
    for cmte in filings:
        name = cmte('h4')[0]('a')[0].contents[0]
        # Each filing occupies six child nodes; the title is at offset 5.
        number_of_filings = len(cmte.contents) // 6
        for n in range(number_of_filings):
            today.append(name + cmte.contents[5 + 6 * n])

    # RSS records: only the first filing listed for each committee is used.
    data = []
    for filing in filings:
        # Committee name with the trailing " - <C-number>" cut off.
        committee = filing('h4')[0]('a')[0].contents[0]
        title = re.split(' - ', committee)[0]
        # Relative link to the filing.
        link = filing.contents[2]['href']
        # Text of the form "<form description> - filed MM/DD/YYYY ...".
        form = filing.contents[5].replace('&nbsp;', '').replace('\n', '')
        form_parts = re.split(' - ', form)
        description = form_parts[0].strip()
        # Reformat the filing date for RSS (RFC 2822).
        # See: http://feedvalidator.org/docs/error/InvalidRFC2822Date.html
        date_string = form_parts[1].split('filed ')[1]
        date_date = time.strptime(date_string.strip(), '%m/%d/%Y')
        pubDate = time.strftime('%a, %d %b %Y 00:00:00 GMT', date_date)
        data.append((title, urlparse.urljoin(base_url, link), description, pubDate))

    make_rss_20('Latest FEC Filings', 'Committee finance reports.', data, 'latest_filings.xml')
    return today
4da37cca » Derek Willis 2008-05-28 added partially working can... 170
171
def cmte_filings(cmte):
    """
    Look up the electronic filings of one committee, identified by its FEC
    C-number, and write them to ``latest_cmte_filings.xml`` as RSS 2.0.

    The committee name is read from the first <dt> tag with BeautifulSoup;
    the individual filings are pulled from the raw HTML with a regular
    expression.

    Dependency: BeautifulSoup for HTML parsing
    (http://www.crummy.com/software/BeautifulSoup/)

    Raises SystemExit if BeautifulSoup cannot be imported.

    Usage from within Python shell:
        from fec import cmte_filings
        cmte_filings('C00260547')
    """
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print("""
IMPORT ERROR: Required Beautiful Soup module not found.

Installation instructions:

If you have easy_install, enter
"sudo easy_install BeautifulSoup"
via your shell.

Otherwise, the source can be downloaded from
http://www.crummy.com/software/BeautifulSoup/
""")
        raise SystemExit

    # Query the filings form for this committee id.
    base_url = 'http://query.nictusa.com/cgi-bin/dcdev/forms/'
    query = urllib.urlencode({'comid': cmte})
    txt = urllib.urlopen(base_url, query).read()

    # Committee name comes from the first <dt>, minus the " - <C-number>" tail.
    first_dt = BeautifulSoup(txt).findAll('dt')[0]
    committee = first_dt.a.contents[0]
    title = re.split(' - ', committee)[0]

    # One match per filing line. Groups: link, form, period start,
    # period end, filing date, filing description, amending filing id.
    filing_line = re.compile("""[<BR>]?.*?<A HREF=\'/cgi-bin/dcdev/forms/(C[0-9]*?/[0-9]*?)/\'>(?:<FONT COLOR ="#990000">)?View(?:</FONT>)?</A>&nbsp;&nbsp;&nbsp;&nbsp;<A HREF=\'/cgi-bin/dcdev/forms/DL/[0-9].*/\'>(?:<FONT COLOR ="#990000">)?Download(?:</FONT>)?</A>&nbsp;&nbsp;.*?(Form F.*?)..-.(?:period.(.*?)-(.*?),.)?filed.(.*?).-.(.*?)(?:<BR>&nbsp;&nbsp;&nbsp;<B>Amended</B> by <A HREF=\'/cgi-bin/dcdev/forms/C[0-9]*?/([0-9]*?)/\'>.*?</A>)?\n""")

    records = []
    for match in filing_line.findall(txt):
        # Only the link, filing date and filing description are used.
        link, filedate, filing = match[0], match[4], match[5]
        parsed = time.strptime(filedate, '%m/%d/%Y')
        published = time.strftime('%a, %d %b %Y 00:00:00 GMT', parsed)
        records.append((title, urlparse.urljoin(base_url, link), filing, published))

    # The committee id goes into the feed title.
    intro = 'Latest FEC Filings from %s' % cmte
    make_rss_20(intro, 'Committee finance reports.', records, 'latest_cmte_filings.xml')
225
226
def latest_electioneering_filings():
    """
    Read the electioneering communications CSV from ftp.fec.gov and write
    the first ten filings in it to ``elect_comm.xml`` as RSS 2.0.

    The file's dates look like DD-MON-YY; the three-letter month is mapped
    to a number with a slightly altered version of the MONTHS_3 table used
    by Django before the date is parsed.

    Prints a message and raises SystemExit if the file cannot be opened or
    read (IOError). The FTP handle is always closed before returning.
    """
    url = "ftp://ftp.fec.gov/FEC/electioneering.csv"
    network_error = "Network Error: File cannot be accessed."

    # Three-letter month -> month number, for building parseable dates.
    MONTHS_3 = { 'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6, 'JUL': 7, 'AUG':8, 'SEP':9, 'OCT':10, 'NOV':11, 'DEC':12}

    base_url = 'http://images.nictusa.com/cgi-bin/fecimg/?_'

    try:
        ec = urllib.urlopen(url)
    except IOError:
        print(network_error)
        raise SystemExit

    data = []
    try:
        try:
            reader = csv.DictReader(ec)
            # NOTE(review): this discards the first row DictReader yields;
            # kept as in the original -- confirm it is intentional.
            reader.next()
        except IOError:
            print(network_error)
            raise SystemExit

        for row in reader:
            # Filing (receipt) date.
            d = row[' RECEIPT_DT'].strip().split('-')
            d[1] = str(MONTHS_3[d[1]])
            date_date = time.strptime("-".join(d), '%d-%m-%y')
            # Date the ad(s) aired; fall back to the receipt date if blank.
            if row[' PUBLIC_DISTRIBUTION_DT '] == '':
                dd = d
            else:
                dd = row[' PUBLIC_DISTRIBUTION_DT '].strip().split('-')
                dd[1] = str(MONTHS_3[dd[1]])
            dist_date = time.strftime("%b %d, %Y", time.strptime("-".join(dd), '%d-%m-%y'))
            pubDate = time.strftime('%a, %d %b %Y 00:00:00 GMT', date_date)
            record = (row[' FILER_NAME '], base_url+row[' BEGIN_IMAGE_NUM ']+'+0', "Amount Spent: $"+ row[' TOTAL_DISBURSEMENTS_THIS_STMT '] + " first airing on "+ dist_date, pubDate)
            data.append(record)
    finally:
        # The reader pulls rows lazily, so close only after the loop.
        ec.close()

    data = data[:10]  # limits feed to the first 10 rows read
    intro = "Latest Electioneering Communications filings"
    make_rss_20(intro, 'Electioneering communications', data, 'elect_comm.xml')
265
266
def cand_summary_by_state(year, state):
    """
    Return summary numbers for a state's candidates in an election year.

    Each element of the returned list is a tuple
    (name, office, receipts, spent, cash, debt, date) read from one row of
    the first table on the results page.

    Only the fourth character of ``year`` is sent to the server, as the
    ``dbyear`` parameter, so ``year`` needs at least four characters.

    Known limitation (from the original author): this is only partially
    working and bombs out on the first House candidate.

    Dependency: BeautifulSoup for HTML parsing
    (http://www.crummy.com/software/BeautifulSoup/)

    Raises SystemExit if BeautifulSoup cannot be imported.
    """
    try:
        from BeautifulSoup import BeautifulSoup
    except ImportError:
        print("""
IMPORT ERROR: Required Beautiful Soup module not found.

Installation instructions:

If you have easy_install, enter
"sudo easy_install BeautifulSoup"
via your shell.

Otherwise, the source can be downloaded from
http://www.crummy.com/software/BeautifulSoup/
""")
        raise SystemExit

    query = {'dbyear': int(str(year)[3]), 'state': state}
    base_url = 'http://herndon1.sdrdc.com/cgi-bin/cancomsrs/'
    html = urllib.urlopen(base_url, urllib.urlencode(query)).read()

    # Rows of the first table; the first three children are skipped.
    table_children = BeautifulSoup(html).table.contents

    summaries = []
    for row in table_children[3:]:
        cells = row.contents
        # Name and office are wrapped in links; the rest are plain cells.
        name = cells[0].a.contents[0]
        office = cells[1].a.contents[0]
        receipts, spent, cash, debt, date = [
            cells[position].contents[0] for position in (2, 3, 4, 5, 6)
        ]
        summaries.append((name, office, receipts, spent, cash, debt, date))
    return summaries
310
311
def make_rss_20(title, description, data, file_name):
    """
    Write an RSS 2.0 feed to ``file_name``.
    http://en.wikipedia.org/wiki/RSS_(file_format)#RSS_2.0

    title       -- channel title
    description -- channel description
    data        -- sequence of (title, link, description, pubDate) records;
                   only the first ten are written
    file_name   -- path of the file to create or overwrite

    Returns None.

    NOTE: all values are inserted into the XML verbatim, without escaping.
    Callers must pass text that is already XML-safe.
    """
    header = (
        '<?xml version="1.0" encoding="ISO-8859-1"?>\n'
        '<rss version="2.0">\n'
        '<channel>\n'
        '<title>' + title + '</title>\n'
        '<link>http://www.fec.gov/</link>\n'
        # The original emitted a stray '"' after the description text.
        '<description>' + description + '</description>\n'
        '<language>en-us</language>\n'
    )

    item_template = (
        '\n'
        '<item>\n'
        '<title>%s</title>\n'
        '<link>%s</link>\n'
        '<description>%s</description>\n'
        '<pubDate>%s</pubDate>\n'
        '</item>\n'
    )
    # Limit the feed to the first ten records.
    items = [
        item_template % (item_title, item_link, item_description, item_date)
        for (item_title, item_link, item_description, item_date) in data[:10]
    ]

    footer = '\n</channel>\n</rss>\n'

    rss = header + ''.join(items) + footer

    # Close the file even if the write fails.
    fh = open(file_name, 'w')
    try:
        fh.write(rss)
    finally:
        fh.close()
2c892b6c » Derek Willis 2008-05-28 adding beginnings of cmte f... 351
352