In [1]:
%%writefile get_bill_overviews.py
import pandas as pd
from pymongo import MongoClient
import copy
from bs4 import BeautifulSoup
import requests
from random import randint
from time import sleep
import threading

def get_soup(url):
    '''
    Fetch a page and hand back its parsed HTML.

    Waits a random whole number of seconds (0 to 6 inclusive) before the
    request, then prints either a retrieval notice or an error report
    depending on the response status code.

    Parameters: url - address of the page to request

    Returns: BeautifulSoup object (parsed with lxml) when the status code
             is 200, otherwise None
    '''
    # random pause before every request to attempt human user mimicking
    sleep(randint(0, 6))

    response = requests.get(url)
    status = response.status_code

    # both outcomes start with the same two separator lines
    print('_______________')
    print('_______________')

    if status == 200:
        print('\tRetrieving soup from {}'.format(url))
        return BeautifulSoup(response.content, 'lxml')

    print('Error requesting {}'.format(url))
    print('Request Status Code: {}'.format(status))
    return None
    

# only these legislation types are stored; every other row is skipped
_WANTED_LEG_TYPES = ('BILL', 'JOINT RESOLUTION', 'LAW')


def _new_bill_row():
    '''Return a fresh row dict with every field initialised to None.'''
    return {'leg_id': None,
            'leg_type': None,
            'leg_url': None,
            'intro_date': None,
            'congress_id': None,
            'desc': None,
            'sponsor': None,
            'sponsor_party': None,
            'sponsor_state': None,
            'sponsor_district': None,  # senators don't have districts
            'num_of_cosponsors': None,
            'cosponsors_url': None,
            'cosponsors': None,        # requires navigation to another url and extracting names from table
            'num_of_amendments': None,  # requires navigation to another url
            'committee': None,
            'bill_status': None,
            'body': None               # requires navigation to another url
            }


def _leading_digits(token):
    '''Return the run of digit characters at the start of token ('' if none).'''
    digits = ''
    for char in token:
        if not char.isdigit():
            break
        digits += char
    return digits


def _fill_sponsor(rep, new_row):
    '''Split text shaped like "Rep. Name [P-ST-D]" into the sponsor fields of new_row.'''
    name_part, bracket_part = rep.rsplit('[', 1)
    # drop the space before '[' and the 5-character title prefix (e.g. 'Rep. ')
    new_row['sponsor'] = name_part[:-1][5:]

    # drop the closing ']' and split into party, state and (optionally) district
    party_dist_split = bracket_part[:-1].split('-')
    new_row['sponsor_party'] = party_dist_split[0]
    if len(party_dist_split) > 1:
        new_row['sponsor_state'] = party_dist_split[1]
    if len(party_dist_split) == 3:
        new_row['sponsor_district'] = party_dist_split[2]


def _parse_row(row):
    '''Build the row dict for one result entry, or return None when the entry is skipped.'''
    spans = row.find_all('span')
    if len(spans) <= 3:
        return None

    legislation_type = spans[0].text.strip()
    if legislation_type not in _WANTED_LEG_TYPES:
        return None

    new_row = _new_bill_row()
    new_row['leg_type'] = legislation_type

    # the third whitespace-separated token carries the congress ordinal, e.g. '109th';
    # keep only its digits so two-digit congresses do not end up as '99t'
    heading = spans[1].text.strip().split()
    if len(heading) > 2:
        new_row['congress_id'] = _leading_digits(heading[2]) or heading[2][:3]

    if spans[2].text != '':
        new_row['desc'] = spans[2].text

    # the committee span is optional, so check it exists before reading it
    if len(spans) > 4 and 'Committee' in spans[4].text:
        # skip the 12-character label in front of the committee name
        new_row['committee'] = spans[4].text.strip()[12:]

    # the introduction date is the token right after '(Introduced', minus its trailing ')'
    dt = spans[3].text.strip().split()
    if '(Introduced' in dt:
        date_pos = dt.index('(Introduced') + 1
        if date_pos < len(dt):
            new_row['intro_date'] = dt[date_pos][:-1]

    # bill_status is within the first 'p' tag, after a 25-character lead-in
    paragraphs = row.find_all('p')
    if paragraphs:
        status = paragraphs[0].text.strip()[25:]
        if status != '':
            new_row['bill_status'] = status

    # the remaining fields come from the 'a' tags
    links = row.find_all('a')
    if links:
        leg_id = links[0].text.strip()
        if leg_id != '':
            new_row['leg_id'] = leg_id.replace('.', ' ')

    # reserved bill numbers will not have the information below
    if len(links) > 2:
        leg_url = (links[0].get('href') or '').strip()
        if leg_url != '':
            new_row['leg_url'] = leg_url

        cosponsor_count = links[2].text.strip()
        if cosponsor_count != '':
            new_row['num_of_cosponsors'] = cosponsor_count
            if cosponsor_count != '0':
                new_row['cosponsors_url'] = links[2].get('href')

        # party, state, and district (for house reps) are stripped out of the
        # link whose text contains a '[...]' suffix
        for link in links:
            rep = link.text.strip()
            if '[' in rep:
                _fill_sponsor(rep, new_row)

    return new_row


def soup_details_to_mongo(soup, collection):
    '''
    Parses out the details from the soup object and inserts the details into
    Mongo database collection row by row.

    Only rows whose first 'span' reads BILL, JOINT RESOLUTION or LAW are
    stored. Fields that cannot be found in a row are left as None instead of
    raising, and a missing soup or missing results list inserts nothing.

    Parameters: soup - a soup object with table within 'ol' class, or None
                       when the page could not be retrieved
                collection - Mongo collection (anything with insert_one)

    Returns:    None
    '''
    if soup is None:
        return

    # table of bills are in ol class
    div = soup.find('div', {'class': 'search-column-main'})
    if div is None:
        return
    table = div.find('ol')
    if table is None:
        return

    # iterate though each li class expanded to get rows
    for row in table.find_all('li', {'class': 'expanded'}):
        new_row = _parse_row(row)
        if new_row is not None:
            collection.insert_one(new_row)


            
def get_amendment_count(url):
    '''
    Returns amendment counts for a bill at the url

    The count is read from the tab link whose first word contains
    'Amendment'; the second word of that link, with surrounding parentheses
    removed, is returned as-is.

    Parameters: url - url that gives access to bill details

    Return: String - count of amendments exactly as shown on the page
            (e.g. '2'), or None when the page could not be retrieved or no
            amendment tab was found
    '''
    soup = get_soup(url)
    if soup is None:
        # get_soup already reported the failed request
        return None

    tabs = soup.find('nav', {'id': 'tabs'})
    if tabs is None:
        return None

    # iterate through tabs to find Amendments and get count
    for link in tabs.find_all('a'):
        words = link.text.split()
        # need both the label and the '(n)' token; skip empty or one-word links
        if len(words) > 1 and 'Amendment' in words[0]:
            return words[1].strip('()')

    return None
        
        
        
def initiate_process(page):
    '''
    Scrape one page of legislation search results into Mongo.

    Builds the search url for the given page number (250 results per page),
    fetches it with get_soup and stores the parsed rows in the bill_info
    collection of the bills database.

    Parameters: page - page number appended to the search url

    Returns:    None
    '''
    url_root = 'https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%7D&pageSize=250&page='
    site_url = '{}{}'.format(url_root, page)

    soup = get_soup(site_url)
    if soup is None:
        # request failed (get_soup has printed the status code); nothing to store
        return

    client = MongoClient()
    try:
        bill_info = client.bills.bill_info
        soup_details_to_mongo(soup, bill_info)
    finally:
        # this function runs once per page, so release the client every time
        client.close()
    
        
        
        

if __name__ == '__main__':
    # First pass: fill Mongo with the general info for every results page,
    # scraping four consecutive pages at a time on separate threads.

    # the 110th Congress ends on page 444 with 250 results on page
    # https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%7D&pageSize=250&page=2
    page_range = range(1, 445)
    batch_size = 4

    for first_page in page_range[::batch_size]:
        batch = [
            threading.Thread(target=initiate_process, args=[first_page + offset])
            for offset in range(batch_size)
        ]

        # launch the whole batch, then wait for all of it before moving on
        for worker in batch:
            worker.start()
        for worker in batch:
            worker.join()

    print('-----------')
    print('-----------')
    print('Initial data collection complete!... DATA SCIENCE!!!')

    
    
    
    # once mongo data is populated, retrieve data from mongo to fill in additional details
    
    
    

Overwriting get_bill_overviews.py


In [106]:
# Scratch check: print every odd number from 1 through 443.
po = range(1, 445, 2)
for p in po:
    print(p)

1
3
5
7
9
11
13
15
17
19
21
23
25
27
29
31
33
35
37
39
41
43
45
47
49
51
53
55
57
59
61
63
65
67
69
71
73
75
77
79
81
83
85
87
89
91
93
95
97
99
101
103
105
107
109
111
113
115
117
119
121
123
125
127
129
131
133
135
137
139
141
143
145
147
149
151
153
155
157
159
161
163
165
167
169
171
173
175
177
179
181
183
185
187
189
191
193
195
197
199
201
203
205
207
209
211
213
215
217
219
221
223
225
227
229
231
233
235
237
239
241
243
245
247
249
251
253
255
257
259
261
263
265
267
269
271
273
275
277
279
281
283
285
287
289
291
293
295
297
299
301
303
305
307
309
311
313
315
317
319
321
323
325
327
329
331
333
335
337
339
341
343
345
347
349
351
353
355
357
359
361
363
365
367
369
371
373
375
377
379
381
383
385
387
389
391
393
395
397
399
401
403
405
407
409
411
413
415
417
419
421
423
425
427
429
431
433
435
437
439
441
443


In [107]:
# Scratch check: print every even number from 2 through 444.
pe = range(2, 445, 2)
for p in pe:
    print(p)

2
4
6
8
10
12
14
16
18
20
22
24
26
28
30
32
34
36
38
40
42
44
46
48
50
52
54
56
58
60
62
64
66
68
70
72
74
76
78
80
82
84
86
88
90
92
94
96
98
100
102
104
106
108
110
112
114
116
118
120
122
124
126
128
130
132
134
136
138
140
142
144
146
148
150
152
154
156
158
160
162
164
166
168
170
172
174
176
178
180
182
184
186
188
190
192
194
196
198
200
202
204
206
208
210
212
214
216
218
220
222
224
226
228
230
232
234
236
238
240
242
244
246
248
250
252
254
256
258
260
262
264
266
268
270
272
274
276
278
280
282
284
286
288
290
292
294
296
298
300
302
304
306
308
310
312
314
316
318
320
322
324
326
328
330
332
334
336
338
340
342
344
346
348
350
352
354
356
358
360
362
364
366
368
370
372
374
376
378
380
382
384
386
388
390
392
394
396
398
400
402
404
406
408
410
412
414
416
418
420
422
424
426
428
430
432
434
436
438
440
442
444


In [62]:
url = new_row['leg_url']
url

'https://www.congress.gov/bill/109th-congress/house-bill/6413?s=1&r=111000'

In [63]:
# Scratch cell: fetch the bill page at `url` and print the count shown on its
# Amendments tab. Nothing is printed if no tab link starts with a word
# containing 'Amendment'.
soup = get_soup(url)


# the tab links live inside the <nav id="tabs"> element
tabs = soup.find('nav', {'id': 'tabs'})
info = tabs.find_all('a')
for i in info:
    # first word is the tab label; second word is the count wrapped in parentheses
    if 'Amendment' in i.text.split()[0]:        
        print(i.text.split()[1].strip('()'))
        


_______________
_______________
	Retrieving soup from https://www.congress.gov/bill/109th-congress/house-bill/6413?s=1&r=111000


In [99]:
url

'https://www.congress.gov/bill/109th-congress/house-bill/6413?s=1&r=111000'

In [58]:
new_row

{'leg_id': 'H R 6413',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/109th-congress/house-bill/6413?s=1&r=111000',
 'intro_date': '12/07/2006',
 'congress_id': '109',
 'desc': 'Sacramento River National Recreation Area Establishment Act of 2006',
 'sponsor': 'Herger, Wally',
 'sponsor_party': 'R',
 'sponsor_state': 'CA',
 'sponsor_district': '2',
 'num_of_cosponsors': '0',
 'cosponsors_url': None,
 'cosponsors': None,
 'num_of_amendments': None,
 'committee': 'House - Resources',
 'bill_status': 'Introduced',
 'body': None}

In [104]:
site_url = 'https://www.congress.gov/bill/115th-congress/house-resolution/35'

In [105]:
amend_count = get_amendment_count(site_url)
amend_count

_______________
_______________
	Retrieving soup from https://www.congress.gov/bill/115th-congress/house-resolution/35


'2'

In [5]:
bill_details.find_one()

{'_id': ObjectId('5c182d3a1417de23a825544e'),
 'leg_id': 'H R 2840',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/104th-congress/house-bill/2840?r=193696',
 'intro_date': '12/27/1995',
 'congress_id': '104th',
 'desc': 'To assure that all Federal employees work and are paid.',
 'sponsor': 'Rep. Morella, Constance A.',
 'sponsor_party': 'MD',
 'sponsor_state': 'R',
 'sponsor_district': '8',
 'num_of_cosponsors': '3',
 'cosponsors_url': 'https://www.congress.gov/bill/104th-congress/house-bill/2840/cosponsors?r=193696&overview=closed#tabs',
 'cosponsors': None,
 'committee': 'House - Government Reform and Oversight',
 'bill_status': 'Introduced',
 'body': None}

In [6]:
bill_details.find().count()

  """Entry point for launching an IPython kernel.


164874