In [105]:
# %%writefile get_new_bill_info.py
import pandas as pd
from pymongo import MongoClient
import copy
from bs4 import BeautifulSoup
import requests
from random import randint
from time import sleep
import threading

def get_soup(url, timeout=30):
    '''
    Get soup object from url to be parsed out in another function. If the request
    fails (connection problem, timeout) or the status code != 200, prints out an
    error message and returns None.
    
    Parameters: url - address of the page to download
                timeout - seconds to wait for the server before giving up
                          (passed straight to requests.get)
    
    Returns: BeautifulSoup object (parsed with lxml) on success, otherwise None
    '''
    # included sleep time to attempt human user mimicking
    sleep(randint(0, 6))

    # without a timeout a stalled connection would block the scrape forever;
    # transport errors are reported the same way as bad status codes
    try:
        req = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as err:
        print('_______________')
        print('_______________')
        print('Error requesting {}'.format(url))
        print('Request failed: {}'.format(err))
        return None

    stat_code = req.status_code

    if stat_code != 200:
        print('_______________')
        print('_______________')
        print('Error requesting {}'.format(url))
        print('Request Status Code: {}'.format(stat_code))
        return None

    print('_______________')
    print('_______________')
    print('\tRetrieving soup from {}'.format(url))
    return BeautifulSoup(req.content, 'lxml')
    

def soup_details_to_list(soup):
    '''
    Parses out the details from the soup object and inserts the details into list. Each 
    item in the list will be compared to the data that already exists in Mongo.

    Only rows whose legislation type is BILL, JOINT RESOLUTION or LAW are kept. Fields
    that cannot be found in a row are left as None instead of raising, so short rows
    (e.g. reserved bill numbers) still produce a partial record.
    
    Parameters: soup - a soup object with table within 'ol' class; may be None
                       (get_soup returns None when the request fails)
                
    Returns:    list of bill details (one dict per bill) to compare to what is already
                in Mongo; empty list if soup is None or the results table is missing
    '''
    # initialize empty list to temporarily store data.
    # each item will be checked against Mongo data to see if anything has changed since the last load.
    all_rows = []

    # nothing to parse when the download failed
    if soup is None:
        return all_rows
    
    # initialize empty row to populate data
    empty_row = {'leg_id': None, 
                'leg_type': None,
                'leg_url': None,
                'intro_date': None,
                'congress_id': None,
                'desc': None,
                'sponsor': None, 
                'sponsor_party': None, 
                'sponsor_state': None,
                'sponsor_district': None,  #senators don't have districts
                'num_of_cosponsors': None,
                'cosponsors_url': None,
                'cosponsors': None,        #requires navigation to another url and extracting names from table
                'num_of_amendments': None,  #requires navigation to another url
                'committee': None, 
                'bill_status': None,
                'body': None               #requires navigation to another url
                }

    # we only want bills and joint resolutions (and those that became law)
    wanted_types = ('BILL', 'JOINT RESOLUTION', 'LAW')

    # table of bills are in ol class
    div = soup.find('div', {'class':'search-column-main'})
    table = div.find('ol') if div is not None else None
    if table is None:
        return all_rows

    # iterate though each li class expanded to get rows
    rows = table.find_all('li', {'class':'expanded'})
   
    for row in rows:
        new_row = copy.copy(empty_row)

        # parse items within 'span' tag
        columns = row.find_all('span')

        legislation_type = columns[0].text.strip() if columns else ''
        if legislation_type not in wanted_types:
            continue

        new_row['leg_type'] = legislation_type

        # third whitespace-separated token of the second span, e.g. '115th' -> '115'
        if len(columns) > 1:
            congress_tokens = columns[1].text.strip().split()
            if len(congress_tokens) > 2:
                new_row['congress_id'] = congress_tokens[2][:3]

        if len(columns) > 2 and columns[2].text != '':
            new_row['desc'] = columns[2].text

        if len(columns) > 4 and 'Committee' in columns[4].text:
            # drop the 12-character label in front of the committee names
            new_row['committee'] = columns[4].text.strip()[12:]

        if len(columns) > 3:
            dt = columns[3].text.strip().split()
            # the date is the token after '(Introduced', minus its trailing character;
            # searching dt[:-1] guarantees that a following token exists
            if '(Introduced' in dt[:-1]:
                new_row['intro_date'] = dt[dt.index('(Introduced') + 1][:-1]

        # bill_status is within 'p' tag, after a 25-character prefix
        paragraphs = row.find_all('p')
        if paragraphs:
            status = paragraphs[0].text.strip()[25:]
            if status != '':
                new_row['bill_status'] = status

        # parse info within 'a' tag
        links = row.find_all('a')
        if links:
            leg_id = links[0].text.strip()
            if leg_id != '':
                new_row['leg_id'] = leg_id.replace('.', ' ')

        # also within 'a' tag, reserved bill numbers will not have the information below
        if len(links) > 2:
            leg_url = links[0]['href'].strip()
            if leg_url != '':
                new_row['leg_url'] = leg_url

            num_of_cosponsors = links[2].text.strip()
            if num_of_cosponsors != '':
                new_row['num_of_cosponsors'] = num_of_cosponsors
                if num_of_cosponsors != '0':
                    new_row['cosponsors_url'] = links[2]['href']

            # party, state, and district (for house reps) need to be stripped out of sponsor info
            for link in links:
                rep = link.text.strip()
                if '[' not in rep:
                    continue
                name_part, party_dist = rep.rsplit('[', 1)
                # drop the space before '[' and the 5-character title in front of the name
                new_row['sponsor'] = name_part[:-1][5:]
                # drop the closing ']' and split 'party-state[-district]'
                party_dist_split = party_dist[:-1].split('-')
                new_row['sponsor_party'] = party_dist_split[0]
                if len(party_dist_split) > 1:
                    new_row['sponsor_state'] = party_dist_split[1]
                if len(party_dist_split) == 3:
                    new_row['sponsor_district'] = party_dist_split[2]

        all_rows.append(new_row)
    return all_rows


def mongo_check(leg_id, cong_id, collection):
    '''
    Checks to see if a record from web scrape is in Mongo by querying the leg_id and
    cong_id. Returns True if present, else returns False.

    Parameters: leg_id - value to filter on for key leg_id
                cong_id - value to filter on for key congress_id
                collection - Mongo collection to query (any object with find_one)

    Returns:    True if a matching document exists, otherwise False (after printing
                a "not in Mongo" message)
    '''
    # query the collection that was passed in rather than a notebook-level global
    mongo_record = collection.find_one({'leg_id': leg_id, 'congress_id': cong_id})
    if mongo_record is None:
        print('Congress ID {}, Bill {} not in Mongo'.format(cong_id, leg_id))
        return False
    return True


def update_mongo_value(leg_id, cong_id, key_to_update, new_value, collection):  
    '''
    Sets one field on the first document of collection that matches the given
    leg_id and congress_id.
    
    Parameters: leg_id - value to filter on for key leg_id
                cong_id - value to filter on for key congress_id
                key_to_update - name of the field to overwrite
                new_value - value to store under key_to_update
                collection - Mongo collection holding the document
                
    Returns:    None
    '''
    # which document to touch
    record_filter = {'leg_id': leg_id, 'congress_id': cong_id}
    # $set overwrites only the named field and leaves the rest of the document alone
    change = {'$set': {key_to_update: new_value}}
    collection.update_one(record_filter, change)


def update_mongo_with_list_values(bill_list, collection):
    '''
    Compares each item in bill_list (scraped data) to documents in Mongo collection. 
    
    If the item is not in Mongo, it inserts it.

    If the item is in Mongo collection, every tracked field whose scraped value differs
    from the stored value is overwritten. All changed fields of one bill are written
    with a single update_one/$set call. A tracked field that is missing from the stored
    document is treated as None.

    Each bill is looked up exactly once, and only the collection passed in is used.
    
    Parameters - bill_list - list of bills created from web scrape (soup_details_to_list)
                 collection - Mongo collection to query and update, if needed.
                 
    Returns -    None
    '''
    keys_to_check = ['leg_type', 'desc', 'num_of_cosponsors', 'committee', 'bill_status']

    for list_record in bill_list:

        leg_id = list_record['leg_id']
        cong_id = list_record['congress_id']
        record_filter = {'leg_id': leg_id, 'congress_id': cong_id}

        # single lookup: tells us whether the bill exists and gives the stored values
        mongo_document = collection.find_one(record_filter)

        # if list_record not in Mongo, insert it
        if mongo_document is None:
            print('Congress ID {}, Bill {} not in Mongo'.format(cong_id, leg_id))
            print('\tInserting new bill {}'.format(leg_id))
            collection.insert_one(list_record)
            continue

        # collect every tracked field whose scraped value differs from the stored one
        changes = {}
        for k in keys_to_check:
            old_value = mongo_document.get(k)
            if list_record[k] != old_value:
                print('\tUpdating {} {}... \n\t...from {} \n\t...to {}'.format(leg_id, k, old_value, list_record[k]))
                changes[k] = list_record[k]

        if changes:
            collection.update_one(record_filter, {'$set': changes})
                


In [5]:
# Open a Mongo connection (MongoClient is called with no arguments, so the
# driver's default connection settings apply) and grab the bill_details
# collection of the bills database for the cells below.
client = MongoClient()
db = client['bills']
bill_details = db['bill_details']

In [106]:
# Scrape one page of the congress.gov legislation search (250 results per page,
# per the pageSize query parameter) and sync it into Mongo.
url_root = 'https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%7D&pageSize=250&page='
page = 2

# the page number is simply appended to the query string
site_url = url_root + str(page)

# download -> parse into row dicts -> insert new bills / update changed fields
soup = get_soup(site_url)
bill_list = soup_details_to_list(soup)
update_mongo_with_list_values(bill_list, bill_details)

_______________
_______________
	Retrieving soup from https://www.congress.gov/search?q=%7B%22source%22%3A%22legislation%22%7D&pageSize=250&page=2
	Updating H R 7146 num_of_cosponsors... 
	...from 4 
	...to 10
	Updating H R 7142 num_of_cosponsors... 
	...from 13 
	...to 15
	Updating H R 7141 num_of_cosponsors... 
	...from 22 
	...to 23
	Updating H R 7138 num_of_cosponsors... 
	...from 1 
	...to 2
	Updating H R 7137 num_of_cosponsors... 
	...from 27 
	...to 28
	Updating H R 7136 num_of_cosponsors... 
	...from 1 
	...to 3
	Updating H R 7129 num_of_cosponsors... 
	...from 4 
	...to 5
	Updating H R 7128 num_of_cosponsors... 
	...from 5 
	...to 6
	Updating H R 7127 num_of_cosponsors... 
	...from 16 
	...to 18
	Updating H R 7124 num_of_cosponsors... 
	...from 38 
	...to 40
	Updating H R 7123 num_of_cosponsors... 
	...from 13 
	...to 17
	Updating H R 7120 leg_type... 
	...from BILL 
	...to LAW
	Updating H R 7120 bill_status... 
	...from Passed House 
	...to Became Law
	Updating H R 7116 num_o

In [71]:
# Display a single scraped record. `i` is not assigned in this cell, so it must
# already exist in the notebook namespace from a previously executed cell.
bill_list[i]

{'leg_id': 'H R 7197',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/115th-congress/house-bill/7197?s=1&r=203',
 'intro_date': '11/29/2018',
 'congress_id': '115',
 'desc': 'Renewable HEAT for Schools Act',
 'sponsor': 'Rice, Kathleen M.',
 'sponsor_party': 'D',
 'sponsor_state': 'NY',
 'sponsor_district': '4',
 'num_of_cosponsors': '0',
 'cosponsors_url': None,
 'cosponsors': None,
 'num_of_amendments': None,
 'committee': 'House - Education and the Workforce',
 'bill_status': 'Introduced',
 'body': None}

In [72]:
# Look up the stored Mongo document for the current leg_id / cong_id and display it.
# Neither name is assigned in this cell; both must already be set in the
# notebook namespace by a previously executed cell.
mongo_record = bill_details.find_one({'leg_id': leg_id, 'congress_id': cong_id})
mongo_record

{'_id': ObjectId('5c182da71417de23a827c56b'),
 'leg_id': 'H R 7197',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/115th-congress/house-bill/7197?r=15',
 'intro_date': '11/29/2018',
 'congress_id': '115th',
 'desc': 'Renewable HEAT for Schools Act',
 'sponsor': 'Rep. Rice, Kathleen M.',
 'sponsor_party': 'NY',
 'sponsor_state': 'D',
 'sponsor_district': '4',
 'num_of_cosponsors': '0',
 'cosponsors_url': None,
 'cosponsors': None,
 'committee': 'House - Education and the Workforce',
 'bill_status': 'Introduced',
 'body': "[Congressional Bills 115th Congress] [From the U.S. Government Publishing Office] [H.R. 7197 Introduced in House (IH)] <DOC> 115th CONGRESS 2d Session H. R. 7197 To authorize the Secretary of Energy to make grants to local educational agencies for purposes of supporting renewable energy-based heating and cooling systems in schools. _______________________________________________________________________ IN THE HOUSE OF REPRESENTATIVES November 29, 2018

In [100]:
# Report every scraped bill that has no matching document in Mongo
# (mongo_check prints a message for each missing bill).
# i, leg_id and cong_id are kept as notebook-level names, as before.
for i, scraped in enumerate(bill_list):
    leg_id = scraped['leg_id']
    cong_id = scraped['congress_id']

    mongo_check(leg_id, cong_id, bill_details)
    

0: H R 7399 not in Mongo
1: H R 7398 not in Mongo
2: H R 7397 not in Mongo
3: H R 7396 not in Mongo
4: H R 7395 not in Mongo
5: H R 7394 not in Mongo
6: H R 7393 not in Mongo
7: H R 7392 not in Mongo
8: H R 7391 not in Mongo
9: H R 7390 not in Mongo
10: H R 7389 not in Mongo
11: H R 7388 not in Mongo
12: H R 7387 not in Mongo
13: H R 7386 not in Mongo
14: H R 7385 not in Mongo
15: H R 7384 not in Mongo
16: H R 7383 not in Mongo
17: H R 7382 not in Mongo
18: H R 7381 not in Mongo
19: H R 7380 not in Mongo
20: H R 7379 not in Mongo
21: H R 7378 not in Mongo
22: H R 7377 not in Mongo
23: H R 7376 not in Mongo
24: H R 7375 not in Mongo
25: H R 7374 not in Mongo
26: H R 7373 not in Mongo
27: H R 7372 not in Mongo
28: H R 7371 not in Mongo
29: H R 7370 not in Mongo
30: H R 7369 not in Mongo
31: H R 7368 not in Mongo
32: H R 7367 not in Mongo
33: H R 7366 not in Mongo
34: H R 7365 not in Mongo
35: H R 7364 not in Mongo
36: H R 7363 not in Mongo
37: H R 7362 not in Mongo
38: H R 7361 not in Mo

In [74]:
# Display one document from the collection (find_one with no filter) to inspect
# how records are stored.
bill_details.find_one()

{'_id': ObjectId('5c182d3a1417de23a825544e'),
 'leg_id': 'H R 2840',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/104th-congress/house-bill/2840?r=193696',
 'intro_date': '12/27/1995',
 'congress_id': '104th',
 'desc': 'To assure that all Federal employees work and are paid.',
 'sponsor': 'Rep. Morella, Constance A.',
 'sponsor_party': 'MD',
 'sponsor_state': 'R',
 'sponsor_district': '8',
 'num_of_cosponsors': '3',
 'cosponsors_url': 'https://www.congress.gov/bill/104th-congress/house-bill/2840/cosponsors?r=193696&overview=closed#tabs',
 'cosponsors': None,
 'committee': 'House - Government Reform and Oversight',
 'bill_status': 'Introduced',
 'body': None}

In [91]:
# One-off cleanup: truncate every stored congress_id to its first three
# characters (e.g. '115th' -> '115') so it matches the value produced by
# soup_details_to_list.
all_bills = bill_details.find()

for bill in all_bills:
    leg_id = bill['leg_id']
    cong_id = bill.get('congress_id')

    # rows scraped without a congress line store None here; slicing None would raise
    if not isinstance(cong_id, str):
        continue

    new_cong_id = cong_id[:3]

    # already in the short form: skip the pointless write
    if new_cong_id == cong_id:
        continue

    # target the document by _id so exactly the document being iterated is updated
    bill_details.update_one({'_id': bill['_id']}, {'$set': {'congress_id': new_cong_id}})
