In [21]:
%%writefile mongo_cleanup.py
'''
This script attempts to populate the bill text for each record in mongo database 
bills.bill_details if it doesn't already exist.
'''
from pymongo import MongoClient
import bson.json_util
from bs4 import BeautifulSoup
import requests
import json
import codecs


def write_json_file(obj, path):
    '''
    Append an object to a file as one line of JSON (one record per line).

    Parameters: obj - any object json.dumps can serialize
                path - file to append to; opened in append mode as utf-8

    Returns: None
    '''
    # Serialize before touching the file so a non-serializable obj never
    # leaves a half-written or freshly created empty log behind.
    json_record = json.dumps(obj, ensure_ascii=False)
    # The context manager flushes and closes the handle even if the write raises.
    with codecs.open(path, 'a', 'utf-8') as f:
        f.write(json_record + '\n')


def url_builder(record_url):
    '''
    Derives the plain-text endpoint for a bill from its legislation url.

    Everything from the first '?' onward is discarded and the text-format
    suffix is appended to what remains.

    Parameters: record_url - url string (the caller passes a record's leg_url)

    Returns:    url of the page expected to contain the text version of the bill
    '''
    base, _, _ = record_url.partition('?')
    return base + '/text?format=txt&r=1'


def get_bill_text(url, timeout=30):
    '''
    Scrapes the page at url to return the text of the bill.

    Every failure (the request itself failing, a non-200 response, or a page
    without a <pre> element) is printed, appended to
    ../data/logs/bill_text_errors.jsonl via write_json_file, and reported to
    the caller as None.

    Parameters: url - page to scrape
                timeout - seconds to wait on the server before giving up

    Returns: bill text with every run of whitespace collapsed to one space,
             or None if the text could not be retrieved
    '''
    site_url = url
    error_log = '../data/logs/bill_text_errors.jsonl'

    # Without a timeout requests can wait forever on an unresponsive server,
    # and an unhandled connection error would abort a long batch run; treat
    # both like any other retrieval failure: log it and return None.
    try:
        req = requests.get(site_url, timeout=timeout)
    except requests.RequestException as exc:
        print('_______________')
        print('_______________')
        print('')
        print('\t{}'.format(site_url))
        print('\t\tError in retrieving bill text.')
        print('\t\tRequest failed: {}'.format(exc))
        write_json_file({'url': site_url, 'error': str(exc)}, error_log)
        print('Error logged in {}'.format(error_log))
        return None

    stat_code = req.status_code

    # if error in getting url, print and log the error
    if stat_code != 200:
        print('_______________')
        print('_______________')
        print('')
        print('\t{}'.format(site_url))
        print('\t\tError in retrieving bill text.')
        print('\t\tRequest Status Code: {}'.format(stat_code))
        errored_line = {'url': site_url, 'error': stat_code}
        write_json_file(errored_line, error_log)
        print('Error logged in {}'.format(error_log))
        return None

    soup = BeautifulSoup(req.content, 'lxml')
    # look the element up once and reuse it for both the check and the scrape
    pre = soup.find('pre')

    # if there is no text, print and log the error
    if pre is None:
        print('_______________')
        print('_______________')
        print('\t{}'.format(site_url))
        print('\t\tError in retrieving bill text.')
        print('\t\tNo text available for scraping.')
        errored_line = {'url': site_url, 'error': 'no text available'}
        write_json_file(errored_line, error_log)
        print('\t\tReturned None and error logged in {}'.format(error_log))
        return None

    # else scrape the text, normalising all whitespace to single spaces
    return ' '.join(pre.text.split())


def update_mongo_body(txt, bill_issue, cong_id, collection):
    '''
    Sets the body field to txt on the mongo record identified by bill_issue
    (leg_id) and cong_id (congress_id).

    Parameters: txt - the text of the bill
                bill_issue - value to filter on for key leg_id
                cong_id - value to filter on for key congress_id
                collection - the pymongo collection object holding the record

    Returns: None
    '''
    record_filter = {'leg_id': bill_issue, 'congress_id': cong_id}
    # update_one replaces the legacy Collection.update (deprecated in PyMongo 3,
    # removed in PyMongo 4); like the legacy default it modifies at most one
    # matching document.
    collection.update_one(record_filter, {'$set': {'body': txt}})

    

  

  

if __name__ == '__main__':
    # Entry point: for each year from 2018 back to 1990, find the records in
    # bills.bill_details that have no body, scrape the bill text and store it.
    client = MongoClient() # defaults to localhost
    db = client.bills
    bill_details = db.bill_details

    # legislation types that are never scraped for text
    skipped_types = ('CONCURRENT RESOLUTION', 'RESOLUTION', 'AMENDMENT')

    # print out record counts
    print('--------------------')
    print('--------------------')
    # count_documents replaces Cursor.count(), which PyMongo 4 removed
    print('Number of records in database: {}'.format(bill_details.count_documents({})))
    print('Ignoring RESOLUTIONS, CONCURRENT RESOLUTIONS, and AMENDMENTS for populating bills text.')

    # iterate through date range in reverse
    date_range = range(1990, 2019)[::-1]

    for d in date_range:
        print('--------------------')
        print('Cleaning up year {}'.format(d))
        date_str = str(d)
        # records whose leg_url contains 'http', whose intro_date contains the
        # year, and whose body is null or missing
        missing_text_query = {'leg_url': {'$regex': 'http'},
                              'intro_date': {'$regex': date_str},
                              'body': None}
        record_count = bill_details.count_documents(missing_text_query)
        records_to_pop = bill_details.find(missing_text_query)
        print('--> Number of records with no text for year {}: {}'.format(d, record_count))

        for i, rec in enumerate(records_to_pop, start=1):
            # ignore concurrent resolution, simple resolution and amendment
            if rec['leg_type'] not in skipped_types:
                url = url_builder(rec['leg_url'])
                # get bill text (None when it could not be retrieved)
                bill_text = get_bill_text(url)

                # update mongo record with bill text
                bill_issue = rec['leg_id']
                cong_id = rec['congress_id']
                update_mongo_body(bill_text, bill_issue, cong_id, bill_details)

            # progress report every 200 records, skipped ones included
            if i % 200 == 0:
                print('\t{:.2f}% complete'.format(100 * i / record_count))

Overwriting mongo_cleanup.py


In [104]:
# exploration
from pymongo import MongoClient
import bson.json_util
from bs4 import BeautifulSoup
import requests
import json
import codecs
import pandas as pd
from random import randint
from time import sleep
import datetime




# Connect to the local mongo instance and bind the collections the cells below use.
client = MongoClient() # defaults to localhost
db = client.bills
senate_votes = db.senate_votes
# Later cells query bill_details, which is otherwise only bound inside the
# __main__ block written to mongo_cleanup.py and so never exists in this kernel.
bill_details = db.bill_details


In [90]:
# Senate vote records whose vote_results field is null or missing.
missing_votes = senate_votes.find({'vote_results': None})

cols = ['_id',
        'congress_id',
        'session',
        'vote_id',
        'issue',
        'result',
        'question',
        'desc',
        'date',
        'year',
        'vote_results']

# Read the cursor once and build the frame in a single call. Indexing the
# cursor (missing_votes[i]) issues a separate query per record, and growing a
# frame with DataFrame.append copies it on every iteration (append was removed
# in pandas 2.0). dtype=object keeps the stored values untouched, as the
# row-by-row construction did.
df = pd.DataFrame(list(missing_votes), columns=cols, dtype=object)

df.head()

  app.launch_new_instance()


Unnamed: 0,_id,congress_id,session,vote_id,issue,result,question,desc,date,year,vote_results
0,5c23c41fbb863d0538383635,115,1,302,H R 1,Agreed to,On the Amendment S.Amdt. 1856,Merkley Amdt. No. 1856; Of a perfecting nature.,Dec 02,2017,
1,5c23c425bb863d0538383636,115,1,301,H R 1,Rejected,On the Motion S.Amdt. 1717,Motion to Waive the CBA Re: Cantwell Amdt. No....,Dec 02,2017,
2,5c23c42cbb863d0538383637,115,1,300,H R 1,Rejected,On the Motion,Manchin Motion to Commit H.R. 1 to the Committ...,Dec 02,2017,
3,5c23c431bb863d0538383639,115,1,298,H R 1,Agreed to,On the Amendment S.Amdt. 1852,Cruz Amdt. No. 1852; To allow limited 529 acco...,Dec 01,2017,
4,5c23c435bb863d053838363a,115,1,297,H R 1,Rejected,On the Motion,Menendez Motion to Commit H.R. 1 to the Commit...,Dec 01,2017,


In [91]:
# Distinct outcomes recorded in the result column of the votes frame.
df['result'].unique()

array(['Agreed to', 'Rejected', 'Passed', 'Not Sustained'], dtype=object)

In [106]:
# Build and print the roll-call page URL for every vote in df. Example target:
# https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=2&vote=00140
url_root = 'https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?'
# Columns are selected by name rather than by position so the loop does not
# silently read the wrong field if the column order of df changes.
for cong_id, sess, vote_num in zip(df['congress_id'], df['session'], df['vote_id']):
    # zero-pad the vote number to 5 digits, matching the example URL above
    vote_id = str(vote_num).zfill(5)
    url_tail = 'congress={}&session={}&vote={}'.format(cong_id, sess, vote_id)
    site_url = '{}{}'.format(url_root, url_tail)
    print(site_url)

https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00302
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00301
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00300
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00298
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00297
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00296
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00295
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vote=00294
https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=115&session=1&vot

In [107]:
# Show the value left in site_url by the URL-building loop in the previous cell
# (the last URL it built).
site_url

'https://www.senate.gov/legislative/LIS/roll_call_lists/roll_call_vote_cfm.cfm?congress=112&session=1&vote=00017'

In [108]:
# Fetch the roll-call page; the timeout keeps an unresponsive server from
# hanging the kernel indefinitely.
req = requests.get(site_url, timeout=30)
# Pause for a random whole number of seconds between 0 and 5 (inclusive).
# NOTE(review): presumably a politeness delay between requests - confirm.
sleep_time = randint(0, 5)
sleep(sleep_time)

# Timestamp taken after the request and the pause, then report the HTTP status.
tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
stat_code = req.status_code
print(stat_code)

200


In [70]:
# Total number of documents in senate_votes. count_documents replaces the
# deprecated Cursor.count(), which PyMongo 4 removed.
senate_votes.count_documents({})

  """Entry point for launching an IPython kernel.


755

In [19]:
# Collect every bill_details record whose num_of_cosponsors is the literal string 'TXT'.
# NOTE(review): presumably 'TXT' is a scraping artifact rather than a real count - confirm.
t = list(bill_details.find({'num_of_cosponsors': 'TXT'}))

In [20]:
# lost HJ RES 45 , 111th congress when trying to update it

2

In [14]:
# Set num_of_cosponsors to '0' on one record where it is 'TXT' and congress_id
# is '111th'. update_one replaces the legacy Collection.update (removed in
# PyMongo 4) and, like its default, modifies at most one matching document;
# raw_result exposes the server's reply dict so the cell still shows the outcome.
bill_details.update_one({'num_of_cosponsors': 'TXT', 'congress_id': '111th'},
                        {'$set': {'num_of_cosponsors': '0'}}).raw_result

  


{'n': 1, 'nModified': 1, 'ok': 1.0, 'updatedExisting': True}

In [16]:
# Probe the text endpoint of an amendment page and report the HTTP status;
# the timeout keeps an unresponsive server from hanging the kernel.
site_url = 'https://www.congress.gov/amendment/115th-congress/house-amendment/982/text?format=txt&r=1'
req = requests.get(site_url, timeout=30)
stat_code = req.status_code
print(stat_code)

200


In [18]:
# Parse the response body with the lxml parser and look for the first <pre>
# element, the element get_bill_text in mongo_cleanup.py reads bill text from.
soup = BeautifulSoup(req.content, 'lxml')
# print(soup.prettify())

# find returns None when the page contains no <pre> element
pre = soup.find('pre')

print(pre)

None


In [19]:
# True when the page has no <pre> element - the case get_bill_text logs as
# 'no text available'.
soup.find('pre') is None

True