In [15]:
%%writefile truncate_bill_text.py
'''
This script truncates the 'body' in each Mongo document to store the actual bill text in key bill_text
'''
from pymongo import MongoClient

def update_mongo_bill_text(leg_id, cong_id, bill_text_trunc, collection):
    '''
    ------------------------------------------
    Writes bill_text_trunc into the bill_text field of a single document in
    collection, selected by matching both leg_id and congress_id.

    ------------------------------------------
    Parameters: leg_id - value matched against key leg_id
                cong_id - value matched against key congress_id
                bill_text_trunc - value stored under key bill_text
                collection - object exposing update_one(filter, update);
                             the script passes a pymongo collection

    ------------------------------------------
    Returns:    None

    ------------------------------------------
    '''
    # Which document to touch: both identifiers must match.
    selector = {'leg_id': leg_id, 'congress_id': cong_id}
    # What to change: only bill_text is set, other fields are left alone.
    change = {'$set': {'bill_text': bill_text_trunc}}
    collection.update_one(selector, change)


# Header markers, in priority order. The first one present in the leading
# _HEADER_SEARCH_WINDOW characters of a body decides where the bill text starts.
_HEADER_MARKERS = (
    'A BILL',
    'A Bill',
    'JOINT RESOLUTION',
    'Joint Resolution',
    'An Act',
    'AN ACT',
)
# Used when none of the markers above is present in the search window.
_FALLBACK_MARKER = 'ing Office'
_HEADER_SEARCH_WINDOW = 5000
_FOOTER_MARKER = 'LEGISLATIVE HISTORY'
_APPROVAL_MARKER = 'Approved'


def truncate_bill_text(bill_text):
    '''
    ------------------------------------------
    Strips the leading header and the trailing footer from a bill body.

    Header: everything up to and including the two space-separated tokens
    that start at the chosen header marker is dropped. If no marker occurs
    in the body at all, the body is kept from its beginning instead of
    raising.

    Footer: if 'LEGISLATIVE HISTORY' occurs, the text is cut there and then
    cut again at the last 'Approved' before it (if any).

    ------------------------------------------
    Parameters: bill_text - the full 'body' string of a bill document

    ------------------------------------------
    Returns:    the truncated text (str); '' when nothing follows the header

    ------------------------------------------
    '''
    window = bill_text[:_HEADER_SEARCH_WINDOW]
    header_text = next(
        (marker for marker in _HEADER_MARKERS if marker in window),
        _FALLBACK_MARKER,
    )

    text_start = bill_text.find(header_text)
    if text_start == -1:
        # No marker anywhere: keep the whole body rather than crash the run.
        truncated = bill_text
    else:
        # Every marker spans two space-separated tokens; drop exactly those.
        pieces = bill_text[text_start:].split(' ', 2)
        truncated = pieces[2] if len(pieces) == 3 else ''

    text_end = truncated.find(_FOOTER_MARKER)
    if text_end != -1:
        # maxsplit=1 cuts at the LAST 'Approved' only, so an earlier
        # occurrence of the word inside the bill does not truncate it.
        truncated = truncated[:text_end].rsplit(_APPROVAL_MARKER, 1)[0]

    return truncated


def main():
    '''
    ------------------------------------------
    Truncates the 'body' of every document in bills.bill_info that has a
    non-empty body and no bill_text yet, storing the result in bill_text.
    Prints a progress line every 500 documents.

    ------------------------------------------
    Returns:    None

    ------------------------------------------
    '''
    # The context manager closes the client when the run ends or fails.
    with MongoClient() as client:
        db = client.bills
        bill_info = db.bill_info

        # One filter for both the count and the cursor so they cannot drift.
        pending = {'body': {'$regex': '(.+)'}, 'bill_text': None}

        # get doc count to show status
        doc_count = bill_info.count_documents(pending)

        for i, doc in enumerate(bill_info.find(pending)):
            bill_text_trunc = truncate_bill_text(doc['body'])

            # update Mongo
            update_mongo_bill_text(doc['leg_id'], doc['congress_id'],
                                   bill_text_trunc, bill_info)

            # show status (scaled by 100 so the value really is a percentage)
            if i % 500 == 0:
                print('{:.2f}% complete'.format(100 * i / doc_count))


if __name__ == '__main__':
    main()

Overwriting truncate_bill_text.py


In [4]:
# Exploration: open the collection the script above works on
# (database 'bills', collection 'bill_info'), using MongoClient() with
# no arguments, i.e. pymongo's default connection settings.
from pymongo import MongoClient
client = MongoClient()
db = client.bills
bill_info = db.bill_info


In [11]:
# Cursor over documents whose 'body' matches the regex '(.+)' and whose
# 'bill_text' matches None.
docs = bill_info.find({'body': {'$regex': '(.+)'}, 'bill_text': None})

In [12]:
# Show the first document of the cursor to inspect its fields.
docs[0]

{'_id': ObjectId('5c26ca411417de25ef8add61'),
 'leg_id': 'H R 2369',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/112th-congress/house-bill/2369?s=1&r=57759',
 'intro_date': '06/24/2011',
 'congress_id': '112',
 'desc': 'To amend title 36, United States Code, to provide for an additional power for the American Legion under its Federal charter.',
 'sponsor': 'Altmire, Jason',
 'sponsor_party': 'D',
 'sponsor_state': 'PA',
 'sponsor_district': '4',
 'num_of_cosponsors': '432',
 'cosponsors_url': 'https://www.congress.gov/bill/112th-congress/house-bill/2369/cosponsors?s=1&r=57759&overview=closed#tabs',
 'cosponsors': None,
 'num_of_amendments': '0',
 'committee': 'House - Judiciary',
 'bill_status': 'Introduced',
 'body': "[Congressional Bills 112th Congress] [From the U.S. Government Printing Office] [H.R. 2369 Reported in House (RH)] Union Calendar No. 212 112th CONGRESS 1st Session H. R. 2369 [Report No. 112-313] To amend title 36, United States Code, to provide for 

In [None]:
# Count documents whose 'body' matches the regex '(.+)', regardless of bill_text.
doc_count = bill_info.count_documents({'body': {'$regex': '(.+)'}})

In [None]:
# Display the number of documents that have a matching 'body'.
doc_count

In [None]:
# Count every document in the collection (empty filter).
all_docs = bill_info.count_documents({})

In [None]:
# Display the total document count.
all_docs

In [None]:
# Try the header/footer truncation on a single bill and print the result.
leg_id = 'H R 6897'
cong_id = '115'
doc = bill_info.find_one({'leg_id': leg_id, 'congress_id': cong_id})
if doc is None:
    # find_one returns None when nothing matches; fail with a clear message.
    raise LookupError('no document for {} in congress {}'.format(leg_id, cong_id))
bill_text = doc['body']
text_start = bill_text.find('An Act')
if text_start == -1:
    # Without this check the slice below would start at the last character.
    raise ValueError("header 'An Act' not found in body")

# Drop the two space-separated header tokens ('An', 'Act').
header_parts = bill_text[text_start:].split(' ', 2)
bill_text_trunc = header_parts[2] if len(header_parts) == 3 else ''

text_end = bill_text_trunc.find('LEGISLATIVE HISTORY')
if text_end != -1:
    # maxsplit=1 cuts at the last 'Approved' before the footer, not the first
    # occurrence of the word anywhere in the bill.
    bill_text_trunc = bill_text_trunc[:text_end].rsplit('Approved', 1)[0]
print(bill_text_trunc)