In [27]:
%%writefile truncate_bill_text.py
'''
This script truncates the 'body' in each Mongo document to store the actual bill text in key bill_text
'''
from pymongo import MongoClient
from my_tools import read_jsonl_file
from datetime import date

def update_mongo_bill_text(leg_id, cong_id, bill_text_trunc, collection):
    '''
    ------------------------------------------
    Stores bill_text_trunc under the key bill_text on a single document of
    collection whose leg_id and congress_id match the given values.

    ------------------------------------------
    Parameters: leg_id - value to match on key leg_id
                cong_id - value to match on key congress_id
                bill_text_trunc - text written to key bill_text
                collection - collection object exposing update_one
                             (not the collection's name)

    ------------------------------------------
    Returns:    None

    ------------------------------------------
    '''
    record_filter = {'leg_id': leg_id, 'congress_id': cong_id}
    new_values = {'$set': {'bill_text': bill_text_trunc}}
    collection.update_one(record_filter, new_values)


# Two-word markers that introduce the body of a bill, in the order they are tried.
HEADER_MARKERS = (
    'A BILL',
    'A Bill',
    'JOINT RESOLUTION',
    'Joint Resolution',
    'An Act',
    'AN ACT',
)
# Marker used when none of HEADER_MARKERS appears near the top of the document.
FALLBACK_HEADER = 'ing Office'
# Only this many leading characters are scanned for HEADER_MARKERS.
HEADER_SEARCH_WINDOW = 5000
# Everything from this marker onwards is treated as footer.
FOOTER_MARKER = 'LEGISLATIVE HISTORY'


def truncate_bill_text(bill_text):
    '''
    ------------------------------------------
    Removes the header, and the footer when one is present, from the raw
    'body' of a bill.

    The header ends at the first marker from HEADER_MARKERS found in the first
    HEADER_SEARCH_WINDOW characters (FALLBACK_HEADER otherwise); the marker's
    two space-separated words are dropped along with everything before them.
    When FOOTER_MARKER is present, the text is cut there and then again at the
    last 'Approved' that precedes it.

    ------------------------------------------
    Parameters: bill_text - raw text stored under key 'body'

    ------------------------------------------
    Returns:    the truncated text, or None when no header marker is found
                or nothing follows the marker

    ------------------------------------------
    '''
    preamble = bill_text[:HEADER_SEARCH_WINDOW]
    header_text = next(
        (marker for marker in HEADER_MARKERS if marker in preamble),
        FALLBACK_HEADER,
    )

    # str.find returns -1 on a miss; slicing from -1 would keep only the last
    # character and the indexing below would raise IndexError
    text_start = bill_text.find(header_text)
    if text_start == -1:
        return None

    # every marker is two words long, so the third piece is the bill text
    pieces = bill_text[text_start:].split(' ', 2)
    if len(pieces) < 3:
        return None
    bill_text_trunc = pieces[2]

    text_end = bill_text_trunc.find(FOOTER_MARKER)
    if text_end != -1:
        # maxsplit=1 cuts at the LAST 'Approved' (the trailing approval line);
        # without it, [0] would stop at the first 'Approved' in the bill text
        bill_text_trunc = bill_text_trunc[:text_end].rsplit('Approved', 1)[0]

    return bill_text_trunc


if __name__ == '__main__':
    client = MongoClient()
    db = client.bills
    bill_info = db.bill_info

    # retrieve logs where the bill text has changed when get_bill_text was run
    log_path = '/home/ubuntu/galvanize_capstone/data/logs/mongo_updates.jsonl'
    logs = read_jsonl_file(log_path)

    # NOTE(review): this loop only reports the changed bills. Documents that
    # already have a bill_text are not matched by the query below, so their
    # truncated text is not refreshed here - confirm whether that is intended.
    for log in logs:
        if 'body' in log:
            print('----------------------')
            print('----------------------')
            cong_id = log['congress_id']
            leg_id = log['leg_id']
            print('The bills text for Congress ID {}, {} has changed. Updating truncated text'.format(cong_id, leg_id))

    # documents that have a body but no truncated text yet
    needs_truncation = {'body': {'$regex': '(.+)'}, 'bill_text': None}

    # get doc count to show status
    doc_count = bill_info.count_documents(needs_truncation)
    print('-------------------')
    print('There are {} bills needing truncated text'.format(doc_count))

    # retrieve Mongo documents
    documents = bill_info.find(needs_truncation)

    for i, doc in enumerate(documents):
        leg_id = doc['leg_id']
        cong_id = doc['congress_id']

        bill_text_trunc = truncate_bill_text(doc['body'])

        if bill_text_trunc is None:
            # leave the document untouched instead of aborting the whole run
            print('Could not find the start of the bill text for Congress ID {}, {}. Skipping'.format(cong_id, leg_id))
        else:
            # update Mongo
            update_mongo_bill_text(leg_id, cong_id, bill_text_trunc, bill_info)

        # show status (scaled by 100 so the value matches the % sign)
        if doc_count and i % 20 == 0:
            print('{:.2f}% complete truncating bill text'.format(100 * i / doc_count))

Overwriting truncate_bill_text.py


In [9]:
#exploration
from pymongo import MongoClient
from my_tools import read_jsonl_file
from datetime import date

# Handles used by the exploration cells: client -> 'bills' database -> 'bill_info' collection.
client = MongoClient()
db = client['bills']
bill_info = db['bill_info']


In [20]:
# Today's date as an ISO-format string, compared against log dates further down.
current_day = date.today()
today = current_day.isoformat()
today

'2019-01-09'

In [24]:
log_path = '/home/ubuntu/galvanize_capstone/data/logs/mongo_updates.jsonl'
logs = read_jsonl_file(log_path)
today = date.today().isoformat()

# For every log entry that has a 'body' key, show which bill it refers to
# and whether the date recorded under 'body' equals today.
for log in logs:
    if 'body' not in log.keys():
        continue
    print('----------------------')
    print('----------------------')
    print(log['congress_id'], log['leg_id'])
    logged_today = log['body']['date'] == today
    print(logged_today)
        
        

----------------------
----------------------
116 H R 227
True
----------------------
----------------------
116 H R 226
True
----------------------
----------------------
116 H R 221
True
----------------------
----------------------
116 H R 206
True
----------------------
----------------------
116 H R 202
True
----------------------
----------------------
116 H R 192
True
----------------------
----------------------
116 H R 190
True
----------------------
----------------------
116 H R 161
True
----------------------
----------------------
116 H R 150
True
----------------------
----------------------
116 H R 136
True
----------------------
----------------------
116 H R 135
True
----------------------
----------------------
116 H R 133
True
----------------------
----------------------
116 H R 116
True
----------------------
----------------------
116 H R 115
True
----------------------
----------------------
116 H R 113
True
----------------------
----------------------
116 H R 5

In [21]:
log['body']['date'] == today

True

In [25]:
len(logs)

271

In [10]:
# Number of documents with congress_id '116' whose bill_text matches (.+),
# i.e. holds at least one character.
has_text_116 = {
    'congress_id': '116',
    'bill_text': {'$regex': '(.+)'},
}
doc_count = bill_info.count_documents(has_text_116)
doc_count

26

In [5]:
# Cursor over documents with congress_id '116' whose bill_text matches (.+).
docs_filter = {
    'congress_id': '116',
    'bill_text': {'$regex': '(.+)'},
}
docs = bill_info.find(docs_filter)

In [8]:
docs[1]

{'_id': ObjectId('5c2fb9ad1417de116c271824'),
 'leg_id': 'H R 238',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/116th-congress/house-bill/238?s=1&r=2',
 'intro_date': '01/03/2019',
 'congress_id': '116',
 'desc': 'To authorize the President to award the Medal of Honor to James Megellas, formerly of Fond du Lac, Wisconsin, and currently of Colleyville, Texas, for acts of valor on January 28, 1945, during the Battle of the Bulge in World War II.',
 'sponsor': 'Grothman, Glenn',
 'sponsor_party': 'R',
 'sponsor_state': 'WI',
 'sponsor_district': '6',
 'num_of_cosponsors': '0',
 'cosponsors_url': None,
 'cosponsors': None,
 'num_of_amendments': '0',
 'committee': 'House - Armed Services',
 'bill_status': 'Introduced',
 'body': '[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [H.R. 238 Introduced in House (IH)] <DOC> 116th CONGRESS 1st Session H. R. 238 To authorize the President to award the Medal of Honor to James Megellas, formerly of

In [11]:
# Cursor over documents whose body matches (.+) and whose bill_text is None.
untruncated_filter = {
    'body': {'$regex': '(.+)'},
    'bill_text': None,
}
docs = bill_info.find(untruncated_filter)

In [11]:
docs[25]

{'_id': ObjectId('5c2fb9cd1417de116c27192f'),
 'leg_id': 'S J Res 1',
 'leg_type': 'JOINT RESOLUTION',
 'leg_url': 'https://www.congress.gov/bill/116th-congress/senate-joint-resolution/1?s=1&r=312',
 'intro_date': '01/03/2019',
 'congress_id': '116',
 'desc': 'A joint resolution proposing an amendment to the Constitution of the United States relative to limiting the number of terms that a Member of Congress may serve.',
 'sponsor': 'Cruz, Ted',
 'sponsor_party': 'R',
 'sponsor_state': 'TX',
 'sponsor_district': None,
 'num_of_cosponsors': '2',
 'cosponsors_url': 'https://www.congress.gov/bill/116th-congress/senate-joint-resolution/1/cosponsors?s=1&r=312&overview=closed#tabs',
 'cosponsors': None,
 'num_of_amendments': '0',
 'committee': 'Senate - Judiciary',
 'bill_status': 'Introduced',
 'body': "[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [S.J. Res. 1 Introduced in Senate (IS)] <DOC> 116th CONGRESS 1st Session S. J. RES. 1 Proposing an amendment 

In [None]:
# Number of documents whose body matches (.+), whether or not bill_text is set.
has_body = {'body': {'$regex': '(.+)'}}
doc_count = bill_info.count_documents(has_body)

In [None]:
doc_count

In [None]:
# Total number of documents in the collection (an empty filter matches all of them).
match_everything = {}
all_docs = bill_info.count_documents(match_everything)

In [None]:
all_docs

In [None]:
# Spot-check the header/footer truncation on a single bill.
leg_id = 'H R 6897'
cong_id = '115'
doc = bill_info.find_one({'leg_id': leg_id, 'congress_id': cong_id})
# find_one returns None when nothing matches; fail with a readable message
assert doc is not None, 'No document for Congress ID {}, {}'.format(cong_id, leg_id)
bill_text = doc['body']

# str.find returns -1 on a miss, which would make the slice below start at the
# last character and the [2] lookup raise IndexError
text_start = bill_text.find('An Act')
assert text_start != -1, "'An Act' not found in the body"

# drop the two header words ('An', 'Act') and keep the remainder
bill_text_trunc = bill_text[text_start:].split(' ', 2)[2]

text_end = bill_text_trunc.find('LEGISLATIVE HISTORY')
if text_end != -1:
    # only cut when the footer is present; slicing with -1 would silently drop
    # the last character. maxsplit=1 cuts at the LAST 'Approved' before the footer.
    bill_text_trunc = bill_text_trunc[:text_end].rsplit('Approved', 1)[0]
print(bill_text_trunc)