In [39]:
%%writefile truncate_bill_text.py
'''
This script truncates the 'body' in each Mongo document to store the actual bill text in key bill_text
'''
from pymongo import MongoClient
from my_tools import read_jsonl_file
from datetime import date

def update_mongo_bill_text(leg_id, cong_id, bill_text_trunc, collection):
    '''
    ------------------------------------------
    Writes bill_text_trunc into the 'bill_text' field of the single mongo 
    record in collection whose 'leg_id' and 'congress_id' match the given values.
    
    ------------------------------------------
    Parameters: leg_id - value to filter on for key leg_id
                cong_id - value to filter on for key congress_id
                bill_text_trunc - truncated text to store under key bill_text
                collection - the mongo collection object (must provide update_one)
                
    ------------------------------------------
    Returns:    None
    
    ------------------------------------------
    '''
    # which record to touch
    record_filter = {'leg_id': leg_id, 'congress_id': cong_id}
    # what to change on that record
    new_values = {'$set': {'bill_text': bill_text_trunc}}

    collection.update_one(record_filter, new_values)


def truncate_bill_body(bill_text):
    '''
    ------------------------------------------
    Removes the header and footer from the full text stored in 'body' so that 
    only the actual bill text remains.
    
    Header: everything up to and including the first bill-type heading 
    ('A BILL', 'JOINT RESOLUTION', 'An Act', ...) found in the first 5000 
    characters. When no heading is present, the text is cut after 'ing Office'.
    
    Footer: when 'LEGISLATIVE HISTORY' is present, everything from the last 
    'Approved' before it is dropped.
    
    ------------------------------------------
    Parameters: bill_text - full text of the bill as stored in 'body'
                
    ------------------------------------------
    Returns:    the truncated text (str). An empty string is returned when 
                bill_text is None/empty or when nothing follows the heading. 
                The text is returned with its header intact when no known 
                heading can be located.
    
    ------------------------------------------
    '''
    if not bill_text:
        return ''

    # headings in priority order; each one is exactly two space-separated tokens
    headers = ('A BILL', 'A Bill', 'JOINT RESOLUTION', 'Joint Resolution', 'An Act', 'AN ACT')

    # only look for the heading near the top of the document
    preamble = bill_text[:5000]
    header_text = next((header for header in headers if header in preamble), 'ing Office')

    text_start = bill_text.find(header_text)

    if text_start == -1:
        # no known heading at all: there is no header we know how to remove
        bill_text_trunc = bill_text
    else:
        # drop the two heading tokens and keep whatever follows them
        pieces = bill_text[text_start:].split(' ', 2)
        bill_text_trunc = pieces[2] if len(pieces) == 3 else ''

    # truncate bill text to remove footer
    text_end = bill_text_trunc.find('LEGISLATIVE HISTORY')
    if text_end != -1:
        # maxsplit=1 cuts at the LAST 'Approved' so the word can still appear inside the bill
        bill_text_trunc = bill_text_trunc[:text_end].rsplit('Approved', 1)[0]

    return bill_text_trunc
    
    
    

if __name__ == '__main__':
    # MongoClient() with no arguments uses pymongo's default connection settings
    client = MongoClient()
    db = client.bills
    bill_info = db.bill_info
    
    # retrieve logs where the bill text has changed when get_bill_text was run
    log_path = '/home/ubuntu/galvanize_capstone/data/logs/mongo_updates.jsonl'
    logs = read_jsonl_file(log_path)
    
    # date.today must be CALLED; date.today.isoformat raises AttributeError
    today = date.today().isoformat()

    for log in logs:
        # only act on logs whose body was updated with get_bill_text today
        if 'body' not in log.keys() or log['body']['date'] != today:
            continue

        cong_id = log['congress_id']
        leg_id = log['leg_id'] 
        print('The bills text for Congress ID {}, {} has changed. Updating truncated text'.format(cong_id, leg_id))

        # use cong_id and leg_id in log to pull bill text from Mongo and clip it
        doc = bill_info.find_one({'congress_id': cong_id, 'leg_id': leg_id})

        # find_one returns None when nothing matches; skip rather than crash mid-run
        if doc is None or not doc.get('body'):
            print('No body found for Congress ID {}, {}. Skipping'.format(cong_id, leg_id))
            continue

        bill_text_clipped = truncate_bill_body(doc['body'])

        # update Mongo
        update_mongo_bill_text(leg_id, cong_id, bill_text_clipped, bill_info)

            
            
            

Overwriting truncate_bill_text.py


In [32]:
#exploration
from pymongo import MongoClient
from my_tools import read_jsonl_file
from datetime import date

# Open the connection once and keep handles to the database and the collection
# used by the exploration cells.
client = MongoClient()
db = client['bills']
bill_info = db['bill_info']


In [20]:
# str() of a date object yields its ISO 8601 form (YYYY-MM-DD)
today = str(date.today())
today

'2019-01-09'

In [34]:
# Every change made to Mongo is logged in mongo_updates.jsonl.
log_path = '/home/ubuntu/galvanize_capstone/data/logs/mongo_updates.jsonl'
logs = read_jsonl_file(log_path)
today = date.today().isoformat()

# Show each log entry that records a change to 'body', and whether it is dated today.
for log in logs:
    if 'body' not in log.keys():
        continue

    for _ in range(2):
        print('----------------------')

    cong_id = log['congress_id']
    leg_id = log['leg_id']
    body_change = log['body']

    print(log['congress_id'], log['leg_id'])
    print(body_change['date'] == today)
    print(body_change)
            
        
        

----------------------
----------------------
116 H R 227
True
{'old_value': None, 'new_value': "[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [H.R. 227 Introduced in House (IH)] <DOC> 116th CONGRESS 1st Session H. R. 227 To amend the Small Business Act to specify what credit is given for certain subcontractors and to provide a dispute process for non-payment to subcontractors, and for other purposes. _______________________________________________________________________ IN THE HOUSE OF REPRESENTATIVES January 3, 2019 Ms. Velazquez (for herself and Mr. Kelly of Mississippi) introduced the following bill; which was referred to the Committee on Small Business _______________________________________________________________________ A BILL To amend the Small Business Act to specify what credit is given for certain subcontractors and to provide a dispute process for non-payment to subcontractors, and for other purposes. Be it enacted by the Senate and Hou

In [35]:
# Shows whatever cong_id / leg_id were last assigned in the kernel (the log loop
# reassigns both on every matching entry) -- relies on that cell having been run first.
cong_id, leg_id

('116', 'H J Res 22')

In [38]:
# Pull the stored document for the current cong_id / leg_id and display its raw body.
doc = bill_info.find_one(dict(congress_id=cong_id, leg_id=leg_id))
bill_text = doc['body']
bill_text

"[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [H.J. Res. 22 Introduced in House (IH)] <DOC> 116th CONGRESS 1st Session H. J. RES. 22 Proposing a balanced budget amendment to the Constitution of the United States. _______________________________________________________________________ IN THE HOUSE OF REPRESENTATIVES January 8, 2019 Mr. Chabot (for himself, Mr. Higgins of Louisiana, Mr. King of Iowa, Mr. David P. Roe of Tennessee, Mr. Rice of South Carolina, Mr. Guthrie, Mr. Olson, Mr. Bacon, Mr. Griffith, Mr. Turner, Mr. Simpson, Mr. Conaway, Mr. Flores, Mr. Calvert, Mr. Latta, Mr. Hudson, Mr. Palazzo, Mr. Emmer, Mr. Comer, Mr. Allen, Mr. Abraham, Mr. Arrington, Mr. Smucker, and Mr. Kustoff of Tennessee) submitted the following joint resolution; which was referred to the Committee on the Judiciary _______________________________________________________________________ JOINT RESOLUTION Proposing a balanced budget amendment to the Constitution of the U

In [21]:
# Checks whether the body change in `log` is dated today. Both `log` (the loop
# variable of the log loop) and `today` must already exist in the kernel.
log['body']['date'] == today

True

In [29]:
# Number of entries loaded into `logs` by read_jsonl_file.
len(logs)

271

In [10]:
# Count documents of the 116th Congress whose bill_text matches '(.+)',
# i.e. is a string containing at least one character.
doc_count = bill_info.count_documents(
    {
        'congress_id': '116',
        'bill_text': {'$regex': '(.+)'},
    }
)
doc_count

26

In [5]:
# Cursor over documents of the 116th Congress that already have some bill_text.
docs = bill_info.find(
    {
        'congress_id': '116',
        'bill_text': {'$regex': '(.+)'},
    }
)

In [8]:
# Inspect the second document (index 1) of the `docs` cursor built in a separate cell.
docs[1]

{'_id': ObjectId('5c2fb9ad1417de116c271824'),
 'leg_id': 'H R 238',
 'leg_type': 'BILL',
 'leg_url': 'https://www.congress.gov/bill/116th-congress/house-bill/238?s=1&r=2',
 'intro_date': '01/03/2019',
 'congress_id': '116',
 'desc': 'To authorize the President to award the Medal of Honor to James Megellas, formerly of Fond du Lac, Wisconsin, and currently of Colleyville, Texas, for acts of valor on January 28, 1945, during the Battle of the Bulge in World War II.',
 'sponsor': 'Grothman, Glenn',
 'sponsor_party': 'R',
 'sponsor_state': 'WI',
 'sponsor_district': '6',
 'num_of_cosponsors': '0',
 'cosponsors_url': None,
 'cosponsors': None,
 'num_of_amendments': '0',
 'committee': 'House - Armed Services',
 'bill_status': 'Introduced',
 'body': '[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [H.R. 238 Introduced in House (IH)] <DOC> 116th CONGRESS 1st Session H. R. 238 To authorize the President to award the Medal of Honor to James Megellas, formerly of

In [11]:
# Cursor over documents that have body text but whose bill_text is None
# (presumably also matches documents with no bill_text key -- confirm against MongoDB null-query semantics).
docs = bill_info.find(
    {
        'body': {'$regex': '(.+)'},
        'bill_text': None,
    }
)

In [11]:
# Inspect the document at index 25 of the `docs` cursor built in a separate cell.
docs[25]

{'_id': ObjectId('5c2fb9cd1417de116c27192f'),
 'leg_id': 'S J Res 1',
 'leg_type': 'JOINT RESOLUTION',
 'leg_url': 'https://www.congress.gov/bill/116th-congress/senate-joint-resolution/1?s=1&r=312',
 'intro_date': '01/03/2019',
 'congress_id': '116',
 'desc': 'A joint resolution proposing an amendment to the Constitution of the United States relative to limiting the number of terms that a Member of Congress may serve.',
 'sponsor': 'Cruz, Ted',
 'sponsor_party': 'R',
 'sponsor_state': 'TX',
 'sponsor_district': None,
 'num_of_cosponsors': '2',
 'cosponsors_url': 'https://www.congress.gov/bill/116th-congress/senate-joint-resolution/1/cosponsors?s=1&r=312&overview=closed#tabs',
 'cosponsors': None,
 'num_of_amendments': '0',
 'committee': 'Senate - Judiciary',
 'bill_status': 'Introduced',
 'body': "[Congressional Bills 116th Congress] [From the U.S. Government Publishing Office] [S.J. Res. 1 Introduced in Senate (IS)] <DOC> 116th CONGRESS 1st Session S. J. RES. 1 Proposing an amendment 

In [None]:
# Count documents whose body matches '(.+)', i.e. contains at least one character.
doc_count = bill_info.count_documents(
    {
        'body': {'$regex': '(.+)'},
    }
)

In [None]:
# Display the most recently computed doc_count.
doc_count

In [None]:
# An empty filter matches every document, so this is the size of the collection.
all_docs = bill_info.count_documents(dict())

In [None]:
# Display the total document count computed in a separate cell.
all_docs

In [None]:
# Manual walk-through of the truncation steps on one document whose text is
# expected to use the 'An Act' heading.
leg_id = 'H R 6897'
cong_id = '115'
doc = bill_info.find_one({'leg_id': leg_id, 'congress_id': cong_id})

# find_one returns None when nothing matches, so guard before reading the body
bill_text = doc.get('body') if doc else None

if not bill_text:
    print('No body text found for {} in Congress {}'.format(leg_id, cong_id))
else:
    text_start = bill_text.find('An Act')

    if text_start == -1:
        # heading absent: leave the text as is instead of slicing from index -1
        bill_text_trunc = bill_text
    else:
        # drop the two heading tokens and keep what follows
        pieces = bill_text[text_start:].split(' ', 2)
        bill_text_trunc = pieces[2] if len(pieces) == 3 else ''

    text_end = bill_text_trunc.find('LEGISLATIVE HISTORY')
    if text_end != -1:
        # only cut the footer when it exists (a -1 index would silently drop the last character);
        # maxsplit=1 cuts at the LAST 'Approved' rather than the first
        bill_text_trunc = bill_text_trunc[:text_end].rsplit('Approved', 1)[0]

    print(bill_text_trunc)