In [68]:
# %%writefile mongo_export.py
'''
Script to export the `bill_details` collection of the `bills` MongoDB database to per-year JSONL files, upload those files to S3, and ultimately make them available on EC2.
'''

from pymongo import MongoClient
import bson.json_util
import re
import string
import copy
import os
import boto
from boto.s3.connection import S3Connection, Location


from my_tools import write_jsonl_file

In [2]:
# Connect to MongoDB; MongoClient() with no arguments defaults to localhost.
client = MongoClient()
# Handles to the `bills` database and its `bill_details` collection.
db = client['bills']
bill_details = db['bill_details']


In [3]:
# Years to export, newest first: 2018 down to 1990 inclusive.
date_range = range(2018, 1989, -1)


In [58]:
# Template for a single exported row: every expected field, each defaulting to None.
empty_row = dict.fromkeys([
    'leg_id',
    'leg_type',
    'leg_url',
    'intro_date',
    'congress_id',
    'desc',
    'sponsor',
    'sponsor_party',
    'sponsor_state',
    'sponsor_district',   # senators don't have districts
    'num_of_cosponsors',
    'cosponsors_url',
    'cosponsors',         # requires navigation to another url and extracting names from table
    'committee',
    'bill_status',
    'body',               # requires navigation to another url
])


In [66]:
# Export one JSONL file per year: query the bills whose intro_date mentions the
# year, project each document onto the fields of `empty_row`, and write the rows out.
for d in date_range:

    print('--------------------')
    print('Extracting year {}'.format(d))
    date_str = str(d)

    print('--------------------')
    outfile = '../data/bill_details_{}.jsonl'.format(date_str)
    print('Writing file out to {}'.format(outfile))

    # NOTE(review): the regex is unanchored, so it matches the year digits anywhere
    # inside intro_date -- confirm this is safe for the stored date format.
    cursor = bill_details.find({'intro_date': {'$regex': date_str}})

    # Keep only the template's fields, in the template's order. A field missing
    # from a document falls back to the template default (None) instead of
    # raising KeyError and aborting the whole export.
    all_rows = [
        {field: record.get(field, default) for field, default in empty_row.items()}
        for record in cursor
    ]

    write_jsonl_file(all_rows, outfile)
                                        

--------------------
Extracting year 2018
--------------------
Writing file out to ../data/bill_details_2018.jsonl
--------------------
Extracting year 2017
--------------------
Writing file out to ../data/bill_details_2017.jsonl
--------------------
Extracting year 2016
--------------------
Writing file out to ../data/bill_details_2016.jsonl
--------------------
Extracting year 2015
--------------------
Writing file out to ../data/bill_details_2015.jsonl
--------------------
Extracting year 2014
--------------------
Writing file out to ../data/bill_details_2014.jsonl
--------------------
Extracting year 2013
--------------------
Writing file out to ../data/bill_details_2013.jsonl
--------------------
Extracting year 2012
--------------------
Writing file out to ../data/bill_details_2012.jsonl
--------------------
Extracting year 2011
--------------------
Writing file out to ../data/bill_details_2011.jsonl
--------------------
Extracting year 2010
--------------------
Writing file out 

In [71]:
# Open a connection to S3 in us-west-2, using the regional endpoint explicitly.
conn = boto.s3.connect_to_region(
    'us-west-2',
    host='s3-us-west-2.amazonaws.com',
)

# List the buckets reachable through this connection.
available_buckets = conn.get_all_buckets()
print('Buckets available: {}'.format(available_buckets))


Buckets available: [<Bucket: aws-logs-220699157430-us-west-2>, <Bucket: galvcap-leg>, <Bucket: magdielb1>]


In [72]:
# Upload every local bill_details_*.jsonl export to the galvcap-leg bucket.
legislation_bucket = conn.get_bucket('galvcap-leg')

print('Keys currently in bucket galvcap-leg: {}'.format(legislation_bucket.get_all_keys()))

data_dir = '../data'
# Sort the listing so the upload order (and the log) is deterministic.
for f in sorted(os.listdir(data_dir)):
    if not f.startswith('bill_details'):
        continue

    print('Loading {} to s3'.format(f))
    # create new key in s3, named after the local file
    file_ = legislation_bucket.new_key(f)

    # copy one local file to s3
    filepath = os.path.join(data_dir, f)
    file_.set_contents_from_filename(filepath)
    # The previous version immediately re-downloaded the object with
    # get_contents_to_filename(f), which doubled the transfer and left a
    # duplicate of every export in the current working directory.

print('Upload to s3 complete!')

Keys currently in bucket galvcap-leg: [<Key: galvcap-leg,bill_texts_1990.jsonl>, <Key: galvcap-leg,bill_texts_1991.jsonl>, <Key: galvcap-leg,bill_texts_1992.jsonl>, <Key: galvcap-leg,bill_texts_1993.jsonl>, <Key: galvcap-leg,bill_texts_1994.jsonl>, <Key: galvcap-leg,bill_texts_1995.jsonl>, <Key: galvcap-leg,bill_texts_1996.jsonl>, <Key: galvcap-leg,bill_texts_1997.jsonl>, <Key: galvcap-leg,bill_texts_1998.jsonl>, <Key: galvcap-leg,bill_texts_1999.jsonl>, <Key: galvcap-leg,bill_texts_2000.jsonl>, <Key: galvcap-leg,bill_texts_2001.jsonl>, <Key: galvcap-leg,bill_texts_2002.jsonl>, <Key: galvcap-leg,bill_texts_2003.jsonl>, <Key: galvcap-leg,bill_texts_2004.jsonl>, <Key: galvcap-leg,bill_texts_2005.jsonl>, <Key: galvcap-leg,bill_texts_2006.jsonl>, <Key: galvcap-leg,bill_texts_2007.jsonl>, <Key: galvcap-leg,bill_texts_2008.jsonl>, <Key: galvcap-leg,bill_texts_2009.jsonl>, <Key: galvcap-leg,bill_texts_2010.jsonl>, <Key: galvcap-leg,bill_texts_2011.jsonl>, <Key: galvcap-leg,bill_texts_2012.jso