In [71]:
# %%writefile vote_scraper.py
from pymongo import MongoClient
import pprint 
import pandas as pd 
import copy
from bs4 import BeautifulSoup
import requests
import datetime

from time import sleep
import warnings

import codecs
import json

import boto

In [17]:
client = MongoClient('mongodb://localhost:27017/')
db = client.bills
vote_records = db.vote_records

# the 101st Congress (1989 - 1990) starts on pg 1011 for pageSize=250
house_url_root = 'http://clerk.house.gov/evs'


def house_year_iterator(date_range, root_url):
    print('_______________')
    print('Beginning iterations for House data for years {} to {}'.format(min(date_range), max(date_range)))
    print('_______________')
    for yr in date_range:
        site_url = '{}/{}/index.asp'.format(root_url, yr)
        req = requests.get(site_url)
        tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
        stat_code = req.status_code
        if stat_code != 200:
            print('_______________')
            print('_______________')
            print('Error requesting {}'.format(site_url))
            print('Request Status Code: {}, {}'.format(stat_code, tstamp))
            sleep(3)

    print('Iterations through years of House data complete')
    print('Last url requested: {}'.format(site_url))
    print("Examine output above for occurrences in request errors, if any.")

In [18]:
date_range = list(range(1990, 2019))

house_year_iterator(date_range, house_url_root)

In [8]:
vote_url_head = 'http://clerk.house.gov/evs'

In [88]:
def write_json_file(obj, path):
    '''Dump an object and write it ous as json to a file'''
    f = codecs.open(path, 'a', 'utf-8')
    json_record = json.dumps(obj, ensure_ascii = False)
    f.write(json_record + '\n')
    f.close

In [26]:
def get_table_summary(root_url, yr, final_roll_id):
    # get voting data by iterating through roll ids
    for i in range(1, final_roll_id + 1):
        # convert roll id to 3-digits for url
        three_digit_roll = '{}'.format(str(i).zfill(3))
        vote_table_url = '{}/{}/roll{}.xml'.format(root_url, yr, three_digit_roll)
    #     print(vote_table_url)
        req = requests.get(vote_table_url)
        stat_code = req.status_code

        # print verification that iterator is working
        if i%100 == 0:
            print('...................')
            print('...on Roll ID {}'.format(i))

        if stat_code != 200:
            print('_______________')
            print('_______________')
            print(site_url)
            print('Request Status Code: {}, {}'.format(stat_code, tstamp))

    #     sleep(3)

    print('Iterations through rolls for year {} complete.'.format(yr))
    print('Last url: {}'.format(vote_table_url))
    print("Examine output above for occurrences in request errors, if any.")


In [28]:
get_table_summary(vote_url_head, yr, final_roll)

In [31]:
# test code with year 1990 for House votes
site_url_root = 'http://clerk.house.gov/evs'
yr = '1990'

site_url = '{}/{}/index.asp'.format(site_url_root, yr)
req = requests.get(site_url)
tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
stat_code = req.status_code
print('_______________')
print('_______________')
print(site_url)
print('Request Status Code: {}, {}'.format(stat_code, tstamp))

# use BeautifulSoup to find the data we need.
soup = BeautifulSoup(req.content, 'lxml')
# print(soup.prettify())

table = soup.find('table')
# print(table.prettify())

rows = table.find_all('tr')

# initial request of webpage will show the final table with the most recent roll call votes
# get the largest value of roll for iteration
final_roll = int(rows[1].find_all('a')[0].text.strip())
print('Final Roll ID: {}'.format(final_roll))

_______________
_______________
http://clerk.house.gov/evs/1990/index.asp
Request Status Code: 200, 12-10-2018 12:26:39
Final Roll ID: 536


In [29]:
vote_table_url

'http://clerk.house.gov/evs/1990/roll223.xml'

In [34]:
print(rows[1])

<tr><td><a href="http://clerk.house.gov/cgi-bin/vote.asp?year=1990&amp;rollnumber=536">536</a></td>
<td><font face="Arial" size="-1">27-Oct</font></td>
<td><font face="Arial" size="-1"><a href="https://www.congress.gov/bill/101st-congress/senate-bill/280">S 280</a></font></td>
<td><font face="Arial" size="-1">SUSPEND THE RULES AND CONCUR IN S. ADTS. TO H. ADT</font></td>
<td align="CENTER"><font face="Arial" size="-1">F</font></td>
<td><font face="Arial" size="-1">NIOBRARA RIVER DESIGNATION</font></td></tr>


In [45]:
# all_rows = []
empty_row = {
            "year": yr,
            "roll": None, 
            "date": None, 
            "issue": None,
            "question": None,
            "result": None,
            "description": None
            }

In [94]:
# skip the header when reading table
for row in rows[1:]:
    new_row = copy.copy(empty_row)
    columns = row.find_all('td')
    new_row['roll'] = columns[0].text.strip()
    new_row['date'] = columns[1].text.strip()
    new_row['issue'] = columns[2].text.strip()
    new_row['question'] = columns[3].text.strip()
    new_row['result'] = columns[4].text.strip()
    new_row['description'] = columns[5].text.strip()
#     all_rows.append(new_row)
    write_json_file(new_row, '../data/test_data.txt')

In [80]:
all_rows[0]      #all_rows is a list of dictionaries

{'year': '1990',
 'roll': '536',
 'date': '27-Oct',
 'issue': 'S 280',
 'question': 'SUSPEND THE RULES AND CONCUR IN S. ADTS. TO H. ADT',
 'result': 'F',
 'description': 'NIOBRARA RIVER DESIGNATION'}

In [88]:
    
def write_json_lines_file(list_of_objs, path):
    '''dump an array of objects out as a json lines file'''
    f = codecs.open(path, 'w', 'utf-8')
    for row_object in list_of_objs:
        json_record = json.dumps(row_object, ensure_ascii = False)
        f.write(json_record + '\n')
    f.close()

In [91]:
write_json_file(all_rows[2], '../data/test_data.txt')

In [82]:
!pwd

/Users/u1b1700/Documents/Galvanize/galvanize_capstone/src


In [61]:
# create an AWS S3 connection
conn = boto.connect_s3()
# print(conn)

conn.get_all_buckets()

[<Bucket: aws-logs-220699157430-us-west-2>,
 <Bucket: capstonetwi>,
 <Bucket: capstoneyelp112018>,
 <Bucket: galvcap-legislation>,
 <Bucket: magdielb1>]

In [59]:
# create a bucket for all of our project data
# not needed after creation
# legislation_bucket = conn.create_bucket('galvcap-legislation')

In [63]:
legislation_bucket.get_all_keys()

[]

In [67]:
# write data to S3
roll_summary_file = legislation_bucket.new_key('roll_summaries.txt')
roll_summary_file.set_contents_from_file(all_rows)

AttributeError: 'list' object has no attribute 'tell'

In [None]:
# 


In [None]:
# if req.status_code == 200:
#################
browser.get(site_url)
soup = BeautifulSoup(browser.page_source, 'lxml')
# print(soup.prettify())
div = soup.find('div', {'class':'row'})
print(div.prettify())

In [None]:
# add page page html to mongo
collection_name.insert_one({'lxml': req.content})


# print result of load
with open('data/load_results.txt', 'a') as f:
    f.writelines('{}, {}, {}\n'.format(site_url, stat_code, tstamp))
f.close()


##################
# else: 
#     print('failed to get {}'.format(site_url))
#     # print result of load
#     with open('data/logs/load_results.txt', 'a') as f:
#         f.writelines('{}, {}, {}\n'.format(site_url, stat_code, tstamp))
#     f.close()