In [17]:
# %%writefile vote_scraper.py
from pymongo import MongoClient
import pprint 
import pandas as pd 
import copy
from bs4 import BeautifulSoup
import requests
import datetime

from time import sleep
import warnings

# MongoDB handles: local server -> `bills` database -> `vote_records` collection.
client = MongoClient(host='mongodb://localhost:27017/')
db = client['bills']
vote_records = db['vote_records']

# the 101st Congress (1989 - 1990) starts on pg 1011 for pageSize=250
# NOTE(review): nothing in the visible cells pages through results with a
# pageSize, so the remark above may belong to a different scraper -- confirm.
# Base URL of the House Clerk's electronic voting pages; per-year index pages
# are requested as `<root>/<year>/index.asp`.
house_url_root = 'http://clerk.house.gov/evs'

# Years to scan: 1990 through 2018 inclusive (the range end is exclusive).
date_range = list(range(1990, 2019))


def house_year_iterator(date_range, root_url, timeout=30):
    """Request the House roll-call index page of each year and report failures.

    For every year in ``date_range`` a GET is issued for
    ``<root_url>/<year>/index.asp``. Whenever the request raises or comes back
    with a status code other than 200, a diagnostic block is printed and the
    loop pauses for three seconds before moving on. The responses themselves
    are discarded.

    Parameters
    ----------
    date_range : iterable of int
        Years to request. May be empty, in which case nothing is requested.
    root_url : str
        Base URL, without a trailing slash, that the year is appended to.
    timeout : float, optional
        Seconds to wait for each request before giving up (default 30), so a
        stalled server cannot hang the whole scan.

    Returns
    -------
    None
        Progress and errors are reported through ``print`` only.
    """
    # Materialise once: min()/max() below would otherwise exhaust a generator
    # before the loop gets to see it.
    years = list(date_range)
    if not years:
        # min()/max() raise on an empty sequence and there would be no
        # "last url" to report, so bail out early with a clear message.
        print('No years supplied; no House data requested.')
        return

    print('_______________')
    print('Beginning iterations for House data for years {} to {}'.format(min(years), max(years)))
    print('_______________')
    for yr in years:
        site_url = '{}/{}/index.asp'.format(root_url, yr)
        try:
            req = requests.get(site_url, timeout=timeout)
        except requests.RequestException as err:
            # Connection errors and timeouts are reported like a bad status
            # code instead of aborting the remaining years.
            tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
            print('_______________')
            print('_______________')
            print('Error requesting {}'.format(site_url))
            print('Request failed: {}, {}'.format(err, tstamp))
            sleep(3)
            continue

        tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
        stat_code = req.status_code
        if stat_code != 200:
            print('_______________')
            print('_______________')
            print('Error requesting {}'.format(site_url))
            print('Request Status Code: {}, {}'.format(stat_code, tstamp))
            # back off briefly after a failed request
            sleep(3)

    print('Iterations through years of House data complete')
    print('Last url requested: {}'.format(site_url))
    print("Examine output above for occurrences in request errors, if any.")

In [18]:
# Probe every yearly index page once and print any request that fails.
house_year_iterator(date_range=date_range, root_url=house_url_root)

_______________
Beginning iterations for House data for years 1990 to 2018
Iterations through years of House data complete
Last url requested: http://clerk.house.gov/evs/2018/index.asp
Examine output above for occurrences in request errors, if any.


In [4]:
# test code with year 1990 for House votes:
# fetch one yearly index page and print its URL, status code and a timestamp.
site_url_root = 'http://clerk.house.gov/evs'
yr = '1990'

site_url = '{}/{}/index.asp'.format(site_url_root, yr)
# The timeout keeps the cell from hanging indefinitely if the server stalls.
req = requests.get(site_url, timeout=30)
tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
stat_code = req.status_code
print('_______________')
print('_______________')
print(site_url)
print('Request Status Code: {}, {}'.format(stat_code, tstamp))

_______________
_______________
http://clerk.house.gov/evs/1990/index.asp
Request Status Code: 200, 12-10-2018 11:09:28


In [5]:
# Parse the response body of `req` and pull out the rows of the first <table>.
soup = BeautifulSoup(req.content, features='lxml')
# print(soup.prettify())

# first <table> element in the document
table = soup.find(name='table')
# print(table.prettify())

# every <tr> inside that table, header row included
rows = table.find_all(name='tr')

In [6]:
# initial request of webpage will show the final table with the most recent roll call votes
# get the largest value of roll for iteration:
# take the row right after the header, read the text of its first link, and
# convert that text to an integer.
newest_row = rows[1]
first_link = newest_row.find_all('a')[0]
final_roll = int(first_link.text.strip())

In [8]:
# Base URL that the per-roll XML addresses (`<head>/<year>/roll<NNN>.xml`) are built from.
vote_url_head = 'http://clerk.house.gov/evs'

In [22]:
# get voting data by iterating through roll ids:
# request the XML record of every roll from 1 up to `final_roll` for year `yr`
# and report each request that does not come back with HTTP 200.
# Note that the loop rebinds `req` and `stat_code` on every pass, so after this
# cell they describe the last roll requested, not the yearly index page.
vote_table_url = None  # remains None when final_roll < 1, so the summary still prints
for i in range(1, final_roll + 1):
    # convert roll id to 3-digits for url (zero-padded on the left)
    three_digit_roll = str(i).zfill(3)
    vote_table_url = '{}/{}/roll{}.xml'.format(vote_url_head, yr, three_digit_roll)
#     print(vote_table_url)
    # The timeout keeps one stalled request from hanging the whole loop.
    req = requests.get(vote_table_url, timeout=30)
    stat_code = req.status_code

    # print verification that iterator is working
    if i % 100 == 0:
        print('...................')
        print('On Roll ID {}'.format(i))

    if stat_code != 200:
        # Report the roll URL that actually failed, stamped with the time of
        # the failure. A separate name is used so the notebook-level `tstamp`
        # is left untouched.
        error_tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
        print('_______________')
        print('_______________')
        print(vote_table_url)
        print('Request Status Code: {}, {}'.format(stat_code, error_tstamp))

#     sleep(3)

print('Iterations through rolls for year {} complete.'.format(yr))
print('Last url: {}'.format(vote_table_url))
print("Examine output above for occurrences in request errors, if any.")


In [None]:
# Accumulator for the parsed table rows.
all_rows = []

# Template record: every field except `year` starts out unset.
empty_row = dict(
    year=yr,
    roll=None,
    date=None,
    issue=None,
    question=None,
    result=None,
    description=None,
)

In [None]:
# skip the header when reading table; for each remaining row, copy the
# template record and fill in `roll` and `issue` from the first and third cells.
for row in rows[1:]:
    columns = row.find_all('td')
    new_row = dict(empty_row)
    new_row.update(
        roll=columns[0].text.strip(),
        issue=columns[2].text.strip(),
    )
    all_rows.append(new_row)

In [None]:
# Display every record collected in `all_rows` so far.
all_rows

In [None]:
# if req.status_code == 200:
#################
# Load `site_url` through `browser`, parse the rendered page source and print
# the first <div class="row"> element.
# NOTE(review): `browser` is not defined in any visible cell -- presumably a
# Selenium WebDriver created elsewhere; confirm before running on a fresh kernel.
browser.get(site_url)
soup = BeautifulSoup(browser.page_source, features='lxml')
# print(soup.prettify())
div = soup.find('div', class_='row')
print(div.prettify())

In [None]:
# add page html to mongo: store the raw body of `req` under the key 'lxml'.
# NOTE(review): `collection_name` is not defined in any visible cell -- it looks
# like a placeholder for a real collection handle; confirm which one is meant.
collection_name.insert_one({'lxml': req.content})


# print result of load: append one "url, status code, timestamp" line to the log.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data/load_results.txt', 'a') as f:
    f.write('{}, {}, {}\n'.format(site_url, stat_code, tstamp))


##################
# else: 
#     print('failed to get {}'.format(site_url))
#     # print result of load
#     with open('data/logs/load_results.txt', 'a') as f:
#         f.writelines('{}, {}, {}\n'.format(site_url, stat_code, tstamp))
#     f.close()