In [143]:
# %%writefile vote_scraper.py
from pymongo import MongoClient
import pprint 
import pandas as pd 
import copy
from bs4 import BeautifulSoup
import requests
import datetime

from time import sleep
import warnings

import codecs
import json

import boto


def write_json_file(obj, path):
    '''Append an object to a file as a single line of JSON.

    Args:
        obj: Any object that ``json.dumps`` can serialize.
        path: Destination file. It is opened in append mode with UTF-8
            encoding, so repeated calls build up a JSON Lines file.
    '''
    json_record = json.dumps(obj, ensure_ascii=False)
    # The context manager guarantees the handle is flushed and closed; the
    # previous version referenced ``f.close`` without ever calling it.
    with codecs.open(path, 'a', 'utf-8') as f:
        f.write(json_record + '\n')
    

def get_all_votes(date_range, root_url):
    '''Scrape the House roll call summary tables for each year given.

    For every year the index page ``{root_url}/{yr}/index.asp`` is requested.
    On a 200 response the final roll id is looked up with ``get_final_roll``
    and the summary tables are written out by ``get_table_summary``. On any
    other status the error is printed and the loop pauses for 3 seconds
    before moving on to the next year.

    Args:
        date_range: Iterable of years to scrape. An empty iterable is
            reported and skipped instead of raising.
        root_url: Base URL under which the per-year pages live.
    '''
    # Materialize once so min/max and the loop all see the same values, even
    # if the caller passed a one-shot iterator.
    years = list(date_range)
    if not years:
        print('No years supplied; nothing to scrape.')
        return

    first_yr, last_yr = min(years), max(years)
    print('_______________')
    print('Beginning iterations for House summary data for years {} to {}'.format(first_yr, last_yr))
    print('_______________')

    for yr in years:
        site_url = '{}/{}/index.asp'.format(root_url, yr)
        req = requests.get(site_url)
        stat_code = req.status_code

        if stat_code == 200:
            final_roll = get_final_roll(root_url, yr)
            get_table_summary(root_url, yr, final_roll)
        else:
            tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
            print('_______________')
            print('_______________')
            print('Error requesting {}'.format(site_url))
            print('Request Status Code: {}, {}'.format(stat_code, tstamp))
            sleep(3)

    print('_______________')
    print('_______________')
    print('Iterations through years {} to {} of House summary data complete'.format(first_yr, last_yr))
    print('Last url requested: {}'.format(site_url))
    print("Examine output above for occurrences in request errors, if any.")

    
def get_final_roll(site_url_root, yr):
    '''Return the highest roll call id listed on a year's index page.

    The page ``{site_url_root}/{yr}/index.asp`` is requested and the text of
    the first link in the second row of its first table is read as an
    integer. That row is assumed to hold the most recent (largest) roll id.

    Args:
        site_url_root: Base URL under which the per-year pages live.
        yr: Year whose index page is inspected.

    Returns:
        The final roll id for the year, as an int.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the page has no table, or the table has no data row
            containing a link.
    '''
    site_url = '{}/{}/index.asp'.format(site_url_root, yr)
    req = requests.get(site_url)
    # Fail on an error page here instead of trying to parse it.
    req.raise_for_status()

    # use BeautifulSoup to find the data we need.
    soup = BeautifulSoup(req.content, 'lxml')
    table = soup.find('table')
    if table is None:
        raise ValueError('No roll call table found at {}'.format(site_url))

    # rows[1] is the first row after the header row.
    rows = table.find_all('tr')
    links = rows[1].find_all('a') if len(rows) > 1 else []
    if not links:
        raise ValueError('No roll call link found in table at {}'.format(site_url))

    final_roll = int(links[0].text.strip())
    print('Year: {}'.format(yr))
    print('Final Roll ID: {}'.format(final_roll))

    return final_roll


def get_table_summary(root_url, yr, final_roll_id):
    '''Fetch every roll summary table for a year and append its rows to disk.

    One page is requested per block of 100 roll ids
    (``ROLL_000.asp``, ``ROLL_100.asp``, ... up to ``final_roll_id``). The
    rows of the first table on each page are passed to
    ``append_rows_to_file``, which writes them to
    ``../data/roll_summaries.jsonl``. A non-200 response is printed and that
    page is skipped.

    Args:
        root_url: Base URL under which the per-year pages live.
        yr: Year being scraped.
        final_roll_id: Largest roll id for the year.
    '''
    # get roll summaries from tables from links at index on bottom left
    indx_list = [str(i).zfill(3) for i in range(0, final_roll_id + 1, 100)]

    # Bound up front so the closing summary works even if no page is requested.
    vote_table_url = None
    for indx in indx_list:
        vote_table_url = '{}/{}/ROLL_{}.asp'.format(root_url, yr, indx)
        req = requests.get(vote_table_url)
        stat_code = req.status_code

        if stat_code != 200:
            # The previous version printed `site_url` and `tstamp`, neither of
            # which exists in this function, so every failure raised NameError.
            tstamp = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
            print('_______________')
            print('_______________')
            print(vote_table_url)
            print('Request Status Code: {}, {}'.format(stat_code, tstamp))
            continue

        # use BeautifulSoup to find the data we need.
        soup = BeautifulSoup(req.content, 'lxml')
        table = soup.find('table')
        rows = table.find_all('tr')

        append_rows_to_file(rows, yr, '../data/roll_summaries.jsonl')

    print('\tIterations through rolls for year {} complete.'.format(yr))
    print('\tLast url: {}'.format(vote_table_url))
    print("\tExamine output above for occurrences in request errors, if any.")
    print('_______________')

    

# # this one should get the actual vote results
# # NEED MORE WORK ON THIS
# def get_vote_results(root_url, yr, final_roll_id):
#     # get voting data by iterating through roll ids
#     for i in range(1, final_roll_id + 1):
#         # convert roll id to 3-digits for url
#         three_digit_roll = '{}'.format(str(i).zfill(3))
#         vote_table_url = '{}/{}/roll{}.xml'.format(root_url, yr, three_digit_roll)
#         print(vote_table_url)
#         req = requests.get(vote_table_url)
#         stat_code = req.status_code

#         # print verification that iterator is working
#         if i%100 == 0:
#             print('...................')
#             print('...on Roll ID {}'.format(i))

#         if stat_code != 200:
#             print('_______________')
#             print('_______________')
#             print(site_url)
#             print('Request Status Code: {}, {}'.format(stat_code, tstamp))

#         if stat_code == 200:
#             # use BeautifulSoup to find the data we need.
#             soup = BeautifulSoup(req.content, 'lxml')
#             table = soup.find('table')            
#             rows = table.find_all('tr')
            
#             append_rows_to_file(rows, yr, '../data/roll_summaries.jsonl')

            
#     print('Iterations through rolls for year {} complete.'.format(yr))
#     print('Last url: {}'.format(vote_table_url))
#     print("Examine output above for occurrences in request errors, if any.")
#     print('_______________')
    
    

def append_rows_to_file(rows, yr, filename):
    '''Write each data row of a roll summary table to ``filename`` as JSON.

    Args:
        rows: Sequence of table row elements; the first one is treated as
            the header and skipped.
        yr: Year stored under the ``year`` key of every record.
        filename: Path handed to ``write_json_file`` for each record.
    '''
    # Record keys filled from the first six <td> cells, in column order.
    cell_fields = ('roll', 'date', 'issue', 'question', 'result', 'description')

    # skip the header when reading table
    for table_row in rows[1:]:
        cells = table_row.find_all('td')
        record = {'year': yr}
        for position, field in enumerate(cell_fields):
            record[field] = cells[position].text.strip()
        record['timestamp'] = datetime.datetime.now().strftime('%m-%d-%Y %H:%M:%S')
        write_json_file(record, filename)

In [144]:
# Base URL under which the per-year roll call pages are requested.
house_url_root = 'http://clerk.house.gov/evs'

# Trial run over 1990-1994; switch to range(1990, 2019) for the full scrape.
date_range = [*range(1990, 1995)]

get_all_votes(date_range=date_range, root_url=house_url_root)

_______________
Beginning iterations for House data for years 1990 to 1994
_______________
Year: 1990
Final Roll ID: 536
	Iterations through rolls for year 1990 complete.
	Last url: http://clerk.house.gov/evs/1990/ROLL_500.asp
	Examine output above for occurrences in request errors, if any.
_______________
Year: 1991
Final Roll ID: 444
	Iterations through rolls for year 1991 complete.
	Last url: http://clerk.house.gov/evs/1991/ROLL_400.asp
	Examine output above for occurrences in request errors, if any.
_______________
Year: 1992
Final Roll ID: 488
	Iterations through rolls for year 1992 complete.
	Last url: http://clerk.house.gov/evs/1992/ROLL_400.asp
	Examine output above for occurrences in request errors, if any.
_______________
Year: 1993
Final Roll ID: 615
	Iterations through rolls for year 1993 complete.
	Last url: http://clerk.house.gov/evs/1993/ROLL_600.asp
	Examine output above for occurrences in request errors, if any.
_______________
Year: 1994
Final Roll ID: 507
	Iterations

In [112]:
# Fetch the XML page for a single roll call (1990, roll 001) and check
# that the request came back with status 200.
req = requests.get('http://clerk.house.gov/evs/1990/roll001.xml')
req.status_code == 200

True

In [116]:
# Parse the response with the 'lxml' parser and print the tree to inspect it.
# NOTE: the printed output shows the <rollcall-vote> document wrapped in
# <html><body> by this parser.
soup = BeautifulSoup(req.content, 'lxml')
print(soup.prettify())

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rollcall-vote PUBLIC "-//US Congress//DTDs/vote v1.0 20031119 //EN" "http://clerk.house.gov/evs/vote.dtd">
<?xml-stylesheet type="text/xsl" href="http://clerk.house.gov/evs/vote.xsl"?>
<html>
 <body>
  <rollcall-vote>
   <vote-metadata>
    <majority>
     D
    </majority>
    <congress>
     101
    </congress>
    <session>
     2nd
    </session>
    <chamber>
     U.S. House of Representatives
    </chamber>
    <rollcall-num>
     1
    </rollcall-num>
    <legis-num>
     QUORUM 1
    </legis-num>
    <vote-question>
     Call of the House
    </vote-question>
    <vote-type>
     QUORUM
    </vote-type>
    <vote-result>
     Passed
    </vote-result>
    <action-date>
     23-Jan-1990
    </action-date>
    <action-time time-etz="12:31">
     12:31 PM
    </action-time>
    <vote-desc>
    </vote-desc>
    <vote-totals>
     <totals-by-party-header>
      <party-header>
       Party
      </party-header>
      <yea-header>
     

In [115]:
# Look for a <table> element in the parsed roll call page. The output below
# is None: the roll call XML contains no <table>.
table = soup.find('table')
print(table)

None


In [113]:
# Fails with AttributeError because `table` is None for the roll call XML;
# the table-based parsing used for the summary pages does not apply here.
rows = table.find_all('tr')

AttributeError: 'NoneType' object has no attribute 'find_all'

In [80]:
all_rows[0]      #all_rows is a list of dictionaries

{'year': '1990',
 'roll': '536',
 'date': '27-Oct',
 'issue': 'S 280',
 'question': 'SUSPEND THE RULES AND CONCUR IN S. ADTS. TO H. ADT',
 'result': 'F',
 'description': 'NIOBRARA RIVER DESIGNATION'}

In [88]:
    
def write_json_lines_file(list_of_objs, path):
    '''Write an iterable of objects to ``path`` as a JSON Lines file.

    Args:
        list_of_objs: Iterable of JSON-serializable objects; each one
            becomes one line of the output.
        path: Destination file. It is opened in write mode with UTF-8
            encoding, so any existing content is replaced.
    '''
    # The context manager closes the handle even if one of the objects
    # turns out not to be serializable part-way through.
    with codecs.open(path, 'w', 'utf-8') as f:
        for row_object in list_of_objs:
            f.write(json.dumps(row_object, ensure_ascii=False) + '\n')

In [91]:
write_json_file(all_rows[2], '../data/test_data.txt')

In [82]:
!pwd

/Users/u1b1700/Documents/Galvanize/galvanize_capstone/src


In [61]:
# create an AWS S3 connection
# NOTE(review): no credentials are passed here, so boto presumably picks them
# up from its config/environment - confirm before running elsewhere.
conn = boto.connect_s3()
# print(conn)

# list the buckets visible to this connection
conn.get_all_buckets()

[<Bucket: aws-logs-220699157430-us-west-2>,
 <Bucket: capstonetwi>,
 <Bucket: capstoneyelp112018>,
 <Bucket: galvcap-legislation>,
 <Bucket: magdielb1>]

In [59]:
# create a bucket for all of our project data
# not needed after creation
# legislation_bucket = conn.create_bucket('galvcap-legislation')

In [102]:
legislation_bucket.get_all_keys()

[]

In [101]:
# write data to S3
roll_summary_file = legislation_bucket.new_key('roll_summaries.txt')


In [None]:
# 


In [None]:
# if req.status_code == 200:
#################
# NOTE(review): `browser` and `site_url` are not defined in any cell shown
# here; `browser` is presumably a Selenium webdriver - confirm before running.
browser.get(site_url)
soup = BeautifulSoup(browser.page_source, 'lxml')
# print(soup.prettify())
# Grab the first <div class="row"> and print it for inspection.
div = soup.find('div', {'class':'row'})
print(div.prettify())

In [None]:
# add page html to mongo
collection_name.insert_one({'lxml': req.content})


# Append the outcome of this request (url, status code, timestamp) to the
# load log. The `with` block closes the file, so no explicit close is needed.
with open('data/load_results.txt', 'a') as f:
    f.write('{}, {}, {}\n'.format(site_url, stat_code, tstamp))


##################
# else: 
#     print('failed to get {}'.format(site_url))
#     # print result of load
#     with open('data/logs/load_results.txt', 'a') as f:
#         f.writelines('{}, {}, {}\n'.format(site_url, stat_code, tstamp))
#     f.close()