# House Scraper

In [74]:
import scipy.linalg as sci
import numpy as np
import requests
import xml.etree.ElementTree as ET
import pandas as pd
import sympy as sp
import seaborn as sns
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup as bs
import math
import time
from functools import reduce
import re


In [4]:
def fetch_page(url):
    response = requests.get(url)
    # 200 is a success code
    if response.status_code == 200:
        return response.content
    else:
        print("Failed to fetch page:", url)
        return None

In [97]:
base_house_url = f'https://clerk.house.gov/Votes/'

dict={'Aye':1,'No':-1,'Yea':1,'Nay':-1,'Present':0,'Not Voting':0,'Guilty':1,'Not Guilty':-1}

# Presidential control: 1 = R, -1 = D
# House/Senate Margin: R-D (More Rs = positive, more Ds = negative)

house_df = pd.DataFrame(columns=['Session','House','H_R_error','H_D_error'])

congress_list = np.arange(101.5,118.5,0.5)
congress_list = np.arange(117,118,0.5)

for l in congress_list:
    congress = int(l)
    if l%1 == .5:
        session = 2
    else:
        session = 1
    year = int(l*2+1787)

    print(f'Beginning scrape of Congress {congress}, Session {session} which occurred in {year}')

    roll_call_lists = []

    # Grab each vote page in a Congressional Session.
    i = 0
    while True:
        i += 1
        url = base_house_url + str(year) + "{:03d}".format(i)
        
        page_data = fetch_page(url)
        if page_data:
            
            soup = bs(page_data, 'html.parser')
            votes = soup.find_all('a',class_="library-link")
            if soup.find('h1').get_text() == 'Roll call vote not available':
                print('Finished all votes in this Congressional Session!')
                break
            member_data = []

            for v in votes:
                vote = v.get('aria-label').split(' ')
                vote = vote[-1]
                rep = v.get_text() + "_" + re.search(r'of\s+(\w+)', v.get('aria-label')).group(1)
                member_data.append({'name':rep,'vote':vote})
            
            roll_call_lists.append(member_data)

        else:
            break

    df=pd.DataFrame(roll_call_lists[0]).drop(columns='vote',axis=1) # Create the first column of the dataframe
    # Assemble the vote data
    vote_data = []
    for i in range(len(roll_call_lists)):
        column_name = f"vote_{i}"
        temp_d = pd.DataFrame(roll_call_lists[i])
        vote_list = temp_d['vote'].map(dict)  
        # Append the vote list as a Series to vote_data
        vote_data.append(vote_list.rename(column_name))
    # Concatenate all the vote data into a single DataFrame
    vote_df = pd.concat(vote_data, axis=1)
    df = pd.concat([df, vote_df], axis=1)
    df = df.fillna(0)
    print(f'Votes assembled. {len(df)} Reps, {len(df.columns)-1} votes')

    # Use SVD
    M1=df.drop(columns=['name'],axis=1).to_numpy()
    U,s,Vt=sci.svd(M1)
    U_full=pd.DataFrame(U)

    # Identify the first 2 columns
    U_first_two=U_full[[0,1]] # Grab the first 2 columns
    U_first_two=pd.concat([df['name'].astype(str),U_first_two],axis=1) # Add the names (which are still formatted with party and state)

    # Create the Party and State columns
    U_first_two[['Name','Party']] = U_first_two['name'].str.split('_',expand=True)
    U_first_two['Party'] = U_first_two['Party'].replace({'Republican':'R','Democratic':'D'})
    U_first_two['State'] = False

    # Clean up excess columns
    U_first_two = U_first_two.drop(columns=['name'],axis=1)
    U_first_two
    U_first_two.columns=['0','1','Name','Party','State']

    # Find party center
    H_R=U_first_two[U_first_two['Party']=='R'].drop(['Party','State','Name'],axis=1)
    H_D=U_first_two[U_first_two['Party']=='D'].drop(['Party','State','Name'],axis=1)
    Rcenter = np.mean(H_R,axis=0) # Returns a set of 2 values (xy coords)
    Dcenter = np.mean(H_D,axis=0) # Returns a set of 2 values (xy coords)

    # Calculate party errors (RMSE)
    R_error=[]
    for i in range(0,len(H_R)):
        dist=math.dist(H_R.iloc[i],Rcenter)
        R_error.append(dist**2)
    R_RMSE=np.sqrt(np.mean(R_error))
    print(f'R Error: {R_RMSE}')

    D_error=[]
    for i in range(0,len(H_D)):
        dist=math.dist(H_D.iloc[i],Dcenter)
        D_error.append(dist**2)
    D_RMSE=np.sqrt(np.mean(D_error))
    print(f'D Error: {D_RMSE}')

    house_df.loc[len(house_df)] = {'Session':(congress,session),'House':len(H_R)-len(H_D),'H_R_error':R_RMSE,'H_D_error':D_RMSE}


Beginning scrape of Congress 117, Session 1 which occurred in 2021
Finished all votes in this Congressional Session!
Votes assembled. 434 Reps, 449 votes
R Error: 0.04693074612430461
D Error: 0.04250269028769032
Beginning scrape of Congress 117, Session 2 which occurred in 2022
Finished all votes in this Congressional Session!
Votes assembled. 438 Reps, 549 votes
R Error: 0.044341897287638016
D Error: 0.042504928986901565


In [98]:
house_df

Unnamed: 0,Session,House,H_R_error,H_D_error
0,"(117, 1)",-10,0.046931,0.042503
1,"(117, 2)",-9,0.044342,0.042505


In [89]:
session = 1
congress = 117
year = 2021


# Presidential control: 1 = R, -1 = D
# House/Senate Margin: R-D (More Rs = positive, more Ds = negative)

house_df = pd.DataFrame(columns=['Session','House','H_R_error','H_D_error'])



print(f'Beginning scrape of Congress {congress}, Session {session} which occurred in {year}')

roll_call_lists = []

# Grab each vote page in a Congressional Session.
i = 0
while True:
    i += 1
    url = base_house_url + str(year) + "{:03d}".format(i)
    
    page_data = fetch_page(url)
    if page_data:
        
        soup = bs(page_data, 'html.parser')
        votes = soup.find_all('a',class_="library-link")
        if soup.find('h1').get_text() == 'Roll call vote not available' or i>6:
            print('Finished all votes in this Congressional Session!')
            break
        member_data = []

        for v in votes:
            vote = v.get('aria-label').split(' ')
            vote = vote[-1]
            rep = v.get_text() + "_" + re.search(r'of\s+(\w+)', v.get('aria-label')).group(1)
            member_data.append({'name':rep,'vote':vote})
        
        roll_call_lists.append(member_data)

    else:
        break

df=pd.DataFrame(roll_call_lists[0]).drop(columns='vote',axis=1) # Create the first column of the dataframe
# Assemble the vote data
vote_data = []
for i in range(len(roll_call_lists)):
    column_name = f"vote_{i}"
    temp_d = pd.DataFrame(roll_call_lists[i])
    vote_list = temp_d['vote'].map(dict)  
    # Append the vote list as a Series to vote_data
    vote_data.append(vote_list.rename(column_name))
# Concatenate all the vote data into a single DataFrame
vote_df = pd.concat(vote_data, axis=1)
df = pd.concat([df, vote_df], axis=1)
df = df.fillna(0)
print(f'Votes assembled. {len(df)} Reps, {len(df.columns)-1} votes')



# Use SVD
M1=df.drop(columns=['name'],axis=1).to_numpy()
U,s,Vt=sci.svd(M1)
U_full=pd.DataFrame(U)

# Identify the first 2 columns
U_first_two=U_full[[0,1]] # Grab the first 2 columns
U_first_two=pd.concat([df['name'].astype(str),U_first_two],axis=1) # Add the names (which are still formatted with party and state)

# Create the Party and State columns
U_first_two[['Name','Party']] = U_first_two['name'].str.split('_',expand=True)
U_first_two['Party'] = U_first_two['Party'].replace({'Republican':'R','Democratic':'D'})
U_first_two['State'] = False

# Clean up excess columns
U_first_two = U_first_two.drop(columns=['name'],axis=1)
U_first_two
U_first_two.columns=['0','1','Name','Party','State']

# Find party center
H_R=U_first_two[U_first_two['Party']=='R'].drop(['Party','State','Name'],axis=1)
H_D=U_first_two[U_first_two['Party']=='D'].drop(['Party','State','Name'],axis=1)
Rcenter = np.mean(H_R,axis=0) # Returns a set of 2 values (xy coords)
Dcenter = np.mean(H_D,axis=0) # Returns a set of 2 values (xy coords)

# Calculate party errors (RMSE)
R_error=[]
for i in range(0,len(H_R)):
    dist=math.dist(H_R.iloc[i],Rcenter)
    R_error.append(dist**2)
R_RMSE=np.sqrt(np.mean(R_error))
print(f'R Error: {R_RMSE}')

D_error=[]
for i in range(0,len(H_D)):
    dist=math.dist(H_D.iloc[i],Dcenter)
    D_error.append(dist**2)
D_RMSE=np.sqrt(np.mean(D_error))
print(f'D Error: {D_RMSE}')

house_df.loc[len(house_df)] = {'Session':(congress,session),'House':len(H_R)-len(H_D),'H_R_error':R_RMSE,'H_D_error':D_RMSE}


Beginning scrape of Congress 117, Session 1 which occurred in 2021
End of the line, bucko
Votes assembled. 434 Reps, 6 votes
R Error: 0.04909436028969164
D Error: 0.04825871036634522


In [92]:
house_df


Unnamed: 0,Session,House,H_R_error,H_D_error
0,"(117, 2)",-9,0.065741,0.039998


# Some Test Stuff

In [75]:
page_data = fetch_page('https://clerk.house.gov/Votes/2021444')
if page_data:
    soup = bs(page_data, 'html.parser')

print(soup.find('h1').get_text())

dict={'Aye':1,'No':-1,'Present':0,'Not Voting':0,'Guilty':1,'Not Guilty':-1}

votes = soup.find_all('a',class_="library-link")

vote_list = []
rep_list = []

for v in votes:
    vote = v.get('aria-label').split(' ')
    vote = dict.get(vote[-1],0)
    vote_list.append(vote)
    rep = v.get_text() + " " + re.search(r'of\s+(\w+)', v.get('aria-label')).group(1)
    rep_list.append(rep)

this_vote = pd.DataFrame({'reps':rep_list,'this_v_tally':vote_list})
this_vote



                                Roll Call 444
 | Bill Number: H. Res. 849 


Unnamed: 0,reps,this_v_tally
0,Adams Democratic,0
1,Aderholt Republican,0
2,Aguilar Democratic,0
3,Allen Republican,0
4,Allred Democratic,0
...,...,...
428,Wittman Republican,0
429,Womack Republican,0
430,Yarmuth Democratic,0
431,Young Republican,0
