In [54]:
import urllib
from contextlib import closing

import pandas as pd
from bs4 import BeautifulSoup

try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2

# Townhall.com has well-formatted county-level result tables for the 2016 presidential general election
# credit to the web and data teams from townhall.com for putting these data together in a rational format

In [55]:
# Each page also carries a summary table that rolls results up to the state
# level; the filter below keeps the result tables and rejects that summary.
def cond(x):
    """Class-attribute filter handed to BeautifulSoup when selecting tables.

    Returns True when the class string `x` starts with "table ec-table" and
    does not contain "table ec-table ec-table-summary". A missing or empty
    class value yields False.
    """
    if not x:
        return False
    looks_like_result_table = x.startswith("table ec-table")
    is_summary_table = "table ec-table ec-table-summary" in x
    return looks_like_result_table and not is_summary_table

In [56]:
# Two-letter abbreviations of the 50 states, in the order their pages are fetched.
# NOTE(review): "DC" is not in this list - confirm whether that is intentional.
states = (
    'AL AK AZ AR CA CO CT DE FL GA '
    'HI ID IL IN IA KS KY LA ME MD '
    'MA MI MN MS MO MT NE NV NH NJ '
    'NM NY NC ND OH OK OR PA RI SC '
    'SD TN TX UT VT VA WA WV WI WY'
).split()

# Accumulator for the scraped rows; its first entry is the header row used
# for the csv export.
data = [
    ['State', 'County', '% Reporting', 'Candidate', 'Party', 'Votes', '% Won'],
]

In [74]:
# Scrape each state's page http://townhall.com/election/2016/president/%s/county
# (where %s is the state abbr) and append one row per county/candidate to `data`.

def _split_county_cell(text):
    """Split the text of a county cell into (county name, percent reporting).

    The text is expected to be the county name followed by a percentage token
    such as "100%". County names can span several words (e.g. "St. Clair"),
    so everything before the first token ending in "%" is taken as the name.
    When no such token is present the whole text is returned as the name and
    the percentage is an empty string.
    """
    tokens = text.split()
    for position, token in enumerate(tokens):
        if token.endswith('%'):
            return ' '.join(tokens[:position]), token
    return ' '.join(tokens), ''


# Drop rows left over from an earlier run of this cell so that re-running it
# does not append duplicates; the header row at index 0 is kept.
del data[1:]

for state in states:
    url = 'http://townhall.com/election/2016/president/' + state + '/county'
    # closing() releases the HTTP response on both Python 2 and Python 3
    with closing(urlopen(url)) as response:
        r = response.read()
    soup = BeautifulSoup(r, "html.parser")

    # every <table> whose class attribute is accepted by cond()
    tables = soup.find_all('table', attrs={'class': cond})

    for table in tables:
        # only top-level tables: skip any table nested inside another table
        if table.find_parent("table") is not None:
            continue

        table_body = table.find('tbody')
        if table_body is None:
            # nothing to read from a table without a <tbody>
            continue

        for row in table_body.find_all('tr'):
            cols = row.find_all('td')
            if len(cols) < 3:
                # too few <td> cells to be a result row
                continue

            if len(cols) == 4:
                # first row of a county: county cell + candidate, votes, % won
                county, per_reporting = _split_county_cell(cols[0].text)
                candidate = cols[1].text.strip()
                party = cols[1]['class'][0]
                votes = str(cols[2].text.strip())
                per_won = cols[3].text.strip()
            else:
                # following rows have no county cell; county and per_reporting
                # carry over from the most recent four-cell row
                candidate = cols[0].text.strip()
                # NOTE(review): party is read from the vote-count cell here but
                # from the candidate cell in the four-cell branch - confirm that
                # both cells carry the party class on the page.
                party = cols[1]['class'][0]
                votes = str(cols[1].text.strip())
                per_won = cols[2].text.strip()

            # combine each row's results
            rowData = [state, county, per_reporting, candidate, party, votes, per_won]
            data.append(rowData)

In [75]:
# Turn the accumulated rows into a DataFrame. Row 0 of `data` holds the column
# names, so it is lifted out as the header and removed from the body.
scraped_table = pd.DataFrame(data)
new_header = scraped_table.iloc[0]   # header row, used as the column labels
results = scraped_table.iloc[1:]     # every row after the header
results.columns = new_header
results

Unnamed: 0,State,County,% Reporting,Candidate,Party,Votes,% Won
1,AL,Autauga,100%,Donald Trump,GOP,18110,73.4%
2,AL,Autauga,100%,Hillary Clinton,DEM,5908,24.0%
3,AL,Autauga,100%,Gary Johnson,IND,538,2.2%
4,AL,Autauga,100%,Jill Stein,IND,105,0.4%
5,AL,Baldwin,100%,Donald Trump,GOP,72780,77.4%
6,AL,Baldwin,100%,Hillary Clinton,DEM,18409,19.6%
7,AL,Baldwin,100%,Gary Johnson,IND,2448,2.6%
8,AL,Baldwin,100%,Jill Stein,IND,453,0.5%
9,AL,Barbour,100%,Donald Trump,GOP,5431,52.3%
10,AL,Barbour,100%,Hillary Clinton,DEM,4848,46.7%


In [76]:
# Write the results table to a comma-separated file in the working directory.
output_path = '2016_US_County_Level_Presidential_Results.csv'
results.to_csv(output_path, sep=',')