In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import cfscrape
import urllib
from IPython.display import display, display_pretty, Javascript, HTML

ImportError: No module named cfscrape

# Flow

- [Download and preprocess county-level results](#Townhall-data)
- [Download and preprocess county-level metadata](#Census-data)
- [Combine datasets](#Combine-data)
- [Export county-level results](#Export-data)
- [Visualize](#Visualize)

## Townhall data

In [2]:
# each page has a summary table that rolls up results at the state level
# get rid of it
def cond(x):
    """Class-attribute predicate that keeps results tables and drops the summary.

    Returns True only when ``x`` is a non-empty string that starts with
    "table ec-table" and does not contain "table ec-table ec-table-summary".
    Any falsy ``x`` (None, empty string) yields False.
    """
    return bool(x) and x.startswith("table ec-table") and "table ec-table ec-table-summary" not in x

In [3]:
# two-letter abbreviations for the 50 states plus DC, in scrape order
states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL',
    'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
    'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH',
    'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI',
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI',
    'WY',
]

# even-numbered election years 2010-2016, as strings
years = list(map(str, range(2010, 2017, 2)))

# row accumulator for the csv export; the first entry is the header row
data = [['state_abbr', 'county_name', 'year', 'candidate', 'votes_total']]

In [4]:
# Scrape county-level Senate results for every (year, state) pair from
# https://townhall.com/election/<year>/senate/<state>/county and append one
# [state, county, year, candidate, votes] row per candidate to `data`.

def _parse_votes(cells, index):
    """Return the vote count held in ``cells[index]`` as an int, or None.

    Commas are removed and '-' is replaced with '0' before conversion.
    A missing cell or non-numeric text yields None instead of raising.
    """
    try:
        return int(cells[index].text.strip().replace(',', '').replace('-', '0'))
    except (ValueError, IndexError):
        # best effort: keep the row, record the count as missing
        return None


# create the scraper once and reuse it for every request
scraper = cfscrape.create_scraper()

for year in years:
    for state in states:

        url = 'https://townhall.com/election/' + year + '/senate/' + state + '/county'
        r = scraper.get(url).content
        soup = BeautifulSoup(r, "lxml")

        # <table> tags whose class passes `cond` (the state-level summary is rejected)
        tables = soup.find_all("table", attrs={'class': cond})

        for table in tables:
            # ignore tables nested inside another table
            if table.findParent("table") is not None:
                continue

            table_body = table.find('tbody')
            if table_body is None:
                # no <tbody> means there are no result rows to read
                continue

            rows = table_body.find_all('tr')
            for row in rows:
                cols = row.find_all('td')
                if len(cols) == 4:
                    # four-cell row: county name, candidate, votes, ...
                    divs = cols[0].find_all('div')
                    county = divs[0].text.strip()
                    candidate = cols[1].text.strip()
                    total_votes = _parse_votes(cols, 2)
                else:
                    # other rows: candidate, votes, ...; `county` carries over
                    # from the most recent four-cell row
                    candidate = cols[0].text.strip()
                    total_votes = _parse_votes(cols, 1)

                # combine each row's results
                rowData = [state, county, year, candidate, total_votes]
                data.append(rowData)

        # progress output; the call form runs on both Python 2 and Python 3
        print(state)
        print(year)

NameError: name 'cfscrape' is not defined

In [35]:
# Build the results frame: data[0] is the header row, data[1:] the scraped rows.
# Supplying the header through `columns=` keeps it out of the data, so the
# column axis carries no stray "0" label. The explicit index keeps row labels
# starting at 1, matching what slicing off the header row produced.
townhall = pd.DataFrame(data[1:], columns=data[0], index=range(1, len(data)))
townhall['votes_total'] = townhall['votes_total'].astype('float64')  # None becomes NaN
print(townhall.shape[0])
townhall.head()

31803


Unnamed: 0,state_abbr,county_name,year,candidate,votes_total
1,AL,Autauga,2010,Richard Shelby,13980.0
2,AL,Autauga,2010,William Barnes,3978.0
3,AL,Baldwin,2010,Richard Shelby,46471.0
4,AL,Baldwin,2010,William Barnes,11027.0
5,AL,Barbour,2010,Richard Shelby,4242.0


In [36]:
# inspect the rows of a single state (here: AK)
townhall.loc[townhall['state_abbr'].eq('AK')]

Unnamed: 0,state_abbr,county_name,year,candidate,votes_total
135,AK,District 1,2010,Lisa Murkowski,2534.0
136,AK,District 1,2010,Joe Miller,1776.0
137,AK,District 1,2010,Scott McAdams,1043.0
138,AK,District 10,2010,Joe Miller,1514.0
139,AK,District 10,2010,Lisa Murkowski,1307.0
140,AK,District 10,2010,Scott McAdams,757.0
141,AK,District 11,2010,Joe Miller,4268.0
142,AK,District 11,2010,Lisa Murkowski,2209.0
143,AK,District 11,2010,Scott McAdams,933.0
144,AK,District 12,2010,Joe Miller,2886.0


In [37]:
# spot-check a special case: Carson City, NV
in_nevada = townhall['state_abbr'] == 'NV'
is_carson_city = townhall['county_name'] == 'Carson City'
print(townhall[in_nevada & is_carson_city])

0     state_abbr  county_name  year                 candidate  votes_total
3900          NV  Carson City  2010             Sharron Angle       9362.0
3901          NV  Carson City  2010                Harry Reid       8714.0
3902          NV  Carson City  2010             Jesse Holland        518.0
8987          NV  Carson City  2012               Dean Heller      13488.0
8988          NV  Carson City  2012           Shelley Berkley       7509.0
8989          NV  Carson City  2012          David VanDerBeek       1153.0
8990          NV  Carson City  2012  None of these candidates        982.0
28847         NV  Carson City  2016                  Joe Heck      13027.0
28848         NV  Carson City  2016    Catherine Cortez Masto       9741.0
28849         NV  Carson City  2016  None of these candidates        895.0
28850         NV  Carson City  2016                 Tom Jones        448.0
28851         NV  Carson City  2016                Tom Sawyer        358.0
28852         NV  Carson 

In [38]:
# rename counties: every DC row, plus Sainte Genevieve and Oglala Lakota
townhall.loc[townhall['state_abbr'] == 'DC', 'county_name'] = 'District of Columbia'
for old_name, new_name in [('Sainte Genevieve', 'Ste. Genevieve County'),
                           ('Oglala Lakota', 'Oglala')]:
    townhall.loc[townhall['county_name'] == old_name, 'county_name'] = new_name

# show every row that now carries one of the corrected names
corrected_names = ['District of Columbia', 'Ste. Genevieve County', 'Oglala']
print(townhall[townhall['county_name'].isin(corrected_names)])

0     state_abbr            county_name  year            candidate  \
3828          MO  Ste. Genevieve County  2010            Roy Blunt   
3829          MO  Ste. Genevieve County  2010       Robin Carnahan   
3830          MO  Ste. Genevieve County  2010        Jonathan Dine   
8561          MO  Ste. Genevieve County  2012     Claire McCaskill   
8562          MO  Ste. Genevieve County  2012            Todd Akin   
8563          MO  Ste. Genevieve County  2012        Jonathan Dine   
14066         DC   District of Columbia  2014         Paul Strauss   
14067         DC   District of Columbia  2014    David Schwartzman   
14068         DC   District of Columbia  2014      Glenda Richmond   
14069         DC   District of Columbia  2014          John Daniel   
28727         MO  Ste. Genevieve County  2016            Roy Blunt   
28728         MO  Ste. Genevieve County  2016         Jason Kander   
28729         MO  Ste. Genevieve County  2016        Jonathan Dine   
28730         MO  St

In [39]:
# expand the literal 'Co.' to 'County' (to match the census county name) and trim whitespace
townhall['county_name'] = townhall['county_name'].map(
    lambda name: name.replace('Co.', 'County').strip()
)
nv_carson_city = (townhall['state_abbr'] == 'NV') & (townhall['county_name'] == 'Carson City')
print(townhall[nv_carson_city])

0     state_abbr  county_name  year                 candidate  votes_total
3900          NV  Carson City  2010             Sharron Angle       9362.0
3901          NV  Carson City  2010                Harry Reid       8714.0
3902          NV  Carson City  2010             Jesse Holland        518.0
8987          NV  Carson City  2012               Dean Heller      13488.0
8988          NV  Carson City  2012           Shelley Berkley       7509.0
8989          NV  Carson City  2012          David VanDerBeek       1153.0
8990          NV  Carson City  2012  None of these candidates        982.0
28847         NV  Carson City  2016                  Joe Heck      13027.0
28848         NV  Carson City  2016    Catherine Cortez Masto       9741.0
28849         NV  Carson City  2016  None of these candidates        895.0
28850         NV  Carson City  2016                 Tom Jones        448.0
28851         NV  Carson City  2016                Tom Sawyer        358.0
28852         NV  Carson 

In [40]:
# build the 'combined' key: state abbreviation + county name lower-cased with spaces removed
county_key = townhall['county_name'].map(lambda name: name.replace(' ', '').lower())
townhall['combined'] = townhall['state_abbr'] + county_key
carson_city_rows = townhall[(townhall['state_abbr'] == 'NV') & (townhall['county_name'] == 'Carson City')]
print(carson_city_rows)

0     state_abbr  county_name  year                 candidate  votes_total  \
3900          NV  Carson City  2010             Sharron Angle       9362.0   
3901          NV  Carson City  2010                Harry Reid       8714.0   
3902          NV  Carson City  2010             Jesse Holland        518.0   
8987          NV  Carson City  2012               Dean Heller      13488.0   
8988          NV  Carson City  2012           Shelley Berkley       7509.0   
8989          NV  Carson City  2012          David VanDerBeek       1153.0   
8990          NV  Carson City  2012  None of these candidates        982.0   
28847         NV  Carson City  2016                  Joe Heck      13027.0   
28848         NV  Carson City  2016    Catherine Cortez Masto       9741.0   
28849         NV  Carson City  2016  None of these candidates        895.0   
28850         NV  Carson City  2016                 Tom Jones        448.0   
28851         NV  Carson City  2016                Tom Sawyer   

In [41]:
# write the processed results to disk with pandas' default to_csv options
output_path = 'senate_results.csv'
townhall.to_csv(output_path)