### Election Data 2008 from NYT

This script scrapes county-level results for the 2008 U.S. presidential election from the New York Times website and saves them to a CSV file.

In [25]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

In [26]:
# Build the candidate -> party lookup for one state
def getPartyMap(state_soup):
    """Map each candidate key to a one-letter party code.

    Reads the first table with id ``presidential-results-table`` found in
    ``state_soup``. For every ``<tr>`` in that table's first ``<tbody>`` the
    candidate key is the last space-separated word of the row's first
    ``<th>`` text, with double quotes removed. The text of the row's first
    ``<td class="party">`` is collapsed to ``'D'`` for ``'Dem.'``, ``'R'``
    for ``'Rep.'`` and ``'O'`` for anything else.

    Args:
        state_soup: parsed state results page; any object exposing a
            BeautifulSoup-style ``find_all`` works.

    Returns:
        dict mapping candidate key -> ``'D'``, ``'R'`` or ``'O'``. If two
        rows produce the same key, the later row wins.

    Raises:
        IndexError: if the table, its ``<tbody>``, or a row's name or party
            cell is missing.
    """
    # Any party label not listed here is reported as 'O' (other).
    party_codes = {'Dem.': 'D', 'Rep.': 'R'}

    results_table = state_soup.find_all('table', {'id': 'presidential-results-table'})[0]

    partyMap = {}
    for row in results_table.find_all('tbody')[0].find_all('tr'):
        name = row.find_all('th')[0].get_text().strip().split(' ')[-1].replace('"', '')
        party = row.find_all('td', {'class': 'party'})[0].get_text().strip()
        partyMap[name] = party_codes.get(party, 'O')
    return partyMap

# Generate data row for each county
def getContentForState(state_name, state_soup, state_county_soup):
    """Build one result row per county for a single state.

    The first table with id ``winners-by-county-table`` in
    ``state_county_soup`` supplies the data: header cells 1 and 2 of its
    first ``<tr>`` name the two candidates, and each following ``<tr>``
    describes one county (county name in the first ``<td>``, percentages in
    cells 1 and 3, vote counts in cells 2, 4, ...). Within each county the
    candidates are ordered by vote count, highest first, and every group of
    values is padded with ``None`` up to three entries.

    Args:
        state_name: state label stored in the ``state`` column of every row.
        state_soup: parsed state overview page, passed to ``getPartyMap`` to
            look up each candidate's party.
        state_county_soup: parsed county results page for the same state.

    Returns:
        pandas.DataFrame with the columns listed in the module-level
        ``column_Name`` (read when the function is called) and one row per
        county. Every row carries index label 0.

    Raises:
        IndexError: if the county table, its header row, or an expected cell
            is missing.
        KeyError: if a candidate in the county table header is not a key of
            the party map.
        ValueError: if a percentage or vote cell cannot be parsed as a number.
    """
    county_result = state_county_soup.find_all('table', {'id': 'winners-by-county-table'})[0]
    table_rows = county_result.find_all('tr')

    candidate = [cell.get_text().strip() for cell in table_rows[0].find_all('th')[1:3]]
    partyMap = getPartyMap(state_soup)
    county_party = [partyMap[this_cand] for this_cand in candidate]

    records = []
    for this_county in table_rows[1:]:
        cells = this_county.find_all('td')
        county_name = cells[0].get_text().strip()
        county_pct = [float(cell.get_text().strip().split('%')[0]) for cell in cells[1:4:2]]
        county_vote = [
            int(cell.get_text().strip().replace(',', '').replace('votes', ''))
            for cell in cells[2::2]
        ]

        # Positions ordered by votes, highest first; the sort is stable, so
        # ties keep the order in which the page lists the candidates.
        ranking = sorted(range(len(county_vote)), key=county_vote.__getitem__, reverse=True)

        this_record = [state_name, county_name]
        for values in (candidate, county_vote, county_pct, county_party):
            ranked = [values[i] for i in ranking]
            # Pad to the three slots the schema reserves per group; a
            # non-positive multiplier leaves longer lists untouched.
            this_record.extend(ranked + [None] * (3 - len(ranked)))
        records.append(this_record)

    # Build the frame once: DataFrame.append no longer exists in pandas 2.x
    # and growing a frame row by row is quadratic. The explicit index keeps
    # the label 0 on every row, which is what appending one-row frames
    # produced, so output written from this frame keeps the same index values.
    return pd.DataFrame(records, columns=column_Name, index=[0] * len(records))

In [27]:
# Output schema: row identifiers, then the top three candidates followed by
# their vote counts, vote percentages and party codes.
column_Name = (
    ['state', 'county']
    + ['1st', '2nd', '3rd']
    + ['votes1', 'votes2', 'votes3']
    + ['pct1', 'pct2', 'pct3']
    + ['party1', 'party2', 'party3']
)

# States (plus D.C.) in alphabetical order, with underscores in place of spaces.
state_name = [
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado',
    'Connecticut', 'Delaware', 'District_of_Columbia', 'Florida', 'Georgia',
    'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan',
    'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada',
    'New_Hampshire', 'New_Jersey', 'New_Mexico', 'New_York',
    'North_Carolina', 'North_Dakota', 'Ohio', 'Oklahoma', 'Oregon',
    'Pennsylvania', 'Rhode_Island', 'South_Carolina', 'South_Dakota',
    'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington',
    'West_Virginia', 'Wisconsin', 'Wyoming',
]

# Flag per state: 1 = county data available, 0 = skip.
# Positions 1 (Alaska) and 8 (District_of_Columbia) provide no county data.
county_data_exist = [0 if position in (1, 8) else 1 for position in range(len(state_name))]

# Same flags keyed by state name for lookup inside the scraping loop.
county_exist = dict(zip(state_name, county_data_exist))

In [28]:
# Loop through each state to get the data
_RESULTS_BASE_URL = 'https://www.nytimes.com/elections/2008/results/states/'
_REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs the run

# One UserAgent instance serves the whole run; a Chrome user-agent string is
# still requested from it for every download.
_user_agent = UserAgent()


def _fetch_soup(url):
    """Download ``url`` with a Chrome User-Agent header and parse the HTML."""
    response = requests.get(
        url,
        headers={'User-Agent': str(_user_agent.chrome)},
        timeout=_REQUEST_TIMEOUT,
    )
    # Fail here with the HTTP status instead of later with an IndexError
    # when the expected tables are missing from an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


state_frames = []
for state in state_name:
    # Skip the states flagged as having no county data.
    if county_exist[state] == 0:
        continue

    # Use loop-local names: rebinding `state_name` here would replace the
    # list being iterated with a string and break a re-run of this cell.
    display_name = state.replace('_', ' ')
    slug = state.lower().replace('_', '-')

    state_soup = _fetch_soup(_RESULTS_BASE_URL + slug + '.html')
    state_county_soup = _fetch_soup(_RESULTS_BASE_URL + 'president/' + slug + '.html')

    state_frames.append(getContentForState(display_name, state_soup, state_county_soup))

# Concatenate once at the end (DataFrame.append no longer exists in
# pandas 2.x). Index labels of the per-state frames are kept as they are.
if state_frames:
    all_data_2008 = pd.concat(state_frames)
else:
    all_data_2008 = pd.DataFrame(columns=column_Name)


In [29]:
# Save the combined results to '2008.csv' in the working directory. With
# to_csv's default settings the DataFrame index is written as the first column.
all_data_2008.to_csv('2008.csv')