In [16]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
from io import StringIO

# Apply the 'ggplot' style sheet to any matplotlib figures drawn in this notebook.
matplotlib.style.use('ggplot') 
# Uncomment to lift pandas' limit on displayed columns when inspecting wide frames.
#pd.set_option('display.max_columns', None)


In [3]:
# Full state name -> two-letter postal abbreviation (50 states plus DC).
# The electoral-vote table is keyed by full state name, while the county-level
# vote data is keyed by abbreviation; this mapping bridges the two.
state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    # Not a state, but it casts electoral votes.
    'District of Columbia': 'DC',
}

In [4]:
# Comparison data from Dave Leip's Atlas of U.S. Presidential Elections
# Source: http://uselectionatlas.org/RESULTS/

# 2004
# George W. Bush	Richard Cheney	Republican	62,039,572	50.73%	286	53.2%  
# John Kerry	John Edwards	Democratic	59,027,115	48.26%	251	46.7%  
# Total	122,303,590		538	

# 2008
# Barack H. Obama	Joseph R. Biden, Jr.	Democratic	69,499,428	52.86%	365	67.8%  
# John S. McCain, III	Sarah H. Palin	Republican	59,950,323	45.60%	173	32.2%  
# Total	131,473,705		538	

# 2012
# Barack H. Obama	Joseph R. Biden, Jr.	Democratic	65,918,507	51.01%	332	61.7%  
# Willard Mitt Romney	Paul Ryan	Republican	60,934,407	47.15%	206	38.3%  
# Total	129,237,642		538	

# 2016
# Donald J. Trump	Michael R. Pence	Republican	62,985,106	45.94%	304	56.5%  
# Hillary Clinton	Tim Kaine	Democratic	65,853,625	48.03%	227	42.2%  
# Total	137,100,229		538	


In [5]:
# Load the 2012 and 2016 county-level votes and turnout, then reshape the
# one-row-per-county table into one row per (county, election year).
# Source: https://github.com/kyaroch/2012_and_2016_presidential_election_results_by_county

# (column in the CSV, short name used in this notebook)
column_pairs = [
    ('state_postal_abbrev', 'state'),
    ('fips_code', 'fips_code'),
    ('county_name', 'county_name'),
    ('2012_total_votes', 'total_2012'),
    ('obama_votes', 'dem_2012'),
    ('romney_votes', 'rep_2012'),
    ('2016_total_votes', 'total_2016'),
    ('clinton_votes', 'dem_2016'),
    ('trump_votes', 'rep_2016'),
    ('2012_adult_pop', 'vap_2012'),
    ('2016_extrapolated_adult_pop', 'vap_2016'),
    ('2012_turnout', 'turnout_2012'),
    ('2016_turnout', 'turnout_2016'),
]
source_columns = [source for source, _ in column_pairs]

# fips_code is read as text so that it can be padded back to five characters.
wide_df = pd.read_csv('./data/2012_and_2016_presidential_election_results_by_county.csv',
                      converters={'fips_code': str})
wide_df = wide_df[source_columns].rename(columns=dict(column_pairs))

wide_df = wide_df.assign(
    # Fix fips_code: restore the leading zero on codes shorter than 5 characters.
    fips_code=wide_df['fips_code'].map(
        lambda code: code if len(code) >= 5 else '0' + code),
    # Every name gets the " County" suffix appended.
    county_name=wide_df['county_name'] + " County",
)

# voting1_df[voting1_df['2016_results_official'] == 0].count()
# 385 unofficial results

# Sanity check: the column totals are close enough to the numbers from
# Leip's election atlas quoted above.
print(wide_df.sum(numeric_only=True).apply(lambda x: '%.2f' % x))

# Turn the year suffix into a column: one frame per election, then stack them.
id_columns = ['state', 'fips_code', 'county_name']
measure_prefixes = ['total', 'dem', 'rep', 'vap', 'turnout']
long_names = ['county_num', 'dem_num', 'rep_num', 'vap', 'turnout']

year_dfs = []
for election_year in (2012, 2016):
    year_columns = ['{}_{}'.format(prefix, election_year) for prefix in measure_prefixes]
    year_dfs.append(
        wide_df[id_columns + year_columns]
        .assign(year=election_year)
        .rename(columns=dict(zip(year_columns, long_names)))
    )

# The per-year indexes are kept as they are, so index labels repeat across years.
voting1_df = pd.concat(year_dfs)

# Democratic minus Republican votes as a fraction of all votes cast in the county.
voting1_df['dem_margin'] = (
    voting1_df['dem_num'].sub(voting1_df['rep_num']).div(voting1_df['county_num'])
)

#voting1_df.sort_values(by='dem_margin')
voting1_df.head()


total_2012      128957555.00
dem_2012         65791440.00
rep_2012         60767069.00
total_2016      136352670.00
dem_2016         65689549.00
rep_2016         62778232.00
vap_2012        239849637.00
vap_2016        249735736.00
turnout_2012         1751.15
turnout_2016         1775.76
dtype: object


Unnamed: 0,state,fips_code,county_name,county_num,dem_num,rep_num,vap,turnout,year,dem_margin
0,AL,1001,Autauga County,23973,6363,17379,40874,0.5865,2012,-0.459517
1,AL,1003,Baldwin County,85491,18424,66016,147416,0.5799,2012,-0.55669
2,AL,1005,Barbour County,11517,5912,5550,21334,0.5398,2012,0.031432
3,AL,1007,Bibb County,8420,2202,6132,17796,0.4731,2012,-0.466746
4,AL,1009,Blount County,24060,2970,20757,43876,0.5484,2012,-0.739277


In [6]:
# Load the 2005-2009 average voting-age population per county (no yearly VAP
# data was found).  It serves as the turnout denominator for 2004 and 2008.
# NOTE(review): the column used is CVAP_EST, which presumably is the *citizen*
# voting-age population estimate -- confirm against the Census documentation.
# Source: https://www.census.gov/rdo/data/voting_age_population_by_citizenship_and_race_cvap.html

# 'latin-1' deals with e-acute: https://stackoverflow.com/questions/5552555
cvap_raw = pd.read_csv('./data/CVAP_CSV_Format_2005-2009/County.csv', encoding='latin-1')

population2009_df = (
    cvap_raw
    # Everything after the first seven characters of GEOID is used as the FIPS code.
    .assign(fips_code=cvap_raw['GEOID'].map(lambda geoid: geoid[7:]))
    # Keep only the 'Total' line of each county and the two columns needed downstream.
    .loc[lambda frame: frame['LNTITLE'] == 'Total', ['fips_code', 'CVAP_EST']]
    .rename(columns={'CVAP_EST': 'vap'})
)

population2009_df.head()

Unnamed: 0,fips_code,vap
0,1001,35315
13,1003,128945
26,1005,22285
39,1007,16255
52,1009,41095


In [7]:
# Load the 2004-2012 county-level votes; only 2004 and 2008 are kept from this file.
# Source: https://github.com/helloworlddata/us-presidential-election-county-results

# (column in the CSV, short name used in this notebook)
column_pairs = [
    ('year', 'year'),
    ('state', 'state'),
    ('county', 'county_name'),
    ('fips', 'fips_code'),
    ('vote_rep', 'rep_num'),
    ('vote_dem', 'dem_num'),
    ('vote_total', 'county_num'),
]

raw_results = pd.read_csv('./data/us-presidential-election-county-results-2004-through-2012.csv')
voting2_df = (
    raw_results[[source for source, _ in column_pairs]]
    .rename(columns=dict(column_pairs))
)

# Fix the Laclede County total, otherwise the state switches to D.
# The replacement value is the total listed on Wikipedia:
# https://en.wikipedia.org/wiki/United_States_presidential_election_in_Missouri,_2008
# Laclede	32.4%	5,218	67.6%	10,875	16,093
is_laclede_2008 = (
    voting2_df['state'].eq('MO')
    & voting2_df['year'].eq(2008)
    & voting2_df['county_name'].eq('Laclede County')
)
voting2_df.loc[is_laclede_2008, 'county_num'] = 16093

# Democratic minus Republican votes as a fraction of all votes cast in the county.
voting2_df['dem_margin'] = (
    voting2_df['dem_num'].sub(voting2_df['rep_num']).div(voting2_df['county_num'])
)

# Keep the 2004 and 2008 elections and drop Alaska rows.
wanted_rows = voting2_df['year'].isin([2004, 2008]) & voting2_df['state'].ne('AK')

# Default (inner) merge: counties without a matching population row are dropped.
voting2_df = voting2_df[wanted_rows].merge(population2009_df, on='fips_code')

voting2_df['turnout'] = voting2_df['county_num'].div(voting2_df['vap'])

# Sanity check (disabled): totals are close enough to the numbers from Leip's election atlas.
#print(voting2_df.sum(numeric_only=True).apply(lambda x: '%.2f' % x))


voting2_df.head()

Unnamed: 0,year,state,county_name,fips_code,rep_num,dem_num,county_num,dem_margin,vap,turnout
0,2004,AL,Autauga County,1001,15196,4758,20081,-0.519795,35315,0.568625
1,2008,AL,Autauga County,1001,17403,6093,23641,-0.478406,35315,0.669432
2,2004,AL,Baldwin County,1003,52971,15599,69320,-0.539123,128945,0.537594
3,2008,AL,Baldwin County,1003,61271,19386,81413,-0.514476,128945,0.631378
4,2004,AL,Barbour County,1005,5899,4832,10777,-0.099007,22285,0.483599


In [8]:
# Data quality checks:

# voting2_df.sort_values(by='turnout')  #by='dem_margin'

# voting2_df[(voting2_df['turnout'] > 1) & (voting2_df['year'] == 2004)]

# year	state	county_name	fips_code	rep_num	dem_num	county_num	dem_margin	vap	turnout
# 484	2004	CO	Hinsdale County	08053	355	236	602	-0.197674	490	1.228571
# 2254	2004	LA	Saint Bernard Parish	22087	19597	9956	29838	-0.323111	27540	1.083442
# 3254	2004	NE	Arthur County	31005	240	24	266	-0.812030	235	1.131915
# 3554	2004	NM	Harding County	35021	380	259	644	-0.187888	505	1.275248
# 4294	2004	OK	McIntosh County	40091	9946	6933	16879	-0.178506	15420	1.094617
# 4296	2004	OK	Major County	40093	10041	3742	13783	-0.457012	5405	2.550046
# 4298	2004	OK	Marshall County	40095	7472	3684	11156	-0.339548	10340	1.078917

# voting2_df[(voting2_df['turnout'] > 1) & (voting2_df['year'] == 2008)]
# Possible this will affect RI results, 168,000 will be filtered out of a population of 1 million
# We'll see how it works.

# 	year	state	county_name	fips_code	rep_num	dem_num	county_num	dem_margin	vap	turnout
# 485	2008	CO	Hinsdale County	08053	344	240	599	-0.173623	490	1.222449
# 1427	2008	IN	LaPorte County	18091	17918	28258	208757	0.049531	82565	2.528396
# 3193	2008	MT	McCone County	30055	2822	1607	4567	-0.266039	1330	3.433835
# 3255	2008	NE	Arthur County	31005	217	39	263	-0.676806	235	1.119149
# 3555	2008	NM	Harding County	35021	358	260	626	-0.156550	505	1.239604
# 4573	2008	RI	Washington County	44009	49810	116156	168633	0.393434	98525	1.711576
# 4639	2008	SC	McCormick County	45065	5416	9608	15174	0.276262	8435	1.798933
# 4907	2008	TN	McNairy County	47109	23290	20209	43846	-0.070269	19350	2.265943
# 4913	2008	TN	Marion County	47115	20288	13058	33767	-0.214114	21455	1.573852

# print(
# voting2_df[voting2_df['turnout'] > 1]['county_num'].count(),
# voting2_df[voting2_df['turnout'] > 1]['county_num'].sum(),
# voting2_df[(voting2_df['turnout'] > 1) & 
#             (voting2_df['year'] == 2004)]['county_num'].sum(),
# voting2_df[(voting2_df['turnout'] > 1) & 
#             (voting2_df['year'] == 2008)]['county_num'].sum()
# )
# So there are 16 counties with turnout over 1
# and they make up 549400 votes. 
# with 476232 coming from 2008, and 73168 from 2004
# 16 549400 73168 476232

# This county (Kalawao, HI) shows negative turnout in both years because its vote counts are the placeholder -9999:
# 1039	2008	HI	Kalawao County	15005	-9999	-9999	-9999	-0.000000	80	-124.987500
# 1038	2004	HI	Kalawao County	15005	-9999	-9999	-9999	-0.000000	80	-124.987500


# MO switches sides in results:
# voting2_df[(voting2_df['state'] == 'MO') & 
#            (voting2_df['year'] == 2008) &
#            ((voting2_df['turnout'] > 1) | (voting2_df['dem_margin'] > 1) | (voting2_df['dem_margin'] < -1)) ]

# This is the cause:
# 	year	state	county_name	fips_code	rep_num	dem_num	county_num	dem_margin	vap	turnout
# 3013	2008	MO	Laclede County	29105	10875	5218	2024	-2.79496	25950	0.077996
# Fixed above.  


# voting2_df['fips_code'].drop_duplicates().count()
# 3154 counties, other is 3112, so probably alaska is main difference? but that's only ~ 10-20
# Note, without AK = 3114, close enough.  2 missing ones are from other dataset that I removed.  I could look
# those counties up, then these would match exactly.  pretty good.  

# voting2_df.sort_values(by='dem_margin')

# Roughly -279% margin here (dem_margin = -2.79).
# 4660	2008	MO	Laclede County	29105	10875	5218	2024	-2.794960
# vs https://en.wikipedia.org/wiki/United_States_presidential_election_in_Missouri,_2008
# Laclede	32.4%	5,218	67.6%	10,875	16,093
# Issue is that county_num is 2024 when it should be 16093

# https://en.wikipedia.org/wiki/Oglala_Lakota_County,_South_Dakota
# Oglala Lakota County, known as Shannon County until May 2015, is a county located 
# in the U.S. state of South Dakota. The population was 13,586 at the 2010 census. 
# Oglala Lakota County does not have its own county seat. Wikipedia
# have the same fips code though, so should be fine.  

# 2004
# print('2004 Results:')
# print('Dem: ', voting2_df[(voting2_df['year'] == 2004)]['dem_num'].sum())
# print('Rep: ', voting2_df[(voting2_df['year'] == 2004)]['rep_num'].sum())
# print('Total: ', voting2_df[(voting2_df['year'] == 2004)]['county_num'].sum())

# # 2008
# print('2008 Results:')
# print('Dem: ', voting2_df[(voting2_df['year'] == 2008)]['dem_num'].sum())
# print('Rep: ', voting2_df[(voting2_df['year'] == 2008)]['rep_num'].sum())
# print('Total: ', voting2_df[(voting2_df['year'] == 2008)]['county_num'].sum())

# 2004 Results:
# Dem:  59093576
# Rep:  62131489
# Total:  122459692
# 2008 Results:
# Dem:  69547395
# Rep:  59959177
# Total:  131502027

# Leip atlas:
# 2004
# http://uselectionatlas.org/RESULTS/
# 	George W. Bush	Richard Cheney	Republican	62,039,572	50.73%	286	53.2%
# 	John Kerry	John Edwards	Democratic	59,027,115	48.26%	251	46.7%
# Total	122,303,590		538	

# 2008
# 	Barack H. Obama	Joseph R. Biden, Jr.	Democratic	69,499,428	52.86%	365	67.8%
# 	John S. McCain, III	Sarah H. Palin	Republican	59,950,323	45.60%	173	32.2%
# Total	131,473,705		538	

# Ok, so pretty close/good enough.  

In [9]:
# Electoral votes per state, reshaped to one row per (state, year).
# Source: https://en.wikipedia.org/wiki/Electoral_College_(United_States)#Chronological_table
electoral_wide = pd.read_csv('./data/state_electoral_votes.csv') #sep="\t"

electoral_wide = electoral_wide.assign(
    # Electoral points in 2004 = 2008, 2012=2016=2020, so the 2008 and 2012
    # columns are reused for the neighbouring elections.
    votes_2016=electoral_wide['votes_2012'],
    votes_2004=electoral_wide['votes_2008'],
    # Full state name -> postal abbreviation; an unknown name raises KeyError.
    state=[state_abbrev[name] for name in electoral_wide['state']],
)

years = [2004, 2008, 2012, 2016]

#Stack by year: one three-column frame per election, concatenated in order.
electoral_df = pd.concat([
    pd.DataFrame({
        'state': electoral_wide['state'],
        'state_electoral_votes': electoral_wide['votes_{}'.format(election_year)],
        'year': election_year,
    })
    for election_year in years
])

electoral_df.head()

Unnamed: 0,state,state_electoral_votes,year
0,AL,9,2004
1,AK,3,2004
2,AZ,10,2004
3,AR,6,2004
4,CA,55,2004


In [10]:
# Alaska does not use counties, so its statewide results are typed in by hand
# and carried as a single row per election with fips_code '02'.
# http://uselectionatlas.org/RESULTS/compare.php?year=2016&fips=2&f=0&off=0&elect=0&type=state

# One tuple per election: (year, total votes, Democratic votes, Republican votes)
alaska_results = [
    (2004, 312598, 111025, 190889),
    (2008, 326197, 123594, 193841),
    (2012, 300495, 122640, 164676),
    (2016, 318608, 116454, 163387),
]
election_years, totals, dem_votes, rep_votes = (list(col) for col in zip(*alaska_results))
n_elections = len(alaska_results)

alaska_df = pd.DataFrame({
    'fips_code': ['02'] * n_elections,
    'county_num': totals,
    'dem_num': dem_votes,
    'rep_num': rep_votes,
    'year': election_years,
    'state': ['AK'] * n_elections,
    'county_name': ['Alaska'] * n_elections,
    # The same voting-age population figure is used for all four elections.
    'vap': [550189] * n_elections,
})

# Same derived columns as the county-level frames.
alaska_df['dem_margin'] = alaska_df['dem_num'].sub(alaska_df['rep_num']).div(alaska_df['county_num'])
alaska_df['turnout'] = alaska_df['county_num'].div(alaska_df['vap'])

alaska_df


Unnamed: 0,county_name,county_num,dem_num,fips_code,rep_num,state,vap,year,dem_margin,turnout
0,Alaska,312598,111025,2,190889,AK,550189,2004,-0.255485,0.568165
1,Alaska,326197,123594,2,193841,AK,550189,2008,-0.215351,0.592882
2,Alaska,300495,122640,2,164676,AK,550189,2012,-0.139889,0.546167
3,Alaska,318608,116454,2,163387,AK,550189,2016,-0.147306,0.579088


In [11]:
# Stack the three county-level sources (2012/2016, 2004/2008 and the hand-entered
# Alaska rows) into a single frame.
county_frames = [voting1_df, voting2_df, alaska_df]

# Attach each state's electoral college points for the matching year.  This is
# an inner merge, so rows without a (state, year) match in electoral_df are dropped.
turnout_df = pd.concat(county_frames).merge(electoral_df, on=('state', 'year'))

turnout_df.head()

Unnamed: 0,county_name,county_num,dem_margin,dem_num,fips_code,rep_num,state,turnout,vap,year,state_electoral_votes
0,Autauga County,23973,-0.459517,6363,1001,17379,AL,0.5865,40874,2012,9
1,Baldwin County,85491,-0.55669,18424,1003,66016,AL,0.5799,147416,2012,9
2,Barbour County,11517,0.031432,5912,1005,5550,AL,0.5398,21334,2012,9
3,Bibb County,8420,-0.466746,2202,1007,6132,AL,0.4731,17796,2012,9
4,Blount County,24060,-0.739277,2970,1009,20757,AL,0.5484,43876,2012,9


In [11]:
# County-level export for the interactive visualization.
# Rows with abnormal values are filtered out: the margin must lie in [-1, 1]
# and the turnout in [0, 1] (both bounds inclusive).
margin_in_range = turnout_df['dem_margin'].between(-1.0, 1.0)
turnout_in_range = turnout_df['turnout'].between(0.0, 1.0)

# A column subset was considered here but is not applied; all columns are written:
# out_df = out_df[['county', 'state', 'county_num', 'turnout', 'num_rep', 'num_dem', 'num_state_dem',
#                  'num_state_rep','num_state', 'year', 'vap', 'fips_code', 'state_electoral_votes']]

out_df = turnout_df[margin_in_range & turnout_in_range].round(4)

out_df.to_csv('./US_County_Level_Presidential_Results_04-16.csv', index=False)

In [14]:
# State-level totals per election, built from the plausible county rows only
# (margin within [-1, 1] and turnout within [0, 1], bounds inclusive).
plausible_rows = (
    turnout_df['dem_margin'].between(-1.0, 1.0)
    & turnout_df['turnout'].between(0.0, 1.0)
)

state_df = (
    turnout_df[plausible_rows]
    .groupby(['state', 'year'])
    # Votes and population add up across counties; the electoral votes are the
    # same on every row of a state, so 'max' simply picks that value.
    .agg({'county_num': 'sum', 'dem_num': 'sum', 'rep_num': 'sum',
          'vap': 'sum', 'state_electoral_votes': 'max'})
    .round(4)
    .reset_index()
    .rename(columns={'county_num': 'state_num'})
)
state_df

Unnamed: 0,state,year,dem_num,state_num,vap,rep_num,state_electoral_votes
0,AK,2004,111025,312598,550189,190889,3
1,AK,2008,123594,326197,550189,193841,3
2,AK,2012,122640,300495,550189,164676,3
3,AK,2016,116454,318608,550189,163387,3
4,AL,2004,693933,1883415,3430120,1176394,9
5,AL,2008,813479,2099819,3430120,1266546,9
6,AL,2012,795696,2074338,3699846,1255925,9
7,AL,2016,729547,2123372,3771201,1318255,9
8,AR,2004,469953,1054945,2067620,572898,6
9,AR,2008,422310,1086617,2067620,638017,6


In [70]:
# Write the state-level totals to CSV (without the index column).
# The disabled lines below would add a state-level turnout column and round
# the frame before export; as written, no 'turnout' column is exported.
#state_df['turnout'] = state_df['state_num']/state_df['vap']
#list(state_df)
#state_df = state_df.round(decimals=4)
state_df.to_csv('./US_State_Level_Presidential_Results_04-16.csv', index=False)


In [76]:
# National totals per election, built from the plausible county rows only
# (margin within [-1, 1] and turnout within [0, 1], bounds inclusive).
plausible_rows = (
    turnout_df['dem_margin'].between(-1.0, 1.0)
    & turnout_df['turnout'].between(0.0, 1.0)
)

national_df = (
    turnout_df[plausible_rows]
    .groupby(['year'])
    .agg({'county_num': 'sum', 'dem_num': 'sum', 'rep_num': 'sum',
          'vap': 'sum'})
    # The summed county totals become the national vote count.
    .rename(columns={'county_num': 'num_nation'})
    .reset_index()
)

national_df

Unnamed: 0,year,dem_num,num_nation,vap,rep_num
0,2004,59006163,122217180,208206569,61992079
1,2008,69367959,131049863,208033614,59848713
2,2012,65912769,129254383,240396331,60929447
3,2016,65805766,136670626,250285285,62941275


In [149]:
## Calculate 2016 Democratic and Republican vote fractions by race
## source: ANES 2016 time series study:
## http://www.electionstudies.org/studypages/anes_timeseries_2016/anes_timeseries_2016.htm

# ANES variable -> short name used below.
# NOTE(review): the meaning of each variable and of the numeric codes is
# inferred from how this cell uses them -- confirm against the ANES codebook.
columns = ['V160102', 'V161310x', 'V162058x']
labels = ['weight_postfull', 'group', 'party']

# Parse only the three variables that are needed instead of the whole raw file.
# The saved output of this cell ends in a truncated warning (most likely pandas'
# mixed-type DtypeWarning raised while parsing the other columns); reading fewer
# columns avoids that work and is much lighter on memory.
anes_df = pd.read_csv('./data/anes/anes_timeseries_2016/anes_timeseries_2016_rawdata.txt',
                      sep='|', usecols=columns)
anes_df = anes_df[columns].rename(columns=dict(zip(columns, labels)))

anes_df = anes_df[anes_df['party'].isin([10, 11, 12])]  #Filter out nonvoters

# Weighted vote totals: one row per group code, one column per party code.
group_df = anes_df.groupby(['group', 'party']).agg({'weight_postfull': 'sum'})
group_df = group_df.unstack(level=-1)
group_df.columns = group_df.columns.droplevel(0)
group_df = group_df.rename_axis(None, axis=1)

# Fold codes 3, 4, 6 and the nonresponders (-9) into a single "Other" row (6).
# reindex() tolerates a code that is absent from the data (it contributes
# nothing to the sum), whereas .loc[[...]] raises KeyError on recent pandas.
group_df.loc[6] = group_df.reindex([3, 4, 6, -9]).sum()

rows = [1, 2, 5, 6]
races = ['Non-Hispanic White', 'Non-Hispanic Black', 'Hispanic', 'Other']
group_df = (
    group_df
    .rename(index=dict(zip(rows, races)))
    .loc[races]
    .reset_index()
)

# The denominator includes party code 12 ("other"), so dem_frac + rep_frac is
# below 1 for every group.
group_df['total'] = group_df[[10, 11, 12]].sum(axis=1)
group_df = group_df.rename(columns={10: 'd', 11: 'r', 12: 'o'})

group_df['dem_frac'] = group_df['d'] / group_df['total']
group_df['rep_frac'] = group_df['r'] / group_df['total']
#group_df['dem_margin'] = group_df['dem_frac'] - group_df['rep_frac']

group_df['year'] = 2016
group_df = group_df[['group', 'year', 'dem_frac', 'rep_frac']]
group_df

#['dem_frac', 'rep_frac', 'year', 'group'] 

# Still needed from elsewhere: turnout and electorate fraction.

# The fraction of the electorate for each race could also be calculated here,
# but that data is taken from the Census Bureau for now; only the party
# choice by race is needed from ANES.

# The intended choice of nonvoters would also be interesting to look at,
# because it would show which party increased turnout would go towards.

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,group,year,dem_frac,rep_frac
0,Non-Hispanic White,2016,0.393836,0.538035
1,Non-Hispanic Black,2016,0.899861,0.063772
2,Hispanic,2016,0.686294,0.238819
3,Other,2016,0.573222,0.314705


In [153]:
#Sources:
#http://www.electproject.org/home/voter-turnout/demographics
#https://docs.google.com/spreadsheets/d/1l5fpK7ysQhQbZPv9hnZ_-PO1J1zBVPXSSQjNejTXecY/edit#gid=0
#Both the corrected and the uncorrected tables are kept below: the data corrected
#for turnout response bias doesn't reproduce the overall D/R voter percentages,
#and neither the corrected nor the uncorrected data results in the right count.
#The uncorrected tables are the active ones; the corrected ones are commented out.

# Corrected USEP turnout data:
# race_turnout = StringIO("""
# Turnout Rate	2016	2014	2012	2010	2008	2006	2004
# Non-Hispanic White	64.7%	40.8%	61.8%	45.0%	65.2%	44.7%	64.3%
# Non-Hispanic Black	59.9%	36.4%	67.4%	41.6%	69.1%	36.6%	61.4%
# Hispanic	44.9%	21.1%	43.1%	26.6%	46.5%	25.5%	42.9%
# Other	46.3%	24.5%	45.4%	30.7%	48.0%	28.5%	44.9%
# """)

#Uncorrected USEP turnout data (tab-separated; one row per group, one column per year):
race_turnout = StringIO("""
Turnout Rate	2016	2014	2012	2010	2008	2006	2004
Non-Hispanic White	65.3%	45.8%	64.1%	48.6%	66.1%	51.6%	67.2%
Non-Hispanic Black	59.3%	40.4%	66.4%	43.8%	65.2%	41.2%	60.3%
Hispanic	47.6%	27.0%	48.0%	31.2%	49.9%	32.3%	47.2%
Other	49.0%	29.5%	49.0%	34.1%	49.5%	33.4%	48.0%
""")

# Corrected USEP share of electorate data:
# race_share = StringIO("""
# Share of Electorate	2016	2014	2012	2010	2008	2006	2004
# Non-Hispanic White	73.6%	76.9%	74.1%	77.9%	76.6%	81.0%	79.5%
# Non-Hispanic Black	12.3%	11.9%	13.2%	11.4%	12.3%	10.0%	11.1%
# Hispanic	9.1%	7.0%	8.3%	6.8%	7.3%	5.6%	5.9%
# Other	5.0%	4.2%	4.4%	3.9%	3.8%	3.4%	3.6%
# """)

#Uncorrected USEP share of electorate data (same tab-separated layout as the turnout table):
race_share = StringIO("""
Share of Electorate	2016	2014	2012	2010	2008	2006	2004
Non-Hispanic White	73.3%	76.3%	73.7%	77.5%	76.3%	80.4%	79.2%
Non-Hispanic Black	12.4%	12.1%	13.4%	11.7%	12.4%	10.3%	11.2%
Hispanic	9.2%	7.3%	8.4%	6.9%	7.4%	5.8%	6.0%
Other	5.0%	4.3%	4.5%	3.9%	3.9%	3.4%	3.6%
""")

#Source:
#http://www.electionstudies.org/nesguide/text/t9a_1_1.txt
# Democratic percent of each group by election year (comma-separated).
# Despite the "margin" in the variable name these are percentages of the group,
# not margins.  Blank cells are years without a value.  Most rows end in a
# trailing comma, i.e. they carry one more field than the header.
# NOTE(review): the '*' marker presumably flags a small sample in the source
# table -- confirm against the ANES guide.
race_demmargin = StringIO("""
percent_of_group,1948,1952,1954,1956,1958,1960,1962,1964,1966,1968,1970,1972,1974,1976,1978,1980,1982,1984,1986,1988,1990,1992,1994,1996,1998,2000,2002,2004,2008,2012
Non-Hispanic White,53,40,,39,,48,,65,,41,,30,,46,,36,,36,,39,,52,,51,,46,,42,44,43,
Non-Hispanic Black,65*,80,,64*,,71,,100,,97,,87,,95,,93,,91,,92,,95,,99,,92,,89,100,97,
Hispanic,,,,,,,,,,40*,,50*,,86*,,58*,,52,,65,,65,,83,,59,,61,73,73,
Other,,,,,,,,,,33*,,70*,,43*,,25*,,50*,,83*,,44*,,54*,,57,,36*,82,61
""")

# Republican percent of each group for 2004-2012.  Every value equals 100 minus
# the corresponding Democratic value above, so the two shares sum to 100.
# Note that each data row ends in a trailing blank after the last value.
race_repmargin = StringIO("""
percent_of_group,2004,2008,2012
Non-Hispanic White,58,56,57 
Non-Hispanic Black,11,0,3 
Hispanic,39,27,27 
Other,64,18,39 
""")


def _percent_to_fraction(values):
    """Convert percent-like entries ('65.3%', '36*', 57) into fractions of one.

    The '%' suffix, the '*' marker and surrounding blanks are stripped before
    the numeric conversion.  Anything that still is not a number raises
    ValueError instead of being silently left behind as text.
    """
    return pd.to_numeric(values.astype(str).str.strip('%* ')) / 100


def _to_long(wide, label_column, value_name):
    """Reshape a group-by-year percent table into (group, year, value) rows.

    `wide` has one row per group (named in `label_column`) and one column per
    year.  The result has an integer `year` column and `value_name` expressed
    as a fraction, so it can be merged directly with the other frames.
    """
    long_df = pd.melt(wide, id_vars=[label_column],
                      var_name="year", value_name=value_name)
    long_df = long_df.rename(columns={label_column: 'group'})
    long_df['year'] = pd.to_numeric(long_df['year'])
    long_df[value_name] = _percent_to_fraction(long_df[value_name])
    return long_df


raceturnout_df = _to_long(pd.read_csv(race_turnout, sep='\t'),
                          'Turnout Rate', 'turnout')
raceshare_df = _to_long(pd.read_csv(race_share, sep='\t'),
                        'Share of Electorate', 'electorate_frac')

# Only these elections are taken from the ANES party-share tables.
# index_col=False keeps the first column as data even though most rows of the
# Democratic table carry a trailing comma.
party_columns = ['percent_of_group', '2004', '2008', '2012']
racedemmargin_df = _to_long(
    pd.read_csv(race_demmargin, sep=',', index_col=False)[party_columns],
    'percent_of_group', 'dem_frac')
racerepmargin_df = _to_long(
    pd.read_csv(race_repmargin, sep=',', index_col=False)[party_columns],
    'percent_of_group', 'rep_frac')

# Turnout and share of electorate are available for every year in the tables.
base_df = raceturnout_df.merge(raceshare_df, on=['group', 'year'], how='inner')

# 2004-2012: party shares come from the ANES tables.
race_df = base_df.merge(racedemmargin_df, on=['group', 'year'], how='inner')
race_df = race_df.merge(racerepmargin_df, on=['group', 'year'], how='inner')

#Add in 2016 data from ANES (group_df only holds 2016 rows)
int_df = base_df.merge(group_df, on=['group', 'year'], how='inner')
race_df = pd.concat([race_df, int_df])

# Scale the fractions up to vote counts using the national totals per year.
electorate_df = national_df[['year', 'num_nation']]
race_df = race_df.merge(electorate_df, on='year', how='left')

race_df['dem_margin'] = race_df['dem_frac'] - race_df['rep_frac']
race_df['num_group'] = race_df['electorate_frac'] * race_df['num_nation']
race_df['num_dem'] = race_df['dem_frac'] * race_df['num_group']
race_df['num_rep'] = race_df['rep_frac'] * race_df['num_group']

race_df

Unnamed: 0,group,year,turnout,electorate_frac,dem_frac,rep_frac,num_nation,dem_margin,num_group,num_dem,num_rep
0,Non-Hispanic White,2012,0.641,0.737,0.43,0.57,129254383,-0.14,95260480.0,40962010.0,54298470.0
1,Non-Hispanic Black,2012,0.664,0.134,0.97,0.03,129254383,0.94,17320090.0,16800480.0,519602.6
2,Hispanic,2012,0.48,0.084,0.73,0.27,129254383,0.46,10857370.0,7925879.0,2931489.0
3,Other,2012,0.49,0.045,0.61,0.39,129254383,0.22,5816447.0,3548033.0,2268414.0
4,Non-Hispanic White,2008,0.661,0.763,0.44,0.56,131049863,-0.12,99991050.0,43996060.0,55994990.0
5,Non-Hispanic Black,2008,0.652,0.124,1.0,0.0,131049863,1.0,16250180.0,16250180.0,0.0
6,Hispanic,2008,0.499,0.074,0.73,0.27,131049863,0.46,9697690.0,7079314.0,2618376.0
7,Other,2008,0.495,0.039,0.82,0.18,131049863,0.64,5110945.0,4190975.0,919970.0
8,Non-Hispanic White,2004,0.672,0.792,0.42,0.58,122217180,-0.16,96796010.0,40654320.0,56141680.0
9,Non-Hispanic Black,2004,0.603,0.112,0.89,0.11,122217180,0.78,13688320.0,12182610.0,1505716.0


In [154]:
# Export the race-level results, tagged with the demographic they describe.
value_columns = ['group', 'year', 'turnout',
                 'dem_margin', 'num_group',
                 'num_rep', 'num_dem', 'num_nation']

# assign() returns a new frame, so race_df itself is left untouched.
out_df = race_df[value_columns].assign(demographic='race')

# 'demographic' leads the exported columns.
order = ['demographic'] + value_columns

#out_df = out_df.round(decimals=4), don't round, numbers are huge
out_df[order].to_csv('./US_Race_Presidential_Results_04-16_uncorrected.csv', index=False)