In [17]:
import copy
import csv
import json
import numpy as np
import pandas as pd
import re

from collections import Counter
from io import StringIO

In [18]:
# File locations used by this notebook. All paths are relative, so they resolve
# against the directory the kernel is started in.
CENSUS_DATA_LOCATION = 'build/DEC_10_SF1_P1_with_ann.csv'  # census population CSV (input)
ELECTION_DATA_LOCATION = 'python/data2012.csv'  # election results CSV (input)
OUTPUT_LOCATION = 'build/merged_data.csv'  # merged election + population table (output)
US_JSON_LOCATION = 'us.json'  # JSON with 'counties' and 'states' geometries (input)
OUTPUT_US_JSON_LOCATION = 'public/data/us2012.json'  # same JSON with properties attached (output)

# Election results; joined against the census table on county and state further down.
df = pd.read_csv(ELECTION_DATA_LOCATION)

def _parse_population(raw_count):
    """Return the integer population held in a census count cell.

    Only the leading run of digits is used, so a cell that carries a
    parenthesised ``(r<digits>)`` suffix after the number is accepted and the
    suffix is ignored.

    Raises
    ------
    ValueError
        If the cell does not start with at least one digit.
    """
    match = re.match(r'\d+', raw_count)
    if match is None:
        raise ValueError('Unparseable population count: {!r}'.format(raw_count))
    return int(match.group(0))


lines = []

with open(CENSUS_DATA_LOCATION, encoding='latin1') as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    for line in reader:
        if not line:
            # csv.reader yields [] for blank lines; there is nothing to parse.
            continue
        lines.append([line[0], line[1], line[2], _parse_population(line[3])])

# One row per census record: identifiers, display name and total population.
pop_df = pd.DataFrame.from_records(lines, columns=['id', 'id2', 'name', 'total'])

In [19]:
# Preview the first rows of the election results table.
df.head()

Unnamed: 0,state,county,reporting,dem,gop,lib,grn,una,oth
0,AK,Alaska,100%,102138,136848,6131,0,0,2366.0
1,DC,Washington,222332,17337,1634,0,0,2045,
2,AL,Autauga,100%,6354,17366,136,0,0,53.0
3,AL,Baldwin,100%,18329,65772,597,0,0,290.0
4,AL,Barbour,100%,5873,5539,32,0,0,15.0


In [20]:
# Preview the first rows of the census population table.
pop_df.head()

Unnamed: 0,id,id2,name,total
0,0500000US01001,1001,"Autauga County, Alabama",54571
1,0500000US01003,1003,"Baldwin County, Alabama",182265
2,0500000US01005,1005,"Barbour County, Alabama",27457
3,0500000US01007,1007,"Bibb County, Alabama",22915
4,0500000US01009,1009,"Blount County, Alabama",57322


In [21]:
# Census names are comma-separated; the first piece is stored as the county
# and the second as the state. Pieces are kept exactly as split (no trimming).
name_parts = pop_df['name'].str.split(',')
pop_df['county'] = [parts[0] for parts in name_parts]
pop_df['state'] = [parts[1] for parts in name_parts]

In [22]:
# (full state name, postal abbreviation) for the 50 states plus DC, in the
# order used throughout the notebook.
_STATE_PAIRS = [
    ('Alabama', 'AL'),
    ('Alaska', 'AK'),
    ('Arizona', 'AZ'),
    ('Arkansas', 'AR'),
    ('California', 'CA'),
    ('Colorado', 'CO'),
    ('Connecticut', 'CT'),
    ('Delaware', 'DE'),
    ('District of Columbia', 'DC'),
    ('Florida', 'FL'),
    ('Georgia', 'GA'),
    ('Hawaii', 'HI'),
    ('Idaho', 'ID'),
    ('Illinois', 'IL'),
    ('Indiana', 'IN'),
    ('Iowa', 'IA'),
    ('Kansas', 'KS'),
    ('Kentucky', 'KY'),
    ('Louisiana', 'LA'),
    ('Maine', 'ME'),
    ('Maryland', 'MD'),
    ('Massachusetts', 'MA'),
    ('Michigan', 'MI'),
    ('Minnesota', 'MN'),
    ('Mississippi', 'MS'),
    ('Missouri', 'MO'),
    ('Montana', 'MT'),
    ('Nebraska', 'NE'),
    ('Nevada', 'NV'),
    ('New Hampshire', 'NH'),
    ('New Jersey', 'NJ'),
    ('New Mexico', 'NM'),
    ('New York', 'NY'),
    ('North Carolina', 'NC'),
    ('North Dakota', 'ND'),
    ('Ohio', 'OH'),
    ('Oklahoma', 'OK'),
    ('Oregon', 'OR'),
    ('Pennsylvania', 'PA'),
    ('Rhode Island', 'RI'),
    ('South Carolina', 'SC'),
    ('South Dakota', 'SD'),
    ('Tennessee', 'TN'),
    ('Texas', 'TX'),
    ('Utah', 'UT'),
    ('Vermont', 'VT'),
    ('Virginia', 'VA'),
    ('Washington', 'WA'),
    ('West Virginia', 'WV'),
    ('Wisconsin', 'WI'),
    ('Wyoming', 'WY'),
]

STATES = [full_name for full_name, _abbrev in _STATE_PAIRS]
STATE_ABBREVS = [abbrev for _full_name, abbrev in _STATE_PAIRS]

# Lookup from full state name to postal abbreviation.
STATE_TO_ABBREV = dict(zip(STATES, STATE_ABBREVS))

assert len(STATE_TO_ABBREV) == 51

In [23]:
# Take the text after the first comma of the census name as the state,
# trimmed of surrounding whitespace, then look up its postal abbreviation.
# An unknown state name raises KeyError.
pop_df['state'] = [full_name.split(',')[1] for full_name in pop_df['name']]
pop_df['state'] = pop_df['state'].str.strip()
pop_df['state_abbrev'] = [STATE_TO_ABBREV[state_name] for state_name in pop_df['state']]

In [24]:
# All found corrections
# Election-side county labels rewritten to the spelling used for the join.
for old_label, new_label in (('Bedford Co.', 'Bedford'),
                             ('Sainte Genevieve', 'Ste. Genevieve')):
    df.loc[df['county'] == old_label, 'county'] = new_label

# Census-side name rewritten without the ñ.
pop_df.loc[pop_df['name'] == 'Doña Ana County, New Mexico', 'name'] = 'Dona Ana County, New Mexico'

# Kalawao is excluded from the election data.
df = df[df['county'] != 'Kalawao']

In [25]:
# Election data in AK only at state level. Combine....
# Replace every Alaska census row with one synthetic statewide row whose
# population is the sum of the rows it replaces.
is_alaska = pop_df['state_abbrev'] == 'AK'
ak_population = pop_df.loc[is_alaska, 'total'].sum()

ak_columns = ('id', 'id2', 'name', 'total', 'county', 'state', 'state_abbrev')
ak_values = (-1, -1, 'Alaska, Alaska', ak_population, 'Alaska', 'Alaska', 'AK')
ak_df = pd.DataFrame.from_records([ak_values], columns=ak_columns)

pop_df = pd.concat([pop_df[~is_alaska], ak_df])

In [26]:
# Show the Alaska rows of the census table after the statewide roll-up.
pop_df[pop_df['state_abbrev'] == 'AK']

Unnamed: 0,id,id2,name,total,county,state,state_abbrev
0,-1,-1,"Alaska, Alaska",710231,Alaska,Alaska,AK


In [27]:
# Then there are a bunch of special cases.....
records_to_join_on = []

def row_with_county(row, county):
    """Queue a census row for the join, overriding its county label.

    Parameters
    ----------
    row : pandas.Series
        A row of ``pop_df`` as yielded by ``DataFrame.iterrows()``.
    county : str
        County label to store in place of the row's own ``county`` value.

    Appends a 7-tuple ``(id, id2, name, total, county, state, state_abbrev)``
    to the module-level ``records_to_join_on`` list.
    """
    # Item access is used for every field: on a Series, ``row.name`` is the
    # Series' own name (the index label under iterrows), not the value stored
    # in the 'name' column.
    records_to_join_on.append((row['id'], row['id2'], row['name'], row['total'],
                               county, row['state'], row['state_abbrev']))

# Fix Bedford County and City together (townhall only has results for one)
# Combined population of every Virginia row whose county label mentions Bedford.
bedford_total = pop_df.loc[
    pop_df['county'].str.contains('Bedford', regex=False) & (pop_df['state_abbrev'] == 'VA'),
    'total',
].sum()

# Drop the separate 'Bedford city' row. Take a copy so the assignment below
# writes to an independent frame.
pop_df = pop_df[
    ~(pop_df['county'].str.contains('Bedford city', regex=False) & (pop_df['state_abbrev'] == 'VA'))
].copy()

# Give the remaining 'Bedford County' row the combined population.
# .loc is used because DataFrame.ix no longer exists in pandas >= 1.0.
# The mask is rebuilt on the filtered frame so it aligns with its index.
pop_df.loc[
    pop_df['county'].str.contains('Bedford County', regex=False) & (pop_df['state_abbrev'] == 'VA'),
    'total',
] = bedford_total

# Walk the census rows and queue each one with the county label the election
# data uses. Checks run in a fixed order; the first that applies wins.
for i, row in pop_df.iterrows():
    name = row['name']
    state = row.state_abbrev

    if state == 'MD' and name.startswith('Baltimore'):
        # Baltimore appears twice; 'ity' only occurs in the city's name.
        row_with_county(row, 'Baltimore City' if 'ity' in name else 'Baltimore County')
        continue

    if state == 'NV' and name.startswith('Carson City'):
        row_with_county(row, 'Carson City')
        continue

    if state == 'VA' and name.startswith(('Fairfax', 'Franklin', 'Richmond', 'Roanoke')):
        # These names occur twice in Virginia; the county variant gets ' Co.' appended.
        first_word = name.split(' ')[0]
        row_with_county(row, (first_word + ' Co.') if 'County' in name else first_word)
        continue

    if name.startswith('LaSalle'):
        row_with_county(row, 'La Salle')
        continue

    if state == 'DC':
        row_with_county(row, 'Washington')
        continue

    if state == 'MO' and name.startswith('St. Louis'):
        row_with_county(row, 'St. Louis Co.' if 'County' in name else 'St. Louis')
        continue

    if state == 'HI' and 'Kalawao' in name:
        # Deliberately not queued.
        continue

    # Note: Shannon County, SD was renamed Oglala Lakota in 2015; no remapping
    # is applied for it here.

    if state == 'AK':
        row_with_county(row, 'Alaska')
        continue

    # General case: keep everything before the trailing County/Parish/city word.
    match = re.match(r'^(.*) (?=County|Parish|city)', name)
    if match:
        row_with_county(row, match.group(1))
    else:
        print("WTF", row)

In [28]:
# Turn the queued tuples into the census-side frame used for the join.
join_columns = ['id', 'id2', 'name', 'total', 'county', 'state', 'state_abbrev']
pop_df_fixed = pd.DataFrame.from_records(records_to_join_on, columns=join_columns)

In [29]:
# Inner-join election rows to census rows. The election 'state' column is
# matched against the census 'state_abbrev' column.
merged_df = pd.merge(
    df,
    pop_df_fixed,
    how='inner',
    left_on=['county', 'state'],
    right_on=['county', 'state_abbrev'],
)

# The join must yield exactly as many rows as the election table has.
assert len(merged_df) == len(df)

In [30]:
# The merge suffixed the two 'state' columns: keep the election one as 'state',
# expose the census one as 'full_state', and drop the redundant abbreviation.
merged_df = (
    merged_df
    .rename(columns={'state_x': 'state', 'state_y': 'full_state'})
    .drop('state_abbrev', axis=1)
)

In [31]:
# Write the merged table to disk without the DataFrame index column.
merged_df.to_csv(OUTPUT_LOCATION, index=False)

In [34]:
# State abbreviation -> colour index (0-3). Each string lists the states that
# share the colour index given by its position in the tuple.
color_groups = (
    'AK, AL, AR, CT, DE, HI, IL, ME, MI, MN, MT, NE, NM, NV, SC, VA, WA',
    'AZ, DC, FL, KS, KY, MS, NC, ND, OR, PA, RI, TX, VT, WI, WY',
    'CA, CO, GA, ID, IN, LA, MA, MO, NJ, SD, WV',
    'IA, MD, NH, NY, OH, OK, TN, UT',
)

color = {}
for color_index, members in enumerate(color_groups):
    color.update(dict.fromkeys(members.split(', '), color_index))

def _county_properties(row):
    """Build the properties dict attached to one county geometry.

    ``row`` is a row of ``merged_df``. Vote counts that are not finite
    (e.g. NaN) are replaced with 0; all other values are passed through.
    """
    properties = {
        'state': row.state,
        'color': color[row.state],
        'name': row.county,
        'population': row.total,
    }
    for party in ('dem', 'gop', 'grn', 'lib', 'una', 'oth'):
        votes = row[party]
        properties[party] = votes if np.isfinite(votes) else 0
    return properties


# Integer id2 -> properties dict, one entry per merged row.
id_to_properties = {int(row.id2): _county_properties(row)
                    for _, row in merged_df.iterrows()}

In [35]:
with open(US_JSON_LOCATION, 'r') as topo_file:
    data = json.load(topo_file)

# Keep only county geometries whose id is below 60000 and whose id does not
# fall in the 2xxx block.
county_layer = data['objects']['counties']
county_layer['geometries'] = [
    geometry for geometry in county_layer['geometries']
    if geometry['id'] < 60000 and geometry['id'] // 1000 != 2
]

In [36]:
# Attach the election/population properties to every county geometry that has
# a matching id; geometries without a match are left as they are.
for val in data['objects']['counties']['geometries']:
    matched_properties = id_to_properties.get(val['id'])
    if matched_properties is not None:
        val['properties'] = matched_properties

In [37]:
# Spot-check the first county geometry now that properties have been attached.
data['objects']['counties']['geometries'][0]

{'arcs': [[[0, 1, 2]]],
 'id': 53073,
 'properties': {'color': 0,
  'dem': 49743,
  'gop': 38764,
  'grn': 0,
  'lib': 1156,
  'name': 'Whatcom',
  'oth': 1495.0,
  'population': 201140,
  'state': 'WA',
  'una': 0},
 'type': 'MultiPolygon'}

In [38]:
# Fix AK
# Take an independent copy of the state-level geometry with id 2 so it can be
# edited and added to the county layer without touching the states layer.
state_geometries = data['objects']['states']['geometries']
alaska_matches = [geometry for geometry in state_geometries if geometry['id'] == 2]
ak_geometry = copy.deepcopy(alaska_matches[0])

In [39]:
# First merged row for Alaska.
ak_row = merged_df.loc[merged_df['state'] == 'AK'].iloc[0]

# Build the properties in the same key order as before, casting every count
# to a plain int.
ak_properties = {'color': 0}
ak_properties.update(
    (party, int(ak_row[party])) for party in ('dem', 'gop', 'grn', 'lib', 'una')
)
ak_properties['name'] = 'Alaska'
ak_properties['oth'] = int(ak_row['oth'])
ak_properties['population'] = int(ak_row['total'])
ak_properties['state'] = 'AK'

ak_geometry['properties'] = ak_properties

# Id assigned to the statewide shape when it is placed in the county layer.
ak_geometry['id'] = 2001

In [40]:
# Add the Alaska shape to the county layer, then order the layer by id.
county_geometries = data['objects']['counties']['geometries']
county_geometries.append(ak_geometry)
county_geometries.sort(key=lambda geometry: geometry['id'])

In [41]:
# Serialise the updated geometry document to the output path.
with open(OUTPUT_US_JSON_LOCATION, 'wt') as output_file:
    json.dump(data, output_file)