In [1]:
# This is based off of http://localhost:8820/notebooks/projects/demographics/Voting-2018-AW8.ipynb
# and is intended to give access to the voter map from other notebooks.

In [None]:
import numpy as np
import json
import pickle

In [2]:
# Key is voter ID; value is {DOB:str, reg_date:str, 
#    reg_info:{month_str: {status: , party: }}, addresses:{month_str:address},
#    vote_info:{vote_str: {party: , 'how': }}}
# vote_str is in the form GN_11_06_12 or PR_05_15_18 or PR_05_17_11
# In the case of vote_info records added from fixed width files,
#    'party' is set to be earliest known party and 'how' is None
# In the case of those added from later files, 'how' is processed directly:
#    'AP' indicates the voter cast their ballot at the polls
#    'AB' indicates the voter cast an absentee ballot
#    'P' indicates that the voter cast a provisional ballot
# Added by voter_process_address_history:
#   'addr_arr':[{'address': , 'date': , 'census_block': , 'latlon': },]

# Initialize voter_map as empty if it doesn't already exist
try:
    voter_map
except NameError:
    # Only an undefined name means "not loaded yet"; anything else
    # (e.g. KeyboardInterrupt) should propagate instead of resetting the map.
    voter_map = {}

In [16]:
# Try to load voter_map 
#   voter_map_17_18_18.pickle has 'reg_date' as a string
#   voter_map_17_18_18_b.pickle has 'reg_date' converted to datetime
#   voter_map_09_17_18_18_c.pickle has data from 2009-07
#   voter_map_09_17_18_18_d.pickle has census_block info (full for 15213, partial for all, need to do TODO addresses)
#   voter_map_09_17_18_18_e.pickle has census_block info for all
#   voter_map_09_17_18_18_f.pickle has census_block latlon info for all
#   voter_map_09_17_18_18_g.pickle has fixups from processing of 2005-02 (delete pre-1900 DOB, deal with Date_Registered < DOB+18yrs)
#   voter_map_05_09_13_17_18_18_18_h.pickle has voter history
#   voter_map_05_09_13_17_18_18_18_i.pickle has census_block addresses resolutions for 2005-02 and 2013-02
#   voter_map_05_09_13_17_18_18_18_j.pickle has initial fixup from 2005-02 and 2009-07 from running fixup_addr_dates
#   voter_map_05_09_13_17_18_18_18_k.pickle has remaining fixups from 2013-02 through 2018-10 from running fixup_addr_dates
#   voter_map_05_09_13_17_18_18_18_l.pickle has more fixups from running voter_crosscheck_address_history_array and regenerating census block dots

# Path template for voter_map pickles; '{}' is filled in with the version suffix
# via str.format() in save_voter_map / load_voter_map.
voter_map_file_path_t = 'voters/voter_map_{}.pickle'
# Suffix load_voter_map falls back to when it is called with an empty/None suffix.
default_vm_suffix = '05_09_13_17_18_18_18_l'

def save_voter_map(suffix):
    """Pickle the global voter_map to the path built from voter_map_file_path_t.

    suffix -- version suffix substituted into voter_map_file_path_t.  It is
              required: when it is empty/None an error is printed and nothing
              is written (previously the function went on to write a file
              with an empty or 'None' suffix).
    """
    global voter_map

    if not suffix:
        print("ERROR: need to specify suffix to save voter_map.  Default load suffix is %s"%(default_vm_suffix))
        return
    voter_map_file_path = voter_map_file_path_t.format(suffix)

    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print('Saving voter_map to %s'%(voter_map_file_path))

    # Save out voter_map
    with open(voter_map_file_path, 'wb') as handle:
        pickle.dump(voter_map, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_voter_map(suffix):
    """Replace the global voter_map with the contents of a saved pickle.

    suffix -- version suffix substituted into voter_map_file_path_t; when
              empty/None, default_vm_suffix is used instead.

    Raises whatever open() / pickle.load() raise if the file is missing or
    unreadable.
    """
    global voter_map

    if not suffix:
        suffix = default_vm_suffix
    voter_map_file_path = voter_map_file_path_t.format(suffix)

    # Single-argument print(...) behaves the same on Python 2 and Python 3
    print('Loading voter_map from %s'%(voter_map_file_path))

    # Load in voter_map
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load pickles that were produced locally by save_voter_map.
    with open(voter_map_file_path, 'rb') as handle:
        voter_map = pickle.load(handle)

In [11]:
# Key is vote_str, value is {'date': datetime, 'month_strs': set(month_str),'date_type': }
# 'date_type' is 'exact' if we actually knew the date, or 'approx' if we're just guessing
# strings generated from fixed width files will be created as 'approx' dates; those
# created by newer files will be 'exact'.  

# Initialize vote_map as empty if it doesn't already exist
try:
    vote_map
except NameError:
    # Only "name not defined yet" should trigger initialization
    vote_map = {}

# vote_arr is an ordered list of tuples in time order [(vs1, date1), (vs2, date2)...]
# The invariant is that when an item is added to vote_map, vote_arr must be regenerated
# appropriately
try:
    vote_arr
except NameError:
    vote_arr = []

# This is an array mapping from the index of Vote_History strings into 
# vote_str strings.  Earlier than the first exact date strings these are just 
# PR_YYYY and GN_YYYY.  After the first exact date strings, these are PR_MM_DD_YY
try:
    vote_history_index_to_vote_str
except NameError:
    vote_history_index_to_vote_str = []

In [9]:
# Load/save for vote_info elements
#   vote_info_05_09_13_17_18_18_18_a.pickle is the initial save after loading vote info from 2005, 2013, and 2018

# Path template for the vote_info pickles.  The first '{}' is the element name
# ('map', 'arr' or 'index') and the second is the version suffix; both are
# filled in via str.format() in save_vote_info / load_vote_info.
vote_info_file_path_t = 'voters/vote_info_{}_{}.pickle'
# Suffix load_vote_info falls back to when it is called with an empty/None suffix.
default_vi_suffix = '05_09_13_17_18_18_18_a'

def save_vote_info(suffix):
    """Pickle vote_map, vote_arr and vote_history_index_to_vote_str to disk.

    Each global is written to its own file, whose path is built from
    vote_info_file_path_t with the element name ('map', 'arr', 'index') and
    the given suffix.

    suffix -- version suffix for the file names.  It is required: when it is
              empty/None an error is printed and nothing is written
              (previously files with an empty or 'None' suffix were written).
    """
    global vote_map
    global vote_arr
    global vote_history_index_to_vote_str
    if not suffix:
        print("ERROR: need to specify suffix to save vote_info.  Default load suffix is %s"%(default_vi_suffix))
        return

    # Announce every destination first, then write, as before
    vote_info_file_path={}
    for elt in ['map','arr','index']:
        vote_info_file_path[elt] = vote_info_file_path_t.format(elt,suffix)
        print('Saving vote_%s to %s'%(elt, vote_info_file_path[elt]))

    # Save out vote_info elements
    to_save = [('map', vote_map),
               ('arr', vote_arr),
               ('index', vote_history_index_to_vote_str)]
    for elt, obj in to_save:
        with open(vote_info_file_path[elt], 'wb') as handle:
            pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_vote_info(suffix):
    """Replace vote_map, vote_arr and vote_history_index_to_vote_str from disk.

    Paths are built from vote_info_file_path_t with the element name
    ('map', 'arr', 'index') and the suffix.

    suffix -- version suffix for the file names; when empty/None,
              default_vi_suffix is used instead.

    Raises whatever open() / pickle.load() raise.  The three globals are
    assigned one after another, so a failure part-way leaves the earlier
    ones already replaced.
    """
    global vote_map
    global vote_arr
    global vote_history_index_to_vote_str

    if not suffix:
        suffix = default_vi_suffix

    vote_info_file_path={}
    for elt in ['map','arr','index']:
        vote_info_file_path[elt] = vote_info_file_path_t.format(elt,suffix)
        # Single-argument print(...) behaves the same on Python 2 and Python 3
        print('Loading vote_%s from %s'%(elt, vote_info_file_path[elt]))

    # Load in vote_info elements
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load pickles that were produced locally by save_vote_info.
    with open(vote_info_file_path['map'], 'rb') as handle:
        vote_map = pickle.load(handle)
    with open(vote_info_file_path['arr'], 'rb') as handle:
        vote_arr = pickle.load(handle)
    with open(vote_info_file_path['index'], 'rb') as handle:
        vote_history_index_to_vote_str = pickle.load(handle)

In [5]:
# Any time vote_map is changed in a way that affects dates this should be called
# so we always have a sorted list of votes
def regenerate_vote_arr():
    """Rebuild the global vote_arr as vote_map's items ordered by their 'date'."""
    global vote_arr
    global vote_map

    def _entry_date(item):
        # item is a (vote_str, record) pair; order by the record's date
        return item[1]['date']

    vote_arr = sorted(vote_map.items(), key=_entry_date)

# Vote strings are in the form TT_MM_DD_YY, parse the date
def vote_str_to_date(vote_str):
    """Parse the MM_DD_YY portion of a TT_MM_DD_YY vote string into a datetime.

    The first three characters (type code plus underscore) are skipped.
    Raises ValueError (from strptime) if the remainder is not MM_DD_YY.
    """
    date_part = vote_str[3:]
    return datetime.datetime.strptime(date_part, '%m_%d_%y')

# Given a list of column names return a list of vote_str values included in the dataframe
# and populate vote_map with the date if we don't already have it
def columns_to_vote_strs(month_str, column_list):
    """Pick the vote_str column names out of column_list and record them in vote_map.

    A column counts as a vote_str when its name contains 'GN_', 'PR_' or
    'SP_' and does not contain '_VM'.  New vote_strs get a vote_map entry
    with an 'exact' date parsed from the name; known ones just have
    month_str added to their 'month_strs' set.  vote_arr is regenerated
    before returning.

    Returns the list of vote_str names, in column order.
    """
    global vote_arr
    global vote_map

    def _is_vote_column(name):
        has_vote_prefix = 'GN_' in name or 'PR_' in name or 'SP_' in name
        return has_vote_prefix and '_VM' not in name

    # Process any new style column names
    vote_str_list = [name for name in column_list if _is_vote_column(name)]

    for vote_str in vote_str_list:
        if vote_str in vote_map:
            vote_map[vote_str]['month_strs'].add(month_str)
            continue
        vote_map[vote_str] = {
            'date': vote_str_to_date(vote_str),
            'month_strs': set([month_str]),
            'date_type': 'exact',
        }

    regenerate_vote_arr()
    return vote_str_list

# Old style Vote_History from fixed width files start with the primary of 1983 but only have
# 2 bits of info about each election: '0' = not registered, '1' = voted, '2' = didn't vote
# we don't know exact dates or party affiliation.

# Add entries into vote_map for the longest of these, which is currently 2009.
# Before calling this, add '2013-02', or whatever the oldest of the new style 
# files is, so vote_map contains exact entries for the earliest dates it can

# See the file VoterThankYou-12Dec2007 in https://drive.google.com/drive/folders/1xVsl029scwhnVnXa-v6XpFufjOavNrhe
# for definitions on the format of the Vote_History string
def _lookup_exact_election(prefix, year, known_names):
    """Return the one name in known_names matching <prefix>_MM_DD_<yy>, else None.

    None is returned both when nothing matches and when the match is
    ambiguous (more than one name for that prefix and two-digit year).
    """
    yy = ("%d"%(year))[2:4]
    # Raw string so that \d is a regex digit class, not a string escape
    pattern = r'%s_\d\d_\d\d_%s'%(prefix, yy)
    matches = [name for name in known_names if re.search(pattern, name)]
    if len(matches) == 1:
        return matches[0]
    return None


def _record_history_election(vote_str, date, date_type, month_str):
    """Create the vote_map entry for vote_str, or tag an existing one with month_str."""
    if vote_str not in vote_map:
        vote_map[vote_str] = {'date': date,
                              'month_strs': set([month_str]),
                              'date_type': date_type
                             }
    else:
        vote_map[vote_str]['month_strs'].add(month_str)


def add_vote_history_map_entries():
    """Populate vote_history_index_to_vote_str and vote_map for old-style history.

    For every year from 1983 up to (not including) 2010, two entries are
    appended to vote_history_index_to_vote_str: the primary, then the general
    election.  For years from 2004 on, if vote_map already holds exactly one
    new-style PR_MM_DD_YY / GN_MM_DD_YY name for that year, that name and its
    date are used; otherwise the name is PR_YYYY / GN_YYYY with an 'approx'
    date (5/16 for primaries, 11/6 for general elections).  Each name is
    added to vote_map, or tagged with month_str '2005-02' if already present,
    and vote_arr is regenerated at the end.

    NOTE: every call appends to vote_history_index_to_vote_str, so calling
    this more than once on the same list duplicates the index entries.
    """
    global vote_map
    global vote_history_index_to_vote_str

    first_new_style_year = 2004
    last_new_style_year = 2010
    history_month_str = '2005-02'
    new_style_names = [name for name in vote_map.keys()
                       if 'GN_' in name or 'PR_' in name]

    for year in range(1983, last_new_style_year):
        # Do PR then GN for this year; each takes one slot in the index
        for prefix, approx_month_day in (('PR', '5/16'), ('GN', '11/6')):
            # Estimate primaries as 5/16/YYYY and general elections as
            # 11/6/YYYY if we don't know any better
            vote_str = "%s_%4d"%(prefix, year)
            vote_date = dateparser.parse("%s/%d"%(approx_month_day, year))
            date_type = 'approx'

            if(year>=first_new_style_year):
                # Try to retrieve the exact date from vote_map
                exact_name = _lookup_exact_election(prefix, year, new_style_names)
                if exact_name is not None:
                    vote_str = exact_name
                    vote_date = vote_map[exact_name]['date']
                    date_type = vote_map[exact_name]['date_type']

            vote_history_index_to_vote_str.append(vote_str)
            _record_history_election(vote_str, vote_date, date_type, history_month_str)

    # Update vote_arr to make sure new entries are in there and in order
    regenerate_vote_arr()


#####################################################
# Individual voter vote_info support
# Given the existing vote_info_map for a voter, add an entry for 
# a given vote_str, party, and how.  
def add_vote_record(vote_info_map, vote_str, party, how):
    """Record a vote in vote_info_map, keeping the more informative entry.

    If vote_str has no entry yet, one is created from party and how.  If an
    entry exists, it is overwritten only when the new record has a truthy
    'how' and the stored one does not; otherwise the new record is treated
    as a duplicate and ignored.  vote_info_map is modified in place.
    """
    if vote_str not in vote_info_map:
        # First time we see this vote for the voter
        vote_info_map[vote_str] = {'party': party, 'how': how}
        return

    existing = vote_info_map[vote_str]
    if how and not existing['how']:
        # The new info is better, use it
        existing['how'] = how
        existing['party'] = party
        

# Find all the '1's in the history str and translate each into a vote record
# for the vote_str at the same index in vote_history_index_to_vote_str
def process_vote_history(vote_info_map, history_str, party):
    """Add a vote record for every '1' in an old-style Vote_History string.

    vote_info_map -- the voter's vote_info map; updated in place and returned
    history_str   -- one character per election; a '1' at index k means the
                     voter voted in vote_history_index_to_vote_str[k]
    party         -- party to store on each record ('how' is stored as None)

    Scanning stops, with a warning, at the first '1' whose index has no
    (non-empty) entry in vote_history_index_to_vote_str.
    """
    known_count = len(vote_history_index_to_vote_str)

    # Walk from one '1' to the next; find() returns -1 when there are no more
    idx = history_str.find('1')
    while idx != -1:
        vote_str = vote_history_index_to_vote_str[idx] if idx < known_count else None
        if not vote_str:
            print("WARNING: No entry in vote_history_index_to_vote_str for index %d"% (idx))
            break
        # Found a vote that this voter participated in
        add_vote_record(vote_info_map, vote_str, party, None)
        idx = history_str.find('1', idx + 1)

    return(vote_info_map)
                
def process_vote_columns(vote_info_map, df, i, month_str=None):
    """Add vote records for row i of a new-style voter dataframe.

    vote_info_map -- the voter's vote_info map; updated in place and returned
    df            -- dataframe whose vote columns are named by vote_str, each
                     with a companion '<vote_str>_VM' column
    i             -- positional row index (used with .iloc)
    month_str     -- month_str passed on to columns_to_vote_strs.  The
                     original code silently read a module-level global named
                     month_str; that is still the fallback when this
                     argument is left as None, so existing three-argument
                     callers behave as before.

    A non-NA value in a vote column is stored as the party, and the value of
    the '_VM' column as 'how'.
    """
    if month_str is None:
        try:
            month_str = globals()['month_str']
        except KeyError:
            # Same failure the original gave when the global was never set
            raise NameError("name 'month_str' is not defined")

    for vote_str in columns_to_vote_strs(month_str, list(df)):
        party = df[vote_str].iloc[i]
        if pd.isna(party):
            # No value means they did not vote in this one
            continue
        # They voted in this one, find out how
        how = df["%s_VM"%(vote_str)].iloc[i]
        add_vote_record(vote_info_map, vote_str, party, how)

    return(vote_info_map)

def get_next_vote_str_rec_after_date(vote_info_map, cmp_date):
    """Return the first vote_arr item strictly after cmp_date that this voter has.

    vote_arr is scanned in order; an item qualifies when its vote_str is a
    key of vote_info_map and its 'date' is greater than cmp_date.  Returns
    the (vote_str, record) item, or None when cmp_date is falsy or nothing
    qualifies.
    """
    if cmp_date:
        for item in vote_arr:
            if item[0] in vote_info_map and item[1]["date"] > cmp_date:
                return item
    return None

def get_vote_str_rec_on_or_after_date(vote_info_map, cmp_date):
    """Return the first vote_arr item on or after cmp_date that this voter has.

    vote_arr is scanned in order; an item qualifies when its vote_str is a
    key of vote_info_map and its 'date' is >= cmp_date.  Returns the
    (vote_str, record) item, or None if nothing qualifies.

    NOTE(review): unlike get_next_vote_str_rec_after_date there is no guard
    for a falsy cmp_date here -- confirm callers never pass None.
    """
    for item in vote_arr:
        if item[0] in vote_info_map and item[1]["date"] >= cmp_date:
            return item
    return None

In [6]:
# Given a voter ID, and address, find and return a tuple of (i,addr_rec) if there
# is an entry in addr_arr matching the address.  If no match, return None
# TODO: What if there are multiple entries with the same address?
def find_matching_addr_rec(vid, addr):
    """Look up the addr_arr entry of voter vid whose 'address' equals addr.

    Returns (index, addr_rec) for the first matching entry, or None when the
    voter is unknown, has no 'addr_arr', or no entry matches.
    """
    global voter_map

    # Nothing to search if the voter or its address array is missing
    if vid not in voter_map or 'addr_arr' not in voter_map[vid]:
        return None

    for pos, rec in enumerate(voter_map[vid]['addr_arr']):
        if addr == rec['address']:
            return (pos, rec)
    return None


In [12]:
# Keep track of what month_strs are in the current voter_map
try:
    active_month_str_set
    active_month_str_arr
except NameError:
    # Either name being undefined (re)initializes both, so they stay in sync
    active_month_str_set=set(['2005-02', '2009-07', '2013-02', '2017-11', '2018-03', '2018-08', '2018-10'])
    active_month_str_arr=sorted(active_month_str_set)

# Save/load voter_map

# Use  random points in census blocks for voter IDs

In [22]:
# block_id_use_map is expected to be defined before this notebook is run;
# only warn here, nothing is created.
try:
    block_id_use_map
except NameError:
    print("WARNING: block_id_use_map not defined.  exec_ipynb('block_points.ipynb')")

In [18]:
def block_get_voter_coords(census_block, vid):
    """Return the Point assigned to voter vid inside census_block.

    Each census block has an entry in block_id_use_map holding 'next' (the
    next unassigned point index), 'numpoints', and one key per voter ID
    mapping to that voter's point index.  The block's points live in
    block_points_map and are loaded with block_points() on demand.

    A voter already registered in the block gets the same point as before.
    Otherwise the 'next' index is assigned; when the block has run out of
    points a warning is printed once per block and assignment wraps around
    to index 0.

    Returns Point(lon, lat).  Lookups into geoid2idx and the points array
    raise (KeyError / IndexError) for unknown blocks or empty point lists.
    """
    global block_id_use_map

    is_new_block = census_block not in block_id_use_map
    if is_new_block or census_block not in block_points_map:
        # Either the block has never been seen, or block_id_use_map was
        # restored from a saved file without its points.  In both cases
        # convert from GEOID10 to the block index and fetch the points.
        block_idx = geoid2idx[census_block]
        point_arr = block_points(block_idx)
        if is_new_block:
            block_id_use_map[census_block]={'next':0, 'numpoints':len(point_arr)}
        block_points_map[census_block]=point_arr

    block_entry = block_id_use_map[census_block]
    if(vid in block_entry):
        # Already assigned, just use the same index as before
        point_idx = block_entry[vid]
    else:
        # Not yet assigned, take the next available and increment 'next'
        point_idx = block_entry['next']
        if(point_idx > block_entry['numpoints']-1):
            # Overflow: warn once per block
            if(census_block not in block_id_overflow_set):
                block_id_overflow_set.add(census_block)
                print("WARNING: overflow in census block %s" % (census_block))
            # For now, just start reassigning numbers back at 0
            point_idx = 0
        block_entry['next'] = point_idx + 1
        block_entry[vid] = point_idx

    # Retrieve the webmercator point from the points array.
    # Convert it to lat lon and create a Point to return.
    # Note that WebMercatorToLonLat(x,y) returns [lon, lat]
    # the args to create a Point are also (lon, lat)
    point_xy = block_points_map[census_block][point_idx]
    ll_arr = WebMercatorToLonLat(point_xy['x'],point_xy['y'])
    return Point(ll_arr[0], ll_arr[1])

def voter_get_coords_for_addr_history(vid, force_update):
    """Fill in 'latlon' for each entry of a voter's addr_arr.

    vid          -- voter ID to look up in voter_map
    force_update -- when false, entries that already have 'latlon' are left
                    alone; when true they are recomputed

    Entries without a 'census_block' key are skipped with a warning, and
    entries whose census_block is None are skipped silently.  Coordinates
    come from block_get_voter_coords; any exception it raises is reported
    and the remaining entries are still processed.  Returns None.
    """
    global voter_map

    if(vid not in voter_map):
        print("WARNING: %s not in voter_map" % (vid))
        return
    vrec = voter_map[vid]
    if('addr_arr' not in vrec):
        print("WARNING: %s missing addr_arr in voter_map, re-run voter_process_address_history_array" % (vid))
        return

    for j, addr_rec in enumerate(vrec['addr_arr']):
        if('latlon' in addr_rec and not force_update):
            continue
        if('census_block' not in addr_rec):
            # The original referenced an undefined name here and raised
            # NameError; report the record's own address instead.
            print("WARNING %s: '%s' lacks 'census_block', skipping" % (vid, addr_rec.get('address')))
            continue
        # census_block might be None, in which case don't get coords
        if(addr_rec['census_block'] is None):
            continue
        try:
            addr_rec['latlon'] = block_get_voter_coords(addr_rec['census_block'], vid)
        except Exception as e:
            # Best effort: report and carry on with the remaining addresses
            print("ERROR: Exception getting coords for %s[%d]='%s': %s"%(vid,j,addr_rec['census_block'],e))

def voter_get_coords_for_addr_history_array(vid_arr, force_update):
    """Run voter_get_coords_for_addr_history over every voter ID in vid_arr.

    vid_arr      -- sequence of voter IDs
    force_update -- passed through to voter_get_coords_for_addr_history

    Prints a progress line every chunk_size voters with the time taken by
    that chunk, and a total at the end.  Note that the final message says
    'addresses' but the count it reports is len(vid_arr), i.e. voter IDs.
    """
    start=arrow.now()
    chunk_start_time=start
    chunk_size=10000

    for i, vid in enumerate(vid_arr):
        voter_get_coords_for_addr_history(vid, force_update)

        # Handle periodic debug message
        if(i>0 and (i%chunk_size)==0):
            print("%d-%d: processing %r, %s time elapsed" %(i-(chunk_size-1), i, vid,arrow.now()-chunk_start_time))
            chunk_start_time=arrow.now()

    end=arrow.now()
    print("Processing took %s for %d addresses" % (str(end-start), len(vid_arr)))