**Quick notebook to reverse geocode addresses for all U.S. counties (FIPS codes as of 2015)**
Note: Since this is purely to get a geographic centroid for address identification, and U.S. Census updates to county FIPS codes have involved mostly name changes as opposed to geographic changes, it is OK to use old data.

In [1]:
#re-imports modules that have changed
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append("../")

In [3]:
import os
import logging
import pandas as pd
import requests
import tqdm
import time
from src.candidate_fetcher import ReverseGeocode, VoterInfo 
from src.utils import CloudStorageClient 


In [4]:
# Show INFO-level (and above) messages from the root logger in the notebook
logging.basicConfig(level=logging.INFO)

In [5]:
def check_result(response):
    """Check whether the Civic Info lookup returns polling locations for an address.

    Args:
        response: The address to look up. The parameter keeps its original
            name so existing callers are unaffected; it is passed straight
            to ``VoterInfo.fetch_voter_info``.

    Returns:
        The address itself when the lookup result has a non-empty
        ``'pollingLocations'`` entry, ``0`` when that entry is empty, and
        the string ``"error"`` when the lookup (or reading the entry) raises.
    """
    # Use the argument rather than whatever global `address` happens to be
    # lying around in the notebook namespace.
    address = response
    civic = VoterInfo()
    try:
        result = civic.fetch_voter_info(address)
        if result['pollingLocations']:
            return address
        return 0
    except Exception as e:
        # Keep the best-effort "error" sentinel, but leave a trace of the cause.
        logging.warning(f"Voter info lookup failed for {address!r}: {e!r}")
        return "error"

In [6]:
# Read the county gazetteer file: tab-separated text, decoded as Latin-1
filepath = "../data/Gaz_counties_national.txt"
data = pd.read_csv(
    filepath,
    sep='\t',
    encoding="latin-1",
)

In [7]:
# Tidy the raw table
# 1. Remove stray whitespace around the header names
data.columns = data.columns.str.strip()

# 2. Store GEOID as text, left-padded with zeros to five characters
data['GEOID'] = data['GEOID'].astype(str).str.zfill(5)

# 3. Keep only the columns needed downstream and give them short names
column_names = {
    'USPS': 'state_abbr',
    'GEOID': 'fips',
    'NAME': 'name',
    'INTPTLAT': 'lat',
    'INTPTLONG': 'long',
}
data = data[list(column_names)].rename(columns=column_names)

In [10]:
# Get formatted addresses
# For every county, reverse geocode its lat/long and store the list of
# 'formatted_address' strings found under the response's 'results' key.
geo = ReverseGeocode()
data['addresses'] = None
for index, row in data.iterrows():
    try:
        response = geo.reverse_geocode(row['lat'], row['long'])
        formats = [item['formatted_address'] for item in response['results']]
    except Exception as e:
        # Best effort: a failed lookup still yields an empty list, but it is
        # logged so it can be told apart from a genuine "no results" answer.
        logging.warning(f"Reverse geocoding failed for fips {row['fips']}: {e!r}")
        formats = []
    data.at[index, 'addresses'] = formats

In [11]:
# Choose one address per county: the first formatted address (most likely
# match). The complete list stays available in the 'addresses' column.
data['address'] = None
for idx, candidates in data['addresses'].items():
    # Counties with no geocoding results keep None as their address
    data.at[idx, 'address'] = candidates[0] if candidates else None

In [12]:
# Identify Unnamed roads for future cleanup
# Note adding a more standard address does not appear to affect the results 
# Tested: 'Unnamed Road, Loxley, AL 36551, USA' vs '23130 McAuliffe Drive, Robertsdale, AL 36576' with election id '4898'

unnamed_road = data.loc[data['address'].str.contains('Unnamed Road', na=False)]
logging.info(f"Number of unnamed roads: {len(unnamed_road)}")

# Clean Unnamed roads where 2nd formatted address is valid four-part address
for index, row in unnamed_road.iterrows():
    addresses = row['addresses']
    # Some counties may have only a single formatted address; there is no
    # alternative to fall back on, so leave the row untouched.
    if len(addresses) < 2:
        continue
    address = addresses[1]
    # Four comma-separated parts is treated as a full street-level address
    if len(address.split(",")) == 4:
        data.at[index, 'address'] = address

# Output remaining Unnamed roads to .csv
unnamed_road = data.loc[data['address'].str.contains('Unnamed Road', na=False)]
logging.info(f"Number of unnamed roads remaining: {len(unnamed_road)}")
unnamed_road.to_csv("../output/unnamed_road.csv", index=False, encoding='utf-8')

INFO:root:Number of unnamed roads: 362
INFO:root:Number of unnamed roads remaining: 313


In [13]:
# Check that address is valid for Civic Info API 
# Each county's own address is sent to the API and the raw result is stored
# in the 'result' column; on failure the exception is stored instead.
data['result'] = None
for index, row in tqdm.tqdm(data.iterrows(), total=len(data)):
    try:
        civic = VoterInfo()
        # Look up this row's address (not a variable left over from an
        # earlier cell, which would check the same address for every county).
        result = civic.fetch_voter_info(row['address'], election_id=None)
        data.at[index, 'result'] = result
        # Pause between successful requests (presumably to respect the API
        # rate limit; confirm against the service quota).
        time.sleep(1)
    except Exception as e:
        data.at[index, 'result'] = {'Failed': e}

3221it [1:11:00,  1.32s/it]


In [15]:
# Write the processed county table to a local UTF-8 CSV, without the index column
filepath = '../output/locales_county.csv'
data.to_csv(
    filepath,
    index=False,
    encoding='utf-8',
)

In [16]:
# Upload the exported CSV via the project's cloud storage helper.
# NOTE(review): 'address_locales' looks like the bucket name and the last
# argument the destination object name; confirm against CloudStorageClient.
destination_name = 'addresses_county.csv'
client = CloudStorageClient()
client.upload_file(filepath, 'address_locales', destination_name)

INFO:root:Connected to Google Cloud Storage.
INFO:root:Loaded file ../output/locales_county.csv to addresses_county.csv


In [17]:
# Final summary: number of county rows in the finished table
summary = f"Address information retrieved for {len(data)} counties."
logging.info(summary)

INFO:root:Address information retrieved for 3221 counties.
