# Reverse Geocoding
Phase 2, 2018

The module censusgeocode was used to reverse geocode coordinates to census tracts and blocks. The dataframe is partitioned to allow for processing and saving in chunks.

Next steps:
- parallel processing to speed up geocoding

In [1]:
# Import modules
import os
import pandas as pd
import numpy as np
import censusgeocode as cg
from math import ceil
from datetime import datetime
import tqdm

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Establish file paths
ROOT_dir = os.path.abspath('')
dataFolder = ROOT_dir + '/data'
outputFolder = ROOT_dir + '/phase2_output/geocodes'

In [2]:
# Load CSV in to a dataframe
raw = pd.read_csv(dataFolder + '/2009-2014_RedCross_DisasterCases.csv',
                 encoding = "ISO-8859-1",
                 error_bad_lines = False)

In [3]:
# Create dataframe for coordinate & census data
coords_df = raw[['esri_longitude_x', 'esri_latitude_x']]
coords_df = coords_df.rename(columns = {'esri_longitude_x':'long', 'esri_latitude_x':'lat'})
coords_df['census_tract_geoid'] = np.nan
coords_df['census_tract_name'] = np.nan
coords_df['census_tract_state'] = np.nan
coords_df['census_tract'] = np.nan
coords_df['census_block_geoid'] = np.nan
coords_df['census_block_tract'] = np.nan
coords_df['census_block_name'] = np.nan
coords_df['census_block'] = np.nan
coords_df.head()

Unnamed: 0,long,lat,census_tract_geoid,census_tract_name,census_tract_state,census_tract,census_block_geoid,census_block_tract,census_block_name,census_block
0,-86.765284,34.810583,,,,,,,,
1,-86.147937,31.764383,,,,,,,,
2,-85.622874,32.807411,,,,,,,,
3,-86.396463,34.61706,,,,,,,,
4,-86.777434,33.738204,,,,,,,,


In [4]:
# Partition data into chunks to allow for processing & saving in chunks

num_chunks = 10000
idx = np.where(coords_df['long'].notnull() & coords_df['lat'].notnull())[0]
chunk_size = ceil(coords_df.shape[0] / float(num_chunks))
chunk_idx = []
for i in range(num_chunks + 2):
    if i == 0:
        chunk_idx.append([0,chunk_size])
    else:
        temp = range(max(chunk_idx[-1]) + 1, max(chunk_idx[-1]) + chunk_size)
        if max(temp) > coords_df.shape[0]:
            chunk_idx.append([min(temp),coords_df.shape[0]])
        else:
            chunk_idx.append([min(temp),max(temp)])

In [None]:
# Reverse Geocode using censusgeocode module

start_time = datetime.now()
passed = []
failed = []
for chunk in tqdm.tqdm(range(len(chunk_idx))):
    idx_list = list(range(chunk_idx[chunk][0], chunk_idx[chunk][1] + 1))
    for idx in idx_list:
        try:
            results = cg.coordinates(x = coords_df.loc[idx,'long'], y = coords_df.loc[idx,'lat'])
            coords_df.loc[idx,'census_tract_geoid'] = results['Census Tracts'][0]['GEOID']
            coords_df.loc[idx,'census_tract_name'] = results['Census Tracts'][0]['NAME']
            coords_df.loc[idx,'census_tract_state'] = results['Census Tracts'][0]['STATE']
            coords_df.loc[idx,'census_tract'] = results['Census Tracts'][0]['TRACT']
            coords_df.loc[idx,'census_block_geoid'] = results['2010 Census Blocks'][0]['GEOID']
            coords_df.loc[idx,'census_block_tract'] = results['2010 Census Blocks'][0]['TRACT']
            coords_df.loc[idx,'census_block_name'] = results['2010 Census Blocks'][0]['NAME']
            coords_df.loc[idx,'census_block'] = results['2010 Census Blocks'][0]['BLOCK']
            passed.append(idx)
        except:
            failed.append(idx)
    
    # Overwrite CSV
    coords_df.to_csv(outputFolder + '/geocodes.csv')

  3%|▎         | 282/10002 [4:17:43<148:03:21, 54.84s/it]