In [73]:
import csv, json, os, re, shutil
import subprocess, sys, threading, time, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook inside this module's globals.

    Parameters:
        filename_or_url: local path of an .ipynb file, or an http(s) URL
            pointing at one.

    All code cells are joined with newlines and executed as one program, so
    the functions and variables they define become globals here.

    WARNING(review): this runs arbitrary code from the notebook; only pass
    files or URLs that you trust.
    """
    if re.match(r'https?:', filename_or_url):
        stream = urllib2.urlopen(filename_or_url)
    else:
        stream = open(filename_or_url)
    # Release the file / HTTP handle even when reading fails.
    try:
        nb = stream.read()
    finally:
        stream.close()
    notebook = json.loads(nb)

    # Notebook format v4 keeps cells at the top level under 'source'; older
    # formats nest them in the first worksheet under 'input'.  Treat any later
    # format like v4 rather than falling back to the legacy layout.
    if notebook['nbformat'] >= 4:
        cells = notebook['cells']
        source_key = 'source'
    else:
        cells = notebook['worksheets'][0]['cells']
        source_key = 'input'

    code = '\n'.join([''.join(cell[source_key])
                      for cell in cells if cell['cell_type'] == 'code'])
    # The parenthesised form is equivalent to `exec code in globals()` on
    # Python 2 and is also valid on Python 3.
    exec(code, globals())

# Run the code cells of timelapse-utilities.ipynb in this notebook's globals.
# NOTE(review): presumably this is where names used below without a visible
# definition come from (download_file, state_names, query_psql, numpy,
# SimpleThreadPoolExecutor, ...) -- confirm against that notebook.
exec_ipynb('timelapse-utilities.ipynb')


LEHD Residence and Workplace Area Characteristic data (RAC and WAC)
-------------------------------------------------------------------

From https://lehd.ces.census.gov/data/lodes/LODES7/LODESTechDoc7.2.pdf:

<img src="LEHD-RAC-info.png">

In [55]:
# Segment codes that appear in the RAC file names; one file is downloaded and
# processed per code.
# NOTE(review): presumably S000 = all jobs, SA* = age, SE* = earnings and
# SI* = industry segments -- confirm against the LODES technical document.
lodes7_rac_parts = ['S000', 'SA01', 'SA02', 'SA03', 'SE01', 'SE02', 'SE03', 'SI01', 'SI02', 'SI03']        

def download_lodes7_rac(state, jt, year):
    """Download every RAC segment file for one state, job type and year.

    Parameters:
        state: state abbreviation as used in the LODES paths (e.g. 'ak').
        jt: job-type code used in the file name (e.g. 'JT01').
        year: data year used in the file name.

    One file per entry of ``lodes7_rac_parts`` is handed to ``download_file``
    and stored as capture/lodes7/<state>/<filename>.
    """
    for part in lodes7_rac_parts:
        filename = '{state}_rac_{part}_{jt}_{year}.csv.gz'.format(
            state=state, part=part, jt=jt, year=year)
        src = 'http://lehd.ces.census.gov/data/lodes/LODES7/{state}/rac/{filename}'.format(
            state=state, filename=filename)
        dest = 'capture/lodes7/{state}/{filename}'.format(
            state=state, filename=filename)
        download_file(src, dest)

# Download all RAC files, running up to 4 download_lodes7_rac calls at a time.
# NOTE(review): SimpleThreadPoolExecutor and state_names are not defined in
# this notebook; presumably they come from timelapse-utilities.ipynb.
pool = SimpleThreadPoolExecutor(4)

# One task per (state, job type, year); each task fetches every segment file.
for state in state_names:
    for jt in ['JT01']:
        for year in range(2002, 2015):  # years 2002..2014 inclusive
            pool.submit(download_lodes7_rac, state, jt, year)
            
pool.shutdown()
# Bare None as the last expression keeps the cell from displaying a result.
None

capture/lodes7/ak/ak_rac_S000_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SA01_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SA02_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SA03_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SE01_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SE02_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SE03_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SI01_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_S000_JT01_2003.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SA01_JT01_2003.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SA02_JT01_2003.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SI02_JT01_2002.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SA03_JT01_2003.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SE01_JT01_2003.csv.gz already downloaded
capture/lodes7/ak/ak_rac_SE02_JT01_2003.csv.gz already downloaded
capture/lo

## Map geoid2010 to blockidx2010

In [56]:
# Lookup table from a block's geoid2010 to its blockidx2010, built from the
# (geoid2010, blockidx2010) rows returned by the query.
blockidx2010_from_geoid2010 = dict(query_psql('SELECT geoid2010, blockidx2010 FROM census2010_block_idxs'))
# Length of every per-block column array: one more than the number of blocks.
# NOTE(review): presumably blockidx2010 is 1-based, so the extra slot makes the
# largest index a valid array position -- confirm against census2010_block_idxs.
colsize = len(blockidx2010_from_geoid2010) + 1

Execution of SELECT geoid2010, blockidx2010 FROM census2010_block_idxs
took 8.73598 seconds and returned 11078297 rows


## Create columns

Filenames like: columncache/lodes2011/rac_jt01_s000.numpy


In [74]:
def open_gz(filename):
    """Return a readable stream with the decompressed contents of `filename`.

    Decompression is delegated to an external `zcat` process; the object
    returned is that process's stdout pipe.
    """
    zcat_command = ["zcat", filename]
    zcat_process = subprocess.Popen(zcat_command, stdout=subprocess.PIPE)
    return zcat_process.stdout

def read_lodes7_rac_file(filename, columns):
    """Accumulate one gzipped LODES RAC csv file into per-block column arrays.

    Parameters:
        filename: path of the .csv.gz file; it is opened through open_gz.
        columns: dict mapping a csv column name to a float32 array of length
            colsize.  It is updated in place: missing columns are created
            zero-filled and each row's value is added at the row's blockidx2010.

    The first csv column must be 'h_geocode' and the last 'createdate';
    neither of the two is accumulated.

    Raises:
        AssertionError: if the header does not have the expected first and
            last column (this is also what an unreadable file ends up as,
            because its header line is empty).
        KeyError: if a geocode is missing from blockidx2010_from_geoid2010.
    """
    f = open_gz(filename)
    try:
        header = f.readline().strip().split(',')
        # Validate the layout before paying for the full parse.
        assert header[0] == 'h_geocode', 'unexpected first column in %s' % filename
        assert header[-1] == 'createdate', 'unexpected last column in %s' % filename
        table = numpy.genfromtxt(f, delimiter=',')
    finally:
        f.close()

    if table.ndim == 1:
        # genfromtxt collapses a single data row to a 1-D array and returns an
        # empty 1-D array when there are no data rows; restore the 2-D shape.
        if table.size:
            table = table.reshape(1, -1)
        else:
            table = table.reshape(0, len(header))
    (nrows, ncols) = table.shape

    # Rewrite geocode2010 to blockidx2010.  The geocode was parsed as a float,
    # so format it back to the zero-padded 15-digit key used by the lookup.
    block_idxs = numpy.array(
        [blockidx2010_from_geoid2010['%015.0f' % geoid] for geoid in table[:, 0]],
        dtype=numpy.float64).astype(numpy.intp)

    print('Loading {nrows} rows from {filename}'.format(**locals()))
    for c in range(1, ncols - 1):
        name = header[c]
        if name not in columns:
            columns[name] = numpy.zeros(colsize, dtype=numpy.float32)
        column = columns[name]
        # Unbuffered add: rows that share a block index are all accumulated,
        # exactly like a row-by-row `column[idx] += value` loop.
        numpy.add.at(column, block_idxs, table[:, c].astype(column.dtype))

def create_lodes7_rac_columns(jt, year, part):
    """Build and save the nationwide columns for one job type, year and segment.

    Parameters:
        jt: job-type code used in the input file names (e.g. 'JT01').
        year: data year used in the input file names and the output directory.
        part: segment code used in the input file names (e.g. 'S000').

    The file of every state in state_names is accumulated into one shared
    dict of column arrays, then each array is written with numpy_atomic_save
    to columncache/lodes<year>/rac_<jt>_<column>.numpy (path lower-cased).
    """
    columns = {}
    for state in state_names:
        filename = 'capture/lodes7/{state}/{state}_rac_{part}_{jt}_{year}.csv.gz'.format(
            state=state, part=part, jt=jt, year=year)
        read_lodes7_rac_file(filename, columns)

    # NOTE(review): the output name does not contain `part`.  If the files of
    # different segments share the same column headers, segments of the same
    # year and job type overwrite each other's columns -- confirm the intent.
    for (column_name, array) in columns.items():
        col_filename = 'columncache/lodes{year}/rac_{jt}_{column_name}.numpy'.format(
            year=year, jt=jt, column_name=column_name).lower()
        numpy_atomic_save(col_filename, array)

In [76]:
# Build the column files with up to 6 create_lodes7_rac_columns tasks at a time.
# Note that this rebinds `pool`, which an earlier cell used for the download
# thread pool.
pool = SimpleProcessPoolExecutor(6)  # reasonable number for 64GB RAM

# One task per (job type, year, segment); each task reads every state's file.
for jt in ['JT01']:
    for year in range(2002, 2015):  # years 2002..2014 inclusive
        for part in lodes7_rac_parts:
            pool.submit(create_lodes7_rac_columns, jt, year, part)
            
pool.shutdown()
# Bare None as the last expression keeps the cell from displaying a result.
None

Loading 10846 rows from capture/lodes7/ak/ak_rac_S000_JT01_2002.csv.gz
Loading 6103 rows from capture/lodes7/ak/ak_rac_SA03_JT01_2002.csv.gz
Loading 8035 rows from capture/lodes7/ak/ak_rac_SA01_JT01_2002.csv.gz
Loading 9939 rows from capture/lodes7/ak/ak_rac_SA02_JT01_2002.csv.gz
Loading 8709 rows from capture/lodes7/ak/ak_rac_SE01_JT01_2002.csv.gz
Loading 8747 rows from capture/lodes7/ak/ak_rac_SE02_JT01_2002.csv.gz
Loading 75584 rows from capture/lodes7/al/al_rac_SA03_JT01_2002.csv.gz
Loading 95736 rows from capture/lodes7/al/al_rac_SA01_JT01_2002.csv.gz
Loading 103364 rows from capture/lodes7/al/al_rac_SE01_JT01_2002.csv.gz
Loading 118397 rows from capture/lodes7/al/al_rac_SA02_JT01_2002.csv.gz
Loading 129638 rows from capture/lodes7/al/al_rac_S000_JT01_2002.csv.gz
Loading 111899 rows from capture/lodes7/al/al_rac_SE02_JT01_2002.csv.gz
Loading 3617 rows from capture/lodes7/ar/ar_rac_SA03_JT01_2002.csv.gz
Loading 3302 rows from capture/lodes7/az/az_rac_SA03_JT01_2002.csv.gz
Loading 6