In [12]:
import csv, json, os, pandas, re, scipy, scipy.sparse, shutil
import subprocess, sys, threading, time, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook in this module's global namespace.

    filename_or_url -- path of a local .ipynb file, or an http:/https: URL
                       (anything matching r'https?:' is fetched with urllib2).

    Names defined by the notebook's code cells end up in globals(), so they are
    usable by the cells that follow the call.  Nothing is returned.
    """
    if re.match(r'https?:', filename_or_url):
        handle = urllib2.urlopen(filename_or_url)
    else:
        handle = open(filename_or_url)
    # Close the file / HTTP response explicitly instead of leaking the handle.
    try:
        nb = handle.read()
    finally:
        handle.close()
    jsonNb = json.loads(nb)
    # check for the modified formatting of Jupyter Notebook v4: cells live at
    # the top level under 'source'; otherwise they are read from the first
    # worksheet under 'input'.
    if jsonNb['nbformat'] == 4:
        cells, source_key = jsonNb['cells'], 'source'
    else:
        cells, source_key = jsonNb['worksheets'][0]['cells'], 'input'
    code = '\n'.join(''.join(cell[source_key]) for cell in cells if cell['cell_type'] == 'code')
    # exec(code, globals()) is equivalent to the statement form `exec code in globals()`.
    exec(code, globals())

# Run the shared utilities notebook inside this notebook's namespace.
# NOTE(review): helpers used below but not defined in this notebook
# (download_file, unzip_file, query_psql) and the `numpy` name are presumably
# provided by it -- TODO confirm.
exec_ipynb('timelapse-utilities.ipynb')

In [None]:
# Show up to 300 characters per cell when pandas renders a frame.
pandas.set_option('display.max_colwidth', 300)

## Download ACS2015 File Templates for 5-year and 1-year data

In [101]:
# Fetch and unpack the ACS2015 summary-file templates, 1-year first and then
# 5-year.  `src`, `dest` and `templates` are left holding the 5-year values,
# exactly as if the two stanzas had been written out one after the other.
template_archives = [
    ('https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/2015_1yr_Summary_FileTemplates.zip',
     'capture/ACS2015_1year/2015_1yr_Summary_FileTemplates.zip'),
    ('https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/2015_5yr_Summary_FileTemplates.zip',
     'capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.zip'),
]

for src, dest in template_archives:
    download_file(src, dest)
    templates = unzip_file(dest)

capture/ACS2015_1year/2015_1yr_Summary_FileTemplates.zip already downloaded
capture/ACS2015_1year/2015_1yr_Summary_FileTemplates.zip already unzipped
Downloading https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/2015_5yr_Summary_FileTemplates.zip to capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.zip
Done, wrote 1397862 bytes to capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.zip
Unzipping capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.zip into capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.tmp
Success, created capture/ACS2015_5year/2015_5yr_Summary_FileTemplates


In [217]:
# Peek at the first entries of the unpacked 1-year template directory.
# `head` exits after its first lines, which is why `ls` can report
# "write error: Broken pipe" at the end of the listing; that message is harmless.
!ls -l capture/ACS2015_1year/2015_1yr_Summary_FileTemplates/Templates | head

total 13192
-rw-rw-r-- 1 rsargent rsargent  25600 Aug  5  2016 2015_SFGeoFileTemplate.xls
-rw-rw-r-- 1 rsargent rsargent  92160 Aug  5  2016 Seq100.xls
-rw-rw-r-- 1 rsargent rsargent  60928 Aug  5  2016 Seq101.xls
-rw-rw-r-- 1 rsargent rsargent  88064 Aug  5  2016 Seq102.xls
-rw-rw-r-- 1 rsargent rsargent 115200 Aug  5  2016 Seq103.xls
-rw-rw-r-- 1 rsargent rsargent  99328 Aug  5  2016 Seq104.xls
-rw-rw-r-- 1 rsargent rsargent 111104 Aug  5  2016 Seq105.xls
-rw-rw-r-- 1 rsargent rsargent 107520 Aug  5  2016 Seq106.xls
-rw-rw-r-- 1 rsargent rsargent  88064 Aug  5  2016 Seq107.xls
ls: write error: Broken pipe


## Download ACS2015 5-year data (tract and block group)

In [33]:
!wget --header="User-Agent: Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" --header="Referer: http://xmodulo.com/" https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/5_year_entire_sf/Tracts_Block_Groups_Only.tar.gz

!mkdir -p capture/ACS2005_5year
!mv Tracts_Block_Groups_Only.tar.gz capture/ACS2005_5year

!cd capture/ACS2005_5year; tar xvfz Tracts_Block_Groups_Only.tar.gz >/dev/null

!wget --header="User-Agent: Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/5_year_entire_sf/2015_ACS_Geography_Files.zip

!mv 2015_ACS_Geography_Files.zip capture/ACS2005_5year

unzip_file('capture/ACS2005_5year/2015_ACS_Geography_Files.zip')

--2017-07-22 10:59:54--  https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/5_year_entire_sf/Tracts_Block_Groups_Only.tar.gz
Resolving www2.census.gov (www2.census.gov)... 104.95.31.46, 2600:1408:7:2a5::208c, 2600:1408:7:291::208c
Connecting to www2.census.gov (www2.census.gov)|104.95.31.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3747109902 (3.5G) [application/x-gzip]
Saving to: ‘Tracts_Block_Groups_Only.tar.gz’


2017-07-22 11:13:55 (4.25 MB/s) - ‘Tracts_Block_Groups_Only.tar.gz’ saved [3747109902/3747109902]



## Read CSV utility functions

In [67]:
def read_acs2015_5year_template(seqno):
    """Load the ACS2015 5-year template spreadsheet for sequence number `seqno`.

    Returns the spreadsheet as a pandas DataFrame, or None when no
    Seq<seqno>.xls file exists in the unpacked template directory.
    """
    template_path = 'capture/ACS2015_5year/2015_5yr_Summary_FileTemplates/2015_5yr_Templates/Seq%d.xls' % seqno
    if os.path.exists(template_path):
        return pandas.read_excel(template_path)
    return None

# Combine template header and data into pandas frame
def read_acs2015_year_data(state, seqno):
    """Read one ACS2015 5-year estimate file, using the template for column names.

    state -- state abbreviation as it appears in the data file name (e.g. 'pa')
    seqno -- sequence number; selects both the template and the data file

    Returns a pandas DataFrame whose column names are the template's columns.
    Raises IOError if no template exists for `seqno` (previously this surfaced
    as an AttributeError on None).
    """
    header = read_acs2015_5year_template(seqno)
    if header is None:
        raise IOError('No ACS2015 5-year template found for sequence %d' % seqno)
    # The identifier columns are read as strings; LOGRECNO in particular is
    # later matched against values from the geography file, which is also read
    # entirely as strings.  Plain `str` replaces the deprecated `numpy.str` alias.
    id_columns = ['FILEID', 'FILETYPE', 'STUSAB', 'CHARITER', 'SEQUENCE', 'LOGRECNO']
    data = pandas.read_csv('capture/ACS2015_5year/group2/e20155%s%04d000.txt' % (state, seqno),
                           index_col=False,
                           dtype=dict.fromkeys(id_columns, str),
                           header=None,
                           names=header.columns.values)
    return data

## Write ACS2015 5-year description.html

In [89]:
dataset = 'acs2015_5year_tract2010'
column_dir = 'columncache'
description_path = column_dir + '/' + dataset + '/description.html'

if os.path.exists(description_path):
    print '{description_path} already exists, skipping'.format(**locals())
else:
    table_rows = []

    for seqno in range(1, 1000):
        template = read_acs2015_5year_template(seqno)
        if template is None:
            break
        for col in range(6, template.shape[1]):
            colname = template.columns.values[col]
            description = template.iloc[0,col]
            table_rows.append(u'<tr><td>{dataset}.{colname}</td><td>{description}</td></tr>\n'.format(**locals()))

    html = '<table>' + ''.join(table_rows) + '</table>'

    open(description_path, 'w').write(html.encode('utf8'))
    print 'Wrote %d column names and descriptions to %s' % (len(table_rows), description_path)


columncache/acs2015_5year_tract2010/description.html already exists, skipping


## Create ACS2015 block-level population

### Read 2010 block geoids and 2010 block populations

In [42]:
block_geoids_2010 = [row[0] for row in query_psql("SELECT geoid2010 FROM sf1_2010_block_p001 order by blockidx2010")]
print 'There are', len(block_geoids_2010), 'blocks'

block_populations = numpy.load('columncache/census2010_block2010/p001001.numpy')
print 'block_populations has', sum(block_populations), 'total people'

assert(len(block_geoids_2010) + 1 == len(block_populations))

Execution of SELECT geoid2010 FROM sf1_2010_block_p001 order by blockidx2010
took 18.8642 seconds and returned 11078297 rows
There are 11078297 blocks
block_populations has 308745538 total people


### Compute 2010 population per tract, and the list of block indices in each tract


In [43]:
tract_populations = {}
tract_block_indexes = {}

for block_index_minus_one, block_geoid in enumerate(block_geoids_2010):
    block_index = block_index_minus_one + 1
    tract_name = block_geoid[0:11]
    if tract_name not in tract_populations:
        tract_populations[tract_name] = 0
        tract_block_indexes[tract_name] = []
    tract_populations[tract_name] += block_populations[block_index]
    tract_block_indexes[tract_name].append(block_index)

print 'There are', len(tract_populations), 'tracts'
print 'tract_populations has', sum(tract_populations.values()), 'people'

There are 73057 tracts
tract_populations has 308745538 people


### Map tract identifiers to LOGRECNO using geography file

In [70]:
geography = pandas.read_csv('capture/ACS2015_5year/2015_ACS_Geography_Files/g20155pa.csv',
                            dtype=numpy.str,
                            index_col=False,
                            header=None,
                            keep_default_na=False,
                            na_values=[])

print 'There are', geography.shape[0], 'rows in geography'

tract_to_logrecno = {}

for r in range(0, geography.shape[0]):
    aggregation_level = geography.iloc[r, 2]
    if aggregation_level == '140': # census tract
        tract_identifier = geography.iloc[r, 48][7:]
        logrecno = geography.iloc[r, 4]
        tract_to_logrecno[tract_identifier] = logrecno

print 'There are', len(tract_to_acs_row), 'tracts in tract_to_logrecno'

There are 24674 rows in geography
There are 3218 tracts in tract_to_logrecno


### Map LOGRECNO to data table row using data file

In [71]:
# Load sequence 2 for state 'pa' and index its rows by LOGRECNO.
data = read_acs2015_year_data('pa', 2)

logrecnos = data['LOGRECNO']

# LOGRECNO -> row position in `data`; if a LOGRECNO repeats, the last row wins.
logrecno_to_row = dict((logrecno, row_index) for row_index, logrecno in enumerate(logrecnos))

### Interpolate and write column

In [91]:
# TODO: can we do this with a data frame then write out columns?

col_name = 'B01001_001'

input_col = data[col_name]

output_col = numpy.zeros(population_by_block.size, dtype=numpy.float32)

for tract in sorted(tract_to_logrecno.keys()):
    input_pop = input_col[logrecno_to_row[tract_to_logrecno[tract]]]
    for block_index in tract_block_indexes[tract]:
        if block_populations[block_index]:
            output_col[block_index] = input_pop * float(block_populations[block_index]) / tract_populations[tract]
            

output_col_path = column_dir + '/' + dataset + '/' + col_name + '.float32'
output_col.tofile(output_col_path)

!ls -l $output_col_path

-rw-rw-r-- 1 rsargent rsargent 44313192 Aug  4 17:00 columncache/acs2015_5year_tract2010/B01001_001.float32
