In [13]:
import csv, json, glob, math, numpy, os, pandas, re, scipy, scipy.sparse, shutil
import subprocess, sys, threading, time, urllib2

def exec_ipynb(filename_or_url):
    nb = (urllib2.urlopen(filename_or_url) if re.match(r'https?:', filename_or_url) else open(filename_or_url)).read()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if(jsonNb['nbformat'] == 4):
        exec '\n'.join([''.join(cell['source']) for cell in jsonNb['cells'] if cell['cell_type'] == 'code']) in globals()
    else:
        exec '\n'.join([''.join(cell['input']) for cell in jsonNb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']) in globals()

exec_ipynb('timelapse-utilities.ipynb')

Inputs files from NHGIS
-----------------------

In [3]:
capture90 = 'capture/NHGIS_1990'
crosswalk_file = 'capture/NHGIS_1990/NHGIS_block1990_to_block2010/crosswalk_block1990_block2010_v001.csv'

In [4]:
!cat capture/NHGIS_1990/NHGIS_block1990_to_block2010/readme.txt

NHGIS_block1990_to_block2010.zip
*PRELIMARY* crosswalk between 1990 and 2010 census blocks

Producer: National Historical Geographic Information System (NHGIS)
NHGIS citation: See https://nhgis.org/research/citation for current citation

___Summary___
File name: crosswalk_block1990_block2010_v001.csv
Content:
	- Each row represents an intersection between a 1990 block and 2010 block
	- The FID1990 and FID2010 fields contain block IDs preceded arbitrarily by an "F" to prevent the loss of leading zeros.
		- The FID1990 field contains numerous values of "NULL". These represent cases where the only 1990 blocks intersecting the corresponding 2010 block are offshore, lying in coastal or Great Lakes waters, which are excluded from NHGIS's block boundary files. None of the missing 1990 blocks had any reported population or housing units. The NULL records are included here to ensure that all 2010 blocks are represented in the file.
	- The WEIGHT field contains the interp

In [5]:
nhgis_userid = 97092

# Corrupt files:
# 18: 11-15.  Split into 38, 39, 40
#          38: 11, 14, 15 is good
#          39: 12 is corrupt
#          40: 13 is good
# 26: 51-55.  Resubmitted as 36, corrupt.  Resubmitted as 43, corrupt
# 33: 86-90.  Resubmitted as 42, corrupt.
# 34: 91-95.  Resubmitted as 37, corrupt

#nhgis_extract_numbers = [16, 17, 19, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 35, 38, 40, 41]
nhgis_extract_numbers = [46, 47, 48]
#nhgis_extract_numbers = [46]

decennial_datafiles = []

for extract_no in nhgis_extract_numbers:
    extract_no_dddd = '%04d' % extract_no
    source = 'https://data2.nhgis.org/extracts/{nhgis_userid}/{extract_no}/nhgis{extract_no_dddd}_csv.zip'.format(**locals())
    dest = '{capture90}/{extract_no_dddd}'.format(**locals())
    download_file(source, dest + '.zip')
    unzip_file(dest + '.zip')
    csvfiles = glob.glob(dest + '/*/*.csv')
    assert len(csvfiles) == 1
    csvfile = csvfiles[0]
    decennial_datafiles.append(csvfile)
    !ls -lh $csvfile
print decennial_datafiles

Downloading https://data2.nhgis.org/extracts/97092/46/nhgis0046_csv.zip to capture/NHGIS_1990/0046.zip
Done, wrote 338384564 bytes to capture/NHGIS_1990/0046.zip
Unzipping capture/NHGIS_1990/0046.zip into capture/NHGIS_1990/0046.tmp
Success, created capture/NHGIS_1990/0046
-rw-rw-r-- 1 rsargent rsargent 4.2G Jul 20 19:17 capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv
capture/NHGIS_1990/0047.zip already downloaded
capture/NHGIS_1990/0047.zip already unzipped
-rw-rw-r-- 1 rsargent rsargent 1.8G Jul 20 16:54 capture/NHGIS_1990/0047/nhgis0047_csv/nhgis0047_ds120_1990_block.csv
capture/NHGIS_1990/0048.zip already downloaded
capture/NHGIS_1990/0048.zip already unzipped
-rw-rw-r-- 1 rsargent rsargent 1.8G Jul 20 16:57 capture/NHGIS_1990/0048/nhgis0048_csv/nhgis0048_ds120_1990_block.csv
['capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv', 'capture/NHGIS_1990/0047/nhgis0047_csv/nhgis0047_ds120_1990_block.csv', 'capture/NHGIS_1990/0048/nhgis0048_csv/nhgi

Construct mapping from 1990 GEOID to decennial row
--------------------------------------------------

In [6]:
def canonicalize_nhgis_1990_gisjoin(g):
    (leader, g) = (g[0:1], g[1:])    # always G, ignore
    assert leader == 'G'
    (state, g) = (g[0:2], g[2:])     # state 2 digits
    (zero, g) = (g[0:1], g[1:])      # always zero, ignore
    assert zero == '0'
    (county, g) = (g[0:3], g[3:])    # county 4 digits
    (zero, g) = (g[0:1], g[1:])      # always zero, ignore
    assert zero == '0'
    if len(g) > 8:                   # tract is 4 or 6 digits
        (tract, g) = (g[0:6], g[6:])  
    else:
        (tract, g) = (g[0:4] + '00', g[4:])
    block = g                        # block is 3 or 4 chars
    assert len(g) == 3 or len(g) == 4
    ret = state + county + tract + block
    assert len(ret) == 14 or len(ret) == 15
    return ret

#print canonicalize_nhgis_1990_gisjoin('01000100201103')
#print canonicalize_nhgis_1990_gisjoin('01000100201101A')
#print canonicalize_nhgis_1990_gisjoin('0100030010701122')
#print canonicalize_nhgis_1990_gisjoin('0100030010702134A')

def row_names_from_decennial_datafile(decfile):
    for _ in stopwatch('Reading row names from %s' % decfile):
        original_row_names = pandas.read_csv(decfile, usecols=[0], skiprows=[1], memory_map=True)
    return [canonicalize_nhgis_1990_gisjoin(g) for g in list(original_row_names['GISJOIN'])]

# print row_names_from_decennial_datafile(decennial_datafiles[0])[0:10]
# print row_names_from_decennial_datafile(decennial_datafiles[-1])[0:10]

In [7]:
row_names_1990 = row_names_from_decennial_datafile(decennial_datafiles[0])

# Confirm all 1990 decennial datafiles have same rows
# This check already succeeded 2017 Mac

confirm_same_rows = True

if confirm_same_rows:
    for decfile in decennial_datafiles[1:]:
        assert(row_names_from_decennial_datafile(decfile) == row_names_1990)

print 'Decennial datafiles have %d rows each' % len(row_names_1990)    

geoid2rowidx1990 = {}

for i in range(0, len(row_names_1990)):
        geoid2rowidx1990[row_names_1990[i]] = i

Reading row names from capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv took 62.4 seconds
Reading row names from capture/NHGIS_1990/0047/nhgis0047_csv/nhgis0047_ds120_1990_block.csv took 13.8 seconds
Reading row names from capture/NHGIS_1990/0048/nhgis0048_csv/nhgis0048_ds120_1990_block.csv took 13.1 seconds
Decennial datafiles have 4934106 rows each


Read 1990 to 2010 crosswalk
---------------------------

In [8]:
for _ in stopwatch('Reading crosswalk'):
    crosswalk = pandas.read_csv(crosswalk_file, names=['geoid1990', 'geoid2010', 'weight', 'parea'], memory_map=True)

print 'Crosswalk has %d rows' % len(crosswalk)

sorted_crosswalk_geoids_2010 = [g[1:] for g in sorted(set(crosswalk['geoid2010']))]

print 'Crosswalk has %d 2010 GEOIDs' % len(sorted_crosswalk_geoids_2010)

# Number of blocks to expect in 2010
block2010_count = 11078297

assert block2010_count == len(sorted_crosswalk_geoids_2010)

# But we index starting at 1 in the numpy vectors,
# so they're length 11078298
assert block2010_count + 1 == len(numpy.load('columncache/census2000_block2010/H0020001.numpy'))

# numpy vectors are float32
assert numpy.float32 == numpy.load('columncache/census2000_block2010/H0020001.numpy').dtype

# Map 2010 blocks to vector indices
geoid2rowidx2010 = {}
for i in range(0, block2010_count):
    geoid2rowidx2010[sorted_crosswalk_geoids_2010[i]] = i + 1

Reading crosswalk took 22.1 seconds
Crosswalk has 20286557 rows
Crosswalk has 11078297 2010 GEOIDs


In [9]:
# Construct a sparse crosswalk matrix C so that we can multiply
# decennial 1990 data D with len(row_names_1990) rows to create
# interpolated data I with block2010_count+1 rows 

for _ in stopwatch('Creating crosswalk matrix'):
    crosswalk_matrix = scipy.sparse.lil_matrix((block2010_count + 1,len(row_names_1990)))
    
#for _ in stopwatch('Sleeping for half a second'):
#    time.sleep(0.5)

for _ in stopwatch('Populating crosswalk matrix'):
    for (geoid1990, geoid2010, weight, darea) in crosswalk.itertuples(index=False, name=None):
        geoid1990 = geoid1990[1:]
        geoid2010 = geoid2010[1:]
        if geoid1990 in geoid2rowidx1990:
            r = geoid2rowidx2010[geoid2010]
            c = geoid2rowidx1990[geoid1990]
            crosswalk_matrix[r,c] = weight

Creating crosswalk matrix took 31.0 seconds
Populating crosswalk matrix took 137.4 seconds


In [15]:
columnMap = json.load(open('capture/NHGIS_1990/columnMap.json'))

def canonicalize_column_name(c):
    prefix = 'census1990_block2010.'
    return columnMap[prefix + c].replace(prefix, '')

def interpolate_nhgis_1990_datafile(csvfilename):
    header = csv.reader(open(csvfilename)).next()
    first_data_col = 26
    assert(header[first_data_col - 1] == 'ANPSADPI')
    all_cols = range(first_data_col, len(header))
    shard_size = 50
    nshards = math.ceil(float(len(all_cols)) / shard_size)
    print '%s has %d data columns, dividing into %d shards' % (csvfilename, len(all_cols), nshards)
    
    for shardno, cols in enumerate(numpy.array_split(all_cols, nshards)):
        
        for _ in stopwatch('Reading columns %d-%d (shard %d of %d) from %s' % (
                min(cols), max(cols), shardno + 1, nshards, csvfilename)):
            p = pandas.read_csv(csvfilename, usecols=cols, skiprows=[1], dtype=numpy.float64, memory_map=True)
            d = p.as_matrix()
            colnames = p.columns.values
            
        for _ in stopwatch('Interpolating %d columns' % len(cols)):
            interpolated = crosswalk_matrix * d
            
        for _ in stopwatch('Writing %d columns into columncache' % len(cols)):
            for i in range(0, len(colnames)):
                canonical_colname = canonicalize_column_name(colnames[i])
                dest = 'columncache/census1990_block2010/{canonical_colname}.numpy'.format(**locals())
                col = interpolated[:, i].astype(numpy.float32)
                assert len(col) == block2010_count + 1
                assert col.dtype == numpy.float32
                numpy_atomic_save(dest, col)

In [16]:
print decennial_datafiles

['capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv', 'capture/NHGIS_1990/0047/nhgis0047_csv/nhgis0047_ds120_1990_block.csv', 'capture/NHGIS_1990/0048/nhgis0048_csv/nhgis0048_ds120_1990_block.csv']


In [17]:
failed = []
for file in decennial_datafiles:
    try:
        interpolate_nhgis_1990_datafile(file)
    except:
        traceback.print_exc()
        failed.append(file)
if failed:
    raise Exception('Some files failed to load: %s' % (' '.join(failed)))

capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv has 360 data columns, dividing into 8 shards
Reading columns 26-70 (shard 1 of 8) from capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv took 165.0 seconds
Interpolating 45 columns took 96.0 seconds
Writing 45 columns into columncache took 21.3 seconds
Reading columns 71-115 (shard 2 of 8) from capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv took 121.0 seconds
Interpolating 45 columns took 89.5 seconds
Writing 45 columns into columncache took 12.2 seconds
Reading columns 116-160 (shard 3 of 8) from capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv took 74.0 seconds
Interpolating 45 columns took 96.4 seconds
Writing 45 columns into columncache took 8.5 seconds
Reading columns 161-205 (shard 4 of 8) from capture/NHGIS_1990/0046/nhgis0046_csv/nhgis0046_ds120_1990_block.csv took 53.4 seconds
Interpolating 45 columns took 26.0 seconds
Writing 45 columns into column