TO DO
-----

* Renumber notebook names to reflect correct order of execution;  03 and 04 should swap
* Bump process pool to full amount (e.g. 32 on earthserve2) but limit concurrent psql queries to maybe 8 using Semaphore
* Write points, maybe every 1MB or so, directly to disk to avoid filling up RAM, using append mode and a semaphore to protect access
* Scramble the points at the end


In [44]:
import array, csv, fcntl, glob, json, math, multiprocessing, os, random, re, shutil
import shapely, shapely.wkb, struct, subprocess, sys, threading, urllib2

def exec_ipynb(filename_or_url):
    """Execute all code cells of a notebook in this module's global namespace.

    filename_or_url: a local path, or an http(s) URL (fetched with urllib2).

    Both the nbformat 4 layout (top-level 'cells', code under 'source') and
    the older layout (cells under 'worksheets'[0], code under 'input') are
    handled.  The notebook's code runs with full privileges, so only point
    this at trusted notebooks.
    """
    if re.match(r'https?:', filename_or_url):
        source = urllib2.urlopen(filename_or_url)
    else:
        source = open(filename_or_url)
    # Close the handle explicitly instead of relying on garbage collection.
    try:
        nb = source.read()
    finally:
        source.close()
    jsonNb = json.loads(nb)
    # check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells = jsonNb['cells']
        source_key = 'source'
    else:
        cells = jsonNb['worksheets'][0]['cells']
        source_key = 'input'
    code = '\n'.join([''.join(cell[source_key]) for cell in cells if cell['cell_type'] == 'code'])
    # Tuple form of exec: equivalent to "exec code in globals()" on Python 2
    # and also valid on Python 3.
    exec(code, globals())

# Run the code cells of the shared utilities notebook in this notebook's globals.
# NOTE(review): query_psql is used below but not defined in this notebook; presumably it comes from here — confirm.
exec_ipynb('timelapse-utilities.ipynb')

In [20]:
try:
    import pyproj
except:
    !pip install pyproj
    import pyproj

In [21]:
try:
    import shapely
except:
    !pip install shapely==1.6b2
    import shapely

In [22]:
# Dataset selectors: both are substituted into the table name od_{jt}_{year}
# and into the output file name od-{jt}-{year}.bin used below.
year = 2011
jt = 'JT01'

In [23]:
def LonLatToPixelXY(lonlat, scale = 1.):
    """Project a (lon, lat) pair in degrees to Web-Mercator pixel coordinates.

    With the default scale the world spans 0..256 on both axes; x grows
    eastward from lon = -180 and y grows southward (lat 0 maps to y = 128).
    Both coordinates are multiplied by `scale`.  Returns the list [x, y].
    """
    longitude, latitude = lonlat
    pixel_x = (longitude + 180.0) * 256.0 / 360.0
    # Mercator stretch of the latitude, in radians.
    mercator = math.log(math.tan((latitude + 90.0) * math.pi / 360.0))
    pixel_y = 128.0 - mercator * 128.0 / math.pi
    return [pixel_x * scale, pixel_y * scale]

In [24]:
import psycopg2
from random import uniform

def randomPoint(poly):
    """Return the coordinates of a random point inside `poly`.

    Rejection sampling: candidates are drawn uniformly from the bounding box
    of `poly` until one satisfies poly.contains().  `poly` needs `.bounds`
    (unpacked as left, bottom, right, top) and `.contains(point)`.

    Returns the point's coordinate tuple from its __geo_interface__.

    NOTE(review): a geometry that contains no candidate point (e.g. zero
    area) would make this loop forever — confirm inputs are real polygons.
    """
    l, b, r, t = poly.bounds
    while True:
        # The Point constructor never returns None, so the original
        # "point is None" escape hatch was dead code and has been removed.
        point = shapely.geometry.point.Point(uniform(l, r), uniform(t, b))
        if poly.contains(point):
            return point.__geo_interface__['coordinates']


In [25]:
def split_list(alist, num_chunks=1):
    """Split `alist` into exactly `num_chunks` consecutive slices.

    Every chunk holds ceil(len(alist) / num_chunks) items except the trailing
    ones, which may be shorter or empty when the list does not divide evenly.
    Returns a list of `num_chunks` lists (slices of `alist`).
    """
    # Ceiling division so that num_chunks slices always cover the whole list.
    chunk_size = (len(alist) + num_chunks - 1) // num_chunks
    # range (not xrange) keeps this runnable on both Python 2 and Python 3.
    return [alist[i * chunk_size:(i + 1) * chunk_size]
            for i in range(num_chunks)]

In [26]:
def pack_color(color):
    """Pack a {'r', 'g', 'b'} dict into one float: r + g * 256 + b * 256 * 256."""
    red = color['r']
    green = color['g'] * 256.0
    blue = color['b'] * 256.0 * 256.0
    return red + green + blue

def unpack_color(f):
    """Inverse of pack_color: split a packed value into its r, g, b channels.

    f: number of the form r + g * 256 + b * 256 * 256.
    Returns a dict with keys 'r', 'g' and 'b'.
    """
    # math.floor is used explicitly; a bare floor() is not imported anywhere
    # in this notebook's visible code.
    b = math.floor(f / 256.0 / 256.0)
    g = math.floor((f - b * 256.0 * 256.0) / 256.0)
    r = math.floor(f - b * 256.0 * 256.0 - g * 256.0)
    return {'r': r, 'g': g, 'b': b}

# Dot colors as 0-255 RGB channels.  se01/se02/se03 are paired with the
# od.se01/od.se02/od.se03 counts when employment shards are generated;
# nonemployed_color is used for the non-employed home-location dots.
se01_color = {'r':25, 'g':75, 'b':255}
se02_color = {'r':20, 'g':138, 'b':9}
se03_color = {'r':227, 'g':30, 'b':30}
nonemployed_color = {'r':0, 'g':0, 'b':0}

Compute dot home and work locations from LODES 3 income levels
--------------------------------------------------------------

In [27]:
# Find the highest gid in the origin-destination table; it is the upper bound
# used later when the table is split into gid-range shards.
# NOTE(review): query_psql is not defined in this notebook; the result is indexed as rows[0][0] — confirm its return shape.
max_gid_query = 'SELECT gid FROM od_{jt}_{year} ORDER BY gid DESC LIMIT 1'.format(jt=jt, year=year)
max_gid = query_psql(max_gid_query)[0][0]
print 'Maximum gid in od_{jt}_{year} is {max_gid}'.format(jt=jt, year=year, max_gid=max_gid)

Execution of SELECT gid FROM od_JT01_2011 ORDER BY gid DESC LIMIT 1
took 0.002249 seconds and returned 1 rows
Maximum gid in od_JT01_2011 is 106605364


In [28]:
# Upper bound on simultaneous query_psql calls made through the wrapper below.
max_psql_parallelism = 8
psql_semaphore = multiprocessing.Semaphore(max_psql_parallelism)

def query_psql_throttled(query, quiet=False):
    """Run query_psql(query, quiet=quiet) while holding a psql_semaphore slot.

    Blocks until one of the max_psql_parallelism slots is free; the slot is
    released again whether the query returns or raises.  Returns whatever
    query_psql returns.
    """
    with psql_semaphore:
        return query_psql(query, quiet=quiet)


In [29]:
# Final output file name; shard files are named by appending '-shard-...' to it.
dest = 'od-{jt}-{year}.bin'.format(jt=jt, year=year)

In [32]:
def process_employment_shard(gid_start, gid_end):
    """Write the dot file for one gid range of the origin-destination table.

    Selects the rows of od_{jt}_{year} with gid BETWEEN gid_start AND gid_end
    (inclusive), joined to the work and home block geometries.  For each row
    and each of se01/se02/se03, that many records are emitted, each with a
    random point inside the work block and one inside the home block.

    Record layout (struct '<ffffff', 24 bytes): work x, work y, home x,
    home y, work-to-home centroid distance, packed color.

    Output goes to '<shard>.tmp' and is renamed to the final shard name only
    once complete, so an existing shard file is always a finished one.

    Returns None when the shard already exists or was written, and [] when
    the query returned no rows (no file is created in that case).
    """
    # time is not among this notebook's visible imports; import it locally.
    import time

    shard_filename = dest + '-shard-employed%09d' % gid_start
    if os.path.exists(shard_filename):
        sys.stdout.write('%s already exists\n' % shard_filename)
        return
    query = (
            "SELECT w.geom, h.geom, od.se01, od.se02, od.se03 "
            "FROM od_{jt}_{year} od  "
            "JOIN tl_2010_tabblock10 w ON od.w_geocode = w.geoid10 "
            "JOIN tl_2010_tabblock10 h ON od.h_geocode = h.geoid10 "
            "WHERE od.gid BETWEEN {gid_start} AND {gid_end} "
            #"AND od.h_geocode LIKE '42%' "
            ).format(jt=jt, year=year, gid_start=gid_start, gid_end=gid_end)
    rows = query_psql_throttled(query, quiet=True)
    begin_time = time.time()
    if len(rows) == 0:
        # The output file is opened only after this check, so an empty shard
        # no longer leaks an open handle or leaves a stray .tmp file behind.
        sys.stdout.write("No rows, aborting shard %d\n" % gid_start)
        return []

    wgs84_geod = pyproj.Geod(ellps='WGS84')
    tmp_filename = shard_filename + '.tmp'
    # "with" closes the file even if a row fails to process.
    with open(tmp_filename, 'wb') as shard_out:
        points = []
        for (workGeom, homeGeom, se01, se02, se03) in rows:
            workPolygon = shapely.wkb.loads(workGeom, hex=True)
            homePolygon = shapely.wkb.loads(homeGeom, hex=True)

            # Compute distance on the WGS84 ellipsoid between block centroids
            # (third element of Geod.inv's result).
            workCentroid = workPolygon.centroid
            homeCentroid = homePolygon.centroid
            dist = wgs84_geod.inv(workCentroid.x, workCentroid.y,
                                  homeCentroid.x, homeCentroid.y)[2]

            for (count, color) in [(se01, se01_color), (se02, se02_color), (se03, se03_color)]:
                pcolor = pack_color(color)
                for _ in range(count):
                    wpoint = LonLatToPixelXY(randomPoint(workPolygon))
                    hpoint = LonLatToPixelXY(randomPoint(homePolygon))
                    points.append(struct.pack('<ffffff', wpoint[0], wpoint[1], hpoint[0], hpoint[1], dist, pcolor))

            # Flush periodically so the buffered records stay small.
            if len(points) >= 10000:
                shard_out.write(b''.join(points))
                points = []
        shard_out.write(b''.join(points))
    os.rename(tmp_filename, shard_filename)
    sys.stdout.write("Finished %s with %d rows in %g seconds\n" % (shard_filename, len(rows), time.time() - begin_time))


In [14]:
# Destructive: deletes every existing employed shard (and leftover .tmp file) so they are regenerated on the next run.
!rm $dest-shard-employed*

In [34]:
# Number of consecutive gids handled by one worker task.
shard_size = 100000
pool = multiprocessing.Pool()

print("Starting shards with maximum index %d" % max_gid)

# One asynchronous task per inclusive gid range [gid_start, gid_start + shard_size - 1].
results = [
    pool.apply_async(process_employment_shard, (gid_start, gid_start + shard_size - 1))
    for gid_start in range(0, max_gid + 1, shard_size)
]

# No further tasks will be submitted; wait for all workers to finish.
pool.close()
pool.join()

# get() re-raises any exception that occurred inside a worker.
for res in results:
    res.get()

od-JT01-2011.bin-shard-employed000100000 already exists
od-JT01-2011.bin-shard-employed003500000 already exists
od-JT01-2011.bin-shard-employed000200000 already exists
od-JT01-2011.bin-shard-employed000400000 already exists
od-JT01-2011.bin-shard-employed000300000 already exists
od-JT01-2011.bin-shard-employed000000000 already exists
od-JT01-2011.bin-shard-employed000500000 already exists
od-JT01-2011.bin-shard-employed000600000 already exists
od-JT01-2011.bin-shard-employed002100000 already exists
od-JT01-2011.bin-shard-employed000700000 already exists
od-JT01-2011.bin-shard-employed002700000 already exists
od-JT01-2011.bin-shard-employed003300000 already exists
od-JT01-2011.bin-shard-employed003200000 already exists
od-JT01-2011.bin-shard-employed004000000 already exists
od-JT01-2011.bin-shard-employed003900000 already exists
od-JT01-2011.bin-shard-employed003600000 already exists
od-JT01-2011.bin-shard-employed003700000 already exists
od-JT01-2011.bin-shard-employed004100000 already

In [None]:
# Six float32 fields per point record.
record_len = 6 * 4
print('%s has %.1f records (points)' % (dest, os.path.getsize(dest) / float(record_len)))

In [13]:
# Six float32 fields per point record.
record_len = 6 * 4
print('%s has %.1f records (points)' % (dest, os.path.getsize(dest) / float(record_len)))

od-JT01-2011.bin has 223234100.0 records (points)


Compute dot home location for "non-employed" persons
----------------------------------------------------

Compute the number of non-employed persons by subtracting LODES primary jobs from the "working age population" (ages 15-64) of the 2010 Decennial Census.

In [37]:
# Destructive: deletes every existing nonemployed shard (and leftover .tmp file) so they are regenerated on the next run.
!rm $dest-shard-nonemployed*

In [38]:
def get_geocode_at(idx):
    """Return the geocode at zero-based position `idx` of working_age_blocks.

    Positions refer to the table ordered by geocode.  Raises IndexError if
    the query returns no row (idx past the end of the table).
    """
    q = "SELECT geocode FROM working_age_blocks ORDER BY geocode OFFSET {idx} LIMIT 1 ".format(idx=idx)
    # This runs inside pool workers, so go through the throttled wrapper like
    # the main shard queries instead of opening unthrottled connections.
    return query_psql_throttled(q, quiet=True)[0][0]

def process_nonemployment_shard(shard_start, shard_end):
    """Write the "non-employed" dot file for one range of working_age_blocks.

    shard_start and shard_end are zero-based positions in working_age_blocks
    ordered by geocode; they are turned into an inclusive geocode range with
    get_geocode_at().  For every block in the range, the number of dots is
    max(working_age_pop - employed, 0), where employed is the sum of
    se01 + se02 + se03 over the od rows whose h_geocode is that block
    (treated as 0 when there are none).

    Record layout (struct '<ffffff', 24 bytes): the same random home point
    is written as both the first and the second x/y pair, followed by a
    distance of 0 and the packed nonemployed_color.

    Output goes to '<shard>.tmp' and is renamed to the final shard name only
    once complete.

    Returns None when the shard already exists or was written, and [] when
    the query returned no rows (no file is created in that case).
    """
    # time is not among this notebook's visible imports; import it locally.
    import time

    shard_filename = dest + '-shard-nonemployed%09d' % shard_start
    if os.path.exists(shard_filename):
        sys.stdout.write('%s already exists\n' % shard_filename)
        return
    query = (
            "SELECT MIN(h.geom), MIN(t.working_age_pop), "
            "SUM(od.se01 + od.se02 + od.se03)"
            "FROM working_age_blocks t "
            "JOIN tl_2010_tabblock10 h ON t.geocode = h.geoid10 "
            "LEFT JOIN od_{jt}_{year} od ON t.geocode = od.h_geocode "
            "WHERE t.geocode BETWEEN '{geocode_start}' AND '{geocode_end}' "
            #"AND t.geocode LIKE '42%' "
            "GROUP BY t.geocode "
            ).format(jt=jt, year=year, geocode_start=get_geocode_at(shard_start), geocode_end=get_geocode_at(shard_end))
    rows = query_psql_throttled(query, quiet=True)
    begin_time = time.time()
    if len(rows) == 0:
        # The output file is opened only after this check, so an empty shard
        # no longer leaks an open handle or leaves a stray .tmp file behind.
        sys.stdout.write("No rows, aborting nonemployment shard %d\n" % shard_start)
        return []

    pcolor = pack_color(nonemployed_color)
    dist = 0
    tmp_filename = shard_filename + '.tmp'
    # "with" closes the file even if a row fails to process.
    with open(tmp_filename, 'wb') as shard_out:
        points = []
        for (homeGeom, working_age_pop, employed_pop) in rows:
            if employed_pop is None:
                # LODES records no employment for this home block
                employed_pop = 0
            homePolygon = shapely.wkb.loads(homeGeom, hex=True)

            count = max(working_age_pop - employed_pop, 0)
            # TODO:  look at negative values;  maybe show them as a different color until we better understand
            for _ in range(count):
                hpoint = LonLatToPixelXY(randomPoint(homePolygon))
                points.append(struct.pack('<ffffff', hpoint[0], hpoint[1], hpoint[0], hpoint[1], dist, pcolor))

            # Flush periodically so the buffered records stay small.
            if len(points) >= 10000:
                shard_out.write(b''.join(points))
                points = []
        shard_out.write(b''.join(points))
    os.rename(tmp_filename, shard_filename)
    sys.stdout.write("Finished %s with %d rows in %g seconds\n" % (shard_filename, len(rows), time.time() - begin_time))


In [42]:
# Highest zero-based row position in working_age_blocks.
max_shard_idx = query_psql("SELECT COUNT(*) FROM working_age_blocks", quiet=True)[0][0] - 1

# Number of consecutive blocks handled by one worker task.
shard_size = 100000

pool = multiprocessing.Pool()

print("Starting shards with maximum index %d" % max_shard_idx)

# One asynchronous task per inclusive position range; the last range is
# clamped to max_shard_idx.
results = [
    pool.apply_async(
        process_nonemployment_shard,
        (shard_start, min(max_shard_idx, shard_start + shard_size - 1)))
    for shard_start in range(0, max_shard_idx + 1, shard_size)
]

# No further tasks will be submitted; wait for all workers to finish.
pool.close()
pool.join()

# get() re-raises any exception that occurred inside a worker.
for res in results:
    res.get()

od-JT01-2011.bin-shard-nonemployed000900000 already exists
od-JT01-2011.bin-shard-nonemployed000100000 already exists
od-JT01-2011.bin-shard-nonemployed000300000 already exists
od-JT01-2011.bin-shard-nonemployed003200000 already exists
od-JT01-2011.bin-shard-nonemployed003300000 already exists
od-JT01-2011.bin-shard-nonemployed003400000 already exists
od-JT01-2011.bin-shard-nonemployed001400000 already exists
od-JT01-2011.bin-shard-nonemployed003500000 already exists
od-JT01-2011.bin-shard-nonemployed001500000 already exists
od-JT01-2011.bin-shard-nonemployed000600000 already exists
od-JT01-2011.bin-shard-nonemployed001600000 already exists
od-JT01-2011.bin-shard-nonemployed003700000 already exists
od-JT01-2011.bin-shard-nonemployed003600000 already exists
od-JT01-2011.bin-shard-nonemployed003800000 already exists
od-JT01-2011.bin-shard-nonemployed003900000 already exists
od-JT01-2011.bin-shard-nonemployed004000000 already exists
od-JT01-2011.bin-shard-nonemployed004100000 already exis

Check shards for sanity
-----------------------

In [46]:
# Six float32 fields per point record.
record_len = 4 * 6

def check_point_file(filename):
    """Scan a point file and print every record with out-of-range coordinates.

    The file is read as consecutive '<ffffff' records (wx, wy, hx, hy, dist,
    color).  A record is reported when any of its four coordinates is not in
    [0, 256); NaN coordinates are reported too.  A trailing partial record is
    reported and ends the scan.  Each report gives the zero-based record
    index.  Returns None; all results go to standard output.
    """
    n = 0
    with open(filename, 'rb') as file_in:
        print('Checking %s' % filename)
        while True:
            rec = file_in.read(record_len)
            if not rec:
                return
            if len(rec) != record_len:
                # Short read at end of file: report it rather than letting
                # struct.unpack raise.
                print('Error at %s:%d' % (filename, n))
                print('truncated record: %d of %d bytes' % (len(rec), record_len))
                return
            (wx, wy, hx, hy, dist, color) = struct.unpack('<ffffff', rec)
            # "not (0 <= v < 256)" is also true for NaN, which the form
            # "v < 0 or v >= 256" would silently accept.
            if not all(0 <= v < 256 for v in (wx, wy, hx, hy)):
                print('Error at %s:%d' % (filename, n))
                print((wx, wy, hx, hy, dist, color))
            n += 1

# Check every shard file in name order.  The pattern requires names ending in
# '0' — presumably so that in-progress '.tmp' files are skipped; confirm.
for filename in sorted(glob.glob(dest + '-shard*0')):
    check_point_file(filename)

Checking od-JT01-2011.bin-shard-employed000000000
Checking od-JT01-2011.bin-shard-employed000100000
Checking od-JT01-2011.bin-shard-employed000200000
Checking od-JT01-2011.bin-shard-employed000300000
Checking od-JT01-2011.bin-shard-employed000400000
Checking od-JT01-2011.bin-shard-employed000500000
Checking od-JT01-2011.bin-shard-employed000600000
Checking od-JT01-2011.bin-shard-employed000700000
Checking od-JT01-2011.bin-shard-employed000800000
Checking od-JT01-2011.bin-shard-employed000900000
Checking od-JT01-2011.bin-shard-employed001000000
Checking od-JT01-2011.bin-shard-employed001100000
Checking od-JT01-2011.bin-shard-employed001200000
Checking od-JT01-2011.bin-shard-employed001300000
Checking od-JT01-2011.bin-shard-employed001400000
Checking od-JT01-2011.bin-shard-employed001500000
Checking od-JT01-2011.bin-shard-employed001600000
Checking od-JT01-2011.bin-shard-employed001700000
Checking od-JT01-2011.bin-shard-employed001800000
Checking od-JT01-2011.bin-shard-employed001900000


Randomize point order and write them to binary file
---------------------------------------------------

The binary file will later be converted to tiles or, if it is small enough (e.g. a single state such as PA, or smaller), it can be read directly by the web client page.

In [47]:
# Read every shard into memory, shuffle the records and write them to dest.
# NOTE: all records are held in RAM at once.
points = []

# Two passes over the shards: the first (check_only=True) validates every
# file's length before anything is read, so a bad shard fails fast; the
# second re-validates and loads the records.
for check_only in [True, False]:
    for shard_path in sorted(glob.glob(dest + '-shard-*0')):
        nbytes = os.stat(shard_path).st_size
        if nbytes % record_len:
            # String exceptions are not valid; raise a real exception type.
            raise ValueError('File %s has unexpected length %d' % (shard_path, nbytes))
        if check_only:
            continue
        # Integer division: nbytes is a whole multiple of record_len here.
        npoints = nbytes // record_len
        print('%s: reading %d points' % (shard_path, npoints))
        with open(shard_path, 'rb') as file_in:
            points.extend([file_in.read(record_len) for i in range(npoints)])

print("Randomizing %d points..." % len(points))
random.shuffle(points)

print("Writing randomized points to %s" % dest)
# "with" closes the output even if a write fails part-way through.
with open(dest, 'wb') as points_file:
    for b in points:
        points_file.write(b)

print('%s has %.1f records (points)' % (dest, os.stat(dest).st_size / float(record_len)))

check_point_file(dest)

od-JT01-2011.bin-shard-employed000000000: reading 121689 points
od-JT01-2011.bin-shard-employed000100000: reading 113442 points
od-JT01-2011.bin-shard-employed000200000: reading 117807 points
od-JT01-2011.bin-shard-employed000300000: reading 110820 points
od-JT01-2011.bin-shard-employed000400000: reading 113380 points
od-JT01-2011.bin-shard-employed000500000: reading 116177 points
od-JT01-2011.bin-shard-employed000600000: reading 113536 points
od-JT01-2011.bin-shard-employed000700000: reading 119083 points
od-JT01-2011.bin-shard-employed000800000: reading 115449 points
od-JT01-2011.bin-shard-employed000900000: reading 119191 points
od-JT01-2011.bin-shard-employed001000000: reading 111344 points
od-JT01-2011.bin-shard-employed001100000: reading 111384 points
od-JT01-2011.bin-shard-employed001200000: reading 115637 points
od-JT01-2011.bin-shard-employed001300000: reading 112045 points
od-JT01-2011.bin-shard-employed001400000: reading 112833 points
od-JT01-2011.bin-shard-employed001500000

In [22]:
# Show the size and timestamp of the final output file.
!ls -l $dest

-rw-rw-r-- 1 rsargent rsargent 5357618400 Feb  1 14:49 od-JT01-2011.bin
