In [44]:
import array, csv, fcntl, glob, json, math, multiprocessing, os, random, re, shutil
import shapely, shapely.wkb, struct, subprocess, sys, threading, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of another notebook in this module's globals.

    filename_or_url: path of a local .ipynb file, or an http(s) URL to one.

    The code cells are concatenated in document order (separated by newlines)
    and executed as one program in globals(), so every name the other notebook
    defines becomes available here.  Two notebook layouts are understood:
    nbformat 4 (top-level 'cells', text under 'source') and the older layout
    (cells under worksheets[0], text under 'input').

    NOTE(review): when given a URL this executes whatever the remote server
    returns; only point it at sources you trust.
    """
    if re.match(r'https?:', filename_or_url):
        handle = urllib2.urlopen(filename_or_url)
    else:
        handle = open(filename_or_url)
    try:
        nb = handle.read()
    finally:
        # Release the file descriptor / connection even if the read fails.
        handle.close()
    jsonNb = json.loads(nb)
    # check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells, source_key = jsonNb['cells'], 'source'
    else:
        cells, source_key = jsonNb['worksheets'][0]['cells'], 'input'
    code = '\n'.join([''.join(cell[source_key]) for cell in cells if cell['cell_type'] == 'code'])
    # Tuple form of exec: means the same as `exec code in globals()` on
    # Python 2 and is also the function-call spelling on Python 3.
    exec(code, globals())

# Run the code cells of the shared utilities notebook in this namespace.
# Presumably this is where helpers used below (psql, query_psql,
# SimpleProcessPoolExecutor, ...) come from -- verify against that notebook.
exec_ipynb('timelapse-utilities.ipynb')

# Select the database for subsequent psql calls (helper is not defined in this notebook).
set_default_psql_database('census2010')

set default_psql_database to census2010


In [2]:
try:
    import pyproj
except:
    !pip install pyproj
    import pyproj

In [3]:
try:
    import shapely
except:
    !pip install shapely==1.6b2
    import shapely

In [4]:
def LonLatToPixelXY(lonlat, scale = 1.):
    """Project a (lon, lat) pair in degrees to web-mercator pixel coordinates.

    At the default scale both outputs live on a 256-unit square: x grows
    linearly with longitude, y grows downward from the north using the
    mercator log-tangent formula.  Returns a two-element list
    [x * scale, y * scale].
    """
    lon, lat = lonlat
    # log(tan(pi/4 + lat/2)) with the latitude given in degrees
    mercator = math.log(math.tan((lat + 90.0) * math.pi / 360.0))
    pixel_x = (lon + 180.0) * 256.0 / 360.0
    pixel_y = 128.0 - mercator * 128.0 / math.pi
    return [pixel_x * scale, pixel_y * scale]

In [5]:
def randomPoint(poly, max_attempts=10000):
    """Return the coordinates of a random point lying inside `poly`.

    Rejection sampling: candidate points are drawn uniformly from the
    geometry's bounding box until `poly.contains()` accepts one.

    poly:         a shapely geometry (anything exposing .bounds and .contains).
    max_attempts: upper bound on candidates tried.  Without a bound the loop
                  can never finish for a geometry that contains no interior
                  point (e.g. a zero-area sliver).  When the bound is hit the
                  geometry's representative_point() is returned instead.

    Returns the point's coordinate tuple as given by __geo_interface__.
    """
    left, bottom, right, top = poly.bounds
    attempts = 0
    while attempts < max_attempts:
        candidate = shapely.geometry.point.Point(random.uniform(left, right),
                                                 random.uniform(top, bottom))
        if poly.contains(candidate):
            return candidate.__geo_interface__['coordinates']
        attempts += 1
    # Sampling kept missing: fall back to a deterministic point that shapely
    # documents as lying within the geometry.
    return poly.representative_point().__geo_interface__['coordinates']


Create census2010_block_idxs mapping each geoid2010 to a sequential integer
---------------------------------------------------------------------------

In [6]:
# Flip to True to rebuild the geoid2010 -> sequential integer mapping table.
regenerate_census2010_block_idxs = False

if regenerate_census2010_block_idxs:
    # IF EXISTS lets the rebuild also work on a database where the table has
    # not been created yet; a plain DROP TABLE would fail there.
    # Rows are inserted in geoid10 order with ids drawn from the SERIAL sequence.
    psql("""
DROP TABLE IF EXISTS census2010_block_idxs;
CREATE TABLE census2010_block_idxs (blockidx2010 SERIAL PRIMARY KEY,
                                   geoid2010 VARCHAR);

INSERT INTO census2010_block_idxs 
SELECT nextval('census2010_block_idxs_blockidx2010_seq'), geoid10
FROM tiger2010_census2010_blocks
ORDER BY geoid10;
CREATE UNIQUE INDEX ON census2010_block_idxs (geoid2010);
""")
    
# Show the table definition, row count and first few rows as a sanity check.
psql("\d census2010_block_idxs")
psql("SELECT COUNT(*) FROM census2010_block_idxs")
psql("SELECT * FROM census2010_block_idxs ORDER BY geoid2010 LIMIT 5")

\d census2010_block_idxs
Finished execution in 0.904316 secs: Table "public.census2010_block_idxs"
    Column    |       Type        |                                  Modifiers                                   
--------------+-------------------+------------------------------------------------------------------------------
 blockidx2010 | integer           | not null default nextval('census2010_block_idxs_blockidx2010_seq'::regclass)
 geoid2010    | character varying | 
Indexes:
    "census2010_block_idxs_pkey" PRIMARY KEY, btree (blockidx2010)
    "census2010_block_idxs_geoid2010_idx" UNIQUE, btree (geoid2010)
SELECT COUNT(*) FROM census2010_block_idxs
Finished execution in 1.38568 secs: count   
----------
 11078297
(1 row)
SELECT * FROM census2010_block_idxs ORDER BY geoid2010 LIMIT 5
Finished execution in 0.108793 secs: blockidx2010 |    geoid2010    
--------------+-----------------
            1 | 010010201001000
            2 | 010010201001001
            3 | 010010201001002
 

In [61]:
# Individual per-table population totals (kept for reference, disabled):
#psql('SELECT SUM(p001001) FROM sf1_2000_int2010_p001')
#psql('SELECT SUM(p001001) FROM sf1_2010_block_p001')
# Total of the per-block maximum of the two p001001 values, over blocks
# present in both tables (inner join on geoid2010).
psql("""
SELECT SUM(GREATEST(a.p001001, b.p001001))
FROM sf1_2010_block_p001 as a
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""")



SELECT SUM(GREATEST(a.p001001, b.p001001))
FROM sf1_2010_block_p001 as a
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
Finished execution in 14.3458 secs: sum        
------------------
 341321396.909159
(1 row)


In [59]:
# Per-table row counts (kept for reference, disabled):
#psql("SELECT COUNT(*) FROM census2010_block_idxs")
#psql("SELECT COUNT(*) FROM tiger2010_census2010_blocks")
#psql("SELECT COUNT(*) FROM sf1_2010_block_p001")
#psql("SELECT COUNT(*) FROM sf1_2010_block_p001 WHERE LEFT(geoid2010,2) != '72'")
#psql("SELECT COUNT(*) FROM sf1_2000_int2010_p001")

# Count the rows produced by joining the index table to the block shapes and
# the 2010 table; the LEFT JOIN keeps rows that have no match in the 2000 table.
psql("""
SELECT COUNT(*) FROM census2010_block_idxs as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
LEFT JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""")

SELECT COUNT(*) FROM census2010_block_idxs as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
LEFT JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
Finished execution in 35.8532 secs: count   
----------
 11078297
(1 row)


In [62]:
# Same per-block-maximum population total as computed from the two population
# tables alone, but taken through the full four-table inner join that
# process_shard uses, to confirm the join does not drop or duplicate blocks.
psql("""
SELECT SUM(GREATEST(a.p001001, b.p001001))
FROM census2010_block_idxs as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""")

SELECT SUM(GREATEST(a.p001001, b.p001001))
FROM census2010_block_idxs as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
Finished execution in 38.3241 secs: sum        
------------------
 341321396.909154
(1 row)


Compute dots for max(2010, 2000 interpolated) block population, by geoid2010
----------------------------------------------------------------------------

In [68]:
# Upper bound on queries allowed to run at the same time.
max_psql_parallelism = 8
psql_semaphore = multiprocessing.Semaphore(max_psql_parallelism)

def query_psql_throttled(query, quiet=False):
    """Run query_psql(query, quiet=quiet) while holding a psql_semaphore slot.

    The slot is acquired before the query starts and handed back when it
    finishes, whether it returns rows or raises.  Returns whatever query_psql
    returns.
    """
    with psql_semaphore:
        return query_psql(query, quiet=quiet)

In [85]:
# On-disk layout of one dot: little-endian float32 x, float32 y,
# int32 block index, int32 sequence number of the dot within its block.
record_format = '<ffii'
record_len = struct.calcsize(record_format)

def process_shard(start_idx, end_idx_exclusive):
    """Generate the dot records for one range of block indexes.

    start_idx:         first blockidx2010 of the shard (also names the file).
    end_idx_exclusive: one past the last blockidx2010 of the shard.

    For every block in the range, ceil(population) random points are placed
    inside the block's polygon, projected with LonLatToPixelXY, packed with
    record_format and written to 'shards/protomaster-<start_idx>'.  The data
    is written under a '.tmp' name and renamed when complete, so a shard file
    that exists is always whole; an existing shard is left alone.

    Returns the shard filename.
    """
    # `time` is not among this notebook's top-level imports, so bring it into
    # scope here rather than rely on another notebook having imported it.
    import time

    end_idx_inclusive = end_idx_exclusive - 1
    shard_filename = 'shards/protomaster-%08d' % start_idx
    if os.path.exists(shard_filename):
        sys.stdout.write('%s already exists\n' % shard_filename)
        return shard_filename
    query = """
SELECT i.blockidx2010, GREATEST(a.p001001, b.p001001), shapes.geom
FROM (SELECT * FROM census2010_block_idxs WHERE blockidx2010 BETWEEN {start_idx} AND {end_idx_inclusive}) as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""".format(start_idx=start_idx, end_idx_inclusive=end_idx_inclusive)
    rows = query_psql_throttled(query, quiet=True)
    begin_time = time.time()

    points = []
    population = 0
    dots = 0
    # The with-block guarantees the handle is closed even if a row fails.
    with open(shard_filename + '.tmp', 'wb') as shard_out:
        for (blockIdx, pop, geom) in rows:
            population += pop
            polygon = shapely.wkb.loads(geom, hex=True)

            # Fractional populations are rounded up, so dots >= population.
            for i in range(int(math.ceil(pop))):
                dots += 1
                point = LonLatToPixelXY(randomPoint(polygon))
                points.append(struct.pack(record_format,
                                          point[0], point[1],
                                          blockIdx, i))
            # Flush in batches to bound memory use.
            if len(points) >= 10000:
                shard_out.write(b''.join(points))
                points = []
        shard_out.write(b''.join(points))
    sys.stdout.write("Shard {shard_filename} has population {population} and {dots} dots\n".format(
        shard_filename=shard_filename, population=population, dots=dots))
    os.rename(shard_filename + '.tmp', shard_filename)
    sys.stdout.write("Finished %s with %d rows in %g seconds\n" % (shard_filename, len(rows), time.time() - begin_time))
    return shard_filename

In [86]:
# Highest block index in the mapping table; bounds the shard ranges below.
max_block_idx = query_psql("SELECT MAX(blockidx2010) FROM census2010_block_idxs")[0][0]

print('Max block idx is {max_block_idx}'.format(max_block_idx=max_block_idx))

shard_size = 100000
pool = SimpleProcessPoolExecutor(32)

results = []

# Report max_block_idx here: the previously referenced `num_blocks` is never
# defined in this notebook and only worked through leftover kernel state.
print("Starting shards with maximum index %d" % max_block_idx)

if not os.path.exists('shards'):
    os.mkdir("shards")

# One task per shard_size-wide range of block indexes.
for start_idx in range(0, max_block_idx + 1, shard_size):
    pool.submit(process_shard, start_idx, start_idx + shard_size)

# Later cells iterate shardfiles as a list of shard filenames; presumably
# shutdown() waits for the tasks and returns their results -- confirm against
# SimpleProcessPoolExecutor.
shardfiles = pool.shutdown()

Execution of SELECT MAX(blockidx2010) FROM census2010_block_idxs
took 0.00255704 seconds and returned 1 rows
Max block idx is 11078297
Starting shards with maximum index 11078297
Shard shards/protomaster-02600000 has population 913003.11122 and 922213 dots
Finished shards/protomaster-02600000 with 100000 rows in 492.343 seconds
Shard shards/protomaster-03000000 has population 1834620.37559 and 1843066 dots
Finished shards/protomaster-03000000 with 100000 rows in 854.824 seconds
Shard shards/protomaster-00600000 has population 1712061.41818 and 1720744 dots
Finished shards/protomaster-00600000 with 100000 rows in 936.057 seconds
Shard shards/protomaster-00500000 has population 1858008.88365 and 1868058 dots
Finished shards/protomaster-00500000 with 100000 rows in 1020.99 seconds
Shard shards/protomaster-00000000 has population 1829164.76369 and 1837863 dots
Finished shards/protomaster-00000000 with 99999 rows in 1070.45 seconds
Shard shards/protomaster-00300000 has population 2425152.36

In [87]:
def check_point_file(filename):
    """Scan a binary point file and report records with bad coordinates.

    The file is read record_len bytes at a time and unpacked with
    record_format.  A record is reported when its x or y lies outside
    [0, 256) (NaN included).  A trailing fragment shorter than one record is
    reported as well instead of crashing the unpack.

    Returns the number of problems reported (0 for a clean file).
    """
    n = 0
    errors = 0
    with open(filename, 'rb') as file_in:
        print('Checking %s' % filename)
        while True:
            rec = file_in.read(record_len)
            if not rec:
                return errors
            if len(rec) != record_len:
                print('Error at %s:%d: truncated record of %d bytes' % (filename, n, len(rec)))
                return errors + 1
            (x, y, blockIdx, seqWithinBlock) = struct.unpack(record_format, rec)
            # Written as a positive range test so that NaN is flagged too.
            if not (0 <= x < 256 and 0 <= y < 256):
                print('Error at %s:%d' % (filename, n))
                print((x, y, blockIdx, seqWithinBlock))
                errors += 1
            n += 1

Randomize point order and write them to binary file
---------------------------------------------------

The binary file will later be converted to tiles or, if it is small enough (e.g. a single state such as PA, or smaller), could be read directly by the web client page.

In [91]:
# Directory that receives the shuffled master file and the generated tiles.
tiledir = 'prototiles'

# Single binary file holding every point record in randomized order.
master = tiledir + '/master.bin'

if not os.path.exists(tiledir):
    os.mkdir(tiledir)

In [92]:
# Build master.bin once: load every shard's records, shuffle them, and write
# them out under a temporary name that is renamed only after a complete write.
if os.path.exists(master):
    print('%s already exists' % master)
else:
    points = []
    for shard_path in sorted(shardfiles):
        nbytes = os.stat(shard_path).st_size
        if nbytes % record_len:
            # Must be a real exception object; raising a bare string is itself an error.
            raise ValueError('File %s has unexpected length %d' % (shard_path, nbytes))
        npoints = nbytes // record_len
        print('%s: reading %d points' % (shard_path, npoints))
        with open(shard_path, 'rb') as file_in:
            points.extend([file_in.read(record_len) for i in range(npoints)])

    print("Randomizing %d points..." % len(points))
    random.shuffle(points)

    print("Writing randomized points to %s" % master)
    with open(master + '.tmp', 'wb') as points_file:
        for b in points:
            points_file.write(b)
    os.rename(master + '.tmp', master)
    check_point_file(master)

print('%s has %.1f records (points)' % (master, os.stat(master).st_size / float(record_len)))

shards/protomaster-00000000: reading 1837863 points
shards/protomaster-00100000: reading 2527342 points
shards/protomaster-00200000: reading 1887326 points
shards/protomaster-00300000: reading 2432416 points
shards/protomaster-00400000: reading 3916679 points
shards/protomaster-00500000: reading 1868058 points
shards/protomaster-00600000: reading 1720744 points
shards/protomaster-00700000: reading 4593286 points
shards/protomaster-00800000: reading 3287914 points
shards/protomaster-00900000: reading 9092784 points
shards/protomaster-01000000: reading 5245701 points
shards/protomaster-01100000: reading 5982774 points
shards/protomaster-01200000: reading 6359347 points
shards/protomaster-01300000: reading 4618877 points
shards/protomaster-01400000: reading 4218848 points
shards/protomaster-01500000: reading 2315523 points
shards/protomaster-01600000: reading 4282463 points
shards/protomaster-01700000: reading 5076071 points
shards/protomaster-01800000: reading 3898644 points
shards/proto

Generate prototiles
-------------------

In [None]:
def convert_prototile_to_numpy(old_path, new_path):
    """Re-encode one binary prototile through write_numpy_prototile.

    old_path: existing tile made of fixed-size records laid out according to
              prototile_record_format / prototile_record_len.
    new_path: destination handed to write_numpy_prototile.

    NOTE(review): prototile_record_format, prototile_record_len and
    write_numpy_prototile are not defined in this notebook; confirm they are
    supplied by the utilities notebook.
    """
    # Binary mode so the bytes are read unmodified; the with-block closes the handle.
    with open(old_path, 'rb') as tile_in:
        prototile = tile_in.read()
    # Floor division keeps the record count an integer.
    npoints = len(prototile) // prototile_record_len
    points = [struct.unpack_from(prototile_record_format, prototile, i * prototile_record_len)
              for i in range(npoints)]
    write_numpy_prototile(points, new_path)


In [93]:
# merc_x and merc_y are in 0-256 web mercator space
def MercToTileCoords(merc_x, merc_y, z):
    """Return the (xtile, ytile) indexes of the zoom-z tile containing a point.

    Zoom level z divides each axis of the 256-unit space into 2 ** z tiles;
    each coordinate is scaled by that count and truncated with int().
    """
    tiles_per_axis = 2 ** z
    return int(merc_x * tiles_per_axis / 256), int(merc_y * tiles_per_axis / 256)

def make_prototile_zoom(z, subsample):
    """Bucket points from the master file into one binary tile per zoom-z cell.

    z:         zoom level; 2 ** z tiles are produced along each axis.
    subsample: fraction of the master file to use.  When below 1 only the
               first round(total * subsample) records are read.

    Each record read is appended unchanged to the tile selected by
    MercToTileCoords.  Every tile, including empty ones, is written to
    '<tiledir>/<z>/<xtile>/<ytile>.bin' via a '.tmp' file that is renamed once
    fully written.
    """
    tiles_per_axis = 2 ** z
    # Indexed as tiles[xtile][ytile].
    tiles = [[[] for _ytile in range(tiles_per_axis)] for _xtile in range(tiles_per_axis)]

    num_records = os.stat(master).st_size // record_len

    print('%s has %d records (points)' % (master, num_records))
    if subsample < 1:
        num_records = int(round(num_records * subsample))
        print('Subsampling %g%% to %d records (points)' % (subsample * 100, num_records))

    # The with-block closes the master file even if a record fails to parse.
    with open(master, 'rb') as master_file:
        # Plain countdown: never materializes a list of num_records indexes.
        remaining = num_records
        while remaining > 0:
            record = master_file.read(record_len)
            (x, y) = struct.unpack(record_format, record)[0:2]
            (xtile, ytile) = MercToTileCoords(x, y, z)
            tiles[xtile][ytile].append(record)
            remaining -= 1

    for ytile in range(tiles_per_axis):
        for xtile in range(tiles_per_axis):
            tile_path = "%s/%d/%d/%d.bin" % (tiledir, z, xtile, ytile)
            tile_parent = os.path.dirname(tile_path)
            if not os.path.exists(tile_parent):
                os.makedirs(tile_parent)
            # Close the temp file explicitly before renaming it into place.
            with open(tile_path + '.tmp', 'wb') as tile_out:
                tile_out.write(b''.join(tiles[xtile][ytile]))
            os.rename(tile_path + '.tmp', tile_path)

    print('Created %d prototiles with prefix %s/%d' % (4 ** z, tiledir, z))


In [95]:
# (zoom level, fraction of master.bin to include).  Zooms 0-5 use 0.1% of the
# points; from zoom 6 the fraction quadruples per level until it reaches the
# full data set at zoom 10.
prototile_zoom_subsamples = [
    (0, 0.001),
    (1, 0.001),
    (2, 0.001),
    (3, 0.001),
    (4, 0.001),
    (5, 0.001),
    (6, 0.004),
    (7, 0.016),
    (8, 0.064),
    (9, 0.256),
    (10, 1.0),
]

for zoom, subsample in prototile_zoom_subsamples:
    make_prototile_zoom(zoom, subsample)

prototiles/master.bin has 342298447 records (points)
Subsampling 0.1% to 342298 records (points)
Created 1 tiles with prefix prototiles/0
prototiles/master.bin has 342298447 records (points)
Subsampling 0.1% to 342298 records (points)
Created 4 tiles with prefix prototiles/1
prototiles/master.bin has 342298447 records (points)
Subsampling 0.1% to 342298 records (points)
Created 16 tiles with prefix prototiles/2
prototiles/master.bin has 342298447 records (points)
Subsampling 0.1% to 342298 records (points)
Created 64 tiles with prefix prototiles/3
prototiles/master.bin has 342298447 records (points)
Subsampling 0.1% to 342298 records (points)
Created 256 tiles with prefix prototiles/4
prototiles/master.bin has 342298447 records (points)
Subsampling 0.1% to 342298 records (points)
Created 1024 tiles with prefix prototiles/5
prototiles/master.bin has 342298447 records (points)
Subsampling 0.4% to 1369194 records (points)
Created 4096 tiles with prefix prototiles/6
prototiles/master.bin h

In [96]:
# Report the total on-disk size of the generated prototiles directory.
!du -sh prototiles

13G	prototiles


In [97]:
# Show free space per mounted filesystem.
!df -h

Filesystem                        Size  Used Avail Use% Mounted on
udev                               32G     0   32G   0% /dev
tmpfs                             6.3G   67M  6.3G   2% /run
/dev/mapper/earthserve2--vg-root  212G  190G   11G  95% /
tmpfs                              32G   40K   32G   1% /dev/shm
tmpfs                             5.0M     0  5.0M   0% /run/lock
tmpfs                              32G     0   32G   0% /sys/fs/cgroup
/dev/sdb                          1.8T  1.5T  277G  85% /mnt/ssd
/dev/sda2                         473M  224M  226M  50% /boot
/dev/sda1                         511M  3.5M  508M   1% /boot/efi
tmpfs                             6.3G  104K  6.3G   1% /run/user/1007
tmpfs                             6.3G   24K  6.3G   1% /run/user/1009
