In [None]:
import array, csv, fcntl, glob, json, math, multiprocessing, numpy, os, random, re, shutil
import shapely, shapely.wkb, struct, subprocess, sys, threading, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook in this module's global namespace.

    filename_or_url: path of a local .ipynb file, or an http(s) URL (fetched
        with urllib2.urlopen).

    Two notebook layouts are handled: nbformat 4 (cells under the top-level
    'cells' key, code under 'source') and the older layout (cells under
    'worksheets'[0]['cells'], code under 'input').  The code cells are
    concatenated with newlines and executed as a single unit in globals(), so
    every name they define becomes a global here.  Returns None.
    """
    if re.match(r'https?:', filename_or_url):
        nb_file = urllib2.urlopen(filename_or_url)
    else:
        nb_file = open(filename_or_url)
    try:
        nb = nb_file.read()
    finally:
        # Close explicitly instead of leaking the handle until garbage collection.
        nb_file.close()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cell_sources = [cell['source'] for cell in jsonNb['cells'] if cell['cell_type'] == 'code']
    else:
        cell_sources = [cell['input'] for cell in jsonNb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']
    code = '\n'.join([''.join(source) for source in cell_sources])
    # Tuple form of exec: equivalent to `exec code in globals()` on Python 2,
    # and also valid as a function call on Python 3.
    exec(code, globals())

# Run the code cells of the shared helper notebook; exec_ipynb executes them in
# globals(), so the names they define become available to the cells below.
exec_ipynb('timelapse-utilities.ipynb')

# NOTE(review): set_default_psql_database is not defined in this file; presumably it
# comes from timelapse-utilities.ipynb and selects the database that the
# psql()/query_psql() helpers talk to -- confirm.
set_default_psql_database('census2010')

In [None]:
try:
    import pyproj
except:
    !pip install pyproj
    import pyproj

In [None]:
try:
    import shapely
except:
    !pip install shapely==1.6b2
    import shapely

In [None]:
def LonLatToPixelXY(lonlat, scale = 1.):
    """Project a (lon, lat) pair in degrees onto Mercator-style pixel coordinates.

    lonlat: two-element sequence (longitude, latitude), in degrees.
    scale:  multiplier applied to both output coordinates (default 1.0).

    Before scaling, longitude -180..180 maps linearly onto x 0..256 and
    latitude 0 maps to y 128, with y decreasing as latitude increases.
    Returns a two-element list [x, y].
    """
    longitude, latitude = lonlat
    pixel_x = (longitude + 180.0) * 256.0 / 360.0
    # Mercator stretch of the latitude, in radians.
    mercator = math.log(math.tan((latitude + 90.0) * math.pi / 360.0))
    pixel_y = 128.0 - mercator * 128.0 / math.pi
    return [pixel_x * scale, pixel_y * scale]

In [None]:
def randomPoint(poly, max_attempts=1000000):
    """Return the coordinates of a random point inside `poly`.

    poly: geometry object exposing `bounds` (unpacked here as left, bottom,
        right, top), `contains(point)` and `representative_point()`.
    max_attempts: upper bound on rejection-sampling draws before falling back.

    Candidate points are drawn uniformly from the bounding box until one is
    contained in `poly`.  If none is found within `max_attempts` draws (for
    example a zero-area geometry, which would otherwise loop forever), the
    coordinates of `poly.representative_point()` are returned instead.

    Returns the point's `__geo_interface__['coordinates']` value.
    """
    l, b, r, t = poly.bounds
    attempts = 0
    # Counter-based loop rather than range(): this runs once per dot, and a
    # Python 2 range() would build a large list on every call.
    while attempts < max_attempts:
        attempts += 1
        point = shapely.geometry.point.Point(random.uniform(l, r), random.uniform(t, b))
        if poly.contains(point):
            return point.__geo_interface__['coordinates']
    # Fallback for geometries that rejection sampling cannot hit.
    return poly.representative_point().__geo_interface__['coordinates']


Create census2010_block_idxs mapping each geoid2010 to a sequential integer
---------------------------------------------------------------------------

In [None]:
# Set to True to rebuild the geoid2010 -> blockidx2010 mapping table from scratch.
regenerate_census2010_block_idxs = False

if regenerate_census2010_block_idxs:
    # IF EXISTS keeps the rebuild from failing on a database where the table
    # has not been created yet.  Indices are drawn from the SERIAL sequence
    # while selecting blocks ordered by geoid10.
    psql("""
DROP TABLE IF EXISTS census2010_block_idxs;
CREATE TABLE census2010_block_idxs (blockidx2010 SERIAL PRIMARY KEY,
                                   geoid2010 VARCHAR);

INSERT INTO census2010_block_idxs 
SELECT nextval('census2010_block_idxs_blockidx2010_seq'), geoid10
FROM tiger2010_census2010_blocks
ORDER BY geoid10;
CREATE UNIQUE INDEX ON census2010_block_idxs (geoid2010);
""")

# Inspect the table: schema, row count and the first few rows.
# Raw string so the backslash of the psql meta-command is passed through explicitly.
psql(r"\d census2010_block_idxs")
psql("SELECT COUNT(*) FROM census2010_block_idxs")
psql("SELECT * FROM census2010_block_idxs ORDER BY geoid2010 LIMIT 5")

In [None]:
#psql('SELECT SUM(p001001) FROM sf1_2000_int2010_p001')
#psql('SELECT SUM(p001001) FROM sf1_2010_block_p001')
# Sum, over geoid2010 values present in both tables (inner join), of the larger
# of the two p001001 values.
# NOTE(review): going by the table names, `a` is presumably the 2010 block
# population and `b` the 2000 population interpolated to 2010 blocks -- confirm.
psql("""
SELECT SUM(GREATEST(a.p001001, b.p001001))
FROM sf1_2010_block_p001 as a
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""")



In [None]:
#psql("SELECT COUNT(*) FROM census2010_block_idxs")
#psql("SELECT COUNT(*) FROM tiger2010_census2010_blocks")
#psql("SELECT COUNT(*) FROM sf1_2010_block_p001")
#psql("SELECT COUNT(*) FROM sf1_2010_block_p001 WHERE LEFT(geoid2010,2) != '72'")
#psql("SELECT COUNT(*) FROM sf1_2000_int2010_p001")

# Count blocks that have an index, a shape and a row in sf1_2010_block_p001.
# The LEFT JOIN keeps blocks that have no row in sf1_2000_int2010_p001.
psql("""
SELECT COUNT(*) FROM census2010_block_idxs as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
LEFT JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""")

In [None]:
# Same per-block GREATEST(a.p001001, b.p001001) total as the two-table query,
# but restricted to blocks that also have an index and a shape.  All joins are
# inner joins, so a block missing from any of the four tables is excluded.
psql("""
SELECT SUM(GREATEST(a.p001001, b.p001001))
FROM census2010_block_idxs as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
JOIN sf1_2010_block_p001 as a USING (geoid2010)
JOIN sf1_2000_int2010_p001 as b USING (geoid2010)
""")

Compute dots for max(2010, 2000 interpolated) block population, by geoid2010
----------------------------------------------------------------------------

In [None]:
# Maximum number of query_psql calls allowed to run at the same time.
max_psql_parallelism = 8
# A multiprocessing semaphore, so the limit can be shared with worker
# processes that inherit it.
psql_semaphore = multiprocessing.Semaphore(max_psql_parallelism)

def query_psql_throttled(query, quiet=False):
    """Run query_psql(query, quiet=quiet) while holding one psql_semaphore slot.

    Blocks until a slot is free.  The slot is released when the query returns
    or raises.  Returns whatever query_psql returns.
    """
    with psql_semaphore:
        return query_psql(query, quiet=quiet)

In [None]:
# Output directory for caps.numpy and, in later cells, master.bin and the
# per-zoom prototile trees.
prototiledir = 'prototiles002'
!mkdir -p $prototiledir

# One cached P0010001 array per census year (1990, 2000, 2010), loaded from the
# census<year>_block2010 column caches.
pops = [numpy.load('columncache/census%d_block2010/P0010001.numpy' % y) for y in [1990, 2000, 2010]]
# Number of dots to generate per block: twice the element-wise maximum of the
# three yearly arrays, plus 60, rounded up and stored as int32.  process_shard
# looks this up as caps[blockIdx].
# NOTE(review): this takes the max over 1990, 2000 and 2010, whereas the
# section heading mentions only 2010 and interpolated 2000 -- confirm which is intended.
caps = numpy.ceil(numpy.maximum(numpy.maximum(pops[0], pops[1]), pops[2]) * 2 + 60).astype(numpy.int32)

# NOTE(review): numpy_atomic_save is not defined in this file; presumably from timelapse-utilities.ipynb.
numpy_atomic_save(prototiledir + '/caps.numpy', caps)

In [None]:
# Binary layout of one dot: little-endian float32 x, float32 y, int32 block
# index, int32 sequence number of the dot within its block.
record_format = '<ffii'
record_len = struct.calcsize(record_format)

def process_shard(start_idx, end_idx_exclusive):
    """Generate the random dots for blocks start_idx..end_idx_exclusive-1.

    start_idx: first blockidx2010 of the shard (also used in the file name).
    end_idx_exclusive: one past the last blockidx2010 of the shard.

    For every block in the range that has a shape, caps[blockIdx] random points
    inside the block polygon are projected with LonLatToPixelXY and written as
    record_format records to 'shards/protomaster-<start_idx>'.  Output goes to
    a '.tmp' file that is renamed once complete, so an existing shard file is
    always a finished one; if it already exists the shard is skipped.

    Returns the shard file name.
    """
    # Local import: `time` is not in the file's visible import list and may
    # otherwise be available only through names injected by exec_ipynb.
    import time

    end_idx_inclusive = end_idx_exclusive - 1
    shard_filename = 'shards/protomaster-%08d' % start_idx
    if os.path.exists(shard_filename):
        sys.stdout.write('%s already exists\n' % shard_filename)
        return shard_filename
    query = """
SELECT i.blockidx2010, shapes.geom
FROM (SELECT * FROM census2010_block_idxs WHERE blockidx2010 BETWEEN {start_idx} AND {end_idx_inclusive}) as i
JOIN tiger2010_census2010_blocks as shapes ON (i.geoid2010=shapes.geoid10)
""".format(start_idx=start_idx, end_idx_inclusive=end_idx_inclusive)
    rows = query_psql_throttled(query, quiet=True)
    begin_time = time.time()

    tmp_filename = shard_filename + '.tmp'
    population = 0
    dots = 0
    # The with-block guarantees the handle is closed even if dot generation fails.
    with open(tmp_filename, 'wb') as shard_out:
        points = []
        for (blockIdx, geom) in rows:
            pop = caps[blockIdx]
            population += pop
            polygon = shapely.wkb.loads(geom, hex=True)

            for i in range(pop):
                dots += 1
                point = LonLatToPixelXY(randomPoint(polygon))
                points.append(struct.pack(record_format,
                                          point[0], point[1],
                                          blockIdx, i))
            # Flush between blocks once enough records are buffered, to bound memory use.
            if len(points) >= 10000:
                shard_out.write(b''.join(points))
                points = []
        shard_out.write(b''.join(points))
    sys.stdout.write("Shard {shard_filename} has population {population} and {dots} dots\n".format(
        shard_filename=shard_filename, population=population, dots=dots))
    # Publish the finished shard under its final name.
    os.rename(tmp_filename, shard_filename)
    sys.stdout.write("Finished %s with %d rows in %g seconds\n" % (shard_filename, len(rows), time.time() - begin_time))
    return shard_filename

In [None]:
# Largest assigned block index; shards below cover 0..max_block_idx in steps of shard_size.
max_block_idx = query_psql("SELECT MAX(blockidx2010) FROM census2010_block_idxs")[0][0]

# Number of consecutive block indices handled by one process_shard call.
shard_size = 100000
# NOTE(review): SimpleProcessPoolExecutor is not defined in this file; presumably a
# pool of 16 worker processes provided by timelapse-utilities.ipynb -- confirm.
pool = SimpleProcessPoolExecutor(16)

# Not used anywhere else in this cell.
results = []

print "Starting shards with maximum index %d" % max_block_idx

# process_shard writes into 'shards/', so make sure it exists before submitting work.
if not os.path.exists('shards'):
    os.mkdir("shards")

# One task per shard; the second argument is the exclusive end index.
for start_idx in range(0, max_block_idx + 1, shard_size):
    pool.submit(process_shard, start_idx, start_idx + shard_size)

# NOTE(review): presumably shutdown() waits for all tasks and returns their
# results (the shard file names returned by process_shard) -- confirm.
shardfiles = pool.shutdown()

Randomize point order and write them to binary file
---------------------------------------------------

The binary file will later be converted to tiles or, if it is small enough (e.g. a single state such as PA, or smaller), could be read directly by the web client page.

In [None]:
# Path of the combined point file built from all shards.
master = prototiledir + '/master.bin'

# When True, rebuild the shard list from disk rather than relying on a
# `shardfiles` value left over from the shard-generation cell.
reread_shard_filenames = True

if reread_shard_filenames:
    # Eight-character suffix matches the '%08d' names of finished shards; the
    # longer '...tmp' names of in-progress shards do not match.
    shardfiles = glob.glob('shards/protomaster-????????')
    print 'Located %d shard files' % len(shardfiles)

In [None]:
# numpy structured dtype mirroring record_format ('<ffii'): x, y as float32 and
# block index, index-within-block as int32, all little-endian.
numpy_record_type = [('x','<f4'), ('y','<f4'), ('blockIdx', '<i4'), ('subIdx', '<i4')]

if os.path.exists(master):
    print('%s already exists' % master)
else:
    # First pass: validate shard sizes and count the total number of records.
    npoints = 0
    for shard_path in sorted(shardfiles):
        nbytes = os.stat(shard_path).st_size
        if nbytes % record_len:
            # Raise a real exception: raising a bare string is itself a TypeError.
            raise ValueError('File %s has unexpected length %d' % (shard_path, nbytes))
        # Floor division keeps the count an integer on any Python version.
        npoints += nbytes // record_len
    print('From all shards: %d points (%.1fGB)' % (npoints, npoints * record_len / 1e9))

    # Second pass: load every shard into one preallocated array.
    for _ in stopwatch('Reading all shards'):
        master_points = numpy.zeros(npoints, dtype=numpy_record_type)
        offset = 0
        for shard_path in sorted(shardfiles):
            shard_points = numpy.fromfile(shard_path, dtype=numpy_record_type)
            master_points[offset:offset + len(shard_points)] = shard_points
            print('Placed %d points from %s at offset %d' % (
                    len(shard_points), shard_path, offset))
            offset += len(shard_points)
        assert offset == npoints

    # In-place shuffle.  No seed is set here, so the order differs between runs.
    for _ in stopwatch('Shuffling points'):
        numpy.random.shuffle(master_points)

    for _ in stopwatch('Writing %d points to %s' % (npoints, master)):
        master_points.tofile(master)
        nbytes = os.stat(master).st_size
        print('Master is %.1fGB' % (nbytes / 1e9))
        assert npoints * record_len == nbytes

    master_points = None   # allow memory to be reclaimed

Generate prototiles
-------------------

master.bin from pre-2021 is located at hal15:uwsgi/dotmaptiles-data/data-visualization-tools/examples/lodes/prototiles002

tileserve.py:max_prototile_level is hardcoded to 10, and prototile_subsamples array is also hardcoded, but only used for PNG output (which itself is no longer used?)

If we subsample prototiles less, could we then add our own subsample later?  I think we do that currently for level < 5 in tileserve.py:
    if z < 5:
        # Further subsample the points
        subsample = 2.0 ** ((5.0 - z) / 2.0)  # z=4, subsample=2;  z=3, subsample=4 ...
        # We're further subsampling the prototile
        prototile_subsample /= subsample
        incount = int(incount / subsample)



In [None]:
# Too many points to easily fit in RAM;  instead, buffer and write every so often
class PrototileWriter(object):
    """Buffered writer for the prototile file '<prototiledir>/<z>/<x>/<y>.bin'.

    Records accumulate in a 256-entry numpy buffer of numpy_record_type and
    are appended to '<filename>.tmp' whenever the buffer fills.  close()
    flushes the remainder and renames the temp file to its final name, so a
    '.bin' file only appears once it is complete.
    """
    def __init__(self, z, x, y):
        """Create the tile's directory if needed and start with an empty temp file."""
        self.filename = "%s/%d/%d/%d.bin" % (prototiledir, z, x, y)
        tile_dir = os.path.dirname(self.filename)
        if not os.path.exists(tile_dir):
            os.makedirs(tile_dir)
        # Truncate the temp file.  Opened in binary mode and closed right away
        # so no handle is left open per tile.
        with open(self.filename + '.tmp', 'wb'):
            pass
        buffer_len = 256
        self.buf = numpy.zeros(buffer_len, dtype=numpy_record_type)
        self.size = 0   # number of valid records currently in self.buf
    def write(self, point):
        """Buffer one record, flushing to disk first if the buffer is full."""
        if self.size == len(self.buf):
            self.flush()
        self.buf[self.size] = point
        self.size += 1
    def flush(self):
        """Append the buffered records to the temp file and empty the buffer."""
        if self.size:
            # Binary append; the with-block closes the handle deterministically.
            with open(self.filename + '.tmp', 'ab') as tmp_out:
                self.buf[0:self.size].tofile(tmp_out)
            self.size = 0
    def close(self):
        """Flush remaining records and move the temp file to its final name."""
        self.flush()
        os.rename(self.filename + '.tmp', self.filename)

In [None]:
def make_prototile_zoom(z, subsample):
    """Distribute records from the master file into the 2**z x 2**z prototiles of zoom z.

    z: zoom level; the level has dim = 2**z tiles per side.
    subsample: fraction of the master file to use.  Values below 1 keep only
        the first round(total * subsample) records; other values use all of them.

    Each record read from `master` is written to the tile selected by its
    x and y fields (index 0 and 1), scaled from the 0..256 range to 0..dim.
    A PrototileWriter is created, and finally closed, for every tile of the
    level, including tiles that receive no records.
    """
    dim = 2 ** z
    with Stopwatch('Building %d prototiles in %s/%d with subsample %g%%' %
                   (dim * dim, prototiledir, z, subsample * 100.0)):
        tiles = [[PrototileWriter(z, x, y) for y in range(dim)] for x in range(dim)]

        # Floor division keeps the record count an integer on any Python version.
        num_records = os.stat(master).st_size // record_len

        if subsample < 1:
            num_records = int(round(num_records * subsample))

        with open(master, 'rb') as master_in:
            while num_records > 0:
                # Read the master file in chunks of at most 1024 records.
                records_to_read = min(num_records, 1024)
                chunk = master_in.read(records_to_read * record_len)
                assert len(chunk) == records_to_read * record_len
                records = numpy.frombuffer(chunk, dtype=numpy_record_type)
                assert len(records) == records_to_read
                num_records -= records_to_read
                for record in records:
                    xtile = int(record[0] * dim / 256)
                    ytile = int(record[1] * dim / 256)
                    tiles[xtile][ytile].write(record)

        for ytile in range(dim):
            for xtile in range(dim):
                tiles[xtile][ytile].close()


In [None]:
# Zoom behavior.  bad(1) means brightness step change on level switch.
# bad(2) means low res, or brightness change if we render more levels

#       0-5    5-10   10+
# PNG   bad(1) good   bad(2)
# BIN   good   bad(1) good

# PNG
#

# (zoom level, fraction of master.bin used).  Levels 0-5 share one fraction;
# from level 6 the fraction grows 4x per level until the full set at level 10.
prototile_zoom_subsamples = [
    ( 0, 0.001),  # fully zoomed out
    ( 1, 0.001),
    ( 2, 0.001),
    ( 3, 0.001),
    ( 4, 0.001),
    ( 5, 0.001),
    ( 6, 0.004),
    ( 7, 0.016),
    ( 8, 0.064),
    ( 9, 0.256),
    (10, 1.0),    # zoomed in to metropolitan area
]

for zoom_level, subsample_fraction in prototile_zoom_subsamples:
    make_prototile_zoom(zoom_level, subsample_fraction)

In [None]:
!du -sh prototiles002

In [None]:
#def check_point_file(filename):
#    n = 0
#    with open(filename, 'rb') as file_in:
#        print 'Checking %s' % filename
#        while True:
#            rec = file_in.read(record_len)
#            if not rec:
#                return
#            (x, y, blockIdx, seqWithinBlock) = struct.unpack(record_format, rec)
#            if x < 0 or x >= 256:
#                print 'Error at %s:%d' % (filename, n)
#                print (x, y, blockIdx, seqWithinBlock)
#            n += 1