In [1]:
import array, csv, datetime, json, math, multiprocessing, numpy, os, random, re, shutil
import shapely, shapely.wkb, struct, subprocess, sys, tempfile, threading, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook in this module's global namespace.

    filename_or_url: a local path, or a URL starting with ``http:`` /
        ``https:`` (fetched with ``urllib2.urlopen``).

    Both the nbformat 4 layout (top-level ``cells`` whose code lives under
    ``source``) and the older layout (``worksheets[0]['cells']`` whose code
    lives under ``input``) are handled.  Only the first worksheet of an old
    notebook is executed.  Non-code cells are skipped.

    The cells are joined into one program and run with ``exec``, so the
    notebook must be trusted, and cells containing IPython-only syntax
    (``!cmd``, ``%magic``) will fail to compile.
    """
    if re.match(r'https?:', filename_or_url):
        source = urllib2.urlopen(filename_or_url)
    else:
        source = open(filename_or_url)
    # Close the handle explicitly instead of relying on garbage collection.
    try:
        nb = source.read()
    finally:
        source.close()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells = jsonNb['cells']
        source_key = 'source'
    else:
        cells = jsonNb['worksheets'][0]['cells']
        source_key = 'input'
    # A list comprehension (not a generator expression) is used on purpose:
    # Python 2 rejects exec in a function that has a nested scope with free
    # variables, and a generator expression would create one.
    code = '\n'.join([''.join(cell[source_key])
                      for cell in cells if cell['cell_type'] == 'code'])
    # Tuple form of exec: equivalent to `exec code in globals()` on Python 2
    # and also valid syntax on Python 3.
    exec(code, globals())

# Run the code cells of timelapse-utilities.ipynb in this notebook's globals.
# Presumably this is where psql, query_psql, SimpleThreadPoolExecutor and the
# other helpers used below are defined -- they are not defined in this file.
exec_ipynb('timelapse-utilities.ipynb')

Compute total population at block level for checking
----------------------------------------------------

In [None]:
# Flip to True to run the (optional) sanity query below.
test_old_pop = False

# Sum of p001001 from sf1_2000_p1 per summary level (sumlev).  The join
# matches each data row to its geo2000 row on (stusab, logrecno), the same
# keys used when the block tables are built.  Comparing geo2000's columns
# with themselves would be always true and turn the join into a cross join.
old_pop_query = ("SELECT SUM(p001001) FROM sf1_2000_p1 "
                 "JOIN geo2000 USING (stusab, logrecno) "
                 "GROUP BY sumlev")

if test_old_pop:
    psql(old_pop_query)

Create block table indexed by geoid
-----------------------------------

In [None]:
def create_block_indexed_table_from_view(view_name, year, force=False):
    """Materialize a census view as a block-level table keyed by geoid.

    view_name: name of the source view, e.g. 'sf1_2000_p1' or 'sf1_p1'.
    year: census year; used in the geo{year} / census{year}_block_idxs table
        names and the geoid{year} / blockidx{year} column names.
    force: when False (default) and the target table already exists, a note
        is written to stdout and nothing is changed.  When True the table is
        dropped and rebuilt.

    The target name is the view name with 'block' (preceded by '2010' for
    year 2010) inserted before the last '_'-separated token, and that token
    passed through canonicalize_census_table_name.  Rows are restricted to
    sumlev='101'.  The table is built under a '_tmp' name, given a primary
    key on geoid{year}, and then renamed into place.  Returns None.
    """
    tokens = view_name.split('_')
    addl = []
    # 2000 tables look like sf1_2000_p1;  rename to sf1_2000_block_p001
    # 2010 tables look like sf1_p1;       rename to sf1_2010_block_p001
    if year == 2010:
        addl.append('2010')
    addl.append('block')
    tokens = (tokens[0:-1] +
              addl +
              [canonicalize_census_table_name(tokens[-1])])
    table_name = '_'.join(tokens)
    if psql_table_exists(table_name) and not force:
        # The message must be formatted, otherwise the literal text
        # '{table_name}' is printed.
        sys.stdout.write('{table_name} already exists\n'.format(**locals()))
        return
    tmp_table_name = table_name + "_tmp"
    data_columns = ', '.join(get_census_column_names_from_view(view_name))
    # IF EXISTS: on a first build neither table is present, and the _tmp
    # table is normally absent; a plain DROP TABLE is an error in that case.
    psql("DROP TABLE IF EXISTS {table_name}".format(**locals()))
    psql("DROP TABLE IF EXISTS {tmp_table_name}".format(**locals()))
    psql("""
CREATE TABLE {tmp_table_name} AS
SELECT blockidx{year}, geoid{year}, {data_columns}
FROM {view_name}
JOIN geo{year} USING (stusab, logrecno)
JOIN census{year}_block_idxs ON (geoid{year} = state || county || tract || block)
WHERE sumlev='101'
""".format(**locals()))
    # Build the unique index first, then promote it to the primary key.
    psql(("CREATE UNIQUE INDEX {table_name}_idx "
          "ON {tmp_table_name} (geoid{year})").format(**locals()))
    psql(("ALTER TABLE {tmp_table_name} "
          "ADD CONSTRAINT {table_name}_pkey PRIMARY KEY "
          "  USING INDEX {table_name}_idx;").format(**locals()))
    psql("ALTER TABLE {tmp_table_name} RENAME TO {table_name}".format(**locals()))

# Build the block table for the 2010 'sf1_p1' view; force=True rebuilds it
# even when the table already exists.
create_block_indexed_table_from_view('sf1_p1', 2010, force=True)

In [None]:
# Build a block-indexed table for every view matching 'sf1_%', submitting the
# work to a pool created with size 8.
pool = SimpleThreadPoolExecutor(8)  # seems good for 64GB RAM earthserve2

for view_name in get_view_names('sf1_%'):
    # A view whose name contains '2000' is treated as a 2000-census view;
    # every other view is treated as 2010.
    year = 2000 if '2000' in view_name else 2010
    pool.submit(create_block_indexed_table_from_view, view_name, year)

pool.shutdown()

Sample block table
------------------

In [None]:
# Print whatever the psql helper returns for a '\d' (describe) command on the
# 2000 P1 block table, followed by its first ten rows.
print(psql(r'\d sf1_2000_block_p001'))
print(psql('select * from sf1_2000_block_p001 LIMIT 10'))

Generate binary columns
-----------------------

In [2]:
def generate_binary_column(table, year, column, force=False):
    """Dump one column of a block table to a cached numpy file.

    table: block table to read from.
    year: census year; selects the geoid{year} column and names the dataset
        directory 'census{year}_block{year}'.
    column: column to export; also the base name of the output file.
    force: when False (default) an existing cache file is left untouched and
        a note is written to stdout.  When True it is regenerated.

    The values are read ordered by geoid{year}, skipping geoids that start
    with '72' (state FIPS code 72, Puerto Rico), and written with numpy.save
    as a uint16 array to 'columncache/<dataset>/<column>.numpy'.  A single 0
    is stored ahead of the data -- presumably so array index i lines up with
    a 1-based block index; TODO confirm against the consumer of these files.
    Returns None.

    NOTE(review): values above 65535 do not fit in uint16; confirm that no
    exported column can exceed that.
    """
    dataset = 'census{year}_block{year}'.format(**locals())

    cache_dir = 'columncache'

    dir_name = '{cache_dir}/{dataset}'.format(**locals())
    # Pure-Python equivalent of `mkdir -p`.  The IPython `!` shell escape is
    # not valid Python, so it breaks when this cell is run through plain exec.
    # Another worker may create the directory first, hence the isdir re-check
    # instead of testing for existence up front.
    try:
        os.makedirs(dir_name)
    except OSError:
        if not os.path.isdir(dir_name):
            raise

    cache_filename = '{dir_name}/{column}.numpy'.format(**locals())
    if os.path.exists(cache_filename) and not force:
        sys.stdout.write('{cache_filename} already exists\n'.format(**locals()))
        return

    query = """
SELECT {column}
FROM {table}
WHERE LEFT(geoid{year}, 2) != '72'
ORDER BY geoid{year};
""".format(**locals())

    data = numpy.array([0] + [x[0] for x in query_psql(query)],
                       dtype=numpy.uint16)
    nrecs = len(data)

    # Write to a temporary file and rename it into place so a partially
    # written file never appears under the final name.
    tmp_file = tempfile.NamedTemporaryFile(dir=cache_dir, delete=False)
    try:
        try:
            numpy.save(tmp_file, data)
        finally:
            tmp_file.close()
        os.rename(tmp_file.name, cache_filename)
    except BaseException:
        # delete=False means nothing else removes the temp file on failure.
        if os.path.exists(tmp_file.name):
            os.remove(tmp_file.name)
        raise
    sys.stdout.write('Wrote {nrecs} records to {cache_filename}\n'.format(**locals()))

#generate_binary_column('sf1_2010_block_p001', 2010, 'p001001', force=True)

In [None]:
# Export every census column of every 2010 block table, regenerating existing
# cache files (force=True).
pool = SimpleThreadPoolExecutor(8)  # seems good for 64GB RAM earthserve2

# To redo a single table while debugging, loop over a literal list such as
# ['sf1_2010_block_p012'] instead of the sorted table names.
for table in sorted(get_table_names('sf1_2010_block_%')):
    table_columns = get_census_column_names_from_view(table)
    for column in table_columns:
        pool.submit(generate_binary_column, table, 2010, column, force=True)

pool.shutdown()
# Trailing None keeps the cell from displaying a result value.
None

Execution of SELECT table_name FROM INFORMATION_SCHEMA.tables WHERE table_schema='public' AND table_type='BASE TABLE' AND table_name LIKE 'sf1_2010_block_%'
took 0.022999 seconds and returned 331 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_2010_block_h001' AND column_name LIKE 'h001%'
took 0.0166321 seconds and returned 1 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_2010_block_h002' AND column_name LIKE 'h002%'
took 0.0142901 seconds and returned 6 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_2010_block_h003' AND column_name LIKE 'h003%'
took 0.0222962 seconds and returned 3 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_2010_block_h004' AND column_name LIKE 'h004%'
took 0.0100222 seconds and returned 4 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_2010_block_h0

In [None]:
# Row counts for comparison: all rows of the P1 and P12 block tables, then
# P12 restricted to geoids that do not start with '72' -- the same filter
# generate_binary_column applies when exporting a column.
psql("SELECT COUNT(*) FROM sf1_2010_block_p001")
psql("SELECT COUNT(*) FROM sf1_2010_block_p012")
psql("SELECT COUNT(*) FROM sf1_2010_block_p012 WHERE LEFT(geoid2010,2) != '72'")

In [11]:
def parse_hex(hex):
    """Unimplemented placeholder: ignores ``hex`` and always returns None.

    NOTE(review): nothing in this notebook calls it; implement or delete.
    """
    return None

def pack_color(color):
    """Pack an RGB mapping into one float: r + g * 256 + b * 65536.

    color: mapping with numeric 'r', 'g' and 'b' entries.
    Returns a float, since the green and blue terms are scaled by float
    constants.
    """
    red_term = color['r']
    green_term = color['g'] * 256.0
    blue_term = color['b'] * 256.0 * 256.0
    return red_term + green_term + blue_term

def parse_color(color):
    """Convert an 'rrggbb' or '#rrggbb' hex string to a packed color value.

    color: six hexadecimal digits, optionally preceded by '#'.
    Returns whatever pack_color returns for the decoded
    {'r': ..., 'g': ..., 'b': ...} components.
    Raises ValueError (an Exception subclass, so existing
    ``except Exception`` handlers still match) when ``color`` is not a
    string of that form.
    """
    # type(u'') is `unicode` on Python 2 and `str` on Python 3, so text
    # strings of either kind are accepted (e.g. values decoded by json).
    if isinstance(color, (str, type(u''))):
        # Validate the whole string: only an optional leading '#' is
        # stripped, and int()'s tolerance for signs/whitespace is excluded.
        match = re.match(r'#?([0-9a-fA-F]{6})\Z', color)
        if match:
            digits = match.group(1)
            return pack_color({'r': int(digits[0:2], 16),
                               'g': int(digits[2:4], 16),
                               'b': int(digits[4:6], 16)})
    # Wrap in a 1-tuple so a tuple argument is shown rather than being
    # unpacked as multiple format arguments.
    raise ValueError('cannot parse color %s' % (color,))

def parse_colors(colors):
    """Parse each entry of ``colors`` with parse_color, in order.

    Returns a one-dimensional numpy float32 array holding the packed values.
    """
    packed_values = []
    for entry in colors:
        packed_values.append(parse_color(entry))
    return numpy.array(packed_values, dtype=numpy.float32)
    
# Example: pack a single '#rrggbb' color; the cell displays the packed value.
parse_color('#1b9e77')

7839259.0

In [12]:


# Example: pack a three-color palette into a float32 array.
parse_colors(['#1b9e77','#d95f02','#7570b3'])

array([  7839259.,    155609.,  11759733.], dtype=float32)

In [8]:
# Scratch check: int() with base 16 converts a two-digit hex string.
int('aa',16)

170