In [5]:
import array, csv, datetime, json, math, multiprocessing, os, random, re, shutil
import shapely, shapely.wkb, struct, subprocess, sys, threading, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of a notebook in this module's global namespace.

    filename_or_url -- local path, or an http(s) URL, of an .ipynb file.

    Both notebook layouts are handled: nbformat 4 (top-level 'cells', code
    stored under 'source') and the older worksheet layout (code stored under
    'input').  All code cells are joined into one program and executed, so
    every name the notebook defines becomes a global here.

    NOTE(review): this runs arbitrary code from the given file or URL; only
    point it at notebooks you trust.
    """
    if re.match(r'https?:', filename_or_url):
        handle = urllib2.urlopen(filename_or_url)
    else:
        handle = open(filename_or_url)
    # Release the file/socket even if reading fails.
    try:
        nb = handle.read()
    finally:
        handle.close()
    jsonNb = json.loads(nb)
    # check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells = jsonNb['cells']
        source_key = 'source'
    else:
        cells = jsonNb['worksheets'][0]['cells']
        source_key = 'input'
    code = '\n'.join([''.join(cell[source_key])
                      for cell in cells if cell['cell_type'] == 'code'])
    # Tuple form of exec: equivalent to `exec code in globals()` on Python 2
    # and also valid on Python 3.
    exec(code, globals())

# Run the shared helper notebook inside this kernel's globals.  Helpers used
# below that this notebook does not define (e.g. set_default_psql_database)
# presumably come from it -- TODO confirm.  Must run before any of them is called.
exec_ipynb('timelapse-utilities.ipynb')

# Choose the census2010 database as the default for the psql helpers.
set_default_psql_database('census2010')

set default_psql_database to census2010


Compute total population at block level for checking
----------------------------------------------------

In [6]:
# Optional sanity check on the 2000 data: total population (p001001) for each
# summary level.  Disabled by default; set to True to run the query.
test_old_pop = False

if test_old_pop:
    # Pair each data row with its geography row on (stusab, logrecno).
    # Comparing geo2000's columns with themselves would always be true and
    # degrade the join to a cross product, inflating every sum.
    # sumlev is selected too so each sum is labelled with its group.
    psql("SELECT sumlev, SUM(p001001) FROM sf1_2000_p1 "
         "JOIN geo2000 USING (stusab, logrecno) "
         "GROUP BY sumlev")

Utility Functions
-----------------
Consider moving some of these to timelapse-utilities.ipynb

In [7]:
def get_column_names(table_name, pattern='%'):
    """List the column names of a table or view from information_schema.

    table_name -- relation whose columns are listed.
    pattern    -- SQL LIKE pattern the column names must match; the default
                  '%' matches every column.

    Returns the first field of every row query_psql yields, as a list.
    """
    sql = ("SELECT column_name "
           "FROM information_schema.columns "
           "WHERE table_name='{table_name}' "
           "AND column_name LIKE '{pattern}'").format(table_name=table_name,
                                                      pattern=pattern)
    names = []
    for row in query_psql(sql):
        names.append(row[0])
    return names

#get_column_names('sf1_2000_p1', pattern='p001%')

# convert e.g. p35e to p035e
def canonicalize_census_table_name(name):
    """Zero-pad the number in a census table name to at least three digits.

    name -- lowercase letters, then digits, then at most one trailing
            lowercase letter (e.g. 'p1', 'p35e').

    Returns the padded name, e.g. 'p035e'.  A name that does not have this
    shape makes re.match return None, so the call fails with AttributeError.
    """
    parsed = re.match(r'([a-z]+)(\d+)([a-z])?$', name)
    letters, digits, trailing = parsed.groups()
    # The optional trailing letter comes back as None when absent.
    return '{0}{1:03d}{2}'.format(letters, int(digits), trailing or '')

def get_census_column_names_from_view(table_name):
    """Return the columns of a view that start with its canonical table name.

    The text after the last underscore of table_name (e.g. 'p35e' in
    'sf1_2000_p35e') is canonicalized (to 'p035e') and used as the prefix
    that the returned column names must start with.
    """
    short_name = table_name.rsplit('_', 1)[-1]
    column_prefix = canonicalize_census_table_name(short_name)
    return get_column_names(table_name, pattern=column_prefix + '%')
    
#print get_census_column_names_from_view('sf1_2000_p35e')
#print get_census_column_names_from_view('sf1_p1')


def get_view_names(pattern='%'):
    """List the database views whose name matches a SQL LIKE pattern.

    pattern -- LIKE pattern for the view name; the default '%' matches all.

    Returns the first field of every row query_psql yields, as a list.
    """
    sql = ("SELECT table_name "
           "FROM INFORMATION_SCHEMA.views "
           "WHERE table_name LIKE '{pattern}'").format(pattern=pattern)
    names = []
    for row in query_psql(sql):
        names.append(row[0])
    return names

#view_names = get_view_names('sf1_2000_%')
    

Create block table indexed by geoid
-----------------------------------

In [22]:
def psql_table_exists(table_name):
    """Report whether table_name can be selected from.

    Probes the relation with a one-row SELECT: returns True when the query
    succeeds and False when query_psql raises an ordinary error.

    Only Exception subclasses count as "does not exist"; KeyboardInterrupt
    and SystemExit propagate so a run can still be interrupted.
    """
    try:
        query_psql('SELECT * FROM {table_name} LIMIT 1'.format(**locals()))
        return True
    except Exception:
        return False

def create_block_indexed_table_from_view(view_name, year, force=False):
    """Materialize a census view as a table keyed by block geoid.

    view_name -- source view, e.g. 'sf1_2000_p1' or 'sf1_p1'.
    year      -- census year; selects the geo{year} table and names the
                 geoid{year} key column.
    force     -- rebuild even when the target table already exists.

    The target is built under a temporary name from the rows with
    sumlev='101', given a unique index on the geoid column that is promoted
    to the primary key, and then renamed to its final name.  When the target
    already exists and force is false, a message is written to stdout and
    nothing else happens.
    """
    tokens = view_name.split('_')
    addl = []
    # 2000 tables look like sf1_2000_p1;  rename to sf1_2000_block_p001
    # 2010 tables look like sf1_p1;       rename to sf1_2010_block_p001
    if year == 2010:
        addl.append('2010')
    addl.append('block')
    tokens = (tokens[0:-1] +
              addl +
              [canonicalize_census_table_name(tokens[-1])])
    table_name = '_'.join(tokens)
    if psql_table_exists(table_name) and not force:
        sys.stdout.write('{table_name} already exists\n'.format(**locals()))
        return
    tmp_table_name = table_name + "_tmp"
    data_columns = ', '.join(get_census_column_names_from_view(view_name))
    # IF EXISTS: on a first run neither table is there, and a plain DROP
    # would report an error for each of them.
    psql("DROP TABLE IF EXISTS {table_name}".format(**locals()))
    psql("DROP TABLE IF EXISTS {tmp_table_name}".format(**locals()))
    psql(("CREATE TABLE {tmp_table_name} AS "
          "SELECT (state || county || tract || block) AS geoid{year}, "
          "       {data_columns} "
          "FROM {view_name} "
          "JOIN geo{year} USING (stusab, logrecno) "
          "WHERE sumlev='101' ").format(**locals()))
    # Index and constraint are named after the final table, not the temp one.
    psql(("CREATE UNIQUE INDEX {table_name}_idx "
          "ON {tmp_table_name} (geoid{year})").format(**locals()))
    psql(("ALTER TABLE {tmp_table_name} "
          "ADD CONSTRAINT {table_name}_pkey PRIMARY KEY "
          "  USING INDEX {table_name}_idx;").format(**locals()))
    psql("ALTER TABLE {tmp_table_name} RENAME TO {table_name}".format(**locals()))
    

In [None]:
# Build one block-indexed table per sf1 view, fanned out over worker threads.
pool = SimpleThreadPoolExecutor(8)  # seems good for 64GB RAM earthserve2

for view_name in get_view_names('sf1_%'):
    # 2000 views carry the year in their name (sf1_2000_p1); the rest are 2010.
    year = 2000 if '2000' in view_name else 2010
    pool.submit(create_block_indexed_table_from_view, view_name, year)

pool.shutdown()

Execution of SELECT table_name FROM INFORMATION_SCHEMA.views WHERE table_name LIKE 'sf1_%'
took 0.00856805 seconds and returned 617 rows
Execution of SELECT * FROM sf1_2010_block_p001 LIMIT 1
took 0.00661778 seconds and returned 1 rows
sf1_2010_block_p001 already exists
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_p3' AND column_name LIKE 'p003%'
took 0.0152152 seconds and returned 8 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_p2' AND column_name LIKE 'p002%'
took 0.0138178 seconds and returned 6 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_p6' AND column_name LIKE 'p006%'
took 0.012661 seconds and returned 7 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf1_p4' AND column_name LIKE 'p004%'
took 0.01388 seconds and returned 3 rows
Execution of SELECT column_name FROM information_schema.columns WHERE table_name='sf

Test creating table
-------------------

In [14]:
# Test
#create_block_indexed_table_from_view('sf1_2000_p1', 2000)
# psql() prints its own result and, per the recorded output, returns None,
# so it is called without `print` to avoid a stray "None" after each result.
# Raw string: the backslash in psql's \d meta-command is meant literally.
psql(r'\d sf1_2000_block_p001')
psql('select * from sf1_2000_block_p001 LIMIT 10')
psql('select sum(p001001) from sf1_2000_block_p001')

\d sf1_2000_block_p001
Finished execution in 0.117092 secs: Table "public.sf1_2000_block_p001"
  Column   |  Type   | Modifiers 
-----------+---------+-----------
 geoid2000 | text    | not null
 p001001   | integer | 
Indexes:
    "sf1_2000_block_p001_pkey" PRIMARY KEY, btree (geoid2000)
None
select * from sf1_2000_block_p001 LIMIT 10
Finished execution in 0.103305 secs: geoid2000    | p001001 
-----------------+---------
 020130001001052 |     713
 010010211002045 |     115
 050019808001184 |       0
 040019442001073 |      64
 060014271001000 |      27
 080010085121012 |       0
 090010000000992 |       0
 110010001001000 |     279
 100010422021006 |      39
 120010002001000 |      35
(10 rows)
None
select sum(p001001) from sf1_2000_block_p001
Finished execution in 1.33903 secs: sum    
-----------
 285230516
(1 row)
None
