Download, unpack, and import into database the 2010 Census Block Shapefiles
===========================================================================

In [1]:
import csv, json, os, re, shutil
import subprocess, sys, threading, urllib2

def exec_ipynb(filename_or_url):
    """Run every code cell of a notebook inside this module's global namespace.

    filename_or_url: path of a local .ipynb file, or an http(s) URL to one.

    Two notebook layouts are handled: nbformat 4 (cells under the top-level
    'cells' key, code stored in 'source') and the older layout (cells under
    'worksheets'[0], code stored in 'input'). The code of all code cells is
    joined with newlines and executed in one go, so every name those cells
    define becomes a global here.

    NOTE(review): this executes whatever code the notebook contains; only
    point it at notebooks you trust, especially when passing a URL.
    """
    if re.match(r'https?:', filename_or_url):
        stream = urllib2.urlopen(filename_or_url)
    else:
        stream = open(filename_or_url)
    # Release the file/HTTP handle as soon as the text has been read
    # instead of leaving it open until garbage collection.
    try:
        nb = stream.read()
    finally:
        stream.close()
    jsonNb = json.loads(nb)
    # check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells, source_key = jsonNb['cells'], 'source'
    else:
        cells, source_key = jsonNb['worksheets'][0]['cells'], 'input'
    sources = []
    for cell in cells:
        if cell['cell_type'] == 'code':
            sources.append(''.join(cell[source_key]))
    # Tuple form of exec: means the same as "exec code in globals()" on
    # Python 2 and is also valid syntax on Python 3.
    exec('\n'.join(sources), globals())

# Execute the code cells of timelapse-utilities.ipynb in this notebook's globals.
# NOTE(review): names used below but not defined in this notebook (psql, download_file,
# unzip_file, state_ids) presumably come from that notebook -- confirm.
exec_ipynb('timelapse-utilities.ipynb')

Census 2010 subdivisions, for U.S. only:
    
    ~50 states
    ~73K census tracts (approx 4000 people)
    ~218K block groups (approx 1500 people)
    ~11M blocks

How to set up postgres
======================
    
    sudo -u postgres createuser --superuser $USER
    sudo -u postgres createdb $USER
    touch ~/.psql_history

In [7]:
# Create the timelapse database via the psql command line. When the database is already
# there, psql prints 'ERROR:  database "timelapse" already exists' (see the recorded output).
!psql -c 'CREATE DATABASE timelapse;'
# Enable PostGIS; IF NOT EXISTS makes this statement repeatable (it only emits a NOTICE).
# NOTE(review): psql() is not defined in this notebook -- presumably a helper from
# timelapse-utilities.ipynb that runs the statement against the timelapse database; confirm.
psql('CREATE EXTENSION IF NOT EXISTS postgis;')

ERROR:  database "timelapse" already exists
CREATE EXTENSION IF NOT EXISTS postgis;
Finished execution: CREATE EXTENSION

NOTICE:  extension "postgis" already exists, skipping



In [8]:
# Directory prefix for the downloaded census block zip files. download_shapes concatenates
# it directly with the file name, so the trailing slash is required.
capture_dir = "capture/tabblock_2010/"

Download 2010 Census Block Shapefiles
-------------------------------------

In [9]:
def download_shapes(state_id):
    """Fetch and unpack the 2010 census block shapefile archive for one state.

    state_id is substituted into the TIGER2010 TABBLOCK archive URL; the zip
    is stored under capture_dir using the URL's base name and is then passed
    to unzip_file.
    """
    # FIPS and GNIS codes file: https://www.census.gov/geo/reference/docs/state.txt
    url_template = 'http://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/2010/tl_2010_%s_tabblock10.zip'
    url = url_template % state_id
    archive_name = os.path.basename(url)
    filename = capture_dir + archive_name
    download_file(url, filename)
    unzip_file(filename)
    
# Download all state shapefiles, one thread per state; every thread is
# started before any of them is waited on.
threads = [threading.Thread(target=download_shapes, args=(state_id,))
           for state_id in state_ids]

for t in threads:
    t.start()

for t in threads:
    t.join()

capture/tabblock_2010/tl_2010_01_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_01_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_02_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_02_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_04_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_04_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_05_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_06_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_06_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_05_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_08_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_08_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_09_tabblock10.zip already downloaded
capture/tabblock_2010/tl_2010_09_tabblock10.zip already unzipped
capture/tabblock_2010/tl_2010_11_tabblock10.zip already downloaded
capture/t

Load 2010 census block shapefiles into postgres
-----------------------------------------------

In [26]:
# Warning: load_shapes_into_db appends rows, so loading a state a second time without first
# dropping the table will leave duplicate records in the database.
# TODO: check if the database is already there and populated and if so, skip the work...

def load_shapes_into_db(state_id, prepare_only=False):
    """Pipe one state's census block shapefile through shp2pgsql into psql.

    state_id:     state code used in the shapefile name, e.g. '01'.
    prepare_only: when True, run "shp2pgsql -p" so that only the
                  tl_2010_tabblock10 table is created; otherwise run
                  "shp2pgsql -a" to append the state's rows to that table.

    The shapefile is read from capture_dir and the generated SQL is fed to
    "psql -q -d timelapse". Whatever psql prints on stdout is echoed.

    Raises subprocess.CalledProcessError when shp2pgsql or psql exits with a
    non-zero status. (A shell pipeline would only report psql's status, so a
    failing shp2pgsql would go unnoticed.)
    """
    shp_cmd = ['shp2pgsql']
    if prepare_only:
        shp_cmd.append('-p')   # prepare the tables only, don't load
        sys.stdout.write('Creating table tl_2010_tabblock10 schema using state %s\n' % state_id)
    else:
        shp_cmd.append('-a')   # append to tables
        sys.stdout.write('Appending state %s to tl_2010_tabblock10\n' % state_id)
    # Transform lat, lon from NAD83 (EPSG:4269) to WGS84 (EPSG:4326)
    shp_cmd += ['-s', '4269:4326']
    # Same location download_shapes unpacks into: <capture_dir><base>/<base>.shp
    base = 'tl_2010_%s_tabblock10' % state_id
    shp_cmd += [capture_dir + '%s/%s.shp' % (base, base), 'tl_2010_tabblock10']
    psql_cmd = ['psql', '-q', '-d', 'timelapse']

    # Explicit two-process pipeline instead of shell=True, so that both exit
    # codes are visible. close_fds=True (already the Python 3 default) keeps
    # pipe ends opened concurrently by other loader threads out of these children.
    producer = subprocess.Popen(shp_cmd, stdout=subprocess.PIPE, close_fds=True)
    try:
        consumer = subprocess.Popen(psql_cmd, stdin=producer.stdout,
                                    stdout=subprocess.PIPE, close_fds=True)
    finally:
        # Drop our copy of the read end: psql keeps its own, and if psql is gone
        # (or never started) shp2pgsql gets a broken pipe instead of blocking.
        producer.stdout.close()
    out = consumer.communicate()[0]
    for proc, argv in ((producer, shp_cmd), (consumer, psql_cmd)):
        status = proc.wait()
        if status != 0:
            raise subprocess.CalledProcessError(status, ' '.join(argv), out)
    sys.stdout.write('psql output: %s\n' % out.strip())
    
# Drop and recreate tl_2010_tabblock10.
# IF EXISTS turns the drop into a no-op (PostgreSQL only emits a NOTICE) when the table has
# not been created yet, e.g. on the first run against a fresh database.
psql('DROP TABLE IF EXISTS tl_2010_tabblock10;')
# Create the empty table from the first state's shapefile schema (no rows are loaded here)
load_shapes_into_db(state_ids[0], prepare_only=True)
# Add indices to census block shapefile table
psql('CREATE INDEX ON tl_2010_tabblock10 (geoid10);')
psql('CREATE INDEX ON tl_2010_tabblock10 USING GIST (geom);')

# Loading all states in parallel worked OK on 32-core 64GB earthdev2.  psql was i/o bound, but no paging occurred
# On a lesser machine it might be important to do only a few in parallel at a time
threads = []

# One appending loader thread per state, started immediately
for state_id in state_ids:
    threads.append(threading.Thread(target=load_shapes_into_db, args=(state_id, False)))
    threads[-1].start()

# Block until every state has finished loading
for t in threads:
    t.join()

DROP TABLE tl_2010_tabblock10;
Finished execution: DROP TABLE


Creating table tl_2010_tabblock10 schema using state 01
psql output: addgeometrycolumn                          
--------------------------------------------------------------------
 public.tl_2010_tabblock10.geom SRID:4326 TYPE:MULTIPOLYGON DIMS:2 
(1 row)
CREATE INDEX ON tl_2010_tabblock10 (geoid10);
Finished execution: CREATE INDEX


CREATE INDEX ON tl_2010_tabblock10 USING GIST (geom);
Finished execution: CREATE INDEX


Appending state 01 to tl_2010_tabblock10
Appending state 04 to tl_2010_tabblock10
Appending state 02 to tl_2010_tabblock10
Appending state 05 to tl_2010_tabblock10
Appending state 06 to tl_2010_tabblock10
Appending state 08 to tl_2010_tabblock10
Appending state 09 to tl_2010_tabblock10
Appending state 10 to tl_2010_tabblock10
Appending state 11 to tl_2010_tabblock10
Appending state 12 to tl_2010_tabblock10
Appending state 13 to tl_2010_tabblock10
Appending state 15 to tl_2010_tabblock10
Appending state 

Calculate and store WGS84 centroids
-----------------------------------

This took around 30 mins on earthserve2

In [28]:
# Add a Point column in SRID 4326 (WGS84) to hold each block's centroid.
# NOTE(review): the ALTER TABLE has no guard, so re-running this cell once the column exists
# presumably fails -- confirm how psql() reports that.
psql('ALTER TABLE tl_2010_tabblock10 ADD centroid geometry(Point,4326);')
# Fill the new column for every row (11078297 rows in the recorded run).
psql('UPDATE tl_2010_tabblock10 SET centroid = St_Centroid(geom);')

ALTER TABLE tl_2010_tabblock10 ADD centroid geometry(Point,4326);
Finished execution: ALTER TABLE


UPDATE tl_2010_tabblock10 SET centroid = St_Centroid(geom);
Finished execution: UPDATE 11078297




tl_2010_tabblock10 table structure
----------------------------------

In [2]:
# Show the table's columns and types. Raw string: '\d' is not a valid escape sequence in a
# normal string literal (newer Python versions warn about it); the value sent is unchanged.
psql(r'\d tl_2010_tabblock10')

\d tl_2010_tabblock10
Finished execution in 2.57236 secs: Table "public.tl_2010_tabblock10"
   Column   |            Type             |                            Modifiers                             
------------+-----------------------------+------------------------------------------------------------------
 gid        | integer                     | not null default nextval('tl_2010_tabblock10_gid_seq'::regclass)
 statefp10  | character varying(2)        | 
 countyfp10 | character varying(3)        | 
 tractce10  | character varying(6)        | 
 blockce10  | character varying(4)        | 
 geoid10    | character varying(15)       | 
 name10     | character varying(10)       | 
 mtfcc10    | character varying(5)        | 
 ur10       | character varying(1)        | 
 uace10     | character varying(5)        | 
 uatyp10    | character varying(1)        | 
 funcstat10 | character varying(1)        | 
 aland10    | double precision            | 
 awater10   | double precision         

In [31]:
# Sanity check: total number of census blocks loaded (11078297 in the recorded run,
# the same number of rows the centroid UPDATE reported).
psql('SELECT COUNT(*) FROM tl_2010_tabblock10')

SELECT COUNT(*) FROM tl_2010_tabblock10
Finished execution:   count   
----------
 11078297
(1 row)





In [6]:
# Total block area per state FIPS code. geom is cast to geography before ST_AREA --
# NOTE(review): per the PostGIS docs that should give square meters; confirm.
# Slow: the recorded run took 1276.26 secs.
psql("SELECT statefp10, SUM(ST_AREA(geography(geom))) FROM tl_2010_tabblock10 GROUP BY statefp10 ORDER BY statefp10")

SELECT statefp10, SUM(ST_AREA(geography(geom))) FROM tl_2010_tabblock10 GROUP BY statefp10 ORDER BY statefp10
Finished execution in 1276.26 secs: statefp10 |       sum        
-----------+------------------
 01        | 135767386785.177
 02        | 1723336231602.72
 04        | 295233966389.403
 05        | 137731759748.111
 06        | 423966986967.012
 08        | 269602109668.028
 09        | 14357704640.4523
 10        | 6445768047.60594
 11        | 176999629.839292
 12        | 170310252747.451
 13        | 153910558920.819
 15        | 28313002857.9046
 16        | 216442088143.975
 17        | 149995358564.624
 18        | 94326244618.7675
 19        | 145745894630.339
 20        | 213099884356.535
 21        | 104655739133.661
 22        | 135658630228.489
 23        | 91634016908.5077
 24        | 32131155697.2027
 25        |  27335741662.889
 26        | 250486838403.137
 27        | 225161524593.809
 28        | 125437676538.904
 29        |   180540284484.9
 30        | 