Download, unpack, and import into database the 2000 and 2010 Census Block Shapefiles as defined by TIGER 2010
=============================================================================================================

Note: There are two definitions of the 2000 decennial census block shapes. The shapes originally created in 2000 by TIGER2K are quite inaccurate and should no longer be used. In 2010, TIGER 2010 created block shapes for the 2010 Decennial Census and, at the same time, recreated the 2000 Decennial Census block shapes in a much more accurate version.

This script downloads the TIGER 2010 versions of both the 2000 and 2010 decennial census block shapes.

In [1]:
import csv, json, os, re, shutil
import subprocess, sys, threading, traceback, urllib2

def exec_ipynb(filename_or_url):
    """Execute every code cell of a Jupyter notebook in this module's globals.

    filename_or_url: path to a local notebook file, or an http(s) URL to one.

    Supports the nbformat 4 layout (top-level 'cells', code under 'source')
    and the older layout (cells under 'worksheets'[0], code under 'input').
    All code cells are concatenated and executed once, so any names they
    define become globals of the calling notebook.
    """
    # Read the raw notebook JSON and always close the handle afterwards, so
    # the file / HTTP response is not left open until garbage collection.
    if re.match(r'https?:', filename_or_url):
        handle = urllib2.urlopen(filename_or_url)
    else:
        handle = open(filename_or_url)
    try:
        nb = handle.read()
    finally:
        handle.close()
    jsonNb = json.loads(nb)
    # check for the modified formatting of Jupyter Notebook v4
    if jsonNb['nbformat'] == 4:
        cells, source_key = jsonNb['cells'], 'source'
    else:
        cells, source_key = jsonNb['worksheets'][0]['cells'], 'input'
    code = '\n'.join([''.join(cell[source_key]) for cell in cells if cell['cell_type'] == 'code'])
    # The tuple form is equivalent to `exec code in globals()` under Python 2
    # and is also valid as a function call under Python 3.
    exec(code, globals())

# Run the shared utilities notebook so its definitions land in this notebook's globals.
# presumably this is where psql, download_file, unzip_file, SimpleThreadPoolExecutor
# and state_ids (all used below but not defined here) come from -- TODO confirm
exec_ipynb('timelapse-utilities.ipynb')

Census 2010 subdivisions, for U.S. only:
    
    ~50 states
    ~73K census tracts (approx 4000 people each)
    ~218K block groups (approx 1500 people each)
    ~11M blocks

How to set up postgres
======================
    
    sudo -u postgres createuser --superuser $USER
    sudo -u postgres createdb $USER
    touch ~/.psql_history

In [2]:
# Create the timelapse database; on re-runs psql reports that it already exists.
!psql -c 'CREATE DATABASE timelapse;'
# Enable PostGIS through the psql() helper (defined outside this notebook).
# NOTE(review): presumably psql() connects to the timelapse database -- confirm
psql('CREATE EXTENSION IF NOT EXISTS postgis;')

ERROR:  database "timelapse" already exists
CREATE EXTENSION IF NOT EXISTS postgis;
Finished execution in 0.0633948 secs: CREATE EXTENSION

NOTICE:  extension "postgis" already exists, skipping


In [2]:
# Root directory (a relative path) under which downloaded archives and
# unpacked shapefiles are stored.
capture_dir = 'capture'

Download Tiger2010 Census Block Shapefiles for 2000 and 2010 censuses
---------------------------------------------------------------------

In [3]:
def tiger2010_table_name(yyyy):
    """Return the table name for the TIGER 2010 block shapes of census year yyyy.

    Example: 2000 -> 'tiger2010_census2000_blocks'.
    """
    template = 'tiger2010_census{yyyy}_blocks'
    return template.format(yyyy=yyyy)

def tiger2010_file_name(state_id, yyyy):
    """Return the base file name (no extension) of a state's TIGER 2010 block file.

    state_id: state identifier inserted verbatim into the name (e.g. '01').
    yyyy:     integer census year; only its last two digits are used.

    Example: ('01', 2000) -> 'tl_2010_01_tabblock00'.
    """
    two_digit_year = '%02d' % (yyyy % 100)
    return 'tl_2010_{state_id}_tabblock{yy}'.format(state_id=state_id, yy=two_digit_year)

def tiger2010_url(state_id, yyyy):
    """Return the download URL of a state's TIGER 2010 block shapefile zip.

    state_id: state identifier as used in the TIGER file names (e.g. '01').
    yyyy:     integer census year; it selects both the URL's year directory
              and the two-digit suffix of the file name.
    """
    file_name = tiger2010_file_name(state_id, yyyy)
    return 'http://www2.census.gov/geo/tiger/TIGER2010/TABBLOCK/{yyyy}/{file_name}.zip'.format(**locals())

def tiger2010_dir(state_id, yyyy):
    """Return the path {capture_dir}/{table_name}/{file_name} for a state and census year.

    state_id: state identifier as used in the TIGER file names (e.g. '01').
    yyyy:     integer census year.
    """
    table_name = tiger2010_table_name(yyyy)
    file_name = tiger2010_file_name(state_id, yyyy)
    return '{capture_dir}/{table_name}/{file_name}'.format(capture_dir=capture_dir, **locals())
    
def tiger2010_shapefile_path(state_id, yyyy):
    """Return the path of the .shp file for a state and census year.

    The path is {capture_dir}/{table_name}/{file_name}/{file_name}.shp.

    state_id: state identifier as used in the TIGER file names (e.g. '01').
    yyyy:     integer census year.
    """
    table_name = tiger2010_table_name(yyyy)
    file_name = tiger2010_file_name(state_id, yyyy)
    return '{capture_dir}/{table_name}/{file_name}/{file_name}.shp'.format(capture_dir=capture_dir, **locals())
    
def tiger2010_id_name(yyyy):
    """Return the block-identifier column name for census year yyyy.

    Only 2000 and 2010 are known; any other year raises KeyError.
    """
    id_columns = {
        2000: 'blkidfp00',
        2010: 'geoid10',
    }
    return id_columns[yyyy]
    
def download_tiger2010_state(state_id, yyyy):
    """Download and unzip one state's TIGER 2010 block shapefile archive.

    state_id: state identifier as used in the TIGER file names (e.g. '01').
    yyyy:     integer census year.

    The archive is saved as {capture_dir}/{table_name}/{zip file name} and
    then passed to unzip_file.  download_file and unzip_file are defined
    outside this notebook.
    """
    src = tiger2010_url(state_id, yyyy)
    table_name = tiger2010_table_name(yyyy)
    # Keep the remote archive's own file name for the local copy.
    filename = os.path.basename(src)
    dest = '{capture_dir}/{table_name}/{filename}'.format(capture_dir=capture_dir, **locals())
    download_file(src, dest)
    unzip_file(dest)

In [5]:
# Show disk usage and free space for the filesystem holding the capture directory.
!df -h capture

Filesystem      Size  Used Avail Use% Mounted on
/dev/sdd1       3.6T  2.5T  990G  72% /workspace2


In [5]:
# Queue one download per (census year, state) on a pool created with max_workers=4.
# SimpleThreadPoolExecutor and state_ids are defined outside this notebook.
executor = SimpleThreadPoolExecutor(max_workers=4)

for yyyy in [2000, 2010]:
    for state_id in state_ids:
        executor.submit(download_tiger2010_state, state_id, yyyy)
        
# NOTE(review): presumably blocks until every queued download has finished -- confirm
executor.shutdown()

capture/tiger2010_census2000_blocks/tl_2010_01_tabblock00.zip already downloaded
capture/tiger2010_census2000_blocks/tl_2010_01_tabblock00.zip already unzipped
capture/tiger2010_census2000_blocks/tl_2010_02_tabblock00.zip already downloaded
capture/tiger2010_census2000_blocks/tl_2010_02_tabblock00.zip already unzipped
capture/tiger2010_census2000_blocks/tl_2010_04_tabblock00.zip already downloaded
capture/tiger2010_census2000_blocks/tl_2010_04_tabblock00.zip already unzipped
capture/tiger2010_census2000_blocks/tl_2010_05_tabblock00.zip already downloaded
capture/tiger2010_census2000_blocks/tl_2010_05_tabblock00.zip already unzipped
capture/tiger2010_census2000_blocks/tl_2010_08_tabblock00.zip already downloaded
capture/tiger2010_census2000_blocks/tl_2010_06_tabblock00.zip already downloaded
capture/tiger2010_census2000_blocks/tl_2010_08_tabblock00.zip already unzipped
capture/tiger2010_census2000_blocks/tl_2010_06_tabblock00.zip already unzipped
capture/tiger2010_census2000_blocks/tl_2

In [6]:
# -p makes shp2pgsql emit only the table-creation SQL (no rows), which shows
# the columns of the 2000 and 2010 shapefiles for state 01.
!shp2pgsql -p capture/tiger2010_census2000_blocks/tl_2010_01_tabblock00/tl_2010_01_tabblock00.shp
!shp2pgsql -p capture/tiger2010_census2010_blocks/tl_2010_01_tabblock10/tl_2010_01_tabblock10.shp

Shapefile type: Polygon
Postgis type: MULTIPOLYGON[2]
SET CLIENT_ENCODING TO UTF8;
SET STANDARD_CONFORMING_STRINGS TO ON;
BEGIN;
CREATE TABLE "tl_2010_01_tabblock00" (gid serial,
"statefp00" varchar(2),
"countyfp00" varchar(3),
"tractce00" varchar(6),
"blockce00" varchar(4),
"blkidfp00" varchar(15),
"name00" varchar(10),
"mtfcc00" varchar(5),
"ur00" varchar(1),
"uace00" varchar(5),
"funcstat00" varchar(1),
"aland00" float8,
"awater00" float8,
"intptlat00" varchar(11),
"intptlon00" varchar(12));
ALTER TABLE "tl_2010_01_tabblock00" ADD PRIMARY KEY (gid);
SELECT AddGeometryColumn('','tl_2010_01_tabblock00','geom','0','MULTIPOLYGON',2);
COMMIT;
ANALYZE "tl_2010_01_tabblock00";
Shapefile type: Polygon
Postgis type: MULTIPOLYGON[2]
SET CLIENT_ENCODING TO UTF8;
SET STANDARD_CONFORMING_STRINGS TO ON;
BEGIN;
CREATE TABLE "tl_2010_01_tabblock10" (gid serial,
"statefp10" varchar(2),
"countyfp10" varchar(3),
"tractce10" varchar(6),
"blockce10" varchar(4),
"geoid10" varchar(15),
"name10" varchar(10

Load 2000-2010 census block shapefiles into psql
------------------------------------------------

In [7]:
def load_shapes_into_db(state_id, yyyy, prepare_only=False):
    """Create the block table for a census year, or append one state's shapes to it.

    state_id:     state whose shapefile is read (e.g. '01').
    yyyy:         integer census year; selects the table and the shapefile.
    prepare_only: when True, drop any existing table, recreate its schema from
                  this state's shapefile without loading rows, then add the
                  id index, the GIST geometry index and the 'dots' column.
                  When False, append this state's rows to the existing table.

    The shapefile is converted by shp2pgsql and piped into
    `psql -q -d timelapse`.  Raises subprocess.CalledProcessError if the
    pipeline exits non-zero.
    NOTE(review): with a plain shell pipe only psql's exit status is seen, so
    a shp2pgsql failure may go unnoticed -- consider checking it explicitly.
    """
    table_name = tiger2010_table_name(yyyy)
    command = 'shp2pgsql'
    if prepare_only:
        command += ' -p'   # prepare the tables only, don't load
        # IF EXISTS keeps the very first run (no table yet) from erroring.
        psql('DROP TABLE IF EXISTS {table_name};'.format(**locals()))
        sys.stdout.write('Creating {table_name} schema using state {state_id}\n'.format(**locals()))
    else:
        command += ' -a'   # append to tables
        sys.stdout.write('Appending state {state_id} to {table_name}\n'.format(**locals()))
    # Transform lat, lon from NAD83 (EPSG:4269) to WGS84 (EPSG:4326)
    command += ' -s 4269:4326'
    shapefile_path = tiger2010_shapefile_path(state_id, yyyy)
    command += ' {shapefile_path} {table_name}'.format(**locals())
    command += ' | psql -q -d timelapse'
    out = subprocess.check_output(command, shell=True)
    sys.stdout.write('psql output: %s\n' % out.strip())
    if prepare_only:
        # Indexes and the extra column are created once, on the empty table.
        id_name = tiger2010_id_name(yyyy)
        psql('CREATE INDEX ON {table_name} ({id_name});'.format(**locals()))
        psql('CREATE INDEX ON {table_name} USING GIST (geom);'.format(**locals()))
        psql("ALTER TABLE {table_name} ADD COLUMN dots text DEFAULT '';".format(**locals()))

In [8]:
# Warning: this regenerates the tables from scratch and is slow; don't rerun it if your tables are already populated
# TODO: check if the database is already there and populated and if so, skip the work...

executor = SimpleThreadPoolExecutor(max_workers=10)

for yyyy in [2000, 2010]:
    # Delete old table and set it up synchronously
    # (the first state's shapefile supplies the schema; no rows are loaded here)
    load_shapes_into_db(state_ids[0], yyyy, prepare_only=True)
    for state_id in state_ids:
        # Load states asynchronously in parallel
        executor.submit(load_shapes_into_db, state_id, yyyy)
        
# NOTE(review): presumably blocks until every queued load has finished -- confirm
executor.shutdown()

DROP TABLE tiger2010_census2000_blocks;
Finished execution in 169.008 secs: DROP TABLE
Creating tiger2010_census2000_blocks schema using state 01
psql output: addgeometrycolumn                              
-----------------------------------------------------------------------------
 public.tiger2010_census2000_blocks.geom SRID:4326 TYPE:MULTIPOLYGON DIMS:2 
(1 row)
CREATE INDEX ON tiger2010_census2000_blocks (blkidfp00);
Finished execution in 0.114668 secs: CREATE INDEX
CREATE INDEX ON tiger2010_census2000_blocks USING GIST (geom);
Finished execution in 0.144983 secs: CREATE INDEX
ALTER TABLE tiger2010_census2000_blocks ADD COLUMN dots text DEFAULT '';
Finished execution in 0.163762 secs: ALTER TABLE
Appending state 01 to tiger2010_census2000_blocks
Appending state 02 to tiger2010_census2000_blocks
Appending state 04 to tiger2010_census2000_blocks
Appending state 05 to tiger2010_census2000_blocks
Appending state 06 to tiger2010_census2000_blocks
Appending state 08 to tiger2010_census

tiger2010_census2000_blocks and tiger2010_census2010_blocks
----------------------------------

In [10]:
# Show the column layout of both block tables.
psql('\d tiger2010_census2000_blocks')
psql('\d tiger2010_census2010_blocks')

\d tiger2010_census2000_blocks
Finished execution in 0.155889 secs: Table "public.tiger2010_census2000_blocks"
   Column   |            Type             |                                 Modifiers                                 
------------+-----------------------------+---------------------------------------------------------------------------
 gid        | integer                     | not null default nextval('tiger2010_census2000_blocks_gid_seq'::regclass)
 statefp00  | character varying(2)        | 
 countyfp00 | character varying(3)        | 
 tractce00  | character varying(6)        | 
 blockce00  | character varying(4)        | 
 blkidfp00  | character varying(15)       | 
 name00     | character varying(10)       | 
 mtfcc00    | character varying(5)        | 
 ur00       | character varying(1)        | 
 uace00     | character varying(5)        | 
 funcstat00 | character varying(1)        | 
 aland00    | double precision            | 
 awater00   | double precision        

In [None]:
# Count the rows (one per census block) loaded into each table.
psql('SELECT COUNT(*) FROM tiger2010_census2000_blocks')
psql('SELECT COUNT(*) FROM tiger2010_census2010_blocks')

In [1]:
# Copy the 2010 block table from the timelapse database into the census2010 database.
!pg_dump -t tiger2010_census2010_blocks timelapse | psql census2010

SET
SET
SET
SET
SET
SET
SET
SET
SET
SET
CREATE TABLE
ALTER TABLE
CREATE SEQUENCE
ALTER TABLE
ALTER SEQUENCE
ALTER TABLE
COPY 11078297
  setval  
----------
 11078297
(1 row)

ALTER TABLE
CREATE INDEX
CREATE INDEX


In [14]:
# Sanity check: total area covered by each table's blocks, computed on the
# geography type (PostGIS reports geography areas in square meters).
# Slow: each query took roughly 17-21 minutes in the recorded run.
psql("SELECT SUM(ST_AREA(geography(geom))) FROM tiger2010_census2000_blocks")
psql("SELECT SUM(ST_AREA(geography(geom))) FROM tiger2010_census2010_blocks")

SELECT SUM(ST_AREA(geography(geom))) FROM tiger2010_census2000_blocks
Finished execution in 1058.71 secs: sum       
-----------------
 9833517583445.2
(1 row)
SELECT SUM(ST_AREA(geography(geom))) FROM tiger2010_census2010_blocks
Finished execution in 1256.57 secs: sum        
------------------
 9833517583446.43
(1 row)
