TODO:
    
Decide on how to create 2010 geom table from 2000.
- We can create new geoid indices on 2000 tables to create a massive join
- We can ingest entire 01-37 tables individually into RAM in python and interpolate in ram in place and write to
     new 2000 tables
- For either of the above we can decide to write all results into a single massive table, including new geoid column


In [63]:
import array, csv, json, math, multiprocessing, os, random, re, shutil
import shapely, shapely.wkb, struct, subprocess, sys, threading, urllib2

def exec_ipynb(filename_or_url):
    nb = (urllib2.urlopen(filename_or_url) if re.match(r'https?:', filename_or_url) else open(filename_or_url)).read()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if(jsonNb['nbformat'] == 4):
        exec '\n'.join([''.join(cell['source']) for cell in jsonNb['cells'] if cell['cell_type'] == 'code']) in globals()
    else:
        exec '\n'.join([''.join(cell['input']) for cell in jsonNb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']) in globals()

exec_ipynb('timelapse-utilities.ipynb')

set_default_psql_database('census2010')

set default_psql_database to census2010


Crosswalk:  download and document
---------------------------------

In [2]:
download_file('http://users.pop.umn.edu/~jps/NHGIS_block2000_to_block2010.zip', 'capture/NHGIS_block2000_to_block2010.zip')

capture/NHGIS_block2000_to_block2010.zip already downloaded


In [None]:
unzip_file('capture/NHGIS_block2000_to_block2010.zip')

In [None]:
!cat capture/NHGIS_block2000_to_block2010/readme.txt

In [4]:
!head capture/NHGIS_block2000_to_block2010/crosswalk_block2000_block2010_v002.csv

F010010201001000,F010010201002000,0.03589746619998647,0.008988216111782212
F010010201001000,F010010201002001,0.2533302636105429,0.2637247584498352
F010010201001000,F010010201002002,0,0.000384823707001651
F010010201001000,F010010201002003,0.0762971112558778,0.055429504205606606
F010010201001000,F010010201002004,0.03244112598630059,0.007542916917166581
F010010201001000,F010010201002005,0.41140573696128735,0.18976089744424943
F010010201001000,F010010201002006,0.019909297520890202,0.0019957785723459264
F010010201001000,F010010201002007,0.09924123825815177,0.45777176371172934
F010010201001000,F010010201002008,0.07147776020696295,0.014401340880283067
F010010201001001,F010010201002004,0.13459272780487186,0.04845679605124167


In [3]:
psql('SELECT * FROM crosswalk_block2000_block2010 '
    "WHERE geoid2010='010010201002007'")

SELECT * FROM crosswalk_block2000_block2010 WHERE geoid2010='010010201002007'
Finished execution in 0.185084 secs: geoid2000    |    geoid2010    |  weight   |  parea   
-----------------+-----------------+-----------+----------
 010010201001000 | 010010201002007 | 0.0992412 | 0.457772
(1 row)


Crosswalk columns are:
block_id_2000, block_id_2010, weight, parea
We'll tentatively ignore the last.

For a data column X, we want to compute:
x2k(block_id_2010) = sum(x2k(block_id_2000)*weight(block_id_2000, block_id_2010)) for all block_id_2000 overlapping block_id_2010



Create crosswalk_block2000_block2010 database table
------------------------------------

In [None]:
psql('CREATE TABLE crosswalk_block2000_block2010 '
    '(geoid2000 varchar, '
    ' geoid2010 varchar, '
    ' weight real, '
    ' parea real)')

In [None]:
full_path = os.path.abspath('capture/NHGIS_block2000_to_block2010/crosswalk_block2000_block2010_v002.csv')
psql("COPY crosswalk_block2000_block2010 "
     "FROM '%s' DELIMITER ',' CSV;" % full_path,
     database='census2010')

In [None]:
psql("UPDATE crosswalk_block2000_block2010 SET "
     "geoid2000 = right(geoid2000, 15), "
     "geoid2010 = right(geoid2010, 15)",
     database='census2010')

In [None]:
psql("CREATE INDEX ON crosswalk_block2000_block2010 (geoid2010)",
     database='census2010')

In [19]:
psql("CREATE INDEX ON crosswalk_block2000_block2010 (geoid2000)",
     database='census2010')

CREATE INDEX ON crosswalk_block2000_block2010 (geoid2000)
Finished execution in 179.913 secs: CREATE INDEX


In [None]:
psql('\d crosswalk_block2000_block2010', database='census2010')

Create new interpolated table
------------------------------

In [67]:
psql('DROP TABLE sf1_2000_int2010_p001')
psql('CREATE TABLE sf1_2000_int2010_p001 (geoid2010, p001001) AS '
     'SELECT geoid2010, SUM(weight * p001001) '
     'FROM crosswalk_block2000_block2010 '
     'JOIN sf1_2000_block_p001 USING (geoid2000) '
     'GROUP BY geoid2010 ',
     database='census2010')

DROP TABLE sf1_2000_int2010_p001
Finished execution in 0.272692 secs: DROP TABLE
CREATE TABLE sf1_2000_int2010_p001 (geoid2010, p001001) AS SELECT geoid2010, SUM(weight * p001001) FROM crosswalk_block2000_block2010 JOIN sf1_2000_block_p001 USING (geoid2000) GROUP BY geoid2010
Finished execution in 128.3 secs: SELECT 11078297


In [68]:
psql('\d sf1_2000_int2010_p001')

\d sf1_2000_int2010_p001
Finished execution in 0.172286 secs: Table "public.sf1_2000_int2010_p001"
  Column   |       Type        | Modifiers 
-----------+-------------------+-----------
 geoid2010 | character varying | 
 p001001   | double precision  |


Sanity-checking first interpolated table
----------------------------------------
The 2000 census includes Puerto Rico (FIPS 72), but the crosswalk doesn't, so checking that the overall populations match requires filtering out Puerto Rico from the original census count.

In [69]:
psql('select SUM(p001001) from sf1_2000_int2010_p001')
psql('select SUM(p001001) from sf1_2000_block_p001')
psql("select SUM(p001001) from sf1_2000_block_p001 WHERE LEFT(geoid2000,2) != '72'")

select SUM(p001001) from sf1_2000_int2010_p001
Finished execution in 2.11048 secs: sum       
-----------------
 281421906.05482
(1 row)
select SUM(p001001) from sf1_2000_block_p001
Finished execution in 1.32348 secs: sum    
-----------
 285230516
(1 row)
select SUM(p001001) from sf1_2000_block_p001 WHERE LEFT(geoid2000,2) != '72'
Finished execution in 2.78161 secs: sum    
-----------
 281421906
(1 row)


Add geoid index to geo2000 table
--------------------------------

In [None]:
#in hindsight, should have called this geoid2000
psql("ALTER TABLE geo2000 ADD COLUMN geoid CHARACTER(15)",
     database='census2010')

In [None]:
psql("UPDATE geo2000 SET geoid ="
     " (state || county || tract || block)",
     database='census2010')

In [None]:
psql("CREATE INDEX ON geo2000 (geoid)",
     database='census2010')

In [None]:
psql("SELECT geoid FROM geo2000 WHERE sumlev='101' LIMIT 10", database='census2010')

In [None]:
# Show some 2000 'geoids'
psql("SELECT state || county || tract || block FROM geo2000 WHERE sumlev='101' LIMIT 10", database='census2010')

In [None]:
File identification (FILEID),state/US abbreviation (STUSAB),summary levels (SUMLEV),and the
geographic component codes (GEOCOMP) are critical elements in identifying the geographic level
for each record. The STUSAB field identifies the highest level of geography for the file. In the case
of state file,it identifies the individual state. For SF 1 files,the following FILEID and STUSAB codes
are used:
SF 1 state and state equivalent files ‘uSF1’ ‘AL-WY’
SF 1 advance national file ‘uSF1A’ ‘US’
SF 1 final national file ‘uSF1F’ ‘US’

In [None]:
psql('\d sf1_2000_01', database='census2010')

In [None]:
psql('SELECT fileid,stusab,chariter,cifsn,logrecno FROM sf1_2000_01 LIMIT 10', database='census2010')

In [None]:
psql('\d sf1_2000_02', database='census2010')

In [None]:
psql('\d sf1_2000_p2', database='census2010')

In [None]:
psql('SELECT CONCAT(stusab, logrecno) FROM sf1_2000_01 LIMIT 10', database='census2010')

In [1]:
psql('\d')

NameError: name 'psql' is not defined

In [15]:
psql('\d sf1_2000_h12')

\d sf1_2000_h12
Finished execution in 0.122462 secs: View "public.sf1_2000_h12"
  Column  |         Type         | Modifiers 
----------+----------------------+-----------
 fileid   | character varying(6) | 
 stusab   | character varying(2) | 
 chariter | character varying(3) | 
 cifsn    | character varying(2) | 
 logrecno | character varying(7) | 
 h012001  | double precision     | 
 h012002  | double precision     | 
 h012003  | double precision     |


In [16]:
psql('SELECT h012* FROM sf1_2000_h12 LIMIT 10')

SELECT h012* FROM sf1_2000_h12 LIMIT 10
Finished execution in 0.109905 secs: ERROR:  syntax error at or near "FROM"
LINE 1: SELECT h012* FROM sf1_2000_h12 LIMIT 10
                     ^


In [18]:
query_psql('\d sf1_2000_h12')

ProgrammingError: syntax error at or near "\"
LINE 1: \d sf1_2000_h12
        ^


Execution of SELECT table_name FROM INFORMATION_SCHEMA.views WHERE table_name LIKE 'sf1_2000_%'
took 0.00853682 seconds and returned 286 rows
\d sf1_2000_block_p001
Finished execution in 0.116253 secs: Table "public.sf1_2000_block_p001"
  Column   |  Type   | Modifiers 
-----------+---------+-----------
 geoid2000 | text    | 
 p001001   | integer |
None
select * from sf1_2000_block_p001 LIMIT 10
Finished execution in 0.108216 secs: geoid2000    | p001001 
-----------------+---------
 020130001001052 |     713
 010010211002045 |     115
 050019808001184 |       0
 040019442001073 |      64
 060014271001000 |      27
 080010085121012 |       0
 090010000000992 |       0
 110010001001000 |     279
 100010422021006 |      39
 120010002001000 |      35
(10 rows)
None
select sum(p001001) from sf1_2000_block_p001
Finished execution in 1.37132 secs: sum    
-----------
 285230516
(1 row)
None


In [59]:
print psql('\d sf1_2000_block_p001')
psql('select * from sf1_2000_block_p001')
psql('select * from sf1_2000_p1 LIMIT 10')

\d sf1_2000_block_p001
Finished execution in 0.124137 secs: Table "public.sf1_2000_block_p001"
  Column   |  Type   | Modifiers 
-----------+---------+-----------
 geoid2000 | text    | 
 p001001   | integer |
None
select * from sf1_2000_block_p001
Finished execution in 0.108624 secs: geoid2000    | p001001 
-----------------+---------
 020130001001052 |     713
 020130001001985 |       0
 020130001001004 |       0
 020130001001005 |       0
 020130001001006 |       1
 020130001001007 |       0
 020130001001008 |       0
 020130001001009 |       0
 020130001001010 |       0
 020130001001011 |       0
(10 rows)
select * from sf1_2000_p1 LIMIT 10
Finished execution in 0.109913 secs: fileid | stusab | chariter | cifsn | logrecno | p001001 
--------+--------+----------+-------+----------+---------
 uSF1   | MT     | 000      | 01    | 0002049  |       0
 uSF1   | MT     | 000      | 01    | 0002050  |       0
 uSF1   | MT     | 000      | 01    | 0002051  |       0
 uSF1   | MT     | 000  

In [26]:
query_psql("select table_name from INFORMATION_SCHEMA.views "
     "WHERE table_name LIKE 'sf1_2000_%'")

Execution of select table_name from INFORMATION_SCHEMA.views WHERE table_name LIKE 'sf1_2000_%'
took 0.00861883 seconds and returned 286 rows


[('sf1_2000_p1',),
 ('sf1_2000_p2',),
 ('sf1_2000_p3',),
 ('sf1_2000_p4',),
 ('sf1_2000_p5',),
 ('sf1_2000_p6',),
 ('sf1_2000_p7',),
 ('sf1_2000_p8',),
 ('sf1_2000_p9',),
 ('sf1_2000_p10',),
 ('sf1_2000_p11',),
 ('sf1_2000_p12',),
 ('sf1_2000_p13',),
 ('sf1_2000_p14',),
 ('sf1_2000_p15',),
 ('sf1_2000_p16',),
 ('sf1_2000_p17',),
 ('sf1_2000_p18',),
 ('sf1_2000_p19',),
 ('sf1_2000_p20',),
 ('sf1_2000_p21',),
 ('sf1_2000_p22',),
 ('sf1_2000_p23',),
 ('sf1_2000_p24',),
 ('sf1_2000_p25',),
 ('sf1_2000_p26',),
 ('sf1_2000_p27',),
 ('sf1_2000_p28',),
 ('sf1_2000_p29',),
 ('sf1_2000_p30',),
 ('sf1_2000_p31',),
 ('sf1_2000_p32',),
 ('sf1_2000_p33',),
 ('sf1_2000_p34',),
 ('sf1_2000_p35',),
 ('sf1_2000_p36',),
 ('sf1_2000_p37',),
 ('sf1_2000_p38',),
 ('sf1_2000_p39',),
 ('sf1_2000_p40',),
 ('sf1_2000_p41',),
 ('sf1_2000_p42',),
 ('sf1_2000_p43',),
 ('sf1_2000_p44',),
 ('sf1_2000_p45',),
 ('sf1_2000_p12a',),
 ('sf1_2000_p12b',),
 ('sf1_2000_p12c',),
 ('sf1_2000_p12d',),
 ('sf1_2000_p12e',),
 ('s