In [11]:
import glob
import json
import os
import pprint
import re
from collections import OrderedDict
pp = pprint.PrettyPrinter(indent=4)

    wget https://data2.nhgis.org/extracts/97092/13/nhgis0013_csv.zip
    wget https://data2.nhgis.org/extracts/97092/12/nhgis0012_csv.zip

    convert NHGIS code to Source code
    lowercase letters
    leading zeros for a 3-digit table number

    P2->p002
    P12->p012

    We want something like the following, with approximately 9K rows for 2010:

    [
    ...all the earlier tables...
     ["p012001", ["Sex by Age", "Total"]],
     ["p012002", ["Sex by Age", "Male"]],
     ["p012003", ["Sex by Age", "Male", "Under 5 years"]]
    ...all the later tables...
    ]


In [2]:
!wget https://data2.nhgis.org/extracts/97092/12/nhgis0012_csv.zip
!wget https://data2.nhgis.org/extracts/97092/13/nhgis0013_csv.zip
!wget https://data2.nhgis.org/extracts/97092/14/nhgis0014_csv.zip

--2017-04-17 16:09:48--  https://data2.nhgis.org/extracts/97092/14/nhgis0014_csv.zip
Resolving data2.nhgis.org... 128.101.163.216
Connecting to data2.nhgis.org|128.101.163.216|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1740499 (1.7M) [application/zip]
Saving to: 'nhgis0014_csv.zip'


2017-04-17 16:09:49 (3.93 MB/s) - 'nhgis0014_csv.zip' saved [1740499/1740499]



In [3]:
!unzip nhgis0012_csv.zip
!unzip nhgis0013_csv.zip
!unzip nhgis0014_csv.zip

Archive:  nhgis0014_csv.zip
  inflating: nhgis0014_csv/nhgis0014_ds120_1990_block_codebook.txt  
  inflating: nhgis0014_csv/nhgis0014_ds120_1990_block.csv  


In [185]:
#!cat nhgis0014_csv/nhgis0014_ds120_1990_block_codebook.txt

In [72]:
#!cat nhgis0012_csv/nhgis0012_ds147_2000_block_codebook.txt

In [50]:
#!cat nhgis0013_csv/nhgis0013_ds172_2010_block_codebook.txt

In [15]:
# Lookup tables shared by the rest of the notebook. extractColumnDescriptors()
# only ever adds to them, so calling it once per dataset accumulates all datasets here.
columnDescriptors = OrderedDict()            # cbColId -> list of descriptor strings
nhgisColId_to_cbColId_map = OrderedDict()    # '<dataset>.<NHGIS column id>' -> cbColId
cbColId_to_nhgisColId_map = OrderedDict()    # cbColId -> '<dataset>.<NHGIS column id>'
cbColId_to_descriptors_map = OrderedDict()   # cbColId -> list of descriptor strings

def extractColumnDescriptors(dataset, codebook):
    """Parse one NHGIS codebook text file and register its columns in the global maps.

    A table block in the codebook is a run of "Table N:", "Universe:",
    "Source code:" and "NHGIS code:" header lines followed by one or more
    "<column id>: <description>" lines. For every column of every table this
    function builds:

    * cbColId -- '<dataset>.' followed by the upper-cased concatenation of
        - the character directly before the digits of the table's Source code,
        - the Source-code number zero-padded to 3 digits,
        - whatever follows that number ('0' when nothing does),
        - the column number (the NHGIS column id with its first 3 characters
          dropped) zero-padded to 3 digits.
    * descs -- ['Dataset: ...', 'Universe: ...', 'Table N: ...',
      zero or more 'Grouping: ...', '<column name>'], where the groupings and
      the column name come from splitting the description on ' >> '.

    Args:
        dataset:  name used as the prefix of every generated id.
        codebook: path of the codebook text file to read.

    Returns:
        None. The four module-level maps are updated in place and a one-line
        summary of how many columns were added is printed.

    Raises:
        ValueError: if a table's Source code contains no "<char><digits>" part,
            or if an NHGIS column id does not end in a number.
    """
    # Read the whole file and release the handle immediately.
    with open(codebook) as f:
        body = f.read()
    tablePattern = re.compile(r'''
 *Table (\d+): +(.*)
 *Universe: +(.*)
 *Source code: +(.*)
 *NHGIS code: +(.*)
(( *\w+: +.*\n)+)''', re.M)
    columnPattern = re.compile(r'^ *(\w+): +(.*)$', re.M)
    sourceCodePattern = re.compile(r'^.*?(.)(\d+)(.*?)$')

    oldNhgisCount = len(nhgisColId_to_cbColId_map)
    oldCbCount = len(cbColId_to_nhgisColId_map)
    for table in tablePattern.findall(body):
        # Groups 1-6 are the header fields and the block of column lines;
        # the 7th group is only the last column line and is not needed.
        (tableId, tableName, universeName, sourceCode, _nhgisCode, columns) = [s.strip() for s in table[:6]]

        # The table-level part of the id depends only on the Source code,
        # so derive it once per table instead of once per column.
        m = sourceCodePattern.match(sourceCode)
        if m is None:
            raise ValueError('Cannot parse Source code %r of table %s in %s'
                             % (sourceCode, tableId, codebook))
        (a, b, c) = m.groups()
        tablePrefix = a + '{:03d}'.format(int(b)) + (c or '0')
        tableDescs = ['Dataset: %s' % dataset,
                      'Universe: %s' % universeName,
                      'Table %s: %s' % (tableId, tableName)]

        for (nhgisColId, colDesc) in columnPattern.findall(columns):
            # "A >> B >> Name" -> groupings A, B and the column name itself.
            parts = colDesc.strip().split(' >> ')
            descs = tableDescs + ['Grouping: %s' % g for g in parts[:-1]] + [parts[-1]]

            # Assumes NHGIS column ids are a 3-character prefix followed by the
            # column number -- TODO confirm for codebooks other than the ones used here.
            colNumber = '{:03d}'.format(int(nhgisColId[3:]))
            cbColId = dataset + '.' + (tablePrefix + colNumber).upper()
            nhgisColId = dataset + '.' + nhgisColId

            # cbColId already carries the dataset prefix; the original prepended it a
            # second time here, giving columnDescriptors keys unlike every other map.
            columnDescriptors[cbColId] = descs
            nhgisColId_to_cbColId_map[nhgisColId] = cbColId
            cbColId_to_nhgisColId_map[cbColId] = nhgisColId
            cbColId_to_descriptors_map[cbColId] = descs

    # The two counts differ when several NHGIS columns collapse onto one cbColId.
    nhgisCount = len(nhgisColId_to_cbColId_map) - oldNhgisCount
    cbCount = len(cbColId_to_nhgisColId_map) - oldCbCount
    print('Found %4d (%4d) columns in dataset %s' % (nhgisCount, cbCount, dataset))

# Parse the three codebooks; each call adds its columns to the shared lookup tables.
extractColumnDescriptors('census1990_block2010', 'nhgis0014_csv/nhgis0014_ds120_1990_block_codebook.txt')
extractColumnDescriptors('census2000_block2010', 'nhgis0012_csv/nhgis0012_ds147_2000_block_codebook.txt')
extractColumnDescriptors('census2010_block2010', 'nhgis0013_csv/nhgis0013_ds172_2010_block_codebook.txt')

# Persist the NHGIS-id -> cbColId mapping; the with-block closes the file once written.
# NOTE(review): capture/NHGIS_1990/ is not created anywhere in the visible notebook, and
# the map holds all three datasets although the path says 1990 -- confirm both are intended.
with open('capture/NHGIS_1990/columnMap.json','w') as f:
    f.write(json.dumps(nhgisColId_to_cbColId_map))
print(len(columnDescriptors))
#pp.pprint(columnDescriptors)

Found  982 ( 982) columns in dataset census1990_block2010
982


In [4]:
def columnDescriptorString(cbColId):
    """Return one display line for a column: its id, then its descriptors joined by ' >> '.

    The descriptors are looked up in the global cbColId_to_descriptors_map;
    an unknown cbColId raises KeyError.
    """
    descriptorPath = ' >> '.join(cbColId_to_descriptors_map[cbColId])
    return '%s: %s' % (cbColId, descriptorPath)

# Write a flat HTML listing with one "<cbColId>: <descriptor path>" line per column.
# The notebook only creates assets/ in a later cell, so make sure it exists here;
# otherwise a top-to-bottom run on a fresh checkout fails on the open() below.
if not os.path.isdir('assets'):
    os.makedirs('assets')
with open('assets/allById.html','w') as html:
    html.write('<html><head></head><body>\n')
    for cbColId in cbColId_to_descriptors_map:
        html.write(columnDescriptorString(cbColId) + '<br>\n')
    html.write('</body></html>')

In [5]:
# Write every cbColId, one per line, in insertion order.
# The loop variable is deliberately not called `id`: at notebook top level that
# name would replace the builtin id() for every later cell.
with open('assets/allColIds','w') as f:
    for cbColId in cbColId_to_nhgisColId_map:
        f.write(cbColId + '\n')

In [6]:
#for id in cbColId_to_nhgisColId_map.iterkeys(): print id

In [7]:
def insertColumnDescription(hdict, cbColId, descs):
    """Insert one column into the nested descriptor hierarchy ``hdict``.

    ``descs`` is the column's descriptor path. Every element except the last
    names a grouping level, stored as a nested OrderedDict keyed by the
    descriptor text; the last element names the column and is stored as a
    leaf whose value is ``cbColId``.

    Name collisions are reported with a printed warning and the new column is
    skipped, leaving the existing entry untouched:
      * a column whose name is already used by another column,
      * a column whose name is already used by a grouping,
      * a grouping whose name is already used by a column.

    Args:
        hdict:   dict-like hierarchy level to insert into (modified in place).
        cbColId: id stored at the leaf.
        descs:   non-empty list of descriptor strings.

    Returns:
        None.
    """
    assert(len(descs) > 0)
    d = descs[0]

    if len(descs) == 1:
        # Leaf level: d is the column name.
        if d not in hdict:
            hdict[d] = cbColId
            return
        existing = hdict[d]
        # Groupings are the only dict values this function stores; anything else is a leaf id.
        if isinstance(existing, dict):
            print("\nWARNING -- Column has name collision with grouping:")
            print(columnDescriptorString(cbColId))
            print(existing)
        else:
            print("\nWARNING -- Column has name collision with column:")
            print(columnDescriptorString(cbColId))
            print(columnDescriptorString(existing))
        return

    # Grouping level: descend into (or create) the sub-hierarchy named d.
    if d not in hdict:
        hdict[d] = OrderedDict()
    child = hdict[d]
    if not isinstance(child, dict):
        # A column already owns this name. The original referenced an undefined
        # variable here and died with NameError instead of printing the warning.
        prev_cbColId = child
        print("\nWARNING -- Grouping has name collision with column:")
        print(columnDescriptorString(cbColId))
        print(columnDescriptorString(prev_cbColId))
        return
    insertColumnDescription(child, cbColId, descs[1:])
# Fold every flat descriptor path into a single nested tree, then show its
# top-level keys (one 'Dataset: ...' entry per dataset).
columnHierarchy = OrderedDict()
for cbColId in cbColId_to_descriptors_map:
    descs = cbColId_to_descriptors_map[cbColId]
    insertColumnDescription(columnHierarchy, cbColId, descs)

pp.pprint(list(columnHierarchy))
#pp.pprint(columnHierarchy['Dataset: census2000']['Universe: Persons'])
#pp.pprint(columnHierarchy)

[   'Dataset: census1990_block2010',
    'Dataset: census2000_block2010',
    'Dataset: census2010_block2010']


In [8]:
def ordereddict_to_array(x):
    """Recursively convert an OrderedDict tree into nested lists of (key, value) pairs.

    Only values whose type is exactly OrderedDict are converted; anything else
    (leaf strings, plain dicts, ...) is returned unchanged. Pair order follows
    the OrderedDict's own order.
    """
    if type(x) is not OrderedDict:
        return x
    pairs = []
    for key, value in x.items():
        pairs.append((key, ordereddict_to_array(value)))
    return pairs
#pp.pprint(ordereddict_to_array(columnHierarchy))

!mkdir -p assets
open('assets/all.json','w').write(json.dumps(ordereddict_to_array(columnHierarchy)))

In [9]:
# For each census year write two files:
#   assets/census<year>_block2010.json -- that dataset's sub-tree of the hierarchy
#   assets/<year>ById.json             -- flat [cbColId, "descriptor >> path"] pairs
for y in (1990, 2000, 2010):
    ds = 'census%d_block2010' % y
    with open('assets/%s.json' % ds, 'w') as f:
        f.write(json.dumps(ordereddict_to_array(columnHierarchy['Dataset: %s' % ds])))

    # descs[1:] drops the leading 'Dataset: ...' entry, which the file name already implies.
    # The id variable is not named `id`, so the builtin id() stays usable in later cells.
    byId = [(cbColId, ' >> '.join(descs[1:]))
            for (cbColId, descs) in cbColId_to_descriptors_map.items()
            if cbColId.startswith('census%d' % y)]
    with open('assets/%dById.json' % y, 'w') as f:
        f.write(json.dumps(byId))

In [1]:
# List the JSON files under assets/ with their sizes, as a quick check of what was written.
!ls -l assets/*.json

-rw-rw-r-- 1 rsargent rsargent 152109 Apr 26 16:02 assets/1990ById.json
-rw-rw-r-- 1 rsargent rsargent 648462 Apr 26 16:02 assets/2000ById.json
-rw-rw-r-- 1 rsargent rsargent 754392 Apr 26 16:02 assets/2010ById.json
-rw-rw-r-- 1 rsargent rsargent 648432 Apr 26 16:02 assets/all.json
-rw-rw-r-- 1 rsargent rsargent  68993 Apr 26 16:02 assets/census1990_block2010.json
-rw-rw-r-- 1 rsargent rsargent 248631 Apr 26 16:02 assets/census2000_block2010.json
-rw-rw-r-- 1 rsargent rsargent 330697 Apr 26 16:02 assets/census2010_block2010.json


In [1]:
# Show the kernel's working directory; the relative paths used in this notebook
# (assets/, columncache/, nhgisNNNN_csv/) resolve against it.
!pwd

/mnt/ssd/rsargent/projects/dotmaps/server/data-visualization-tools/examples/lodes


In [17]:
# Per year, print three counts side by side:
#   entries in <year>ById.json, top-level entries in the per-dataset hierarchy file,
#   and files matching [A-Z]* in that dataset's columncache/ directory.
for year in [1990, 2000, 2010]:
    # Open each file in a with-block so the handle is closed after loading.
    with open('assets/%dById.json' % year) as f:
        byIdCount = len(json.load(f))
    with open('assets/census%d_block2010.json' % year) as f:
        topLevelCount = len(json.load(f))
    cachedCount = len(glob.glob('columncache/census%d_block2010/[A-Z]*' % year))
    print('%d %d %d' % (byIdCount, topLevelCount, cachedCount))

982 23 677
2881 137 3044
3346 104 3346


In [None]:
# Scratch cell: json.load() requires a file object and raises TypeError when called
# with no arguments, so the call is disabled to keep "Run All" from stopping here.
#json.load()