In [1]:
# Wide display
from IPython.core.display import display, HTML
display(HTML("<style>#notebook-container { margin-left:-14px; width:calc(100% + 27px) !important; }</style>"))

In [2]:
import csv, json, os, math, numbers, pandas, re, scipy, scipy.sparse, shutil
import subprocess, sys, threading, time, urllib2

def exec_ipynb(filename_or_url):
    nb = (urllib2.urlopen(filename_or_url) if re.match(r'https?:', filename_or_url) else open(filename_or_url)).read()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if(jsonNb['nbformat'] == 4):
        exec '\n'.join([''.join(cell['source']) for cell in jsonNb['cells'] if cell['cell_type'] == 'code']) in globals()
    else:
        exec '\n'.join([''.join(cell['input']) for cell in jsonNb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']) in globals()

exec_ipynb('timelapse-utilities.ipynb')

In [3]:
pandas.options.display.max_colwidth = 300

## Download File Templates for 5-year data

5-year data is a 5-year average, ending in the named year.
So the recently released ACS2016-5year actually is from 2012-2016

In [4]:
#src = 'https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/2015_1yr_Summary_FileTemplates.zip'
#dest = 'capture/ACS2015_1year/2015_1yr_Summary_FileTemplates.zip'
#download_file(src, dest)
#templates = unzip_file(dest)

def download_file_templates(year):
    src = 'https://www2.census.gov/programs-surveys/acs/summary_file/{year}/data/{year}_5yr_Summary_FileTemplates.zip'.format(**locals())

    # Special-case 2010
    src = src.replace('2010_5yr_Summary_File', '2010_5yr_SummaryFile')
    
    dest = 'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates.zip'.format(**locals())
    download_file(src, dest)
    templates = unzip_file(dest)
    
for year in range(2009, 2017):
    download_file_templates(year)

capture/ACS2009_5year/2009_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2009_5year/2009_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2010_5year/2010_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2010_5year/2010_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2011_5year/2011_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2011_5year/2011_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2012_5year/2012_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2012_5year/2012_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2013_5year/2013_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2013_5year/2013_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2014_5year/2014_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2014_5year/2014_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2015_5year/20

In [None]:
!ls -l capture/ACS2015_1year/2015_1yr_Summary_FileTemplates/Templates | head

## Download ACS2015 5-year data (tract and block group)

In [4]:
process_year=2009

In [5]:
def download_data(year):
    filename = 'Tracts_Block_Groups_Only'
    if year < 2011:
        filename += '.zip'
    else:
        filename += '.tar.gz'
    src = 'https://www2.census.gov/programs-surveys/acs/summary_file/{year}/data/5_year_entire_sf/{filename}'.format(**locals())
    dest = 'capture/ACS{year}_5year/{filename}'.format(**locals())

    if os.path.exists(dest):
        print '{dest} already exists, skipping'.format(**locals())
    else:
        try:
            os.unlink(filename)
        except OSError:
            pass
        cmd = '/usr/bin/curl'
        cmd += " -H 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'"
        cmd += ' {src}'.format(**locals())
        cmd += ' >{dest}'.format(**locals())
        try:
            os.makedirs(os.path.dirname(dest))
        except OSError:
            pass
        print cmd
        subprocess_check(cmd)
        print 'Downloaded to {dest}'.format(**locals())

for year in range(2009, 2017):
    download_data(year)

capture/ACS2009_5year/Tracts_Block_Groups_Only.zip already exists, skipping
capture/ACS2010_5year/Tracts_Block_Groups_Only.zip already exists, skipping
capture/ACS2011_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2012_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2013_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2014_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2015_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2016_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping


In [6]:
!ls -l capture/ACS*/Tracts*

-rw-rw-r-- 1 rsargent rsargent 2806502508 Oct  5 07:52 capture/ACS2009_5year/Tracts_Block_Groups_Only.zip
-rw-rw-r-- 1 rsargent rsargent 3369803296 Oct  5 07:59 capture/ACS2010_5year/Tracts_Block_Groups_Only.zip
-rw-rw-r-- 1 rsargent rsargent 3297054880 Oct  5 08:12 capture/ACS2011_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3651813394 Oct  5 07:33 capture/ACS2012_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3769295680 Oct  5 07:45 capture/ACS2013_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3757945352 Oct  5 07:59 capture/ACS2014_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3747109902 Dec  2  2016 capture/ACS2015_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3780352044 Feb 14 15:00 capture/ACS2016_5year/Tracts_Block_Groups_Only.tar.gz


In [None]:
# !mkdir -p capture/ACS2005_5year
# !mv  capture/ACS2005_5year
#
# !cd capture/ACS2005_5year; tar xvfz Tracts_Block_Groups_Only.tar.gz >/dev/null
#
# !wget --header="User-Agent: Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/5_year_entire_sf/2015_ACS_Geography_Files.zip
#
# !mv 2015_ACS_Geography_Files.zip capture/ACS2005_5year
# 
# unzip_file('capture/ACS2005_5year/2015_ACS_Geography_Files.zip')

In [5]:
def download_geography_data(year, force_regenerate=False):
    fname = ("{year}_ACS_Geography_Files.zip").format(**locals())
    cdir = ("capture/ACS{year}_5year").format(**locals())
    fpath = ("{cdir}/{fname}").format(**locals())
    
    if os.path.exists(fpath) and not force_regenerate:
        print '{fpath} already exists, skipping'.format(**locals())
        return
    
    url_template = "https://www2.census.gov/programs-surveys/acs/summary_file/{year}/data/5_year_entire_sf/{fname}"
    url = url_template.format(**locals())
    !wget --header="User-Agent: Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" $url
    
    !mv $fname $cdir
    unzip_file(fpath)
    print "Downloaded %s to %s" % (fname,fpath)

In [6]:
download_geography_data(process_year)

--2018-02-28 22:16:08--  https://www2.census.gov/programs-surveys/acs/summary_file/2009/data/5_year_entire_sf/2009_ACS_Geography_Files.zip
Resolving www2.census.gov (www2.census.gov)... 23.36.91.141, 2600:1408:7:291::208c, 2600:1408:7:2a5::208c
Connecting to www2.census.gov (www2.census.gov)|23.36.91.141|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2018-02-28 22:16:09 ERROR 404: Not Found.

mv: cannot stat '2009_ACS_Geography_Files.zip': No such file or directory
Unzipping capture/ACS2009_5year/2009_ACS_Geography_Files.zip into capture/ACS2009_5year/2009_ACS_Geography_Files.tmp


Exception: Call to subprocess_check failed with return code 9
Standard error:
unzip:  cannot find or open capture/ACS2009_5year/2009_ACS_Geography_Files.zip, capture/ACS2009_5year/2009_ACS_Geography_Files.zip.zip or capture/ACS2009_5year/2009_ACS_Geography_Files.zip.ZIP.
Standard out:


## Read CSV utility functions

In [38]:
def read_acs_5year_template(year, seqno):
    for template in ['capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/{year}_5yr_Templates/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/seq/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/templates/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/Seq%04d.xls'%(seqno)]:
        path = template.format(**locals())
        #print 'Checking for {path}'.format(**locals())
        if os.path.exists(path):
            return pandas.read_excel(path)
    #print 'yo could not find {year}:{seqno}'.format(**locals())
    return None

def find_acs_5year_data(year, state, seqno):
    fname = 'e%d5%s%04d000.txt' % (year, state, seqno)
    for template in ['capture/ACS{year}_5year/group2/{fname}',
                     'capture/ACS{year}_5year/data/tab4/sumfile/prod/2012thru2016/group2/{fname}',
                     'capture/ACS{year}_5year/tab4/sumfile/prod/2010thru2014/group2/{fname}',
                     'capture/ACS{year}_5year/tab4/sumfile/prod/2008thru2012/group2/{fname}',
                     'capture/ACS{year}_5year/tab4/sumfile/prod/2006thru2010/group2/{fname}']:
        path = template.format(**locals())
        #print 'Checking for {path}'.format(**locals())
        if os.path.exists(path):
            return path
    print 'Could not find {year}:{seqno} file {fname}'.format(**locals())
    return None

# Combine template header and data into pandas frame
def read_acs_5year_data(year, state, seqno):
    header = read_acs_5year_template(year, seqno)
    data_fname = find_acs_5year_data(year, state, seqno)
    if not data_fname:
        return None
    else:
        data = pandas.read_csv(data_fname,
                               index_col=False,
                               dtype={'FILEID':numpy.str,
                                      'FILETYPE':numpy.str,
                                      'STUSAB':numpy.str,
                                      'CHARITER':numpy.str,
                                      'SEQUENCE':numpy.str,
                                      'LOGRECNO':numpy.str},
                               header=None,
                               names=header.columns.values)
        return data

In [34]:
read_acs_5year_template(process_year, 1)

Unnamed: 0,FILEID,FILETYPE,STUSAB,CHARITER,SEQUENCE,LOGRECNO,B07401_001,B07401_002,B07401_003,B07401_004,...,B07409_021,B07409_022,B07409_023,B07409_024,B07409_025,B07409_026,B07409_027,B07409_028,B07409_029,B07409_030
0,FILEID,FILETYPE,STUSAB,CHARITER,SEQUENCE,LOGRECNO,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY AGE FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 1 year and over in the United States% Total living in area 1 year ago:,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY AGE FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 1 year and over in the United States% 1 to 4 years,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY AGE FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 1 year and over in the United States% 5 to 17 years,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY AGE FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 1 year and over in the United States% 18 and 19 years,...,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different county within same state:% High school graduate (includes equivalency),GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different county within same state:% Some college or associate's degree,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different county within same state:% Bachelor's degree,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different county within same state:% Graduate or professional degree,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different state:,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different state:% Less than high school graduate,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different state:% High school graduate (includes equivalency),GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different state:% Some college or associate's degree,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different state:% Bachelor's degree,GEOGRAPHICAL MOBILITY IN THE PAST YEAR BY EDUCATIONAL ATTAINMENT FOR RESIDENCE 1 YEAR AGO IN THE UNITED STATES for Population 25 years and over in the United States% Moved to different state:% Graduate or professional degree


In [37]:
read_acs_5year_data(process_year,'pa', 1)

Checking for capture/ACS2010_5year/group2/e20105pa0001000.txt
Checking for capture/ACS2010_5year/data/tab4/sumfile/prod/2012thru2016/group2/e20105pa0001000.txt
Checking for capture/ACS2010_5year/tab4/sumfile/prod/2010thru2014/group2/e20105pa0001000.txt
Checking for capture/ACS2010_5year/tab4/sumfile/prod/2008thru2012/group2/e20105pa0001000.txt
Checking for capture/ACS2010_5year/tab4/sumfile/prod/2006thru2010/group2/e20105pa0001000.txt


Unnamed: 0,FILEID,FILETYPE,STUSAB,CHARITER,SEQUENCE,LOGRECNO,B07401_001,B07401_002,B07401_003,B07401_004,...,B07409_021,B07409_022,B07409_023,B07409_024,B07409_025,B07409_026,B07409_027,B07409_028,B07409_029,B07409_030
0,ACSSF,2010e5,pa,000,0001,0011708,,,,,...,,,,,,,,,,
1,ACSSF,2010e5,pa,000,0001,0011709,,,,,...,,,,,,,,,,
2,ACSSF,2010e5,pa,000,0001,0011710,,,,,...,,,,,,,,,,
3,ACSSF,2010e5,pa,000,0001,0011711,,,,,...,,,,,,,,,,
4,ACSSF,2010e5,pa,000,0001,0011712,,,,,...,,,,,,,,,,
5,ACSSF,2010e5,pa,000,0001,0011713,,,,,...,,,,,,,,,,
6,ACSSF,2010e5,pa,000,0001,0011714,,,,,...,,,,,,,,,,
7,ACSSF,2010e5,pa,000,0001,0011715,,,,,...,,,,,,,,,,
8,ACSSF,2010e5,pa,000,0001,0011716,,,,,...,,,,,,,,,,
9,ACSSF,2010e5,pa,000,0001,0011717,,,,,...,,,,,,,,,,


## Write ACSYYYY 5-year description.html

In [20]:
# Check if dataset is already defined.  If not, define it as a map, otherwise, leave it alone
try:
  dataset
except NameError:
  dataset = {}

column_dir = 'columncache'

def write_acs_5year_description(year, force_regenerate=False):
    dataset[year] = 'acs{year}_5year_tract2010'.format(**locals())
    description_path = column_dir + '/' + dataset[year] + '/description.html'

    if os.path.exists(description_path) and not force_regenerate:
        print '{description_path} already exists, skipping'.format(**locals())
        return

    table_rows = []

    for seqno in range(1, 1000):
        template = read_acs_5year_template(year, seqno)
        if template is None:
            break
        for col in range(6, template.shape[1]):
            colname = template.columns.values[col]
            description = template.iloc[0,col]
            try:
                description = description.replace(':', '')
                description = re.sub(r'\s*%\s*', ' &mdash; ', description)
            except:
                print "%d:%d col %d description = '%s', using '%s' instead" % (year, seqno, col, description,colname)
                description = colname
            # format can't handle array reference, so put dataset[year] in a flat variable for the format to work
            dataset_var = dataset[year]
            table_rows.append(u'<tr><td>{dataset_var}.{colname}</td><td>{description}</td></tr>\n'.format(**locals()))

    html = '<table>' + ''.join(table_rows) + '</table>'

    try:
        os.makedirs(os.path.dirname(description_path))
    except:
        pass
    open(description_path, 'w').write(html.encode('utf8'))
    print 'Wrote %d column names and descriptions to %s' % (len(table_rows), description_path)
    print 'Check it out at http://dotmaptiles.createlab.org/data/acs{year}_5year_tract2010'.format(**locals())
    

In [39]:
write_acs_5year_description(process_year)

Wrote 21487 column names and descriptions to columncache/acs2010_5year_tract2010/description.html
Check it out at http://dotmaptiles.createlab.org/data/acs2010_5year_tract2010


## Create ACS2015 block-level population

### Read 2010 block geoids and 2010 block populations

In [22]:
block_populations = numpy.load('columncache/census2010_block2010/p001001.numpy')
print 'block_populations has', sum(block_populations), 'total people'

block_populations has 308745538 total people


In [23]:
# block_geoids_2010 = [row[0] for row in query_psql("SELECT geoid2010 FROM sf1_2010_block_p001 order by blockidx2010")]
block_geoids_2010 = json.load(open('block_geoids_2010.json'))
print 'There are', len(block_geoids_2010), 'blocks'

assert(len(block_geoids_2010) + 1 == len(block_populations))

There are 11078297 blocks


### Compute 2010 population by tract and block indices from tract


In [24]:
tract_populations = {}
tract_block_indexes = {}

for block_index_minus_one, block_geoid in enumerate(block_geoids_2010):
    block_index = block_index_minus_one + 1
    tract_name = block_geoid[0:11] # SSCCCTTTTTT
    if tract_name not in tract_populations:
        tract_populations[tract_name] = 0
        tract_block_indexes[tract_name] = []
    tract_populations[tract_name] += block_populations[block_index]
    tract_block_indexes[tract_name].append(block_index)

print 'There are', len(tract_populations), 'tracts'
print 'tract_populations has', sum(tract_populations.values()), 'people'

There are 73057 tracts
tract_populations has 308745538 people


### Map tract identifiers to LOGRECNO using geography file

In [27]:
tract_to_logrecno_year=None
tract_to_logrecno = {}

def compute_tract_to_logrecno(state, year):
    global tract_to_logrecno_year
    tract_to_logrecno_year=year
    for template in ["capture/ACS{year}_5year/{year}_ACS_Geography_Files/g{year}5{state}.csv",
                     "capture/ACS{year}_5year/{year}_ACS_Geography_Files/geo/g{year}5{state}.csv",
                     "capture/ACS{year}_5year/{year}_ACS_Geography_Files/tab4/sumfile/prod/2009thru2013/geo/g{year}5{state}.csv",
                     "capture/ACS{year}_5year/{year}_ACS_Geography_Files/geog/g{year}5{state}.csv"]:
        csv_path = template.format(**locals())
        if os.path.exists(csv_path):
            geography = pandas.read_csv(csv_path,
                                        dtype=numpy.str,
                                        index_col=False,
                                        header=None,
                                        keep_default_na=False,
                                        na_values=[])

            nrows = geography.shape[0]
            print 'State {state} has {nrows} geography rows'.format(**locals())
    
            ntracts = 0
            tract_to_logrecno[state] = {}
    
            for r in range(0, geography.shape[0]):
                aggregation_level = geography.iloc[r, 2]
                if aggregation_level == '140': # census tract
                    tract_identifier = geography.iloc[r, 48][7:]
                    logrecno = geography.iloc[r, 4]
                    tract_to_logrecno[state][tract_identifier] = logrecno
    
            print 'Found %d tracts for state %s in year %d' % (len(tract_to_logrecno[state]), state, year)
            return

    print '{csv_path} missing, call download_geography_data({year}), skipping {state},{year}'.format(**locals())    

In [40]:
for state in state_names:
    compute_tract_to_logrecno(state, process_year)

State ak has 4193 geography rows
Found 167 tracts for state ak in year 2010
State al has 11466 geography rows
Found 1181 tracts for state al in year 2010
State ar has 12182 geography rows
Found 686 tracts for state ar in year 2010
State az has 11173 geography rows
Found 1526 tracts for state az in year 2010
State ca has 52857 geography rows
Found 8057 tracts for state ca in year 2010
State co has 10108 geography rows
Found 1249 tracts for state co in year 2010
State ct has 6401 geography rows
Found 833 tracts for state ct in year 2010
State dc has 857 geography rows
Found 179 tracts for state dc in year 2010
State de has 1714 geography rows
Found 218 tracts for state de in year 2010
State fl has 28273 geography rows
Found 4245 tracts for state fl in year 2010
State ga has 16360 geography rows
Found 1969 tracts for state ga in year 2010
State hi has 3120 geography rows
Found 351 tracts for state hi in year 2010
State ia has 16074 geography rows
Found 825 tracts for state ia in year 2010

### Interpolate and write columns for data file

In [None]:
# AW 2/15/18: Randy believes this version is older than the one below.  I discovered this after putting in some work to generalize it to a 
# parameterized year.  The current version doesn't work.
# TODO: can we do this with a data frame then write out columns?

# def interpolate_acs_file(year, state, seq):
#     print 'Reading %s:%d for %d' % (state, seq, year)
#     data = read_acs_5year_data(year, state, seq)

#     print 'Mapping locrecno to row'
#     logrecnos = data['LOGRECNO']

#     logrecno_to_row = {}

#     for r, logrecno in enumerate(logrecnos):
#         logrecno_to_row[logrecno] = r
    
#     col_names = data.columns.values[6:]
#     print 'Iterating across %d columns' % len(col_names)
#     for col_name in col_names:
#         input_col = data[col_name]
#         output_col_path = column_dir + '/' + dataset + '/' + col_name + '.float32'
#         if os.path.exists(output_col_path):
#             print '%s already exists, skipping' % output_col_path
#             continue

#         output_col = numpy.zeros(block_populations.size, dtype=numpy.float32)

#         for tract in sorted(tract_to_logrecno[state].keys()):
#             input_pop = input_col[logrecno_to_row[tract_to_logrecno[state][tract]]]
#             if not isinstance(input_pop, numbers.Number):
#                 if input_pop == '.':
#                     input_pop = 0
#                 else:
#                     try:
#                         input_pop = float(input_pop)
#                     except:
#                         print 'That population is'
#                         print input_pop
#                         print type(input_pop)
#                         print '>%s<' % input_pop
#                         input_pop = 0
#             if not tract in tract_block_indexes:
#                 print 'missing tract {tract} from tract_block_indexes'.format(**locals())
#             else:
#                 for block_index in tract_block_indexes[tract]:
#                     if block_populations[block_index]:
#                         output_col[block_index] = input_pop * float(block_populations[block_index]) / tract_populations[tract]
            
#         output_col.tofile(output_col_path + '.tmp')
#         os.rename(output_col_path + '.tmp', output_col_path)
#         print 'Created %s' % output_col_path

# for seq in range(97, 2000):
#     interpolate_acs_file(year, 'pa', seq)

In [29]:
# TODO: can we do this with a data frame then write out columns?

def interpolate_acs_file(year, seq):
    global tract_to_logrecno_year
    sys.stdout.write("interpolating %d:%d\n" % (year, seq))
    
   # Make sure dataset[year] already exists.  If not, prompt to run write_acs_5year_description(year)
    try:
        dataset[year]
    except:
        print "dataset[%d] not defined.  Call write_acs_5year_description(%d) first." % (year, year)
        return None


    # Make sure tract_to_logrecno_year already exists and matches year.  If not, prompt to run compute_tract_to_logrecno(state, %d)
    try:
        tract_to_logrecno_year
    except:
        print "tract_to_logrecno_year not defined.  Call compute_tract_to_logrecno(state, %d) first." % (year)
        return None

    if tract_to_logrecno_year != year:
        print "tract_to_logrecno_year doesn't match.  Call compute_tract_to_logrecno(state, %d) first." % (year)
        return None
    
    output_cols = {}
    missing_tracts = {}
    num_nans=0
    for state in state_names:
        data = read_acs_5year_data(year, state, seq)
    
        logrecnos = data['LOGRECNO']

        logrecno_to_row = {}

        col_names = data.columns.values[6:]
        sys.stdout.write('%s:%d %d has %d columns\n' % (state, seq, year, len(col_names)))
        assert len(col_names) < 500   # sanity check to avoid demanding too much RAM on hal15

        if state == state_names[0]:
            missing = 0
            # First state.  Now that we know the col names, let's see if the output files all already exist
            for col_name in col_names:
                output_col_path = column_dir + '/' + dataset[year] + '/' + col_name + '.float32'
                if not os.path.exists(output_col_path):
                    missing += 1
            if missing == 0:
                sys.stdout.write("All %d columns for sequence %d already exist, skipping\n" % (len(col_names), seq))
                return
        
        for r, logrecno in enumerate(logrecnos):
            logrecno_to_row[logrecno] = r
    
        for col_name in col_names:
            input_col = data[col_name]
                
            if not col_name in output_cols:
                output_cols[col_name] = numpy.zeros(block_populations.size, dtype=numpy.float32)
            output_col = output_cols[col_name]

            for tract in sorted(tract_to_logrecno[state].keys()):
                input_pop = input_col[logrecno_to_row[tract_to_logrecno[state][tract]]]
                if not isinstance(input_pop, numbers.Number):
                    if input_pop == '.':
                        input_pop = 0
                    else:
                        try:
                            input_pop = float(input_pop)
                        except:
                            print 'That population is'
                            print input_pop
                            print type(input_pop)
                            print '>%s<' % input_pop
                            input_pop = 0
                            
                if math.isnan(input_pop):
                    #sys.stdout.write('Warning, %s:%d Tract %s is nan\n' % (state, seq, tract))
                    num_nans=num_nans+1

                if not tract in tract_block_indexes:
                    missing_tracts[tract] = True
                else:
                    for block_index in tract_block_indexes[tract]:
                        if block_populations[block_index]:
                            output_col[block_index] = input_pop * float(block_populations[block_index]) / tract_populations[tract]
            
    sys.stdout.write('Seq %d missing tracts: %s\n' % (seq, sorted(missing_tracts.keys())))
        
    if num_nans>0:
        sys.stdout.write('Seq %d contains %d nans' % (seq,num_nans))
        
    for col_name in sorted(output_cols.keys()):
        output_col_path = column_dir + '/' + dataset[year] + '/' + col_name + '.float32'
        output_cols[col_name].tofile(output_col_path + '.tmp')
        os.rename(output_col_path + '.tmp', output_col_path)
        sys.stdout.write('Created %s with sum %f\n' % (output_col_path, output_cols[col_name].sum()))
    
        

In [41]:
interpolate_acs_file(process_year, 1)

interpolating 2010:1
ak:1 2010 has 230 columns
al:1 2010 has 230 columns
ar:1 2010 has 230 columns
az:1 2010 has 230 columns
ca:1 2010 has 230 columns
co:1 2010 has 230 columns
ct:1 2010 has 230 columns
dc:1 2010 has 230 columns
de:1 2010 has 230 columns
fl:1 2010 has 230 columns
ga:1 2010 has 230 columns
hi:1 2010 has 230 columns
ia:1 2010 has 230 columns
id:1 2010 has 230 columns
il:1 2010 has 230 columns
in:1 2010 has 230 columns
ks:1 2010 has 230 columns
ky:1 2010 has 230 columns
la:1 2010 has 230 columns
ma:1 2010 has 230 columns
md:1 2010 has 230 columns
me:1 2010 has 230 columns
mi:1 2010 has 230 columns
mn:1 2010 has 230 columns
mo:1 2010 has 230 columns
ms:1 2010 has 230 columns
mt:1 2010 has 230 columns
nc:1 2010 has 230 columns
nd:1 2010 has 230 columns
ne:1 2010 has 230 columns
nh:1 2010 has 230 columns
nj:1 2010 has 230 columns
nm:1 2010 has 230 columns
nv:1 2010 has 230 columns
ny:1 2010 has 230 columns
oh:1 2010 has 230 columns
ok:1 2010 has 230 columns
or:1 2010 has 230

Created columncache/acs2010_5year_tract2010/B07403_009.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07403_010.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07403_011.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07403_012.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07403_013.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07403_014.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07403_015.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07404A_001.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07404A_002.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07404A_003.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07404A_004.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07404A_005.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07404B_001.float32 with sum nan
Create

Created columncache/acs2010_5year_tract2010/B07409_002.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_003.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_004.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_005.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_006.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_007.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_008.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_009.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_010.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_011.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_012.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_013.float32 with sum nan
Created columncache/acs2010_5year_tract2010/B07409_014.float32 with sum nan
Created colu

In [None]:
# 4 seems conservative on a 64GB machine
pool = SimpleProcessPoolExecutor(4)

for seq in range(1, 1000):
    pool.submit(interpolate_acs_file, process_year, seq)

pool.shutdown()
None

interpolating 2010:1
interpolating 2010:3
interpolating 2010:2
interpolating 2010:4
ak:1 2010 has 230 columns
All 230 columns for sequence 1 already exist, skipping
ak:3 2010 has 237 columns
interpolating 2010:5
ak:5 2010 has 175 columns
ak:2 2010 has 95 columns
ak:4 2010 has 217 columns
al:2 2010 has 95 columns
al:5 2010 has 175 columns
al:4 2010 has 217 columns
al:3 2010 has 237 columns
ar:2 2010 has 95 columns
az:2 2010 has 95 columns


In [133]:
year

2016

In [101]:
data = read_acs_5year_data(2015, 'ak', 1)

Checking for capture/ACS2015_5year/group2/e20155ak0001000.txt


In [94]:
logrecnos = data['LOGRECNO']

In [96]:
tract_to_logrecno['ak']['02198000300']

'0000617'

In [97]:
for i in range(0,len(logrecnos)):
    if(logrecnos[i]=='0000617'):
        print i

151


In [99]:
col_names = data.columns.values[6:]
col_names

array([u'B00001_001', u'B00002_001'], dtype=object)

In [None]:
len(tract_block_indexes.keys())

In [None]:
!ls -l columncache/acs2015_5year_tract2010/B08006_002.float32

In [91]:
x=numpy.memmap('columncache/acs2015_5year_tract2010/B00001_001.float32', dtype=numpy.float32, mode='r')

In [92]:
x.sum()

memmap(nan, dtype=float32)

In [56]:
x

memmap([ 0.        ,  7.81642246,  0.        , ...,  1.24807394,
        0.        ,  0.        ], dtype=float32)