In [1]:
# This version uses Allegheny_County_All_Properties.csv, 
# the Allegheny county subset of the full preservation database
# generated by all-properties-preservation-database.ipynb


In [2]:
# Boilerplate from Randy cheatsheet at https://docs.google.com/document/d/1utZuLHcKQEZNXTQLOysTNCxTHrqxczAUymmtplpn27Q/edit#
import pandas as pd
import geopandas as gpd
import numpy as np
from geopandas import GeoSeries, GeoDataFrame
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [3]:
# Wide display
from IPython.core.display import display, HTML
display(HTML("<style>#notebook-container { margin-left:-14px; width:calc(100% + 27px) !important; }</style>"))

In [4]:
import csv, json, os, math, numbers, pandas, re, scipy, scipy.sparse, shutil, arrow
import subprocess, sys, threading, time, urllib2

def exec_ipynb(filename_or_url):
    nb = (urllib2.urlopen(filename_or_url) if re.match(r'https?:', filename_or_url) else open(filename_or_url)).read()
    jsonNb = json.loads(nb)
    #check for the modified formatting of Jupyter Notebook v4
    if(jsonNb['nbformat'] == 4):
        exec '\n'.join([''.join(cell['source']) for cell in jsonNb['cells'] if cell['cell_type'] == 'code']) in globals()
    else:
        exec '\n'.join([''.join(cell['input']) for cell in jsonNb['worksheets'][0]['cells'] if cell['cell_type'] == 'code']) in globals()

exec_ipynb('timelapse-utilities.ipynb')

In [5]:
pandas.options.display.max_colwidth = 300
pandas.set_option('display.max_columns', 500)

In [6]:
earliest_year=2017
latest_year=2018

def process_cosima_csv(path):
    parsed_pd_data=[]
    pd_data = pandas.read_csv(path)
    
    for i in range(0,len(pd_data.index)):
        try:
            rec = {'row':i}
            rec['lat']= pd_data['Latitude'][i]
            rec['lon']= pd_data['Longitude'][i]
            # We don't know the sizes, so we just pull something from thin air
            rec['total_units'] = 20
            reac_years = []
            reac_vals = []
            for j in range(3,0,-1):
                reac_str = "ReacScore%d"%(j)
                # Check if this reac score is missing
                if(pandas.isnull(pd_data['%s'%(reac_str)][i]) or pandas.isnull(pd_data['%sDate'%(reac_str)][i])):
                    continue
                # Something non-null is there, hope it's valid!
                reac_years.append(pandas.to_datetime(pd_data['%sDate'%(reac_str)][i]).date().year)
                reac_vals.append(pd_data['%s'%(reac_str)][i])
            rec['reac_years'] = reac_years
            rec['reac_vals'] = reac_vals
            # There's no set of programs specified, just set year range to have two columns
            rec['start_year']=earliest_year
            rec['end_year']=latest_year
            parsed_pd_data.append(rec)
        except Exception as e:
            print "%s: Skipping row %d, zip %s, due to error %s" % (program_arr,i,zip,e)
    return parsed_pd_data

In [23]:
parsed_hud_data = process_cosima_csv("https://docs-proxy.cmucreatelab.org/spreadsheets/d/1SasXUMnzGxxnaktgB1GnsA0jjEGOagEQszqiB2nxIZM/export?format=csv&gid=1970308521")


In [25]:
len(parsed_hud_data)

113

In [12]:
def write_pd_csv(parsed_pd_data, start_year, end_year, out_path):
    date_range = range(start_year, end_year+1)
    out = open(out_path, 'w')
    # Write out header row.  First column doesn't have a column heading, next two are lat, lon, then each year
    out.write(",lat,lon,%s\n" % (",".join(map(str,date_range))))
    
    i=0
    for rec in parsed_pd_data:
        start_year = rec['start_year']
        end_year = rec['end_year']
        out_data=[i, rec['lat'], rec['lon']]
        i=i+1
        
        for year in date_range:
            out_val = 0
            # If csv type is:
            #  'all' or 'total', use total_units
            #  'current', use size for year >= start_year and year< end_year (green)
            #  'expiring', use size for year == end_year (yellow)
            #  'expired', use size for year > end_year (red)
            out_val = rec['total_units']
            out_data.append(out_val)
        out.write('%s\n' % (",".join(map(str,out_data))))
    out.close()

In [15]:
write_pd_csv(parsed_ahrco_data, 2017, 2018, "preservationdatabase/pgh_ahrco_2018.csv")

In [26]:
write_pd_csv(parsed_hud_data, 2017, 2018, "preservationdatabase/pgh_hud_2018.csv")

In [31]:
# This uses colormap https://tiles.earthtime.org/colormaps/grey-red-yellow-green.png
a_val=1
b_val=0.666
c_val=0.333
unknown_val=0
def write_reac_code_pd_csv(parsed_pd_data, start_year, end_year, out_path):
    date_range = range(start_year, end_year+1)
    out = open(out_path, 'w')
    # Write out header row.  First column doesn't have a column heading, next two are lat, lon, then each year
    out.write(",lat,lon,%s\n" % (",".join(map(str,date_range))))
    
    i=0
    for rec in parsed_pd_data:
        # If we have no reac scores, skip this one
        if(len(rec['reac_years'])==0):
            start_year = 2017
            end_year = 2018
        else:            
            # Set start_year to be earliest reac_year and end_year to be latest reac_year
            start_year = rec['reac_years'][0]
            end_year = rec['reac_years'][len(rec['reac_years'])-1]
            
        #Output two rows, both with the same label.  First row is bubble size.  Second row is 0 - 1 for status
        row_label = i
        i=i+1
        
        units=0
        if(not 'assisted_units' in rec.keys()):
            units = rec['total_units']
        else:
            units = rec['assisted_units']
        bubble_size = units
        
        # Bubble size row -- size is constant
        size_row=[row_label, rec['lat'], rec['lon']]

        for year in date_range:
            if(len(rec['reac_years'])>0):
                size_row.append(bubble_size)
            else:
                size_row.append(5)
            
        # Bubble color row -- value depends on status
        color_row=[row_label, rec['lat'], rec['lon']]

        # Earliest year will be 
        reac_i = 0
        # If we don't know a reac value, just set it arbitrarily to be 66.  If we do know a reac value this will be overridden by the right stuff later
        last_reac_val = "66b"
        last_reac_year = 2018
        if(len(rec['reac_years'])>0):
            last_reac_val = rec['reac_vals'][0]
            last_reac_year = rec['reac_years'][0]
            
        for year in date_range:
            color_val=0
            # Check if we need to increment reac_i to a later test date
            while(year>last_reac_year and reac_i<(len(rec['reac_years'])-1)):
                last_reac_val = rec['reac_vals'][reac_i]
                reac_i=reac_i+1
                last_reac_year = rec['reac_years'][reac_i]

            # If we've caught up to the last reac_year, set last_reac_val to the end one
            if(reac_i==(len(rec['reac_years'])-1) and year>=end_year):
                last_reac_val = rec['reac_vals'][reac_i]
                
            # At this point last_reac_val is valid for this year so long as year>=start_year.
            # If year < start_year, set color to unknown_val
            #if (year < start_year):
            #    color_val = unknown_val
            #else:
            
            # Randy doesn't want it to start as grey
            if('a' in last_reac_val):
                color_val = a_val
            elif('b' in last_reac_val):
                color_val = b_val
            elif('c' in last_reac_val):
                color_val = c_val
            else:
                color_val = unknown_val
            color_row.append(color_val)
            
        # Write both rows for this property out here
        out.write('%s\n' % (",".join(map(str,size_row))))
        out.write('%s\n' % (",".join(map(str,color_row))))
        
    out.close()

In [32]:
write_reac_code_pd_csv(parsed_ahrco_data, 2003, 2018, "preservationdatabase/pgh_reac_code_ahrco_2018.csv")

In [33]:
write_reac_code_pd_csv(parsed_hud_data, 2003, 2018, "preservationdatabase/pgh_reac_code_hud_2018.csv")

In [37]:
# This uses colormap https://tiles.earthtime.org/colormaps/grey-red-yellow-green.png
unknown_val=0
def write_reac_val_pd_csv(parsed_pd_data, start_year, end_year, out_path):
    date_range = range(start_year, end_year+1)
    out = open(out_path, 'w')
    
    # Setup regular expression for parsing reac score
    reac_re = re.compile('(\d+)([abc])(\*?)')

    # Write out header row.  First column doesn't have a column heading, next two are lat, lon, then each year
    out.write(",lat,lon,%s\n" % (",".join(map(str,date_range))))
    
    i=0
    for rec in parsed_pd_data:
        if(len(rec['reac_years'])==0):
            start_year = 2017
            end_year = 2018
        else:            
            # Set start_year to be earliest reac_year and end_year to be latest reac_year
            start_year = rec['reac_years'][0]
            end_year = rec['reac_years'][len(rec['reac_years'])-1]

        #Output two rows, both with the same label.  First row is bubble size.  Second row is 0 - 1 for status
        row_label = i
        i=i+1
        
        units=0
        if(not 'assisted_units' in rec.keys()):
            units = rec['total_units']
        else:
            units = rec['assisted_units']
        bubble_size = units
        
        # Bubble size row -- size is constant
        size_row=[row_label, rec['lat'], rec['lon']]

        for year in date_range:
            size_row.append(bubble_size)
            
        # Bubble color row -- value depends on status
        color_row=[row_label, rec['lat'], rec['lon']]

        # Earliest year will be 
        reac_i = 0
        # If we don't know a reac value, just set it arbitrarily to be 66.  If we do know a reac value this will be overridden by the right stuff later
        last_reac_val = "66b"
        last_reac_year = 2018
        if(len(rec['reac_years'])>0):
            last_reac_val = rec['reac_vals'][0]
            last_reac_year = rec['reac_years'][0]

        for year in date_range:
            color_val=0
            # Check if we need to increment reac_i to a later test date
            while(year>last_reac_year and reac_i<(len(rec['reac_years'])-1)):
                last_reac_val = rec['reac_vals'][reac_i]
                reac_i=reac_i+1
                last_reac_year = rec['reac_years'][reac_i]

            # If we've caught up to the last reac_year, set last_reac_val to the end one
            if(reac_i==(len(rec['reac_years'])-1) and year>=end_year):
                last_reac_val = rec['reac_vals'][reac_i]
                
            # At this point last_reac_val is valid for this year so long as year>=start_year.
            # If year < start_year, set color to unknown_val
            #if (year < start_year):
            #    color_val = unknown_val
            #else:
            # Randy doesn't want it to start as grey
            if(True):
                # Strip the number from the front of the reac string
                m = reac_re.match(last_reac_val)
                # The number will be in the first group
                color_val = int(m.group(1))
            color_row.append(color_val)
            
        # Write both rows for this property out here
        out.write('%s\n' % (",".join(map(str,size_row))))
        out.write('%s\n' % (",".join(map(str,color_row))))
        
    out.close()

In [38]:
write_reac_val_pd_csv(parsed_ahrco_data, 2003, 2018, "preservationdatabase/pgh_reac_val_ahrco_2018.csv")

In [39]:
write_reac_val_pd_csv(parsed_hud_data, 2003, 2018, "preservationdatabase/pgh_reac_val_hud_2018.csv")

In [46]:
# This was the old path that only included non-expired properties
#pgh_path = "preservationdatabase/Active and Inconclusive Properties Pgh.xlsx"
# This is the new path that includes all properties
pgh_path = "preservationdatabase/Allegheny_County_All_Properties.csv"
# Removed "S8_2" because S8_2_AssistedUnits1, S8_2_StartTime1, S8_2_EndTime1 don't follow the pattern
#for program in ["S8_1","S202_1","S202_2","S236_1","S236_2","FHA_1","FHA_2","LIHTC_1","LIHTC_2"]:
for program_info in [{'name':'S8_m','programs':["S8_1","S8_2"]},
                     {'name':'S202_m','programs':["S202_1","S202_2"]},
                     {'name':'S236_m','programs':["S236_1","S236_2"]},
                     {'name':'FHA_m','programs':["FHA_1","FHA_2"]},
                     {'name':'LIHTC_m','programs':["LIHTC_1","LIHTC_2"]}]:
    program = program_info['name']
    program_arr = program_info['programs']
    parsed_pgh_data = process_preservationdatabase_csv(pgh_path,program_arr)

    #write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_current_%s_2018.csv"%(program),'current')
    #write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_expiring_%s_2018.csv"%(program),'expiring')
    #write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_expired_%s_2018.csv"%(program),'expired')
    #write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_total_%s_2018.csv"%(program),'total')
    #write_combo_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_combo_%s_a_2018.csv"%(program))
    write_reac_code_pd_csv(parsed_pgh_data, 2003,2018,"preservationdatabase/pgh_reac_code_%s_a_2018.csv"%(program))
    write_reac_val_pd_csv(parsed_pgh_data, 2003,2018,"preservationdatabase/pgh_reac_val_%s_a_2018.csv"%(program))

In [35]:
parsed_pgh_data = process_preservationdatabase_csv(pgh_path,[])
#write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_all_2018.csv",'all')
#write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_elderly_2018.csv",'elderly')
#write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_family_2018.csv",'family')
#write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_disabled_2018.csv",'disabled')
#write_pd_csv(parsed_pgh_data, 2017,2051,"preservationdatabase/pgh_mixed_2018.csv",'mixed')
write_reac_code_pd_csv(parsed_pgh_data, 2003,2018,"preservationdatabase/pgh_reac_code_all_a_2018.csv")
write_reac_val_pd_csv(parsed_pgh_data, 2003,2018,"preservationdatabase/pgh_reac_val_all_a_2018.csv")

NameError: name 'process_preservationdatabase_csv' is not defined

In [82]:
len(parsed_pgh_data)

439

In [44]:
for program_info in [{'name':'S8_m','programs':["S8_1","S8_2"]},
                     {'name':'S202_m','programs':["S202_1","S202_2"]},
                     {'name':'S236_m','programs':["S236_1","S236_2"]},
                     {'name':'FHA_m','programs':["FHA_1","FHA_2"]},
                     {'name':'LIHTC_m','programs':["LIHTC_1","LIHTC_2"]}]:
    program = program_info['name']
    print "https://tiles.earthtime.org/preservationdatabase/pgh_current_%s_2018.csv"%(program)
    print "https://tiles.earthtime.org/preservationdatabase/pgh_expiring_%s_2018.csv"%(program)
    print "https://tiles.earthtime.org/preservationdatabase/pgh_expired_%s_2018.csv"%(program)
    print "https://tiles.earthtime.org/preservationdatabase/pgh_total_%s_2018.csv"%(program)    

https://tiles.earthtime.org/preservationdatabase/pgh_current_S8_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_expiring_S8_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_expired_S8_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_total_S8_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_current_S202_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_expiring_S202_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_expired_S202_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_total_S202_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_current_S236_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_expiring_S236_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_expired_S236_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_total_S236_m_2018.csv
https://tiles.earthtime.org/preservationdatabase/pgh_current_FHA_m_2018.csv
https://til

In [None]:
# Public housing history with Jala

In [59]:
phh_path = 'https://docs-proxy.cmucreatelab.org/spreadsheets/d/1SasXUMnzGxxnaktgB1GnsA0jjEGOagEQszqiB2nxIZM/export?format=csv&gid=1721321247'
phh_data = pandas.read_csv(phh_path, dtype={'Start':numpy.str, 'End':numpy.str})

# Sort first by LocID and then by start date.  Get rid of any where LocID is empty
phh_data_s = phh_data[~pd.isna(phh_data.LocID)].sort_values(['LocID', 'Start'], ascending=[True, True]).reset_index()
phh_data_s

Unnamed: 0,index,Residence Name,LocID,Own type,Mgmt type,Address,Size,Start,End,Latitude,Longitude,Notes,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,4,Addision Terrace,Addison,public,public,"2100 Elmore Street, Pittsburgh, PA",736,1940,2011.0,40.443704,-79.976888,"Put back 400 units, leaving over 300 families displaced; opened by Franklin Roosevelt; AKA ""Elmore Square"" and ""Bentley Drive""; moved out families between 2011 and 2013; demolition completed in 2012; rebuilt in 2013 (fewer units)",,,,,,,,,
1,13,Allegheny Dwellings Belleau,AlleghenyDwellingsB,public,public,"1710 Belleau Dr, Pittsburgh, PA 15212",174,1939,,40.460846,-80.006823,,,,,,,,,,
2,14,Allegheny Dwellings Sandusky,AlleghenyDwellingsS,public,public,"1710 Belleau Dr, Pittsburgh, PA 15212",97,1939,2018.0,40.459703,-80.006147,"They tore these down in 2018. Trek Development (private) are now rebuilding 65 town houses (was 97 before). They will be privately owned and managed. Housing Authority plans to move people down from Belleau when construction is done. There are stipulations: job for 12 consecutive months, in ...",https://triblive.com/local/allegheny/13576448-74/housing-authority-of-pittsburgh-converting-allegheny-dwellings-to-mixed-income-community,,,,,,,,
3,7,Allequippa Terrace,Allequippa,public,public,"280 Burrows St, Pittsburgh, PA 15213",1851,1938,1995.0,40.43969,-79.970037,18-26 units per block; first batch of townhomes started replacing it in 1995; don't know when demolition happened,https://newpittsburghcourieronline.com/2011/08/17/final-phase-of-allequippa-terrace-to-oak-hill-completed/,,,,,,,,
4,8,Oak Hill,Allequippa,private,private,,639,1995,2011.0,40.43969,-79.970037,"Was Allequippa Terrace, rebuilt as privately owned mixed income townhouses",https://newpittsburghcourieronline.com/2011/08/17/final-phase-of-allequippa-terrace-to-oak-hill-completed/,,,,,,,,
5,9,Oak Hill,Allequippa,private,private,,725,2011,,40.43969,-79.970037,"Was Allequippa Terrace, rebuilt as privately owned mixed income townhouses",https://newpittsburghcourieronline.com/2011/08/17/final-phase-of-allequippa-terrace-to-oak-hill-completed/,,,,,,,,
6,26,Arlington Heights,Arlington,public,public,,660,1942,1991.0,40.416563,-79.96176,"Arlington Heights, built in 1942, still retains 143 occupied housing units. But much of its 82 acres, too, is a ghost town now — the result of a ""downsizing plan"" implemented in the late 1990s and early 2000s.",https://www.pghcitypaper.com/pittsburgh/up-on-the-farm-could-large-urban-farms-be-the-future-of-two-hilltop-zombie-towns/Content?oid=1688865,"1400 people in 1990, dropped to 238 (units unknown) ~1999 (lost 84% of people)",,,,,,,
7,27,Arlington Heights,Arlington,public,public,,200,1991,1999.0,40.416563,-79.96176,"Arlington Heights, built in 1942, still retains 143 occupied housing units. But much of its 82 acres, too, is a ghost town now — the result of a ""downsizing plan"" implemented in the late 1990s and early 2000s.",https://www.pghcitypaper.com/pittsburgh/up-on-the-farm-could-large-urban-farms-be-the-future-of-two-hilltop-zombie-towns/Content?oid=1688865,,,,,,,,
8,28,Arlington Heights,Arlington,public,public,,143,1999,,40.416563,-79.96176,,,,,,,,,,
9,15,Auburn Towers,Auburn,public,public,"6290 Auburn Street, Pittsburgh, PA",286,1960,2008.0,40.464694,-79.917416,10 story high rise,http://www.post-gazette.com/local/neighborhoods/2008/07/26/Larimer-s-Auburn-Towers-will-fall-on-Monday/stories/200807260145,,,,,,,,


In [60]:
# Build a map of LocID to array of row numbers into phh_data_s
residence_map={}

for i in range(0,len(phh_data_s.index)):
    LocID=phh_data_s.iloc[i]['LocID']
    if LocID in residence_map:
        residence_map[LocID].append(i)
    else:
        residence_map[LocID]=[i]


In [61]:
residence_map

{'Addison': [0],
 'AlleghenyDwellingsB': [1],
 'AlleghenyDwellingsS': [2],
 'Allequippa': [3, 4, 5],
 'Arlington': [6, 7, 8],
 'Auburn': [9],
 'Broadhead': [10, 11, 12],
 'EastMall': [13],
 'Francis': [14],
 'LibPKHr': [15],
 'LibPkSc': [16],
 'Manchester': [17, 18, 19],
 'Northview': [20, 21, 22],
 'Northview HR': [23],
 'PennC': [24],
 'StClair': [25],
 'Westgate': [26, 27],
 'Whiteside': [28]}

In [62]:
next_year = arrow.now().year+2
next_year

2020

In [63]:
# Use color map https://tiles.earthtime.org/colormaps/grey-red-yellow-green-purple.png
current_val=3
private_val = 4
expiring_val=2
expired_val=1

def write_phh_csv(include_sizes, include_latlon, out_path):
    date_range = range(1960, next_year)
    out = open(out_path, 'w')
    # Write out header row.  First column doesn't have a column heading, next two are lat, lon, then each year
    latlon_str=''
    if(include_latlon):
        latlon_str='lat,lon,'
    out.write("id,%s%s\n" % (latlon_str,",".join(map(str,date_range))))
    
    for LocID in residence_map.keys():
        # Start with the earliest entry for this LocID
        i = residence_map[LocID][0]
        
        # Make sure we know lat/lon
        if(include_latlon and (pandas.isnull(phh_data_s['Latitude'][i]) or phh_data_s['Latitude'][i]=='' or
            pandas.isnull(phh_data_s['Longitude'][i]) or phh_data_s['Longitude'][i]=='')):
            continue
            
        #Output two rows, both with the same label.  First row is bubble size.  Second row is 0 - 1 for status
        
        # Bubble size row -- size is 20 if not set, or size if set
        size_row=[str(phh_data_s['Residence Name'][i])]
        if(include_latlon):
            size_row.append(phh_data_s['Latitude'][i])
            size_row.append(phh_data_s['Longitude'][i])

        # Calculate start_year and end_year for this property.  In case that there's more than one row for this 
        # LocID, set start_year to the earliest start_year and end_year to the latest
        # end year.  If the latest end year is NaN then it's still open today.
        end_i = (residence_map[LocID][-1])
        start_year = int(phh_data_s['Start'][i])
        # In case this property is still active, set end_year to be in the future
        end_year = next_year+1
        if(not pd.isna(phh_data_s['End'][end_i])):
            # We have a real end year
            end_year = int(phh_data_s['End'][end_i])
            
        # Use j to keep track of which row we're processing now
        j=i
        for year in date_range:
            if(year<end_year and not pd.isna(phh_data_s['End'][j]) and year >= int(phh_data_s['End'][j])):
                # Go to the next row
                j=j+1
                print "Processing %s for %d, moving to next row %d" % (LocID, year, j)
            #if(phh_data_s['Size'][i]!=''):
            if(year<start_year):
                # Not open yet, set size to zero
                size_row.append(0)
            elif(not pandas.isnull(phh_data_s['Size'][j]) and phh_data_s['Size'][j]!=''):
                size_row.append(int(phh_data_s['Size'][j]))
            else:
                size_row.append(20)
        
        # Bubble color row -- value depends on status
        color_row=[str(phh_data_s['Residence Name'][i])]
        if(include_latlon):
            color_row.append(phh_data_s['Latitude'][i])
            color_row.append(phh_data_s['Longitude'][i])

            
        # Use j to keep track of which row we're processing now
        j=i
        for year in date_range:
            color_val=0
            if(year<end_year):
                if (year >= start_year and year< end_year):
                    if(not pd.isna(phh_data_s['End'][j]) and year >= int(phh_data_s['End'][j])):
                        # Go to the next row
                        j=j+1
                        print "Processing %s for %d, moving to next row %d" % (LocID, year, j)
                    # If Own type or Mgmt type is private, set color to private_val, otherwise use current_val
                    color_val=current_val
                    if(phh_data_s['Own type'][j]=='private' or phh_data_s['Mgmt type'][j]=='private'):
                        color_val=private_val
                    # If Start for this row is > year (which can happen if a property is closed down for a while)
                    # use expired_val
                    if(int(phh_data_s['Start'][j])>year):
                        print "%s %d [%s]: In gap, not open, show as expired"%(LocID, year, j)
                        color_val = expired_val
                    
                elif (year==end_year):
                    color_val=expiring_val
                elif (year>end_year):
                    color_val=expired_val
                color_row.append(color_val)
            else:
                color_row.append(expired_val)
            
        # Write both rows for this property out here if include_sizes is true.  
        # Otherwise just do the color row
        if(include_sizes):
            out.write('%s\n' % (",".join(map(str,size_row))))
        out.write('%s\n' % (",".join(map(str,color_row))))

    out.close()

In [64]:
write_phh_csv(True, True, "allegheny_county/phh-v6.csv")

Processing Allequippa for 1995, moving to next row 4
Processing Allequippa for 2011, moving to next row 5
Processing Allequippa for 1995, moving to next row 4
Processing Allequippa for 2011, moving to next row 5
Processing Westgate for 2001, moving to next row 27
Processing Westgate for 2001, moving to next row 27
Westgate 2001 [27]: In gap, not open, show as expired
Westgate 2002 [27]: In gap, not open, show as expired
Westgate 2003 [27]: In gap, not open, show as expired
Westgate 2004 [27]: In gap, not open, show as expired
Westgate 2005 [27]: In gap, not open, show as expired
Processing Arlington for 1991, moving to next row 7
Processing Arlington for 1999, moving to next row 8
Processing Arlington for 1991, moving to next row 7
Processing Arlington for 1999, moving to next row 8
Processing Broadhead for 1996, moving to next row 11
Processing Broadhead for 1998, moving to next row 12
Processing Broadhead for 1996, moving to next row 11
Processing Broadhead for 1998, moving to next r

## Download File Templates for 5-year data

5-year data is a 5-year average, ending in the named year.
So the recently released ACS2016-5year actually is from 2012-2016

In [4]:
#src = 'https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/2015_1yr_Summary_FileTemplates.zip'
#dest = 'capture/ACS2015_1year/2015_1yr_Summary_FileTemplates.zip'
#download_file(src, dest)
#templates = unzip_file(dest)

def download_file_templates(year):
    src = 'https://www2.census.gov/programs-surveys/acs/summary_file/{year}/data/{year}_5yr_Summary_FileTemplates.zip'.format(**locals())

    # Special-case 2010
    src = src.replace('2010_5yr_Summary_File', '2010_5yr_SummaryFile')
    
    dest = 'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates.zip'.format(**locals())
    download_file(src, dest)
    templates = unzip_file(dest)
    
for year in range(2009, 2017):
    download_file_templates(year)

capture/ACS2009_5year/2009_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2009_5year/2009_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2010_5year/2010_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2010_5year/2010_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2011_5year/2011_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2011_5year/2011_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2012_5year/2012_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2012_5year/2012_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2013_5year/2013_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2013_5year/2013_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2014_5year/2014_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2014_5year/2014_5yr_Summary_FileTemplates.zip already unzipped
capture/ACS2015_5year/2015_5yr_Summary_FileTemplates.zip already downloaded
capture/ACS2015_5year/20

In [None]:
!ls -l capture/ACS2015_1year/2015_1yr_Summary_FileTemplates/Templates | head

## Download ACS2015 5-year data (tract and block group)

In [6]:
process_year=2009

In [5]:
def download_data(year):
    filename = 'Tracts_Block_Groups_Only'
    if year < 2011:
        filename += '.zip'
    else:
        filename += '.tar.gz'
    src = 'https://www2.census.gov/programs-surveys/acs/summary_file/{year}/data/5_year_entire_sf/{filename}'.format(**locals())
    dest = 'capture/ACS{year}_5year/{filename}'.format(**locals())

    if os.path.exists(dest):
        print '{dest} already exists, skipping'.format(**locals())
    else:
        try:
            os.unlink(filename)
        except OSError:
            pass
        cmd = '/usr/bin/curl'
        cmd += " -H 'User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'"
        cmd += ' {src}'.format(**locals())
        cmd += ' >{dest}'.format(**locals())
        try:
            os.makedirs(os.path.dirname(dest))
        except OSError:
            pass
        print cmd
        subprocess_check(cmd)
        print 'Downloaded to {dest}'.format(**locals())

for year in range(2009, 2017):
    download_data(year)

capture/ACS2009_5year/Tracts_Block_Groups_Only.zip already exists, skipping
capture/ACS2010_5year/Tracts_Block_Groups_Only.zip already exists, skipping
capture/ACS2011_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2012_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2013_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2014_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2015_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping
capture/ACS2016_5year/Tracts_Block_Groups_Only.tar.gz already exists, skipping


In [6]:
!ls -l capture/ACS*/Tracts*

-rw-rw-r-- 1 rsargent rsargent 2806502508 Oct  5 07:52 capture/ACS2009_5year/Tracts_Block_Groups_Only.zip
-rw-rw-r-- 1 rsargent rsargent 3369803296 Oct  5 07:59 capture/ACS2010_5year/Tracts_Block_Groups_Only.zip
-rw-rw-r-- 1 rsargent rsargent 3297054880 Oct  5 08:12 capture/ACS2011_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3651813394 Oct  5 07:33 capture/ACS2012_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3769295680 Oct  5 07:45 capture/ACS2013_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3757945352 Oct  5 07:59 capture/ACS2014_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3747109902 Dec  2  2016 capture/ACS2015_5year/Tracts_Block_Groups_Only.tar.gz
-rw-rw-r-- 1 rsargent rsargent 3780352044 Feb 14 15:00 capture/ACS2016_5year/Tracts_Block_Groups_Only.tar.gz


In [None]:
# !mkdir -p capture/ACS2005_5year
# !mv  capture/ACS2005_5year
#
# !cd capture/ACS2005_5year; tar xvfz Tracts_Block_Groups_Only.tar.gz >/dev/null
#
# !wget --header="User-Agent: Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" https://www2.census.gov/programs-surveys/acs/summary_file/2015/data/5_year_entire_sf/2015_ACS_Geography_Files.zip
#
# !mv 2015_ACS_Geography_Files.zip capture/ACS2005_5year
# 
# unzip_file('capture/ACS2005_5year/2015_ACS_Geography_Files.zip')

In [5]:
def download_geography_data(year, force_regenerate=False):
    fname = ("{year}_ACS_Geography_Files.zip").format(**locals())
    cdir = ("capture/ACS{year}_5year").format(**locals())
    fpath = ("{cdir}/{fname}").format(**locals())
    
    if os.path.exists(fpath) and not force_regenerate:
        print '{fpath} already exists, skipping'.format(**locals())
        return
    
    url_template = "https://www2.census.gov/programs-surveys/acs/summary_file/{year}/data/5_year_entire_sf/{fname}"
    url = url_template.format(**locals())
    !wget --header="User-Agent: Mozilla/5.0 (Windows NT 6.0) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11" $url
    
    !mv $fname $cdir
    unzip_file(fpath)
    print "Downloaded %s to %s" % (fname,fpath)

In [6]:
download_geography_data(process_year)

--2018-02-28 22:16:08--  https://www2.census.gov/programs-surveys/acs/summary_file/2009/data/5_year_entire_sf/2009_ACS_Geography_Files.zip
Resolving www2.census.gov (www2.census.gov)... 23.36.91.141, 2600:1408:7:291::208c, 2600:1408:7:2a5::208c
Connecting to www2.census.gov (www2.census.gov)|23.36.91.141|:443... connected.
HTTP request sent, awaiting response... 404 Not Found
2018-02-28 22:16:09 ERROR 404: Not Found.

mv: cannot stat '2009_ACS_Geography_Files.zip': No such file or directory
Unzipping capture/ACS2009_5year/2009_ACS_Geography_Files.zip into capture/ACS2009_5year/2009_ACS_Geography_Files.tmp


Exception: Call to subprocess_check failed with return code 9
Standard error:
unzip:  cannot find or open capture/ACS2009_5year/2009_ACS_Geography_Files.zip, capture/ACS2009_5year/2009_ACS_Geography_Files.zip.zip or capture/ACS2009_5year/2009_ACS_Geography_Files.zip.ZIP.
Standard out:


## Read CSV utility functions

In [7]:
def read_acs_5year_template(year, seqno):
    for template in ['capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/{year}_5yr_Templates/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/seq/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/templates/Seq{seqno}.xls',
                     'capture/ACS{year}_5year/{year}_5yr_Summary_FileTemplates/Seq%04d.xls'%(seqno)]:
        path = template.format(**locals())
        #print 'Checking for {path}'.format(**locals())
        if os.path.exists(path):
            return pandas.read_excel(path)
    #print 'yo could not find {year}:{seqno}'.format(**locals())
    return None

def find_acs_5year_data(year, state, seqno):
    fname = 'e%d5%s%04d000.txt' % (year, state, seqno)
    for template in ['capture/ACS{year}_5year/group2/{fname}',
                     'capture/ACS{year}_5year/data/tab4/sumfile/prod/2012thru2016/group2/{fname}',
                     'capture/ACS{year}_5year/tab4/sumfile/prod/2010thru2014/group2/{fname}',
                     'capture/ACS{year}_5year/tab4/sumfile/prod/2008thru2012/group2/{fname}',
                     'capture/ACS{year}_5year/tab4/sumfile/prod/2006thru2010/group2/{fname}']:
        path = template.format(**locals())
        #print 'Checking for {path}'.format(**locals())
        if os.path.exists(path):
            return path
    print 'Could not find {year}:{seqno} file {fname}'.format(**locals())
    return None

# Combine template header and data into pandas frame
def read_acs_5year_data(year, state, seqno):
    header = read_acs_5year_template(year, seqno)
    data_fname = find_acs_5year_data(year, state, seqno)
    if not data_fname:
        return None
    else:
        data = pandas.read_csv(data_fname,
                               index_col=False,
                               dtype={'FILEID':numpy.str,
                                      'FILETYPE':numpy.str,
                                      'STUSAB':numpy.str,
                                      'CHARITER':numpy.str,
                                      'SEQUENCE':numpy.str,
                                      'LOGRECNO':numpy.str},
                               header=None,
                               names=header.columns.values)
        return data

In [8]:
read_acs_5year_template(process_year, 1)

Unnamed: 0,FILEID,FILETYPE,STUSAB,CHARITER,SEQUENCE,LOGRECNO,B07401_001,B07401_002,B07401_003,B07401_004,...,B07409_021,B07409_022,B07409_023,B07409_024,B07409_025,B07409_026,B07409_027,B07409_028,B07409_029,B07409_030
0,FILEID,FILETYPE,STUSAB,CHARITER,SEQUENCE,LOGRECNO,Population 1 year and over in the United States,Population 1 year and over in the United States% 1 to 4 years,Population 1 year and over in the United States% 5 to 17 years,Population 1 year and over in the United States% 18 and 19 years,...,Population 25 years and over in the United States% Moved to different county within same state:% High school graduate (includes equivalency),Population 25 years and over in the United States% Moved to different county within same state:% Some college or associate's degree,Population 25 years and over in the United States% Moved to different county within same state:% Bachelor's degree,Population 25 years and over in the United States% Moved to different county within same state:% Graduate or professional degree,Population 25 years and over in the United States% Moved to different state:,Population 25 years and over in the United States% Moved to different state:% Less than high school graduate,Population 25 years and over in the United States% Moved to different state:% High school graduate (includes equivalency),Population 25 years and over in the United States% Moved to different state:% Some college or associate's degree,Population 25 years and over in the United States% Moved to different state:% Bachelor's degree,Population 25 years and over in the United States% Moved to different state:% Graduate or professional degree


In [9]:
read_acs_5year_data(process_year,'pa', 1)

Unnamed: 0,FILEID,FILETYPE,STUSAB,CHARITER,SEQUENCE,LOGRECNO,B07401_001,B07401_002,B07401_003,B07401_004,...,B07409_021,B07409_022,B07409_023,B07409_024,B07409_025,B07409_026,B07409_027,B07409_028,B07409_029,B07409_030


## Write ACSYYYY 5-year description.html

In [10]:
# Check if dataset is already defined.  If not, define it as a map, otherwise, leave it alone
try:
  dataset
except NameError:
  dataset = {}

column_dir = 'columncache'

def write_acs_5year_description(year, force_regenerate=False):
    dataset[year] = 'acs{year}_5year_tract2010'.format(**locals())
    description_path = column_dir + '/' + dataset[year] + '/description.html'

    if os.path.exists(description_path) and not force_regenerate:
        print '{description_path} already exists, skipping'.format(**locals())
        return

    table_rows = []

    for seqno in range(1, 1000):
        template = read_acs_5year_template(year, seqno)
        if template is None:
            break
        for col in range(6, template.shape[1]):
            colname = template.columns.values[col]
            description = template.iloc[0,col]
            try:
                description = description.replace(':', '')
                description = re.sub(r'\s*%\s*', ' &mdash; ', description)
            except:
                print "%d:%d col %d description = '%s', using '%s' instead" % (year, seqno, col, description,colname)
                description = colname
            # format can't handle array reference, so put dataset[year] in a flat variable for the format to work
            dataset_var = dataset[year]
            table_rows.append(u'<tr><td>{dataset_var}.{colname}</td><td>{description}</td></tr>\n'.format(**locals()))

    html = '<table>' + ''.join(table_rows) + '</table>'

    try:
        os.makedirs(os.path.dirname(description_path))
    except:
        pass
    open(description_path, 'w').write(html.encode('utf8'))
    print 'Wrote %d column names and descriptions to %s' % (len(table_rows), description_path)
    print 'Check it out at http://dotmaptiles.createlab.org/data/acs{year}_5year_tract2010'.format(**locals())
    

In [11]:
write_acs_5year_description(process_year)

2009:57 col 128 description = 'nan', using 'B19080_005' instead
2009:57 col 134 description = 'nan', using 'B19081_006' instead
2009:57 col 140 description = 'nan', using 'B19082_006' instead
2009:96 col 49 description = 'nan', using 'B25005_002' instead
2009:105 col 6 description = 'nan', using 'B98001_001' instead
2009:105 col 7 description = 'nan', using 'B98001_002' instead
2009:105 col 8 description = 'nan', using 'B98002_001' instead
2009:105 col 9 description = 'nan', using 'B98002_002' instead
Wrote 21207 column names and descriptions to columncache/acs2009_5year_tract2010/description.html
Check it out at http://dotmaptiles.createlab.org/data/acs2009_5year_tract2010


## Create ACS2015 block-level population

### Read 2010 block geoids and 2010 block populations

In [12]:
block_populations = numpy.load('columncache/census2010_block2010/p001001.numpy')
print 'block_populations has', sum(block_populations), 'total people'

block_populations has 308745538 total people


In [13]:
# block_geoids_2010 = [row[0] for row in query_psql("SELECT geoid2010 FROM sf1_2010_block_p001 order by blockidx2010")]
block_geoids_2010 = json.load(open('block_geoids_2010.json'))
print 'There are', len(block_geoids_2010), 'blocks'

assert(len(block_geoids_2010) + 1 == len(block_populations))

There are 11078297 blocks


### Compute 2010 population by tract and block indices from tract


In [14]:
tract_populations = {}
tract_block_indexes = {}

for block_index_minus_one, block_geoid in enumerate(block_geoids_2010):
    block_index = block_index_minus_one + 1
    tract_name = block_geoid[0:11] # SSCCCTTTTTT
    if tract_name not in tract_populations:
        tract_populations[tract_name] = 0
        tract_block_indexes[tract_name] = []
    tract_populations[tract_name] += block_populations[block_index]
    tract_block_indexes[tract_name].append(block_index)

print 'There are', len(tract_populations), 'tracts'
print 'tract_populations has', sum(tract_populations.values()), 'people'

There are 73057 tracts
tract_populations has 308745538 people


### Map tract identifiers to LOGRECNO using geography file

In [17]:
tract_to_logrecno_year=None
tract_to_logrecno = {}

def compute_tract_to_logrecno(state, year):
    global tract_to_logrecno_year
    tract_to_logrecno_year=year
    
    # In the case of 2009, use the 2010 geography files
    geo_file_year = year
    if(geo_file_year == 2009):
        geo_file_year = 2010
        
    for template in ["capture/ACS{geo_file_year}_5year/{geo_file_year}_ACS_Geography_Files/g{geo_file_year}5{state}.csv",
                     "capture/ACS{geo_file_year}_5year/{geo_file_year}_ACS_Geography_Files/geo/g{geo_file_year}5{state}.csv",
                     "capture/ACS{geo_file_year}_5year/{geo_file_year}_ACS_Geography_Files/tab4/sumfile/prod/2009thru2013/geo/g{geo_file_year}5{state}.csv",
                     "capture/ACS{geo_file_year}_5year/{geo_file_year}_ACS_Geography_Files/geog/g{geo_file_year}5{state}.csv"]:
        csv_path = template.format(**locals())
        if os.path.exists(csv_path):
            geography = pandas.read_csv(csv_path,
                                        dtype=numpy.str,
                                        index_col=False,
                                        header=None,
                                        keep_default_na=False,
                                        na_values=[])

            nrows = geography.shape[0]
            print 'State {state} has {nrows} geography rows'.format(**locals())
    
            ntracts = 0
            tract_to_logrecno[state] = {}
    
            for r in range(0, geography.shape[0]):
                aggregation_level = geography.iloc[r, 2]
                if aggregation_level == '140': # census tract
                    tract_identifier = geography.iloc[r, 48][7:]
                    logrecno = geography.iloc[r, 4]
                    tract_to_logrecno[state][tract_identifier] = logrecno
    
            print 'Found %d tracts for state %s in year %d' % (len(tract_to_logrecno[state]), state, year)
            return

    print '{csv_path} missing, call download_geography_data({geo_file_year}), skipping {state},{geo_file_year}'.format(**locals())    

In [18]:
for state in state_names:
    compute_tract_to_logrecno(state, process_year)

State ak has 4193 geography rows
Found 167 tracts for state ak in year 2009
State al has 11466 geography rows
Found 1181 tracts for state al in year 2009
State ar has 12182 geography rows
Found 686 tracts for state ar in year 2009
State az has 11173 geography rows
Found 1526 tracts for state az in year 2009
State ca has 52857 geography rows
Found 8057 tracts for state ca in year 2009
State co has 10108 geography rows
Found 1249 tracts for state co in year 2009
State ct has 6401 geography rows
Found 833 tracts for state ct in year 2009
State dc has 857 geography rows
Found 179 tracts for state dc in year 2009
State de has 1714 geography rows
Found 218 tracts for state de in year 2009
State fl has 28273 geography rows
Found 4245 tracts for state fl in year 2009
State ga has 16360 geography rows
Found 1969 tracts for state ga in year 2009
State hi has 3120 geography rows
Found 351 tracts for state hi in year 2009
State ia has 16074 geography rows
Found 825 tracts for state ia in year 2009

### Interpolate and write columns for data file

In [None]:
# AW 2/15/18: Randy believes this version is older than the one below.  I discovered this after putting in some work to generalize it to a 
# parameterized year.  The current version doesn't work.
# TODO: can we do this with a data frame then write out columns?

# def interpolate_acs_file(year, state, seq):
#     print 'Reading %s:%d for %d' % (state, seq, year)
#     data = read_acs_5year_data(year, state, seq)

#     print 'Mapping locrecno to row'
#     logrecnos = data['LOGRECNO']

#     logrecno_to_row = {}

#     for r, logrecno in enumerate(logrecnos):
#         logrecno_to_row[logrecno] = r
    
#     col_names = data.columns.values[6:]
#     print 'Iterating across %d columns' % len(col_names)
#     for col_name in col_names:
#         input_col = data[col_name]
#         output_col_path = column_dir + '/' + dataset + '/' + col_name + '.float32'
#         if os.path.exists(output_col_path):
#             print '%s already exists, skipping' % output_col_path
#             continue

#         output_col = numpy.zeros(block_populations.size, dtype=numpy.float32)

#         for tract in sorted(tract_to_logrecno[state].keys()):
#             input_pop = input_col[logrecno_to_row[tract_to_logrecno[state][tract]]]
#             if not isinstance(input_pop, numbers.Number):
#                 if input_pop == '.':
#                     input_pop = 0
#                 else:
#                     try:
#                         input_pop = float(input_pop)
#                     except:
#                         print 'That population is'
#                         print input_pop
#                         print type(input_pop)
#                         print '>%s<' % input_pop
#                         input_pop = 0
#             if not tract in tract_block_indexes:
#                 print 'missing tract {tract} from tract_block_indexes'.format(**locals())
#             else:
#                 for block_index in tract_block_indexes[tract]:
#                     if block_populations[block_index]:
#                         output_col[block_index] = input_pop * float(block_populations[block_index]) / tract_populations[tract]
            
#         output_col.tofile(output_col_path + '.tmp')
#         os.rename(output_col_path + '.tmp', output_col_path)
#         print 'Created %s' % output_col_path

# for seq in range(97, 2000):
#     interpolate_acs_file(year, 'pa', seq)

In [19]:
# TODO: can we do this with a data frame then write out columns?

def interpolate_acs_file(year, seq):
    global tract_to_logrecno_year
    sys.stdout.write("interpolating %d:%d\n" % (year, seq))
    
   # Make sure dataset[year] already exists.  If not, prompt to run write_acs_5year_description(year)
    try:
        dataset[year]
    except:
        print "dataset[%d] not defined.  Call write_acs_5year_description(%d) first." % (year, year)
        return None


    # Make sure tract_to_logrecno_year already exists and matches year.  If not, prompt to run compute_tract_to_logrecno(state, %d)
    try:
        tract_to_logrecno_year
    except:
        print "tract_to_logrecno_year not defined.  Call compute_tract_to_logrecno(state, %d) first." % (year)
        return None

    if tract_to_logrecno_year != year:
        print "tract_to_logrecno_year doesn't match.  Call compute_tract_to_logrecno(state, %d) first." % (year)
        return None
    
    output_cols = {}
    missing_tracts = {}
    num_nans=0
    for state in state_names:
        data = read_acs_5year_data(year, state, seq)
    
        logrecnos = data['LOGRECNO']

        logrecno_to_row = {}

        col_names = data.columns.values[6:]
        sys.stdout.write('%s:%d %d has %d columns\n' % (state, seq, year, len(col_names)))
        assert len(col_names) < 500   # sanity check to avoid demanding too much RAM on hal15

        if state == state_names[0]:
            missing = 0
            # First state.  Now that we know the col names, let's see if the output files all already exist
            for col_name in col_names:
                output_col_path = column_dir + '/' + dataset[year] + '/' + col_name + '.float32'
                if not os.path.exists(output_col_path):
                    missing += 1
            if missing == 0:
                sys.stdout.write("All %d columns for sequence %d already exist, skipping\n" % (len(col_names), seq))
                return
        
        for r, logrecno in enumerate(logrecnos):
            logrecno_to_row[logrecno] = r
    
        for col_name in col_names:
            input_col = data[col_name]
                
            if not col_name in output_cols:
                output_cols[col_name] = numpy.zeros(block_populations.size, dtype=numpy.float32)
            output_col = output_cols[col_name]

            for tract in sorted(tract_to_logrecno[state].keys()):
                input_pop = input_col[logrecno_to_row[tract_to_logrecno[state][tract]]]
                if not isinstance(input_pop, numbers.Number):
                    if input_pop == '.':
                        input_pop = 0
                    else:
                        try:
                            input_pop = float(input_pop)
                        except:
                            print 'That population is'
                            print input_pop
                            print type(input_pop)
                            print '>%s<' % input_pop
                            input_pop = 0
                            
                if math.isnan(input_pop):
                    #sys.stdout.write('Warning, %s:%d Tract %s is nan\n' % (state, seq, tract))
                    num_nans=num_nans+1

                if not tract in tract_block_indexes:
                    missing_tracts[tract] = True
                else:
                    for block_index in tract_block_indexes[tract]:
                        if block_populations[block_index]:
                            output_col[block_index] = input_pop * float(block_populations[block_index]) / tract_populations[tract]
            
    sys.stdout.write('Seq %d missing tracts: %s\n' % (seq, sorted(missing_tracts.keys())))
        
    if num_nans>0:
        sys.stdout.write('Seq %d contains %d nans' % (seq,num_nans))
        
    for col_name in sorted(output_cols.keys()):
        output_col_path = column_dir + '/' + dataset[year] + '/' + col_name + '.float32'
        output_cols[col_name].tofile(output_col_path + '.tmp')
        os.rename(output_col_path + '.tmp', output_col_path)
        sys.stdout.write('Created %s with sum %f\n' % (output_col_path, output_cols[col_name].sum()))
    
        

In [21]:
data = read_acs_5year_data(2009, 'ak', 1)

In [24]:
logrecnos = data['LOGRECNO']
col_names = data.columns.values[6:]
col_names

array([u'B07401_001', u'B07401_002', u'B07401_003', u'B07401_004',
       u'B07401_005', u'B07401_006', u'B07401_007', u'B07401_008',
       u'B07401_009', u'B07401_010', u'B07401_011', u'B07401_012',
       u'B07401_013', u'B07401_014', u'B07401_015', u'B07401_016',
       u'B07401_017', u'B07401_018', u'B07401_019', u'B07401_020',
       u'B07401_021', u'B07401_022', u'B07401_023', u'B07401_024',
       u'B07401_025', u'B07401_026', u'B07401_027', u'B07401_028',
       u'B07401_029', u'B07401_030', u'B07401_031', u'B07401_032',
       u'B07401_033', u'B07401_034', u'B07401_035', u'B07401_036',
       u'B07401_037', u'B07401_038', u'B07401_039', u'B07401_040',
       u'B07401_041', u'B07401_042', u'B07401_043', u'B07401_044',
       u'B07401_045', u'B07401_046', u'B07401_047', u'B07401_048',
       u'B07401_049', u'B07401_050', u'B07401_051', u'B07401_052',
       u'B07401_053', u'B07401_054', u'B07401_055', u'B07401_056',
       u'B07401_057', u'B07401_058', u'B07401_059', u'B07401_0

In [27]:
logrecnos

Series([], Name: LOGRECNO, dtype: object)

In [26]:
for tract in sorted(tract_to_logrecno['ak'].keys()):
    print "%s %s" % (tract_to_logrecno['ak'][tract], logrecno_to_row[tract_to_logrecno['ak'][tract]])

NameError: name 'logrecno_to_row' is not defined

In [20]:
interpolate_acs_file(process_year, 1)

interpolating 2009:1
ak:1 2009 has 230 columns


KeyError: '0001037'

In [None]:
# 4 seems conservative on a 64GB machine
pool = SimpleProcessPoolExecutor(4)

for seq in range(1, 1000):
    pool.submit(interpolate_acs_file, process_year, seq)

pool.shutdown()
None

interpolating 2010:1
interpolating 2010:3
interpolating 2010:2
interpolating 2010:4
ak:1 2010 has 230 columns
All 230 columns for sequence 1 already exist, skipping
ak:3 2010 has 237 columns
interpolating 2010:5
ak:5 2010 has 175 columns
ak:2 2010 has 95 columns
ak:4 2010 has 217 columns
al:2 2010 has 95 columns
al:5 2010 has 175 columns
al:4 2010 has 217 columns
al:3 2010 has 237 columns
ar:2 2010 has 95 columns
az:2 2010 has 95 columns


In [133]:
year

2016

In [101]:
data = read_acs_5year_data(2015, 'ak', 1)

Checking for capture/ACS2015_5year/group2/e20155ak0001000.txt


In [94]:
logrecnos = data['LOGRECNO']

In [96]:
tract_to_logrecno['ak']['02198000300']

'0000617'

In [97]:
for i in range(0,len(logrecnos)):
    if(logrecnos[i]=='0000617'):
        print i

151


In [99]:
col_names = data.columns.values[6:]
col_names

array([u'B00001_001', u'B00002_001'], dtype=object)

In [None]:
len(tract_block_indexes.keys())

In [None]:
!ls -l columncache/acs2015_5year_tract2010/B08006_002.float32

In [91]:
x=numpy.memmap('columncache/acs2015_5year_tract2010/B00001_001.float32', dtype=numpy.float32, mode='r')

In [92]:
x.sum()

memmap(nan, dtype=float32)

In [56]:
x

memmap([ 0.        ,  7.81642246,  0.        , ...,  1.24807394,
        0.        ,  0.        ], dtype=float32)