### prepare_seed_json

This notebook prepares a formatted .json file for one state, with census IDs etc. filled in.
The JSON is formatted in a way that later allows an easy import into our database to display its values on a map.

In [1]:
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python

In [2]:
from subprocess import check_output
# List the per-state lookup CSVs available under ./export/census_tracts_lookup.
# This shells out to `ls`, so it needs an environment where that command exists.
print(check_output(["ls", "./export/census_tracts_lookup"]).decode("utf8"))

Alabama.csv
Alaska.csv
Arizona.csv
Arkansas.csv
California.csv
Colorado.csv
Connecticut.csv
Delaware.csv
District of Columbia.csv
Florida.csv
Georgia.csv
Hawaii.csv
Idaho.csv
Illinois.csv
Indiana.csv
Iowa.csv
Kansas.csv
Kentucky.csv
Louisiana.csv
Maine.csv
Maryland.csv
Massachusetts.csv
Michigan.csv
Minnesota.csv
Mississippi.csv
Missouri.csv
Montana.csv
Nebraska.csv
Nevada.csv
New Hampshire.csv
New Jersey.csv
New Mexico.csv
New York.csv
North Carolina.csv
North Dakota.csv
Ohio.csv
Oklahoma.csv
Oregon.csv
Pennsylvania.csv
Rhode Island.csv
South Carolina.csv
South Dakota.csv
Tennessee.csv
Texas.csv
Utah.csv
Vermont.csv
Virginia.csv
Washington.csv
West Virginia.csv
Wisconsin.csv
Wyoming.csv
all_states.csv
faulty_census_tracts.csv



In [3]:
# Load the census-tract lookup table for a single state (California).
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which looks like
# an index that was written into the CSV; no later cell in this notebook reads it.
census_lookup = pd.read_csv("./export/census_tracts_lookup/California.csv", delimiter=",")

In [4]:
# Preview the first rows of the lookup table that the export below is built from.
census_lookup.head()

Unnamed: 0,state_code,state,county_code,county,census_tracts,census_tract_number
0,6,California,1,Alameda County,3076,4301.01
1,6,California,1,Alameda County,230,4229.0
2,6,California,1,Alameda County,341,4041.02
3,6,California,1,Alameda County,1107,4371.02
4,6,California,1,Alameda County,2607,4273.0


In [5]:
import os
import simplejson

def make_sure_directory_exists(path):
    ''' Creates the directory `path` (including missing parents) if needed.

    Calling it for a directory that already exists is a no-op.

    path: directory path to create.
    '''
    # exist_ok=True avoids the check-then-create race of
    # "if not exists: makedirs", where another process may create the
    # directory between the two calls and make makedirs raise.
    os.makedirs(path, exist_ok=True)
        
def export_to_json(path, filename, dic):
    ''' Writes `dic` as indented JSON to the file `filename` inside `path`.

    path: target directory; it is created if it does not exist yet.
    filename: name of the JSON file inside `path`.
    dic: the (JSON serializable) object to write; key order is kept as is.
    '''
    # Serialize before touching the file system: opening the file in "w"
    # mode truncates it, so a serialization error raised afterwards would
    # leave an empty or partial file behind.
    serialized = simplejson.dumps(dic, indent=4, sort_keys=False)

    make_sure_directory_exists(path)
    filepath = os.path.join(path, filename)
    with open(filepath, "w") as f:
        f.write(serialized)


In [6]:
import json
import os

def parse_state_geojson(filepath='./input_data/geojson/us_states.geojson'):
    ''' Returns a dict where each key corresponds to one state_code 
        and its value to the geometry from the geojson.

    filepath: location of the states geojson file; defaults to the path
        this notebook has always used.

    Each value is a dict with the keys 'type' (the geometry type) and
    'coordinates' (the geometry coordinates) of the feature. The key is
    the 'id' of the feature.
    '''
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 instead of the platform default encoding.
    with open(filepath, encoding='utf-8') as geojson_file:
        states_geojson = geojson_file.read()

    # The file may end with a ';' (and whitespace) after the JSON document,
    # which json.loads would reject.
    json_data = json.loads(states_geojson.strip().strip(';'))

    return {
        feature['id']: {
            'type': feature['geometry']['type'],
            'coordinates': feature['geometry']['coordinates']
        }
        for feature in json_data['features']
    }
    
def parse_county_geojson(filepath='./input_data/geojson/us_counties.geojson'):
    ''' Returns a dict where each key corresponds to a string of the form
    '<STATEFP>-<COUNTYFP>' (both taken verbatim from the feature
    properties) and its value to the geometry from the geojson for the
    county.

    filepath: location of the counties geojson file; defaults to the path
        this notebook has always used.

    Each value is a dict with the keys 'type' (the geometry type) and
    'coordinates' (the geometry coordinates) of the feature.
    '''
    # Use a context manager so the file handle is closed deterministically,
    # and read as UTF-8 instead of the platform default encoding.
    with open(filepath, encoding='utf-8') as geojson_file:
        counties_geojson = geojson_file.read()

    # The file may end with a ';' (and whitespace) after the JSON document,
    # which json.loads would reject.
    json_data = json.loads(counties_geojson.strip().strip(';'))

    return_dic = {}
    for feature in json_data['features']:
        properties = feature['properties']
        key = properties['STATEFP'] + '-' + properties['COUNTYFP']
        return_dic[key] = {
            'type': feature['geometry']['type'],
            'coordinates': feature['geometry']['coordinates']
        }

    return return_dic

def parse_census_geojson(path='./input_data/geojson/census_tracts/'):
    ''' Returns a dict where each key corresponds to a string of
    '{:02d}-{:03d}-{}'.format(state_code, county_code, census_tract_number)
    and its value to the geometry from the geojson for the census tract.

    path: directory holding the census tract geojson files; defaults to
        the directory this notebook has always used. Sub-directories and
        hidden files (names starting with '.') are ignored.

    The census_tract_number part of the key is the 'TRACTCE' property with
    a '.' inserted after its first four characters (e.g. '430101' becomes
    '4301.01'). Each value is a dict with the keys 'type' and
    'coordinates' of the feature geometry.
    '''
    return_dic = {}

    # os.listdir replaces the former `ls` subprocess call: it is portable
    # and does not break on odd file names. Sorting and skipping dotfiles
    # mirrors what `ls` listed, so later files still win on duplicate keys.
    for filename in sorted(os.listdir(path)):
        if filename.startswith('.'):
            continue
        filepath = os.path.join(path, filename)
        if os.path.isdir(filepath):
            continue

        # Context manager closes each file as soon as it has been read.
        with open(filepath, encoding='utf-8') as geojson_file:
            census_geojson = geojson_file.read()

        # The file may end with a ';' (and whitespace) after the JSON
        # document, which json.loads would reject.
        json_data = json.loads(census_geojson.strip().strip(';'))

        for feature in json_data['features']:
            properties = feature['properties']
            tract_code = properties['TRACTCE']
            key = '-'.join([
                '{:02d}'.format(int(properties['STATEFP'])),
                '{:03d}'.format(int(properties['COUNTYFP'])),
                tract_code[:4] + '.' + tract_code[4:]
            ])
            return_dic[key] = {
                'type': feature['geometry']['type'],
                'coordinates': feature['geometry']['coordinates']
            }

    return return_dic

In [8]:
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, i.e. the
# path below did not exist. The path has no file extension - possibly a missing ".csv";
# confirm the intended input file.
# `census_average` is not referenced by any later cell visible in this notebook.
census_average = pd.read_csv("./input_data/hmda_data_average_california", delimiter=",")

census_average.head()



FileNotFoundError: File b'./input_data/hmda_data_average_california' does not exist

In [57]:
'''Generates a JSON File with the required map data to display data interactively on the map.'''

# Build the three geometry lookup tables once, up front; get_geojson_dict
# reads them as module-level globals.
state_geojson_lookup = parse_state_geojson()
county_geojson_lookup = parse_county_geojson()
census_geojson_lookup = parse_census_geojson()

# Running tallies of census-tract keys that did / did not resolve to a
# geometry; get_geojson_dict updates them via `global`.
census_keys_found = 0
census_keys_not_found = 0

def get_geojson_dict(state_id=None, county_id=None, census_tract_number=None, kind=None):
    ''' Returns a GeoJSON-like 'Feature' dict for a state, county or census tract.

    kind: one of 'state', 'county' or 'census'; any other value (or missing
        ids for the requested kind) yields the fallback geometry.
    state_id, county_id: integer codes, zero-padded to 2 / 3 digits to
        build the lookup key.
    census_tract_number: number formatted with two decimals and
        zero-padded to 7 characters (e.g. 1.0 -> '0001.00').

    The fallback geometry has type 'Polygon' and coordinates None. For
    kind 'census' the module-level counters census_keys_found /
    census_keys_not_found are incremented depending on the lookup result.
    '''
    global census_keys_found
    global census_keys_not_found

    has_state = state_id is not None
    has_county = county_id is not None

    # At most one of the branches below performs a lookup.
    match = None
    if kind == 'state':
        if has_state:
            match = state_geojson_lookup.get('{:02d}'.format(state_id))
    elif kind == 'county':
        if has_state and has_county:
            match = county_geojson_lookup.get('{:02d}-{:03d}'.format(state_id, county_id))
    elif kind == 'census':
        if has_state and has_county and census_tract_number is not None:
            # TODO: Make sure the ids are formatted properly
            lookup_key = '-'.join([
                '{:02d}'.format(state_id),
                '{:03d}'.format(county_id),
                '{:04.2f}'.format(float(census_tract_number)).zfill(7)
            ])
            match = census_geojson_lookup.get(lookup_key)
            if match is None:
                census_keys_not_found += 1
            else:
                census_keys_found += 1

    if match is None:
        coordinates = None
        geometry_type = 'Polygon'
    else:
        coordinates = match['coordinates']
        geometry_type = match['type']

    geometry = {'type': geometry_type, 'coordinates': coordinates}
    return {'type': 'Feature', 'geometry': geometry}

def get_census_dict(census_tract, census_tract_number, county_code, state_code):
    ''' Returns the export entry for one census tract.

    The entry holds the given census_tract and census_tract_number plus
    the 'geojson' feature looked up via get_geojson_dict (kind 'census')
    for the state / county / tract number combination.
    '''
    tract_geojson = get_geojson_dict(
        state_id=state_code,
        county_id=county_code,
        census_tract_number=census_tract_number,
        kind='census',
    )
    tract_entry = {
        'census_tract': census_tract,
        'census_tract_number': census_tract_number,
    }
    tract_entry['geojson'] = tract_geojson
    return tract_entry

def get_county_dict(df, county, county_code, state_code):
    ''' Returns the export entry for one county, including its census tracts.

    df: lookup frame with (at least) the columns 'county_code',
        'census_tracts' and 'census_tract_number'.
    county: name of the county, copied into the entry.
    county_code: code used to select the rows of this county from df.
    state_code: code of the state the county belongs to.

    Rows without a census_tract_number are skipped; the remaining unique
    ('census_tracts', 'census_tract_number') pairs each become one entry
    built by get_census_dict.
    '''
    county_rows = df.loc[df['county_code'] == county_code]
    # TODO: figure out how to deal with faulty census tracts
    county_rows = county_rows.dropna(axis=0, how='any', subset=['census_tract_number'])
    tract_pairs = county_rows[['census_tracts', 'census_tract_number']].drop_duplicates()

    # Read the two columns separately instead of iterating over
    # `tract_pairs.values`: `.values` on a frame mixing an integer and a
    # float column upcasts everything to float, which turned integer
    # census_tracts such as 3076 into 3076.0 in the exported JSON.
    # ndarray.tolist() also yields plain Python scalars.
    tract_ids = tract_pairs['census_tracts'].values.tolist()
    tract_numbers = tract_pairs['census_tract_number'].values.tolist()

    return {
        'county': county,
        'county_code': county_code,
        'geojson': get_geojson_dict(state_id=state_code, county_id=county_code, kind='county'),
        'census_tracts': [
            get_census_dict(ct, ctn, county_code, state_code)
            for (ct, ctn) in zip(tract_ids, tract_numbers)
        ]
    }

def get_state_dict(df, state, state_code):
    ''' Returns the export entry for one state, including all its counties.

    df: lookup frame with (at least) the columns 'state_code', 'county'
        and 'county_code'.
    state: name of the state, copied into the entry.
    state_code: code used to select the rows of this state from df.

    Every unique ('county', 'county_code') pair of the state becomes one
    entry built by get_county_dict from the rows of that state.
    '''
    state_rows = df.loc[df['state_code'] == state_code]
    county_pairs = state_rows[['county', 'county_code']].drop_duplicates().values

    state_entry = {
        'state': state,
        'state_code': state_code,
        'geojson': get_geojson_dict(state_id=state_code, kind='state'),
    }

    county_entries = []
    for county_name, county_code in county_pairs:
        county_entries.append(get_county_dict(state_rows, county_name, county_code, state_code))
    state_entry['counties'] = county_entries

    return state_entry

   
    
# Export settings: where the JSON files go and whether progress is printed.
verbose = True
path = './export/json'

# One JSON file is written per unique (state, state_code) pair of the lookup.
states = census_lookup[['state', 'state_code']].drop_duplicates()
count = 0
for count, (state, state_code) in enumerate(states.values, start=1):
    export_dict = get_state_dict(census_lookup, state, state_code)

    filename = '2016_{}_HDMA.json'.format(state)
    filepath = os.path.join(path, filename)
    if verbose:
        print("Saving :: {:02d} of {} to '{}'".format(count, len(states.values), filepath))
    export_to_json(path, filename, export_dict)

# Summary, including how many census tract keys resolved to a geometry.
print('Finished')
print('census_keys_found', census_keys_found)
print('census_keys_not_found', census_keys_not_found)


         

Saving :: 01 of 1 to './export/json/2016_California_HDMA.json'
Finished
census_keys_found 8805
census_keys_not_found 1138
