### prepare_seed_json

This notebook prepares a formatted .json file with for one state with census ids etc filled in.
The JSON is formatted in a way that later allows an easy import into our database to display it values on a map.

In [7]:
import pandas as pd # package for high-performance, easy-to-use data structures and data analysis
import numpy as np # fundamental package for scientific computing with Python

In [8]:
from subprocess import check_output
print(check_output(["ls", "./export/census_tracts_lookup"]).decode("utf8"))

Alabama.csv
Alaska.csv
Arizona.csv
Arkansas.csv
California.csv
Colorado.csv
Connecticut.csv
Delaware.csv
District of Columbia.csv
Florida.csv
Georgia.csv
Hawaii.csv
Idaho.csv
Illinois.csv
Indiana.csv
Iowa.csv
Kansas.csv
Kentucky.csv
Louisiana.csv
Maine.csv
Maryland.csv
Massachusetts.csv
Michigan.csv
Minnesota.csv
Mississippi.csv
Missouri.csv
Montana.csv
Nebraska.csv
Nevada.csv
New Hampshire.csv
New Jersey.csv
New Mexico.csv
New York.csv
North Carolina.csv
North Dakota.csv
Ohio.csv
Oklahoma.csv
Oregon.csv
Pennsylvania.csv
Rhode Island.csv
South Carolina.csv
South Dakota.csv
Tennessee.csv
Texas.csv
Utah.csv
Vermont.csv
Virginia.csv
Washington.csv
West Virginia.csv
Wisconsin.csv
Wyoming.csv
all_states.csv
faulty_census_tracts.csv



In [9]:
census_lookup = pd.read_csv("./export/census_tracts_lookup/California.csv", delimiter=",")

In [10]:
census_lookup.head()

Unnamed: 0,state_code,state,county_code,county,census_tracts,census_tract_number
0,6,California,1,Alameda County,3076,4301.01
1,6,California,1,Alameda County,230,4229.0
2,6,California,1,Alameda County,341,4041.02
3,6,California,1,Alameda County,1107,4371.02
4,6,California,1,Alameda County,2607,4273.0


In [11]:
from subprocess import check_output
print(check_output(["ls", "./input_data"]).decode("utf8"))

fip_codes_2016.csv
geojson
hmda_data_average_california.csv



In [12]:
census_average = pd.read_csv("./input_data/hmda_data_average_california.csv", delimiter=",")

In [13]:
census_average.head()

Unnamed: 0,state_code,state_name,county_name,census_tract_number,avg_loan_state,avg_loan_county,avg_loan_census_num
0,6,California,Fresno County,53.05,425.336311,229.69459,185.479592
1,6,California,Placer County,206.01,425.336311,357.687245,475.932854
2,6,California,Sacramento County,71.05,425.336311,276.394165,249.423729
3,6,California,Los Angeles County,3020.02,425.336311,506.124766,497.522388
4,6,California,Los Angeles County,9200.45,425.336311,506.124766,366.025806


In [72]:
import os

def make_sure_directory_exists(path):
    if not os.path.exists(path):
        os.makedirs(path)
        
def export(path, filename, df):
    make_sure_directory_exists(path)
    filepath = os.path.join(path, filename)
    df.to_csv(filepath ,index=False, sep=',')
        

In [71]:
export_df = census_lookup
export_df['avg_loan_state'] = np.nan
export_df['avg_loan_county'] = np.nan
export_df['avg_loan_census'] = np.nan


states = census_average[['state_code']].drop_duplicates()
for (state_code,) in states.values:
    # set state_average data
    filtered_df = census_average.loc[census_average['state_code'] == state_code]
    avg_loan_state = filtered_df[['avg_loan_state']].drop_duplicates().values[0][0]
    indices = export_df.index[export_df['state_code'] == state_code].tolist()
    export_df['avg_loan_state'].iloc[indices] = avg_loan_state

    counties = filtered_df[['county_name']].drop_duplicates().dropna()
    for (county_name,) in counties.values:
        filtered_county_df = filtered_df.loc[filtered_df['county_name'] == county_name]
        avg_loan_county = filtered_county_df[['avg_loan_county']].drop_duplicates().values[0][0]

        county_indices = export_df.index[(export_df['state_code'] == state_code) & (export_df['county'] == county_name)].tolist()
        if len(county_indices)==0:
            print(county_name)
        else:
            export_df['avg_loan_county'].iloc[county_indices] = avg_loan_county
            
            
        census_tracts = filtered_county_df[['census_tract_number']].drop_duplicates().dropna()
        for (census_tract_number,) in census_tracts.values:
            filtered_census_df = filtered_county_df.loc[filtered_county_df['census_tract_number'] == census_tract_number]
            avg_loan_census = filtered_census_df[['avg_loan_census_num']].drop_duplicates().values[0][0]

            census_indices = export_df.index[(export_df['state_code'] == state_code) & 
                                             (export_df['county'] == county_name) & 
                                             (export_df['census_tract_number'] == census_tract_number)].tolist()
            if len(census_indices)==0:
                print(census_tract_number)
            else:
                export_df['avg_loan_census'].iloc[census_indices] = census_indices
    
export_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


Unnamed: 0,state_code,state,county_code,county,census_tracts,census_tract_number,avg_loan_state,avg_loan_county,avg_loan_census
0,6,California,1,Alameda County,3076,4301.01,425.336311,503.512741,0.0
1,6,California,1,Alameda County,230,4229.0,425.336311,503.512741,1.0
2,6,California,1,Alameda County,341,4041.02,425.336311,503.512741,2.0
3,6,California,1,Alameda County,1107,4371.02,425.336311,503.512741,3.0
4,6,California,1,Alameda County,2607,4273.0,425.336311,503.512741,4.0


In [73]:
export('./export/census_tracts_lookup', 'census_lookup_with_avg_california.csv', export_df)

In [59]:
df = census_lookup.loc[(census_lookup['state_code'] == 6)]
counties_1 = df[['county']].drop_duplicates().values

df_avg = census_average.loc[(census_average['state_code'] == 6)]
counties_2 = df_avg[['county_name']].drop_duplicates().dropna().values

print(len(counties_1), len(counties_2))

for county in counties_1:
    
    if county not in counties_2:
        print(county)

58 58


In [68]:
df = census_lookup.loc[(census_lookup['state_code'] == 6)]
census_1 = df[['census_tract_number']].drop_duplicates().dropna().values

df_avg = census_average.loc[(census_average['state_code'] == 6)]
census_2 = df_avg[['census_tract_number']].drop_duplicates().dropna().values

print(len(census_1), len(census_2))

missing = []
for (census,) in census_1:
    
    if census not in census_2:
        missing.append(census)
        
print('missing', len(missing))

7198 6478
missing 720
