In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json, os, sys, copy
import statistics as stat

In [None]:
# Directory two levels above the current working directory (presumably the
# project root -- confirm against the repository layout).
path = Path.cwd().parent.parent
# Disabled: would build the location of src/data/LoadAndCleanACS.py and %run it.
# NOTE(review): `main_path` is only assigned in a later cell of the visible
# notebook, so these lines would need it defined first if they are re-enabled.
# LoadAndCleanACS = main_path /'src' /'data'/ 'LoadAndCleanACS.py'
# %run $LoadAndCleanACS

In [5]:
def get_average_house_value_for_block(house_value_dict, block_info):
    """Estimate one block's average house value, in dollars.

    house_value_dict maps each house-value range name to the dollar amount
    used to represent that range. block_info is indexed with the same range
    names and supplies the percent of the block's houses in that range.

    Returns the sum of (percent * representative value) over every range in
    house_value_dict, rounded to 2 decimals because the unit is $.
    """
    weighted_total = 0.0
    for range_name, range_value in house_value_dict.items():
        percent_of_block = block_info[range_name]
        weighted_total += float(percent_of_block * range_value)
    return round(weighted_total, 2)
def make_inverted_data_dict(data_path):
    """Load the block-group CSV and attach an average home value to every row.

    Parameters
    ----------
    data_path : path accepted by ``pd.read_csv``
        Location of the CSV. Every row must provide all of the
        ``house_val_*`` columns listed below.

    Returns
    -------
    dict
        Maps each row label of the CSV to a pandas Series holding that row's
        column values, plus an added ``"ave_home_val"`` entry computed by
        ``get_average_house_value_for_block``.

    Notes
    -----
    The open-ended ranges are represented by their boundary: 10k for
    "less than 10K" and 2M for "more than 2M". The lowest range was previously
    entered as 1000, which contradicted that stated assumption; JSON/CSV files
    cached from earlier runs were computed with the old value and should be
    regenerated.
    """
    # Representative dollar value of each house-value range: the midpoint for
    # closed ranges, the boundary for the two open-ended ones. Keeping key and
    # value side by side prevents the two from drifting out of alignment.
    house_value_dict = {
        'house_val_less_10K': 10000,
        'house_val_10K_15K': 12500,
        'house_val_15K_20K': 17500,
        'house_val_20K_25K': 22500,
        'house_val_25K_30K': 27500,
        'house_val_30K_35K': 32500,
        'house_val_35K_40K': 37500,
        'house_val_40K_50K': 45000,
        'house_val_50K_60K': 55000,
        'house_val_60K_70K': 65000,
        'house_val_70K_80K': 75000,
        'house_val_80K_90K': 85000,
        'house_val_90K_100K': 95000,
        'house_val_100K_125K': 112500,
        'house_val_125K_150K': 137500,
        'house_val_150K_175K': 162500,
        'house_val_175K_200K': 187500,
        'house_val_200K_250K': 225000,
        'house_val_250K_300K': 275000,
        'house_val_300K_400K': 350000,
        'house_val_400K_500K': 450000,
        'house_val_500K_750K': 625000,
        'house_val_750K_1M': 875000,
        'house_val_1M_1.5M': 1250000,
        'house_val_1.5M_2M': 1750000,
        'house_val_more_2M': 2000000,
    }

    # Transposing turns every CSV row into a column, so dict() yields
    # {row label: Series of that row's values}.
    data_invert = dict(pd.read_csv(data_path).T)

    # Walk the dict's own keys instead of assuming the labels are 0..n-1.
    for row_label in data_invert:
        block_info = data_invert[row_label]
        block_info["ave_home_val"] = get_average_house_value_for_block(house_value_dict, block_info)
    return data_invert
##############################
def get_county_dict(data_path, data_invert):
    """Group per-block values by county and cache the grouping as JSON.

    data_path is a string prefix; the cache file is data_path followed by
    "county_dict_simple.json". If that file already exists its contents are
    returned as-is and data_invert is not consulted.

    Otherwise data_invert is read at keys 0 .. len(data_invert) - 1. Blocks are
    grouped under the ID county_name + state_name, and each ID maps to four
    parallel lists:
        [ave_home_val values, GEOID values, state_name values, tot_population values]
    The grouping is written to the cache file and returned.
    """
    cache_file = data_path + "county_dict_simple.json"

    # A cache left by an earlier run skips the slow grouping below.
    if os.path.exists(cache_file):
        with open(cache_file, 'r') as json_file:
            return json.load(json_file)

    print("Gathering average home data. This will take a few minutes the first time...")

    # Collect, per county, the values later used to compute the county-wide
    # average home cost that each block group is normalized by.
    county_dict = {}
    for row_idx in range(len(data_invert)):
        block = data_invert[row_idx]
        county_ID = block["county_name"] + block["state_name"]
        home_vals, geoids, state_names, populations = county_dict.setdefault(
            county_ID, [[], [], [], []]
        )
        home_vals.append(block["ave_home_val"])
        geoids.append(block["GEOID"])
        state_names.append(block["state_name"])
        populations.append(block["tot_population"])

    # Persist so a rerun can load the result instead of regrouping.
    with open(cache_file, 'w') as outfile:
        json.dump(county_dict, outfile)
    return county_dict
##############################
def countywide_COLA_per_person(info_list):
    """Population-weighted average home value for one county.

    Some blocks have more residents and likely more homes or multifamily
    complexes, so each block's average house value is multiplied by that
    block's population before averaging. The sum of these adjusted values is
    then divided by the total population counted. This may be biased toward
    denser/city areas.

    This is an estimate of the average cost of living accommodations (COLA)
    per person in the county.

    **** THIS ONLY WEIGHTS THE HOUSE VALUES BY POPULATION. IT IS NOT A TRUE COLA ****
    **** IT ALSO ASSUMES 1 HOUSE PER PERSON, SO IT IS NOT VERY REALISTIC ****

    Parameters
    ----------
    info_list : sequence
        info_list[0] holds the per-block average home values and info_list[3]
        the per-block populations, index-aligned.

    Returns
    -------
    float
        sum(value * population) / sum(population) over the blocks whose value
        is a plain ``float``. NaN is returned when the counted population is 0
        (no usable blocks, or every usable block has zero population) instead
        of raising ZeroDivisionError.
    """
    average_list = info_list[0]
    pop_list = info_list[3]

    adjusted_list = []
    pop_counter = 0
    for i, block_average in enumerate(average_list):
        # Entries that are not exactly float are skipped, along with their
        # population, so they do not dilute the weighted average.
        if type(block_average) is not float:
            continue
        adjusted_list.append(block_average * pop_list[i])
        pop_counter = pop_counter + pop_list[i]

    # Nothing to weight by: report "no value" rather than crash the whole run.
    if pop_counter == 0:
        return float("nan")

    return sum(adjusted_list) / pop_counter
####################################


In [6]:
# PATH TO DATA FOLDER
# Both names point two directory levels above the current working directory.
main_path = Path.cwd().parent.parent
path = main_path

# Data folder as a string with a trailing separator, so file names can simply
# be appended to it.
top_data_path = str(main_path / 'Data') + os.sep

# Location of the ACS block-group CSV inside the data folder.
data_path = os.sep.join(
    [str(path), "Data", "Master Project Data", "ACS 5YR Block Group Data.csv"]
)

print("Loading data. May take a few min")
data_invert = make_inverted_data_dict(data_path)
print("Done. Data loaded")

Loading data. May take a few min
Done. Data loaded


In [14]:
# County-wide average home values: reuse the JSON cached by a previous run when
# it exists, otherwise compute the averages from the per-county data and cache them.
county_dict = get_county_dict(top_data_path, data_invert)
county_averages_file = top_data_path + "countywide_home_averages.json"

if os.path.exists(county_averages_file):
    with open(county_averages_file, 'r') as json_file:
        master_county_averages_dict = json.load(json_file)
else:
    # Precaution check, applied to EVERY county: all blocks grouped under one
    # county ID should report a single state name. Prints nothing if all is good.
    # (The check used to sit outside the loop, so only the last county was tested.)
    for x in county_dict.keys():
        test = list(set(county_dict[x][2]))
        if len(test) != 1:
            print("FAIL", test, x)

    # Make a smaller dict of county-wide average home costs.
    # There are 2 averages per county. One has no weighting (each block counts once):
    #    ave_val = mean([average_home_val_per_block])
    # and the other is weighted by population:
    #    weighted_by_pop_ave_val = sum([average_home_val_per_block * pop_per_block for blocks in county]) / total_pop_of_county
    master_county_averages_dict = {}
    for county_ID, info_list in county_dict.items():
        # Same "plain float only" filter that countywide_COLA_per_person applies.
        raw_ave = stat.mean([val for val in info_list[0] if type(val) == float])
        countywide_COLA_adjust_ave_home_val = countywide_COLA_per_person(info_list)
        master_county_averages_dict[county_ID] = [raw_ave, countywide_COLA_adjust_ave_home_val]

    # SAVE TO JSON FILE FOR EASY LOAD IF RERUN
    with open(county_averages_file, 'w') as outfile:
        json.dump(master_county_averages_dict, outfile)

print("\nGet average values, now lets normalize each block's ave_home_val by its respective county average home val")



Get average values, now lets normalize each block's ave_home_val by its respective county average home val


In [24]:
# Divide every block's average home value by its county's two averages
# (index 0: plain mean, index 1: population-weighted mean) and store both
# ratios on the block.
for row_idx in range(len(data_invert)):
    block = data_invert[row_idx]
    block_ave = block["ave_home_val"]
    county_means = master_county_averages_dict[block["county_name"] + block["state_name"]]

    # Compute both ratios before writing either, so a failure leaves the block untouched.
    ratio_to_raw_mean = float(block_ave / county_means[0])
    ratio_to_weighted_mean = float(block_ave / county_means[1])
    block["normalized_ave_home_val_by_county_average"] = ratio_to_raw_mean
    block["normalized_ave_home_val_by_county_average_PopWeighted"] = ratio_to_weighted_mean

# Back to one row per block: the dict holds one Series per block, so the frame
# built from it has blocks as columns and must be transposed.
data = pd.DataFrame.from_dict(data_invert).T

outfile = top_data_path + "ACS_5YR_Block_Group_Data_w_Normalized_Ave_Home_Vals.csv"
data.to_csv(outfile)
print("DONE WITH NORMALIZING DATA")
print("Data with the Normalized_Ave_Home_Vals is written to:{}".format(outfile))

DONE
