In [None]:
# Import libraries

import sys
import re
import requests
import asyncio
import nest_asyncio
import aiohttp
import time
import warnings
from datetime import datetime

# Suppress unnecessary Shapely warning
warnings.filterwarnings('ignore',
                        '.*Shapely GEOS version.*')

from aiohttp import ClientSession
from requests import request, Session
from itertools import product, repeat
import os
from dotenv import load_dotenv
from os import getenv
from threading import Thread
import time
import inspect
import pandas as pd
import geopandas as gp
import shapely
import pygeos
from functools import reduce
from pandas.plotting import lag_plot
import pickle
import numpy as np
import seaborn as sns
import datetime as dt
import copy
import math
from decimal import Decimal
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator
import matplotlib.dates as mdates
import matplotlib.gridspec as gridspec
from matplotlib.gridspec import GridSpec
from matplotlib.offsetbox import AnchoredText
import matplotlib as mpl
import plotly.express as px
from sklearn.metrics import mean_absolute_error
from multiprocess import Process, Pool

# All helper functions are in this module:
from helper_functions.census_functions import *

# Set up Pandas defaults
pd.options.display.float_format = '{:.4f}'.format
pd.set_option("display.max_columns", None)

## Get Census Key

In [None]:
# Test getting census key.
# Only report whether a key was loaded: printing the key itself would store the
# secret in the notebook's saved output.
# NOTE(review): `census_key` is not defined in this notebook; presumably it comes
# from the helper_functions.census_functions star import — confirm.
print("Census key loaded:", bool(census_key))


### Create the directories for file-saving

In [None]:
# Create each output directory the notebook saves into
output_root = "datasets/cleaned_census_api_files/"
for subfolder in ("", "graphable/", "raw/", "standardized/"):
    create_folder(output_root + subfolder)

## Following Tutorial Below
Referencing [this](https://www.youtube.com/watch?v=LW-M_UC0VTE) tutorial.

Here is the [Census API](https://www.census.gov/data/developers/data-sets.html)

## Get Population at the Tract Level

### Get Population by block group for 2013-2021

In [None]:
%%time
# Start list
df_list = []
nest_asyncio.apply()

# Start session
session = Session()

# Define our API variable
# It's within a dictionary because some variables
# can change names from year to year (but not all)
census_code_dict = {
    2013: 'B01003_001E',
    2014: 'B01003_001E',
    2015: 'B01003_001E', # No change in code number this year
    2016: 'B01003_001E',
    2017: 'B01003_001E',
    2018: 'B01003_001E',
    2019: 'B01003_001E',
    2020: 'B01003_001E',
    2021: 'B01003_001E'
}

census_code_meaning='population_blocks'

# Run the API call
asyncio.run(url_to_dataframe_async_owners(2013, 2021, 
                                          fifty_states_list=fifty_states_list,
                                          census_code_dict=census_code_dict,
                                          df_list=df_list,
                                          census_code_meaning=census_code_meaning,
                                         get_blocks=True))

# Get merged dataframe
pop_by_blocks_raw = final_data_prep(df_list, census_code_meaning, blocks=True)
pop_pre_st = merge_with_crosswalk(pop_by_blocks_raw)


In [None]:
# Read in the dataframe
# Reloads the raw population table from disk so the API cell above does not have
# to be re-run (presumably this file is written by final_data_prep — confirm).
# The geography identifiers are forced to str; presumably to preserve leading zeros.
pop_by_blocks_raw = pd.read_csv(
    "datasets/cleaned_census_api_files/raw/population_blocks_raw.csv",
    encoding='utf-8',
    dtype={'geo_id':str, 'state':str, 'county':str, 
          'tract':str, 'block group':str, 'block':str})
# Re-attach the crosswalk columns (the standardization cell below reads 'BG20' from this frame)
pop_pre_st = merge_with_crosswalk(pop_by_blocks_raw)


#### Standardize

In [None]:
%%time
# This can and should be assigned to cloud parrallelization.
# The good thing is this function only needs to run once.
year_end = 2021
pop_dictionary = {}
array2 = pop_pre_st['BG20'].unique()

# Run the standardization function
[block_standardize(
    x, pop_dict=pop_dictionary, og_df=pop_pre_st, year_end=year_end) for x in array2]

saved_pop_dictionary = pop_dictionary.copy()

pop_standardized_df_1 = (pd.DataFrame.from_dict(saved_pop_dictionary, 
                       orient='index', 
                       columns=[str(i) for i in range(2013, year_end + 1)])
                       .reset_index()
                       .rename(columns={'index':'geoid_block'})
                      )

pop_standardized_df_1.to_csv('datasets/cleaned_census_api_files/raw/population_standardized_raw.csv',
                          encoding='utf-8',
                          index=False)

### Final dataframe clean up

In [None]:
# Reload the standardized population table saved by the previous cell,
# keeping geoid_block as a string
pop_standardized_df_1 = pd.read_csv('datasets/cleaned_census_api_files/raw/population_standardized_raw.csv',
                                  encoding='utf-8',
                                 dtype={'geoid_block':str})

# Run statistics function to get ratio and z-score
pop_standardized_df_1 = get_statistics(
    pop_standardized_df_1, begin_year=2013, end_year=2021)

# Add columns for state,county, and tract, and make sums for census tracts
pop_standardized_df_2 = specify_geographies(pop_standardized_df_1, 2013, 2021)
display(pop_standardized_df_2.head(1))

# Persist the final standardized table
pop_standardized_df_2.to_csv('datasets/cleaned_census_api_files/standardized/population_standardized.csv',
                          encoding='utf-8',
                          index=False)

# Plot zscores to check for anomalies
plot_zscores(pop_standardized_df_2['z_score'], 'Population')

# Create the shapefiles for block groups and tracts
# NOTE(review): the median-rent section of this notebook unpacks TWO values from
# create_and_save_geo_files (block geo, tract geo). Confirm that this call really
# returns a single tract GeoDataFrame and not a (block, tract) pair.
pop_tract_geo = create_and_save_geo_files(dataframe=pop_standardized_df_2, 
                                          name='population',
                                          begin_year=2013, 
                                          end_year=2021)



### Group Tracts By City

In [None]:
##### Use the below function and incorporate it into the main
##### census helper functions.

def group_tracts_by_city(tract_gdf, city_name=None, state_name=None,
                         min_overlap=0.5):
    """
    Keep the tracts whose area lies mostly inside a city boundary.

    Each tract is intersected with the city boundary shapefile, and an
    intersected piece is kept when it covers at least `min_overlap` of
    the original tract's area.

    Parameters:
        tract_gdf (GeoDataFrame): The geodataframe
            you want to split by city.
        city_name (str): Name of city. Currently unused: filtering by
            city is not implemented yet (see TODO below).
        state_name (str): Name of state. Currently unused, as above.
        min_overlap (float): Minimum share of a tract's area that must
            fall inside a city boundary for the tract to be kept.

    Returns:
        GeoDataFrame: The intersected tract pieces that pass the overlap
            threshold, with 'tract_area', 'intersect_area' and
            'overlap_share' columns added.
    """

    # Read in the city shapefile
    city_boundaries = gp.read_file(
        "datasets/census_original_files/cities_2020/all_cities_2020/all_tracts_2020.shp")

    # TODO: filter city_boundaries by city_name and state_name. The column
    # names of the shapefile are not visible here, so the filter is left
    # unimplemented rather than guessed.

    # Make copy so the caller's geodataframe is not modified
    df = tract_gdf.copy()

    ### Below is an algorithm to only keep tracts
    ### that are within 50% or more of a city's boundary

    # Step 1: Calculate area of each tract
    df['tract_area'] = df.area

    # Overlay needs both layers in the same CRS; reproject only when both are known
    if (df.crs is not None and city_boundaries.crs is not None
            and city_boundaries.crs != df.crs):
        city_boundaries = city_boundaries.to_crs(df.crs)

    # Step 2: Overlay the city boundaries over the tracts
    # ("intersection" is the valid overlay mode; "intersect" is rejected)
    df_2 = df.overlay(city_boundaries, how="intersection")

    # Step 3: Calculate new areas and the share of each tract inside the city
    df_2['intersect_area'] = df_2.area
    df_2['overlap_share'] = df_2['intersect_area'] / df_2['tract_area']

    # Step 4: Keep only the pieces that cover enough of their tract
    df_2 = df_2[df_2['overlap_share'] >= min_overlap].copy()

    display(df_2.head())

    return df_2

pop_tract_city_geo = group_tracts_by_city(pop_tract_geo)

In [None]:
pop_tract_geo

In [None]:
### Temporary code - can be deleted once file is created

# Goal: Create csv that matches states to FIPS codes



End of standardizing and merging population

---

## Median Gross Rent

$$\text{Estimated median of binned values} = l + \frac{n/2 - F}{f} \cdot w$$

#### Below is the Census Bureau's own published median gross rent (`B25064_001E`). Once we standardize, compare our estimated medians to the table below!

In [None]:
# %%time
# # Start list
# df_list = []
# nest_asyncio.apply()

# # Start session
# session = Session()

# # Define our API variable
# # It's within a dictionary because some variables
# # can change names from year to year (but not all)
# census_code_dict = {
#     2013: 'B25064_001E',
#     2014: 'B25064_001E',
#     2015: 'B25064_001E', # No change in code number this year
#     2016: 'B25064_001E',
#     2017: 'B25064_001E',
#     2018: 'B25064_001E',
#     2019: 'B25064_001E',
#     2020: 'B25064_001E',
#     2021: 'B25064_001E'
# }

# census_code_meaning='median_rent_2013_2021_blocks'

# # Run the API call
# asyncio.run(url_to_dataframe_async_owners(2013, 2021, 
#                                           fifty_states_list,
#                                           census_code_dict,
#                                           df_list=df_list,
#                                           census_code_meaning=census_code_meaning,
#                                          get_blocks=True,
#                                          ))

# # Get merged dataframe
# median_rent_raw = final_data_prep(df_list, census_code_meaning, blocks=True)



In [None]:
# Load the previously downloaded median-rent table; geography identifiers stay strings
median_rent_raw = pd.read_csv('datasets/cleaned_census_api_files/raw/median_rent_2013_2021_blocks_raw.csv',
                                   encoding='utf-8',
                                   dtype={'geo_id':str, 'state':str, 'county':str, 
                                          'tract':str, 'block group':str, 'block':str})
# Overwrite 'block' with the last 12 characters of geo_id
# (presumably the 12-digit block-group GEOID — confirm against the geo_id format)
median_rent_raw['block'] = median_rent_raw['geo_id'].apply(lambda x: str(x)[-12:])

# Helper from census_functions; presumably prints null counts/statistics per year
spot_check(median_rent_raw, 2013, 2021)

median_rent_raw.head(1)


#### Get Gross Rent for each category, then combine them! Try to get all codes for all years

In [None]:
# Build the B25063 variable codes for suffixes 003 through 026,
# one code per gross-rent category
rent_code_list = [f'B25063_{suffix:03d}E' for suffix in range(3, 27)]

rent_code_list

In [None]:
# %%time
# # Start list
# df_list = []
# nest_asyncio.apply()

# # Start session
# session = Session()

# # Define our API variable
# # It's within a dictionary because some variables
# # can change names from year to year (but not all)
# rent_code_list = []
# for i in range(3, 27):
#     if i < 10:
#         i = '0' + str(i)
#     rent_code_list.append(f'B25063_0{i}E')
    
# census_code_dict = {
#     2013: rent_code_list[:-3],
#     2014: rent_code_list[:-3],
#     2015: rent_code_list, 
#     2016: rent_code_list,
#     2017: rent_code_list,
#     2018: rent_code_list,
#     2019: rent_code_list,
#     2020: rent_code_list,
#     2021: rent_code_list
# }

# code_name_dict = {
#     'B25063_003E': 'rent_less_than_100',
#     'B25063_004E': 'rent_100_to_149',
#     'B25063_005E': 'rent_150_to_199',
#     'B25063_006E': 'rent_200_to_249',
#     'B25063_007E': 'rent_250_to_299',
#     'B25063_008E': 'rent_300_to_349',
#     'B25063_009E': 'rent_350_to_399',
#     'B25063_010E': 'rent_400_to_449',
#     'B25063_011E': 'rent_450_to_499',
#     'B25063_012E': 'rent_500_to_549',
#     'B25063_013E': 'rent_550_to_599',
#     'B25063_014E': 'rent_600_to_649',
#     'B25063_015E': 'rent_650_to_699',
#     'B25063_016E': 'rent_700_to_749',
#     'B25063_017E': 'rent_750_to_799',
#     'B25063_018E': 'rent_800_to_899',
#     'B25063_019E': 'rent_900_to_999',
#     'B25063_020E': 'rent_1000_to_1249',
#     'B25063_021E': 'rent_1250_to_1449',
#     'B25063_022E': 'rent_1500_to_1999',
#     'B25063_023E': 'rent_2000_to_2499',
#     'B25063_024E': 'rent_2500_to_2999',
#     'B25063_025E': 'rent_3000_to_3499',
#     'B25063_026E': 'rent_3500_or_more',
# }

# census_code_meaning='rent_distribution_blocks'

# # Run the API call
# asyncio.run(url_to_dataframe_async_owners(2013, 2021, 
#                                           fifty_states_list,
#                                           census_code_dict,
#                                           df_list=df_list,
#                                           census_code_meaning=census_code_meaning,
#                                          get_blocks=True,
#                                          multi_code=True,
#                                          code_name_dict=code_name_dict
#                                          ))

# # Get merged dataframe
# rent_distribution = final_data_prep(df_list, census_code_meaning, blocks=True)
# rent_dist_pre_st = merge_with_crosswalk(rent_dist)


In [None]:
# # Custom spot checking
# print("How many states:", len(rent_dist['state'].value_counts()), "\n")

# for code in code_name_dict:
    
#     name = code_name_dict[code]
    
#     print(f"Values for {name}")
    
#     if (name != 'rent_2500_to_2999') and (name != 'rent_3000_to_3499') and (name != 'rent_3500_or_more'):
        
#         # Check for null values
#         for i in range(2013, 2021):
#             print(f"{name} Null values in {i}:", rent_dist[rent_dist[f'{i}_{name}'].isnull()].shape[0])
#         print("\n")

#         # Check for null values in multiple years
#         for i in range(2013, 2020):
#             print(f"2020 and {i} {name} null values:", rent_dist[(rent_dist[f'2020_{name}'].isnull()) & (rent_dist[f'{i}_{name}'].isnull())].shape[0])
#         print("\n")

#         # Check stats
#         for i in range(2013, 2021):
#             print(f"{name} Stats for year {i}:\n", rent_dist[f'{i}_{name}'].describe(), "\n")
            
#     else:
        
#         # Check for null values
#         for i in range(2015, 2021):
#             print(f"{name} Null values in {i}:", rent_dist[rent_dist[f'{i}_{name}'].isnull()].shape[0])
#         print("\n")

#         # Check for null values in multiple years
#         for i in range(2015, 2020):
#             print(f"2020 and {i} {name} null values:", rent_dist[(rent_dist[f'2020_{name}'].isnull()) & (rent_dist[f'{i}_{name}'].isnull())].shape[0])
#         print("\n")

#         # Check stats
#         for i in range(2015, 2021):
#             print(f"{name} Stats for year {i}:\n", rent_dist[f'{i}_{name}'].describe(), "\n")
        

In [None]:
# Load the previously downloaded rent-distribution table (one column per
# year and rent category); geography identifiers stay strings
rent_dist = pd.read_csv('datasets/cleaned_census_api_files/raw/rent_distribution_blocks_raw.csv',
                        encoding='utf-8',
                       dtype={'geo_id':str, 'state':str, 'county':str, 
                                          'tract':str, 'block group':str, 'block':str})
# Attach the crosswalk columns; the standardization below groups on 'BG20'
# and weights by 'wt_hh' from this merged frame
rent_dist_pre_st = merge_with_crosswalk(rent_dist)
rent_dist_pre_st.head(1)

In [None]:
def _standardize_rent_category(bg20_df, filtered, weight, rent_category,
                               first_year, year_end):
    """Return one category's weighted pre-2020 values followed by its 2020+ values."""
    pre_2020_cols = [f"{i}_{rent_category}" for i in range(first_year, 2020)]
    post_2020_cols = [f"{i}_{rent_category}" for i in range(2020, year_end + 1)]

    # Weighted sum over the rows of bg20_df, one value per pre-2020 year:
    # (years x rows) . (rows,) -> (years,)
    dots = bg20_df[pre_2020_cols].to_numpy().T.dot(bg20_df[weight].to_numpy())

    # 2020+ values are taken as-is from the first row matching the block
    val_20s = filtered[post_2020_cols].iloc[0].to_numpy()

    # If the 2020 value is 0, then all years before then should be 0 also
    if val_20s[0] == 0:
        dots = dots * 0

    return np.append(dots, val_20s)


def block_standardize_medians_3(bg20_df,
                            og_df,
                            year_start,
                            year_end,
                            weight,
                            code_name_dict_2013_2014=False,
                            code_name_dict_2015_2021=False,
                            code_name_dict_all=False):

    """
    Standardize every rent category of one 2020 block group.

    WARNING: This function alone takes a few seconds to complete
    per block group, so standardizing every block group can take
    many hours. The block groups are independent of each other, so
    the calls can be run in any order and are a good candidate for
    parallel processing.

    For each rent category, the pre-2020 yearly values in `bg20_df`
    are combined into one value per year as a weighted sum (dot
    product with the `weight` column). The 2020+ values are read
    unchanged from the first row of `og_df` whose 'block' equals the
    block group id. If that row's 2020 value is 0, the pre-2020
    values are zeroed too.

    Example:

    ```
    my_groups = rent_dist_pre_st.groupby('BG20')
    results = [block_standardize_medians_3(
                   group, rent_dist_pre_st, 2013, 2021, 'wt_hh',
                   code_name_dict_all=code_name_dict)
               for _, group in my_groups]
    ```

    Parameters:
        bg20_df (DataFrame): Rows that share a single 'BG20' value
            (for example one group of `groupby('BG20')`). Must hold
            the weight column and the '<year>_<category>' columns.
        og_df (DataFrame): The full dataframe we are standardizing
            from; must hold a 'block' column and the 2020+
            '<year>_<category>' columns.
        year_start (int): Accepted for interface compatibility but
            currently unused; the pre-2020 ranges start at 2013
            (or 2015 for `code_name_dict_2015_2021`).
        year_end (int): Last year (inclusive) of the 2020+ values.
        weight (str): Which weight column to use (such as
            'wt_pop' or 'wt_hh').
        code_name_dict_2013_2014 (dict): {census code: category name}
            for categories available from 2013 on.
        code_name_dict_2015_2021 (dict): {census code: category name}
            for categories available only from 2015 on.
        code_name_dict_all (dict): If given, used instead of the two
            dictionaries above; every category starts in 2013.

    Returns:
        dict: {block group id: 1-D numpy array}. The array holds, for
            each category in dictionary order, the standardized
            pre-2020 values followed by the 2020..year_end values.

    Raises:
        IndexError: If no row of `og_df` has 'block' equal to the
            block group id.
    """

    block = bg20_df['BG20'].iloc[0]

    # Step 1: Clean the group and find the block's own row(s) for the 2020+ values
    bg20_df = bg20_df.drop_duplicates().fillna(0)
    filtered = og_df[og_df['block'] == block].copy().drop_duplicates()

    # Step 2: List every (category, first pre-2020 year) pair to process
    if code_name_dict_all is None or code_name_dict_all is False:
        categories = [(name, 2013)
                      for name in (code_name_dict_2013_2014 or {}).values()]
        categories += [(name, 2015)
                       for name in (code_name_dict_2015_2021 or {}).values()]
    else:
        categories = [(name, 2013) for name in code_name_dict_all.values()]

    # Steps 3-4: Standardize each category and join the pieces in order.
    # The leading empty float array keeps the result a float array even
    # when there are no categories.
    pieces = [np.array([])]
    pieces.extend(
        _standardize_rent_category(bg20_df, filtered, weight,
                                   rent_category, first_year, year_end)
        for rent_category, first_year in categories)

    return {block : np.concatenate(pieces)}

In [None]:
%%time
# Using list comprehension

begin_year = 2013
end_year = 2021

my_groups = rent_dist_pre_st.groupby('BG20')
keys = list(my_groups.groups.keys())

code_name_dict_2013_2014 = {
    'B25063_003E': 'rent_less_than_100',
    'B25063_004E': 'rent_100_to_149',
    'B25063_005E': 'rent_150_to_199',
    'B25063_006E': 'rent_200_to_249',
    'B25063_007E': 'rent_250_to_299',
    'B25063_008E': 'rent_300_to_349',
    'B25063_009E': 'rent_350_to_399',
    'B25063_010E': 'rent_400_to_449',
    'B25063_011E': 'rent_450_to_499',
    'B25063_012E': 'rent_500_to_549',
    'B25063_013E': 'rent_550_to_599',
    'B25063_014E': 'rent_600_to_649',
    'B25063_015E': 'rent_650_to_699',
    'B25063_016E': 'rent_700_to_749',
    'B25063_017E': 'rent_750_to_799',
    'B25063_018E': 'rent_800_to_899',
    'B25063_019E': 'rent_900_to_999',
    'B25063_020E': 'rent_1000_to_1249',
    'B25063_021E': 'rent_1250_to_1449',
    'B25063_022E': 'rent_1500_to_1999',
    'B25063_023E': 'rent_2000_to_2499'
}

code_name_dict_2015_2021 = {
    'B25063_024E': 'rent_2500_to_2999',
    'B25063_025E': 'rent_3000_to_3499',
    'B25063_026E': 'rent_3500_or_more'
}

dict2 = {}
dict3 = [block_standardize_medians_3(my_groups.get_group(keys[i]), 
                                rent_dist_pre_st, 
                                begin_year, 
                                end_year, 
                                'wt_hh', 
                                code_name_dict_2013_2014=code_name_dict_2013_2014, 
                                code_name_dict_2015_2021=code_name_dict_2015_2021)
        for i in range(len(keys))
       ]


for d in dict3:
    dict2.update(d)

column_list = []
for code in code_name_dict_2013_2014:
    for i in range(begin_year, end_year + 1):
        column_list.append(f'{i}_{code_name_dict_2013_2014[code]}')
for code in code_name_dict_2015_2021:
    for i in range(begin_year, end_year + 1):
        column_list.append(f'{i}_{code_name_dict_2015_2021[code]}')

gross_rent_df = (pd.DataFrame.from_dict(dict2, 
                   orient='index', columns=column_list)
                   .reset_index()
                   .rename(columns={'index':'geoid_block'}))

gross_rent_df.to_csv('datasets/cleaned_census_api_files/raw/gross_rent_standardized_raw.csv',
                          encoding='utf-8',
                          index=False)

gross_rent_df

In [None]:
# Save the standardized gross rent table.
# NOTE(review): the standardization cell above already writes this same file;
# this cell looks redundant unless gross_rent_df is modified in between.
gross_rent_df.to_csv('datasets/cleaned_census_api_files/raw/gross_rent_standardized_raw.csv',
                          encoding='utf-8',
                          index=False)


In [None]:
# Reload the standardized gross rent table so the slow standardization cell
# does not have to be re-run; geoid_block stays a string
gross_rent_df = pd.read_csv('datasets/cleaned_census_api_files/raw/gross_rent_standardized_raw.csv',
                          encoding='utf-8',
                          dtype={'geoid_block':str})

# Make geoid_block the index
# (leaves only the '<year>_rent_<category>' columns as data columns)
gross_rent_df.set_index('geoid_block', inplace=True)
gross_rent_df.head(2)


### Once I have the gross rents standardized, calculate the median value for each year using frequency tables

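$$\text{Estimated median of binned values} = l + \frac{n/2 - F}{f} \cdot w$$

where $n$ is the total number of observations (the sum of all bin frequencies), $l$ is the lower border of the median group, $F$ is the cumulative frequency up to (but not including) the median group, $f$ is the frequency of the median group, and $w$ is the width of the median group.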


In [None]:
### STEPS TO ESTIMATE MEDIAN VALUE
def _parse_bin_bounds(label):
    """Return (lower, upper) bounds parsed from a bin label such as '100_to_149'."""
    # 'less_than_N' covers 0..N; without this branch both bounds would parse as N
    if label.startswith('less_than_'):
        return 0, int(label.replace('less_than_', ''))

    # 'N_or_more' has no upper bound in its label, so both bounds parse as N (width 0)
    lower = int(re.sub(r'_to_\d+|less_than_|_or_more', '', label))
    upper = int(re.sub(r'\d+_to_|less_than_|_or_more', '', label))
    return lower, upper


def estimate_median(series, year, keyword):
    """
    Estimate the median of binned counts for a given year.

    Uses the grouped-data formula  l + ((n/2 - F) / f) * w  where n is
    the total count, l the lower bound of the median bin, F the
    cumulative count before the median bin, f the count of the median
    bin and w its width. Bounds are read literally from the bin labels.

    Parameters:
        series (Series): Counts per bin. The index labels must look
            like '<year>_<keyword>_<bin>' where <bin> is
            'less_than_N', 'A_to_B' or 'N_or_more', ordered from the
            lowest bin to the highest.
        year (int or str): Year prefix of the labels.
        keyword (str): Text between the year and the bin in each
            label (for example 'rent' or 'tract_rent').

    Returns:
        number: The estimated median. 0 when every bin is zero or
            missing. When the cumulative count reaches exactly n/2 at
            the end of a bin, the result is the midpoint between that
            bin's lower bound and the upper bound of the next
            non-empty bin. When the median falls in an 'N_or_more'
            bin, the result is N.

    Raises:
        ValueError: If no median bin can be located (for example
            when the counts sum to a negative number).
    """
    prefix = f'{str(year)}_{keyword}_'

    # Ignore empty bins. Missing (NaN) bins are ignored as well: otherwise a
    # NaN ahead of the median bin would poison the running total.
    counts = series[(series != 0) & series.notna()]
    if len(counts) == 0:
        return 0

    n = counts.sum()
    median_n = n / 2

    counts_by_label = counts.to_dict()
    labels = list(counts_by_label)

    # Walk the bins in order until the running total reaches n/2
    cumulative = 0
    median_label = None
    next_label = None  # only set when the running total lands exactly on n/2
    F = None
    for position, label in enumerate(labels):
        reached = cumulative + counts_by_label[label]
        if reached > median_n:
            median_label, F = label, cumulative
            break
        if reached == median_n:
            median_label, F = label, reached
            if position + 1 < len(labels):
                next_label = labels[position + 1]
            break
        cumulative = reached

    if median_label is None:
        raise ValueError("Could not locate a median bin; check for negative counts.")

    f = counts_by_label[median_label]
    l, h = _parse_bin_bounds(median_label.replace(prefix, ''))
    w = h - l

    # Offset of the median inside the median bin
    almost_median = ((median_n - F) / f) * w

    # The median sits exactly between two bins: average the lower bound of the
    # lower bin and the upper bound of the higher bin
    if (almost_median == 0) and (next_label is not None):
        _, h_higher = _parse_bin_bounds(next_label.replace(prefix, ''))
        return (l + h_higher) / 2

    return l + almost_median



In [None]:
gross_rent_medians = gross_rent_df.copy()

In [None]:
%%time
# Calculate gross rent for every year
# NOTE(review): range(2013, 2021) stops at 2020 although the standardized table
# also holds 2021 columns — confirm that leaving 2021 out is intended.
for i in range(2013, 2021):
    # All rent-category columns of this year (selected by the year substring)
    columns_year = gross_rent_df.columns[gross_rent_df.columns.str.contains(str(i))]
    # Row-wise median estimate; results go into the gross_rent_medians copy so
    # gross_rent_df keeps only the category columns
    gross_rent_medians[f'{i}_median'] = gross_rent_df[columns_year].apply(
        lambda x: estimate_median(x, str(i), 'rent'),
        axis=1
    )
gross_rent_medians

In [None]:
gross_rent_medians.to_csv('datasets/cleaned_census_api_files/raw/gross_rent_medians_raw.csv',
                          encoding='utf-8')

In [None]:
gross_rent_medians = pd.read_csv('datasets/cleaned_census_api_files/raw/gross_rent_medians_raw.csv',
                          encoding='utf-8',
                          dtype={'geoid_block':str}
                                )
gross_rent_medians.head(2)

In [None]:
# Keep only the block group id and the yearly median columns (2013-2020)
median_columns = [f'{median_year}_median' for median_year in range(2013, 2021)]
gross_rent_medians = gross_rent_medians[['geoid_block'] + median_columns]
gross_rent_medians.head(2)

In [None]:
# Treat 0 as "missing" and linearly interpolate across the year columns of
# each row, so a graph shows one continuous line between two points that
# are separated by "0" values.
def linearly_interpolate(dataframe,
                         grouping='block'):
    """
    Replace zeros with NaN and linearly interpolate along each row.

    Parameters:
        dataframe (DataFrame): Table with a 'geoid_<grouping>' column;
            every other column is interpolated across.
        grouping (str): Suffix of the id column ('block' or 'tract').

    Returns:
        DataFrame: An interpolated copy with the id column restored as
            a regular column. The input is not modified.
    """
    id_column = f'geoid_{grouping}'
    without_zeros = copy.deepcopy(dataframe).replace(0, np.nan)
    interpolated = without_zeros.set_index(id_column).interpolate(axis=1)
    return interpolated.reset_index()


In [None]:
median_rent_interpolated = linearly_interpolate(gross_rent_medians)
median_rent_interpolated

In [None]:
%%time
### Now get the median tract values
def get_tract_median_values(dataframe):
    """
    Sum block-group counts per tract, then estimate each year's tract median.

    Parameters:
        dataframe (DataFrame): Block-group table with 'geoid_block' as
            a column or as the index, plus '<year>_rent_<category>'
            count columns.

    Returns:
        DataFrame: A copy with 'geoid_tract', 'state', 'county', the
            per-tract sums ('<year>_tract_rent_<category>') and the
            estimated '<year>_tract_median' for 2013-2020 added.
    """
    tract_df = copy.deepcopy(dataframe)
    if 'geoid_block' not in tract_df.columns:
        tract_df = tract_df.reset_index()

    # Derive the parent geographies from the block group id
    block_ids = tract_df['geoid_block']
    tract_df['geoid_tract'] = block_ids.apply(lambda geoid: str(geoid)[:-1])
    tract_df['state'] = block_ids.apply(lambda geoid: str(geoid)[0:2])
    tract_df['county'] = block_ids.apply(lambda geoid: str(geoid)[2:5])

    # Calculate gross rent for every year
    for year in range(2013, 2021):
        year_label = str(year)

        # Snapshot this year's columns before the tract sums are added
        year_columns = tract_df.columns[tract_df.columns.str.contains(year_label)]
        for column in year_columns:
            category = column.replace(f'{year}_rent_', '')
            tract_totals = tract_df.groupby('geoid_tract')[f'{column}'].transform('sum')
            tract_df[f'{year}_tract_rent_{category}'] = tract_totals

        # Estimate the median from the tract-level sums of this year
        tract_columns = tract_df.columns[
            tract_df.columns.str.contains(f'{year_label}_tract')]
        tract_df[f'{year}_tract_median'] = tract_df[tract_columns].apply(
            lambda row: estimate_median(row, str(year), 'tract_rent'),
            axis=1
        )

    return tract_df

gross_rent_tract = get_tract_median_values(gross_rent_df)
gross_rent_tract

In [None]:
# One row per tract: keep the tract id and its yearly medians (2013-2020),
# then drop the rows repeated for each block group of the same tract
tract_median_columns = [f'{median_year}_tract_median' for median_year in range(2013, 2021)]
gross_rent_tract_2 = gross_rent_tract[['geoid_tract'] + tract_median_columns].drop_duplicates()
gross_rent_tract_2

In [None]:
median_tract_rent_interpolated = linearly_interpolate(gross_rent_tract_2, grouping='tract')
median_tract_rent_interpolated

In [None]:
median_rent_interpolated['geoid_tract'] = median_rent_interpolated['geoid_block'].apply(
    lambda x: str(x)[:-1])

median_rent_interpolated

In [None]:
median_rent_all = pd.merge(median_rent_interpolated,
                           median_tract_rent_interpolated,
                           left_on="geoid_tract",
                           right_on="geoid_tract",
                           how='inner')

median_rent_all

In [None]:
# Label the block-level medians explicitly, now that tract medians sit beside them
block_median_names = {f'{median_year}_median': f'{median_year}_block_median'
                      for median_year in range(2013, 2021)}
median_rent_all.rename(columns=block_median_names, inplace=True)

median_rent_all


In [None]:
# Slice the state, county and tract codes out of the block group id
geoid_parts = {'state': slice(0, 2), 'county': slice(2, 5), 'tract': slice(5, 11)}
for part_name, part_slice in geoid_parts.items():
    median_rent_all[part_name] = median_rent_all['geoid_block'].apply(
        lambda geoid, part_slice=part_slice: str(geoid)[part_slice])

median_rent_all

In [None]:
median_rent_all.to_csv('datasets/cleaned_census_api_files/standardized/median_rent_standardized.csv',
                       encoding='utf-8',
                       index=False)

In [None]:
median_rent_all = pd.read_csv('datasets/cleaned_census_api_files/standardized/median_rent_standardized.csv',
                               encoding='utf-8',
                             dtype={'geoid_block':str, 'geoid_tract':str})

median_rent_all


In [None]:
### Once I have the interpolated median tract values, merge with block level,
# Save, then run create_and_save_geo_files() function on it

# Create the shapefiles for block groups and tracts
# The helper's two return values are unpacked as (block-group geo, tract geo).
# end_year is 2020 here because medians were only estimated for 2013-2020.
median_rent_block_geo, median_rent_tract_geo = create_and_save_geo_files(dataframe=median_rent_all, 
                                                          name='median_rent',
                                                          keyword='median',
                                                          begin_year=2013, 
                                                          end_year=2020)

In [None]:
# Preview both geodataframes. Only the last bare expression of a cell is
# rendered, so each one is passed to display() explicitly.
display(median_rent_block_geo.head())
display(median_rent_tract_geo.head())

### End of getting data at the Tract level
---

# Get data at the MSA level

We will get the following:
1. Median Income
2. Median Unit Price
3. Median Rent
4. Total Units

using the ACS 1-Year survey.

In [None]:
# Last survey year requested by every MSA-level download below
end_year = 2021

### Get Median Income at the MSA level

In [None]:
# Download Census variable B19013_001E (median income) for MSAs through end_year
median_income_msa = download_and_format_msa_census_data(
    census_code="B19013_001E", census_code_meaning="median_income_msa", end_year=end_year)

median_income_msa

### Get Median Unit Value at the MSA Level

In [None]:
# Download Census variable B25077_001E (median unit value) for MSAs through end_year
median_price_msa = download_and_format_msa_census_data(
    census_code="B25077_001E", census_code_meaning="median_price_msa", end_year=end_year)

median_price_msa

### Get Median Rent at the MSA level

In [None]:
# Download Census variable B25058_001E (median rent) for MSAs through end_year
median_rent_msa = download_and_format_msa_census_data(
    census_code="B25058_001E", census_code_meaning="median_rent_msa", end_year=end_year)

median_rent_msa

### Get Total Units at the MSA level

In [None]:
# Download Census variable B25001_001E (total units) for MSAs through end_year
total_units_msa = download_and_format_msa_census_data(
    census_code="B25001_001E", census_code_meaning="total_units_msa", end_year=end_year)

total_units_msa

### Create Rent-to-Price Ratio dataset

In [None]:
# Year column labels (as strings) covered by the MSA downloads
msa_years = [str(year) for year in range(2010, end_year + 1)]

# Tag each year column with its measure so both tables can share one frame
median_rent_msa.rename(
    columns={year: f"{year}_rent" for year in msa_years}, inplace=True)
median_price_msa.rename(
    columns={year: f"{year}_price" for year in msa_years}, inplace=True)

# Pair rent and price for every MSA present in both tables
rent_to_price = pd.merge(
    median_rent_msa, median_price_msa,
    on=['msa_code', 'msa_name'], how='inner')

# For each year: ratio = median rent / median price, then discard the inputs
for year in msa_years:
    rent_col, price_col = f"{year}_rent", f"{year}_price"
    rent_to_price[year] = rent_to_price[rent_col].div(rent_to_price[price_col])
    rent_to_price = rent_to_price.drop(columns=[rent_col, price_col])

# Save dataset
rent_price_ratio_msa_path = "datasets/cleaned_census_api_files/msa_data/rent_price_ratio_msa.csv"
rent_to_price.to_csv(rent_price_ratio_msa_path, index=False)

rent_to_price

### Create Jobs per Unit dataset

In [None]:
# Read in the BLS jobs data, keeping identifier codes as strings
jobs = pd.read_csv('datasets/bls/raw/most_recent_bls_data.csv',
                   dtype={'msa_code':str, 'state_code':str})

# Make sure the date column is in datetime format
jobs['date'] = pd.to_datetime(jobs['date'])

# Strip the NECTA suffixes from the area names (longer suffix first so that
# " NECTA Division" is not left behind as " Division"). The vectorized
# string method leaves missing names untouched instead of raising.
jobs['msa_name'] = (jobs['msa_name']
                    .str.replace(" NECTA Division", "", regex=False)
                    .str.replace(" NECTA", "", regex=False))

# Keep only the December observation of each year
new_jobs = jobs[jobs['month']=='December'].reset_index(drop=True)

# First and last year that have a December observation
earliest_year = new_jobs['year'].min()
latest_year = new_jobs['year'].max()

# Only keep the columns needed for the pivot and name the measure
new_jobs = new_jobs[['msa_name','year','value']].rename(columns={'value':'jobs'})

# Pivot to one row per MSA and one column per year
new_jobs = new_jobs.set_index(['msa_name','year'])['jobs'].unstack('year')

# Label the year columns from the years actually present, so a gap in the
# December series cannot cause a length mismatch or shift the labels
new_jobs.columns = [f'{year}_jobs' for year in new_jobs.columns]
new_jobs = new_jobs.reset_index()

# Read in total units (msa_code as a string, consistent with the jobs data)
# and tag its year columns
total_units = pd.read_csv(
    "datasets/cleaned_census_api_files/msa_data/total_units_msa.csv",
    dtype={'msa_code':str})
total_units = total_units.rename(columns={
    f"{year}":f"{year}_units" for year in range(earliest_year, latest_year + 1)})

# Merge jobs and units on the MSA name
jobs_per_unit = new_jobs.merge(
    total_units, how='inner',
    on=['msa_name'])

# The two sources need not cover the same years; only years that have both a
# jobs column and a units column can produce a ratio
ratio_years = [
    year for year in range(earliest_year, latest_year + 1)
    if f"{year}_jobs" in jobs_per_unit.columns
    and f"{year}_units" in jobs_per_unit.columns]

skipped_years = sorted(
    set(range(earliest_year, latest_year + 1)) - set(ratio_years))
if skipped_years:
    print("Years without both jobs and units data (skipped):", skipped_years)

# Divide jobs by total units for each shared year
for year in ratio_years:
    jobs_per_unit[f'{year}'] = jobs_per_unit[f"{year}_jobs"]/jobs_per_unit[f"{year}_units"]

# Only keep the identifiers and the per-year ratios
jobs_per_unit = jobs_per_unit[['msa_name','msa_code'] +
    [f'{year}' for year in ratio_years]]

# Save dataset
jobs_per_unit.to_csv(
    "datasets/cleaned_census_api_files/msa_data/jobs_per_unit_msa.csv",
    index=False)

jobs_per_unit



## Get data at the City Level (at the ACS 5-Year level)

1. Population (B01001_001E)
2. Median Income
3. Median Unit Price
4. Median Rent
5. Total Units
6. Percent Renter Occupied
7. Total Employed (B23025_004E)

Create Manually:
1. Rent-to-Price Ratio
2. People-per-Units

In [None]:
# First and last survey year requested by every city-level download below
begin_year, end_year = 2011, 2021

### Get Population (City)

In [None]:
# Download Census variable B01001_001E (population), formatted per city
# rather than per MSA, for begin_year through end_year
population_city = download_and_format_msa_census_data(
    census_code="B01001_001E", census_code_meaning="population_city",
    begin_year=begin_year, end_year=end_year,
    format_msa=False, format_city=True)

population_city

### Get Median Income (city)

In [None]:
# Download Census variable B19013_001E (median income), formatted per city
# rather than per MSA, for begin_year through end_year
median_income_city = download_and_format_msa_census_data(
    census_code="B19013_001E", census_code_meaning="median_income_city",
    begin_year=begin_year, end_year=end_year,
    format_msa=False, format_city=True)

median_income_city

### Get Median Price (city)

In [None]:
# Download Census variable B25077_001E (median price), formatted per city
# rather than per MSA, for begin_year through end_year
median_price_city = download_and_format_msa_census_data(
    census_code="B25077_001E", census_code_meaning="median_price_city",
    begin_year=begin_year, end_year=end_year,
    format_msa=False, format_city=True)

median_price_city

### Get Median Rent (city)

In [None]:
# Download Census variable B25058_001E (median rent), formatted per city
# rather than per MSA, for begin_year through end_year
median_rent_city = download_and_format_msa_census_data(
    census_code="B25058_001E", census_code_meaning="median_rent_city",
    begin_year=begin_year, end_year=end_year,
    format_msa=False, format_city=True)

median_rent_city

### Get Total Units (city)

In [None]:
# Download Census variable B25001_001E (total units), formatted per city
# rather than per MSA, for begin_year through end_year
total_units_city = download_and_format_msa_census_data(
    census_code="B25001_001E", census_code_meaning="total_units_city",
    begin_year=begin_year, end_year=end_year,
    format_msa=False, format_city=True)

total_units_city

### Get Percent Renter Occupied (city)

1. First, get Total Occupied Units (B25002_002E).
2. Then, get Renter Occupied Units (B25003_003E).
3. Finally, divide Renter Occupied Units by Total Occupied Units to get Percent Renter Occupied.

In [None]:
# Download total occupied units (B25002_002E) per city
total_occupied_city = download_and_format_msa_census_data(
    census_code="B25002_002E",
    census_code_meaning="total_occupied_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
# A bare expression in the middle of a cell is not rendered, so this frame
# is shown explicitly
display(total_occupied_city)

# Download renter-occupied units (B25003_003E) per city
total_renter_occupied_city = download_and_format_msa_census_data(
    census_code="B25003_003E",
    census_code_meaning="total_renter_occupied_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
total_renter_occupied_city

In [None]:
### Manually get Percent Renter Occupied

# Year column labels (as strings) covered by the city downloads
city_years = [str(year) for year in range(begin_year, end_year + 1)]

# Tag each year column with its measure so both tables can share one frame
total_occupied_city.rename(
    columns={year: f"{year}_total_occupied" for year in city_years}, inplace=True)
total_renter_occupied_city.rename(
    columns={year: f"{year}_renter_occupied" for year in city_years}, inplace=True)

# Pair occupied and renter-occupied counts for every city present in both tables
percent_renter_city = pd.merge(
    total_occupied_city, total_renter_occupied_city,
    on=['name', 'geo_id'], how='inner')

# For each year: renter-occupied / total occupied, then discard the inputs.
# The result is stored as a fraction (it is not multiplied by 100).
for year in city_years:
    renter_col, occupied_col = f"{year}_renter_occupied", f"{year}_total_occupied"
    percent_renter_city[year] = percent_renter_city[renter_col].div(
        percent_renter_city[occupied_col])
    percent_renter_city = percent_renter_city.drop(columns=[renter_col, occupied_col])

# Save dataset
percent_renter_city_path = "datasets/cleaned_census_api_files/city_data/percent_renter_occupied_city.csv"
percent_renter_city.to_csv(percent_renter_city_path, index=False)

percent_renter_city


### Get Total Employed (city)

In [None]:
# Download Census variable B23025_004E (total employed), formatted per city
# rather than per MSA, for begin_year through end_year
total_employed_city = download_and_format_msa_census_data(
    census_code="B23025_004E", census_code_meaning="total_employed_city",
    begin_year=begin_year, end_year=end_year,
    format_msa=False, format_city=True)

total_employed_city

### Manually Create Rent-to-Price Ratio (city)


In [None]:
### Manually get Rent to Price Ratio

# Year column labels (as strings) covered by the city downloads
city_years = [str(year) for year in range(begin_year, end_year + 1)]

# Tag each year column with its measure so both tables can share one frame
median_rent_city.rename(
    columns={year: f"{year}_rent" for year in city_years}, inplace=True)
median_price_city.rename(
    columns={year: f"{year}_price" for year in city_years}, inplace=True)

# Pair rent and price for every city present in both tables
rent_price_ratio_city = pd.merge(
    median_rent_city, median_price_city,
    on=['name', 'geo_id'], how='inner')

# For each year: ratio = median rent / median price, then discard the inputs
for year in city_years:
    rent_col, price_col = f"{year}_rent", f"{year}_price"
    rent_price_ratio_city[year] = rent_price_ratio_city[rent_col].div(
        rent_price_ratio_city[price_col])
    rent_price_ratio_city = rent_price_ratio_city.drop(columns=[rent_col, price_col])

# Save dataset
rent_price_ratio_city_path = "datasets/cleaned_census_api_files/city_data/rent_price_ratio_city.csv"
rent_price_ratio_city.to_csv(rent_price_ratio_city_path, index=False)

rent_price_ratio_city


### Manually Create People-per-Units (city)


In [None]:
### Manually get People per Units

# Year column labels (as strings) covered by the city downloads
city_years = [str(year) for year in range(begin_year, end_year + 1)]

# Tag each year column with its measure so both tables can share one frame
population_city.rename(
    columns={year: f"{year}_population" for year in city_years}, inplace=True)
total_units_city.rename(
    columns={year: f"{year}_units" for year in city_years}, inplace=True)

# Pair population and unit counts for every city present in both tables
people_per_unit_city = pd.merge(
    population_city, total_units_city,
    on=['name', 'geo_id'], how='inner')

# For each year: people per unit = population / total units, then discard the inputs
for year in city_years:
    population_col, units_col = f"{year}_population", f"{year}_units"
    people_per_unit_city[year] = people_per_unit_city[population_col].div(
        people_per_unit_city[units_col])
    people_per_unit_city = people_per_unit_city.drop(columns=[population_col, units_col])

# Save dataset
people_per_unit_city_path = "datasets/cleaned_census_api_files/city_data/people_per_unit_city.csv"
people_per_unit_city.to_csv(people_per_unit_city_path, index=False)

people_per_unit_city
