In [1]:
# Import libraries

import sys
import re
import requests
import asyncio
import nest_asyncio
import aiohttp
import time
import warnings
from datetime import datetime

# Suppress unnecessary Shapely warning
warnings.filterwarnings('ignore',
                        '.*Shapely GEOS version.*')

from aiohttp import ClientSession
from requests import request, Session
from itertools import product, repeat
import os
from dotenv import load_dotenv
from os import getenv
from threading import Thread
import time
import inspect
import pandas as pd
import geopandas as gp
import shapely
import pygeos
from functools import reduce
from pandas.plotting import lag_plot
import pickle
import numpy as np
import seaborn as sns
import datetime as dt
import copy
import math
from decimal import Decimal
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator
import matplotlib.dates as mdates
import matplotlib.gridspec as gridspec
from matplotlib.gridspec import GridSpec
from matplotlib.offsetbox import AnchoredText
import matplotlib as mpl
import plotly.express as px
from sklearn.metrics import mean_absolute_error
from multiprocess import Process, Pool

# Spark library
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.types import *
from pyspark.sql.functions import col as spark_col

# Using findspark ensures the spark path is
# found and set, allowing us to use the
# appropriate JAR files to connect to AWS
import findspark
location = findspark.find()
findspark.init(location, edit_rc=True)

# AWS library
import boto3
import awswrangler as wr


# All helper functions are in this module:
from helper_functions.census_functions import *

# Set up Pandas defaults
pd.options.display.float_format = '{:.4f}'.format
pd.set_option("display.max_columns", None)


# Connect to S3
s3client = boto3.client("s3")
os.environ.setdefault("AWS_PROFILE", "default")
os.environ.setdefault("AWS_DEFAULT_REGION", "us-west-1")

# Set environment variables for LOCAL development ONLY!
os.environ.setdefault("ENVIRON", "LOCAL")

'LOCAL'

## Get Census Key

In [2]:
# Test getting census key
print("Census key:", census_key)


Census key: f25144e3809a49b45c5cc54e7d2bce532fe4ce99


### Create the directories for file-saving

In [3]:
create_folder("datasets/cleaned_census_api_files/")
create_folder("datasets/cleaned_census_api_files/graphable/")
create_folder("datasets/cleaned_census_api_files/raw/")
create_folder("datasets/cleaned_census_api_files/standardized/")

## Following Tutorial Below
Referencing [this](https://www.youtube.com/watch?v=LW-M_UC0VTE) tutorial.

Here is the [Census API](https://www.census.gov/data/developers/data-sets.html)

## Get Block Group Level Datasets

In [4]:
# Set global variables
start_year = 2013
end_year = 2021

### Get Population by block group for 2013-2020

In [32]:
# Define census code meaning
demographic = 'population'
census_code_meaning = f'{demographic}_blocks'

In [33]:
# Define our API variable
# It's within a dictionary because some variables
# can change names from year to year (but not all)
pop_census_code_dict = {
    2013: 'B01003_001E',
    2014: 'B01003_001E',
    2015: 'B01003_001E', # No change in code number this year
    2016: 'B01003_001E',
    2017: 'B01003_001E',
    2018: 'B01003_001E',
    2019: 'B01003_001E',
    2020: 'B01003_001E',
    2021: 'B01003_001E'
}

In [35]:
def download_and_format_blocks(
    census_code_meaning,
    census_code_dict,
    start_year=start_year,
    end_year=end_year,
    fifty_states_list=fifty_states_list
):
    
    # Start list
    df_list = []
    nest_asyncio.apply()

    # Start session
    session = Session()

    # Run the API call
    asyncio.run(url_to_dataframe_async_owners(start_year, end_year, 
                                              fifty_states_list=fifty_states_list,
                                              census_code_dict=census_code_dict,
                                              df_list=df_list,
                                              census_code_meaning=census_code_meaning,
                                             get_blocks=True))

    # Get merged dataframe
    by_blocks_raw = final_data_prep(df_list, census_code_meaning, blocks=True)
    pre_st = merge_with_crosswalk(by_blocks_raw)

    # Drop name and block group
    pre_st.drop(columns=['name','block group'], inplace=True)

    # Save crosswalked file
    pre_st.to_csv(f"datasets/cleaned_census_api_files/raw/merged_with_crosswalk/{census_code_meaning}_raw_crosswalked.csv",
                     index=False)

    ### Save pop_pre_st to S3

    # Make block-group s3 path
    block_group_s3_path = "s3://real-estate-wolff/census-data/block-groups/raw"
    filename = f"{census_code_meaning}_raw.csv"
    s3_path = f"{block_group_s3_path}/{filename}"

    # Save to s3
    wr.s3.to_csv(pre_st, s3_path, index=False)


In [36]:
download_and_format_blocks(
    census_code_meaning=census_code_meaning,
    census_code_dict=pop_census_code_dict
)

Working on year: 2013 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2013/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2013 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2013/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2013,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",637.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1171.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",1383.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",972.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2366.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",1276.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",3437.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",1286.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",2332.0000,72,153,750602,1,721537506021


Unnamed: 0_level_0,name,2013,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",637.0,1,1,20100,1,10010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1171.0,1,1,20100,2,10010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",1383.0,1,1,20200,1,10010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",972.0,1,1,20200,2,10010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2366.0,1,1,20300,1,10010203001


Working on year: 2014 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2014/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2014 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2014/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2014,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",676.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1224.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",1289.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",1053.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2376.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",822.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",2940.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",1612.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",2351.0000,72,153,750602,1,721537506021


geo_id
1500000US010010201001    676.0000
1500000US010010201002   1224.0000
1500000US010010202001   1289.0000
1500000US010010202002   1053.0000
1500000US010010203001   2376.0000
Name: 2014, dtype: float64

Working on year: 2015 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2015/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2015 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2015/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2015,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US060170303011,"Block Group 1, Census Tract 303.01, El Dorado ...",606.0000,06,017,030301,1,060170303011
1500000US060170303012,"Block Group 2, Census Tract 303.01, El Dorado ...",621.0000,06,017,030301,2,060170303012
1500000US060170303013,"Block Group 3, Census Tract 303.01, El Dorado ...",957.0000,06,017,030301,3,060170303013
1500000US060170303021,"Block Group 1, Census Tract 303.02, El Dorado ...",888.0000,06,017,030302,1,060170303021
1500000US060170303022,"Block Group 2, Census Tract 303.02, El Dorado ...",960.0000,06,017,030302,2,060170303022
...,...,...,...,...,...,...,...
1500000US530330263002,"Block Group 2, Census Tract 263, King County, ...",1016.0000,53,033,026300,2,530330263002
1500000US530330264001,"Block Group 1, Census Tract 264, King County, ...",699.0000,53,033,026400,1,530330264001
1500000US530330264002,"Block Group 2, Census Tract 264, King County, ...",1548.0000,53,033,026400,2,530330264002
1500000US530330264003,"Block Group 3, Census Tract 264, King County, ...",1304.0000,53,033,026400,3,530330264003


geo_id
1500000US060170303011   606.0000
1500000US060170303012   621.0000
1500000US060170303013   957.0000
1500000US060170303021   888.0000
1500000US060170303022   960.0000
Name: 2015, dtype: float64

Working on year: 2016 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2016/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2016 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2016/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2016,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010479561001,"Block Group 1, Census Tract 9561, Dallas Count...",190.0000,01,047,956100,1,010479561001
1500000US010479561002,"Block Group 2, Census Tract 9561, Dallas Count...",1815.0000,01,047,956100,2,010479561002
1500000US010479561003,"Block Group 3, Census Tract 9561, Dallas Count...",1604.0000,01,047,956100,3,010479561003
1500000US010479561004,"Block Group 4, Census Tract 9561, Dallas Count...",2120.0000,01,047,956100,4,010479561004
1500000US010479561005,"Block Group 5, Census Tract 9561, Dallas Count...",592.0000,01,047,956100,5,010479561005
...,...,...,...,...,...,...,...
1500000US721130730023,"Block Group 3, Census Tract 730.02, Ponce Muni...",2478.0000,72,113,073002,3,721130730023
1500000US721130730031,"Block Group 1, Census Tract 730.03, Ponce Muni...",2073.0000,72,113,073003,1,721130730031
1500000US721130730041,"Block Group 1, Census Tract 730.04, Ponce Muni...",912.0000,72,113,073004,1,721130730041
1500000US721130730042,"Block Group 2, Census Tract 730.04, Ponce Muni...",752.0000,72,113,073004,2,721130730042


geo_id
1500000US010479561001    190.0000
1500000US010479561002   1815.0000
1500000US010479561003   1604.0000
1500000US010479561004   2120.0000
1500000US010479561005    592.0000
Name: 2016, dtype: float64

Working on year: 2017 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2017/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2017 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2017/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2017,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",692.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1153.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",1020.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",1152.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2555.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",740.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",2348.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",1292.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",2055.0000,72,153,750602,1,721537506021


geo_id
1500000US010010201001    692.0000
1500000US010010201002   1153.0000
1500000US010010202001   1020.0000
1500000US010010202002   1152.0000
1500000US010010203001   2555.0000
Name: 2017, dtype: float64

Working on year: 2018 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2018/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2018 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2018/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2018,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",636.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1287.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",810.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",1218.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2641.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",699.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",2432.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",976.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",1707.0000,72,153,750602,1,721537506021


geo_id
1500000US010010201001    636.0000
1500000US010010201002   1287.0000
1500000US010010202001    810.0000
1500000US010010202002   1218.0000
1500000US010010203001   2641.0000
Name: 2018, dtype: float64

Working on year: 2019 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2019/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2019 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2019/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2019,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",730.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1263.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",835.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",1124.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2774.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",883.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",2523.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",991.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",1577.0000,72,153,750602,1,721537506021


geo_id
1500000US010010201001    730.0000
1500000US010010201002   1263.0000
1500000US010010202001    835.0000
1500000US010010202002   1124.0000
1500000US010010203001   2774.0000
Name: 2019, dtype: float64

Working on year: 2020 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2020/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2020 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2020/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2020,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",674.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1267.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",706.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",1051.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2912.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",1825.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",1504.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",1276.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",1410.0000,72,153,750602,1,721537506021


geo_id
1500000US010010201001    674.0000
1500000US010010201002   1267.0000
1500000US010010202001    706.0000
1500000US010010202002   1051.0000
1500000US010010203001   2912.0000
Name: 2020, dtype: float64

Working on year: 2021 and states:
 01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26
URL: https://api.census.gov/data/2021/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:01,02,03,04,05,06,07,08,09,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
Working on year: 2021 and states:
 27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72
URL: https://api.census.gov/data/2021/acs/acs5?get=NAME,GEO_ID,B01003_001E&for=block%20group:*&in=state:27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,53,54,55,56,72&in=county:*&in=tract:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99


Unnamed: 0_level_0,name,2021,state,county,tract,block group,block
geo_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1500000US010010201001,"Block Group 1, Census Tract 201, Autauga Count...",693.0000,01,001,020100,1,010010201001
1500000US010010201002,"Block Group 2, Census Tract 201, Autauga Count...",1098.0000,01,001,020100,2,010010201002
1500000US010010202001,"Block Group 1, Census Tract 202, Autauga Count...",844.0000,01,001,020200,1,010010202001
1500000US010010202002,"Block Group 2, Census Tract 202, Autauga Count...",1166.0000,01,001,020200,2,010010202002
1500000US010010203001,"Block Group 1, Census Tract 203, Autauga Count...",2685.0000,01,001,020300,1,010010203001
...,...,...,...,...,...,...,...
1500000US721537506011,"Block Group 1, Census Tract 7506.01, Yauco Mun...",1894.0000,72,153,750601,1,721537506011
1500000US721537506012,"Block Group 2, Census Tract 7506.01, Yauco Mun...",1820.0000,72,153,750601,2,721537506012
1500000US721537506013,"Block Group 3, Census Tract 7506.01, Yauco Mun...",1213.0000,72,153,750601,3,721537506013
1500000US721537506021,"Block Group 1, Census Tract 7506.02, Yauco Mun...",1295.0000,72,153,750602,1,721537506021


geo_id
1500000US010010201001    693.0000
1500000US010010201002   1098.0000
1500000US010010202001    844.0000
1500000US010010202002   1166.0000
1500000US010010203001   2685.0000
Name: 2021, dtype: float64

Completed data prep.


In [6]:
# Read in the dataframe
pop_pre_st = pd.read_csv(
    "datasets/cleaned_census_api_files/raw/merged_with_crosswalk/population_blocks_raw_crosswalked.csv",
    encoding='utf-8',
    dtype={'geo_id':str, 'state':str, 'county':str, 
          'tract':str, 'block':str, 'BG10':str, 'BG20':str,
          'TRACT20':str, 'TRACT10':str})
pop_pre_st


Unnamed: 0,geo_id,2013,state,county,tract,block,2014,2015,2016,2017,2018,2019,2020,2021,BG10,BG20,wt_pop,wt_hu,wt_adult,wt_fam,wt_hh,parea,TRACT20,TRACT10
0,1500000US010010201001,637.0000,01,001,01001020100,010010201001,676.0000,649.0000,745.0000,692.0000,636.0000,730.0000,674.0000,693.0000,010010201001,010010201001,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,01001020100,01001020100
1,1500000US010010201001,637.0000,01,001,01001020100,010010201001,676.0000,649.0000,745.0000,692.0000,636.0000,730.0000,674.0000,693.0000,010010201001,010010201001,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,01001020100,01001020100
2,1500000US010010201002,1171.0000,01,001,01001020100,010010201002,1224.0000,1299.0000,1265.0000,1153.0000,1287.0000,1263.0000,1267.0000,1098.0000,010010201002,010010201002,1.0000,1.0000,1.0000,1.0000,1.0000,0.9988,01001020100,01001020100
3,1500000US010010201002,1171.0000,01,001,01001020100,010010201002,1224.0000,1299.0000,1265.0000,1153.0000,1287.0000,1263.0000,1267.0000,1098.0000,010010201002,010010208031,0.0000,0.0000,0.0000,0.0000,0.0000,0.0012,01001020803,01001020100
4,1500000US010010202001,1383.0000,01,001,01001020200,010010202001,1289.0000,1074.0000,960.0000,1020.0000,810.0000,835.0000,706.0000,844.0000,010010202001,010010202001,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,01001020200,01001020200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561867,1500000US721537506012,3437.0000,72,153,72153750601,721537506012,2940.0000,2921.0000,2703.0000,2348.0000,2432.0000,2523.0000,1504.0000,1820.0000,721537506012,721537506013,0.6173,0.6167,0.6037,0.6202,0.6077,0.8305,72153750601,72153750601
561868,1500000US721537506013,1286.0000,72,153,72153750601,721537506013,1612.0000,1367.0000,1195.0000,1292.0000,976.0000,991.0000,1276.0000,1213.0000,721537506013,721537506011,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,72153750601,72153750601
561869,1500000US721537506021,2332.0000,72,153,72153750602,721537506021,2351.0000,1994.0000,2005.0000,2055.0000,1707.0000,1577.0000,1410.0000,1295.0000,721537506021,721537506021,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,72153750602,72153750602
561870,1500000US721537506022,856.0000,72,153,72153750602,721537506022,916.0000,747.0000,736.0000,946.0000,804.0000,648.0000,801.0000,894.0000,721537506022,721537506022,1.0000,1.0000,1.0000,1.0000,1.0000,1.0000,72153750602,72153750602


### After EMR Standardization, Final dataframe clean up

In [27]:
# Call in standardized dataframe so we can add tracts
s3_path = f"s3://real-estate-wolff/census-data/block-groups/standardized/{census_code_meaning}_standardized.csv/"
pop_std_1 = wr.s3.read_csv(
    s3_path, dtype={'geoid_block':str})

# Add columns for state,county, and tract, and make sums for census tracts
pop_std_2 = specify_geographies(pop_std_1, start_year, end_year)

pop_std_2.to_csv(f'datasets/cleaned_census_api_files/standardized/{demographic}_standardized.csv',
                          encoding='utf-8',
                          index=False)

# Make block-group s3 path
block_group_s3_path = "s3://real-estate-wolff/census-data/block-groups/standardized_with_tracts"
filename = f"{demographic}_standardized.csv"
s3_path = f"{block_group_s3_path}/{filename}"

# Save to s3
wr.s3.to_csv(pop_std_2, s3_path, index=False)

pop_std_2


Unnamed: 0,geoid_block,2013_block,2014_block,2015_block,2016_block,2017_block,2018_block,2019_block,2020_block,2021_block,geoid_tract,state,county,tract,2013_tract,2014_tract,2015_tract,2016_tract,2017_tract,2018_tract,2019_tract,2020_tract,2021_tract
0,010010204002,1895.0000,1884.0000,2228.0000,2303.0000,2217.0000,1903.0000,1756.0000,1713.0000,1655.0000,01001020400,01,001,020400,4403.0000,4272.0000,4423.0000,4563.0000,4267.0000,3831.0000,3878.0000,3539.0000,3802.0000
1,010030101001,738.5336,648.1151,576.7021,533.9831,638.6208,708.7839,409.3600,418.0000,573.0000,01003010100,01,003,010100,3424.0000,3427.0000,3829.0000,4013.0001,3988.0000,4255.0000,3432.9999,3266.0000,3244.0000
2,010030101003,1518.0000,1477.0000,1346.0000,1490.0000,1554.0000,1482.0000,1295.0000,1315.0000,1179.0000,01003010100,01,003,010100,3424.0000,3427.0000,3829.0000,4013.0001,3988.0000,4255.0000,3432.9999,3266.0000,3244.0000
3,010030109031,1515.0000,1693.0000,1543.0000,1467.0000,1648.0000,1325.0000,1284.0000,1719.0000,2033.0000,01003010903,01,003,010903,5628.0000,5820.0000,5810.0000,5953.0000,5688.0000,5789.0000,6235.0000,6667.0000,7620.0000
4,010030109063,1196.0000,1251.0000,1192.0000,1362.0000,1323.0000,1214.0000,1038.0000,901.0000,842.0000,01003010906,01,003,010906,4590.0000,5047.0000,5029.0000,5272.0000,5473.0000,5251.0000,5188.0000,5337.0000,4738.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242330,721270061011,1426.0000,1081.0000,1166.0000,1363.0000,1260.0000,1284.0000,1253.0000,1089.0000,752.0000,72127006101,72,127,006101,4000.0000,3606.0000,3614.0000,3843.0000,3729.0000,3677.0000,3471.0000,3377.0000,3092.0000
242331,721270080023,1937.3621,2086.4407,2024.4344,1809.3918,1761.8978,1661.6326,1536.3010,1175.0000,1311.0000,72127008002,72,127,008002,5057.9999,5334.0000,5087.9999,4654.9999,4544.9999,4034.9999,3594.9999,3364.0000,3682.0000
242332,721279801041,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,72127980104,72,127,980104,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
242333,721292203002,531.0000,557.0000,550.0000,652.0000,732.0000,877.0000,842.0000,811.0000,864.0000,72129220300,72,129,220300,1599.0752,1572.0724,1618.0779,1766.0756,1768.0782,1773.0602,1653.0474,1530.0000,1629.0000


### Group Blocks or Tracts By City

In [None]:
##### Use the below function and incorporate it into the main
##### cencus helper functions. 

def group_tracts_by_city(tract_gdf, city_name, state_name):
    """
    Group tracts by a given city and save them as
    an individual dataframes.
    
    Parameters:
        tract_gdf (GeoDataFrame): The geodataframe
            you want to split by city.
        city_name (str): Name of city.
        state_name (str): Name of state.
    
    Returns:
        None.
    """
    
    # Read in the city shapefile
    city_boundaries = gp.read_file(
        "datasets/census_original_files/cities_2020/all_cities_2020/all_tracts_2020.shp")
    
    # Filter by city name and state
    
    # Make copy
    df = tract_gdf.copy()
    
    ### Below is an algorithm to only keep tracts
    ### that are within 50% or more of a city's boundary
    
    # Step 1: Calculate area of each tract
    df['tract_area'] = df.area
    
    # Step 2: Overlay the city boundaries over the tracts
    df_2 = df.overlay(city_boundaries, how="intersect")
    
    display(df_2)
    
    # Step 3: Calculate new areas
    df_2['area_']
    
group_tracts_by_city(pop_tract_geo)

In [None]:
pop_tract_geo

End of standardizing and merging population

---

## Median Gross Rent

Estimated Median of bin values = 𝑙 + (𝑛/2 − 𝐹)/𝑓 ⋅ 𝑤

#### Below is their "Median Value." Once we standardize, compare the values to this table below!

In [None]:
# %%time
# # Start list
# df_list = []
# nest_asyncio.apply()

# # Start session
# session = Session()

# # Define our API variable
# # It's within a dictionary because some variables
# # can change names from year to year (but not all)
# census_code_dict = {
#     2013: 'B25064_001E',
#     2014: 'B25064_001E',
#     2015: 'B25064_001E', # No change in code number this year
#     2016: 'B25064_001E',
#     2017: 'B25064_001E',
#     2018: 'B25064_001E',
#     2019: 'B25064_001E',
#     2020: 'B25064_001E',
#     2021: 'B25064_001E'
# }

# census_code_meaning='median_rent_2013_2021_blocks'

# # Run the API call
# asyncio.run(url_to_dataframe_async_owners(2013, 2021, 
#                                           fifty_states_list,
#                                           census_code_dict,
#                                           df_list=df_list,
#                                           census_code_meaning=census_code_meaning,
#                                          get_blocks=True,
#                                          ))

# # Get merged dataframe
# median_rent_raw = final_data_prep(df_list, census_code_meaning, blocks=True)



In [None]:
median_rent_raw = pd.read_csv('datasets/cleaned_census_api_files/raw/median_rent_2013_2021_blocks_raw.csv',
                                   encoding='utf-8',
                                   dtype={'geo_id':str, 'state':str, 'county':str, 
                                          'tract':str, 'block group':str, 'block':str})
median_rent_raw['block'] = median_rent_raw['geo_id'].apply(lambda x: str(x)[-12:])

spot_check(median_rent_raw, 2013, 2021)

median_rent_raw.head(1)


#### Get Gross Rent for each category, then combine them! Try to get all codes for all years

In [None]:
# Get the Gross Rent Code for each year
rent_code_list = []
for i in range(3, 27):
    if i < 10:
        i = '0' + str(i)
    rent_code_list.append(f'B25063_0{i}E')
    
rent_code_list

In [None]:
# %%time
# # Start list
# df_list = []
# nest_asyncio.apply()

# # Start session
# session = Session()

# # Define our API variable
# # It's within a dictionary because some variables
# # can change names from year to year (but not all)
# rent_code_list = []
# for i in range(3, 27):
#     if i < 10:
#         i = '0' + str(i)
#     rent_code_list.append(f'B25063_0{i}E')
    
# census_code_dict = {
#     2013: rent_code_list[:-3],
#     2014: rent_code_list[:-3],
#     2015: rent_code_list, 
#     2016: rent_code_list,
#     2017: rent_code_list,
#     2018: rent_code_list,
#     2019: rent_code_list,
#     2020: rent_code_list,
#     2021: rent_code_list
# }

# code_name_dict = {
#     'B25063_003E': 'rent_less_than_100',
#     'B25063_004E': 'rent_100_to_149',
#     'B25063_005E': 'rent_150_to_199',
#     'B25063_006E': 'rent_200_to_249',
#     'B25063_007E': 'rent_250_to_299',
#     'B25063_008E': 'rent_300_to_349',
#     'B25063_009E': 'rent_350_to_399',
#     'B25063_010E': 'rent_400_to_449',
#     'B25063_011E': 'rent_450_to_499',
#     'B25063_012E': 'rent_500_to_549',
#     'B25063_013E': 'rent_550_to_599',
#     'B25063_014E': 'rent_600_to_649',
#     'B25063_015E': 'rent_650_to_699',
#     'B25063_016E': 'rent_700_to_749',
#     'B25063_017E': 'rent_750_to_799',
#     'B25063_018E': 'rent_800_to_899',
#     'B25063_019E': 'rent_900_to_999',
#     'B25063_020E': 'rent_1000_to_1249',
#     'B25063_021E': 'rent_1250_to_1449',
#     'B25063_022E': 'rent_1500_to_1999',
#     'B25063_023E': 'rent_2000_to_2499',
#     'B25063_024E': 'rent_2500_to_2999',
#     'B25063_025E': 'rent_3000_to_3499',
#     'B25063_026E': 'rent_3500_or_more',
# }

# census_code_meaning='rent_distribution_blocks'

# # Run the API call
# asyncio.run(url_to_dataframe_async_owners(2013, 2021, 
#                                           fifty_states_list,
#                                           census_code_dict,
#                                           df_list=df_list,
#                                           census_code_meaning=census_code_meaning,
#                                          get_blocks=True,
#                                          multi_code=True,
#                                          code_name_dict=code_name_dict
#                                          ))

# # Get merged dataframe
# rent_distribution = final_data_prep(df_list, census_code_meaning, blocks=True)
# rent_dist_pre_st = merge_with_crosswalk(rent_dist)


In [None]:
# # Custom spot checking
# print("How many states:", len(rent_dist['state'].value_counts()), "\n")

# for code in code_name_dict:
    
#     name = code_name_dict[code]
    
#     print(f"Values for {name}")
    
#     if (name != 'rent_2500_to_2999') and (name != 'rent_3000_to_3499') and (name != 'rent_3500_or_more'):
        
#         # Check for null values
#         for i in range(2013, 2021):
#             print(f"{name} Null values in {i}:", rent_dist[rent_dist[f'{i}_{name}'].isnull()].shape[0])
#         print("\n")

#         # Check for null values in multiple years
#         for i in range(2013, 2020):
#             print(f"2020 and {i} {name} null values:", rent_dist[(rent_dist[f'2020_{name}'].isnull()) & (rent_dist[f'{i}_{name}'].isnull())].shape[0])
#         print("\n")

#         # Check stats
#         for i in range(2013, 2021):
#             print(f"{name} Stats for year {i}:\n", rent_dist[f'{i}_{name}'].describe(), "\n")
            
#     else:
        
#         # Check for null values
#         for i in range(2015, 2021):
#             print(f"{name} Null values in {i}:", rent_dist[rent_dist[f'{i}_{name}'].isnull()].shape[0])
#         print("\n")

#         # Check for null values in multiple years
#         for i in range(2015, 2020):
#             print(f"2020 and {i} {name} null values:", rent_dist[(rent_dist[f'2020_{name}'].isnull()) & (rent_dist[f'{i}_{name}'].isnull())].shape[0])
#         print("\n")

#         # Check stats
#         for i in range(2015, 2021):
#             print(f"{name} Stats for year {i}:\n", rent_dist[f'{i}_{name}'].describe(), "\n")
        

In [None]:
rent_dist = pd.read_csv('datasets/cleaned_census_api_files/raw/rent_distribution_blocks_raw.csv',
                        encoding='utf-8',
                       dtype={'geo_id':str, 'state':str, 'county':str, 
                                          'tract':str, 'block group':str, 'block':str})
rent_dist_pre_st = merge_with_crosswalk(rent_dist)
rent_dist_pre_st.head(1)

In [None]:
def block_standardize_medians_3(bg20_df,
                            og_df,
                            year_start,
                            year_end,
                            weight,
                            code_name_dict_2013_2014=False,
                            code_name_dict_2015_2021=False,
                            code_name_dict_all=False):

    """
    WARNING: This function alone takes a few seconds to complete
    per block group, but to standard all 242,333 block groups
    can take many, many hours to run.
    It would be wise to run this function on any type of
    parrallel processing, such as using Dask, or a GPU,
    or parrallelized cloud computing, as there is no
    serialization (the block groups can be standardized
    in no particular order).
    
    This function standardizes all block group rows. It 
    should be called in a loop or vectorized if possible,
    such as the example below. (Note, the example below
    may not be the most efficient way to loop through
    or vectorize the block groups.)
    
    ```
    # Loop through all population block groups
    # and standardize them
    pop_dictionary = {}
    array2 = pop_pre_st['BG20'].unique()
    [block_standardize(
            x, 
            pop_dict=pop_dictionary, 
            og_df=pop_pre_st) 
        for x in array2]
    ```
    
    Parameters:
        tuple (tuple): A tuple containing the below.
            block (str): The block_group to group by.
            og_df (DataFrame): The dataframe we are 
                standardizing from.
            year_start (int): Which year to start from.
            year_end (int): Which year to end from.
            weight (str): Which weight to use (such as 
                'wt_pop' pr 'wt_hh').
    
    Returns:
        None. However, it appends the standardized values
            per block group to a pre-defined dictionary.
    """

    block = bg20_df['BG20'].iloc[0]
        
    # Step 1: Get a dataframe grouped by BG20
    bg20_df = bg20_df.drop_duplicates()
    bg20_df = bg20_df.fillna(0)
    filtered = og_df[og_df['block']==block].copy().drop_duplicates()
    
    # make sure the final dict is in the form of {block : dots}
    return_array = np.array([])
    
    if code_name_dict_all == False:
    
        # Step 2: Loop through the code names that 2013 and 2014 are guaranteed to have
        for code in code_name_dict_2013_2014:

            rent_category = code_name_dict_2013_2014[code]
            years_13_19 = [f"{i}_{rent_category}" for i in range(2013, 2020)]

            # Step 3: Get dot product of 2013-2019 values with the target weight values values
            array_13_19 = bg20_df[years_13_19].to_numpy().T
            wt_array = bg20_df[weight].to_numpy()
            dots = array_13_19.dot(wt_array)

            # Step 4: Append standardized 2013-2019 and 
            # the block's 2020+ values to new dictionary
            val_20s = filtered[[
                f"{i}_{rent_category}" for i in 
                range(2020, year_end + 1)]].iloc[0].to_numpy()
            
            # If the 2020 value is 0, then all years
            # before then should be 0 also
            if val_20s[0] == 0:
                dots = dots * 0

            # val_20 = filtered[f'2020_{rent_category}'].iloc[0]
            
            # Finalize the append
            dots = np.append(dots, val_20s)

            # Save a copy of the array
            set_dots = dots.copy()

            # Update return_dictionary
            return_array = np.append(return_array, set_dots)

        # Step 2: Loop through the code names that 2013 and 2014 won't have
        for code in code_name_dict_2015_2021:

            rent_category = code_name_dict_2015_2021[code]
            years_15_19 = [f"{i}_{rent_category}" for i in range(2015, 2020)]

            # Step 3: Get dot product of 2015-2019 values with the target weight values values
            array_15_19 = bg20_df[years_15_19].to_numpy().T
            wt_array = bg20_df[weight].to_numpy()
            dots = array_15_19.dot(wt_array)
            
            # Step 4: Append standardized 2013-2019 and 
            # the block's 2020+ values to new dictionary
            val_20s = filtered[[
                f"{i}_{rent_category}" for i in 
                range(2020, year_end + 1)]].iloc[0].to_numpy()
            
            # If the 2020 value is 0, then all years
            # before then should be 0 also
            if val_20s[0] == 0:
                dots = dots * 0
            
            # Finalize the append
            dots = np.append(dots, val_20s)

            # Save a copy of the array
            set_dots = dots.copy()

            # Update return_dictionary
            return_array = np.append(return_array, set_dots)
            
    else: # if code_name_dict_all exists
        for code in code_name_dict_all:

            rent_category = code_name_dict_all[code]
            years_13_19 = [f"{i}_{rent_category}" for i in range(2013, 2020)]

            # Step 3: Get dot product of 2010-2019 values with the target weight values values
            array_13_19 = bg20_df[years_13_19].to_numpy().T
            wt_array = bg20_df[weight].to_numpy()
            dots = array_13_19.dot(wt_array)
            
            # Step 4: Append standardized 2013-2019 and 
            # the block's 2020+ values to new dictionary
            val_20s = filtered[[
                f"{i}_{rent_category}" for i in 
                range(2020, year_end + 1)]].iloc[0].to_numpy()
            
            # If the 2020 value is 0, then all years
            # before then should be 0 also
            if val_20s[0] == 0:
                dots = dots * 0

            # Finalize the append
            dots = np.append(dots, val_20s)

            # Save a copy of the array
            set_dots = dots.copy()

            # Update return_dictionary
            return_array = np.append(return_array, set_dots)
        
    return {block : return_array}

In [None]:
%%time
# Using list comprehension

begin_year = 2013
end_year = 2021

my_groups = rent_dist_pre_st.groupby('BG20')
keys = list(my_groups.groups.keys())

code_name_dict_2013_2014 = {
    'B25063_003E': 'rent_less_than_100',
    'B25063_004E': 'rent_100_to_149',
    'B25063_005E': 'rent_150_to_199',
    'B25063_006E': 'rent_200_to_249',
    'B25063_007E': 'rent_250_to_299',
    'B25063_008E': 'rent_300_to_349',
    'B25063_009E': 'rent_350_to_399',
    'B25063_010E': 'rent_400_to_449',
    'B25063_011E': 'rent_450_to_499',
    'B25063_012E': 'rent_500_to_549',
    'B25063_013E': 'rent_550_to_599',
    'B25063_014E': 'rent_600_to_649',
    'B25063_015E': 'rent_650_to_699',
    'B25063_016E': 'rent_700_to_749',
    'B25063_017E': 'rent_750_to_799',
    'B25063_018E': 'rent_800_to_899',
    'B25063_019E': 'rent_900_to_999',
    'B25063_020E': 'rent_1000_to_1249',
    'B25063_021E': 'rent_1250_to_1449',
    'B25063_022E': 'rent_1500_to_1999',
    'B25063_023E': 'rent_2000_to_2499'
}

code_name_dict_2015_2021 = {
    'B25063_024E': 'rent_2500_to_2999',
    'B25063_025E': 'rent_3000_to_3499',
    'B25063_026E': 'rent_3500_or_more'
}

dict2 = {}
dict3 = [block_standardize_medians_3(my_groups.get_group(keys[i]), 
                                rent_dist_pre_st, 
                                begin_year, 
                                end_year, 
                                'wt_hh', 
                                code_name_dict_2013_2014=code_name_dict_2013_2014, 
                                code_name_dict_2015_2021=code_name_dict_2015_2021)
        for i in range(len(keys))
       ]


for d in dict3:
    dict2.update(d)

column_list = []
for code in code_name_dict_2013_2014:
    for i in range(begin_year, end_year + 1):
        column_list.append(f'{i}_{code_name_dict_2013_2014[code]}')
for code in code_name_dict_2015_2021:
    for i in range(begin_year, end_year + 1):
        column_list.append(f'{i}_{code_name_dict_2015_2021[code]}')

gross_rent_df = (pd.DataFrame.from_dict(dict2, 
                   orient='index', columns=column_list)
                   .reset_index()
                   .rename(columns={'index':'geoid_block'}))

gross_rent_df.to_csv('datasets/cleaned_census_api_files/raw/gross_rent_standardized_raw.csv',
                          encoding='utf-8',
                          index=False)

gross_rent_df

In [None]:
gross_rent_df.to_csv('datasets/cleaned_census_api_files/raw/gross_rent_standardized_raw.csv',
                          encoding='utf-8',
                          index=False)


In [None]:
gross_rent_df = pd.read_csv('datasets/cleaned_census_api_files/raw/gross_rent_standardized_raw.csv',
                          encoding='utf-8',
                          dtype={'geoid_block':str})

# Make geoid_block the index
gross_rent_df.set_index('geoid_block', inplace=True)
gross_rent_df.head(2)


### Once I have the gross rents standardized, calculate the median value for each year using frequency tables

Estimated Median of bin values = 𝑙 + (𝑛/2 − 𝐹)/𝑓 ⋅ 𝑤

where 𝑙 is the lower border of the median group, 𝐹 is the cumulative frequency up to the median group, 𝑓 is the frequency of the median group, 𝑤 is the width of the median group.


In [None]:
### STEPS TO ESTIMATE MEDIAN VALUE
def estimate_median(series, year, keyword):
    """
    Estimate the median for a given year.
    """
    s = series.copy()
    year = str(year)
#     print(series.name)
    
    # Gather all variables for the equation
    non_zero_series = s[s != 0]
    n = non_zero_series.sum()
    
    if len(non_zero_series) == 0:
        return 0
    
    # Find median interval
    median_n = n/2

    series_dict = non_zero_series.to_dict()
    key_list = list(series_dict.keys())
    count = 0
    interval = ''
    interval_backup = None
    i = 0
    while i < len(key_list):
        key = key_list[i]
        if count < median_n:
            interval = key
            count += series_dict[key]
            if count > median_n:
                F = count - series_dict[key]
        elif count == median_n:
            F = count
            interval_backup = key_list[i]
            i = len(key_list)
        else:
            pass
        i += 1
        
    f = series_dict[interval]
        
    # Get l, the lower bound
    interval_bound = interval.replace(f'{year}_{keyword}_','')
    l = int(re.sub(r'_to_\d+|less_than_|_or_more','',interval_bound))
    
    # Get higher bound
    h = int(re.sub(r'\d+_to_|less_than_|_or_more','',interval_bound))

    # Get width
    w = h - l
    
    # Calculate almost_median
    almost_median = (((n/2) - F)/f)*w
    
    # Check if there is no "median interval"
    if (almost_median == 0) & (interval_backup is not None):
        
        # Get l, the lower bound
        interval_higher_bound = interval_backup.replace(f'{year}_{keyword}_','')
        l_higher = int(re.sub(r'_to_\d+|less_than_|_or_more','',
                              interval_higher_bound))

        # Get higher bound
        h_higher = int(re.sub(r'\d+_to_|less_than_|_or_more','',interval_higher_bound))

        # Calculate average of lower-lower bound and higher-higher bound
        split_intervals_median = (l + h_higher)/2
        
        # This is our median
        return split_intervals_median
    
    else:
    
        # estimate median
        estimated_median = l + almost_median

        return estimated_median



In [None]:
gross_rent_medians = gross_rent_df.copy()

In [None]:
%%time
# Calculate gross rent for every year
for i in range(2013, 2021):
    columns_year = gross_rent_df.columns[gross_rent_df.columns.str.contains(str(i))]
    gross_rent_medians[f'{i}_median'] = gross_rent_df[columns_year].apply(
        lambda x: estimate_median(x, str(i), 'rent'),
        axis=1
    )
gross_rent_medians

In [None]:
gross_rent_medians.to_csv('datasets/cleaned_census_api_files/raw/gross_rent_medians_raw.csv',
                          encoding='utf-8')

In [None]:
gross_rent_medians = pd.read_csv('datasets/cleaned_census_api_files/raw/gross_rent_medians_raw.csv',
                          encoding='utf-8',
                          dtype={'geoid_block':str}
                                )
gross_rent_medians.head(2)

In [None]:
gross_rent_medians = gross_rent_medians[['geoid_block','2013_median','2014_median','2015_median',
                                        '2016_median','2017_median','2018_median',
                                        '2019_median','2020_median']]
gross_rent_medians.head(2)

In [None]:
# Create loop that goes through each year backyards, 
# and linearly interpolate values between 0 values 
# (to show one consistent line on a grpah between two points spearated by "0")
def linearly_interpolate(dataframe, 
                         grouping='block'):
    """Linearly interpolate NaN values."""
    df = copy.deepcopy(dataframe)
    df = df.replace(0, np.nan)
    df = df.set_index(f'geoid_{grouping}')
    df = df.interpolate(axis=1).reset_index()
    
    return df


In [None]:
median_rent_interpolated = linearly_interpolate(gross_rent_medians)
median_rent_interpolated

In [None]:
%%time
### Now get the median tract values
def get_tract_median_values(dataframe):
    "Sum blocks for tracts then calculate each year's median."
    
    df = copy.deepcopy(dataframe)
    if 'geoid_block' not in df.columns:
        df = df.reset_index()

    df['geoid_tract'] = df['geoid_block'].apply(lambda x: str(x)[:-1])
    df['state'] = df['geoid_block'].apply(lambda x: str(x)[0:2])
    df['county'] = df['geoid_block'].apply(lambda x: str(x)[2:5])
    
    # Calculate gross rent for every year
    for i in range(2013, 2021):
        str_year = str(i)
        columns_year = df.columns[df.columns.str.contains(str_year)]
        for col in columns_year:
            main_name = col.replace(f'{i}_rent_','')
            df[f'{i}_tract_rent_{main_name}'] = df.groupby('geoid_tract')[f'{col}'].transform('sum')
        tracts_year = df.columns[df.columns.str.contains(f'{str_year}_tract')]
        df[f'{i}_tract_median'] = df[tracts_year].apply(
            lambda x: estimate_median(x, str(i), 'tract_rent'),
            axis=1
        )
    
    return df

gross_rent_tract = get_tract_median_values(gross_rent_df)
gross_rent_tract

In [None]:
gross_rent_tract_2 = gross_rent_tract[['geoid_tract',
                                       '2013_tract_median','2014_tract_median','2015_tract_median',
                                        '2016_tract_median','2017_tract_median','2018_tract_median',
                                        '2019_tract_median','2020_tract_median']].drop_duplicates()
gross_rent_tract_2

In [None]:
median_tract_rent_interpolated = linearly_interpolate(gross_rent_tract_2, grouping='tract')
median_tract_rent_interpolated

In [None]:
median_rent_interpolated['geoid_tract'] = median_rent_interpolated['geoid_block'].apply(
    lambda x: str(x)[:-1])

median_rent_interpolated

In [None]:
median_rent_all = pd.merge(median_rent_interpolated,
                           median_tract_rent_interpolated,
                           left_on="geoid_tract",
                           right_on="geoid_tract",
                           how='inner')

median_rent_all

In [None]:
for year in range(2013, 2021):
    median_rent_all.rename(columns={f'{year}_median': f'{year}_block_median'}, inplace=True)
    
median_rent_all


In [None]:
median_rent_all['state'] = median_rent_all['geoid_block'].apply(lambda x: str(x)[0:2])
median_rent_all['county'] = median_rent_all['geoid_block'].apply(lambda x: str(x)[2:5])
median_rent_all['tract'] = median_rent_all['geoid_block'].apply(lambda x: str(x)[5:11])

median_rent_all

In [None]:
median_rent_all.to_csv('datasets/cleaned_census_api_files/standardized/median_rent_standardized.csv',
                       encoding='utf-8',
                       index=False)

In [None]:
median_rent_all = pd.read_csv('datasets/cleaned_census_api_files/standardized/median_rent_standardized.csv',
                               encoding='utf-8',
                             dtype={'geoid_block':str, 'geoid_tract':str})

median_rent_all


In [None]:
### Once I have the interpolated median tract values, merge with block level,
# Save, then run create_and_save_geo_files() function on it

# Create the shapefiles for block groups and tracts
median_rent_block_geo, median_rent_tract_geo = create_and_save_geo_files(dataframe=median_rent_all, 
                                                          name='median_rent',
                                                          keyword='median',
                                                          begin_year=2013, 
                                                          end_year=2020)

In [None]:
median_rent_block_geo
median_rent_tract_geo

### Get Median Rent at the Block Level

### Get Median Income at the Block Level

### Get Median Price at the Block Level

### Get Percent Renter Occupied at the Block Level

### Get Total Employed at the Block Level

### Get Rent-Price Ratio at the Block Level

### End of getting data at the Tract level
---

# Get data at the MSA level

We will get the following:
1. Median Income
2. Median Unit Price
3. Median Rent

using the ACS 1-Year survey.

In [None]:
# Set end year for all MSAs
end_year=2021

### Get Median Income at the MSA level

In [None]:
# Run the API download function
median_income_msa = download_and_format_msa_census_data(
    census_code="B19013_001E",
    census_code_meaning="median_income_msa",
    end_year=end_year
)
median_income_msa

### Get Median Unit Value at the MSA Level

In [None]:
# Run the API download function
median_price_msa = download_and_format_msa_census_data(
    census_code="B25077_001E",
    census_code_meaning="median_price_msa",
    end_year=end_year
)
median_price_msa

### Get Median Rent at the MSA level

In [None]:
# Run the API download function
median_rent_msa = download_and_format_msa_census_data(
    census_code="B25058_001E",
    census_code_meaning="median_rent_msa",
    end_year=end_year
)
median_rent_msa

### Get Total Units at the MSA level

In [None]:
# Run the API download function
total_units_msa = download_and_format_msa_census_data(
    census_code="B25001_001E",
    census_code_meaning="total_units_msa",
    end_year=end_year
)
total_units_msa

### Create Rent-to-Price Ratio dataset

In [None]:
# Rename columns
for i in range(2010, end_year + 1):
    median_rent_msa.rename(columns={f"{i}":f"{i}_rent"}, inplace=True)

# Rename columns
for i in range(2010, end_year + 1):
    median_price_msa.rename(columns={f"{i}":f"{i}_price"}, inplace=True)

# Merge price data
rent_to_price = median_rent_msa.merge(
    median_price_msa, how='inner', 
    on=['msa_code','msa_name'])

# Loop through columns and divide rent by price per year
for i in range(2010, end_year + 1):
    rent_to_price[f'{i}'] = rent_to_price[f"{i}_rent"]/rent_to_price[f"{i}_price"]
    
    # Drop rent and price columns
    rent_to_price.drop(columns=[f'{i}_rent',f'{i}_price'], inplace=True)

# Save dataset
rent_to_price.to_csv(
    "datasets/cleaned_census_api_files/msa_data/rent_price_ratio_msa.csv", 
    index=False)

rent_to_price

### Create Jobs per Unit dataset

In [None]:
# Read in jobs
jobs = pd.read_csv('datasets/bls/raw/most_recent_bls_data.csv',
                   dtype={'msa_code':str, 'state_code':str})

# Make sure the date column is in datetime format
jobs['date'] = pd.to_datetime(jobs['date'])

# Replace NECTA Division
jobs['msa_name'] = jobs['msa_name'].apply(lambda x: x.replace(" NECTA Division",""))
jobs['msa_name'] = jobs['msa_name'].apply(lambda x: x.replace(" NECTA",""))

# Keep only december months
new_jobs = jobs[jobs['month']=='December'].reset_index(drop=True)

# Get earliest year
earliest_year = new_jobs['year'].min()

# Get latest year
latest_year = new_jobs['year'].max()

# Only keep certain columns
new_jobs = new_jobs[['msa_name','year','value']]

# Rename column
new_jobs.rename(columns={'value':f'jobs'}, inplace=True)

# Stack and unstack
new_jobs = new_jobs.set_index(['msa_name','year'])
new_jobs = new_jobs.unstack('year')

# Reset index
new_jobs = new_jobs.reset_index()

# Rename jobs columns
new_jobs.columns = ['msa_name'] + [
    f'{i}_jobs' for i in range(earliest_year, latest_year + 1)]

# Read in total units and rename columns
total_units = pd.read_csv(
    "datasets/cleaned_census_api_files/msa_data/total_units_msa.csv")
for i in range(earliest_year, latest_year + 1):
    total_units.rename(columns={f"{i}":f"{i}_units"}, inplace=True)
    
# Merge data
jobs_per_unit = new_jobs.merge(
    total_units, how='inner', 
    on=['msa_name'])

# Loop through columns and divide rent by price per year
for i in range(earliest_year, latest_year + 1):
    jobs_per_unit[f'{i}'] = jobs_per_unit[f"{i}_jobs"]/jobs_per_unit[f"{i}_units"]
        
# Only keep main columns
jobs_per_unit = jobs_per_unit[['msa_name','msa_code'] +
    [f'{i}' for i in range(earliest_year, latest_year + 1)]]

# Save dataset
jobs_per_unit.to_csv(
    "datasets/cleaned_census_api_files/msa_data/jobs_per_unit_msa.csv", 
    index=False)

jobs_per_unit



## Get data at the City Level (at the ACS 5-Year level)

1. Population (B01003_001E)
2. Median Income
3. Median Unit Price
4. Median Rent
5. Total Units
6. Percent Renter Occupied
7. Total Employed (B23025_004E)

Create Manually:
1. Rent-to-Price Ratio
2. People-per-Units

In [29]:
# Define end year
begin_year = 2011
end_year = 2021

### Get Population (City)

In [30]:
# Run the API download function
population_city = download_and_format_msa_census_data(
    census_code="B01001_001E",
    census_code_meaning="population_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
population_city

URL: https://api.census.gov/data/2021/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2020/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2019/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2018/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2017/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2016/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2015/acs/acs5?get=NAME,B01001_001E&for=place:*&in=state:*&key=f25144e3809a49b45c5cc54e7d2bce532fe4ce99
URL: https://api.census.gov/data/2014/acs/acs5?g

Unnamed: 0,name,geo_id,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,"Abanda, Alabama",0100100,35.0000,42.0000,23.0000,43.0000,52.0000,151.0000,174.0000,218.0000,176.0000,192.0000,231.0000
1,"Abbeville city, Alabama",0100124,2721.0000,2699.0000,2684.0000,2654.0000,2646.0000,2627.0000,2594.0000,2582.0000,2573.0000,2572.0000,2231.0000
2,"Adamsville city, Alabama",0100460,4571.0000,4537.0000,4496.0000,4472.0000,4454.0000,4422.0000,4404.0000,4374.0000,4340.0000,4266.0000,4381.0000
3,"Addison town, Alabama",0100484,855.0000,826.0000,1036.0000,913.0000,682.0000,757.0000,725.0000,641.0000,728.0000,723.0000,697.0000
4,"Akron town, Alabama",0100676,404.0000,369.0000,384.0000,302.0000,293.0000,252.0000,318.0000,295.0000,315.0000,269.0000,385.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31903,"Voladoras comunidad, Puerto Rico",7287638,,,,,,,,,,638.0000,696.0000
31904,"Yabucoa zona urbana, Puerto Rico",7287863,5929.0000,6256.0000,6125.0000,6383.0000,6604.0000,6720.0000,6878.0000,6829.0000,7070.0000,6854.0000,6466.0000
31905,"Yauco zona urbana, Puerto Rico",7288035,17806.0000,17857.0000,17985.0000,17951.0000,17815.0000,17627.0000,17129.0000,16802.0000,16585.0000,15485.0000,15404.0000
31906,"Yaurel comunidad, Puerto Rico",7288121,1195.0000,1334.0000,1440.0000,1270.0000,1063.0000,1255.0000,1444.0000,1302.0000,1463.0000,1226.0000,927.0000


### Get Median Income (city)

In [None]:
# Run the API download function
median_income_city = download_and_format_msa_census_data(
    census_code="B19013_001E",
    census_code_meaning="median_income_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
median_income_city

### Get Median Price (city)

In [None]:
# Run the API download function
median_price_city = download_and_format_msa_census_data(
    census_code="B25077_001E",
    census_code_meaning="median_price_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
median_price_city

### Get Median Rent (city)

In [None]:
# Run the API download function
median_rent_city = download_and_format_msa_census_data(
    census_code="B25058_001E",
    census_code_meaning="median_rent_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
median_rent_city

### Get Total Units (city)

In [None]:
# Run the API download function
total_units_city = download_and_format_msa_census_data(
    census_code="B25001_001E",
    census_code_meaning="total_units_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
total_units_city

## Get Percent Renter Occupied (city)

1. First we must get Total Occupied Units (B25002_002E)
2. Then we must get Renter Occupied Units (B25003_003E)

3. Then we must manually divide Renters by Occupied units to get Percent Renter Occupied

In [None]:
# Run the API download function
total_occupied_city = download_and_format_msa_census_data(
    census_code="B25002_002E",
    census_code_meaning="total_occupied_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
total_occupied_city

total_renter_occupied_city = download_and_format_msa_census_data(
    census_code="B25003_003E",
    census_code_meaning="total_renter_occupied_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
total_renter_occupied_city

In [None]:
### Manually get Percent Renter Occupied

# Rename columns
for i in range(begin_year, end_year + 1):
    total_occupied_city.rename(columns={f"{i}":f"{i}_total_occupied"}, inplace=True)

# Rename columns
for i in range(begin_year, end_year + 1):
    total_renter_occupied_city.rename(columns={f"{i}":f"{i}_renter_occupied"}, inplace=True)

# Merge price data
percent_renter_city = total_occupied_city.merge(
    total_renter_occupied_city, how='inner', 
    on=['name','geo_id'])

# Loop through columns and divide rent by price per year
for i in range(begin_year, end_year + 1):
    percent_renter_city[f'{i}'] = percent_renter_city[f"{i}_renter_occupied"]/percent_renter_city[f"{i}_total_occupied"]
    
    # Drop rent and price columns
    percent_renter_city.drop(columns=[
        f'{i}_renter_occupied',f'{i}_total_occupied'], inplace=True)

# Save dataset
percent_renter_city.to_csv(
    "datasets/cleaned_census_api_files/city_data/percent_renter_occupied_city.csv", 
    index=False)

percent_renter_city


### Get Total Employed (city)

In [None]:
# Run the API download function
total_employed_city = download_and_format_msa_census_data(
    census_code="B23025_004E",
    census_code_meaning="total_employed_city",
    begin_year=begin_year,
    end_year=end_year,
    format_msa=False,
    format_city=True
)
total_employed_city

### Manually Create Rent-to-Price Ratio (city)


In [None]:
### Manually get Rent to Price Ratio

# Rename columns
for i in range(begin_year, end_year + 1):
    median_rent_city.rename(columns={f"{i}":f"{i}_rent"}, inplace=True)

# Rename columns
for i in range(begin_year, end_year + 1):
    median_price_city.rename(columns={f"{i}":f"{i}_price"}, inplace=True)

# Merge price data
rent_price_ratio_city = median_rent_city.merge(
    median_price_city, how='inner', 
    on=['name','geo_id'])

# Loop through columns and divide rent by price per year
for i in range(begin_year, end_year + 1):
    rent_price_ratio_city[f'{i}'] = rent_price_ratio_city[f"{i}_rent"]/rent_price_ratio_city[f"{i}_price"]
    
    # Drop rent and price columns
    rent_price_ratio_city.drop(columns=[
        f'{i}_rent',f'{i}_price'], inplace=True)

# Save dataset
rent_price_ratio_city.to_csv(
    "datasets/cleaned_census_api_files/city_data/rent_price_ratio_city.csv", 
    index=False)

rent_price_ratio_city


### Manually Create People-per-Units (city)


In [None]:
### Manually get People per Units

# Rename columns
for i in range(begin_year, end_year + 1):
    population_city.rename(columns={f"{i}":f"{i}_population"}, inplace=True)

# Rename columns
for i in range(begin_year, end_year + 1):
    total_units_city.rename(columns={f"{i}":f"{i}_units"}, inplace=True)

# Merge price data
people_per_unit_city = population_city.merge(
    total_units_city, how='inner', 
    on=['name','geo_id'])

# Loop through columns and divide rent by price per year
for i in range(begin_year, end_year + 1):
    people_per_unit_city[f'{i}'] = people_per_unit_city[f"{i}_population"]/people_per_unit_city[f"{i}_units"]
    
    # Drop rent and price columns
    people_per_unit_city.drop(columns=[
        f'{i}_population',f'{i}_units'], inplace=True)

# Save dataset
people_per_unit_city.to_csv(
    "datasets/cleaned_census_api_files/city_data/people_per_unit_city.csv", 
    index=False)

people_per_unit_city
