In [1]:
import os
import sys
import pandas as pd
import numpy as np
import getpass

# Login name of the person running the notebook (kept as a module-level name).
user = getpass.getuser()

# Locate the local dvutils clone relative to the home directory rather than assuming a
# macOS-style "/Users/<name>" layout, so the notebook also runs where home lives elsewhere.
DVUTILS_LOCAL_CLONE_PATH = os.path.join(os.path.expanduser("~"), "Documents", "GitHub", "dvutils")

# Put the clone first on the import path so its modules win over any installed copy.
sys.path.insert(0, DVUTILS_LOCAL_CLONE_PATH)
# NOTE(review): wildcard import kept because the names later cells need from utils_io are
# not visible here; prefer explicit imports once they are known.
from utils_io import *

Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json


In [2]:
# Census API key is read from the environment; this is None when CENSUS_API_KEY is unset.
api_key = os.getenv("CENSUS_API_KEY")

In [3]:
def pull_acs_5_year_est_data(
    census_api_key,
    acs_year=2019,
    tbl_prof_type="Detailed",
    table_id=None,
    select_table_vars=None,
    drop_anno_cols=True,
    drop_margin_cols=True,
):
    """
    Pull tract-level American Community Survey (ACS) 5 year estimate data from the Census API.

    Data can be pulled for an entire table (table_id) or for select table variables
    (select_table_vars). The geography is fixed by this function: every census tract in
    counties 001, 013, 041, 055, 075, 081, 085, 095 and 097 of state 06.

    !Must include a table_id or a list passed to select_table_vars! If both are given,
    table_id takes precedence.

    Parameters
    -------------------
    census_api_key (String):
    Your secret census api key.

    acs_year (Integer):
    Year of the 5 year estimates, used in the API path. Default is 2019.

    tbl_prof_type (String):
    Table or profile type, which selects the API endpoint:
    Detailed -> acs/acs5, Subject -> acs/acs5/subject, Data -> acs/acs5/profile,
    Comparison -> acs/acs5/cprofile.

    table_id (String):
    ACS table id. Example 'B01001'. The whole table is requested with group(table_id).

    select_table_vars (List):
    provide a list of ACS table variables as strings. Example: ['B01001_001E','B01001_002E'].
    A single variable may also be passed as a plain string.

    drop_anno_cols (Boolean):
    Drops annotation of estimate / annotation of margin of error columns (names ending in
    "EA" or "MA") from whatever columns the API returned.

    drop_margin_cols (Boolean):
    Drops margin of error columns (names ending in "M") from whatever columns the API returned.

    Returns
    -------------------
    pandas DataFrame with one row per tract. Requested variables are cast to numeric; "fipco"
    holds the county code and "tract_geoid" is state + county + tract concatenated. The
    state and tract columns (and GEO_ID / NAME when table_id is used) are dropped.

    If tbl_prof_type is not one of the four supported values, a help-message string is
    returned instead of a DataFrame (kept for backward compatibility).

    Raises
    -------------------
    TypeError: neither table_id nor select_table_vars was provided.
    requests.HTTPError: the Census API answered with an error status.

    Author: Joshua Croff
    Variable Reference: https://www.census.gov/data/developers/data-sets/acs-5year.html
    """
    import requests
    import pandas as pd

    # Build the "get" clause: a whole table group, or an explicit variable list.
    if table_id:
        var = f"group({table_id})"
    elif select_table_vars:
        # A bare string would otherwise be joined character by character.
        if isinstance(select_table_vars, str):
            select_table_vars = [select_table_vars]
        var = ",".join(select_table_vars)
    else:
        raise TypeError(
            "pull_acs_5_year_est_data requires either table_id or a non-empty select_table_vars list"
        )

    # County FIPS codes of the nine San Francisco Bay Area counties, in California (state 06).
    counties = "001,013,041,055,075,081,085,095,097"
    state = "06"

    # set base url
    endpoints = {
        "Detailed": "acs/acs5",
        "Subject": "acs/acs5/subject",
        "Data": "acs/acs5/profile",
        "Comparison": "acs/acs5/cprofile",
    }
    if tbl_prof_type not in endpoints:
        return "Please provide the following table types: Detailed, Subject, Data, or Comparison"
    base_url = f"https://api.census.gov/data/{acs_year}/{endpoints[tbl_prof_type]}?"

    # set query params (requests sends the "in" list as two repeated in= parameters)
    query_params = {
        "get": var,
        "for": "tract:*",
        "in": [
            f"county:{counties}",
            f"state:{state}",
        ],
        "key": census_api_key,
    }
    # Time out instead of hanging the kernel, and surface HTTP errors before parsing JSON.
    rq = requests.get(base_url, params=query_params, timeout=120)
    rq.raise_for_status()
    data = rq.json()

    # First row of the payload is the header; the rest are tract records.
    acs_df = pd.DataFrame(data[1:], columns=data[0])

    # Drop annotation / margin columns BEFORE the numeric cast: annotation columns can carry
    # non-numeric text that pd.to_numeric would reject.
    if drop_anno_cols:
        anno_cols = [col for col in acs_df.columns if col.endswith(("EA", "MA"))]
        acs_df = acs_df.drop(columns=anno_cols)

    if drop_margin_cols:
        margin_cols = [col for col in acs_df.columns if col.endswith("M")]
        acs_df = acs_df.drop(columns=margin_cols)

    # Cast numeric columns to numeric types; identifier columns stay as strings.
    str_cols = {"GEO_ID", "NAME", "state", "county", "tract"}
    num_cols = [col for col in acs_df.columns if col not in str_cols]
    if num_cols:
        acs_df[num_cols] = acs_df[num_cols].apply(pd.to_numeric)

    # add tract id column
    acs_df["tract_geoid"] = acs_df["state"] + acs_df["county"] + acs_df["tract"]

    # rename columns
    acs_df = acs_df.rename(columns={"county": "fipco"})

    # drop redundant columns
    if table_id:
        acs_df = acs_df.drop(columns=["GEO_ID", "NAME", "state", "tract"])
    else:
        acs_df = acs_df.drop(columns=["state", "tract"])

    return acs_df

## Pull data

In [4]:
# Pull table B03002 for both comparison years with otherwise identical arguments.
acs_2018, acs_2022 = (
    pull_acs_5_year_est_data(census_api_key=api_key, acs_year=vintage, table_id="B03002")
    for vintage in (2018, 2022)
)

## Rename columns

In [5]:
# B03002 estimate variable -> readable column name used by the rest of the notebook.
rename_dict = dict(
    B03002_001E="total_population",
    B03002_002E="not_hispanic_or_latino",
    B03002_003E="white",
    B03002_004E="black_or_african_american",
    B03002_005E="american_indian_and_alaska_native",
    B03002_006E="asian",
    B03002_007E="native_hawaiian_and_other_pacific_islander",
    B03002_008E="other_race",
    B03002_009E="two_or_more_races",
    B03002_012E="hispanic_or_latino_any_race",
)

In [6]:
# Give both years' frames the readable column names; columns absent from rename_dict keep their names.
acs_2018 = acs_2018.rename(columns=rename_dict)
acs_2022 = acs_2022.rename(columns=rename_dict)

In [7]:
# Fold the detailed race columns into two broader groups used by the analysis.
other_races = ["american_indian_and_alaska_native", "other_race", "two_or_more_races"]
asian_and_pacific_islander = ["asian", "native_hawaiian_and_other_pacific_islander"]

# Row-wise sums per tract; on each frame "other" is added before "asian_and_pacific_islander".
for tract_df in (acs_2018, acs_2022):
    tract_df["other"] = tract_df[other_races].sum(axis=1)
    tract_df["asian_and_pacific_islander"] = tract_df[asian_and_pacific_islander].sum(axis=1)

In [8]:
# check that adding up race and origin columns equals total population
check_cols = [
    "white",
    "asian_and_pacific_islander",
    "hispanic_or_latino_any_race",
    "black_or_african_american",
    "other",
]

# Recompute each tract's total from the groups, then fail loudly on any mismatch so the
# check does not depend on someone eyeballing a preview of the first few rows.
for vintage, tract_df in (("2018", acs_2018), ("2022", acs_2022)):
    tract_df["pop_check"] = tract_df[check_cols].sum(axis=1)
    n_mismatch = int((tract_df["pop_check"] != tract_df["total_population"]).sum())
    assert n_mismatch == 0, (
        f"ACS {vintage}: {n_mismatch} tract(s) where the race/origin groups do not sum to total_population"
    )

In [10]:
# Preview the reported total next to the recomputed total, followed by each group.
rev_cols = [
    "total_population", "pop_check",
    "white", "asian_and_pacific_islander", "hispanic_or_latino_any_race",
    "black_or_african_american", "other",
]
acs_2018.loc[:, rev_cols].head()

Unnamed: 0,total_population,pop_check,white,asian_and_pacific_islander,hispanic_or_latino_any_race,black_or_african_american,other
0,2707,2707,1332,395,368,321,291
1,6106,6106,1835,109,2667,1294,201
2,6300,6300,533,4683,757,57,270
3,4529,4529,722,2232,905,448,222
4,3888,3888,2931,311,183,177,286


In [11]:
# Keep only the columns needed for the summary; pop_check and every other column are dropped.
analysis_cols = [
    "total_population",
    "white", "asian_and_pacific_islander", "hispanic_or_latino_any_race",
    "black_or_african_american", "other",
]

acs_2018 = acs_2018.loc[:, analysis_cols].copy()
acs_2022 = acs_2022.loc[:, analysis_cols].copy()

In [26]:
## summarize data to regional level


def _regional_totals(tract_df, population_col):
    """Sum every column over all rows, round to the nearest thousand, and return a
    two-column frame: the original column name ("race_and_ethnicity") and its total."""
    return (
        tract_df.aggregate("sum")
        .round(-3)
        .rename_axis("race_and_ethnicity")
        .reset_index(name=population_col)
    )


acs_2018_sum = _regional_totals(acs_2018, "population_2018")
acs_2022_sum = _regional_totals(acs_2022, "population_2022")

In [27]:
# Join the two yearly summaries on the group label so each row carries both years' totals.
acs_2018_2022 = acs_2018_sum.merge(acs_2022_sum, on="race_and_ethnicity")

In [28]:
# Display the merged 2018 / 2022 totals.
acs_2018_2022

Unnamed: 0,race_and_ethnicity,population_2018,population_2022
0,total_population,7676000,7686000
1,white,3046000,2830000
2,asian_and_pacific_islander,2013000,2165000
3,hispanic_or_latino_any_race,1811000,1819000
4,black_or_african_american,447000,438000
5,other,359000,434000


In [29]:
# Population change from 2018 to 2022 (2022 minus 2018), rounded to the nearest thousand.
acs_2018_2022["change"] = (
    acs_2018_2022["population_2022"].sub(acs_2018_2022["population_2018"]).round(-3)
)

In [30]:
# Display the table again, now including the "change" column.
acs_2018_2022

Unnamed: 0,race_and_ethnicity,population_2018,population_2022,change
0,total_population,7676000,7686000,10000
1,white,3046000,2830000,-216000
2,asian_and_pacific_islander,2013000,2165000,152000
3,hispanic_or_latino_any_race,1811000,1819000,8000
4,black_or_african_american,447000,438000,-9000
5,other,359000,434000,75000


In [31]:
# Share of each row in whole percent. The denominator is the sum of every row except
# index 0 (the total_population row), not the total_population figure itself.
groups_total_2018 = acs_2018_2022["population_2018"].drop(index=0).sum()
groups_total_2022 = acs_2018_2022["population_2022"].drop(index=0).sum()

acs_2018_2022["percent_of_total_2018"] = (
    acs_2018_2022["population_2018"].div(groups_total_2018).mul(100).round()
)
acs_2018_2022["percent_of_total_2022"] = (
    acs_2018_2022["population_2022"].div(groups_total_2022).mul(100).round()
)

In [32]:
# Difference between the 2022 and 2018 shares, i.e. a percentage-point change in share
# rather than a relative growth rate.
acs_2018_2022["percent_change"] = (
    acs_2018_2022["percent_of_total_2022"].sub(acs_2018_2022["percent_of_total_2018"]).round()
)

In [33]:
# Final column order: label, then each year's total with its share, then the two change measures.
reorder_cols = [
    "race_and_ethnicity",
    "population_2018", "percent_of_total_2018",
    "population_2022", "percent_of_total_2022",
    "change", "percent_change",
]
acs_2018_2022 = acs_2018_2022.loc[:, reorder_cols].copy()

In [35]:
# Display the final table in its reordered column layout.
acs_2018_2022

Unnamed: 0,race_and_ethnicity,population_2018,percent_of_total_2018,population_2022,percent_of_total_2022,change,percent_change
0,total_population,7676000,100.0,7686000,100.0,10000,0.0
1,white,3046000,40.0,2830000,37.0,-216000,-3.0
2,asian_and_pacific_islander,2013000,26.0,2165000,28.0,152000,2.0
3,hispanic_or_latino_any_race,1811000,24.0,1819000,24.0,8000,0.0
4,black_or_african_american,447000,6.0,438000,6.0,-9000,0.0
5,other,359000,5.0,434000,6.0,75000,1.0


In [37]:
# Write the summary with rows 1-5 first and row 0 (total_population) last.
output_path = "Data/race_origin_acs2018_2022.csv"
# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
acs_2018_2022.reindex([1, 2, 3, 4, 5, 0]).to_csv(output_path, index=False)