In [53]:
import os
import numpy as np
import requests
import pathlib
import getpass
import pandas as pd

from dotenv import load_dotenv

# Login name of the current OS user; used below to build the per-user output path.
user = getpass.getuser()
# Load variables from a .env file into the process environment (python-dotenv).
# Kept as the cell's last expression, so its return value is what the cell displays.
load_dotenv()

True

In [54]:
# Census Data API key read from the environment; None when the variable is not set.
API_KEY = os.getenv("CENSUS_API_KEY")

In [55]:
# NOTE(review): this looks like a macOS Box Drive location tied to the current
# user's home folder — confirm the path before running on another machine.
work_dir = pathlib.Path(
    f"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Bay_Area_Census_Website/census_decennial_download"
)
# Destination of the final CSV written at the end of the notebook.
out_file = work_dir.joinpath("census_2020_race_sex_age_jc.csv")

In [56]:
def fetch_census_data(url, timeout=30):
    """Download one Census Data API response and return it as a DataFrame.

    The response is expected to be a JSON list of lists whose first row holds
    the column names. Identifier columns are kept as strings; every other
    column is converted to numeric, with unparseable values coerced to NaN.

    Args:
        url (str): Fully-formed request URL.
        timeout (float | tuple): Seconds ``requests`` waits for the server
            before giving up, so a stalled connection cannot hang the
            notebook. Defaults to 30.

    Returns:
        pd.DataFrame | None: The parsed table, or None (after printing a
        message) when the status code is not 200 or the payload is empty.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and signalled with None
    if response.status_code != 200:
        print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return None

    data = response.json()

    # An empty payload has no header row to build columns from
    if not data:
        print("Failed to retrieve data. The API returned an empty payload.")
        return None

    # First row is the header, the remaining rows are the records
    columns, rows = data[0], data[1:]
    df = pd.DataFrame(rows, columns=columns)

    # Geography identifiers must stay strings: numeric conversion would drop
    # leading zeros or turn codes such as '0500000US06001' into NaN
    str_cols = ["GEOID", "GEO_ID", "NAME", "state", "county", "place", "ucgid"]
    num_cols = [col for col in df.columns if col not in str_cols]
    if num_cols:
        df[num_cols] = df[num_cols].apply(pd.to_numeric, errors="coerce")

    return df

In [57]:
def create_share_columns(df, universe_column, share_column_dict):
    """Add share-of-universe columns to a copy of a DataFrame.

    Author: Joshua Croff

    Each share is the population column divided by the universe column,
    rounded to three decimals; rows whose universe equals zero get a share
    of 0.

    Args:
        df (pd.DataFrame): Frame holding the universe and population columns.
        universe_column (str): Name of the total population / universe column.
        share_column_dict (dictionary): Maps each population column name to
            the name of the share column to create from it.
    Returns:
        pd.DataFrame: A copy of ``df`` with the share columns added.
    Raises:
        ValueError: If the universe column or any population column is missing.
    """
    result = df.copy()

    if universe_column not in result.columns:
        raise ValueError(f"Universe column '{universe_column}' not found in DataFrame")

    for pop_col, share_col in share_column_dict.items():
        if pop_col not in result.columns:
            raise ValueError(f"Population column '{pop_col}' not found in DataFrame")
        ratio = (result[pop_col] / result[universe_column]).round(3)
        empty_universe = result[universe_column] == 0
        result[share_col] = np.where(empty_universe, 0, ratio)

    return result

In [58]:
# Define the API endpoints
# Pipe-delimited reference file of California places (read further down with sep="|")
ca_places_url = "https://www2.census.gov/geo/docs/reference/codes2020/place/st06_ca_place2020.txt"

# ucgid codes shared by both county-level requests, defined once so the
# race and age queries cannot drift apart
_county_ucgids = ",".join(
    f"0500000US06{county_fips}"
    for county_fips in ("001", "013", "041", "055", "075", "081", "085", "095", "097")
)

# Only send a key when one is configured; otherwise the literal text
# "key=None" would be interpolated into the request
_key_param = f"&key={API_KEY}" if API_KEY else ""

_dec_base = "https://api.census.gov/data/2020/dec"

# Table group P2 from the "pl" dataset: county-level and place-level requests
race_url_cty = f"{_dec_base}/pl?get=group(P2)&ucgid={_county_ucgids}{_key_param}"
race_url_pl = f"{_dec_base}/pl?get=group(P2)&for=place:*&in=state:06{_key_param}"

# Table group P12 from the "dhc" dataset: county-level and place-level requests
age_url_cty = f"{_dec_base}/dhc?get=group(P12)&ucgid={_county_ucgids}{_key_param}"
age_url_pl = f"{_dec_base}/dhc?get=group(P12)&for=place:*&in=state:06{_key_param}"

In [59]:
# Get california places data
# County names matched against the COUNTIES column of the places file
county_list = [
    "San Francisco County",
    "Alameda County",
    "Contra Costa County",
    "Marin County",
    "Napa County",
    "San Mateo County",
    "Santa Clara County",
    "Solano County",
    "Sonoma County",
]

# Read every column as text so codes keep their leading zeros
place_df = pd.read_csv(ca_places_url, sep="|", header=0, dtype=str)

# Keep incorporated places whose COUNTIES value is one of the listed counties
in_listed_county = place_df["COUNTIES"].isin(county_list)
is_incorporated = place_df["TYPE"] == "INCORPORATED PLACE"
bay_area_places = place_df[in_listed_county & is_incorporated]

# Place codes used later to filter the statewide API results
place_geoid_list = list(bay_area_places["PLACEFP"])

# Pull and pre-process race data

In [60]:
# Race data for every California place, narrowed to the Bay Area place codes
race_pl_df = fetch_census_data(race_url_pl)
race_pl_df = race_pl_df.loc[race_pl_df["place"].isin(place_geoid_list)]

# Race data for the counties named in the county URL
race_cty_df = fetch_census_data(race_url_cty)

In [61]:
# Stack the place rows on top of the county rows under a fresh 0..n-1 index
race_df = pd.concat(objs=[race_pl_df, race_cty_df], ignore_index=True)

In [62]:
# Map the P2 variable codes to readable column names
race_rename_dict = dict(
    P2_001N="Race Total Population",
    P2_002N="Hispanic or Latino",
    P2_003N="Not Hispanic or Latino",
    P2_004N="Population of One Race",
    P2_005N="White",
    P2_006N="Black or African American",
    P2_007N="American Indian and Alaska Native",
    P2_008N="Asian",
    P2_009N="Native Hawaiian and Other Pacific Islander",
    P2_010N="Some Other Race",
    P2_011N="Two or More Races",
)

# Keep only the identifiers plus the mapped variables, then apply the names
req_cols = ["GEO_ID", "NAME", *race_rename_dict]
race_df = race_df.loc[:, req_cols].rename(columns=race_rename_dict)

In [63]:
# Build the {population_column: share_column} mapping for the race table.
# Identifier columns and the universe itself are excluded from the shares.
universe_col = "Race Total Population"
rm_cols = ["GEO_ID", "NAME", "Race Total Population"]
pop_cols = [col for col in race_df.columns.to_list() if col not in rm_cols]
share_dict = {}
for pop_name in pop_cols:
    share_dict[pop_name] = f"Share {pop_name}"

In [64]:
# Append one share column per race population column
race_df = create_share_columns(race_df, universe_col, share_dict)

# Pull and pre-process age data

In [65]:
# Age data for every California place, narrowed to the Bay Area place codes
age_pl_df = fetch_census_data(age_url_pl)
age_pl_df = age_pl_df.loc[age_pl_df["place"].isin(place_geoid_list)]

# Age data for the counties named in the county URL
age_cty_df = fetch_census_data(age_url_cty)

In [66]:
# Stack the place rows on top of the county rows under a fresh 0..n-1 index
age_df = pd.concat(objs=[age_pl_df, age_cty_df], ignore_index=True)

In [67]:
# drop and rename columns
# Age bands in the order their P12 codes appear within each sex
age_bands = [
    "Under 5", "5 to 9", "10 to 14", "15 to 17", "18 to 19", "20", "21",
    "22 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49",
    "50 to 54", "55 to 59", "60 to 61", "62 to 64", "65 to 66", "67 to 69",
    "70 to 74", "75 to 79", "80 to 84", "85 and over",
]


def _p12_labels(first_code, sex):
    """Map consecutive P12 codes, starting at first_code, to the sex total followed by its age bands."""
    labels = [f"Total {sex}", *(f"{sex} {band}" for band in age_bands)]
    return {f"P12_{first_code + step:03d}N": label for step, label in enumerate(labels)}


# P12_001N is the overall total; codes 002-025 cover males and 026-049 females
age_rename_dict = {
    "P12_001N": "Age Total Population",
    **_p12_labels(2, "Male"),
    **_p12_labels(26, "Female"),
}

# Keep only the identifiers plus the mapped variables, then apply the names
req_cols = ["GEO_ID", "NAME", *age_rename_dict]
age_df = age_df.loc[:, req_cols].rename(columns=age_rename_dict)

In [68]:
# add age group total columns
# Each age band maps to its [male column, female column] pair
age_groups = {
    band: [f"Male {band}", f"Female {band}"]
    for band in (
        "Under 5", "5 to 9", "10 to 14", "15 to 17", "18 to 19", "20", "21",
        "22 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49",
        "50 to 54", "55 to 59", "60 to 61", "62 to 64", "65 to 66", "67 to 69",
        "70 to 74", "75 to 79", "80 to 84", "85 and over",
    )
}

# "Total <band>" = male + female count for the band, appended in band order
age_df = age_df.assign(
    **{
        f"Total {group}": age_df[male_col] + age_df[female_col]
        for group, (male_col, female_col) in age_groups.items()
    }
)

In [69]:
# Build the {population_column: share_column} mapping for the age table.
# Identifier columns and the universe itself are excluded from the shares.
universe_col = "Age Total Population"
rm_cols = ["GEO_ID", "NAME", "Age Total Population"]
pop_cols = [col for col in age_df.columns.to_list() if col not in rm_cols]
share_dict = {}
for pop_name in pop_cols:
    share_dict[pop_name] = f"Share {pop_name}"

In [70]:
# Append one share column per age/sex population column
age_df = create_share_columns(age_df, universe_col, share_dict)

# Join the two dataframes

In [71]:
# Inner-join race and age tables; only geographies present in both are kept
final_df = race_df.merge(age_df, how="inner", on=["GEO_ID", "NAME"])

# Final data cleaning

In [72]:
# Final NAME clean-up, written as one chain that returns a new frame so no
# value is assigned into a filtered slice (avoids SettingWithCopyWarning).
#   1. drop the 'San Francisco County, California' row
#      (NOTE(review): presumably because the San Francisco city row covers
#      the same area — confirm)
#   2. keep only the text before the first comma in NAME
#   3. strip a trailing " city" / " town" suffix; the pattern is anchored to
#      the end so those words are never removed from the middle of a name
#   4. rename "St. Helena" to "St Helena"
final_df = final_df.query("NAME != 'San Francisco County, California'").assign(
    NAME=lambda frame: (
        frame["NAME"]
        .str.split(",")
        .str[0]
        .str.replace(r" (?:city|town)$", "", regex=True)
        .replace({"St. Helena": "St Helena"})
    )
)

In [73]:
# Write the combined table to the output CSV without the DataFrame index
final_df.to_csv(path_or_buf=out_file, index=False)