In [50]:
import os
import requests
import pathlib
import getpass
import pandas as pd

# Login name of the account running this notebook, as reported by getpass.getuser().
user = getpass.getuser()

In [51]:
# Folder the output CSV is written to. It is built from the current user's home
# directory (pathlib.Path.home()) rather than assuming that the home directory
# is always "/Users/<login name>".
work_dir = (
    pathlib.Path.home()
    / "Library"
    / "CloudStorage"
    / "Box-Box"
    / "DataViz Projects"
    / "Bay_Area_Census_Website"
    / "census_decennial_download"
)

# Full path of the CSV produced at the end of the notebook.
out_file = work_dir / "census_2020_race_sex_age_jc.csv"

In [52]:
def fetch_census_data(url, timeout=30):
    """Download one Census API query and return it as a DataFrame.

    The JSON payload is treated as a list of lists: the first inner list
    holds the column names and every following list is one data row.
    Values are kept exactly as they arrive in the JSON payload.

    Args:
        url: Full request URL, including the query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive server.

    Returns:
        A pandas DataFrame with one row per data row in the payload, or
        None when the server answers with a status other than 200 or the
        payload contains no rows at all (a message is printed in both cases).

    Raises:
        requests.exceptions.RequestException: if the request itself fails,
            for example when the timeout expires.
    """
    response = requests.get(url, timeout=timeout)

    # Anything other than 200 is reported and signalled to the caller as None.
    if response.status_code != 200:
        print(f"Failed to retrieve data. HTTP Status code: {response.status_code}")
        return None

    data = response.json()

    # Guard against an empty payload, which would otherwise fail with a bare
    # IndexError when the header row is read.
    if not data:
        print("Failed to retrieve data. The response contained no rows.")
        return None

    # First row is the header, everything after it is data.
    columns, *rows = data
    return pd.DataFrame(rows, columns=columns)

In [53]:
# Both queries ask for the same nine county-level ucgid codes, so the
# comma-separated list is assembled once and shared by the two URLs.
_county_ucgids = ",".join(
    f"0500000US{fips}"
    for fips in (
        "06001", "06013", "06041", "06055", "06075",
        "06081", "06085", "06095", "06097",
    )
)

# Table group P2 from the 2020 "dec/pl" dataset.
race_url = f"https://api.census.gov/data/2020/dec/pl?get=group(P2)&ucgid={_county_ucgids}"

# Table group P12 from the 2020 "dec/dhc" dataset.
age_url = f"https://api.census.gov/data/2020/dec/dhc?get=group(P12)&ucgid={_county_ucgids}"


In [54]:
# Download both tables.
race_df = fetch_census_data(race_url)
age_df = fetch_census_data(age_url)

# fetch_census_data returns None when a request fails. Stop here with a clear
# message instead of letting a later cell fail with an AttributeError on None.
for _label, _frame in (("race (P2)", race_df), ("age (P12)", age_df)):
    if _frame is None:
        raise RuntimeError(
            f"Census API request for the {_label} table returned no data; "
            "see the message printed above."
        )

In [55]:
# Readable labels for variables P2_001N through P2_011N, in variable-number order.
_race_labels = [
    "Race Total Population",
    "Hispanic or Latino",
    "Not Hispanic or Latino",
    "Population of One Race",
    "White",
    "Black or African American",
    "American Indian and Alaska Native",
    "Asian",
    "Native Hawaiian and Other Pacific Islander",
    "Some Other Race",
    "Two or More Races",
]

# Variable code -> label, e.g. "P2_001N" -> "Race Total Population".
race_rename_dict = {
    f"P2_{number:03d}N": label
    for number, label in enumerate(_race_labels, start=1)
}

# Relabel the matching columns of race_df in place; other columns keep their names.
race_df.rename(columns=race_rename_dict, inplace=True)

In [56]:
# Age brackets used by the P12 labels, in the order they are numbered under each sex.
_age_brackets = [
    "Under 5", "5 to 9", "10 to 14", "15 to 17", "18 to 19", "20", "21",
    "22 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49",
    "50 to 54", "55 to 59", "60 to 61", "62 to 64", "65 to 66", "67 to 69",
    "70 to 74", "75 to 79", "80 to 84", "85 and over",
]

# Labels in variable-number order: the overall total first, then for each sex
# its own total followed by one label per age bracket (49 labels in all).
_age_labels = ["Age Total Population"]
for _sex in ("Male", "Female"):
    _age_labels.append(f"Total {_sex}")
    _age_labels.extend(f"{_sex} {_bracket}" for _bracket in _age_brackets)

# Variable code -> label, covering "P12_001N" through "P12_049N".
age_rename_dict = {
    f"P12_{number:03d}N": label
    for number, label in enumerate(_age_labels, start=1)
}

# Relabel the matching columns of age_df in place; other columns keep their names.
age_df.rename(columns=age_rename_dict, inplace=True)

In [57]:
# Race columns of interest: the two identifier columns (GEO_ID, NAME) first,
# followed by every renamed race label in dictionary order.
race_out = ["GEO_ID", "NAME", *race_rename_dict.values()]

In [58]:
# Add a "Share <label>" column for every race label: the label's count divided
# by the race total population, rounded to four decimals.
total_col = "Race Total Population"

# Everything in race_out except the identifiers and the total itself.
share_cols = race_out.copy()
for _not_a_share in ("GEO_ID", "NAME", "Race Total Population"):
    share_cols.remove(_not_a_share)

# Convert the denominator once; it is the same for every share.
_race_total = race_df[total_col].astype(int)

for group in share_cols:
    _count = race_df[group].astype(int)
    race_df[f"Share {group}"] = (_count / _race_total).round(4)

In [59]:
# Columns of race_df that go into the final output: every column whose name
# does not start with "P2" (the variables that were never renamed), minus "ucgid".
final_race_out = [
    name
    for name in race_df.columns.to_list()
    if not name.startswith("P2")
]
final_race_out.remove("ucgid")

In [60]:
# Age columns of interest: GEO_ID first, followed by every renamed age label
# in dictionary order. NAME is not added to this list.
age_out = ["GEO_ID", *age_rename_dict.values()]

In [61]:
# Age brackets that have both a "Male <bracket>" and a "Female <bracket>" column.
age_groups = [
    "Under 5", "5 to 9", "10 to 14", "15 to 17", "18 to 19", "20", "21", 
    "22 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", 
    "50 to 54", "55 to 59", "60 to 61", "62 to 64", "65 to 66", "67 to 69", 
    "70 to 74", "75 to 79", "80 to 84", "85 and over"
]

# Add a "Total <bracket>" column (male + female) for each age bracket.
total_cols = []
for group in age_groups:
    male_col = f"Male {group}"
    female_col = f"Female {group}"
    total_col = f"Total {group}"

    if male_col in age_df.columns and female_col in age_df.columns:
        age_df[total_col] = age_df[male_col].astype(int) + age_df[female_col].astype(int)
        # Record the column only once it really exists, so later steps never
        # look up a total that was skipped.
        total_cols.append(total_col)
    else:
        print(f"Skipping {total_col}: column {male_col} and/or {female_col} not found")

# Register the new columns in age_out. Names already present are skipped so
# that re-running this cell does not add duplicates.
new_total_cols = [col for col in total_cols if col not in age_out]
age_out.extend(new_total_cols)

In [62]:
# Add a "Share <label>" column for every entry of age_out other than GEO_ID and
# the total itself: the label's count divided by the age total population,
# rounded to four decimals.
total_col = "Age Total Population"

share_cols = age_out.copy()
for _not_a_share in ("GEO_ID", "Age Total Population"):
    share_cols.remove(_not_a_share)

# Convert the denominator once; it is the same for every share.
_age_total = age_df[total_col].astype(int)

for group in share_cols:
    _count = age_df[group].astype(int)
    age_df[f"Share {group}"] = (_count / _age_total).round(4)

In [63]:
# Columns of age_df that go into the final output: every column whose name
# does not start with "P12" (the variables that were never renamed), minus
# "NAME" and "ucgid".
final_age_out = [
    name
    for name in age_df.columns.to_list()
    if not name.startswith("P12")
]
for _dropped in ("NAME", "ucgid"):
    final_age_out.remove(_dropped)

In [64]:
# Combine the selected race and age columns into one table, matching rows on
# GEO_ID (pandas' default inner join).
_race_part = race_df[final_race_out]
_age_part = age_df[final_age_out]
final_df = _race_part.merge(_age_part, on="GEO_ID")

In [65]:
# Write the merged table to out_file as CSV, leaving out the DataFrame index.
final_df.to_csv(out_file, index=False)