In [1]:
import pandas as pd
import os
import numpy as np
import requests

In [2]:
# Variable explanation:
# https://api.census.gov/data/2019/acs/acs5/variables.html

In [3]:
######################################################################
# DEMOGRAPHIC DATA FOR 2012 TO 2019
######################################################################

import requests
import pandas as pd

years = range(2012, 2020) # from 2012 to 2019
all_demographic_data = []

for year in years:
    api_url = f"https://api.census.gov/data/{year}/acs/acs5"
    params = {
        "get": ",".join([
            "NAME",
            "B19013_001E",   # Median Household Income
            "B01003_001E",   # Total Population
            "B15003_001E",   # Total Population age 25+
            "B25077_001E",   # Price
            "B23025_004E",   #"Employed",
            "B23025_003E",   # Labor Force
            "B25010_001E",   # Avg Household Size
            "B17001_001E",   # Poverty universe total
            "B17001_002E",   # Below poverty
            "B19083_001E",   # Gini Index: Measures income disparity (0 = equal, 1 = very unequal)
            # Education levels — full B15003 block from 002E to 025E
            "B15003_019E",
            "B15003_020E",
            "B15003_021E",
            "B15003_022E",
            # Race Levels
            "B02001_002E", # White people
            "B02001_003E", # Black or African American
            "B02001_004E", # American Indian and Alaska Native
            "B02001_005E", # Asian

        ]),
        "for": "tract:*", # census track 
        "in": "state:06 county:*" # census track for california

    }

    response = requests.get(api_url, params=params)
    if response.status_code == 200:
        data = response.json()
        df = pd.DataFrame(data[1:], columns=data[0])
        
        df.rename(columns={
            "B19013_001E": "Median_Household_Income",
            "B01003_001E": "Total_Population",
            "B15003_001E": "Total_Population_Older_than_25",
            "B25077_001E": "PRICE",
            "B23025_004E": "Employed",
            "B23025_003E": "Labor_Force",
            "B25010_001E": "Avg_Household_Size",
            "B17001_001E": "Poverty_Universe",
            "B17001_002E": "Below_Poverty",
            "B19083_001E": "Gini_Index",
            "B02001_002E": "White",
            "B02001_003E": "Black_or_African_American",
            "B02001_004E": "American_Indian_and_Alaska_Native", 
            "B02001_005E": "Asian",
            "zip code tabulation area": "ZIP_CODE",
            "B15003_019E": "Bachelor_Degree",
            "B15003_020E": "Master_Degree",
            "B15003_021E": "Professional_Degree",
            "B15003_022E": "Doctorate_Degree"
        }, inplace=True)

        df["YEAR"] = year
        all_demographic_data.append(df)
    else:
        print(f"Failed for {year}: {response.status_code}")

# Combine all years
demographic_data_2012_2019 = pd.concat(all_demographic_data, ignore_index=True)

# Keep only useful columns
demographic_data_2012_2019 = demographic_data_2012_2019.drop(columns =["NAME", "state"])

#Employed: B23025_004E
#Labor Force: B23025_003E

demographic_data_2012_2019.head()

Unnamed: 0,Median_Household_Income,Total_Population,Total_Population_Older_than_25,PRICE,Employed,Labor_Force,Avg_Household_Size,Poverty_Universe,Below_Poverty,Gini_Index,...,Master_Degree,Professional_Degree,Doctorate_Degree,White,Black_or_African_American,American_Indian_and_Alaska_Native,Asian,county,tract,YEAR
0,74451,4122,2848,380600,1843,2120,2.66,4122,263,0.4228,...,589,237,647,3649,4,17,158,73,19809,2012
1,68542,4508,3329,394000,1697,1864,2.55,4503,164,0.3917,...,387,420,775,3590,1,24,273,73,20026,2012
2,80538,13407,8890,548100,5238,5700,2.69,13407,1483,0.4883,...,1415,1481,3005,11101,243,38,840,73,20027,2012
3,95855,4076,2695,468500,2286,2410,2.4,3944,299,0.3372,...,383,204,1016,2734,326,7,678,73,17056,2012
4,41073,6274,4308,119000,2551,2860,2.47,6230,720,0.4001,...,1066,393,550,3774,321,0,302,73,18614,2012


In [4]:

######################################################################
# DEMOGRAPHIC DATA FOR 2020 TO 2023
######################################################################

years = range(2020, 2024)  # From 2020 to 2023
all_demographic_data_2020s = []

for year in years:
    print(f"\n===== Processing Year {year} =====")
    
    api_url = f"https://api.census.gov/data/{year}/acs/acs5"
    print(f"API URL: {api_url}")
    
    # Your requested variables
    variables = [
        "B19013_001E",  # Median Household Income
        "B01003_001E",  # Total Population
        "B15003_001E",  # Total Population age 25+
        "B25077_001E",  # Median House Value (Price)
        "B23025_004E",  # Employed
        "B23025_003E",  # Labor Force
        "B25010_001E",  # Avg Household Size
        "B17001_001E",  # Poverty Universe
        "B17001_002E",  # Below Poverty
        "B19083_001E",  # Gini Index
        "B02001_002E",  # White
        "B02001_003E",  # Black or African American
        "B02001_004E",  # American Indian and Alaska Native
        "B02001_005E",  # Asian
        "B15003_022E",  # Bachelor's Degree
        "B15003_023E",  # Master's Degree
        "B15003_024E",  # Professional Degree
        "B15003_025E",  # Doctorate Degree
    ]

    params = {
        "get": ",".join(variables),
        "for": "tract:*",            # <=== THIS IS THE KEY CHANGE
        "in": "state:06 county:*",    # California (state 06), all counties
    }
    
    response = requests.get(api_url, params=params)

    if response.status_code == 200:
        print(f"✅ Success for {year}")
        data = response.json()
        df = pd.DataFrame(data[1:], columns=data[0])
        
        # Rename columns to human-readable names
        df.rename(columns={
            "B19013_001E": "Median_Household_Income",
            "B01003_001E": "Total_Population",
            "B15003_001E": "Total_Population_Older_than_25",
            "B25077_001E": "PRICE",
            "B23025_004E": "Employed",
            "B23025_003E": "Labor_Force",
            "B25010_001E": "Avg_Household_Size",
            "B17001_001E": "Poverty_Universe",
            "B17001_002E": "Below_Poverty",
            "B19083_001E": "Gini_Index",
            "B02001_002E": "White",
            "B02001_003E": "Black_or_African_American",
            "B02001_004E": "American_Indian_and_Alaska_Native",
            "B02001_005E": "Asian",
            "B15003_022E": "Bachelor_Degree",
            "B15003_023E": "Master_Degree",
            "B15003_024E": "Professional_Degree",
            "B15003_025E": "Doctorate_Degree",
        }, inplace=True)
        
        df["YEAR"] = year
        
        all_demographic_data_2020s.append(df)
        
    else:
        print(f"❌ Failed for year {year}: Status {response.status_code}")
        print(f"Response Text: {response.text}")


# Combine all years together
demographic_data_2020_2023 = pd.concat(all_demographic_data_2020s, ignore_index=True)

# Preview
demographic_data_2020_2023.head()



===== Processing Year 2020 =====
API URL: https://api.census.gov/data/2020/acs/acs5
✅ Success for 2020

===== Processing Year 2021 =====
API URL: https://api.census.gov/data/2021/acs/acs5
✅ Success for 2021

===== Processing Year 2022 =====
API URL: https://api.census.gov/data/2022/acs/acs5
✅ Success for 2022

===== Processing Year 2023 =====
API URL: https://api.census.gov/data/2023/acs/acs5
✅ Success for 2023


Unnamed: 0,Median_Household_Income,Total_Population,Total_Population_Older_than_25,PRICE,Employed,Labor_Force,Avg_Household_Size,Poverty_Universe,Below_Poverty,Gini_Index,...,American_Indian_and_Alaska_Native,Asian,Bachelor_Degree,Master_Degree,Professional_Degree,Doctorate_Degree,state,county,tract,YEAR
0,78856,4367,3034,391800,2118,2337,2.84,4263,340,0.3733,...,67,936,398,134,24,0,6,13,313102,2020
1,48316,2740,1962,365900,1365,1469,2.88,2740,703,0.5751,...,59,571,117,79,18,0,6,13,313104,2020
2,62422,4701,3087,346900,2175,2447,2.75,4681,521,0.3947,...,11,533,355,151,21,0,6,13,313105,2020
3,130091,6689,4479,523000,3441,3678,3.38,6643,627,0.3231,...,30,1147,899,207,134,38,6,13,313106,2020
4,122727,2656,1782,459600,1286,1420,3.69,2651,349,0.3591,...,0,352,326,124,0,0,6,13,313107,2020


In [5]:
#######################################################################
# CHECK IF TRACK CENSUS MATCH IN BOTH DATASETS
######################################################################

# Track census changes every year: they are merged or even divided over the years so some old track census might no longer exists while new ones are created

# Step 1. Get the list of unique GEO IDs for each time range
geo_2012_2019 = set(demographic_data_2012_2019["county"] + demographic_data_2012_2019["tract"])
geo_2020_2023 = set(demographic_data_2020_2023["county"] + demographic_data_2020_2023["tract"])

# Step 2. Compare
print(f"2012-2019 unique tracts: {len(geo_2012_2019)}")
print(f"2020-2023 unique tracts: {len(geo_2020_2023)}")

# Step 3. Find missing tracts (if any)
missing_in_2020 = geo_2012_2019 - geo_2020_2023
missing_in_2012 = geo_2020_2023 - geo_2012_2019

print(f"Tracts missing in 2020-2023 but present in 2012-2019: {len(missing_in_2020)}")
print(f"Tracts missing in 2012-2019 but present in 2020-2023: {len(missing_in_2012)}")


2012-2019 unique tracts: 8057
2020-2023 unique tracts: 9129
Tracts missing in 2020-2023 but present in 2012-2019: 1176
Tracts missing in 2012-2019 but present in 2020-2023: 2248


In [6]:
#######################################################################
# FILTER BOTH DATASETS TO KEEP COMMON TRACK CENSUS
######################################################################

# Find tracts that are common to both time periods
common_geo_ids = geo_2012_2019.intersection(geo_2020_2023)

print(len(common_geo_ids))

# Filter your data
demographic_data_2012_2019 = demographic_data_2012_2019[
    (demographic_data_2012_2019["county"] + demographic_data_2012_2019["tract"]).isin(common_geo_ids)
]

demographic_data_2020_2023 = demographic_data_2020_2023[
    (demographic_data_2020_2023["county"] + demographic_data_2020_2023["tract"]).isin(common_geo_ids)
]


6881


In [7]:
######################################################################
# MERGE DEMOGRAPHIC DATA 2012-2019 AND 2020-2023
######################################################################

demographic_data = pd.concat([demographic_data_2012_2019, demographic_data_2020_2023], ignore_index=True)
demographic_data.shape

(82572, 22)

In [8]:
######################################################################
# CREATE A UNIQUE IDENTIFIER FOR EACH CENSUS TRACK: COUNTY + ZIPCODE 
######################################################################

demographic_data["GEO_UNIQUE_ID"] = demographic_data["county"] + demographic_data["tract"]
demographic_data.head()

Unnamed: 0,Median_Household_Income,Total_Population,Total_Population_Older_than_25,PRICE,Employed,Labor_Force,Avg_Household_Size,Poverty_Universe,Below_Poverty,Gini_Index,...,Doctorate_Degree,White,Black_or_African_American,American_Indian_and_Alaska_Native,Asian,county,tract,YEAR,state,GEO_UNIQUE_ID
0,74451,4122,2848,380600,1843,2120,2.66,4122,263,0.4228,...,647,3649,4,17,158,73,19809,2012,,73019809
1,68542,4508,3329,394000,1697,1864,2.55,4503,164,0.3917,...,775,3590,1,24,273,73,20026,2012,,73020026
2,95855,4076,2695,468500,2286,2410,2.4,3944,299,0.3372,...,1016,2734,326,7,678,73,17056,2012,,73017056
3,35822,3733,2065,137500,1600,1672,3.84,3705,1370,0.2908,...,149,3164,37,0,146,73,20028,2012,,73020028
4,31106,4911,3095,115800,1908,2023,2.58,4859,851,0.3604,...,289,3764,163,33,231,73,20029,2012,,73020029


In [9]:
######################################################################
# CREATE EMPLOYEMENT RATE
######################################################################

demographic_data["Employment_Rate"] = (
    demographic_data["Employed"].astype(float) /
    demographic_data["Labor_Force"].astype(float)
) * 100

In [10]:
######################################################################
# CREATE POVERTY LINE
######################################################################

# Convert values to numeric for the calculation
demographic_data["Poverty_Universe"] = pd.to_numeric(demographic_data["Poverty_Universe"], errors="coerce")
demographic_data["Below_Poverty"] = pd.to_numeric(demographic_data["Below_Poverty"], errors="coerce")

# Calculate below poverty line
demographic_data["Below_Poverty_Rate"] = (
    demographic_data["Below_Poverty"] / demographic_data["Poverty_Universe"]
) * 100

In [11]:
######################################################################
# CREATE RATE OF COLLEGES OR HIGHER OF THE 25+ POP AGE 
######################################################################

college_or_higher_cols = [
    "Bachelor_Degree", "Master_Degree", "Professional_Degree", "Doctorate_Degree"
]

demographic_data["Rate_College_or_Higher"] = (
    demographic_data[college_or_higher_cols].astype(float).sum(axis=1)
    / demographic_data["Total_Population_Older_than_25"].astype(float)
) * 100

In [12]:
######################################################################
# CREATE RACE RATE
######################################################################

# Portion black
demographic_data["Black_Portion"] = (demographic_data["Black_or_African_American"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

# Portion white
demographic_data["White_Portion"] = (demographic_data["White"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

# Portion American Indian
demographic_data["American_Indian_and_Alaska_Native_Portion"] = (demographic_data["American_Indian_and_Alaska_Native"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

# Portion Asian
demographic_data["Asian_Portion"] = (demographic_data["Asian"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

In [13]:
######################################################################
# CLEAN SENTINEL VALUES AND DROP NA'S IN PRICE
######################################################################

# Define sentinel value
sentinel_value = "-666666666"

# Remove rows with invalid sentinel prices
demographic_data = demographic_data.loc[demographic_data["PRICE"] != sentinel_value]

# drops any row where PRICE is NaN
demographic_data = demographic_data.dropna(subset=['PRICE'])



In [14]:
demographic_data["PRICE"].value_counts()

PRICE
2000001    1282
1000001     858
400000       55
350000       54
450000       51
           ... 
1151400       1
1022700       1
1052700       1
1358800       1
947600        1
Name: count, Length: 13510, dtype: int64

In [15]:
######################################################################
# KEEP ONLY USEFUL CLASSES
######################################################################
demographic_data = demographic_data.drop(columns =[
    "Poverty_Universe", 
    "Below_Poverty", 
    "Employed", 
    "Labor_Force", 
    "Bachelor_Degree", "Master_Degree", "Professional_Degree", "Doctorate_Degree",
    "Total_Population_Older_than_25",
    "Asian", "White", "Black_or_African_American", "American_Indian_and_Alaska_Native",
    "state"
    #"Median_Home_Value"

])

# Preview the raw data
demographic_data.head(10)

Unnamed: 0,Median_Household_Income,Total_Population,PRICE,Avg_Household_Size,Gini_Index,county,tract,YEAR,GEO_UNIQUE_ID,Employment_Rate,Below_Poverty_Rate,Rate_College_or_Higher,Black_Portion,White_Portion,American_Indian_and_Alaska_Native_Portion,Asian_Portion
0,74451,4122,380600,2.66,0.4228,73,19809,2012,73019809,86.933962,6.380398,61.16573,0.09704,88.524988,0.412421,3.833091
1,68542,4508,394000,2.55,0.3917,73,20026,2012,73020026,91.040773,3.642016,52.83869,0.022183,79.636202,0.532387,6.055901
2,95855,4076,468500,2.4,0.3372,73,17056,2012,73017056,94.854772,7.581136,64.675325,7.998037,67.075564,0.171737,16.633955
3,35822,3733,137500,3.84,0.2908,73,20028,2012,73020028,95.69378,36.977058,20.726392,0.99116,84.757568,0.0,3.911063
4,31106,4911,115800,2.58,0.3604,73,20029,2012,73020029,94.315373,17.513892,33.505654,3.31908,76.644268,0.671961,4.703726
5,83105,5543,448400,2.36,0.41,59,52425,2012,59052425,93.067966,5.412232,68.073032,1.17265,74.147574,0.252571,17.770161
6,102500,2661,615300,3.02,0.3437,59,52506,2012,59052506,95.599188,6.313416,60.401955,5.862458,53.100338,0.375799,35.700864
7,76958,8571,593300,2.97,0.4495,59,52524,2012,59052524,90.431148,18.842609,57.06316,1.86676,57.554544,0.0,27.06802
8,29590,16855,422400,2.63,0.6495,59,62614,2012,59062614,93.423943,44.728123,36.003066,1.459508,49.255414,0.931474,37.899733
9,46045,4467,215200,2.34,0.448,59,62625,2012,59062625,90.340644,12.558764,52.999699,1.119319,62.032684,0.0,9.133647


In [16]:
demographic_data.isna().sum()

# should i remove na's

Median_Household_Income                      6
Total_Population                             0
PRICE                                        0
Avg_Household_Size                           0
Gini_Index                                   0
county                                       0
tract                                        0
YEAR                                         0
GEO_UNIQUE_ID                                0
Employment_Rate                              0
Below_Poverty_Rate                           0
Rate_College_or_Higher                       0
Black_Portion                                0
White_Portion                                0
American_Indian_and_Alaska_Native_Portion    0
Asian_Portion                                0
dtype: int64

In [17]:
# Save demographic data
demographic_data["YEAR"] = demographic_data["YEAR"].astype(int)
demographic_data["GEO_UNIQUE_ID"] = demographic_data["GEO_UNIQUE_ID"].astype(str)
demographic_data.to_csv("demographic_data.csv", index=False)

In [18]:
demographic_data.shape

(80176, 16)