In [34]:
import numpy as np
import pandas as pd
import requests

In [35]:
# Variable explanation:
# https://api.census.gov/data/2019/acs/acs5/variables.html

In [36]:
##################################### DEMOGRAPHIC DATA ##################################### 

import requests
import pandas as pd

years = range(2012, 2020)
all_demographic_data = []

for year in years:
    api_url = f"https://api.census.gov/data/{year}/acs/acs5"
    params = {
        "get": ",".join([
            "NAME",
            "B19013_001E",   # Median Household Income
            "B01003_001E",   # Total Population
            "B15003_001E",   # Total Population age 25+
            "B25077_001E",   # Price
            "B23025_004E",   #"Employed",
            "B23025_003E",   # Labor Force
            "B25010_001E",   # Avg Household Size
            "B17001_001E",   # Poverty universe total
            "B17001_002E",   # Below poverty
            "B19083_001E",   # Gini Index: Measures income disparity (0 = equal, 1 = very unequal)
            # Education levels — full B15003 block from 002E to 025E
            *[f"B15003_{str(i).zfill(3)}E" for i in range(2, 26)],
            # Race Levels
            "B02001_002E", # White people
            "B02001_003E", # Black or African American
            "B02001_004E", # American Indian and Alaska Native
            "B02001_005E", # Asian

        ]),
        "for": "zip code tabulation area:*",
        "in": "state:06"  # California
    }

    response = requests.get(api_url, params=params)
    if response.status_code == 200:
        data = response.json()
        df = pd.DataFrame(data[1:], columns=data[0])
        
        df.rename(columns={
            "B19013_001E": "Median_Household_Income",
            "B01003_001E": "Total_Population",
            "B15003_001E": "Total_Population_Older_than_25",
            "B25077_001E": "PRICE",
            "B23025_004E": "Employed",
            #"B23025_005E": "Employed",
            "B23025_003E": "Labor_Force",
            "B25010_001E": "Avg_Household_Size",
            "B17001_001E": "Poverty_Universe",
            "B17001_002E": "Below_Poverty",
            "B19083_001E": "Gini_Index",
            "B02001_002E": "White",
            "B02001_003E": "Black_or_African_American",
            "B02001_004E": "American_Indian_and_Alaska_Native", 
            "B02001_005E": "Asian",
            "zip code tabulation area": "ZIP_CODE",
            "B15003_002E": "No_Schooling",
            "B15003_003E": "Nursery_4th_Grade",
            "B15003_004E": "5th_6th_Grade",
            "B15003_005E": "7th_8th_Grade",
            "B15003_006E": "9th_Grade",
            "B15003_007E": "10th_Grade",
            "B15003_008E": "11th_Grade",
            "B15003_009E": "12th_No_Diploma",
            "B15003_010E": "High_School_Diploma",
            "B15003_011E": "GED",
            "B15003_012E": "Some_College_Less_1yr",
            "B15003_013E": "Some_College_More_1yr",
            "B15003_014E": "Associate_Academic",
            "B15003_015E": "Associate_Occupational",
            "B15003_016E": "High_School_Graduate",
            "B15003_017E": "Some_College_No_Degree",
            "B15003_018E": "Associate_Degree",
            "B15003_019E": "Bachelor_Degree",
            "B15003_020E": "Master_Degree",
            "B15003_021E": "Professional_Degree",
            "B15003_022E": "Doctorate_Degree"
        }, inplace=True)

        df["YEAR"] = year
        all_demographic_data.append(df)
    else:
        print(f"Failed for {year}: {response.status_code}")

# Combine all years
demographic_data = pd.concat(all_demographic_data, ignore_index=True)

# Keep only useful columns
demographic_data = demographic_data.drop(columns =["NAME", "state"])

#Employed: B23025_004E
#Labor Force: B23025_003E

demographic_data.head()

Unnamed: 0,Median_Household_Income,Total_Population,Total_Population_Older_than_25,PRICE,Employed,Labor_Force,Avg_Household_Size,Poverty_Universe,Below_Poverty,Gini_Index,...,Doctorate_Degree,B15003_023E,B15003_024E,B15003_025E,White,Black_or_African_American,American_Indian_and_Alaska_Native,Asian,ZIP_CODE,YEAR
0,94317,48208,32192,520400,23871,25476,3.04,47746,2194,0.4157,...,9177,3296,1099,728,37899,756,279,4868,92064,2012
1,50471,45362,28549,374900,18960,20589,3.22,45200,7183,0.4364,...,4472,1442,367,188,34240,1226,168,3853,92069,2012
2,40093,1191,998,195600,261,350,2.16,1055,200,0.4495,...,118,30,35,15,916,41,51,59,92086,2012
3,80590,76797,50728,386100,38290,41826,3.19,73330,5582,0.3471,...,13590,4234,971,948,31389,2657,290,34299,92126,2012
4,125573,47599,31041,838600,22962,24295,2.75,47517,2457,0.4339,...,10552,6802,3275,3083,32713,289,137,11938,92130,2012


In [37]:
####################### Create employment rate #######################

demographic_data["Employment_Rate"] = (
    demographic_data["Employed"].astype(float) /
    demographic_data["Labor_Force"].astype(float)
) * 100


In [38]:
####################### Create below poverty line #######################

# Convert values to numeric for the calculation
demographic_data["Poverty_Universe"] = pd.to_numeric(demographic_data["Poverty_Universe"], errors="coerce")
demographic_data["Below_Poverty"] = pd.to_numeric(demographic_data["Below_Poverty"], errors="coerce")

# Calculate below poverty line
demographic_data["Below_Poverty_Rate"] = (
    demographic_data["Below_Poverty"] / demographic_data["Poverty_Universe"]
) * 100

In [39]:
################################### Rate of college or higher of the 25+ pop age ###########################
college_or_higher_cols = [
    "Bachelor_Degree", "Master_Degree", "Professional_Degree", "Doctorate_Degree"
]

demographic_data["Rate_College_or_Higher"] = (
    demographic_data[college_or_higher_cols].astype(float).sum(axis=1)
    / demographic_data["Total_Population_Older_than_25"].astype(float)
) * 100

In [40]:
########################### RACE PORTION #################

# Portion black
demographic_data["Black_Portion"] = (demographic_data["Black_or_African_American"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

# Portion white
demographic_data["White_Portion"] = (demographic_data["White"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

# Portion American Indian
demographic_data["American_Indian_and_Alaska_Native_Portion"] = (demographic_data["American_Indian_and_Alaska_Native"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100

# Portion Asian
demographic_data["Asian_Portion"] = (demographic_data["Asian"].astype(float) / demographic_data["Total_Population"].astype(float)) * 100


In [41]:
############################ CLEAN SENTINEL VALUES ###################

# Define sentinel value
sentinel_value = "-666666666"

# Remove rows with invalid sentinel prices
demographic_data = demographic_data.loc[demographic_data["PRICE"] != sentinel_value]


In [42]:
demographic_data["PRICE"].value_counts()

PRICE
1000001    199
2000001    107
225000      22
350000      15
275000      15
          ... 
919200       1
866300       1
703000       1
721600       1
44600        1
Name: count, Length: 6469, dtype: int64

In [43]:
# Keep only useful columns
demographic_data = demographic_data.drop(columns =[
    "Poverty_Universe", 
    "Below_Poverty", 
    "Employed", 
    "Labor_Force", 
    "No_Schooling", "Nursery_4th_Grade", "5th_6th_Grade", "7th_8th_Grade",
    "9th_Grade", "10th_Grade", "11th_Grade", "12th_No_Diploma",
    "High_School_Diploma", "GED",
    "Bachelor_Degree", "Master_Degree", "Professional_Degree", "Doctorate_Degree",
    "Some_College_Less_1yr", "Some_College_More_1yr",
    "Associate_Academic", "Associate_Occupational", "Associate_Degree",
    "B15003_023E", "B15003_025E", "B15003_024E", "B15003_023E", "High_School_Graduate",
    "Total_Population_Older_than_25", "Some_College_No_Degree",
    "Asian", "White", "Black_or_African_American", "American_Indian_and_Alaska_Native",
    #"Median_Home_Value"

])

# Preview the raw data
demographic_data.head(10)

Unnamed: 0,Median_Household_Income,Total_Population,PRICE,Avg_Household_Size,Gini_Index,ZIP_CODE,YEAR,Employment_Rate,Below_Poverty_Rate,Rate_College_or_Higher,Black_Portion,White_Portion,American_Indian_and_Alaska_Native_Portion,Asian_Portion
0,94317,48208,520400,3.04,0.4157,92064,2012,93.699953,4.595149,59.977634,1.568204,78.615582,0.578742,10.097909
1,50471,45362,374900,3.22,0.4364,92069,2012,92.088008,15.891593,48.348454,2.702703,75.481681,0.370354,8.493894
2,40093,1191,195600,2.16,0.4495,92086,2012,74.571429,18.957346,50.801603,3.442485,76.91016,4.282116,4.95382
3,80590,76797,386100,3.19,0.3471,92126,2012,91.545928,7.612164,59.140908,3.459771,40.87269,0.377619,44.661901
4,125573,47599,838600,2.75,0.4339,92130,2012,94.513274,5.170781,49.12535,0.607156,68.726234,0.287821,25.080359
5,41920,40881,151400,4.5,0.3848,92236,2012,79.229338,27.806856,20.170857,1.472567,53.330398,0.491671,0.149214
6,27275,3736,155600,2.14,0.5421,92256,2012,75.941873,24.912845,47.236942,0.0,86.697002,3.506424,0.026767
7,26322,2807,58900,2.97,0.4747,92283,2012,76.62989,39.046253,31.847891,0.213751,36.052725,47.488422,1.068757
8,44327,263,26100,1.22,0.2707,92328,2012,98.755187,4.942966,53.138075,0.0,93.91635,4.18251,1.901141
9,29754,5156,119100,2.36,0.5033,92363,2012,88.098434,28.389748,44.009022,3.200155,75.484872,14.429791,1.124903


In [44]:
demographic_data.isna().sum()

Median_Household_Income                      35
Total_Population                              0
PRICE                                        43
Avg_Household_Size                            0
Gini_Index                                    0
ZIP_CODE                                      0
YEAR                                          0
Employment_Rate                              16
Below_Poverty_Rate                            0
Rate_College_or_Higher                        0
Black_Portion                                 0
White_Portion                                 0
American_Indian_and_Alaska_Native_Portion     0
Asian_Portion                                 0
dtype: int64

In [45]:
demographic_data.shape

(13067, 14)

In [46]:
demographic_data["YEAR"].value_counts()

YEAR
2012    1672
2015    1653
2014    1651
2013    1650
2018    1615
2016    1613
2019    1609
2017    1604
Name: count, dtype: int64

In [47]:
demographic_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13067 entries, 0 to 14111
Data columns (total 14 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Median_Household_Income                    13032 non-null  object 
 1   Total_Population                           13067 non-null  object 
 2   PRICE                                      13024 non-null  object 
 3   Avg_Household_Size                         13067 non-null  object 
 4   Gini_Index                                 13067 non-null  object 
 5   ZIP_CODE                                   13067 non-null  object 
 6   YEAR                                       13067 non-null  int64  
 7   Employment_Rate                            13051 non-null  float64
 8   Below_Poverty_Rate                         13067 non-null  float64
 9   Rate_College_or_Higher                     13067 non-null  float64
 10  Black_Portion              

In [None]:
# Make sure the columns are numeric
#col_to_num = ["Median_Household_Income", "Total_Population", "PRICE", "Avg_Household_Size", "Gini_Index"]

#demographic_data[col_to_num] = demographic_data[col_to_num].apply(pd.to_numeric, errors='coerce')


In [51]:
# Save demographic data
demographic_data.to_csv("demographic_data.csv", index=False)