In [13]:
import pandas as pd
# Turn on pandas' "mode.copy_on_write" option for the rest of the session
pd.set_option("mode.copy_on_write", True)
import requests


In [14]:
api_link = "https://api.census.gov/data/2019/acs/acs5"  # see census bureau website

# Query parameters sent to the Census API
col_needed = {
    "get": "NAME,B19013_001E",  # area name plus B19013_001E (Median Household Income)
    "for": "zip code tabulation area:*",  # get all ZIP code tabulation areas
    "in": "state:06",  # ...but only in California (state code 06)
}

# Fetch the data from the API. The timeout stops the cell from hanging forever
# on a stalled connection, and raise_for_status() turns an HTTP error response
# into a clear exception instead of a confusing JSON decoding failure below.
response = requests.get(api_link, params=col_needed, timeout=30)
response.raise_for_status()

# Take the response from the Census API and convert it into a Python object
data = response.json()

# Convert to DataFrame: the first item holds the column names, the rest are rows
columns = data[0]
rows = data[1:]
income_data = pd.DataFrame(rows, columns=columns)

# Clean column names (reassign rather than mutate in place)
income_data = income_data.rename(columns={
    "B19013_001E": "Median_Household_Income",
    "zip code tabulation area": "ZIP_CODE",
})

# Convert income to numeric
income_data["Median_Household_Income"] = pd.to_numeric(income_data["Median_Household_Income"])

income_data.head(5)


Unnamed: 0,NAME,Median_Household_Income,state,ZIP_CODE
0,ZCTA5 93252,36726,6,93252
1,ZCTA5 93270,34023,6,93270
2,ZCTA5 93291,61599,6,93291
3,ZCTA5 93304,40710,6,93304
4,ZCTA5 93306,55133,6,93306


In [15]:
# Show the frame's dimensions as (rows, columns)
income_data.shape

(1764, 4)

In [16]:
# A median income cannot be negative, so negative entries (such as the
# -666666666 sentinel) are placeholders rather than dollar amounts. They are
# kept out of the quartile computation so they cannot drag the cut-offs down.
median_income = income_data["Median_Household_Income"]
has_valid_income = median_income >= 0  # False for sentinels and for NaN

# Define thresholds from valid incomes only
low_thresh = median_income[has_valid_income].quantile(0.25)
high_thresh = median_income[has_valid_income].quantile(0.75)

# Label ZIPs as Low, Mid or High income. "Low Income" is assigned last so that
# it takes precedence if the two thresholds ever coincide.
income_data["Income_Group"] = "Mid"
income_data.loc[median_income >= high_thresh, "Income_Group"] = "High Income"
income_data.loc[median_income <= low_thresh, "Income_Group"] = "Low Income"
# Rows without a usable income get no group instead of counting as "Low Income"
income_data.loc[~has_valid_income, "Income_Group"] = pd.NA

# Keep only the Low and High groups since we're doing only Low vs High
low_vs_high_income = income_data[
    income_data["Income_Group"].isin(["Low Income", "High Income"])
]

In [17]:
# Drop rows whose income is the -666666666 sentinel value
low_vs_high_income = low_vs_high_income[
    low_vs_high_income["Median_Household_Income"].ne(-666666666)
]
low_vs_high_income.head(5)

Unnamed: 0,NAME,Median_Household_Income,state,ZIP_CODE,Income_Group
0,ZCTA5 93252,36726,6,93252,Low Income
1,ZCTA5 93270,34023,6,93270,Low Income
3,ZCTA5 93304,40710,6,93304,Low Income
5,ZCTA5 93307,38415,6,93307,Low Income
12,ZCTA5 94929,119706,6,94929,High Income


In [18]:
from sklearn.metrics import mean_absolute_error

# NOTE(review): `final_data` is not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel. It has to be built first and
# must carry the "Income_Group", "Percent_Change" and "Model_Prediction"
# columns used below - presumably by joining the income groups with the model
# output; confirm where it is supposed to come from.
for group in ["Low Income", "High Income"]:
    group_data = final_data[final_data["Income_Group"] == group]
    if group_data.empty:
        # mean_absolute_error rejects zero-sample input, so report the gap
        # and keep going instead of aborting the whole loop.
        print(f"{group} - no rows to evaluate")
        continue

    y_true = group_data["Percent_Change"]
    y_pred = group_data["Model_Prediction"]

    # MAE measures the typical size of the error; the mean signed residual
    # shows whether the model over-predicts (> 0) or under-predicts (< 0).
    mae = mean_absolute_error(y_true, y_pred)
    residual_bias = (y_pred - y_true).mean()

    print(f"{group} - MAE: {mae:.2f}, Residual Bias: {residual_bias:.2f}")


NameError: name 'final_data' is not defined