In [1]:
import os
import sys
import pandas as pd, geopandas as gp, numpy as np
import getpass
from arcgis import GIS

# Current login name, used to build the path to the local dvutils clone below.
user = getpass.getuser()

# NOTE(review): this hardcodes a macOS-style home layout ("/Users/<name>/Documents/GitHub");
# it will not resolve on other platforms or for clones kept elsewhere — consider making it configurable.
DVUTILS_LOCAL_CLONE_PATH = f"/Users/{user}/Documents/GitHub/dvutils"
# Put the clone first on the module search path so the `utils_io` import that follows can be found.
sys.path.insert(0, DVUTILS_LOCAL_CLONE_PATH)
from utils_io import *

In [2]:
# Read in the two tract-level EPC tables (file names suggest ACS 2018 and ACS 2022 vintages — TODO confirm).
# The GEOID columns are forced to str, presumably so FIPS-style ids keep any leading zeros.
# Paths are relative to the notebook's working directory.
epc_2050 = pd.read_csv("Data/epc_acs2018.csv", dtype={"geoid": str})
epc_2050p_2022 = pd.read_csv("Data/epc_acs2022.csv", dtype={"tract_geoid": str})

In [3]:
# Lookup from county FIPS code (integer keys) to county name for the nine counties listed here.
county_fips_dict = {
    1: "Alameda",
    13: "Contra Costa",
    41: "Marin",
    55: "Napa",
    75: "San Francisco",
    81: "San Mateo",
    85: "Santa Clara",
    95: "Solano",
    97: "Sonoma",
}

# Add a readable county name to both tables. The keys are ints, so `county_fip` must hold
# integer codes; any value not in the dict maps to NaN and would drop out of the
# groupby("county") summaries further down.
epc_2050["county"] = epc_2050["county_fip"].map(county_fips_dict)
epc_2050p_2022["county"] = epc_2050p_2022["county_fip"].map(county_fips_dict)

## Summarize EPC determination

In [4]:
def list_check(list1, list2):
    """Report whether every item of ``list1`` is also present in ``list2``.

    Args:
        list1 (list): Items that must all be found.
        list2 (list): Collection each item is looked up in.

    Returns:
        boolean: True when no item of list1 is missing from list2 (an empty
        list1 therefore yields True); False as soon as one item is not found.
    """
    for candidate in list1:
        if candidate not in list2:
            return False
    return True

In [5]:
def set_temp_determination(x, epc_flag_col):
    """List the labels of a row whose value equals 1.

    Args:
        x (Series): One row holding the factor flags plus the EPC flag, indexed by column label.
        epc_flag_col (string): Label of the EPC flag entry in ``x``.

    Returns:
        list or None: When the EPC flag is 1, every other label whose value is 1
        (possibly an empty list). Otherwise the labels whose value is 1, or
        None when there are none.
    """
    flagged_labels = x[x == 1].index

    # EPC rows: report the factors only, leaving the EPC flag itself out of the list.
    if x[epc_flag_col] == 1:
        return flagged_labels.drop(epc_flag_col).tolist()

    # Non-EPC rows: still report any flagged factors; None marks "nothing flagged".
    names = flagged_labels.tolist()
    return names if names else None

In [6]:
def set_final_determination(x):
    """Collapse a factor list to the Low-Income + People of Color pair when both are present.

    Args:
        x (list or None): Factor labels flagged for a tract, or None when nothing was flagged.

    Returns:
        list or None: ``["Low-Income", "People of Color"]`` when ``x`` contains both
        labels (any other factors are dropped); otherwise ``x`` unchanged, including None.
    """
    low_income_poc = ["Low-Income", "People of Color"]

    # Identity test: `is not None` is the idiomatic None check and cannot be
    # hijacked by an object that overrides `!=`.
    if x is not None and list_check(low_income_poc, x):
        return low_income_poc
    return x

In [10]:
## Create an EPC determination classification function that returns a class for summary stats

def set_determination_class(x):
    """Classify a tract's factor list into one of three summary classes.

    Args:
        x (list or None): Factor labels for a tract, or None when nothing was flagged.

    Returns:
        string: ``"1: Low-Income and PoC"`` when both "Low-Income" and
        "People of Color" are in ``x``; ``"2: Low-Income and 3 or more other factors"``
        when "Low-Income" is in ``x`` and ``x`` has at least four entries;
        ``"3: Not an EPC"`` otherwise (including None).
    """
    low_income_poc = ["Low-Income", "People of Color"]

    if x is None:
        return "3: Not an EPC"
    if list_check(low_income_poc, x):
        return "1: Low-Income and PoC"
    # "Low-Income" itself counts toward the length, so 4 entries means
    # Low-Income plus at least 3 other factors.
    if "Low-Income" in x and len(x) >= 4:
        return "2: Low-Income and 3 or more other factors"
    return "3: Not an EPC"

In [11]:
## Create a function that indicates the column variable(s) that determined whether a tract was an EPC

def epc_determination(df, epc_flag_col):
    """Returns a copy of an EPC dataframe annotated with the variable(s) behind each tract's EPC status.

    The input dataframe is not modified. In the returned copy the eight ``*_1_2``
    factor flag columns are renamed to descriptive labels and three columns are added:

    * ``epc_determination_list``: list of factor labels (or None) per tract.
    * ``EPC Determination``: the same list joined with ", " (or None).
    * ``EPC Determination Class``: summary class from ``set_determination_class``.

    Args:
        df (dataframe): Equity Priority Communities dataframe holding the factor
            flag columns and ``epc_flag_col``.
        epc_flag_col (string): Name of column with epc flag.

    Returns:
        dataframe: Renamed and annotated copy of ``df``.

    Raises:
        KeyError: If a factor flag column or ``epc_flag_col`` is missing.
    """
    ## descriptive labels for the factor flag columns (dict order sets the order of the output lists)
    flag_labels = {
        "poc_1_2": "People of Color",
        "below2_1_2": "Low-Income",
        "over75_1_2": "Seniors 75 Years and Over",
        "spfam_1_2": "Single Parent Family",
        "lep_1_2": "Limited English Proficiency",
        "disab_1_2": "Disability",
        "hus_re_1_2": "Rent Burdened",
        "zvhh_1_2": "Zero-Vehicle Households",
    }

    ## rename returns a new dataframe, so the caller's dataframe is left untouched
    new_df = df.rename(columns=flag_labels)

    ## factor columns plus the epc flag column are the inputs to the determination
    cols = [*flag_labels.values(), epc_flag_col]

    ## fail early with a readable message instead of a bare indexing error
    missing = [col for col in cols if col not in new_df.columns]
    if missing:
        raise KeyError(f"epc_determination: missing required column(s): {missing}")

    ## per-row list of all flagged variables (kept local, never stored on the dataframe)
    temp_determination = new_df[cols].apply(
        lambda row: set_temp_determination(row, epc_flag_col), axis=1
    )

    ## variable(s) that determined whether a tract was an epc
    new_df["epc_determination_list"] = temp_determination.apply(set_final_determination)

    ## human-readable version of the determination list
    new_df["EPC Determination"] = new_df["epc_determination_list"].apply(
        lambda labels: ", ".join(labels) if labels is not None else None
    )

    ## classification of the epc determination, used for summary stats
    new_df["EPC Determination Class"] = new_df["epc_determination_list"].apply(set_determination_class)

    return new_df

In [12]:
# Set EPC determination for the 2018 data. epc_determination() works on a copy,
# so `epc_2050` keeps its original `*_1_2` column names for the cells below.
epc_2018_det = epc_determination(epc_2050, epc_flag_col="epc_2050")

# Set EPC determination for the 2022 data (its EPC flag column is named "epc_2050p").
epc_2022_det = epc_determination(epc_2050p_2022, epc_flag_col="epc_2050p")

In [40]:
## Aggregate EPC determination classes per vintage and compare them side by side.
def _count_by_determination_class(det_df):
    """Return one row per "EPC Determination Class" with its tract count in "Summary Count"."""
    return (
        det_df.groupby(["EPC Determination Class"])
        .size()
        .reset_index(name="Summary Count")
        .sort_values(by=["EPC Determination Class"])
    )


epc_2018_det_agg = _count_by_determination_class(epc_2018_det)
epc_2022_det_agg = _count_by_determination_class(epc_2022_det)

# Outer join: a class that has no tracts in one year must still appear (as 0);
# an inner join would drop it and make the per-year totals undercount.
epc_det_agg_merge = pd.merge(
    epc_2018_det_agg,
    epc_2022_det_agg,
    on="EPC Determination Class",
    how="outer",
    suffixes=(" 2018", " 2022"),
)

# Missing classes come back as NaN (float) from the outer join; restore integer counts.
summary_count_cols = ["Summary Count 2018", "Summary Count 2022"]
epc_det_agg_merge[summary_count_cols] = (
    epc_det_agg_merge[summary_count_cols].fillna(0).astype("int64")
)

In [41]:
# Sanity check: total number of 2018 tracts across all determination classes.
epc_det_agg_merge["Summary Count 2018"].sum()

1588

In [42]:
# Sanity check: total number of 2022 tracts across all determination classes.
epc_det_agg_merge["Summary Count 2022"].sum()

1772

In [59]:
# Create a flag column for low-income AND people-of-color tracts (1 = both flags set, else 0).
# Both flags are compared to 1 explicitly: a bare truthiness test on `poc_1_2` would
# also accept NaN or any other non-zero value as "flagged".
for _epc_df in (epc_2050, epc_2050p_2022):
    _epc_df["li_poc"] = (
        _epc_df["below2_1_2"].eq(1) & _epc_df["poc_1_2"].eq(1)
    ).astype("int64")

In [60]:
# Create a flag column for low-income tracts with 3 or more of the other (non-PoC) factors.
# Vectorized: the six factor flags are summed across each row in one pass
# instead of running a Python lambda per row.
other_factor_cols = ["over75_1_2", "spfam_1_2", "lep_1_2", "disab_1_2", "hus_re_1_2", "zvhh_1_2"]

for _epc_df in (epc_2050, epc_2050p_2022):
    _epc_df["li_3ormore"] = (
        _epc_df["below2_1_2"].eq(1) & _epc_df[other_factor_cols].sum(axis=1).ge(3)
    ).astype("int64")

In [65]:
# Create a flag column for tracts that meet the low-income + PoC test only
# (li_poc is 1 while li_3ormore is 0). Vectorized instead of a per-row lambda.
for _epc_df in (epc_2050, epc_2050p_2022):
    _epc_df["li_poc_only"] = (
        _epc_df["li_poc"].eq(1) & _epc_df["li_3ormore"].eq(0)
    ).astype("int64")

In [70]:
# Create a flag column for tracts that meet the low-income + 3-or-more-factors test only
# (li_3ormore is 1 while li_poc is 0). Vectorized instead of a per-row lambda.
for _epc_df in (epc_2050, epc_2050p_2022):
    _epc_df["li_3ormore_only"] = (
        _epc_df["li_poc"].eq(0) & _epc_df["li_3ormore"].eq(1)
    ).astype("int64")

In [71]:
# Create a flag column where both li_poc and li_3ormore are 1 (tract meets both tests).
# Vectorized instead of a per-row lambda.
for _epc_df in (epc_2050, epc_2050p_2022):
    _epc_df["li_poc_and_li_3ormore"] = (
        _epc_df["li_poc"].eq(1) & _epc_df["li_3ormore"].eq(1)
    ).astype("int64")

In [82]:
# Per-county totals of every flag column in the 2018 table (number of tracts flagged).
# NOTE(review): the cell that follows assigns `epc_2050_agg` again (region-wide totals),
# so on a top-to-bottom run this per-county table is overwritten before anything
# reads it — consider giving one of the two its own name.
epc_2050_agg = (
    epc_2050.groupby("county")
    .agg(
        {
            "poc_1_2": "sum",
            "below2_1_2": "sum",
            "li_poc": "sum",
            "li_poc_only": "sum",
            "li_3ormore": "sum",
            "li_3ormore_only": "sum",
            "li_poc_and_li_3ormore": "sum",
        }
    )
    # The grouped result has no "index" or 0 column, so the rename copied from
    # the region-wide cell did nothing here and has been removed.
    .reset_index()
)


In [76]:
# Region-wide totals of every flag column, in long form (one row per variable).
FLAG_SUMMARY_COLS = [
    "poc_1_2",
    "below2_1_2",
    "li_poc",
    "li_poc_only",
    "li_3ormore",
    "li_3ormore_only",
    "li_poc_and_li_3ormore",
]


def _sum_flag_columns(df, with_percent=False):
    """Sum each flag column of ``df`` into a long-form table.

    Args:
        df (dataframe): Tract table containing every column in FLAG_SUMMARY_COLS.
        with_percent (bool): When True, add a "percent" column holding each
            count as a percentage of the number of rows in ``df``, rounded to 2 decimals.

    Returns:
        dataframe: Columns "variable" and "count" (plus "percent" when requested).
    """
    totals = (
        df.agg({col: "sum" for col in FLAG_SUMMARY_COLS})
        .reset_index()
        # Series.reset_index() names the columns "index" and 0; give them real names.
        .rename(columns={"index": "variable", 0: "count"})
    )
    if with_percent:
        totals["percent"] = np.round((totals["count"] / df.shape[0]) * 100, 2)
    return totals


# NOTE(review): the recorded output of `epc_agg_merge` further down shows "percent"
# columns, so it was produced with the percent calculation active. That calculation
# was commented out here; pass with_percent=True to reproduce the recorded table.
epc_2050_agg = _sum_flag_columns(epc_2050)
epc_2050p_2022_agg = _sum_flag_columns(epc_2050p_2022)

In [80]:
# Number of rows (tracts) in the 2022 table, for comparison with the 2022 class total above.
epc_2050p_2022.shape[0]

1772

In [77]:
# Put the 2018 and 2022 flag totals side by side, one row per variable.
# Every column other than the join key exists in both tables, so each one
# receives the " 2018" / " 2022" suffix.
epc_agg_merge = epc_2050_agg.merge(
    epc_2050p_2022_agg,
    on="variable",
    suffixes=(" 2018", " 2022"),
)

In [78]:
# Display the merged flag totals.
# NOTE(review): the recorded output below includes "percent 2018" / "percent 2022" columns,
# but the percent calculation is commented out in the aggregation cell; a fresh
# Restart & Run All would show the count columns only.
epc_agg_merge

Unnamed: 0,variable,count 2018,percent 2018,count 2022,percent 2022
0,poc_1_2,577,36.34,655,36.96
1,below2_1_2,421,26.51,467,26.35
2,li_poc,273,17.19,281,15.86
3,li_poc_only,97,6.11,107,6.04
4,li_3ormore,242,15.24,246,13.88
5,li_3ormore_only,66,4.16,72,4.06
6,li_poc_and_li_3ormore,176,11.08,174,9.82


In [79]:
# Display tract counts per EPC determination class for both vintages.
epc_det_agg_merge

Unnamed: 0,EPC Determination Class,Summary Count 2018,Summary Count 2022
0,1: Low-Income and PoC,273,281
1,2: Low-Income and 3 or more other factors,66,72
2,3: Not an EPC,1249,1419


## Summarize EPC change

In [93]:
# Per-county summary for each vintage: number of tracts (non-null GEOIDs) and
# number of EPC tracts (sum of the EPC flag column).
epc_2050_summary = epc_2050.groupby("county", as_index=False).agg(
    total_tracts=("geoid", "count"),
    epc=("epc_2050", "sum"),
)
epc_2050p_summary = epc_2050p_2022.groupby("county", as_index=False).agg(
    total_tracts=("tract_geoid", "count"),
    epc=("epc_2050p", "sum"),
)

In [94]:
# Display the 2018 per-county tract and EPC counts.
epc_2050_summary

Unnamed: 0,county,total_tracts,epc
0,Alameda,361,101
1,Contra Costa,208,50
2,Marin,56,4
3,Napa,40,5
4,San Francisco,197,51
5,San Mateo,158,22
6,Santa Clara,372,63
7,Solano,96,28
8,Sonoma,100,15


In [95]:
# Display the 2022 per-county tract and EPC counts.
epc_2050p_summary

Unnamed: 0,county,total_tracts,epc
0,Alameda,379,101
1,Contra Costa,242,50
2,Marin,63,7
3,Napa,40,5
4,San Francisco,244,60
5,San Mateo,174,19
6,Santa Clara,408,69
7,Solano,100,27
8,Sonoma,122,15


In [106]:
# Join the 2018 and 2022 county summaries; the shared `total_tracts` and `epc`
# columns are told apart by the "_2018" / "_2022" suffixes.
epc_summary_merge = epc_2050_summary.merge(
    epc_2050p_summary,
    on="county",
    suffixes=("_2018", "_2022"),
)

In [116]:
# Calculate each county's share of tracts that are EPCs, per vintage.
# Despite "percent" in the column names, the values are proportions (0-1) rounded to 2 decimals.
tracts_2018 = epc_summary_merge["total_tracts_2018"]
tracts_2022 = epc_summary_merge["total_tracts_2022"]

epc_summary_merge["epc_percent_2018"] = epc_summary_merge["epc_2018"].div(tracts_2018).round(2)
epc_summary_merge["epc_percent_2022"] = epc_summary_merge["epc_2022"].div(tracts_2022).round(2)

In [120]:
# Calculate change in EPC tracts between 2018 and 2022.
# "pct_change" is the difference between the two county shares (2022 minus 2018),
# i.e. a change in proportion, not a relative percent change.
# It is computed from the unrounded shares and rounded once at the end: subtracting
# the already-rounded columns compounds rounding error and leaves float artifacts
# (e.g. 0.27 - 0.28 = -0.010000000000000009) in exported values.
_share_2018 = epc_summary_merge["epc_2018"] / epc_summary_merge["total_tracts_2018"]
_share_2022 = epc_summary_merge["epc_2022"] / epc_summary_merge["total_tracts_2022"]
epc_summary_merge["pct_change"] = (_share_2022 - _share_2018).round(2)

# "num_change" is the change in the number of EPC tracts.
epc_summary_merge["num_change"] = epc_summary_merge["epc_2022"] - epc_summary_merge["epc_2018"]

In [118]:
# Column order for the exported table: totals, EPC counts, shares, then the change columns.
reorder_cols = [
    "county",
    "total_tracts_2018",
    "total_tracts_2022",
    "epc_2018",
    "epc_2022",
    "epc_percent_2018",
    "epc_percent_2022",
    "num_change",
    "pct_change",
]
# Copy the reordered table (without the index) to the system clipboard for pasting elsewhere.
# NOTE(review): this is a side effect that needs a clipboard on the machine running the
# kernel; it is likely to fail on headless/remote kernels — confirm, or write a file instead.
epc_summary_merge[reorder_cols].to_clipboard(index=False)

In [121]:
# Display the full county comparison table.
# NOTE(review): the recorded output below contains a "change" column that no cell in
# this view creates — presumably left over from an earlier or deleted cell; it would
# not appear after Restart & Run All.
epc_summary_merge

Unnamed: 0,county,total_tracts_2018,epc_2018,total_tracts_2022,epc_2022,epc_percent_2018,epc_percent_2022,change,pct_change,num_change
0,Alameda,361,101,379,101,0.28,0.27,-0.01,-0.01,0
1,Contra Costa,208,50,242,50,0.24,0.21,-0.03,-0.03,0
2,Marin,56,4,63,7,0.07,0.11,0.04,0.04,3
3,Napa,40,5,40,5,0.12,0.12,0.0,0.0,0
4,San Francisco,197,51,244,60,0.26,0.25,-0.01,-0.01,9
5,San Mateo,158,22,174,19,0.14,0.11,-0.03,-0.03,-3
6,Santa Clara,372,63,408,69,0.17,0.17,0.0,0.0,6
7,Solano,96,28,100,27,0.29,0.27,-0.02,-0.02,-1
8,Sonoma,100,15,122,15,0.15,0.12,-0.03,-0.03,0
