In [1]:
import os
import sys
import pandas as pd, geopandas as gp, numpy as np
import getpass
from arcgis import GIS

user = getpass.getuser()

DVUTILS_LOCAL_CLONE_PATH = f"/Users/{user}/Documents/GitHub/dvutils"
sys.path.insert(0, DVUTILS_LOCAL_CLONE_PATH)
from utils_io import *

Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json


In [2]:
# get census api key
api_key = os.environ.get("CENSUS_API_KEY")
agol_password = os.environ.get("AGOL_CONTENT_PASSWORD")

In [3]:
# authenticate to agol
gis = GIS(url="https://mtc.maps.arcgis.com/home", username="content_MTC", password=agol_password)

In [4]:
# pull 2050 equity priority communities data
# epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/communities_of_concern_2020_acs2018/FeatureServer/0"
epc_2050 = pd.read_csv("Data/epc_acs2018.csv", dtype={"geoid": str})

In [5]:
# pull draft acs 2021 equity priority communities data 
# draft_epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/DRAFT_Equity_Priority_Communities_Plan_Bay_Area_2050_/FeatureServer/0"
epc_2050p_2021 = pd.read_csv("Data/epc_acs2021.csv", dtype={"tract_geoid": str}) 

In [6]:
# pull draft acs 2022 equity priority communities data
# draft_epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/DRAFT_Equity_Priority_Communities_Plan_Bay_Area_2050_Plus_ACS_2022_/FeatureServer/0"
epc_2050p_2022 = pd.read_csv("Data/epc_acs2022.csv", dtype={"tract_geoid": str})

## Concat 2022 and 2021 data

In [7]:
county_fips_dict = {
    1: "Alameda",
    13: "Contra Costa",
    41: "Marin",
    55: "Napa",
    75: "San Francisco",
    81: "San Mateo",
    85: "Santa Clara",
    95: "Solano",
    97: "Sonoma",
}
epc_2050["county"] = epc_2050["county_fip"].map(county_fips_dict)
epc_2050p_2021["county"] = epc_2050p_2021["county_fip"].map(county_fips_dict)
# epc_2050p_2021.rename(columns={"epc_2050p":"epc_2021"}, inplace=True)
epc_2050p_2022["county"] = epc_2050p_2022["county_fip"].map(county_fips_dict)
# epc_2050p_2022.rename(columns={"epc_2050p":"epc_2022"}, inplace=True)

In [8]:
epc_2050.rename(columns={"geoid":"tract_geoid"}, inplace=True)
epc_2050["tract_geoid"] = "0" + epc_2050["tract_geoid"]
epc_2050.drop(columns=["tract"], inplace=True)
epc_2050.rename(columns={"tot_pop":"tot_pop_poc"}, inplace=True)
epc_2050.rename(columns={"epc_2050":"epc"}, inplace=True)
epc_2050p_2022.rename(columns={"epc_2050p":"epc"}, inplace=True)

In [9]:
# sum factor flag columns other than low income and poc
cols = ["over75_1_2", "spfam_1_2", "disab_1_2", "lep_1_2", "zvhh_1_2", "hus_re_1_2"]
epc_2050["other_than_li_poc"] = epc_2050[cols].sum(axis=1)
epc_2050p_2022["other_than_li_poc"] = epc_2050p_2022[cols].sum(axis=1)

In [10]:
# flag tracts where low income and 3 or more other factors are present
epc_2050["other_3plus"] = np.where((epc_2050["other_than_li_poc"] >= 3), 1, 0)
epc_2050p_2022["other_3plus"] = np.where((epc_2050p_2022["other_than_li_poc"] >= 3), 1, 0)

In [11]:
# add vintage cols
epc_2050["vintage"] = 2018
# epc_2050p_2021["vintage"] = 2021
epc_2050p_2022["vintage"] = 2022

# concat epc data
epc_concat = pd.concat([epc_2050, epc_2050p_2022])

In [26]:
epc_concat.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3360 entries, 0 to 1771
Data columns (total 43 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   tract_geoid        3360 non-null   object 
 1   state              1588 non-null   float64
 2   county_fip         3360 non-null   int64  
 3   tot_pop_poc        3360 non-null   int64  
 4   tot_pop_po         3360 non-null   int64  
 5   tot_pop_ci         3360 non-null   int64  
 6   tot_hh             3360 non-null   int64  
 7   pop_zvhhs          3360 non-null   int64  
 8   tot_fam            3360 non-null   int64  
 9   tot_pop_ov         3360 non-null   int64  
 10  pop_hus_re         3360 non-null   int64  
 11  pop_poc            3360 non-null   int64  
 12  pop_over75         3360 non-null   int64  
 13  pop_spfam          3360 non-null   int64  
 14  pop_lep            3360 non-null   int64  
 15  pop_below2         3360 non-null   int64  
 16  pop_disabi         3360 non-n

In [None]:
# acs_df["pct_poc"] = np.where(
#     acs_df["tot_pop_poc"] == 0, 0, (acs_df["pop_poc"] / acs_df["tot_pop_poc"])
# )
# acs_df["pct_over75"] = np.where(
#     acs_df["tot_pop_se"] == 0, 0, (acs_df["pop_over75"] / acs_df["tot_pop_se"])
# )
# acs_df["pct_spfam"] = np.where(acs_df["tot_fam"] == 0, 0, (acs_df["pop_spfam"] / acs_df["tot_fam"]))
# acs_df["pct_lep"] = np.where(
#     acs_df["tot_pop_ov"] == 0, 0, (acs_df["pop_lep"] / acs_df["tot_pop_ov"])
# )
# acs_df["pct_below2"] = np.where(
#     acs_df["tot_pop_po"] == 0, 0, (acs_df["pop_below2"] / acs_df["tot_pop_po"])
# )
# acs_df["pct_disab"] = np.where(
#     acs_df["tot_pop_ci"] == 0, 0, (acs_df["pop_disabi"] / acs_df["tot_pop_ci"])
# )
# acs_df["pct_zvhhs"] = np.where(acs_df["tot_hh"] == 0, 0, (acs_df["pop_zvhhs"] / acs_df["tot_hh"]))
# acs_df["pct_hus_re"] = np.where(acs_df["tot_hh"] == 0, 0, (acs_df["pop_hus_re"] / acs_df["tot_hh"]))

In [40]:
# summarize factor shares by vintage and factor
epc_factor_summary = epc_concat.groupby(["vintage", "epc"]).agg(
    tot_pop_po=("tot_pop_po", "sum"),
    pop_below2=("pop_below2", "sum"),
    tot_pop_poc=("tot_pop_poc", "sum"),
    pop_poc=("pop_poc", "sum"),
    tot_pop_se=("tot_pop_se", "sum"),
    pop_over75=("pop_over75", "sum"),
    tot_fam=("tot_fam", "sum"),
    pop_spfam=("pop_spfam", "sum"),
    tot_pop_ov=("tot_pop_ov", "sum"),
    pop_lep=("pop_lep", "sum"),
    tot_pop_ci=("tot_pop_ci", "sum"),
    pop_disabi=("pop_disabi", "sum"),
    tot_hh=("tot_hh", "sum"),
    pop_zvhhs=("pop_zvhhs", "sum"),
    pop_hus_re=("pop_hus_re", "sum"),
)

# calculate shares
epc_factor_summary["pct_poc"] = epc_factor_summary["pop_poc"] / epc_factor_summary["tot_pop_poc"]
epc_factor_summary["pct_below2"] = epc_factor_summary["pop_below2"] / epc_factor_summary["tot_pop_po"]
epc_factor_summary["pct_over75"] = epc_factor_summary["pop_over75"] / epc_factor_summary["tot_pop_poc"]
epc_factor_summary["pct_spfam"] = epc_factor_summary["pop_spfam"] / epc_factor_summary["tot_fam"]
epc_factor_summary["pct_lep"] = epc_factor_summary["pop_lep"] / epc_factor_summary["tot_pop_ov"]
epc_factor_summary["pct_disab"] = epc_factor_summary["pop_disabi"] / epc_factor_summary["tot_pop_ci"]
epc_factor_summary["pct_zvhhs"] = epc_factor_summary["pop_zvhhs"] / epc_factor_summary["tot_hh"]
epc_factor_summary["pct_hus_re"] = epc_factor_summary["pop_hus_re"] / epc_factor_summary["tot_hh"]


In [46]:
epc_factor_summary[
    [
        "pct_poc",
        "pct_below2",
        "pct_over75",
        "pct_spfam",
        "pct_lep",
        "pct_disab",
        "pct_zvhhs",
        "pct_hus_re",
    ]
].T

vintage,2018,2018,2022,2022
epc,0,1,0,1
pct_poc,0.546118,0.813707,0.585001,0.822331
pct_below2,0.157935,0.419498,0.137849,0.36162
pct_over75,0.065729,0.047428,0.071232,0.054487
pct_spfam,0.096584,0.235519,0.095502,0.214509
pct_lep,0.057227,0.166476,0.055779,0.149475
pct_disab,0.090827,0.119171,0.093143,0.126832
pct_zvhhs,0.071642,0.18872,0.074097,0.180277
pct_hus_re,0.078586,0.188044,0.077639,0.182243


In [50]:
# summarize factor shares by vintage and factor
epc_factor_summary_region = epc_concat.groupby(["vintage"]).agg(
    tot_pop_po=("tot_pop_po", "sum"),
    pop_below2=("pop_below2", "sum"),
    tot_pop_poc=("tot_pop_poc", "sum"),
    pop_poc=("pop_poc", "sum"),
    tot_pop_se=("tot_pop_se", "sum"),
    pop_over75=("pop_over75", "sum"),
    tot_fam=("tot_fam", "sum"),
    pop_spfam=("pop_spfam", "sum"),
    tot_pop_ov=("tot_pop_ov", "sum"),
    pop_lep=("pop_lep", "sum"),
    tot_pop_ci=("tot_pop_ci", "sum"),
    pop_disabi=("pop_disabi", "sum"),
    tot_hh=("tot_hh", "sum"),
    pop_zvhhs=("pop_zvhhs", "sum"),
    pop_hus_re=("pop_hus_re", "sum"),
)

# calculate shares
epc_factor_summary_region["pct_poc"] = (
    epc_factor_summary_region["pop_poc"] / epc_factor_summary_region["tot_pop_poc"]
)
epc_factor_summary_region["pct_below2"] = (
    epc_factor_summary_region["pop_below2"] / epc_factor_summary_region["tot_pop_po"]
)
epc_factor_summary_region["pct_over75"] = (
    epc_factor_summary_region["pop_over75"] / epc_factor_summary_region["tot_pop_poc"]
)
epc_factor_summary_region["pct_spfam"] = (
    epc_factor_summary_region["pop_spfam"] / epc_factor_summary_region["tot_fam"]
)
epc_factor_summary_region["pct_lep"] = (
    epc_factor_summary_region["pop_lep"] / epc_factor_summary_region["tot_pop_ov"]
)
epc_factor_summary_region["pct_disab"] = (
    epc_factor_summary_region["pop_disabi"] / epc_factor_summary_region["tot_pop_ci"]
)
epc_factor_summary_region["pct_zvhhs"] = (
    epc_factor_summary_region["pop_zvhhs"] / epc_factor_summary_region["tot_hh"]
)
epc_factor_summary_region["pct_hus_re"] = (
    epc_factor_summary_region["pop_hus_re"] / epc_factor_summary_region["tot_hh"]
)

In [53]:
epc_factor_summary_region[
    [
        "pct_poc",
        "pct_below2",
        "pct_over75",
        "pct_spfam",
        "pct_lep",
        "pct_disab",
        "pct_zvhhs",
        "pct_hus_re",
    ]
].T

vintage,2018,2022
pct_poc,0.603162,0.631729
pct_below2,0.213441,0.181623
pct_over75,0.061828,0.067935
pct_spfam,0.122196,0.115845
pct_lep,0.080334,0.074154
pct_disab,0.096878,0.099786
pct_zvhhs,0.094752,0.093523
pct_hus_re,0.100192,0.096776


In [13]:
epc_summary = (
    epc_concat[
        [
            "tract_geoid",
            "county",
            "vintage",
            "tot_pop_po",
            "tot_pop_poc",
            "pop_below2",
            "pop_poc",
            "below2_1_2",
            "poc_1_2",
            "other_3plus",
        ]
    ]
    .groupby(["county", "vintage"], dropna=False)
    .agg(
        total_pop_poverty=("tot_pop_po", "sum"),
        pop_low_income=("pop_below2", "sum"),
        total_pop_poc=("tot_pop_poc", "sum"),
        pop_poc=("pop_poc", "sum"),
        tract_count=("tract_geoid", "count"),
        below2_1_2_sum=("below2_1_2", "sum"),
        poc_1_2_sum=("poc_1_2", "sum"),
        other_3plus_sum=("other_3plus", "sum"),
    )
)

In [24]:
epc_2050.groupby(["county"]).agg(
    total_pop_poverty=("tot_pop_po", "sum"),
    pop_low_income=("pop_below2", "sum"),
)

Unnamed: 0_level_0,total_pop_poverty,pop_low_income
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Alameda,1617225,369977
Contra Costa,1123857,246448
Marin,254105,44229
Napa,137164,32363
San Francisco,856426,192988
San Mateo,759786,134732
Santa Clara,1891976,358741
Solano,428057,110345
Sonoma,494493,124452


In [25]:
epc_2050p_2022.groupby(["county"]).agg(
    total_pop_poverty=("tot_pop_po", "sum"),
    pop_low_income=("pop_below2", "sum"),
)

Unnamed: 0_level_0,total_pop_poverty,pop_low_income
county,Unnamed: 1_level_1,Unnamed: 2_level_1
Alameda,1637215,317966
Contra Costa,1153216,216292
Marin,255793,40333
Napa,134649,27493
San Francisco,837888,173759
San Mateo,748410,110289
Santa Clara,1886562,295115
Solano,441378,96131
Sonoma,481716,98749


In [15]:
# share of low income population
epc_summary["pct_low_income"] = round((epc_summary["pop_low_income"] / epc_summary["total_pop_poverty"]), 3)
# share of poc population
epc_summary["pct_poc"] = round((epc_summary["pop_poc"] / epc_summary["total_pop_poc"]), 3)
# percent of tracts flagged as low income
epc_summary["pct_tracts_low_income"] = round(
    (epc_summary["below2_1_2_sum"] / epc_summary["tract_count"]), 3
)
# percent of tracts flagged as poc
epc_summary["pct_tracts_poc"] = round((epc_summary["poc_1_2_sum"] / epc_summary["tract_count"]), 3)

# percent of tracts flagged as low income and 3 or more other factors
epc_summary["pct_tracts_3plus_other"] = round(
    (epc_summary["other_3plus_sum"] / epc_summary["tract_count"]), 3
)

In [16]:
# # test options for rows with only a single hotspot_id
# pda_melt = hs_pda_rt_summary.melt(id_vars=["hotspot_id", "pda_flag"], value_vars=["total_routes", "pct_total_routes"])
# pda_melt["new_variable"] = pda_melt["variable"] + "_" + pda_melt["pda_flag"].astype(str)
# hs_pda_rt_pivot = pda_melt.pivot(index="hotspot_id", columns="new_variable", values="value").reset_index()

In [17]:
epc_summary.reset_index(inplace=True)

In [19]:
epc_summary.columns.to_list()

['county',
 'vintage',
 'total_pop_poverty',
 'pop_low_income',
 'total_pop_poc',
 'pop_poc',
 'tract_count',
 'below2_1_2_sum',
 'poc_1_2_sum',
 'other_3plus_sum',
 'pct_low_income',
 'pct_poc',
 'pct_tracts_low_income',
 'pct_tracts_poc',
 'pct_tracts_3plus_other']

In [20]:
epc_melt = epc_summary.melt(
    id_vars=["county", "vintage"],
    value_vars=[
        "total_pop_poverty",
        "pop_low_income",
        "total_pop_poc",
        "pop_poc",
        "pct_low_income",
        "pct_poc",
        "pct_tracts_low_income",
        "pct_tracts_poc",
        "pct_tracts_3plus_other",
    ],
)

epc_melt["new_variable"] = epc_melt["variable"] + "_" + epc_melt["vintage"].astype("str")

epc_pivot = epc_melt.pivot_table(
    index="county", columns="new_variable", values="value", aggfunc="max"
).reset_index()

In [21]:
epc_pivot

new_variable,county,pct_low_income_2018,pct_low_income_2022,pct_poc_2018,pct_poc_2022,pct_tracts_3plus_other_2018,pct_tracts_3plus_other_2022,pct_tracts_low_income_2018,pct_tracts_low_income_2022,pct_tracts_poc_2018,pct_tracts_poc_2022,pop_low_income_2018,pop_low_income_2022,pop_poc_2018,pop_poc_2022,total_pop_poc_2018,total_pop_poc_2022,total_pop_poverty_2018,total_pop_poverty_2022
0,Alameda,0.229,0.194,0.682,0.707,0.266,0.227,0.335,0.317,0.504,0.509,369977.0,317966.0,1120309.0,1176371.0,1643700.0,1663823.0,1617225.0,1637215.0
1,Contra Costa,0.219,0.188,0.556,0.594,0.188,0.161,0.298,0.285,0.303,0.318,246448.0,216292.0,630296.0,690897.0,1133247.0,1162648.0,1123857.0,1153216.0
2,Marin,0.174,0.158,0.286,0.308,0.089,0.111,0.125,0.206,0.071,0.063,44229.0,40333.0,74486.0,80145.0,260295.0,260485.0,254105.0,255793.0
3,Napa,0.236,0.204,0.472,0.496,0.175,0.15,0.25,0.375,0.125,0.125,32363.0,27493.0,66371.0,68140.0,140530.0,137384.0,137164.0,134649.0
4,San Francisco,0.225,0.207,0.594,0.617,0.31,0.332,0.279,0.287,0.284,0.262,192988.0,173759.0,516374.0,525136.0,870044.0,851036.0,856426.0,837888.0
5,San Mateo,0.177,0.147,0.604,0.63,0.108,0.115,0.152,0.144,0.323,0.362,134732.0,110289.0,462888.0,475035.0,765935.0,754250.0,759786.0,748410.0
6,Santa Clara,0.19,0.156,0.68,0.708,0.11,0.137,0.202,0.191,0.468,0.502,358741.0,295115.0,1306288.0,1357720.0,1922200.0,1916831.0,1891976.0,1886562.0
7,Solano,0.258,0.218,0.615,0.645,0.198,0.15,0.417,0.36,0.406,0.4,110345.0,96131.0,269660.0,290961.0,438530.0,450995.0,428057.0,441378.0
8,Sonoma,0.252,0.205,0.365,0.391,0.15,0.164,0.27,0.336,0.03,0.033,124452.0,98749.0,183080.0,190995.0,501317.0,488436.0,494493.0,481716.0


In [22]:
epc_pivot.to_csv("Data/epc_factor_tract_summary.csv", index=False)

In [None]:
# epc_concat.to_csv("Data/epc_comparisons_2018_2021_2022.csv", index=False)

In [None]:
epc_2050.query("county == 'Napa' & below2_1_2 == 1")[[ "tract_geoid", "below2_1_2", "poc_1_2", "li_3plus", "other_than_li_poc"]]

## Summarize EPC Determination

In [None]:
def list_check(list1, list2):
    """Checks if items from list 1 are in list 2. If items from list 1 in list two, return true.

    Args:
        list1 (list): List of items to check.
        list2 (list): List of items to check against.

    Returns:
        boolean: True if items from list 1 are in list 2. False if not.
    """
    return all(item in list2 for item in list1)

In [None]:
def set_temp_determination(x, epc_flag_col):
    if x[epc_flag_col] == 1:
        return x[x == 1].index.drop(epc_flag_col).tolist()
    elif x[x == 1].index.tolist() != []:
        return x[x == 1].index.tolist()
    else:
        return None

In [None]:
def set_final_determination(x):
    low_income_poc = ["Low-Income", "People of Color"]

    if x != None and list_check(low_income_poc, x):
        return low_income_poc
    else:
        return x

In [None]:
## Create a epc determination classification function that returns a class for summary stats

def set_determination_class(x):
    low_income_poc = ["Low-Income", "People of Color"]
    if x != None:
        if list_check(low_income_poc, x):
            return "Low-Income and PoC"
        elif "Low-Income" in x and len(x) >=4:
            return "Low-Income and 3 or more other factors"
        else:
            return "Not an EPC"
    else:
        return "Not an EPC"

In [None]:
## create a function that indicates the column variable(s) that determined whether a tract was an epc

def epc_determination(df, epc_flag_col):
    """Returns a epc dataframe with a column indicating the variable(s) that determined whether a tract was an epc.

    Args:
        df (dataframe): Equity Priority Communities dataframe.
        epc_flag_col (string): Name of column with epc flag.
        out_col (string): Name of output column.
    """
    new_df = df.copy()

    ## rename flag columns to be more descriptive
    new_df.rename(
        {
            "poc_1_2": "People of Color",
            "below2_1_2": "Low-Income",
            "over75_1_2": "Seniors 75 Years and Over",
            "spfam_1_2": "Single Parent Family",
            "lep_1_2": "Limited English Proficiency",
            "disab_1_2": "Disability",
            "hus_re_1_2": "Rent Burdened",
            "zvhh_1_2": "Zero-Vehicle Households",
        },
        axis=1,
        inplace=True,
    )

    ## create a new temporary column that indicates all the variables that determined whether a tract was an epc
    cols = [
        "People of Color",
        "Low-Income",
        "Seniors 75 Years and Over",
        "Single Parent Family",
        "Limited English Proficiency",
        "Disability",
        "Rent Burdened",
        "Zero-Vehicle Households",
    ]

    ## add epc flag column to cols list
    cols.append(epc_flag_col)

    # new_df["temp_determination"] = new_df[cols].apply(
    #     lambda row: row[row == 1].index.drop(epc_flag_col).tolist() if row[epc_flag_col] == 1 else row[row == 1].index.tolist(),
    #     axis=1,
    # )

    new_df["temp_determination"] = new_df[cols].apply(lambda x: set_temp_determination(x, epc_flag_col), axis=1)

    ## create a new column that indicates the variable(s) that determined whether a tract was an epc
    # create list that will be used to check if temp_determination contains values
    low_income_poc = ["Low-Income", "People of Color"]
    new_df["epc_determination_list"] = new_df["temp_determination"].apply(set_final_determination)

    ## create human-readable column that indicates the variable(s) that determined whether a tract was an epc
    new_df["EPC Determination"] = new_df["epc_determination_list"].apply(
        lambda x: ", ".join(x) if x != None else None
    )


    ## create a column that indicates the classification of the epc determination
    new_df["EPC Determination Class"] = new_df["epc_determination_list"].apply(set_determination_class)
    
    ## drop temp_determination column
    new_df.drop(columns=["temp_determination"], inplace=True)

    return new_df

In [None]:
# set epc determination for 2018 data
epc_2018_det = epc_determination(epc_2050, epc_flag_col="epc_2050")

# set epc determination for 2021 data
epc_2021_det = epc_determination(epc_2050p_2021, epc_flag_col="epc_2050p")

# set epc determination for 2022 data
epc_2022_det = epc_determination(epc_2050p_2022, epc_flag_col="epc_2050p")

In [None]:
# cols = [
#     "People of Color",
#     "Low-Income",
#     "Seniors 75 Years and Over",
#     "Single Parent Family",
#     "Limited English Proficiency",
#     "Disability",
#     "Rent Burdened",
#     "Zero-Vehicle Households",
#     "EPC Determination",
#     "EPC Determination Class",
# ]
# epc_2021_det.query("epc_2050p == 0")[cols]

In [None]:
## aggregate epc determination 
epc_2021_agg = epc_2021_det.groupby(["EPC Determination Class"]).size().reset_index(name="Summary Count").sort_values(by=["EPC Determination Class"])

## aggregate epc determination
epc_2022_agg = epc_2022_det.groupby(["EPC Determination Class"]).size().reset_index(name="Summary Count").sort_values(by=["EPC Determination Class"])

epc_agg_merge = pd.merge(
    epc_2021_agg,
    epc_2022_agg,
    on="EPC Determination Class",
    suffixes=(" 2021", " 2022"),
)

In [None]:
## aggregate epc determination by county
epc_2021_agg_cnty = (
    epc_2021_det.groupby(["county", "EPC Determination Class"])
    .size()
    .reset_index(name="Summary Count")
    .sort_values(by=["county", "EPC Determination Class"])
)

## aggregate epc determination by county
epc_2022_agg_cnty = (
    epc_2022_det.groupby(["county", "EPC Determination Class"])
    .size()
    .reset_index(name="Summary Count")
    .sort_values(by=["county", "EPC Determination Class"])
)

epc_agg_merge_cnty = pd.merge(
    epc_2021_agg_cnty,
    epc_2022_agg_cnty,
    on=["county","EPC Determination Class"],
    suffixes=(" 2021", " 2022"),
)

In [None]:
epc_agg_merge

In [None]:
epc_agg_merge_cnty

## Evaluate the how EPCs changed between vintages & indicate what changed

In [None]:
def define_epc_change(previous_epc_flag, current_epc_flag):
    if current_epc_flag == 1 and previous_epc_flag == 1:
        return "EPCs in both"
    elif current_epc_flag == 1 and previous_epc_flag == 0:
        return "New EPC"
    elif current_epc_flag == 0 and previous_epc_flag == 1:
        return "Former EPC"
    else:
        return "EPCs in neither"

In [None]:
# merge 2018 and 2021 epc data
epc_2018_2021 = pd.merge(
    epc_2018_det[["tract_geoid", "epc_2050", "epc_determination_list"]],
    epc_2021_det[["tract_geoid", "epc_2050p", "epc_determination_list"]],
    on="tract_geoid",
    how="outer",
    suffixes=("_2018", "_2021"),
)

In [None]:
# merge 2022 data
epc_2018_2021_2022 = pd.merge(
    epc_2018_2021,
    epc_2022_det[["tract_geoid", "epc_2050p", "epc_determination_list"]],
    on="tract_geoid",
    how="outer",
    suffixes=("_2021", "_2022"),
)

epc_2018_2021_2022.rename(columns={"epc_determination_list":"epc_determination_list_2022"}, inplace=True)

In [None]:
# calculate change in epc from 2018 to 2022
# calculate change in epc from 2021 to 2022

epc_2018_2021_2022["epc_change_2018_2022"] = epc_2018_2021_2022.apply(
    lambda x: define_epc_change(x["epc_2050"], x["epc_2050p_2022"]), axis=1
)
epc_2018_2021_2022["epc_change_2021_2022"] = epc_2018_2021_2022.apply(
    lambda x: define_epc_change(x["epc_2050p_2021"], x["epc_2050p_2022"]), axis=1
)

In [None]:
## compare two lists and return the items that are different between the two lists
def list_compare(list1, list2):
    """Compares two lists and returns the items that are different between the two lists.

    Args:
        list1 (list): List of items to check.
        list2 (list): List of items to check against.

    Returns:
        list: List of items that are different between the two lists.
    """
    return ", ".join([item for item in list1 if item not in list2])

In [None]:
def epc_determination_change(previous_epc_determination, current_epc_determination, epc_change):

    if previous_epc_determination != None and current_epc_determination != None:
        if epc_change == "Former EPC":
            return "Following factors no longer exceed thresholds: " + list_compare(previous_epc_determination, current_epc_determination)
        elif epc_change == "New EPC":
            return ("Following factors now exceed thresholds: " 
                    + list_compare(current_epc_determination, previous_epc_determination) + ". " 
                    + "Factors that exceed thresholds include " + ", ".join(current_epc_determination))
        elif epc_change == "EPCs in both":
            return "Both sets of factors exceed thresholds"
        else:
            return "Neither set of factors exceed thresholds"

In [None]:
# calculate change in epc determination from 2018 to 2022
epc_2018_2021_2022["epc_determination_change_2018_2022"] = epc_2018_2021_2022.apply(
    lambda x: epc_determination_change(
        x["epc_determination_list_2018"],
        x["epc_determination_list_2022"],
        x["epc_change_2018_2022"],
    ),
    axis=1,
)

# calculate change in epc determination from 2021 to 2022
epc_2018_2021_2022["epc_determination_change_2021_2022"] = epc_2018_2021_2022.apply(
    lambda x: epc_determination_change(
        x["epc_determination_list_2021"],
        x["epc_determination_list_2022"],
        x["epc_change_2021_2022"],
    ),
    axis=1,
)

In [None]:
analysis_tracs = [
    "06075980600",
    "06075061000",
    "06075033203",
    "06075017700",
    "06081602300",
    "06081611700",
    "06085513000",
    "06085505202",
    "06001422000",
    "06013303207", 
    "06013372000",
    "06095252702",
    "06095253402",
    "06055201005",
    "06055202000",
    "06041104102",
    "06041129000",
]

# add research area column set to non if no dict match
research_area_dict = {
    "06075980600": "Hunters Point",
    "06075061000": "Candlestick Point",
    "06075033203": "Parkmerced",
    "06075017700": "East Mission",
    "06081602300": "South SF",
    "06081611700": "Menlo Park",
    "06085513000": "Stanford",
    "06085505202": "Santa Clara",
    "06001422000": "West Berkeley",
    "06013303207": "Antioch/Brentwood",
    "06013372000": "North Richmond",
    "06095252702": "Suisun",
    "06095253402": "Dixon",
    "06055201005": "American Canyon",
    "06055202000": "Calistoga",
    "06041104102": "Novato",
    "06041129000": "Marin City",
}

epc_2018_2021_2022["research_area"] = epc_2018_2021_2022["tract_geoid"].map(research_area_dict)

In [None]:
out_cols = [
    "tract_geoid",
    "research_area",
    "epc_2050",
    "epc_2050p_2021",
    "epc_2050p_2022",
    "epc_change_2018_2022",
    "epc_change_2021_2022",
    "epc_determination_change_2018_2022",
    "epc_determination_change_2021_2022",
]
epc_2018_2021_2022.query("tract_geoid in @analysis_tracs")[out_cols].to_csv(
    "Data/epc_tract_research_areas_2018_2021_2022.csv", index=False
)

## Merge 2021 and 2022 data

In [None]:
epc_merge = pd.merge(epc_2050p_2021, epc_2050p_2022, on="tract_geoid", suffixes=("_2021", "_2022"))

In [None]:
epc_merge["epc_change"] = epc_merge["epc_2050p_2022"] - epc_merge["epc_2050p_2021"]

In [None]:
epc_merge["epc_change"].value_counts()

In [None]:
epc_merge["epc_change_class"] = np.where(epc_merge["epc_change"] == 0, "no_change", np.where(epc_merge["epc_change"] > 0, "gain", "loss"))

In [None]:
# count the number of 0 values for each factor in 2021 and 2022
# by county
review_cols = [
    "pct_poc_2021",
    "pct_over75_2021",
    "pct_spfam_2021",
    "pct_lep_2021",
    "pct_below2_2021",
    "pct_disab_2021",
    "pct_zvhhs_2021",
    "pct_hus_re_2021",
    "pct_poc_2022",
    "pct_over75_2022",
    "pct_spfam_2022",
    "pct_lep_2022",
    "pct_below2_2022",
    "pct_disab_2022",
    "pct_hus_re_2022",
    "pct_zvhhs_2022",
]
epc_merge.query("epc_2050p_2021 == 1 or epc_2050p_2022 == 1")[review_cols][
    epc_merge[review_cols].isin([0, 1]).any(axis=1)
]

In [None]:
analysis_cols = [
    "tract_geoid",
    "county_fip_2021",
    "tot_pop_poc_2021",
    "tot_pop_se_2021",
    "tot_pop_po_2021",
    "tot_pop_ci_2021",
    "tot_hh_2021",
    "pop_zvhhs_2021",
    "tot_fam_2021",
    "tot_pop_ov_2021",
    "pop_hus_re_2021",
    "pop_poc_2021",
    "pop_over75_2021",
    "pop_spfam_2021",
    "pop_lep_2021",
    "pop_below2_2021",
    "pop_disabi_2021",
    "tot_pop_poc_2022",
    "tot_pop_se_2022",
    "tot_pop_po_2022",
    "tot_pop_ci_2022",
    "tot_pop_ov_2022",
    "tot_hh_2022",
    "tot_fam_2022",
    "pop_poc_2022",
    "pop_over75_2022",
    "pop_spfam_2022",
    "pop_lep_2022",
    "pop_below2_2022",
    "pop_disabi_2022",
    "pop_hus_re_2022",
    "pop_zvhhs_2022",
    "pct_poc_2022",
]

In [None]:
rev_cols = [
    "tract_geoid",
    "tot_pop_po_2021",
    "tot_pop_po_2022",
    "pop_below2_2021",
    "pop_below2_2022",
    "pct_below2_2021",
    "pct_below2_2022",
    "epc_change_class",
]
# epc_merge.query("county_fip_2021 == 41 and (below2_1_2_2021 == 1 or below2_1_2_2022 == 1)").groupby(["epc_change_class"])[rev_cols].mean()

In [None]:
epc_merge.query("county_fip_2021 == 41 and (below2_1_2_2021 == 1 or below2_1_2_2022 == 1)")[rev_cols]

In [None]:
out_list = epc_merge.columns.to_list()

In [None]:
# remove geometry columns from list
for item in ["geometry_2021", "geometry_2022"]:
    out_list.remove(item)

In [None]:
epc_merge[out_list].to_csv("Data/epc_2050p_2021_2022_wide.csv")