In [1]:
import os
import sys
import pandas as pd, geopandas as gp, numpy as np
import getpass
from arcgis import GIS

# login name of the current OS user; only used to build the clone path below
user = getpass.getuser()

# NOTE(review): hard-coded macOS-style home layout (/Users/<user>/Documents/GitHub);
# the path has to be adjusted on machines that keep the dvutils clone elsewhere
DVUTILS_LOCAL_CLONE_PATH = f"/Users/{user}/Documents/GitHub/dvutils"
# put the clone first on sys.path; this must run before the utils_io import below
sys.path.insert(0, DVUTILS_LOCAL_CLONE_PATH)
# NOTE(review): presumably utils_io lives in the dvutils clone; the star import hides which of
# its names this notebook relies on, so consider importing them explicitly
from utils_io import *

In [2]:
# read credentials from the environment: the Census API key and the ArcGIS Online password
# (either one is None when its variable is not set)
api_key = os.getenv("CENSUS_API_KEY")
agol_password = os.getenv("AGOL_CONTENT_PASSWORD")

In [3]:
# sign in to ArcGIS Online with the content account, using the password read from the environment
gis = GIS(
    url="https://mtc.maps.arcgis.com/home",
    username="content_MTC",
    password=agol_password,
)

In [4]:
# load the 2050 equity priority communities table from the local CSV,
# reading the tract geoid as text
# epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/communities_of_concern_2020_acs2018/FeatureServer/0"
epc_2050 = pd.read_csv(
    filepath_or_buffer="Data/epc_acs2018.csv",
    dtype={"geoid": str},
)

In [5]:
# load the draft ACS 2021 equity priority communities table from the local CSV,
# reading the tract geoid as text
# draft_epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/DRAFT_Equity_Priority_Communities_Plan_Bay_Area_2050_/FeatureServer/0"
epc_2050p_2021 = pd.read_csv(
    filepath_or_buffer="Data/epc_acs2021.csv",
    dtype={"tract_geoid": str},
)

In [6]:
# load the draft ACS 2022 equity priority communities table from the local CSV,
# reading the tract geoid as text
# draft_epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/DRAFT_Equity_Priority_Communities_Plan_Bay_Area_2050_Plus_ACS_2022_/FeatureServer/0"
epc_2050p_2022 = pd.read_csv(
    filepath_or_buffer="Data/epc_acs2022.csv",
    dtype={"tract_geoid": str},
)

## Concat 2018, 2021, and 2022 data

In [7]:
# county FIPS code -> county name
county_fips_dict = dict(
    [
        (1, "Alameda"),
        (13, "Contra Costa"),
        (41, "Marin"),
        (55, "Napa"),
        (75, "San Francisco"),
        (81, "San Mateo"),
        (85, "Santa Clara"),
        (95, "Solano"),
        (97, "Sonoma"),
    ]
)

# add the county name to every vintage; codes missing from the lookup map to NaN
for epc_frame in (epc_2050, epc_2050p_2021, epc_2050p_2022):
    epc_frame["county"] = epc_frame["county_fip"].map(county_fips_dict)

In [8]:
# create a function to flag share columns with values that are 0 or 1
# these columns may highlight areas where the data are not reliable

def flag_share_cols(df, share_cols):
    """
    Adds a 0/1 indicator column for every share column, set to 1 where the share is exactly 0 or 1.
    Such extreme shares may highlight areas where the data are not reliable.

    Parameters
    -------------------
    df (dataframe):
    Table holding the share columns. It is modified in place.

    share_cols (list):
    Names of the share columns to inspect.

    Returns
    -------------------
    The same dataframe object, with one extra "<col>_flag" column per share column.
    """
    for share_col in share_cols:
        # missing shares compare unequal to both 0 and 1, so they are flagged 0
        is_extreme = df[share_col].eq(0) | df[share_col].eq(1)
        df[f"{share_col}_flag"] = is_extreme.astype(int)
    return df

In [9]:
# share columns to screen for values of exactly 0 or 1
cols = [
    "pct_poc", "pct_over75", "pct_spfam", "pct_lep",
    "pct_below2", "pct_disab", "pct_zvhhs", "pct_hus_re",
]

# flag every vintage; each name is rebound to the frame returned for it
epc_2050, epc_2050p_2021, epc_2050p_2022 = (
    flag_share_cols(epc_frame, cols)
    for epc_frame in (epc_2050, epc_2050p_2021, epc_2050p_2022)
)

In [10]:
# give the 2018 table the same key column name as the 2021 / 2022 tables
epc_2050 = epc_2050.rename(columns={"geoid": "tract_geoid"})
# left-pad the key with zeros to the 11-character tract geoid used elsewhere in this notebook;
# unlike prepending "0", zfill leaves an already padded id untouched, so re-running the cell
# does not corrupt the key
epc_2050["tract_geoid"] = epc_2050["tract_geoid"].str.zfill(11)
# errors="ignore" keeps the cell re-runnable once "tract" has already been dropped
epc_2050 = epc_2050.drop(columns=["tract"], errors="ignore")

In [11]:
# tag each table with its vintage year
for vintage_year, epc_frame in (
    (2018, epc_2050),
    (2021, epc_2050p_2021),
    (2022, epc_2050p_2022),
):
    epc_frame["vintage"] = vintage_year

# stack the three vintages into one long table
epc_concat = pd.concat(objs=[epc_2050, epc_2050p_2021, epc_2050p_2022])

In [12]:
# export the stacked vintages without the row index
epc_concat.to_csv(
    path_or_buf="Data/epc_comparisons_2018_2021_2022.csv",
    index=False,
)

In [13]:
# mean pct_below2 share in the 2018 table
epc_2050["pct_below2"].mean()

0.21426180780037782

In [14]:
# mean pct_below2 share in the 2021 table
epc_2050p_2021["pct_below2"].mean()

0.18514875331275824

In [15]:
# mean pct_below2 share in the 2022 table, rounded to two decimals
epc_2050p_2022["pct_below2"].mean().round(2)

0.18

In [16]:
# half of the (rounded) standard deviation of pct_below2 in the 2022 table
0.5 * epc_2050p_2022["pct_below2"].std().round(2)

0.065

## Summarize EPC Determination

In [69]:
def list_check(list1, list2):
    """Tells whether every item of list 1 also appears in list 2.

    Args:
        list1 (list): Items that must all be present.
        list2 (list): Collection that is searched for those items.

    Returns:
        boolean: True if no item of list 1 is missing from list 2 (always True for an
        empty list 1). False otherwise.
    """
    for item in list1:
        if item not in list2:
            return False
    return True

In [109]:
def set_temp_determination(x, epc_flag_col):
    """Lists the labels of a row whose value equals 1.

    Args:
        x (Series): One row holding the factor flags and the epc flag.
        epc_flag_col (string): Label of the epc flag inside the row.

    Returns:
        list or None: When the epc flag equals 1, the labels equal to 1 without the epc flag
        itself (possibly an empty list). Otherwise the labels equal to 1, or None when there
        are none.
    """
    flagged = x[x == 1].index
    if x[epc_flag_col] == 1:
        return flagged.drop(epc_flag_col).tolist()

    flagged_labels = flagged.tolist()
    return flagged_labels if flagged_labels else None

In [117]:
def set_final_determination(x):
    """Collapses a factor list to the Low-Income / People of Color pair when it holds both.

    Args:
        x (list or None): Factor labels of one tract, or None when there are none.

    Returns:
        list or None: ["Low-Income", "People of Color"] when both labels are in x,
        otherwise x unchanged (including None).
    """
    low_income_poc = ["Low-Income", "People of Color"]

    # identity check: None is a singleton, and "!=" would go through the operand's own __ne__
    if x is not None and list_check(low_income_poc, x):
        return low_income_poc
    return x

In [145]:
## Create a epc determination classification function that returns a class for summary stats

def set_determination_class(x):
    """Maps a factor list to the summary class used in the EPC counts.

    Args:
        x (list or None): Factor labels of one tract, or None when there are none.

    Returns:
        string: "Low-Income and PoC" when x holds both "Low-Income" and "People of Color",
        "Low-Income and 3 or more other factors" when x holds "Low-Income" and at least four
        labels in total, "Not an EPC" in every other case (including None).
    """
    low_income_poc = ["Low-Income", "People of Color"]

    if x is None:
        return "Not an EPC"
    if list_check(low_income_poc, x):
        return "Low-Income and PoC"
    # Low-Income itself plus three others makes four labels
    if "Low-Income" in x and len(x) >= 4:
        return "Low-Income and 3 or more other factors"
    return "Not an EPC"

In [147]:
## create a function that indicates the column variable(s) that determined whether a tract was an epc

def epc_determination(df, epc_flag_col):
    """Returns a copy of an epc dataframe annotated with the factors behind each tract's determination.

    The eight "*_1_2" factor flag columns are renamed to descriptive labels and three columns
    are added:

    - "epc_determination_list": result of set_temp_determination for the row, passed through
      set_final_determination (a list of factor labels, or None).
    - "EPC Determination": the same list joined with ", " (None when the list is None).
    - "EPC Determination Class": result of set_determination_class for the list.

    Args:
        df (dataframe): Equity Priority Communities dataframe. It is not modified.
        epc_flag_col (string): Name of column with epc flag.

    Returns:
        dataframe: New dataframe with the renamed and added columns.
    """
    ## descriptive labels for the factor flag columns
    factor_labels = {
        "poc_1_2": "People of Color",
        "below2_1_2": "Low-Income",
        "over75_1_2": "Seniors 75 Years and Over",
        "spfam_1_2": "Single Parent Family",
        "lep_1_2": "Limited English Proficiency",
        "disab_1_2": "Disability",
        "hus_re_1_2": "Rent Burdened",
        "zvhh_1_2": "Zero-Vehicle Households",
    }

    ## rename returns a new frame, so the caller's dataframe stays untouched
    new_df = df.rename(columns=factor_labels)

    ## factor columns in the order above, plus the epc flag column read by set_temp_determination
    cols = list(factor_labels.values())
    cols.append(epc_flag_col)

    ## temporary column listing all the variables flagged for the tract
    new_df["temp_determination"] = new_df[cols].apply(
        lambda x: set_temp_determination(x, epc_flag_col), axis=1
    )

    ## column that indicates the variable(s) that determined whether a tract was an epc
    new_df["epc_determination_list"] = new_df["temp_determination"].apply(set_final_determination)

    ## human-readable version of the determination list
    new_df["EPC Determination"] = new_df["epc_determination_list"].apply(
        lambda x: ", ".join(x) if x is not None else None
    )

    ## classification of the epc determination
    new_df["EPC Determination Class"] = new_df["epc_determination_list"].apply(set_determination_class)

    ## the temporary column is not part of the result
    return new_df.drop(columns=["temp_determination"])

In [179]:
# derive the determination columns for each vintage; the 2018 table carries its epc flag in
# "epc_2050", the 2021 and 2022 tables in "epc_2050p"
epc_2018_det = epc_determination(epc_2050, "epc_2050")
epc_2021_det = epc_determination(epc_2050p_2021, "epc_2050p")
epc_2022_det = epc_determination(epc_2050p_2022, "epc_2050p")

In [162]:
# cols = [
#     "People of Color",
#     "Low-Income",
#     "Seniors 75 Years and Over",
#     "Single Parent Family",
#     "Limited English Proficiency",
#     "Disability",
#     "Rent Burdened",
#     "Zero-Vehicle Households",
#     "EPC Determination",
#     "EPC Determination Class",
# ]
# epc_2021_det.query("epc_2050p == 0")[cols]

In [175]:
## count tracts per epc determination class, 2021 vintage
epc_2021_agg = (
    epc_2021_det.groupby(["EPC Determination Class"])
    .size()
    .reset_index(name="Summary Count")
    .sort_values(by=["EPC Determination Class"])
)

## count tracts per epc determination class, 2022 vintage
epc_2022_agg = (
    epc_2022_det.groupby(["EPC Determination Class"])
    .size()
    .reset_index(name="Summary Count")
    .sort_values(by=["EPC Determination Class"])
)

## outer join so a class that occurs in only one vintage is kept with a count of 0 for the
## other vintage, instead of being silently dropped by the default inner join
epc_agg_merge = (
    pd.merge(
        epc_2021_agg,
        epc_2022_agg,
        on="EPC Determination Class",
        how="outer",
        sort=True,
        suffixes=(" 2021", " 2022"),
    )
    .fillna({"Summary Count 2021": 0, "Summary Count 2022": 0})
    .astype({"Summary Count 2021": "int64", "Summary Count 2022": "int64"})
)

In [176]:
## count tracts per county and epc determination class, 2021 vintage
epc_2021_agg_cnty = (
    epc_2021_det.groupby(["county", "EPC Determination Class"])
    .size()
    .reset_index(name="Summary Count")
    .sort_values(by=["county", "EPC Determination Class"])
)

## count tracts per county and epc determination class, 2022 vintage
epc_2022_agg_cnty = (
    epc_2022_det.groupby(["county", "EPC Determination Class"])
    .size()
    .reset_index(name="Summary Count")
    .sort_values(by=["county", "EPC Determination Class"])
)

## outer join so a county / class combination that occurs in only one vintage is kept with a
## count of 0 for the other vintage, instead of being silently dropped by the default inner join
epc_agg_merge_cnty = (
    pd.merge(
        epc_2021_agg_cnty,
        epc_2022_agg_cnty,
        on=["county", "EPC Determination Class"],
        how="outer",
        sort=True,
        suffixes=(" 2021", " 2022"),
    )
    .fillna({"Summary Count 2021": 0, "Summary Count 2022": 0})
    .astype({"Summary Count 2021": "int64", "Summary Count 2022": "int64"})
)

In [177]:
# display the tract counts per epc determination class, 2021 next to 2022
epc_agg_merge

Unnamed: 0,EPC Determination Class,Summary Count 2021,Summary Count 2022
0,Low-Income and 3 or more other factors,71,72
1,Low-Income and PoC,242,281
2,Not an EPC,1459,1419


In [178]:
# display the tract counts per county and epc determination class, 2021 next to 2022
epc_agg_merge_cnty

Unnamed: 0,county,EPC Determination Class,Summary Count 2021,Summary Count 2022
0,Alameda,Low-Income and 3 or more other factors,18,13
1,Alameda,Low-Income and PoC,75,88
2,Alameda,Not an EPC,286,278
3,Contra Costa,Low-Income and 3 or more other factors,6,4
4,Contra Costa,Low-Income and PoC,44,46
5,Contra Costa,Not an EPC,192,192
6,Marin,Low-Income and 3 or more other factors,1,4
7,Marin,Low-Income and PoC,4,3
8,Marin,Not an EPC,58,56
9,Napa,Low-Income and 3 or more other factors,4,4


## Evaluate how EPCs changed between vintages and indicate what changed

In [214]:
def define_epc_change(previous_epc_flag, current_epc_flag):
    """Labels how the epc status of a tract changed between two vintages.

    Args:
        previous_epc_flag: Epc flag of the earlier vintage (1 = epc, 0 = not an epc).
        current_epc_flag: Epc flag of the later vintage (1 = epc, 0 = not an epc).

    Returns:
        string: "EPCs in both", "New EPC", "Former EPC" or "EPCs in neither".
    """
    # NOTE(review): any combination not matched below, including a missing (NaN) flag on either
    # side, is reported as "EPCs in neither" - confirm that this is intended for tracts that
    # exist in only one vintage
    if current_epc_flag == 1:
        if previous_epc_flag == 1:
            return "EPCs in both"
        if previous_epc_flag == 0:
            return "New EPC"
        return "EPCs in neither"

    if current_epc_flag == 0 and previous_epc_flag == 1:
        return "Former EPC"
    return "EPCs in neither"

In [252]:
# full outer join of the 2018 and 2021 determinations on the tract geoid; only the column
# name shared by both sides (epc_determination_list) receives the _2018 / _2021 suffix
epc_2018_2021 = epc_2018_det[["tract_geoid", "epc_2050", "epc_determination_list"]].merge(
    epc_2021_det[["tract_geoid", "epc_2050p", "epc_determination_list"]],
    how="outer",
    on="tract_geoid",
    suffixes=("_2018", "_2021"),
)

In [253]:
# add the 2022 determinations with another full outer join; "epc_2050p" exists on both sides
# and is suffixed _2021 / _2022, while the 2022 determination list meets no column of the same
# name on the left and is therefore renamed explicitly
epc_2018_2021_2022 = epc_2018_2021.merge(
    epc_2022_det[["tract_geoid", "epc_2050p", "epc_determination_list"]],
    how="outer",
    on="tract_geoid",
    suffixes=("_2021", "_2022"),
).rename(columns={"epc_determination_list": "epc_determination_list_2022"})

In [254]:
# label the change in epc status relative to 2022,
# once against the 2018 flag and once against the 2021 flag
for change_col, previous_flag_col in (
    ("epc_change_2018_2022", "epc_2050"),
    ("epc_change_2021_2022", "epc_2050p_2021"),
):
    epc_2018_2021_2022[change_col] = epc_2018_2021_2022.apply(
        lambda row: define_epc_change(row[previous_flag_col], row["epc_2050p_2022"]),
        axis=1,
    )

In [255]:
## compare two lists and return the items that are different between the two lists
def list_compare(list1, list2):
    """Joins the items of list 1 that do not appear in list 2 into one string.

    The comparison is one-directional: items that are only in list 2 are not reported.

    Args:
        list1 (list): Items to look for.
        list2 (list): Items to check against.

    Returns:
        string: The items found only in list 1, in their original order, separated by ", "
        (an empty string when there are none).
    """
    only_in_first = []
    for item in list1:
        if item not in list2:
            only_in_first.append(item)
    return ", ".join(only_in_first)

In [256]:
def epc_determination_change(previous_epc_determination, current_epc_determination, epc_change):
    """Describes which factors explain a change in epc status.

    A determination that is not a list (None, or NaN left by an outer merge) is treated as
    "no factors". Previously such a value made the function return None implicitly, so a
    new or former epc lost its explanation whenever the other vintage had no flagged factor.

    Args:
        previous_epc_determination (list or None): Factor labels of the earlier vintage.
        current_epc_determination (list or None): Factor labels of the later vintage.
        epc_change (string): Change label as produced by define_epc_change.

    Returns:
        string: Sentence describing the change for the given change label.
    """
    previous_factors = (
        previous_epc_determination if isinstance(previous_epc_determination, (list, tuple)) else []
    )
    current_factors = (
        current_epc_determination if isinstance(current_epc_determination, (list, tuple)) else []
    )

    if epc_change == "Former EPC":
        return "Following factors no longer exceed thresholds: " + list_compare(
            previous_factors, current_factors
        )
    if epc_change == "New EPC":
        return (
            "Following factors now exceed thresholds: "
            + list_compare(current_factors, previous_factors)
            + ". "
            + "Factors that exceed thresholds include "
            + ", ".join(current_factors)
        )
    if epc_change == "EPCs in both":
        return "Both sets of factors exceed thresholds"
    return "Neither set of factors exceed thresholds"

In [257]:
# describe the change in epc determination relative to 2022,
# once starting from the 2018 determination and once from the 2021 determination
for change_col, previous_list_col, status_col in (
    ("epc_determination_change_2018_2022", "epc_determination_list_2018", "epc_change_2018_2022"),
    ("epc_determination_change_2021_2022", "epc_determination_list_2021", "epc_change_2021_2022"),
):
    epc_2018_2021_2022[change_col] = epc_2018_2021_2022.apply(
        lambda row: epc_determination_change(
            row[previous_list_col],
            row["epc_determination_list_2022"],
            row[status_col],
        ),
        axis=1,
    )

In [258]:
# research areas keyed by tract geoid
research_area_dict = {
    "06075980600": "Hunters Point",
    "06075061000": "Candlestick Point",
    "06075033203": "Parkmerced",
    "06075017700": "East Mission",
    "06081602300": "South SF",
    "06081611700": "Menlo Park",
    "06085513000": "Stanford",
    "06085505202": "Santa Clara",
    "06001422000": "West Berkeley",
    "06013303207": "Antioch/Brentwood",
    "06013372000": "North Richmond",
    "06095252702": "Suisun",
    "06095253402": "Dixon",
    "06055201005": "American Canyon",
    "06055202000": "Calistoga",
    "06041104102": "Novato",
    "06041129000": "Marin City",
}

# tracts to analyse: the keys of the lookup above, in the same order
analysis_tracs = list(research_area_dict)

# add the research area name; tracts without a match in the lookup get NaN
epc_2018_2021_2022["research_area"] = epc_2018_2021_2022["tract_geoid"].map(research_area_dict)

In [259]:
# columns written to the research-area export
out_cols = [
    "tract_geoid",
    "research_area",
    "epc_2050",
    "epc_2050p_2021",
    "epc_2050p_2022",
    "epc_change_2018_2022",
    "epc_change_2021_2022",
    "epc_determination_change_2018_2022",
    "epc_determination_change_2021_2022",
]

# keep only the research-area tracts and export them without the row index
(
    epc_2018_2021_2022.loc[
        epc_2018_2021_2022["tract_geoid"].isin(analysis_tracs), out_cols
    ].to_csv("Data/epc_tract_research_areas_2018_2021_2022.csv", index=False)
)

## Merge 2021 and 2022 data

In [248]:
# wide table: 2021 and 2022 side by side for tracts present in both vintages (default inner
# join); every column other than the key gets a _2021 / _2022 suffix when its name is shared
epc_merge = epc_2050p_2021.merge(
    epc_2050p_2022,
    on="tract_geoid",
    suffixes=("_2021", "_2022"),
)

In [249]:
# 2022 flag minus 2021 flag: positive when a tract became an epc, negative when it stopped
# being one, 0 when its status is unchanged
epc_merge["epc_change"] = epc_merge["epc_2050p_2022"].sub(epc_merge["epc_2050p_2021"])

In [250]:
# number of tracts per value of the 2021 -> 2022 epc flag difference
epc_merge["epc_change"].value_counts()

epc_change
 0    1668
 1      72
-1      32
Name: count, dtype: int64

In [None]:
# name the direction of the change: 0 -> "no_change", positive -> "gain", anything else -> "loss"
epc_merge["epc_change_class"] = np.select(
    condlist=[epc_merge["epc_change"] == 0, epc_merge["epc_change"] > 0],
    choicelist=["no_change", "gain"],
    default="loss",
)

In [None]:
# share columns of both vintages to review
review_cols = [
    "pct_poc_2021",
    "pct_over75_2021",
    "pct_spfam_2021",
    "pct_lep_2021",
    "pct_below2_2021",
    "pct_disab_2021",
    "pct_zvhhs_2021",
    "pct_hus_re_2021",
    "pct_poc_2022",
    "pct_over75_2022",
    "pct_spfam_2022",
    "pct_lep_2022",
    "pct_below2_2022",
    "pct_disab_2022",
    "pct_hus_re_2022",
    "pct_zvhhs_2022",
]

# show the tracts flagged as an epc in either vintage that have at least one share of exactly
# 0 or 1; the 0/1 mask is computed on the filtered rows themselves, so it matches their index
# instead of relying on pandas to reindex a mask built from the full table
(
    epc_merge.query("epc_2050p_2021 == 1 or epc_2050p_2022 == 1")[review_cols]
    .loc[lambda shares: shares.isin([0, 1]).any(axis=1)]
)

In [None]:
# columns of the merged 2021 / 2022 table selected for analysis: the tract key, the 2021 county
# code and the tot_* / pop_* columns of both vintages
# NOTE(review): this list is not referenced in the visible part of the notebook, and
# "pct_poc_2022" is the only pct_* column in it - confirm that both are intended
analysis_cols = [
    "tract_geoid",
    "county_fip_2021",
    "tot_pop_poc_2021",
    "tot_pop_se_2021",
    "tot_pop_po_2021",
    "tot_pop_ci_2021",
    "tot_hh_2021",
    "pop_zvhhs_2021",
    "tot_fam_2021",
    "tot_pop_ov_2021",
    "pop_hus_re_2021",
    "pop_poc_2021",
    "pop_over75_2021",
    "pop_spfam_2021",
    "pop_lep_2021",
    "pop_below2_2021",
    "pop_disabi_2021",
    "tot_pop_poc_2022",
    "tot_pop_se_2022",
    "tot_pop_po_2022",
    "tot_pop_ci_2022",
    "tot_pop_ov_2022",
    "tot_hh_2022",
    "tot_fam_2022",
    "pop_poc_2022",
    "pop_over75_2022",
    "pop_spfam_2022",
    "pop_lep_2022",
    "pop_below2_2022",
    "pop_disabi_2022",
    "pop_hus_re_2022",
    "pop_zvhhs_2022",
    "pct_poc_2022",
]

In [None]:
# columns used to review the below2 factor: the tot_pop_po, pop_below2 and pct_below2 columns
# of both vintages, plus the change class
rev_cols = [
    "tract_geoid",
    "tot_pop_po_2021",
    "tot_pop_po_2022",
    "pop_below2_2021",
    "pop_below2_2022",
    "pct_below2_2021",
    "pct_below2_2022",
    "epc_change_class",
]
# disabled alternative: per change class means of the review columns for the same filter
# epc_merge.query("county_fip_2021 == 41 and (below2_1_2_2021 == 1 or below2_1_2_2022 == 1)").groupby(["epc_change_class"])[rev_cols].mean()

In [None]:
# review columns for tracts with county code 41 whose below2 flag is set in either vintage
epc_merge.loc[
    lambda merged: merged["county_fip_2021"].eq(41)
    & (merged["below2_1_2_2021"].eq(1) | merged["below2_1_2_2022"].eq(1)),
    rev_cols,
]

In [None]:
# start the export column list from every column of the merged table
out_list = list(epc_merge.columns)

In [None]:
# remove geometry columns from list; filtering (rather than list.remove) does not raise when a
# geometry column is absent and keeps the cell safe to re-run
out_list = [col for col in out_list if col not in ("geometry_2021", "geometry_2022")]

In [None]:
# export the wide table without the row index, consistent with the other exports in this
# notebook (the index would otherwise be written as an extra unnamed first column)
epc_merge[out_list].to_csv("Data/epc_2050p_2021_2022_wide.csv", index=False)