In [1]:
import os
import sys
import pandas as pd, geopandas as gp, numpy as np
import getpass
from arcgis import GIS

user = getpass.getuser()

# Locate the local dvutils clone under the current user's home directory instead of
# assuming the macOS "/Users/<name>" layout, so the notebook also runs on other platforms.
DVUTILS_LOCAL_CLONE_PATH = os.path.join(os.path.expanduser("~"), "Documents", "GitHub", "dvutils")
# Put the clone first on the import path so `utils_io` resolves to it
sys.path.insert(0, DVUTILS_LOCAL_CLONE_PATH)
from utils_io import *

In [2]:
# Read credentials from environment variables; each is None when its variable is unset.
agol_password = os.getenv("AGOL_CONTENT_PASSWORD")  # ArcGIS Online password
api_key = os.getenv("CENSUS_API_KEY")  # census api key

In [3]:
# Sign in to agol (mtc.maps.arcgis.com) with the content account; the password
# comes from the AGOL_CONTENT_PASSWORD environment variable read above.
gis = GIS(
    url="https://mtc.maps.arcgis.com/home",
    username="content_MTC",
    password=agol_password,
)

In [4]:
def pull_acs_5_year_est_data(
    census_api_key,
    acs_year=2019,
    tbl_prof_type="Detailed",
    table_id=None,
    select_table_vars=None,
    drop_anno_cols=True,
    drop_margin_cols=True,
):
    """
    Pull American Community Survey (ACS) 5 year estimate data at the census tract level for
    California (state 06) counties 001, 013, 041, 055, 075, 081, 085, 095 and 097. Data can be
    pulled for an entire table or for select table variables.

    !Must include a table_id or list to select_table_vars parameters!

    Parameters
    -------------------
    census_api_key (String):
    Your secret census api key.

    acs_year (Integer):
    Year for acs estimates, default is 2019.

    tbl_prof_type (String):
    Table or profile type. These include the following types: Detailed, Subject, Data, or Comparison.

    table_id (String):
    ACS table id. Example 'B01001'. Takes precedence over select_table_vars when both are given.

    select_table_vars (List):
    provide a list of ACS table variables as strings. Example: ['B01001_001E','B01001_002E']

    drop_anno_cols (Boolean):
    Drops annotation of margin of error and annotation of estimate columns
    (column names ending in "MA" or "EA").

    drop_margin_cols (Boolean):
    Drops margin of error columns (column names ending in "M").

    Returns
    -------------------
    pandas DataFrame with one row per tract: the requested variables cast to numeric, the
    county code in "fipco" and the concatenated state + county + tract code in "tract_geoid".
    If tbl_prof_type is not one of the supported values, an explanatory string is returned
    instead of a DataFrame.

    Raises
    -------------------
    ValueError: if neither table_id nor select_table_vars is provided.
    requests.HTTPError: if the Census API responds with an error status.

    Author: Joshua Croff
    Variable Reference: https://www.census.gov/data/developers/data-sets/acs-5year.html
    """
    import pandas as pd

    if not table_id and not select_table_vars:
        raise ValueError("Provide either a table_id or a list of select_table_vars.")

    if table_id:
        var = f"group({table_id})"
    else:
        var = ",".join(select_table_vars)

    counties = "001,013,041,055,075,081,085,095,097"
    state = "06"

    # set base url
    endpoints = {
        "Detailed": "acs/acs5",
        "Subject": "acs/acs5/subject",
        "Data": "acs/acs5/profile",
        "Comparison": "acs/acs5/cprofile",
    }
    if tbl_prof_type not in endpoints:
        return "Please provide the following table types: Detailed, Subject, Data, or Comparison"
    base_url = f"https://api.census.gov/data/{acs_year}/{endpoints[tbl_prof_type]}?"

    # set query params
    query_params = {
        "get": var,
        "for": "tract:*",
        "in": [
            f"county:{counties}",
            f"state:{state}",
        ],
        "key": census_api_key,
    }

    # imported here so argument validation above works without the dependency loaded
    import requests

    rq = requests.get(base_url, params=query_params)
    # fail with the HTTP error rather than a confusing JSON decode error
    rq.raise_for_status()
    data = rq.json()
    # first row of the payload holds the column names
    acs_df = pd.DataFrame(data[1:], columns=data[0])

    # Drop annotation / margin columns BEFORE the numeric cast so free-text
    # annotation values in columns that are discarded anyway cannot break it
    if drop_anno_cols:
        is_anno = acs_df.columns.str.endswith("EA") | acs_df.columns.str.endswith("MA")
        acs_df = acs_df.loc[:, ~is_anno]

    if drop_margin_cols:
        acs_df = acs_df.loc[:, ~acs_df.columns.str.endswith("M")]

    # work on an independent frame so the column assignments below never hit a slice
    acs_df = acs_df.copy()

    # Cast numeric columns to numeric types; identifier columns stay as strings
    str_cols = {"GEO_ID", "NAME", "state", "county", "tract"}
    num_cols = [col for col in acs_df.columns if col not in str_cols]
    if num_cols:
        acs_df[num_cols] = acs_df[num_cols].apply(pd.to_numeric)

    # add tract id column
    acs_df["tract_geoid"] = acs_df["state"] + acs_df["county"] + acs_df["tract"]

    # rename columns
    acs_df = acs_df.rename(columns={"county": "fipco"})

    # drop redundant columns
    if table_id:
        redundant_cols = ["GEO_ID", "NAME", "state", "tract"]
    else:
        redundant_cols = ["state", "tract"]

    return acs_df.drop(columns=redundant_cols)

In [5]:
def pull_census_tracts_geodata(year=2020, cartographic=False):
    """
    Pulls Census Tracts from TIGERweb REST API and returns Geopandas GeoDataframe.
    Only tracts in California (state 06) counties 001, 013, 041, 055, 075, 081, 085, 095
    and 097 are requested. Default year is 2020.

    How to choose vintage: https://www2.census.gov/geo/pdfs/maps-data/data/tiger/How_do_I_choose_TIGER_vintage.pdf

    Parameters
    -------------------
    year (int):
    the TIGER vintage.
    list of valid years: [2012,2015,2016,2017,2018,2019,2020,2021,2022]

    cartographic (bool):
    If the cartographic parameter is truthy, the request goes to the "Generalized_*"
    map service (described by the original author as a generalized version of tracts
    with water areas clipped) instead of the "tigerWMS_*" service.

    Returns
    -------------------
    GeoDataFrame (crs EPSG:4326) with the GEOID field renamed to "tract_geoid".
    If year is not a valid vintage, a message is printed and None is returned.

    Raises
    -------------------
    requests.HTTPError: if TIGERweb responds with an error status.

    Author: Joshua Croff
    Source: https://tigerweb.geo.census.gov/tigerwebmain/TIGERweb_restmapservice.html
    """
    import geopandas as gpd
    import requests

    valid_years = [2012, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022]
    if year not in valid_years:
        print("Error- vintage not available. Please see docstring for valid years")
        return

    state = "06"
    counties = "('001','013','041','055','075','081','085','095','097')"
    where_str = f"where=STATE='{state}'+AND+COUNTY+IN{counties}"
    query_args = [where_str, "outFields=GEOID&f=geojson"]
    query = "query?{}".format("&".join(query_args))

    # Pick the service and layer from a single truthiness test on `cartographic`, so the
    # service name and the URL layout can never disagree with each other.
    if cartographic:
        # the 2020 generalized service is named "TAB"; every other vintage is "ACS"
        map_service = f"Generalized_TAB{year}" if year == 2020 else f"Generalized_ACS{year}"
        service_path = [map_service, "Tracts_Blocks", "MapServer", "3"]
    else:
        if year == 2020:
            map_service = f"tigerWMS_Census{year}"
            layer_id = "6"
        elif year < 2020:
            map_service = f"tigerWMS_ACS{year}"
            layer_id = "8"
        else:
            map_service = f"tigerWMS_ACS{year}"
            layer_id = "6"
        service_path = ["TIGERweb", map_service, "MapServer", layer_id]

    url = "/".join(
        ["https://tigerweb.geo.census.gov", "arcgis", "rest", "services"] + service_path + [query]
    )

    r = requests.get(url)
    # fail with the HTTP error rather than a confusing JSON decode error
    r.raise_for_status()
    geog_json = r.json()
    geog_gdf = gpd.GeoDataFrame.from_features(geog_json["features"], crs="EPSG:4326")

    # rename GEOID column to tract_geoid
    geog_gdf = geog_gdf.rename(columns={"GEOID": "tract_geoid"})
    return geog_gdf

In [6]:
# 2050 equity priority communities layer
# (feature service: communities_of_concern_2020_acs2018), pulled without reprojection
epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/communities_of_concern_2020_acs2018/FeatureServer/0"
epc_2050 = pull_geotable_agol(
    base_url=epc_url,
    client=gis,
    reproject_to_analysis_crs=False,
)

Breaking feature service layer IDs into 8 chunks


In [7]:
# draft equity priority communities layer built on acs 2021, pulled without reprojection
draft_epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/DRAFT_Equity_Priority_Communities_Plan_Bay_Area_2050_/FeatureServer/0"
epc_2050p_2021 = pull_geotable_agol(
    base_url=draft_epc_url,
    client=gis,
    reproject_to_analysis_crs=False,
)

Breaking feature service layer IDs into 8 chunks


In [8]:
# draft equity priority communities layer built on acs 2022, pulled without reprojection
# (draft_epc_url now points at the ACS 2022 service)
draft_epc_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/DRAFT_Equity_Priority_Communities_Plan_Bay_Area_2050_Plus_ACS_2022_/FeatureServer/0"
epc_2050p_2022 = pull_geotable_agol(
    base_url=draft_epc_url,
    client=gis,
    reproject_to_analysis_crs=False,
)

Breaking feature service layer IDs into 8 chunks


In [11]:
# Rename the "minority"-based field names of the 2021 draft layer to the "poc"
# names that the share-column and grouping cells further down reference.
epc_2050p_2021.rename(
    columns=dict(
        tot_pop_mi="tot_pop_poc",
        pop_minori="pop_poc",
        pct_minori="pct_poc",
        minori_1_2="poc_1_2",
    ),
    inplace=True,
)

In [12]:
epc_2050

Unnamed: 0,geometry,objectid,geoid,state_fip,county_fip,tract,tot_pop,tot_pop_po,tot_pop_ci,tot_hh,...,disab_1_2,hus_re_1_2,zvhh_1_2,epc_2035,epc_2040,epc_2050,c2040_2050,epc_class,Shape__Area,Shape__Length
0,"POLYGON ((-121.90340 37.68028, -121.89608 37.6...",1917,06001450604,06,001,450604,5577,5577,5577,1887,...,0,0,0,0,0,0,0,,0.000216,0.070448
1,"POLYGON ((-121.92052 37.70200, -121.91817 37.7...",1916,06001450603,06,001,450603,5308,5303,5303,1704,...,0,0,0,0,0,0,0,,0.000406,0.092875
2,"POLYGON ((-121.93545 37.70054, -121.93281 37.6...",1915,06001450602,06,001,450602,10096,10090,10090,3465,...,0,0,0,0,0,0,0,,0.000645,0.176641
3,"POLYGON ((-122.02092 37.69768, -122.02038 37.6...",1914,06001450601,06,001,450601,3633,3633,3633,1343,...,0,0,0,0,0,0,0,,0.005128,0.460197
4,"POLYGON ((-122.00128 37.70373, -121.97794 37.7...",1913,06001450502,06,001,450502,5880,5880,5880,1913,...,0,0,0,0,0,0,0,,0.001137,0.181823
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1577,"POLYGON ((-122.72817 38.42330, -122.72457 38.4...",3143,06097153102,06,097,153102,5735,5653,5660,1454,...,0,0,0,1,1,1,0,High,0.000128,0.050685
1578,"POLYGON ((-122.74179 38.43068, -122.73944 38.4...",3145,06097153104,06,097,153104,3890,3830,3890,1206,...,1,1,0,1,1,1,0,Higher,0.000176,0.082621
1579,"POLYGON ((-122.74145 38.40182, -122.74146 38.4...",3146,06097153200,06,097,153200,8621,8614,8614,2602,...,0,1,0,0,0,1,1,High,0.001307,0.160268
1580,"POLYGON ((-122.81638 38.40347, -122.81248 38.4...",3147,06097153300,06,097,153300,13212,13149,13206,3782,...,1,0,0,0,0,1,1,High,0.003370,0.266220


## Concat 2018, 2021, and 2022 data

In [13]:
# County names keyed by the zero-padded, three-character county FIPS strings
# stored in the pba2050 layer's county_fip field
county_fips_dict = dict(
    [
        ("001", "Alameda"),
        ("013", "Contra Costa"),
        ("041", "Marin"),
        ("055", "Napa"),
        ("075", "San Francisco"),
        ("081", "San Mateo"),
        ("085", "Santa Clara"),
        ("095", "Solano"),
        ("097", "Sonoma"),
    ]
)
# codes missing from the lookup map to NaN
epc_2050["county"] = epc_2050["county_fip"].map(county_fips_dict)

In [15]:
# Same county lookup, but keyed by integer FIPS codes; this rebinds county_fips_dict.
# presumably the draft layers store county_fip as integers — verify against the service schema
county_fips_dict = dict(
    zip(
        [1, 13, 41, 55, 75, 81, 85, 95, 97],
        [
            "Alameda",
            "Contra Costa",
            "Marin",
            "Napa",
            "San Francisco",
            "San Mateo",
            "Santa Clara",
            "Solano",
            "Sonoma",
        ],
    )
)
epc_2050p_2022["county"] = epc_2050p_2022["county_fip"].map(county_fips_dict)
epc_2050p_2021["county"] = epc_2050p_2021["county_fip"].map(county_fips_dict)

In [16]:
# create a function to flag share columns with values that are 0 or 1
# these columns may highlight areas where the data are not reliable

def flag_share_cols(df, share_cols):
    """
    Add a 0/1 indicator column next to each share column, marking rows whose share is
    exactly 0 or exactly 1. Such values may point at areas where the data are not reliable.

    Parameters
    -------------------
    df (dataframe or geodataframe):
    Table holding the share columns. It is modified in place.

    share_cols (list):
    Names of the share columns to check. For each name "<col>", a column "<col>_flag"
    is written (1 where the share equals 0 or 1, otherwise 0).

    Returns
    -------------------
    The same object that was passed in as df, with the flag columns added.
    """
    for share_col in share_cols:
        shares = df[share_col]
        # a missing share compares unequal to both bounds and is therefore flagged 0
        at_bound = shares.eq(0) | shares.eq(1)
        df[f"{share_col}_flag"] = np.where(at_bound, 1, 0)
    return df

In [17]:
# share columns to check for values of exactly 0 or 1
cols = [
    "pct_poc",
    "pct_over75",
    "pct_spfam",
    "pct_lep",
    "pct_below2",
    "pct_disab",
    "pct_zvhhs",
    "pct_hus_re",
]
# flag_share_cols adds the *_flag columns in place and hands the same table back,
# so each name is simply rebound to its own (now flagged) table
epc_2050, epc_2050p_2021, epc_2050p_2022 = (
    flag_share_cols(epc_table, cols)
    for epc_table in (epc_2050, epc_2050p_2021, epc_2050p_2022)
)

In [20]:
# add vintage cols: label every table with its vintage year before stacking
epc_2050["vintage"] = 2018
epc_2050p_2021["vintage"] = 2021
epc_2050p_2022["vintage"] = 2022

# concat epc data: stack the three tables row-wise; all columns of every table are kept
epc_concat = pd.concat([epc_2050, epc_2050p_2021, epc_2050p_2022])

# NOTE(review): out_cols is not used by the concat above nor by any later cell visible
# here — confirm whether the export was meant to be restricted to these columns.
out_cols = (
    # identifiers
    ["tract_geoid", "county_fip", "county"]
    # universe totals
    + [
        "tot_pop",
        "tot_pop_poc",
        "tot_pop_se",
        "tot_pop_po",
        "tot_pop_ci",
        "tot_pop_ov",
        "tot_hh",
        "tot_fam",
    ]
    # factor counts
    + [
        "pop_poc",
        "pop_over75",
        "pop_spfam",
        "pop_lep",
        "pop_below2",
        "pop_disabi",
        "pop_hus_re",
        "pop_zvhhs",
    ]
    + ["vintage"]
    # factor shares
    + [
        "pct_poc",
        "pct_over75",
        "pct_spfam",
        "pct_lep",
        "pct_below2",
        "pct_disab",
        "pct_hus_re",
        "pct_zvhhs",
    ]
    # the *_1_2 columns (poc, over75, spfam, lep, disab, below2, hus_re, zvhh)
    # and epc_class are intentionally left out
    + ["epc_2050", "epc_2050p"]
)

In [21]:
# Export the stacked (long-format) table; index=False leaves the row index out of the file.
# The path is relative, so it resolves against the notebook's working directory.
epc_concat.to_csv("Data/epc_comparisons_2018_2021_2022.csv", index=False)

In [127]:
# Median of each factor count per vintage and epc_2050p value.
# The recorded output has no 2018 groups — presumably the 2018 layer has no
# epc_2050p values, so groupby drops those rows; TODO confirm.
(
    epc_concat
    .groupby(["vintage", "epc_2050p"])[
        [
            "pop_poc",
            "pop_over75",
            "pop_spfam",
            "pop_lep",
            "pop_below2",
            "pop_disabi",
            "pop_hus_re",
            "pop_zvhhs",
        ]
    ]
    .median()
)

Unnamed: 0_level_0,Unnamed: 1_level_0,pop_poc,pop_over75,pop_spfam,pop_lep,pop_below2,pop_disabi,pop_hus_re,pop_zvhhs
vintage,epc_2050p,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021,0,2376.0,258.0,89.0,163.5,536.0,361.0,100.0,62.0
2021,1,3356.0,187.0,185.0,563.0,1475.0,468.0,238.0,151.0
2022,0,2361.0,275.0,88.0,161.5,516.0,369.0,98.0,64.0
2022,1,3508.0,203.0,173.0,542.0,1418.0,494.0,245.0,150.0


In [23]:
epc_2050.pct_below2.mean()

0.21507443161287756

In [24]:
epc_2050p_2021.pct_below2.mean()

0.1858830544062323

In [25]:
epc_2050p_2022.pct_below2.mean()

0.18564096723739376

In [26]:
epc_2050.shape

(1582, 53)

In [27]:
epc_2050p_2021.shape

(1765, 49)

## Merge 2022 and 2021 data

In [22]:
# Join the 2021 and 2022 draft tables side by side on tract_geoid. No `how` is given, so
# pandas' default inner join applies: only tracts present in both tables are kept.
# Column names that occur in both tables get the "_2021" / "_2022" suffix.
epc_merge = pd.merge(epc_2050p_2021, epc_2050p_2022, on="tract_geoid", suffixes=("_2021", "_2022"))

In [70]:
# 2022 flag minus 2021 flag; the recorded value counts show only -1, 0 and 1
epc_merge["epc_change"] = epc_merge["epc_2050p_2022"].sub(epc_merge["epc_2050p_2021"])

In [71]:
epc_merge["epc_change"].value_counts()

epc_change
 0    1661
 1      72
-1      32
Name: count, dtype: int64

In [72]:
# Label the change: 0 -> "no_change", positive -> "gain", everything else -> "loss"
epc_merge["epc_change_class"] = np.select(
    [epc_merge["epc_change"] == 0, epc_merge["epc_change"] > 0],
    ["no_change", "gain"],
    default="loss",
)

In [96]:
# list the tracts flagged as EPC in 2021 or 2022 where any factor share
# is exactly 0 or 1 in either year
review_cols = [
    "pct_poc_2021",
    "pct_over75_2021",
    "pct_spfam_2021",
    "pct_lep_2021",
    "pct_below2_2021",
    "pct_disab_2021",
    "pct_zvhhs_2021",
    "pct_hus_re_2021",
    "pct_poc_2022",
    "pct_over75_2022",
    "pct_spfam_2022",
    "pct_lep_2022",
    "pct_below2_2022",
    "pct_disab_2022",
    "pct_hus_re_2022",
    "pct_zvhhs_2022",
]
# Build the 0/1 mask from the already-filtered rows (via a callable passed to .loc)
# rather than from the full table: a mask computed on all of epc_merge has a different
# index than the filtered frame and makes pandas warn that the key is being reindexed.
(
    epc_merge.query("epc_2050p_2021 == 1 or epc_2050p_2022 == 1")[review_cols]
    .loc[lambda epc_rows: epc_rows.isin([0, 1]).any(axis=1)]
)

  epc_merge.query("epc_2050p_2021 == 1 or epc_2050p_2022 == 1")[review_cols][


Unnamed: 0,pct_poc_2021,pct_over75_2021,pct_spfam_2021,pct_lep_2021,pct_below2_2021,pct_disab_2021,pct_zvhhs_2021,pct_hus_re_2021,pct_poc_2022,pct_over75_2022,pct_spfam_2022,pct_lep_2022,pct_below2_2022,pct_disab_2022,pct_hus_re_2022,pct_zvhhs_2022
87,0.479839,0.0,0.295858,0.039672,0.252688,0.185484,0.49162,0.127374,0.528021,0.015938,0.154639,0.063484,0.267998,0.122365,0.174757,0.332039
111,0.213642,0.085682,0.0,0.028994,0.23168,0.162909,0.411387,0.253408,0.309837,0.096791,0.109589,0.050402,0.323514,0.234613,0.323985,0.445756
834,0.718701,0.001462,0.0,0.00804,0.782564,0.075389,0.709653,0.513513,0.723163,0.002339,0.0,0.00735,0.786662,0.069376,0.504035,0.710932
1024,0.592035,0.040545,0.298111,0.100898,0.306781,0.069429,0.0,0.152771,0.612628,0.022753,0.31383,0.112169,0.367653,0.083397,0.18799,0.002089
1169,0.552447,0.0,0.15099,0.004677,0.356599,0.041161,0.384,0.252,0.575603,0.0,0.220619,0.021489,0.415695,0.057714,0.323949,0.400079
1364,1.0,0.01747,0.54662,0.040185,0.706447,0.0844,0.178899,0.246942,1.0,0.015934,0.478818,0.029913,0.635099,0.116248,0.260211,0.20751
1369,0.768853,0.127869,0.105882,0.206761,0.285493,0.134301,0.06204,0.011567,0.718165,0.130329,0.109568,0.188284,0.235276,0.158134,0.0,0.051619
1381,0.946604,0.148654,0.114923,0.559465,0.673217,0.160615,0.72709,0.239678,1.0,0.160886,0.134259,0.529361,0.638947,0.170079,0.163055,0.692466
1382,0.605155,0.0,0.268085,0.182769,0.564433,0.0,0.641813,0.135965,0.788939,0.0,0.179372,0.143819,0.335037,0.0,0.149596,0.656334
1383,1.0,0.951923,0.0,1.0,1.0,0.471154,0.949495,0.040404,1.0,0.868217,0.0,1.0,1.0,0.488372,0.042373,0.957627


In [None]:
# Suffixed identifier, universe-total and factor-count columns of the merged table
# (plus the 2022 poc share).
# NOTE(review): this cell has no execution count and analysis_cols is not referenced
# by any later cell visible here — confirm whether it is still needed.
analysis_cols = [
    "tract_geoid",
    "county_fip_2021",
    "tot_pop_poc_2021",
    "tot_pop_se_2021",
    "tot_pop_po_2021",
    "tot_pop_ci_2021",
    "tot_hh_2021",
    "pop_zvhhs_2021",
    "tot_fam_2021",
    "tot_pop_ov_2021",
    "pop_hus_re_2021",
    "pop_poc_2021",
    "pop_over75_2021",
    "pop_spfam_2021",
    "pop_lep_2021",
    "pop_below2_2021",
    "pop_disabi_2021",
    "tot_pop_poc_2022",
    "tot_pop_se_2022",
    "tot_pop_po_2022",
    "tot_pop_ci_2022",
    "tot_pop_ov_2022",
    "tot_hh_2022",
    "tot_fam_2022",
    "pop_poc_2022",
    "pop_over75_2022",
    "pop_spfam_2022",
    "pop_lep_2022",
    "pop_below2_2022",
    "pop_disabi_2022",
    "pop_hus_re_2022",
    "pop_zvhhs_2022",
    "pct_poc_2022",
]

In [84]:
# Review columns for the below2 factor: tract id, then the 2021 / 2022 pair of each
# measure (tot_pop_po, pop_below2, pct_below2), then the change label
rev_cols = (
    ["tract_geoid"]
    + [
        f"{measure}_{vintage}"
        for measure in ("tot_pop_po", "pop_below2", "pct_below2")
        for vintage in (2021, 2022)
    ]
    + ["epc_change_class"]
)
# epc_merge.query("county_fip_2021 == 41 and (below2_1_2_2021 == 1 or below2_1_2_2022 == 1)").groupby(["epc_change_class"])[rev_cols].mean()

In [85]:
# Tracts in county 41 (Marin in county_fips_dict) whose below2_1_2 indicator
# equals 1 in 2021 or in 2022
epc_merge.loc[
    (epc_merge["county_fip_2021"] == 41)
    & ((epc_merge["below2_1_2_2021"] == 1) | (epc_merge["below2_1_2_2022"] == 1)),
    rev_cols,
]

Unnamed: 0,tract_geoid,tot_pop_po_2021,tot_pop_po_2022,pop_below2_2021,pop_below2_2022,pct_below2_2021,pct_below2_2022,epc_change_class
933,6041132100,1808,1742,461,473,0.254978,0.271527,no_change
995,6041108202,5013,4644,1288,1166,0.256932,0.251077,gain
997,6041108201,2458,2343,630,623,0.256306,0.265898,gain
1013,6041105001,4209,4323,1133,1640,0.269185,0.379366,no_change
1019,6041104104,4740,5173,1392,1603,0.293671,0.309878,no_change
1024,6041104102,5574,5274,1710,1939,0.306781,0.367653,gain
1026,6041103200,7623,7312,1759,1776,0.230749,0.242888,gain
1034,6041102203,5412,5486,1331,1627,0.245935,0.296573,no_change
1036,6041102202,5416,5393,1124,1314,0.207533,0.243649,no_change
1692,6041112202,5942,5730,3241,2861,0.545439,0.499302,no_change


In [131]:
# start the export column list from every column of the merged table
out_list = list(epc_merge.columns)

In [134]:
# remove geometry columns from list
# Filtering into a new list (instead of list.remove) keeps this cell safe to re-run:
# list.remove raises ValueError once the names have already been removed.
out_list = [col for col in out_list if col not in ("geometry_2021", "geometry_2022")]

In [136]:
# Export the wide (one row per tract) 2021 vs 2022 comparison, limited to the columns in out_list.
# NOTE(review): index=False is not passed here, so the row index is written as the first
# CSV column — confirm that is intended.
epc_merge[out_list].to_csv("Data/epc_2050p_2021_2022_wide.csv")