# What are the demographic characteristics of neighborhoods where entitlements are requested?

In [1]:
import intake
import IPython.display
import ipywidgets
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pcts_census_utils

# Open the intake catalog(s) matching ../catalogs/*.yml; the census, tract and
# parcel-crosswalk tables are read from it in later cells.
cat = intake.open_catalog("../catalogs/*.yml")

## Load data

In [2]:
# Case-type codes that are left out of this analysis.
remove_prefix = ["ENV"]
remove_suffix = ["EIR"]

# Start from the full prefix / suffix lists and filter out the excluded codes,
# keeping the remaining codes in their original order.
prefix_list = [
    prefix
    for prefix in pcts_census_utils.FULL_PREFIX_LIST
    if prefix not in remove_prefix
]
suffix_list = [
    suffix
    for suffix in pcts_census_utils.FULL_SUFFIX_LIST
    if suffix not in remove_suffix
]

In [3]:
# Pull the PCTS extract restricted to the prefixes / suffixes chosen above
# (requesting dummy indicator columns for the case types), then pass the
# result through drop_child_cases, keeping child entitlements.
pcts = pcts_census_utils.drop_child_cases(
    pcts_census_utils.subset_pcts(
        prefix_list=prefix_list,
        suffix_list=suffix_list,
        get_dummies=True,
        verbose=True,
    ),
    keep_child_entitlements=True,
)

Downloading PCTS extract
Parsing PCTS case numbers
Getting dummy indicators for case types
Prefixes with no associated cases:  {'PS', 'HPO', 'TT', 'VTT'}
Suffixes with no associated cases:  {'SCEA', 'REC3', 'SUP1', 'REC2', 'SE', 'SCPE', 'ADD1', 'REC4', 'REC5', 'ND', 'EAF'}


In [4]:
# Split the PCTS table into case-level info (intended to be constant within a
# case) and the prefix / suffix indicator columns (reshaped to long form below).
case_info_cols = [
    "CASE_ID",
    "APLC_ID",
    "CASE_NBR",
    "CASE_SEQ_NBR",
    "CASE_YR_NBR",
    "CASE_ACTION_ID",
    "CASE_FILE_RCV_DT",
    "CASE_FILE_DATE",
    "PARNT_CASE_ID",
    "PARENT_CASE",
    "AIN",
    "PROJ_DESC_TXT",
    "id",
]

# Keep the descriptive case columns in their own frame. Once the suffix
# dummies have been melted to long form, the two pieces are joined back
# together on CASE_ID.
pcts_case_info = pcts[case_info_cols]

In [5]:
# Every case-info column except CASE_ID, which stays behind as the join key.
drop_case_info = [col for col in case_info_cols if col != "CASE_ID"]

# Removing the prefix dummies and the case-info columns is meant to leave
# CASE_ID plus the suffix dummies; identical rows are then collapsed.
drop_me = prefix_list + drop_case_info
pcts_suffix = (
    pcts
    .drop(columns=drop_me)
    .drop_duplicates()
    # Turn 0's into NaN so that unset dummies can be filtered out as nulls later
    .replace(0, np.nan)
)

In [6]:
# A case may carry several suffixes, but aggregations and statistics are
# easier to express against a single "suffix" column. The suffix dummy
# columns are therefore melted into long form: one row per (case, suffix)
# pair, so a case requesting several entitlements appears on several rows.
# Rows whose dummy is NaN (suffix not present on the case) are discarded and
# the helper "dummy" column is left out of the result.
pcts_suffix = (
    pcts_suffix
    .melt(id_vars="CASE_ID", var_name="suffix", value_name="dummy")
    .loc[lambda long_df: long_df["dummy"].notna(), ["CASE_ID", "suffix"]]
)

pcts_suffix.head(2)

Unnamed: 0,CASE_ID,suffix
5,177215.0,1A
14,177239.0,1A


In [7]:
# Re-attach the case-level columns to every (case, suffix) row
pcts_cleaned = pcts_case_info.merge(pcts_suffix, on="CASE_ID", how="inner")
pcts_cleaned.head(2)

Unnamed: 0,CASE_ID,APLC_ID,CASE_NBR,CASE_SEQ_NBR,CASE_YR_NBR,CASE_ACTION_ID,CASE_FILE_RCV_DT,CASE_FILE_DATE,PARNT_CASE_ID,PARENT_CASE,AIN,PROJ_DESC_TXT,id,suffix
0,166296.0,99811.0,DIR-2008-673-DRB-SPP,673.0,2008.0,2.0,2010-10-01,2010-10,,166296.0,2715025048,DESIGN REVIEW BOARD & PROJECT PERMIT COMPLIANC...,673_2010,DRB
1,166296.0,99811.0,DIR-2008-673-DRB-SPP,673.0,2008.0,2.0,2010-10-01,2010-10,,166296.0,2715025048,DESIGN REVIEW BOARD & PROJECT PERMIT COMPLIANC...,673_2010,EXT


In [8]:
# ACS data for income, race, commute, tenure
census = cat.census_analysis_table.read()

# Census tracts: keep only the identifier and geometry, and rename the
# identifier to GEOID so it lines up with the other tables
tracts = (
    cat.census_tracts.read()
    [["GEOID10", "geometry"]]
    .rename(columns={"GEOID10": "GEOID"})
)

In [9]:
# Crosswalk linking each parcel AIN to a tract GEOID; only the columns used
# below are kept
parcel_to_tract = cat.crosswalk_parcels_tracts.read()[["AIN", "num_AIN", "GEOID"]]

In [10]:
# Attach a tract GEOID to every entitlement row through the parcel crosswalk.
# validate="m:1" makes pandas raise if an AIN is repeated in the crosswalk.
pcts_df = pcts_cleaned.merge(
    parcel_to_tract,
    on="AIN",
    how="inner",
    validate="m:1",
)

In [11]:
# Look at the distribution of num_AIN to see whether duplicate parcels exist.
# TODO: decide how rows with num_AIN > 1 should be handled.
pcts_df["num_AIN"].value_counts()

1     216671
2         21
13         4
44         3
11         3
47         1
3          1
Name: num_AIN, dtype: int64

## Clean up data and get rid of outliers

In [12]:
# Treat the 100 cases with the most rows in pcts_df as outliers and drop them.
# NOTE: this cell overwrites pcts_df, so running it again on its own removes a
# further 100 cases each time; re-run the crosswalk merge cell first.
big_cases = pcts_df["CASE_ID"].value_counts().index[:100]
pcts_df = pcts_df.loc[~pcts_df["CASE_ID"].isin(big_cases)]

In [13]:
# First pass at analyzing entitlements: count the rows for every
# (tract, suffix, case year) combination, to see which kinds of entitlements
# are being applied for in which types of census tract.
# The result stays indexed by GEOID, with year / suffix / count as columns.
entitlement = (
    pcts_df
    .groupby(["GEOID", "suffix", "CASE_YR_NBR"])
    .size()
    .to_frame("count")
    .reset_index(level=["suffix", "CASE_YR_NBR"])
    .rename(columns={"CASE_YR_NBR": "year"})
    # Nullable integer dtype for the year
    .assign(year=lambda counts: counts["year"].astype("Int64"))
    [["year", "suffix", "count"]]
)

In [14]:
# Merge the census data with the entitlement counts (one census row per
# tract, many entitlement rows per tract), then tidy up the result.
# NOTE(review): dropna() removes every row with a missing value in any column,
# which includes tracts that matched no entitlement (their suffix / year are
# NaN), so the zero-filled counts do not survive this step -- confirm that
# this is intended.
joined = (
    census
    .merge(entitlement, on="GEOID", how="left", validate="1:m")
    # Somehow, Int64 is throwing error in the map below, but int works
    .assign(count=lambda merged: merged["count"].fillna(0).astype(int))
    .dropna()
    .sort_values(["GEOID", "suffix", "year"])
)

joined.head()

Unnamed: 0,GEOID,non_car_workers,workers_total,pct_non_car_workers,zero_veh_workers,pct_zero_veh_workers,pop_renter,pop_total,pct_pop_renter,pop_whitenonhisp,...,total_r75to99,total_r100to124,total_r125to149,total_r150to199,total_gt200,total_total,density,year,suffix,count
0,6037101110,46,1927,0.023871,0,0.0,2199,4219,0.521214,2516,...,277,91,93,109,44,1596,9565.898824,2012,CUW,2
1,6037101110,46,1927,0.023871,0,0.0,2199,4219,0.521214,2516,...,277,91,93,109,44,1596,9565.898824,2013,CUW,1
2,6037101110,46,1927,0.023871,0,0.0,2199,4219,0.521214,2516,...,277,91,93,109,44,1596,9565.898824,2014,EXT,1
3,6037101110,46,1927,0.023871,0,0.0,2199,4219,0.521214,2516,...,277,91,93,109,44,1596,9565.898824,2014,PMLA,1
4,6037101110,46,1927,0.023871,0,0.0,2199,4219,0.521214,2516,...,277,91,93,109,44,1596,9565.898824,2017,PMLA,1


## Plot

In [15]:
# Plot entitlement stats against median household income,
# population density, and geography:
def plot_entitlement(df, tracts, suffix, year="2017"):
    """Draw entitlement counts per tract against income, density and geography.

    The figure holds two scatter plots (count vs. median household income and
    count vs. population density) and a tract map colored by count.

    Parameters
    ----------
    df : pandas.DataFrame
        Entitlement counts by tract. The columns ``GEOID``, ``suffix``,
        ``year``, ``count``, ``medhhincome`` and ``density`` are used.
    tracts : DataFrame with a ``GEOID`` column
        Tract table that the counts are merged onto. The merged result is
        plotted with ``column="count"``, so this is presumably a geopandas
        GeoDataFrame -- confirm against the caller.
    suffix : str
        Entitlement suffix to plot.
    year : str or int, default "2017"
        A single year, or ``"all"`` to sum the counts from 2010 onwards.

    Returns
    -------
    matplotlib.figure.Figure
        The finished figure. It has already been closed in pyplot, so it is
        rendered only when the caller displays it.
    """
    has_cases = (df["count"] > 0) & (df.suffix == suffix)

    if year == "all":
        # Collapse the years into one row per tract: counts are summed, while
        # income and density are taken from the first row of each tract.
        to_plot = (
            df[has_cases & (df.year >= 2010)]
            .groupby("GEOID")
            .agg({"count": "sum", "medhhincome": "first", "density": "first"})
            .reset_index()
        )
    else:
        to_plot = df[has_cases & (df.year == int(year))]

    # Merge onto the tract table. Tracts without a matching row get a count of
    # 0 but keep NaN income / density, so they show up on the map but not in
    # the scatter plots.
    final_df = tracts.merge(
        to_plot,
        on="GEOID",
        how="left",
    ).fillna(
        {"count": 0}
    )

    # Bind every axes to this figure explicitly instead of relying on
    # pyplot's notion of the "current" figure.
    fig = plt.figure(figsize=(12, 8))

    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1, fig=fig)
    ax1.set_xlabel("Median household income")
    ax1.set_ylabel("Entitlement count per tract")
    ax1.scatter(final_df.medhhincome, final_df["count"])

    ax2 = plt.subplot2grid((2, 2), (1, 0), colspan=1, fig=fig)
    ax2.set_xlabel("Population density")
    ax2.set_ylabel("Entitlement count per tract")
    ax2.scatter(final_df.density, final_df["count"])

    ax3 = plt.subplot2grid((2, 2), (0, 1), rowspan=2, fig=fig)
    ax3.axis("off")

    final_df.plot(
        ax=ax3,
        column="count",
        cmap="magma",
    )

    # Close this specific figure so the notebook does not render it a second
    # time on top of the caller's explicit display().
    plt.close(fig)
    return fig

In [16]:
# Year choices as (label, value) pairs: an "all years" entry, then 2010-2019.
years = [("All (from 2010 - present)", "all")]
years += [(str(yr), str(yr)) for yr in range(2010, 2020)]

year_dropdown = ipywidgets.Dropdown(description="Year", options=years)
# The suffix options are filled in by on_year_selection once a year is known.
suffix_dropdown = ipywidgets.Dropdown(description="Suffix")
output = ipywidgets.Output()

display(year_dropdown, suffix_dropdown, output)

# While True, on_suffix_selection ignores change events; on_year_selection
# sets it while it rewrites the suffix dropdown.
change_guard = False

def on_suffix_selection(*args):
    """Redraw the figure for the currently selected suffix and year.

    Used as an observer callback, so any positional arguments are accepted
    and ignored. Nothing happens while ``change_guard`` is set.
    """
    if not change_guard:
        output.clear_output(wait=True)
        with output:
            figure = plot_entitlement(
                joined,
                tracts,
                suffix_dropdown.value,
                year_dropdown.value,
            )
            display(figure)

def on_year_selection(*args):
    """Rebuild the suffix dropdown for the selected year, then redraw the plot.

    Used as an observer callback, so any positional arguments are accepted
    and ignored. Each suffix option is labelled with the number of
    applications counted for the selected year (or for 2010 onwards when
    "all" is selected). The previously selected suffix is kept when it is
    still available; otherwise the first option is selected.
    """
    global change_guard
    if year_dropdown.value == "all":
        condition = (joined.year >= 2010)
    else:
        condition = (joined.year == int(year_dropdown.value))
    counts = joined[condition].groupby("suffix").agg({"count": "sum"})["count"]
    # Options follow the groupby order of the suffixes; sort `counts` by value
    # here instead if a most-common-first ordering is preferred.
    old_val = suffix_dropdown.value

    # Silence on_suffix_selection while the options and selection are being
    # rewritten, so the plot is drawn once at the end rather than on every
    # intermediate change. try/finally guarantees the guard is released even
    # if updating the widget raises; otherwise later suffix changes would be
    # ignored for the rest of the session.
    change_guard = True
    try:
        suffix_dropdown.options = [
            (f"{name} ({count:,} applications)", name)
            for name, count in zip(counts.index, counts)
        ]
        if old_val in counts.index:
            suffix_dropdown.value = old_val
        elif len(counts) > 0:
            # With no options at all (a year without any rows) there is
            # nothing to select, and setting index 0 would raise.
            suffix_dropdown.index = 0
    finally:
        change_guard = False
    on_suffix_selection()

# Populate the suffix dropdown and draw the first figure before any observers
# are attached.
on_year_selection()
# From here on, re-run the matching callback whenever a dropdown value changes.
suffix_dropdown.observe(on_suffix_selection, names="value")
year_dropdown.observe(on_year_selection, names="value")

Dropdown(description='Year', options=(('All (from 2010 - present)', 'all'), ('2010', '2010'), ('2011', '2011')…

Dropdown(description='Suffix', options=(), value=None)

Output()