# What are the demographic characteristics of neighborhoods where entitlements are requested?

In [1]:
import intake
import IPython.display
import matplotlib.pyplot as plt
import pandas

# Open the intake catalog(s) matched by the glob; the `pcts2` and
# `census_tracts` entries are read from it in later cells.
cat = intake.open_catalog("../catalogs/*.yml")

In [2]:
# Load the PCTS extract from the catalog entry `pcts2`.
# Later cells rely on its CASE_NBR, AIN and CASE_YR_NBR columns.
pcts = cat.pcts2.read()

In [3]:
# ACS data for income, race, commute, tenure.
# The table is in long format: later cells filter it on `table` and `year`
# and pivot `new_var`/`num` into one column per variable, keyed by GEOID.
census = pandas.read_parquet(
    "s3://city-planning-entitlements/data/final/census_cleaned.parquet"
)

In [4]:
# Census tracts, extended with two derived columns:
#   density    -- HD01_VD01 (cast to int) divided by the tract area, where
#                 Shape_STAr / 5280 / 5280 is presumably square feet converted
#                 to square miles -- TODO confirm the units of Shape_STAr.
#   population -- the HD01_VD01 column as loaded (NOTE: not cast to int here,
#                 unlike in the density computation).
tracts = cat.census_tracts.read().assign(
    density=lambda t: t.HD01_VD01.astype(int) / (t.Shape_STAr / 5280.0 / 5280.0),
    population=lambda t: t.HD01_VD01,
)

In [5]:
# Crosswalk between parcels and census tracts. Only its `AIN` (join key
# against the PCTS extract) and `GEOID` columns are used below.
parcel_to_tract = pandas.read_parquet(
    "s3://city-planning-entitlements/data/crosswalk_parcels_tracts.parquet"
)

In [6]:
# The requested entitlements are in the suffixes of the PCTS case number.
# A given case can have an arbitrary number of entitlement suffixes, so
# we need to parse it into its component parts.
# We can use a regex for that:

from utils import GENERAL_PCTS_RE
# Apply the regex to every case number: one row per case, with the regex's
# capture groups spread across the columns of `cols`.
cols = pcts.CASE_NBR.str.extract(GENERAL_PCTS_RE)

In [7]:
# Column 3 of the regex output is presumably the dash-separated suffix part of
# the case number (defined by GENERAL_PCTS_RE in utils -- confirm there).
# Strip the outer dashes and split on "-" so every suffix gets its own column;
# cases with fewer suffixes are padded with missing values.
all_suffixes = cols[3].str.strip("-").str.split("-", expand=True)

In [8]:
# Total number of occurrences of each suffix across all cases:
# count values within every suffix column, add the per-column counts
# together, and list the most frequent suffixes first.
suffix_counts = (
    all_suffixes
    .apply(pandas.Series.value_counts, axis=0)
    .sum(axis=1)
    .astype(int)
    .sort_values(ascending=False)
)

In [9]:
# Associate each PCTS entitlement case with a census tract, in two left joins
# so that cases without a match are kept (with missing tract attributes).
# Step 1: parcel (AIN) -> tract identifier (GEOID).
pcts = pcts.merge(parcel_to_tract[["GEOID", "AIN"]], on="AIN", how="left")
# Step 2: tract identifier -> tract population and density.
pcts = pcts.merge(
    tracts[["GEOID10", "population", "density"]],
    left_on="GEOID",
    right_on="GEOID10",
    how="left",
)

In [10]:
# Put the parsed suffix columns next to the case data. `concat` with axis=1
# pairs rows by index label, not by position.
# NOTE(review): `all_suffixes` was built from `pcts` before the merges above;
# this assumes those merges neither added rows nor changed the index labels
# (e.g. the crosswalk has at most one row per AIN) -- TODO confirm.
pcts_suffixes = pandas.concat((pcts, all_suffixes), axis=1)

In [11]:
# A case may request several entitlements, each stored in its own suffix
# column. For aggregation it is easier to have a single `suffix` column,
# so reshape from wide to long: every original `pcts` column is kept as an
# identifier and each non-missing suffix becomes its own row. Cases with
# several suffixes therefore appear on several rows.
# The helper column naming the source suffix column ("nothing") and the
# redundant GEOID10 join key are dropped afterwards.
pcts_suffixes = (
    pcts_suffixes
    .melt(id_vars=pcts.columns, var_name="nothing", value_name="suffix")
    .dropna(subset=["suffix"])
    .drop(columns=["GEOID10", "nothing"])
)

In [12]:
# First pass at the analysis: count cases per census tract, suffix and year,
# to see which kinds of entitlements are applied for in which kinds of tract.
# The result stays indexed by GEOID; `suffix` and the case year are moved out
# of the index into ordinary columns (one level at a time, by name), and the
# year is stored as a nullable integer.
entitlement = (
    pcts_suffixes
    .groupby(["GEOID", "suffix", "CASE_YR_NBR"])
    .size()
    .to_frame("count")
    .reset_index(level="suffix")
    .reset_index(level="CASE_YR_NBR")
    .rename(columns={"CASE_YR_NBR": "year"})
    .assign(year=lambda counts: counts.year.astype("Int64"))
)

In [13]:
# Tract-level incomes: keep only the 2018 rows of the "incomerange" table,
# then pivot `new_var` to wide format so each variable becomes its own
# column, with one row per GEOID.
income = (
    census
    .loc[(census["table"] == "incomerange") & (census["year"] == 2018)]
    .pivot(index="GEOID", columns="new_var", values="num")
)

In [14]:
# Combine income data with the entitlement counts. Both frames are indexed
# by GEOID; the inner join keeps only tracts present in both.
joined = income.merge(
    entitlement,
    how="inner",
    left_index=True,
    right_index=True,
)

In [15]:
# We'd like to calculate income percentiles from the reported ranges.
# The following function takes a row from the pivoted ACS data,
# and estimates a set of percentiles from the binned data:
def income_percentiles(row, percentiles, prefix="total", top_edge=300.0):
    """Estimate income percentiles for one tract from binned counts.

    The counts are read from ``row`` using labels built from ``prefix`` and
    the bin edges: ``{prefix}_lt10`` for the first bin, ``{prefix}_r10to14``
    style labels for the interior bins, ``{prefix}_gt200`` for the open-ended
    top bin, and ``{prefix}_total`` for the overall count. Within a bin the
    percentile is placed by linear interpolation between the bin's edges.

    Parameters
    ----------
    row : mapping
        Anything indexable by label (a pandas row, a dict, ...).
    percentiles : iterable of numbers
        Percentiles in the 0-100 range, expected in ascending order: the
        bins are walked once, front to back, and earlier bins are never
        revisited.
    prefix : str
        Prefix of the labels to read from ``row``.
    top_edge : float
        Upper edge, in thousands of dollars, assumed for the open-ended top
        bin when interpolating inside it.

    Returns
    -------
    list of float
        Estimated values in thousands of dollars, one per percentile in
        input order. The list is empty when ``percentiles`` is empty or the
        total count is not positive, and it is shorter than ``percentiles``
        when a percentile is never strictly exceeded by the cumulative share
        (for example 100).
    """
    # Edges of the reported income bins, in thousands of dollars
    bins = [0, 10, 15, 20, 25, 30, 35, 40, 45, 50, 60, 75, 100, 125, 150, 200]
    values = []

    remaining = iter(percentiles)
    # Nothing requested: return early instead of leaking StopIteration.
    curr = next(remaining, None)
    if curr is None:
        return values

    # The total count for the tract
    total = row[f"{prefix}_total"]
    if total <= 0:
        return values

    acc = 0  # households counted in all bins below the current one
    last = len(bins) - 1
    for i, b in enumerate(bins):
        # Compute the label of the current bin
        if i == 0:
            label = f"{prefix}_lt{bins[i + 1]}"
        elif i == last:
            label = f"{prefix}_gt{b}"
        else:
            label = f"{prefix}_r{b}to{bins[i + 1] - 1}"
        count = row[label]
        upper = bins[i + 1] if i < last else top_edge
        # A single bin can contain several of the requested percentiles,
        # hence a loop rather than a single check.
        while (acc + count) / total > curr / 100.0:
            # Fraction of this bin's households below the percentile
            frac = (total * curr / 100.0 - acc) / count
            values.append((1.0 - frac) * b + frac * upper)
            curr = next(remaining, None)
            if curr is None:
                return values
        # Increment the accumulator
        acc = acc + count
    return values

In [16]:
# Quartiles (25th, 50th and 75th percentiles) of income for every row of
# `joined`, estimated from the binned counts. Rows for which fewer than
# three values come back are padded with missing values.
iqr = (
    joined
    .apply(
        lambda tract: pandas.Series(
            income_percentiles(tract, [25, 50, 75]),
            dtype="float64",
        ),
        axis="columns",
    )
    .rename(columns=dict(enumerate(["Q1", "Q2", "Q3"])))
)

In [17]:
# Attach the quartiles and keep only the columns needed for plotting.
joined = pandas.concat([joined, iqr], axis="columns")[
    ["Q1", "Q2", "Q3", "suffix", "year", "count"]
]
# Bring in population density from the tract table, restore GEOID as the
# index, and drop every row that still has a missing value.
joined = (
    joined
    .reset_index()
    .merge(
        tracts[["GEOID10", "density"]],
        how="left",
        left_on="GEOID",
        right_on="GEOID10",
    )
    .drop(columns="GEOID10")
    .set_index("GEOID")
    .dropna()
)

In [18]:
# Plot entitlement stats against median household income,
# population density, and geography:
def plot_entitlement(df, tracts, suffix, year="2017"):
    """Build a three-panel figure for one entitlement suffix.

    Left column: scatter plots of the per-tract entitlement count against
    median income (``Q2``) and against population density. Right column: a
    map of the tracts colored by entitlement count, with tracts that have no
    matching rows shown as zero.

    Parameters
    ----------
    df : DataFrame indexed by tract id, with ``count``, ``year``, ``suffix``,
        ``Q2`` and ``density`` columns.
    tracts : tract table with a ``GEOID10`` column; presumably a GeoDataFrame,
        since its ``plot`` is called with ``column``/``cmap`` -- confirm.
    suffix : the entitlement suffix to show.
    year : a year as a string (converted with ``int``), or ``"all"`` to sum
        the counts per tract over every year from 2010 onward.

    Returns
    -------
    The matplotlib figure. It is closed before being returned so that it is
    not rendered a second time.
    """
    # Rows with at least one case for the requested suffix
    wanted = (df["count"] > 0) & (df.suffix == suffix)
    if year == "all":
        subset = df[wanted & (df.year >= 2010)]
        # Collapse the years: one row per tract with the summed count.
        subset = subset.groupby(subset.index).agg({
            "count": "sum",
            "Q2": "first",
            "density": "first"
        })
    else:
        subset = df[wanted & (df.year == int(year))]

    figure = plt.figure(figsize=(12, 8))

    # The two scatter panels share everything except position and x-axis.
    scatter_panels = (
        ((0, 0), "Median income", subset.Q2),
        ((1, 0), "Population density", subset.density),
    )
    for position, xlabel, xvalues in scatter_panels:
        axis = plt.subplot2grid((2, 2), position, colspan=1)
        axis.set_xlabel(xlabel)
        axis.set_ylabel("Entitlement count per tract")
        axis.scatter(xvalues, subset["count"])

    # Map panel spanning both rows of the right-hand column
    map_axis = plt.subplot2grid((2, 2), (0, 1), rowspan=2)
    map_axis.axis("off")
    counts_by_tract = tracts.merge(
        subset,
        left_on="GEOID10",
        right_index=True,
        how="left"
    ).fillna({"count": 0})
    counts_by_tract.plot(ax=map_axis, column="count", cmap="magma")

    plt.close()  # Prevent double plotting
    return figure

In [19]:
import ipywidgets

# Year choices as (label, value) pairs: an "all" entry followed by one entry
# for each year from 2010 through 2019.
years = [("All (from 2010 - present)", "all")]
years.extend((str(yr), str(yr)) for yr in range(2010, 2020))

# Set while the suffix dropdown is being repopulated programmatically, so
# that the suffix callback does not redraw for every intermediate change.
change_guard = False

# The suffix dropdown starts without options; they are filled in per year.
year_dropdown = ipywidgets.Dropdown(description="Year", options=years)
suffix_dropdown = ipywidgets.Dropdown(description="Suffix")
output = ipywidgets.Output()

display(year_dropdown)
display(suffix_dropdown)
display(output)

def on_suffix_selection(*args):
    """Redraw the figure for the currently selected suffix and year.

    Does nothing while ``change_guard`` is set. Otherwise the output widget
    is cleared (the old figure stays until the new one is ready) and the new
    figure is displayed inside it. Positional arguments are accepted so the
    function can be used as a widget observer; they are ignored.
    """
    if change_guard:
        return
    output.clear_output(wait=True)
    with output:
        figure = plot_entitlement(
            joined,
            tracts,
            suffix_dropdown.value,
            year_dropdown.value,
        )
        display(figure)

def on_year_selection(*args):
    """Repopulate the suffix dropdown for the selected year, then redraw.

    The suffix options are rebuilt from the rows of ``joined`` matching the
    selected year (or every year from 2010 onward for "all"), ordered by
    total application count. The previous suffix selection is kept when it
    is still available; otherwise the most frequent suffix is selected.
    Positional arguments are accepted so the function can be used as a
    widget observer; they are ignored.
    """
    global change_guard
    selected_year = year_dropdown.value
    if selected_year == "all":
        condition = (joined.year >= 2010)
    else:
        condition = (joined.year == int(selected_year))
    counts = joined[condition].groupby("suffix").agg({"count": "sum"})["count"]
    counts = counts.sort_values(ascending=False)
    old_val = suffix_dropdown.value
    # Suppress the suffix callback while options and selection are changed,
    # so the figure is drawn only once, at the end. The guard is released in
    # `finally` so a failure here cannot leave that callback disabled.
    change_guard = True
    try:
        suffix_dropdown.options = [
            (f"{name} ({count:,} applications)", name)
            for name, count in zip(counts.index, counts)
        ]
        if old_val in counts.index:
            suffix_dropdown.value = old_val
        elif len(counts) > 0:
            # Index 0 is only a valid selection when there is at least one
            # option; with no options the dropdown is left unselected.
            suffix_dropdown.index = 0
    finally:
        change_guard = False
    on_suffix_selection()

# Fill the suffix dropdown and draw the first figure once, by hand, before
# any observer is registered.
on_year_selection()
# From here on, re-run the matching callback whenever the user changes the
# value of either dropdown.
suffix_dropdown.observe(on_suffix_selection, names="value")
year_dropdown.observe(on_year_selection, names="value")

Dropdown(description='Year', options=(('All (from 2010 - present)', 'all'), ('2010', '2010'), ('2011', '2011')…

Dropdown(description='Suffix', options=(), value=None)

Output()