# Correlation matrix for TOC regressions

In [1]:
import altair as alt
import intake
import pandas as pd

import laplan

In [2]:
# Open every intake catalog YAML file found in the sibling `catalogs` directory;
# later cells read their inputs through this `catalog` object.
catalog = intake.open_catalog("../catalogs/*.yml")

# NOTE(review): not referenced by any other cell visible in this notebook —
# presumably the storage bucket holding the project data; confirm before removing.
bucket_name = 'city-planning-entitlements'

In [3]:
def prep_pcts():
    """Load PCTS cases from the catalog and keep the TOC-related columns.

    Reads `catalog.pcts`, passes it through `laplan.pcts.subset_pcts` with a
    start date of 10/1/2017, the single suffix "TOC" and dummy indicators
    enabled, then returns only the case identifiers, the parcel AIN and the
    TOC indicator column.

    Returns:
        DataFrame with columns CASE_ID, CASE_NUMBER, AIN and TOC.
    """
    raw_cases = catalog.pcts.read()

    toc_cases = laplan.pcts.subset_pcts(
        raw_cases,
        start_date="10/1/2017",
        suffix_list=["TOC"],
        get_dummies=True,
        verbose=True,  # prints parsing progress into the cell output
    )

    keep = ["CASE_ID", "CASE_NUMBER", "AIN", "TOC"]
    return toc_cases[keep]

In [4]:
# Build the TOC case table once; the verbose output below comes from laplan's subset step.
pcts = prep_pcts()

Parsing PCTS case numbers
0 cases failed to parse.
Getting dummy indicators for case types
Prefixes with no associated cases:  {'CPC', 'VTT', 'APCSV', 'APCH', 'PS', 'APCS', 'APCE', 'CHC', 'ENV', 'APCNV', 'AA', 'HPO', 'TT', 'APCW'}


In [5]:
# Load the three catalog tables used by the rest of the notebook:
# the parcel-to-tract crosswalk, the census analysis table and the
# tract-level zone class crosswalk.
crosswalk_parcels_tracts = catalog.crosswalk_parcels_tracts.read()
census = catalog.census_analysis_table.read()
zoning = catalog.crosswalk_tracts_zone_class.read()

# Zone classes treated as favorable for TOC in this analysis; only these
# columns (plus the tract identifier) are kept from the zoning table.
favorable_toc_zones = ["R2", "R3", "C2"]
zoning = zoning.loc[:, ["GEOID", *favorable_toc_zones]]

# One row per distinct (GEOID, total_AIN, pct_toc_AIN, toc_AIN) combination,
# with toc_AIN cast to a boolean flag.
toc_eligible_tracts = (
    crosswalk_parcels_tracts
    .loc[:, ["GEOID", "total_AIN", "pct_toc_AIN", "toc_AIN"]]
    .drop_duplicates()
    .assign(toc_AIN=lambda tracts: tracts["toc_AIN"].astype(bool))
)

# Count parcels per (GEOID, TOC_Tier) for rows with a positive tier, then
# spread the tiers into columns named Tier_<tier>, indexed by GEOID.
# Tract/tier combinations with no parcels become 0.
tier_parcels = (
    crosswalk_parcels_tracts
    .loc[lambda parcels: parcels["TOC_Tier"] > 0]
    .groupby(["GEOID", "TOC_Tier"])
    .size()
    .to_frame("n")
    .reset_index(level="TOC_Tier")
    .pivot(columns="TOC_Tier", values="n")
    .rename(columns=lambda tier: f"Tier_{tier}")
    .fillna(0.0)
    .astype(int)
)

In [6]:
def assemble_data(pcts, crosswalk_parcels_tracts, tier_parcels, zoning, census,
                  zone_cols=None):
    """Join TOC cases to parcel, tier, zoning and census attributes.

    Args:
        pcts: case table with at least CASE_ID and AIN columns.
        crosswalk_parcels_tracts: parcel table with uuid, AIN, GEOID,
            total_AIN, pct_toc_AIN and toc_AIN columns; AIN must be unique
            (enforced by the m:1 merge validation).
        tier_parcels: per-GEOID table of tier counts (one row per GEOID).
        zoning: per-GEOID table containing the zone-class columns.
        census: per-GEOID table of census attributes.
        zone_cols: names of the zoning columns summed into
            `favorable_toc_zoning`. Defaults to the notebook-level
            `favorable_toc_zones` list, which is what the function always
            used before this parameter existed.

    Returns:
        DataFrame with one row per (CASE_ID, uuid), sorted by those keys,
        where `toc_AIN` is renamed to a boolean `TOC_eligible` column and
        `favorable_toc_zoning` holds the row-wise sum of `zone_cols`.

    Raises:
        pandas.errors.MergeError: if any right-hand table violates the
            many-to-one relationship on its merge key.
    """
    if zone_cols is None:
        # Backward-compatible fallback to the list defined in the notebook.
        zone_cols = favorable_toc_zones
    zone_cols = list(zone_cols)

    parcel_cols = ["uuid", "AIN", "GEOID", "total_AIN", "pct_toc_AIN", "toc_AIN"]

    merged = (
        pd.merge(
            pcts,
            crosswalk_parcels_tracts[parcel_cols],
            how="inner",
            on="AIN",
            validate="m:1",
        )
        .merge(tier_parcels, how="left", on="GEOID", validate="m:1")
        .merge(zoning, how="left", on="GEOID", validate="m:1")
        .merge(census, how="left", on="GEOID", validate="m:1")
        # Unmatched left-join rows get 0 for every attribute. A dropna() after
        # this would never remove anything, so it is intentionally omitted.
        .fillna(0)
        .rename(columns={"toc_AIN": "TOC_eligible"})
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return (
        merged.sort_values(["CASE_ID", "uuid"])
        .drop_duplicates(subset=["CASE_ID", "uuid"], keep="first")
        .reset_index(drop=True)
        # Callables are evaluated on the sorted, de-duplicated frame. Passing
        # Series taken from `merged` would be aligned on the old index and
        # attach values to the wrong rows.
        .assign(
            TOC_eligible=lambda d: d["TOC_eligible"].astype(bool),
            favorable_toc_zoning=lambda d: d[zone_cols].sum(axis=1),
        )
    )

In [7]:
# Case-level table: one row per (CASE_ID, uuid) with tract attributes attached.
df = assemble_data(
    pcts,
    crosswalk_parcels_tracts,
    tier_parcels,
    zoning,
    census,
)

# Tract identifier plus the tract-level attributes carried through the
# aggregation as grouping keys.
group_cols = (
    ["GEOID", "pct_toc_AIN", "TOC_eligible"]
    + [f"Tier_{tier}" for tier in range(1, 5)]
    + ["pct_pop_renter", "pct_whitenonhisp", "medhhincome"]
    + ["density", "favorable_toc_zoning"]
)

In [8]:
# Aggregate cases per group: sum of the TOC indicator and number of distinct
# cases, with the grouping keys restored as ordinary columns.
summary = (
    df.groupby(group_cols)
    .agg({"TOC": "sum", "CASE_ID": "nunique"})
    .rename(
        columns={
            "TOC": "TOC_entitlements",
            "CASE_ID": "TOC_cases",
        }
    )
    .reset_index()
)

In [9]:
# Pairwise correlations between the columns of `summary`, reshaped to long
# form (one row per ordered pair of variables). Numeric and boolean columns
# are selected explicitly so the result does not depend on the default of
# `DataFrame.corr(numeric_only=...)`, which differs between pandas versions:
# with the newer default a non-numeric column is not dropped silently.
correlation_long = (
    summary.select_dtypes(include=["number", "bool"])
    .corr()
    .reset_index()
    .melt("index")
)

In [10]:
# Name the three columns of the long-form table; the heatmap cell refers to
# them as var1 / var2 / correlation.
correlation_long.columns = ['var1', 'var2', 'correlation']

In [11]:
# Shared base: one cell per (var1, var2) pair. The filter keeps only pairs
# where var1 compares less than var2, so each unordered pair appears once and
# the diagonal is dropped.
base = (
    alt.Chart(correlation_long)
    .encode(x="var1", y="var2")
    .properties(width=alt.Step(40), height=alt.Step(40))
    .transform_filter(alt.datum.var1 < alt.datum.var2)
)

# Colored tiles encode the correlation value.
rects = base.mark_rect().encode(color="correlation")

# Numeric labels on top of the tiles; white text where correlation > 0.5,
# black otherwise.
text = base.mark_text(size=10).encode(
    text=alt.Text("correlation", format=".2f"),
    color=alt.condition(
        "datum.correlation > 0.5",
        alt.value("white"),
        alt.value("black"),
    ),
)

rects + text

In [12]:
# Columns shown on both axes of the scatter-plot matrix.
corr_cols = [
    "pct_toc_AIN",
    "Tier_1", "Tier_2", "Tier_3", "Tier_4",
    "pct_pop_renter", "pct_whitenonhisp", "medhhincome",
    "density", "favorable_toc_zoning",
    "TOC_entitlements", "TOC_cases",
]

# NOTE(review): not used by any cell visible in this notebook.
regress_cols = ["pct_pop_renter", "favorable_toc_zoning", "Tier_2", "Tier_3"]

# One 150x150 panel per (row, column) pair of corr_cols, with points colored
# by TOC_cases treated as a nominal field.
scatter = (
    alt.Chart(summary)
    .mark_circle()
    .encode(
        x=alt.X(alt.repeat("column"), type="quantitative"),
        y=alt.Y(alt.repeat("row"), type="quantitative"),
        color="TOC_cases:N",
    )
    .properties(width=150, height=150)
    .repeat(row=corr_cols, column=corr_cols)
)

scatter

In [13]:
# Scatter plot of two `summary` columns with the correlation coefficient
# annotated in the corner and a regression line overlaid.
# https://stackoverflow.com/questions/61277181/adding-r-value-correlation-to-scatter-chart-in-altair

# The column pair is defined once so the points, the r value and the fitted
# line always describe the same two variables.
x_col = "Tier_3"
y_col = "pct_pop_renter"

chart = alt.Chart(summary[[x_col, y_col]]).mark_circle().encode(
    x=alt.X(x_col),
    y=y_col,
)

# Off-diagonal entry of the 2x2 correlation matrix, i.e. r between x_col and y_col.
correlation = summary[[x_col, y_col]].corr().iloc[0, 1]

# Single-row dummy dataset so the label is drawn exactly once.
text = alt.Chart({'values':[{}]}).mark_text(
    align="left", baseline="top"
).encode(
    x=alt.value(5),  # pixels from left
    y=alt.value(5),  # pixels from top
    text=alt.value(f"r: {correlation:.3f}"),
)

chart + text + chart.transform_regression(x_col, y_col).mark_line()

In [14]:
# Reference only: the upstream Altair example that the correlation heatmap in
# this notebook follows. It is kept as a string literal and is not executed;
# because it is the cell's last expression, its repr is echoed as cell output.
"""
https://github.com/altair-viz/altair/pull/1945

import altair as alt
from vega_datasets import data

df_iris = data.iris()
corrMatrix = df_iris.corr().reset_index().melt('index')
corrMatrix.columns = ['var1', 'var2', 'correlation']

base = alt.Chart(corrMatrix).transform_filter(
    alt.datum.var1 < alt.datum.var2
).encode(
    x='var1',
    y='var2',
).properties(
    width=alt.Step(100),
    height=alt.Step(100)
)

rects = base.mark_rect().encode(
    color='correlation'
)

text = base.mark_text(
    size=30
).encode(
    text=alt.Text('correlation', format=".2f"),
    color=alt.condition(
        "datum.correlation > 0.5",
        alt.value('white'),
        alt.value('black')
    )
)

rects + text
"""

'\nhttps://github.com/altair-viz/altair/pull/1945\n\nimport altair as alt\nfrom vega_datasets import data\n\ndf_iris = data.iris()\ncorrMatrix = df_iris.corr().reset_index().melt(\'index\')\ncorrMatrix.columns = [\'var1\', \'var2\', \'correlation\']\n\nbase = alt.Chart(corrMatrix).transform_filter(\n    alt.datum.var1 < alt.datum.var2\n).encode(\n    x=\'var1\',\n    y=\'var2\',\n).properties(\n    width=alt.Step(100),\n    height=alt.Step(100)\n)\n\nrects = base.mark_rect().encode(\n    color=\'correlation\'\n)\n\ntext = base.mark_text(\n    size=30\n).encode(\n    text=alt.Text(\'correlation\', format=".2f"),\n    color=alt.condition(\n        "datum.correlation > 0.5",\n        alt.value(\'white\'),\n        alt.value(\'black\')\n    )\n)\n\nrects + text\n'