# Explore outliers
* There are 38 cases that have touch 20+ parcels, 4 are explicitly mentioned in Ashley's email that should only apply to 1 parcel, but somehow, are touching a lot of parcels
* Check if this is true in the uncleaned PCTS
* Is this a data cleaning issue or did the raw data come like this?

In [1]:
import boto3
import geopandas as gpd
import intake
import ipywidgets
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import laplan

In [2]:
cat = intake.open_catalog("../catalogs/*.yml")

s3 = boto3.client('s3')
bucket_name = "city-planning-entitlements"

In [3]:
"""
# CASE_NBRs
DIR-2013-684-VSO -- should only apply to PIN # 108B149 1190
DIR-2012-2817-VSO-MEL -- same property as DIR-2013-684-VSO above
ZA-2015-3539-CUW -- should only apply to PIN # 157-5A207 44
DIR-2012-1857-DI -- should only apply to 141B185 347
"""

# Corresponding CASE_IDs
BIG_CASES = [190670, 188939, 204512, 187855]
VENICE_CASES = [190670, 188939]

In [4]:
"""
def create_outlier_parcel_geom_file():
    # Import data
    parcel_geom = gpd.read_file(f'zip+s3://{bucket_name}/gis/intermediate/la_parcels_with_dups.zip')
    
    cases = pd.read_parquet('../data/tCASE.parquet')
    app = pd.read_parquet('../data/tAPLC.parquet')
    geo_info = pd.read_parquet('../data/tPROP_GEO_INFO.parquet')
    la_prop = pd.read_parquet('../data/tLA_PROP.parquet')
    
    # Only keep min cols needed
    cases = cases[['CASE_ID', 'APLC_ID', 'CASE_NBR', 'CASE_FILE_RCV_DT']]
    geo_info = geo_info[["CASE_ID", "PROP_ID"]]
    la_prop = la_prop[la_prop.ASSR_PRCL_NBR.notna()][['PROP_ID', 'ASSR_PRCL_NBR']]
    
    # Start merging and assembling
    big_case_ids = cases[cases.CASE_ID.isin(BIG_CASES)]
    raw2 = pd.merge(big_case_ids, geo_info, on = "CASE_ID", how = "inner")
    raw3 = pd.merge(raw2, la_prop, on = "PROP_ID", how  = "inner", validate = "m:1")

    # Save only the parcel geom for these big cases...otherwise, parcel file is too unwieldy
    big_case_parcels = (raw3[['ASSR_PRCL_NBR']]
                        .drop_duplicates()
                        .reset_index(drop=True)
                       )
    
    big_case_parcels2 = pd.merge(parcel_geom, big_case_parcels, 
         left_on = "AIN", right_on = "ASSR_PRCL_NBR", 
         how = "inner", validate = "1:1")
    
    
    # Save to S3 as geoparquet
    file_name = 'big_case_parcels.parquet'
    big_case_parcels2.to_parquet(f'../gis/{file_name}')
    s3.upload_file(f'../gis/{file_name}', bucket_name, f'gis/{file_name}')
   
   
create_outlier_parcel_geom_file()
"""

'\ndef create_outlier_parcel_geom_file():\n    # Import data\n    parcel_geom = gpd.read_file(f\'zip+s3://{bucket_name}/gis/intermediate/la_parcels_with_dups.zip\')\n    \n    cases = pd.read_parquet(\'../data/tCASE.parquet\')\n    app = pd.read_parquet(\'../data/tAPLC.parquet\')\n    geo_info = pd.read_parquet(\'../data/tPROP_GEO_INFO.parquet\')\n    la_prop = pd.read_parquet(\'../data/tLA_PROP.parquet\')\n    \n    # Only keep min cols needed\n    cases = cases[[\'CASE_ID\', \'APLC_ID\', \'CASE_NBR\', \'CASE_FILE_RCV_DT\']]\n    geo_info = geo_info[["CASE_ID", "PROP_ID"]]\n    la_prop = la_prop[la_prop.ASSR_PRCL_NBR.notna()][[\'PROP_ID\', \'ASSR_PRCL_NBR\']]\n    \n    # Start merging and assembling\n    big_case_ids = cases[cases.CASE_ID.isin(BIG_CASES)]\n    raw2 = pd.merge(big_case_ids, geo_info, on = "CASE_ID", how = "inner")\n    raw3 = pd.merge(raw2, la_prop, on = "PROP_ID", how  = "inner", validate = "m:1")\n\n    # Save only the parcel geom for these big cases...otherwise, pa

In [5]:
# Read in parcel_geom (
# local because geoparquets need to be dowloaded locally; we saved to S3)
big_case_parcels = gpd.read_parquet('../gis/big_case_parcels.parquet')

big_case_parcels = big_case_parcels[['AIN', 'num_AIN', 'geometry']]

In [6]:
big_case_parcels.head()

Unnamed: 0,AIN,num_AIN,geometry
0,4224033002,1,"POLYGON ((6423873.005 1817761.017, 6423854.765..."
1,4224033003,1,"POLYGON ((6423925.405 1817744.217, 6423907.015..."
2,4224033004,1,"POLYGON ((6423973.135 1817728.897, 6423954.675..."
3,4224033005,1,"POLYGON ((6424020.565 1817713.677, 6424002.395..."
4,4224033006,1,"POLYGON ((6424068.295 1817698.367, 6424049.845..."


## Raw PCTS

In [7]:
'''
import ibis
# Download S3 locally
con = ibis.sqlite.connect("../data/PCTS.sqlite")

cases = con.table("tCASE")
app = con.table("tAPLC")
geo_info = con.table("tPROP_GEO_INFO")
la_prop = con.table("tLA_PROP")

#table[table.PROP_ID==190670].execute()
'''

'\nimport ibis\n# Download S3 locally\ncon = ibis.sqlite.connect("../data/PCTS.sqlite")\n\ncases = con.table("tCASE")\napp = con.table("tAPLC")\ngeo_info = con.table("tPROP_GEO_INFO")\nla_prop = con.table("tLA_PROP")\n\n#table[table.PROP_ID==190670].execute()\n'

In [8]:
cases = pd.read_parquet('../data/tCASE.parquet')
app = pd.read_parquet('../data/tAPLC.parquet')
geo_info = pd.read_parquet('../data/tPROP_GEO_INFO.parquet')
la_prop = pd.read_parquet('../data/tLA_PROP.parquet')

cases = cases[['CASE_ID', 'APLC_ID', 'CASE_NBR', 'CASE_FILE_RCV_DT']]
app = app[['APLC_ID', 'PROJ_DESC_TXT']]
geo_info = geo_info[["CASE_ID", "PROP_ID"]]
la_prop = la_prop[la_prop.ASSR_PRCL_NBR.notna()][['PROP_ID', 'ASSR_PRCL_NBR']]

### 1. Subset case table to these 4 big cases
* 4 CASE_IDs (expected)
* 3 APLC_IDs

CASE_IDs and APLC_IDs don't always match.
<br> 
The Venice case shows that it was the same application, but were assigned 2 different CASE_IDs, each of which is a parent case.

In [9]:
# Subset case table to just these 4 big cases
big_case_ids = cases[cases.CASE_ID.isin(BIG_CASES)]

# Merge in project description
raw1 = pd.merge(big_case_ids, app, 
         on = "APLC_ID", how = "left", validate = "m:1")

display(raw1)
print(f'# unique CASE_IDs: {raw1.CASE_ID.nunique()}')
print(f'# unique APLC_IDs: {raw1.APLC_ID.nunique()}')

Unnamed: 0,CASE_ID,APLC_ID,CASE_NBR,CASE_FILE_RCV_DT,PROJ_DESC_TXT
0,188939.0,103289.0,DIR-2012-2817-VSO-MEL,2012-10-12,VSO - DEMO (E) SFD; CONSTRUCT NEW 3-STORY SFD ...
1,190670.0,103289.0,DIR-2013-684-VSO,2013-03-12,VSO - DEMO (E) SFD; CONSTRUCT NEW 3-STORY SFD ...
2,187855.0,114989.0,DIR-2012-1857-DI,2012-07-12,HOLLYWOOD SUD
3,204512.0,170796.0,ZA-2015-3539-CUW,2015-09-28,CONDITIONAL USE FOR THE INSTALLATION OF A VERI...


# unique CASE_IDs: 4
# unique APLC_IDs: 3


### 2. Bring in PROP_ID
case and geo_info tables both have `CASE_ID`
<br>
Getting `PROP_ID` is an intermediate step to getting `AIN`

* 4 CASE_IDs
* 11,099 unique PROP_IDs

What is `PROP_ID`? It doesn't appear to be unique to an AIN. Is it unique to the AIN-CASE_NBR? Every time a new case is filed, those parcels get assigned a new `PROP_ID`?

In [10]:
# Merge in geo_info table
raw2 = pd.merge(raw1, geo_info, on = "CASE_ID", how = "inner")
print(f'# unique CASE_IDs: {raw2.CASE_ID.nunique()}')
print(f'# unique PROP_IDs: {raw2.PROP_ID.nunique()}')

# unique CASE_IDs: 4
# unique PROP_IDs: 11099


### 3. Bring in AIN
* 4 CASE_IDs 
* 11,053 unique PROP_IDs (but number fell by 44)
* 8,076 unique AINs

In [11]:
raw3 = (pd.merge(raw2, la_prop, 
                 on = "PROP_ID", how  = "inner", validate = "m:1")
        .rename(columns = {'ASSR_PRCL_NBR': 'AIN'})
       )


print(f'# unique CASE_IDs: {raw3.CASE_ID.nunique()}')
print(f'# unique PROP_IDs: {raw3.PROP_ID.nunique()}')
print(f'# unique AINs: {raw3.AIN.nunique()}')

# unique CASE_IDs: 4
# unique PROP_IDs: 11053
# unique AINs: 8076


In [12]:
# Merge in geometry
raw4 = pd.merge(big_case_parcels, raw3,
                on = "AIN", how = "inner", validate = "1:m")

### What are these PROP_IDs
Looks like PROP_IDs are uniquely assigned when CASE_ID or APLC_ID comes in.

In [13]:
la_prop.ASSR_PRCL_NBR.value_counts()

6046019930    305
4431015BRK    195
4129027902    191
5145018006    189
7440026903    171
             ... 
5510001020      1
2408018014      1
2228014004      1
4412029901      1
2261021054      1
Name: ASSR_PRCL_NBR, Length: 364273, dtype: int64

In [14]:
lots_of_prop_ids = ["6046019930", "4431015BRK", "4129027902"]

m1 = pd.merge(cases, app, on = "APLC_ID", how = "left", validate = "m:1")
m2 = pd.merge(m1, geo_info, on = "CASE_ID", how = "inner")
m3 = pd.merge(m2, la_prop[la_prop.ASSR_PRCL_NBR.isin(lots_of_prop_ids)], 
              on = "PROP_ID", how  = "inner", validate = "m:1")

print(f'# unique APLC_IDs: {m3.APLC_ID.nunique()}')
print(f'# unique CASE_IDs: {m3.CASE_ID.nunique()}')
print(f'# unique PROP_IDs: {m3.PROP_ID.nunique()}')
print(f'# unique AIN: {m3.ASSR_PRCL_NBR.nunique()}')

# unique APLC_IDs: 24
# unique CASE_IDs: 96
# unique PROP_IDs: 691
# unique AIN: 3


In [15]:
example_parcel = "6046019930"

display(la_prop[la_prop.ASSR_PRCL_NBR==example_parcel].head())
print(f'# unique AIN: {la_prop[la_prop.ASSR_PRCL_NBR==example_parcel].ASSR_PRCL_NBR.nunique()}')
print(f'# unique PROP_ID: {la_prop[la_prop.ASSR_PRCL_NBR==example_parcel].PROP_ID.nunique()}')

Unnamed: 0,PROP_ID,ASSR_PRCL_NBR
365622,59326709.0,6046019930
365774,59326706.0,6046019930
365775,59326707.0,6046019930
365776,59326708.0,6046019930
365777,59326710.0,6046019930


# unique AIN: 1
# unique PROP_ID: 305


For our outlier cases, AINs don't appear multiple times.

In [16]:
example_parcel = "4224033002"
display(la_prop[la_prop.ASSR_PRCL_NBR==example_parcel].head())
print(f'# unique AIN: {la_prop[la_prop.ASSR_PRCL_NBR==example_parcel].ASSR_PRCL_NBR.nunique()}')
print(f'# unique PROP_ID: {la_prop[la_prop.ASSR_PRCL_NBR==example_parcel].PROP_ID.nunique()}')

Unnamed: 0,PROP_ID,ASSR_PRCL_NBR
87845,59043273.0,4224033002


# unique AIN: 1
# unique PROP_ID: 1


## Map

In [17]:
col_order = ['CASE_ID', 'CASE_NBR', 'PROJ_DESC_TXT', 
             'AIN', 'PROP_ID', 'geometry']

to_plot = (raw4.assign(
                CASE_ID = raw4.CASE_ID.astype(int).astype(str)
            ).dissolve(
            by=['CASE_ID', 'PROJ_DESC_TXT'], 
            aggfunc = {'AIN': 'nunique', 
                       'PROP_ID': 'nunique',
                      'CASE_NBR': 'first'}
            ).reset_index()
           .to_crs("EPSG:4326")
          )[col_order]

In [18]:
import ipyleaflet
from ipyleaflet import Map, GeoData, LayersControl, basemaps, WidgetControl
from ipywidgets import link, FloatSlider, Text, HTML
from ipywidgets.embed import embed_minimal_html
import branca.colormap
import json

geo_data = json.loads(to_plot.set_index('CASE_ID').to_json())

# Take what we want to map and turn it into a dictionary
# Can only include the key-value pair, the value you want to map, nothing more.
choro_df = dict(zip(to_plot.CASE_ID.tolist(), to_plot.AIN.tolist()))

In [19]:
m = ipyleaflet.Map(center = (34.0536, -118.2427), zoom = 11,
                  basemap = basemaps.CartoDB.Positron)

layer = ipyleaflet.Choropleth(
    geo_data = geo_data,
    choro_data = choro_df, 
    colormap = branca.colormap.linear.RdPu_03,
    border_color = '#999999',
    value_min = 0, 
    value_max = 0, 
    style = {'fillOpacity': 0.9, 'weight': 0.5, 'color': '#999999', 'opacity': 0.8},
    name = 'Cases'
)

html = HTML(''' 
    Hover over a tract
''')

html.layout.margin = '0 px 10px 10px 10px'

def click_handler(event = None, id = None, properties = None):
    label.value = properties['PROJ_DESC_TXT']

def update_html_tract(feature, id, **kwargs): 
    html.value = '''
        CASE_ID:  
        <b>{}</b> <br>
        # unique AIN
        {} 
    '''.format(id, feature['properties']['AIN'])    
    
layer.on_hover(update_html_tract)

m.add_layer(layer)

control = ipyleaflet.WidgetControl(widget = html, position = 'topright')
layers_control = ipyleaflet.LayersControl(position = 'topright')

m.add_control(control)
m.add_control(layers_control)

m.layout.height = '100%'
m.layout.min_height = '400px'
m

Map(center=[34.0536, -118.2427], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', '…

In [20]:
(to_plot.drop(columns = 'geometry')
 .rename(columns = {
     "PROJ_DESC_TXT": "Description",
     "AIN": "# unique AIN",
     "PROP_ID": "# unique PROP_ID"
     })
 .style
)

Unnamed: 0,CASE_ID,CASE_NBR,Description,# unique AIN,# unique PROP_ID
0,187855,DIR-2012-1857-DI,HOLLYWOOD SUD,750,1515
1,188939,DIR-2012-2817-VSO-MEL,VSO - DEMO (E) SFD; CONSTRUCT NEW 3-STORY SFD + 2 UNCOVERED PKG,6297,7650
2,190670,DIR-2013-684-VSO,VSO - DEMO (E) SFD; CONSTRUCT NEW 3-STORY SFD + 2 UNCOVERED PKG,6297,7650
3,204512,ZA-2015-3539-CUW,"CONDITIONAL USE FOR THE INSTALLATION OF A VERIZON WIRELESS TELECOMMUNICATIONS FACILITY CONSISTING OF: 12 ANTENNAS, 12 A2 UNITS, 12 RRUS AND 12 RAYCAPS. ALSO 2 CABINETS AND 1 BACKUP GENERATOR TO BE LOCATED ADJACENT TO THE MONOPOLE.",55,80
