## Test out new PCTS 

In [1]:
import pandas as pd
import laplan

In [2]:
pcts = pd.read_parquet('s3://city-planning-entitlements/data/final/pcts.parquet')

# List of tracts
tracts = pd.read_parquet(
    "s3://city-planning-entitlements/data/crosswalk_parcels_tracts_lacounty.parquet")[
    ["GEOID"]].drop_duplicates().reset_index(drop=True)

# List of unique AINs
parcels = pd.read_parquet(
    "s3://city-planning-entitlements/data/crosswalk_parcels_tracts_lacounty.parquet")[
    ["AIN", "GEOID"]].drop_duplicates().reset_index(drop=True)

### A. Check if there are cases that have AIN but not GEOID or vice versa
Since GEOID is derived from CENSUS_TRACT, we shouldn't have the case where one is missing and the other isn't.

In [3]:
print(f"AIN is na, CENSUS_TRACT not na: {len(pcts[(pcts.AIN.isna()) & (pcts.CENSUS_TRACT.notna())])}")
print(f"GEOID is na, AIN not na: {len(pcts[(pcts.GEOID.isna()) & (pcts.AIN.notna())])}")

AIN is na, CENSUS_TRACT not na: 18418
GEOID is na, AIN not na: 1608


In [4]:
cols = ["CASE_ID", "AIN", "CENSUS_TRACT", "GEOID"]
pcts[(pcts.AIN.isna()) & (pcts.CENSUS_TRACT.notna())][cols].head()

Unnamed: 0,CASE_ID,AIN,CENSUS_TRACT,GEOID
1411,180442,,1193.0,6037119300
1553,181301,,2674.02,6037267402
3402,183544,,2218.1,6037221810
5254,4326,,2035.0,6037203500
14041,38649,,2074.0,6037207400


In [5]:
pcts[(pcts.GEOID.isna()) & (pcts.AIN.notna())][cols].head()

Unnamed: 0,CASE_ID,AIN,CENSUS_TRACT,GEOID
8125,18499,2526023916,,
61144,28929,5555011038,,
92101,94024,4226015BRK,,
92102,94025,4226015BRK,,
102369,99028,4228001003,,


### B. Check the cases where AIN is na but GEOID isn't
These are cases that seem to have problems in themselves. How can you be associated with a parcel but not a tract?

In [6]:
c1 = (pcts.AIN.isna())
c2 = (pcts.GEOID.notna())

print(f"No AIN, but have GEOID: {len(pcts[c1 & c2][cols])}")

No AIN, but have GEOID: 18418


In [7]:
pcts[c1 & c2].CENSUS_TRACT.value_counts()

Multiple      2242
9800.09        986
2240.10        910
2260.00        811
2035.00        658
              ... 
1096.030         1
2218.10          1
1200.30          1
2169.00          1
1098.000         1
Name: CENSUS_TRACT, Length: 292, dtype: int64

#### B1. CENSUS_TRACT has errors, which means GEOID is incorrectly generated.
Don't like cleaning these ad-hoc. 
Not sustainable in long-run when we have live connection, because it requires looking through several types of errors, and we aren't limited to the errors we see here. Then it requires testing if the replacements are valid GEOIDs, and if they aren't, investigate why.

In [8]:
pcts1 = pcts.copy()
# Check if there are letters appearing in CENSUS_TRACT, which there shouldn't be
pcts1['has_string'] = pcts1.CENSUS_TRACT.str.contains(r'[a-zA-Z]', na=False)

In [9]:
# These cases need GEOID replaced
pcts1 = pcts1.assign(
    GEOID = pcts1.apply(lambda x: x.GEOID if x.has_string == False else "", axis=1)
)

In [10]:
pcts1[pcts1.has_string==True].CENSUS_TRACT.value_counts()

Multiple      4390
barlow san       4
Name: CENSUS_TRACT, dtype: int64

In [11]:
pcts_unique_GEOID = pcts1[pcts1.GEOID.notna()][["CASE_ID", "GEOID"]].drop_duplicates().reset_index(drop=True)

In [12]:
m1 = pd.merge(pcts_unique_GEOID, tracts, how = "left", on = "GEOID", validate = "m:1", indicator=True)

In [13]:
fix_me = m1[m1._merge=="left_only"]

fix_me = fix_me.assign(
    string_length = fix_me.GEOID.str.len()
)

In [14]:
check_cases = fix_me[~fix_me.string_length.isin([0, 11])]

In [15]:
# These cases don't have AINs...
# but somehow have census tract info, but it's erroneous census tract info
pcts1[pcts1.CASE_ID.isin(check_cases.CASE_ID)].AIN.value_counts()

Series([], Name: AIN, dtype: int64)

In [16]:
check_cases['GEOID2'] = check_cases.GEOID.str[0:11]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
# Check that fixing it this way actually matches with a GEOID we have in our crosswalk
pd.merge(check_cases.drop(columns = "_merge"), 
         tracts, 
         left_on = "GEOID2", right_on = "GEOID", 
         validate = "m:1", indicator = True)

Unnamed: 0,CASE_ID,GEOID_x,string_length,GEOID2,GEOID_y,_merge
0,45582,603721480000,13,6037214800,6037214800,both
1,93477,603721480000,13,6037214800,6037214800,both
2,197413,603721480000,13,6037214800,6037214800,both
3,197688,603721480000,13,6037214800,6037214800,both
4,193170,60371098000,12,6037109800,6037109800,both
5,193170,60371096030,12,6037109603,6037109603,both


### C. Check the cases where AIN is not na but GEOID is na

In [18]:
c3 = (pcts.GEOID.isna())
c4 = (pcts.AIN.notna())

pcts[c3 & c4][cols].head()

Unnamed: 0,CASE_ID,AIN,CENSUS_TRACT,GEOID
8125,18499,2526023916,,
61144,28929,5555011038,,
92101,94024,4226015BRK,,
92102,94025,4226015BRK,,
102369,99028,4228001003,,


Check if there are any AINs in the entire PCTS that do not get matched to a parcel in our crosswalk (exclude those with BRK or weird symbols).

In [19]:
pcts_unique_ain = pcts[c4][cols].drop_duplicates().reset_index(drop=True)

In [20]:
m2 = pd.merge(pcts_unique_ain, parcels, how = "left", on = "AIN", validate = "m:1", indicator=True)

In [21]:
fix_me2 = m2[m2._merge == "left_only"]

find_me = ["BRK", "*", "brk"]

fix_me2 = fix_me2.assign(
    has_brk = fix_me2.AIN.apply(lambda x: True if any(word in x for word in find_me) else False)
)

In [22]:
check_cases2 = fix_me2[fix_me2.has_brk==False]

check_cases2.AIN.drop_duplicates().value_counts()

2044026071    1
2378018041    1
5160019038    1
2565013102    1
2629006010    1
             ..
5546026036    1
4128010922    1
5192018019    1
5166015010    1
2239006043    1
Name: AIN, Length: 789, dtype: int64

In [23]:
check_cases2._merge.value_counts()

left_only     3713
both             0
right_only       0
Name: _merge, dtype: int64

These left_only cases are ones that with valid AINs (no BRK or weird symbols in it...those wouldn't have gotten matched anyway). These come with GEOID in PCTS, but not through our crosswalk. 
Is this because the PCTS case pre-dates 2010? It's more problematic for cases 2010-after, since we have historical AINs from 2006-2019.

In [24]:
tract_in_pcts_not_crosswalk = check_cases2[["CASE_ID", "AIN"]].drop_duplicates().reset_index(drop=True)

pcts2 = pd.merge(pcts,
        tract_in_pcts_not_crosswalk, 
        on = ["CASE_ID", "AIN"], 
        how = "inner", 
        validate = "m:1")

In [25]:
# Quite a bit of cases that are in the relevant time range still
our_time_range = (pcts2.FILE_DATE >= "1/1/2010")
print(f"unique cases 2010-after that have AINs not in our crosswalk: {pcts2[our_time_range].CASE_ID.nunique()}")
print(f"# obs: {len(pcts2[our_time_range])}")
print(f"# unique AINs touched by problem cases: {pcts2[our_time_range].AIN.nunique()}")
print(f"# unique tracts touched by problem cases: {pcts2[our_time_range].GEOID.nunique()}")

unique cases 2010-after that have AINs not in our crosswalk: 760
# obs: 6496
# unique AINs touched by problem cases: 702
# unique tracts touched by problem cases: 331


In [26]:
# These GEOIDs...do they also appear in our crosswalk?
# If they do, why aren't we linking them?
# These GEOIDs do exist in our crosswalk, but these AINs don't...
pd.merge(pcts2[["GEOID"]].drop_duplicates(), 
         tracts, 
         how = "left", 
         on = "GEOID", 
         validate = "1:1", 
         indicator=True)._merge.value_counts()

both          302
left_only      48
right_only      0
Name: _merge, dtype: int64

These AINs don't appear to be malformed, but do not appear in our crosswalk, but are attached to cases 2010-after.
They do store CENSUS_TRACT that is a valid CENSUS_TRACT / GEOID, but they wouldn't appear on valid AINs.

What should be done with cases that are on parcels we don't find, but have a valid tract. Should they be counted?

Or, should only cases with valid AINs and valid tracts be counted?

#### C1. Of cases that have AIN but no census tract info...do these AIN not appear in our crosswalk at all?

In [27]:
t1 = pcts[c3 & c4][cols]
print(f"# obs that have AIN, but no census tract: {len(t1)}")
print(f"# unique cases: {t1.CASE_ID.nunique()}")
print(f"# unique AIN: {t1.AIN.nunique()}")

t1.head()

# obs that have AIN, but no census tract: 1608
# unique cases: 58
# unique AIN: 521


Unnamed: 0,CASE_ID,AIN,CENSUS_TRACT,GEOID
8125,18499,2526023916,,
61144,28929,5555011038,,
92101,94024,4226015BRK,,
92102,94025,4226015BRK,,
102369,99028,4228001003,,


In [28]:
pd.merge(t1[["AIN"]].drop_duplicates(), 
         parcels, 
         on = "AIN", 
         how = "left", 
         validate = "1:1",
         indicator=True
        )._merge.value_counts()

both          459
left_only      62
right_only      0
Name: _merge, dtype: int64

Majority of these do appear in our crosswalk, so we would want to take that AIN and merge it with our crosswalk to get the GEOID.