In [1]:
import sys
import getpass
import re
from functools import partial
from tqdm import tqdm
import uuid

user = getpass.getuser()
sys.dont_write_bytecode = True

sys.path.insert(0, '/Users/{}/Box/DataViz Projects/Utility Code'.format(user))

from utils_io import *  # initial imports
reload_module('utils_io')  # reload after update to utils_io.py
from utils_io import *  # reload reflects updates

#geo imports
import geopandas as gp
import folium
from geopandas.tools import geocode
from geopy.geocoders import GoogleV3
from geopy.extra.rate_limiter import RateLimiter
from geopy import Point

In [2]:
google_api_key = G_CREDS['dataviz_team']

In [5]:
apr_id = 'briv-ikjp'

## Pull Geocoded Data and Run Through Geocoder w/ updates

In [119]:
apr = pull_df_from_socrata('briv-ikjp')

pulling data in 3 chunks of 14000 rows each
pulling chunk 0
pulling chunk 1
pulling chunk 2
took 6.4961 seconds


In [116]:
apr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30864 entries, 0 to 30863
Data columns (total 74 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   mtc_id                       30864 non-null  object 
 1   mtc_type                     30667 non-null  object 
 2   mtc_year                     30864 non-null  float64
 3   mtc_vlow_income_dr           254 non-null    float64
 4   mtc_vlow_income_ndr          371 non-null    float64
 5   mtc_vlow_tot                 30864 non-null  float64
 6   mtc_low_income_dr            388 non-null    float64
 7   mtc_low_income_ndr           821 non-null    float64
 8   mtc_low_tot                  30864 non-null  float64
 9   mtc_mod_income_dr            466 non-null    float64
 10  mtc_mod_income_ndr           3269 non-null   float64
 11  mtc_mod_tot                  30864 non-null  float64
 12  mtc_above_mod_income         25870 non-null  float64
 13  mtc_total_units 

In [125]:
columns = ['mtc_id',
           'mtc_address_full',
           'mtc_geocode_type']
apr_notgeo = apr[columns][apr['mtc_geocode_type'] == 'NOT_GEOCODED']

In [126]:
#Create a geo locator
g = GoogleV3(api_key=google_api_key)

In [127]:
#Use a rate limiter
bound_box = [Point(38.864245,-121.208156), Point(36.893329,-123.632497)]
geocode = RateLimiter(g.geocode, 
                      min_delay_seconds=.025, 
                      error_wait_seconds=1)

In [129]:
#Create a location column with geolocator and time the process 
tqdm.pandas()
bound_box = [Point(38.864245,-121.208156), Point(36.893329,-123.632497)]
apr_notgeo['location'] = (apr_notgeo['mtc_address_full']
                            .progress_apply(partial(geocode,
                                                    components={"country": "USA"},
                                                    timeout=1000,
                                                    bounds=bound_box)))


100%|██████████| 527/527 [05:39<00:00,  1.55it/s]


In [137]:
#Set geocode location type
apr_notgeo['mtc_geocode_type'] = apr_notgeo['location'].map(lambda loc: loc.raw['geometry']['location_type'] 
                                                                if loc else 'NOT_GEOCODED')

In [138]:
#Set latitude
apr_notgeo['mtc_lat'] = (apr_notgeo['location']
                                        .map(lambda loc: loc.latitude if loc else None))
#Set longitude 
apr_notgeo['mtc_long'] = (apr_notgeo['location']
                                        .map(lambda loc: loc.longitude if loc else None))

In [139]:
#Set geocode address 
apr_notgeo['mtc_geocode_address'] = (apr_notgeo['location']
                                      .map(lambda loc: loc.address if loc else None))

In [140]:
#Create geodataframe from lat long coords
apr_notgeo_geo = gp.GeoDataFrame(apr_notgeo,
                                  geometry=gp.points_from_xy(apr_notgeo['mtc_long'],apr_notgeo['mtc_lat']),
                                  crs="EPSG:4326")

In [147]:
#Set geometry field to null if no lat long
apr_notgeo_geo['geometry'] = np.where(apr_notgeo_geo['mtc_geocode_type']  == 'NOT_GEOCODED',
                                        np.nan,
                                        apr_notgeo_geo['geometry'])

In [148]:
#Set wkt column
apr_notgeo_geo['mtc_wkt'] = apr_notgeo_geo['geometry'].apply(lambda x: np.nan if x is None else x.wkt)

In [149]:
#Create checkbox column to indicate whether or not the project had been mapped
apr_notgeo_geo['mtc_mapped'] = np.where(apr_notgeo_geo['mtc_geocode_type'] == 'NOT_GEOCODED', False,True)

In [152]:
apr_geocode_updated_geo = apr_notgeo_geo[apr_notgeo_geo['mtc_mapped'] == True]

In [153]:
apr_geocode_updated_geo.shape

(68, 10)

In [154]:
final_columns = ['mtc_id',
                 'mtc_address_full',
                 'mtc_geocode_address',
                 'mtc_geocode_type',
                 'mtc_mapped',
                 'mtc_lat',
                 'mtc_long',
                 'mtc_wkt',
                 'mtc_mapped']

In [155]:
upsert_df_socrata(apr_geocode_updated_geo[final_columns],'briv-ikjp')

upserting data to Socrata in 1 chunks of 68 rows each


  upsert_data_raw = df.to_dict(orient='records')


upserted chunk 0
data upserted to briv-ikjp
took 1.7823 seconds


## Match un-geocoded projects by APN

-[ParcelAtlas v24 (BASIS)](https://data.bayareametro.gov/Cadastral/ParcelAtlas-v24-BASIS-/8hp3-7hq6)

In [3]:
parcels_id = '8hp3-7hq6'

In [4]:
#Pull Sonoma County Parcels from Socrata
query_args = {'where': "county = 'Sonoma'"}
parcels_sonoma = pull_df_from_socrata(parcels_id,query_args=query_args)

pulling data in 9 chunks of 22000 rows each
pulling chunk 0
pulling chunk 1
pulling chunk 2
pulling chunk 3
pulling chunk 4
pulling chunk 5
pulling chunk 6
pulling chunk 7
pulling chunk 8
took 55.6488 seconds


In [10]:
#Pull un-geocoded sonoma projects from Socrata
query_args = {'where': "cnty_name = 'SONOMA' and mtc_mapped = False"}

In [12]:
sonoma_projects = pull_df_from_socrata(apr_id,query_args=query_args)

pulling data in 1 chunks of 230 rows each
pulling chunk 0
took 2.2991 seconds


In [17]:
sonoma_projects[['mtc_id','apn','mtc_address_full','street_address','mtc_wkt']].sample(n=20)

Unnamed: 0,mtc_id,apn,mtc_address_full,street_address,mtc_wkt
60,f440b510-c0ec-4ca5-a918-2d6014b0f1d2,146120045,,,
227,901b6bea-089a-4064-ace7-5637fd425854,157020009,,,
88,2273b832-7d0c-4efd-9a5f-b77ceffcf526,37240025,,,
216,9c13f165-a5e8-452b-8157-5ae7486eead0,173670001,,,
14,d50b09bd-d18d-423c-a2c2-7796b27bbc86,183440025,,,
161,bb9ce4ba-7f44-4c64-9945-613049a39991,089-300-028,"1080 TRENTADUE WY, HEALDSBURG, CA , USA",1080 TRENTADUE WY,
106,56707968-e418-493a-8bf4-5880fd38d645,36870007,,,
205,76421449-b0c6-4b65-b237-110458a9c12e,182600007,,,
177,ca8dd64b-9b25-4755-879a-5d11aec0958a,089-300-011,"1010 TRENTADUE WY, HEALDSBURG, CA , USA",1010 TRENTADUE WY,
49,dfa4745a-2419-4663-a092-99fbefcba732,153590024,,,


In [16]:
parcels_sonoma.head(5)

Unnamed: 0,objectid,apn,apn2,state,county,fips,sit_hse_nu,sit_dir,sit_str_na,sit_str_sf,...,ycoord,minx,miny,maxx,maxy,attdate,version,quantarium,gpid,geometry
0,5967917.0,001-011-004,1011004000,CA,Sonoma,6097,721,N,CLOVERDALE,BLVD,...,38.813,-123.023715,38.812709,-123.022919,38.813278,20190815,5/8/2015,21082694,-123.02332:+38.81299:0001,"{'type': 'MultiPolygon', 'coordinates': [[[[-1..."
1,5967918.0,001-011-005,1011005000,CA,Sonoma,6097,611,N,CLOVERDALE,BLVD,...,38.8127,-123.023579,38.812456,-123.022752,38.812988,20190815,5/8/2015,21082695,-123.02317:+38.81272:0001,"{'type': 'MultiPolygon', 'coordinates': [[[[-1..."
2,5967919.0,001-011-015,1011015000,CA,Sonoma,6097,146,,NORTH,ST,...,38.8118,-123.026706,38.811331,-123.024732,38.812201,20190815,5/8/2015,21082696,-123.02572:+38.81177:0001,"{'type': 'MultiPolygon', 'coordinates': [[[[-1..."
3,5967920.0,001-011-018,1011018000,CA,Sonoma,6097,500,,VENEZIA,WAY,...,38.8131,-123.024391,38.812914,-123.023752,38.813269,20190815,5/8/2015,21082697,-123.02407:+38.81309:0002,"{'type': 'MultiPolygon', 'coordinates': [[[[-1..."
4,5967921.0,001-011-019,1011019000,CA,Sonoma,6097,500,,VENEZIA,WAY,...,38.8131,-123.023996,38.812935,-123.023461,38.81329,20190815,5/8/2015,21082698,-123.02373:+38.81311:0001,"{'type': 'MultiPolygon', 'coordinates': [[[[-1..."


## Format APNs to match Sonoma County Pattern

In [None]:
df.loc[df['Type'] == 'PsychicFairy', ['HP']] = 42

In [99]:
#Subset apns that match the county pattern
regex_ctny_pattern = r'\d{3}-\d{3}-\d{3}'
sonoma_apn_reg = sonoma_projects[['apn']][sonoma_projects['apn'].str.contains(regex_ctny_pattern,na=False)]

In [100]:
#Create a column with formatted apn, setting it to the first correctly mattched pattern 
#if multiple apns listed in column
regex_ctny_extract_pattern = r'(^\d{3}-\d{3}-\d{3})'
sonoma_apn_reg['apn_fmt'] = sonoma_apn_reg['apn'].str.extract(regex_ctny_extract_pattern)

In [102]:
#update the main dataframe with updated values
sonoma_projects['apn_fmt'].update(sonoma_apn_reg['apn_fmt'])

In [103]:
sonoma_projects[['mtc_id','apn','apn_fmt']][sonoma_projects['apn_fmt'].notnull()].sample(n=15)

Unnamed: 0,mtc_id,apn,apn_fmt
176,55a05b53-5699-4147-8886-68ba1bf98afe,089-300-012,089-300-012
197,4cd36049-50f9-459e-88d0-df6ca7d4a5af,144-450-002,144-450-002
159,b307c22b-e9a3-4e3b-8f05-2ecc9c5e2b11,089-300-030,089-300-030
185,b226381b-4340-4b37-a22f-6259899eb600,163-320-016,163-320-016
158,3f13ba93-a2a8-4ca0-9e9a-a4ebf42f2750,091-330-008,091-330-008
160,2c789932-812c-4ef7-982c-1cae78c80abe,089-300-029,089-300-029
155,6a406148-2bd9-4949-bce1-9f889116c7cb,091-330-037,091-330-037
190,09d51fb6-cf58-47a7-86fd-f5423d3090eb,017-080-006,017-080-006
162,76c5eefc-3100-41a4-942a-f67b3cae6a3b,089-300-027,089-300-027
189,caba53e5-df90-4d9a-aa0d-ae54b494b124,058-361-042,058-361-042


In [104]:
#Select apns of length 8
#These apns need to be pre-pended with a leading 0 and seperated by dashes every 3 digits
sonoma_apn_eight_dig = sonoma_projects[['apn']].loc[sonoma_projects['apn'].str.len() == 8]

In [105]:
sonoma_apn_eight_dig

Unnamed: 0,apn
7,44460001
8,44460001
9,44460001
10,38172028
11,36270012
...,...
149,10522028
150,10522027
151,10510028
195,37240050


In [106]:
sonoma_apn_eight_dig['apn_fmt'] = '0' + sonoma_apn_eight_dig['apn']

In [107]:
sonoma_apn_eight_dig['apn_fmt'] = (sonoma_apn_eight_dig['apn_fmt'].str[0:3] + 
                                 '-' +
                                 sonoma_apn_eight_dig['apn_fmt'].str[3:6] +
                                 '-' +
                                 sonoma_apn_eight_dig['apn_fmt'].str[6:])

In [108]:
#Update original dataframe apn_fmt column with correctly formatted apn
sonoma_projects['apn_fmt'].update(sonoma_apn_eight_dig['apn_fmt'])

In [111]:
sonoma_projects[['mtc_id','apn','apn_fmt']][sonoma_projects['apn'].str.len() == 8].sample(n=15)

Unnamed: 0,mtc_id,apn,apn_fmt
91,3ad101ee-0960-44ea-adc3-57124b66298e,37240021,037-240-021
67,298b6199-cd87-4e14-bad1-6d75c9070628,59420004,059-420-004
116,bd3098cd-1af3-417a-b0ac-72f392e25b66,34480058,034-480-058
78,fe6f4339-954d-46e3-b974-80ae0cd2e284,41161030,041-161-030
120,bde5802e-171c-49b6-8da0-23de2f20061c,34041009,034-041-009
73,bb80fc06-c52d-4b92-a0f5-e88f19afe565,44051008,044-051-008
85,28a60642-4b79-45e9-9e93-b28202399c04,37240028,037-240-028
119,8ef89b6e-cabf-431a-b576-0be8b59b9fd8,34041019,034-041-019
104,8da2f0d0-6c78-43de-b0a0-a80c88fcdae8,36870009,036-870-009
7,621bf043-48eb-4006-881e-25a7fd231e5a,44460001,044-460-001


In [114]:
sonoma_apn_nine_dig = sonoma_projects[['mtc_id','apn','apn_fmt']][sonoma_projects['apn'].str.len() == 9]

In [117]:
sonoma_apn_nine_dig['apn_fmt'] = (sonoma_apn_nine_dig['apn'].str[0:3] + 
                                 '-' +
                                 sonoma_apn_nine_dig['apn'].str[3:6] +
                                 '-' +
                                 sonoma_apn_nine_dig['apn'].str[6:])

In [118]:
sonoma_apn_nine_dig

Unnamed: 0,mtc_id,apn,apn_fmt
12,71be63e5-5f49-40ee-a272-ab68b8d1922d,183440026,183-440-026
13,0f384ee8-0043-42ca-8c59-66dad89f3900,183440026,183-440-026
14,d50b09bd-d18d-423c-a2c2-7796b27bbc86,183440025,183-440-025
15,5cc9cc09-4abc-4da1-9a34-5f87926fb86f,183440002,183-440-002
16,b70e4100-0c9f-49cd-addd-d863b004bd79,182600031,182-600-031
...,...,...,...
225,200449b5-a06b-4e2c-9650-b3372cdc1cf9,157020009,157-020-009
226,3e0ed55b-ec2e-4ac9-b8e7-49d6752656aa,157020009,157-020-009
227,901b6bea-089a-4064-ace7-5637fd425854,157020009,157-020-009
228,e6c38fdd-21db-416b-9c21-f7a5951cc719,157020009,157-020-009


In [120]:
#Update original dataframe apn_fmt column with correctly formatted apn
sonoma_projects['apn_fmt'].update(sonoma_apn_nine_dig['apn_fmt'])

In [121]:
sonoma_projects[['mtc_id','apn','apn_fmt']][sonoma_projects['apn'].str.len() == 9].sample(n=15)

Unnamed: 0,mtc_id,apn,apn_fmt
207,01bfd502-0df8-46d1-9acc-cf2e4b999564,182600005,182-600-005
13,0f384ee8-0043-42ca-8c59-66dad89f3900,183440026,183-440-026
26,bc9ef309-4b7c-4663-ad28-3560cd056bcc,157020009,157-020-009
54,4281dbf5-b0c1-4c94-9a65-29f792e22e46,153590012,153-590-012
41,66c95cec-6e50-459d-a4b8-846388ecc15b,153600006,153-600-006
25,48220ed5-6661-4d1e-9e38-75dcd1608bad,157020009,157-020-009
33,963da6d9-2b3b-4632-8a3e-0977093ea327,153610003,153-610-003
208,e69d4d21-0cad-40f7-a1eb-33e704ad45a2,182600004,182-600-004
42,bc0d677e-d7f2-44bd-bf7a-5c0cab177b92,153600005,153-600-005
221,6d2e8957-47b3-4dd3-b8f8-c1ed4490570b,157020009,157-020-009


In [123]:
sonoma_projects[['mtc_id','apn','apn_fmt']][sonoma_projects['apn_fmt'].isnull()]

Unnamed: 0,mtc_id,apn,apn_fmt
152,08b438af-e532-4daf-a12d-733333489e4f,9242045,
153,6fe061b1-e520-4894-9934-f8b9ad91a120,9152012,
154,1d9700dc-7fc4-4359-98ca-0eb2eb82a10b,7610029,
194,1abd229e-71b4-4105-b7b4-8918cbea083c,125111037 and 125101031,


In [126]:
#Subset projects with 7 digits 
sonoma_apn_seven_dig = sonoma_projects[['apn']][sonoma_projects['apn'].str.len() == 7]

In [127]:
#Prepend with two zeros
sonoma_apn_seven_dig['apn_fmt'] = '00' + sonoma_apn_seven_dig['apn']

In [128]:
sonoma_apn_seven_dig

Unnamed: 0,apn,apn_fmt
152,9242045,9242045
153,9152012,9152012
154,7610029,7610029


In [129]:
sonoma_apn_seven_dig['apn_fmt'] = (sonoma_apn_seven_dig['apn_fmt'].str[0:3] + 
                                 '-' +
                                 sonoma_apn_seven_dig['apn_fmt'].str[3:6] +
                                 '-' +
                                 sonoma_apn_seven_dig['apn_fmt'].str[6:])

In [130]:
sonoma_apn_seven_dig

Unnamed: 0,apn,apn_fmt
152,9242045,009-242-045
153,9152012,009-152-012
154,7610029,007-610-029


In [131]:
#Update original dataframe with updated apns
sonoma_projects['apn_fmt'].update(sonoma_apn_seven_dig['apn_fmt'])

In [133]:
sonoma_projects[['apn','apn_fmt']][sonoma_projects['apn_fmt'].isnull()]

Unnamed: 0,apn,apn_fmt
194,125111037 and 125101031,


In [137]:
sonoma_projects.loc[194,'apn_fmt'] = '125-111-037'

In [140]:
sonoma_projects[['apn','apn_fmt']][sonoma_projects['apn_fmt'].isnull()]

Unnamed: 0,apn,apn_fmt


## Join to parcels by apn

In [141]:
#Subset projects to required fields
sonoma_projects_sub = sonoma_projects[['mtc_id','apn','apn_fmt']].copy()

In [142]:
sonoma_projects_parcels = pd.merge(sonoma_projects_sub,
                                  parcels_sonoma,
                                  left_on ='apn_fmt',
                                  right_on='apn',
                                  how='left',
                                  indicator=True)

In [147]:
sonoma_projects_parcels[sonoma_projects_parcels['_merge'] == 'both'].shape

(186, 50)

In [148]:
sonoma_projects.shape

(230, 75)

In [149]:
sonoma_projects_parcels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 241 entries, 0 to 240
Data columns (total 50 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   mtc_id      241 non-null    object  
 1   apn_x       241 non-null    object  
 2   apn_fmt     241 non-null    object  
 3   objectid    186 non-null    float64 
 4   apn_y       186 non-null    object  
 5   apn2        186 non-null    object  
 6   state       186 non-null    object  
 7   county      186 non-null    object  
 8   fips        186 non-null    object  
 9   sit_hse_nu  101 non-null    object  
 10  sit_dir     2 non-null      object  
 11  sit_str_na  107 non-null    object  
 12  sit_str_sf  107 non-null    object  
 13  sit_full_s  107 non-null    object  
 14  sit_city    108 non-null    object  
 15  sit_state   108 non-null    object  
 16  sit_zip     108 non-null    object  
 17  sit_zip4    101 non-null    object  
 18  land_value  186 non-null    float64 
 19  impr_val

In [151]:
sonoma_projects_parcels_sub = sonoma_projects_parcels[['mtc_id',
                                                       'apn_fmt',
                                                       'xcoord',
                                                       'ycoord',
                                                       '_merge']][sonoma_projects_parcels['_merge'] == 'both'].copy()

In [152]:
sonoma_projects_parcels_sub

Unnamed: 0,mtc_id,apn_fmt,xcoord,ycoord,_merge
0,9091b6ab-b00d-47eb-bcb2-483dff3c6224,163-320-023,-122.789,38.5299,both
1,9bb8caca-1ff1-40b0-b75e-48b96a99617d,163-320-022,-122.789,38.5298,both
2,0df968b2-af3f-42b5-a89b-1986dcb4d7c8,066-060-004,-122.821,38.5507,both
3,18fb91bb-0c32-4714-9155-f31a9bb7107d,073-100-070,-122.928,38.3809,both
4,47687333-d1d6-45e3-b268-4af6463d2f6c,056-201-101,-122.487,38.3209,both
...,...,...,...,...,...
225,81136139-3cf4-4152-b477-61b2a51b9d00,173-670-001,-122.722,38.4916,both
226,e3d8f002-924d-40b1-b009-dca2d87f40ef,173-670-001,-122.722,38.4916,both
227,9c13f165-a5e8-452b-8157-5ae7486eead0,173-670-001,-122.722,38.4916,both
228,11706618-93fb-41ed-bf3a-2b5af808d107,173-670-001,-122.722,38.4916,both


In [153]:
#Create geodataframe from lat long coords
sonoma_geo = gp.GeoDataFrame(sonoma_projects_parcels_sub,
                                  geometry=gp.points_from_xy(sonoma_projects_parcels_sub['xcoord'],
                                                             sonoma_projects_parcels_sub['ycoord']),
                                  crs="EPSG:4326")

In [155]:
sonoma_geo.head(5)

Unnamed: 0,mtc_id,apn_fmt,xcoord,ycoord,_merge,geometry
0,9091b6ab-b00d-47eb-bcb2-483dff3c6224,163-320-023,-122.789,38.5299,both,POINT (-122.78900 38.52990)
1,9bb8caca-1ff1-40b0-b75e-48b96a99617d,163-320-022,-122.789,38.5298,both,POINT (-122.78900 38.52980)
2,0df968b2-af3f-42b5-a89b-1986dcb4d7c8,066-060-004,-122.821,38.5507,both,POINT (-122.82100 38.55070)
3,18fb91bb-0c32-4714-9155-f31a9bb7107d,073-100-070,-122.928,38.3809,both,POINT (-122.92800 38.38090)
4,47687333-d1d6-45e3-b268-4af6463d2f6c,056-201-101,-122.487,38.3209,both,POINT (-122.48700 38.32090)


In [157]:
sonoma_geo['mtc_wkt'] = sonoma_geo['geometry'].map(lambda x: x.wkt)

In [159]:
#Rename columns to match final schema
sonoma_geo.rename(columns={'xcoord':'mtc_long','ycoord':'mtc_lat'},inplace=True)

In [161]:
#Add mtc_mapped column and set to True
sonoma_geo['mtc_mapped'] = True

In [None]:
#Add mtc_geocode_type and set to APN_MATCH
sonoma