In [None]:
import fsspec
import geopandas
import geopy
import ibis
import intake
import pandas
import shapely
import tqdm

# S3 location that holds the CPAS database and receives the geocoded outputs.
BUCKET = "s3://hcid-cdbg-project-ita-data"

# Register tqdm with pandas so that `progress_apply` is available below.
tqdm.tqdm.pandas()

## Load the data from CPAS

In [None]:
# Copy the CPAS SQLite database from the bucket into the working directory,
# then open the local copy with ibis.
local_db_path = "cpas.sqlite"
remote_db_path = f"{BUCKET}/cpas.sqlite"

fs = fsspec.filesystem("s3")
fs.download(remote_db_path, local_db_path)
con = ibis.sqlite.connect(local_db_path)

In [None]:
# Open every intake catalog YAML found in the sibling `catalogs` directory.
catalog_glob = "../catalogs/*.yml"
cat = intake.open_catalog(catalog_glob)

In [None]:
# Read the four catalog entries inspected below, in the same order as before.
app_info, app_list, pep, gpr = (
    getattr(cat, entry_name).read()
    for entry_name in (
        "application_info",
        "application_list",
        "pep_info",
        "grant_gpr",
    )
)

## Which columns have addresses in them?

In [None]:
# Show the column names of the application-info data to look for address fields.
display(app_info.columns)

In [None]:
# Show the column names of the PEP data to look for address fields.
display(pep.columns)

In [None]:
# Show the column names of the grant GPR data to look for address fields.
display(gpr.columns)

In [None]:
# List the tables available through the CPAS SQLite connection.
con.list_tables()

In [None]:
# Report every "<table>: <column>" pair whose column name mentions "addr".
for table_name in con.list_tables():
    address_columns = [
        column for column in con.table(table_name).columns if "addr" in column
    ]
    for column in address_columns:
        print(f"{table_name}: {column}")

From the above, we are likely most interested in addresses from `grnt_prjct_loctn`, `grnt_gpr`, and `grnt_pep`.
We probably don't need the address for the grant contact person.

In [None]:
# Materialize the `grnt_pep` table as a DataFrame and show it.
pep_table = con.table("grnt_pep")
grnt_pep = pep_table.execute()
grnt_pep

In [None]:
# ArcGIS geocoding client shared by every lookup in this notebook.
geocoder = geopy.ArcGIS()

# Boundary geometries come from the first row of each LA GeoHub layer.
la_geohub = cat.la_geohub
city_layer = la_geohub.city_boundary.read()
city_boundary = city_layer.iloc[0].geometry
county_layer = la_geohub.county_boundary.read()
county_boundary = county_layer.iloc[0].geometry

def geocode(row):
    """
    Geocode one free-text address with the notebook-level ArcGIS ``geocoder``.

    Parameters
    ----------
    row : str
        The address to look up. Despite the name, this is the single value
        passed in by ``Series.apply`` / ``progress_apply``, not a DataFrame row.

    Returns
    -------
    pandas.Series
        Indexed by ``"address"`` and ``"geometry"``. ``address`` is the
        address string reported by the geocoder and ``geometry`` is a
        ``shapely.geometry.Point`` built as (longitude, latitude). Both
        entries are ``None`` when the input is missing or blank, when no
        match is found, or when the lookup raises.
    """
    no_result = pandas.Series([None, None], index=["address", "geometry"])

    # Missing or blank addresses cannot be geocoded. Skip the network call,
    # and avoid the fallback below resolving a bare " Los Angeles, CA".
    if not isinstance(row, str) or not row.strip():
        return no_result

    try:
        # Try to geocode the address normally
        loc = geocoder.geocode(row)
        pt = shapely.geometry.Point(loc.longitude, loc.latitude) if loc else None
        # If the geocode failed, it's possible that it was a street address
        # but was missing the city/state. Assume Los Angeles and try again.
        # We could consider it a failure if the point is not in LA city, but
        # there are several projects listed in unincorporated LA county,
        # and nearby cities, so we check the county instead
        if pt is None or not county_boundary.contains(pt):
            print(f"Geocode failed, attempting to add city/state to {row}")
            loc = geocoder.geocode(row + " Los Angeles, CA")
            pt = shapely.geometry.Point(loc.longitude, loc.latitude) if loc else None

        return pandas.Series(
            [loc.address if loc else None, pt],
            index=["address", "geometry"],
        )
    except Exception as err:
        # Best effort: one failed lookup should not abort the whole apply.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop
        # a long-running geocoding pass, and the failure is reported.
        print(f"Geocode error for {row!r}: {err}")
        return no_result

# Possibly wrap in a rate-limiter
# from geopy.extra.rate_limiter import RateLimiter
# geocode = RateLimiter(geocode, min_delay_seconds=1)

In [None]:
# Geocode the project location address of every PEP row. The result is joined
# back onto the full `grnt_pep` table before it is written out, so it has to
# cover all rows rather than a small sample.
pep_addr = grnt_pep.pep_proj_loc_addr.progress_apply(geocode)

In [None]:
# Put the geocoded columns next to the PEP table and write the result to the
# bucket as GeoJSON.
pep_geocoded = geopandas.GeoDataFrame(
    pandas.concat([grnt_pep, pep_addr], axis=1)
)
pep_geocoded.to_file(f"{BUCKET}/pep_geocode.geojson", driver="GeoJSON")

In [None]:
def combine_address(row):
    """
    Build a single address string from the split ``addr_*`` fields of a row.

    Parameters
    ----------
    row : pandas.Series or similar
        Must expose ``addr_nbr``, ``addr_frctn``, ``addr_dir``, ``addr_nm``,
        ``addr_sfx``, ``addr_apt``, ``addr_cty``, ``addr_st`` and ``addr_zip``
        as attributes.

    Returns
    -------
    str
        The non-empty parts joined by single spaces, in the order listed
        above. A missing or blank city defaults to "Los Angeles" and a
        missing or blank state defaults to "CA".
    """

    def _clean(value):
        # Missing values may be None, NaN or pandas.NA depending on the column
        # dtype. NaN is truthy, so it has to be tested for explicitly.
        if pandas.isna(value):
            return ""
        # Numeric columns containing NULLs are upcast to float; render
        # 123.0 as "123" rather than "123.0".
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    parts = [
        _clean(row.addr_nbr),
        _clean(row.addr_frctn),
        _clean(row.addr_dir),
        _clean(row.addr_nm),
        _clean(row.addr_sfx),
        _clean(row.addr_apt),
        _clean(row.addr_cty) or "Los Angeles",
        _clean(row.addr_st) or "CA",
        _clean(row.addr_zip),
    ]
    # Drop empties after stripping so blank parts never leave double spaces.
    return " ".join(part for part in parts if part)

In [None]:
# Load the project locations, assemble one address string per row, and
# geocode each of them with a progress bar.
proj_loc = con.table("grnt_prjct_loctn").execute()
proj_full_address = proj_loc.apply(combine_address, axis=1)
proj_addr = proj_full_address.progress_apply(geocode)

In [None]:
# Put the geocoded columns next to the project location table and write the
# result to the bucket as GeoJSON.
proj_loc_geocoded = geopandas.GeoDataFrame(
    pandas.concat([proj_loc, proj_addr], axis=1)
)
proj_loc_geocoded.to_file(
    f"{BUCKET}/project_location_geocode.geojson", driver="GeoJSON"
)

In [None]:
# Load the GPR table and geocode its project address column with a progress bar.
gpr_table = con.table("grnt_gpr")
grnt_gpr = gpr_table.execute()
gpr_addr = grnt_gpr.proj_addr.progress_apply(geocode)

In [None]:
# Put the geocoded columns next to the GPR table and write the result to the
# bucket as GeoJSON.
gpr_geocoded = geopandas.GeoDataFrame(
    pandas.concat([grnt_gpr, gpr_addr], axis=1)
)
gpr_geocoded.to_file(f"{BUCKET}/grant_gpr_geocode.geojson", driver="GeoJSON")