In [1]:
import fsspec
import geopandas
import geopy
import ibis
import intake
import pandas
import shapely
import tqdm

# Register tqdm's pandas integration so that Series/DataFrame objects gain
# `progress_apply`, which the geocoding cells below rely on.
tqdm.tqdm.pandas()
# S3 URL prefix used for the CPAS SQLite download and for the geocoded outputs.
BUCKET = "s3://hcid-cdbg-project-ita-data"

  from pandas import Panel


## Load the data from CPAS

In [2]:
# Copy the CPAS SQLite database from the S3 bucket into the working directory,
# then open the local copy with ibis. `con` is used below to list and read tables.
fs = fsspec.filesystem("s3")
fs.download(f"{BUCKET}/cpas.sqlite", "cpas.sqlite")
con = ibis.sqlite.connect("cpas.sqlite")

In [3]:
# Open the intake catalog; the path is relative to the notebook's working directory.
cat = intake.open_catalog("../catalogs/cpas.yml")

In [4]:
# Read four catalog entries into memory; the following cells inspect the
# columns of three of them while looking for address fields.
app_info = cat.application_info.read()
app_list = cat.application_list.read()
pep = cat.pep_info.read()
gpr = cat.grant_gpr.read()

## Which columns have addresses in them?

In [5]:
# Show the column names of `app_info` to look for address fields.
display(app_info.columns)

Index(['pgm_year', 'proj_id', 'reimbursetotal', 'approvedcdbgtotal',
       'cdbgrequested', 'prjct_ttl', 'grnt_aplctn_id', 'bypass_email_notif_yn',
       'prpsd_contract_cnt', 'grnt_l_proj_loc_rsn', 'knwn_site_loc_yn',
       'cnfid_yn', 'pepsubmittedcount', 'pepapprovedcount', 'locationcount'],
      dtype='object')

In [6]:
# Show the column names of `pep` to look for address fields.
display(pep.columns)

Index(['pgm_year', 'grnt_pep_id', 'pep_agcy_nm', 'pep_proj_nm',
       'citywide_cncl_dist_yn', 'grnt_l_pep_stts', 'pep_vrftn_dt',
       'pep_to_environ_dt', 'pep_to_mgmt_dt', 'pep_send_out_dt',
       'grnt_l_send_out_mthd', 'cdbo_send_vrftn_dt', 'pep_rcv_dt',
       'grnt_l_pep_typ_value', 'prjct_ttl', 'cdbg_fnd_amt', 'proj_id',
       'trackingcomment', 'pep_vrftn_comment', 'pep_to_environ_comment',
       'pep_to_mgmt_comment', 'cdbo_send_vrftn_comment',
       'grnt_l_pep_stts_value', 'grnt_l_send_out_mthd_value', 'department',
       'grnt_l_rec_color', 'grnt_l_rec_color_value', 'council_district',
       'status_date'],
      dtype='object')

In [7]:
# Show the column names of `gpr` to look for address fields.
display(gpr.columns)

Index(['pgm_year', 'yr', 'grnt_gpr_id', 'dept', 'pid', 'actv_nbr', 'proj_nm',
       'actv_nm', 'proj_addr', 'proj_desc', 'natl_obj', 'hud_cd',
       'grnt_hud_cd_id', 'ttl', 'regulation_cit', 'grnt_l_accmplsh', 'obj_cnt',
       'otcm_cnt', 'accmplsh_actl_units', 'accmplsh_narrtv', 'fund_amt',
       'drn_thru_amt', 'tot_accmplsh', 'tot_hsg', 'accmplsh_narrtv_updt',
       'aprv_anlst_email', 'aprv_anlst_tel', 'aprv_anlst_sig_dt',
       'aprv_supv_nm', 'aprv_supv_email', 'aprv_anlst_dept_nm', 'gpr_subm_dt',
       'grnt_l_gpr_actv_stts', 'ent_in_idis_dt'],
      dtype='object')

In [8]:
# List every table available through the SQLite connection.
con.list_tables()

['grnt_accmplsh_dtl',
 'grnt_aplc_conplan_catg',
 'grnt_aplc_goal_otcm',
 'grnt_aplctn',
 'grnt_aplctn_attchmnt',
 'grnt_aplctn_note',
 'grnt_aplctn_prog',
 'grnt_bud',
 'grnt_bud_expns',
 'grnt_bud_gnrl_dtl',
 'grnt_case_note',
 'grnt_cbdo',
 'grnt_cbdo_doc',
 'grnt_census_tract',
 'grnt_chk_list',
 'grnt_chk_list_photo',
 'grnt_cntct',
 'grnt_cptl_plan',
 'grnt_cptl_plan_dtl',
 'grnt_cptl_typ',
 'grnt_form',
 'grnt_goal_otcm',
 'grnt_gpr',
 'grnt_gpr_attchmnt',
 'grnt_gpr_census_tract',
 'grnt_gpr_duns',
 'grnt_gpr_incm_lvl',
 'grnt_gpr_job_catg',
 'grnt_gpr_race',
 'grnt_hist',
 'grnt_hud_cd',
 'grnt_hud_objctive',
 'grnt_pep',
 'grnt_pep_accmplsh',
 'grnt_pep_accs_auth',
 'grnt_pep_attchmnt',
 'grnt_pep_chk_list',
 'grnt_pep_chk_list_photo',
 'grnt_pep_cncl_dist',
 'grnt_pep_goal_otcm',
 'grnt_pep_lvrg_resrc',
 'grnt_pep_note',
 'grnt_pep_oth_fnd_src',
 'grnt_pep_prjctd_reimb',
 'grnt_pps_accmplsh',
 'grnt_pps_accmplsh_dtl',
 'grnt_pps_addr',
 'grnt_prjct_accmplsh',
 'grnt_prjct_fn

In [9]:
# Scan every table in the database and report each column whose name
# contains "addr", printed as "<table>: <column>".
for table_name in con.list_tables():
    for column_name in con.table(table_name).columns:
        if "addr" not in column_name:
            continue
        print(f"{table_name}: {column_name}")

grnt_aplctn: proj_addr_cmty_wide_yn
grnt_chk_list: mntr_addr
grnt_cntct: addr_nbr
grnt_cntct: addr_frctn
grnt_cntct: addr_dir
grnt_cntct: addr_nm
grnt_cntct: addr_sfx
grnt_cntct: addr_apt
grnt_cntct: addr_cty
grnt_cntct: addr_st
grnt_cntct: addr_zip
grnt_gpr: proj_addr
grnt_pep: pep_proj_loc_addr
grnt_pep_chk_list: mntr_addr
grnt_prjct_loctn: addr_nbr
grnt_prjct_loctn: addr_frctn
grnt_prjct_loctn: addr_dir
grnt_prjct_loctn: addr_nm
grnt_prjct_loctn: addr_sfx
grnt_prjct_loctn: addr_apt
grnt_prjct_loctn: addr_cty
grnt_prjct_loctn: addr_st
grnt_prjct_loctn: addr_zip
grnt_prjct_loctn: email_addr


From the above, we are likely most interested in addresses from `grnt_prjct_loctn`, `grnt_gpr`, and `grnt_pep`.
We probably don't need the address for the grant contact person.

In [10]:
# Materialise the whole `grnt_pep` table as a DataFrame and display it;
# its `pep_proj_loc_addr` column is geocoded further down.
grnt_pep = con.table("grnt_pep").execute()
grnt_pep

Unnamed: 0,grnt_pep_id,grnt_aplctn_id,grnt_l_davis_bacon,pep_agcy_nm,pep_proj_nm,pep_proj_loc_addr,grnt_hud_cd_id,grnt_hud_objctive_id,cdbg_fnd_amt,oth_fnd_amt,...,amort_ln_nbr_mnth,defr_ln_int_rate,defr_ln_nbr_mnth,amort_ln_amt,defr_ln_amt,ammnd_typ,serv_area_zip,grnt_l_fund_agcy_typ,pep_from_environ_dt,lst_updt
0,173627,100541,406,HOMEBOY INDUSTRIES,COUNSELING AND TATTOO REMOVAL,"1916 EAST FIRST STREET\r\nLOS ANGELES, CA 90033",26,104,17956.0,,...,,,,,,,,,NaT,
1,173631,100541,406,ATWATER PARK CENTER,PARENT/CHILD TRAINING PROJECT,3370 PERLITA AVENUE\r\nLOS ANGELES 90039,38,104,18217.0,,...,,,,,,,,,NaT,
2,173633,100541,406,ASSISTANCE LEAGUE OF SOUTHERN CALIFORNIA,YOUTH NETWORK,"1370 N. ST ANDREWS PLACE, LOS ANGELES, CA 90028",30,104,18217.0,,...,,,,,,,,,NaT,
3,173635,100541,406,LOS ANGELES BOYS AND GIRLS CLUB,PROJECT LEARN,"2635 PASADENA AVENUE\r\nLOS ANGELES, CA 90031",26,104,17956.0,,...,,,,,,,,,NaT,
4,173637,100541,406,ECHO PARK SILVERLAKE PEOPLE'S CHILD CARE CENTER,PLAYGROUP,"1953 LAKESHORE\r\nAVENUE, LOS ANGELES, CA 90039",38,104,17475.0,,...,,,,,,,,,NaT,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2858,516285,506543,406,CENTER FOR THE PACIFIC ASIAN FAMILY,DOMESTIC VIOLENCE SHELTER OPERATIONS,CONFIDENTIAL,33,104,309924.0,,...,,,,,,,90017.0,418089.0,NaT,
2859,516291,506543,406,JENESSE CENTER INC,DOMESTIC VIOLENCE SHELTER OPERATIONS,CONFIDENTIAL,33,104,459302.0,,...,,,,,,,90017.0,418089.0,NaT,
2860,516297,506543,406,DOMESTIC VIOLENCE PROJECT COORDINATOR,DOMESTIC VIOLENCE SHELTER OPERATIONS,CONFIDENTIAL,33,104,81145.0,,...,,,,,,,90017.0,418091.0,NaT,
2861,516303,506543,406,SOUTHERN CALIFORNIA ALCOHOL AND DRUG PROGRAM,DOMESTIC VIOLENCE SHELTER OPERATIONS,CONFIDENTIAL,33,104,152981.0,,...,,,,,,,90017.0,418089.0,NaT,


In [20]:
from geopy.extra.rate_limiter import RateLimiter

geocoder = geopy.ArcGIS()

# Throttle the network call itself (at most one request per second) rather
# than the wrapper below. This way RateLimiter actually sees geocoder service
# errors and can apply its own retry logic, and rows that never reach the
# service do not pay the one-second delay.
_rate_limited_lookup = RateLimiter(geocoder.geocode, min_delay_seconds=1)


def geocode(row):
    """Geocode a single free-text address.

    Parameters
    ----------
    row : str
        The address string to look up. Despite the name this is one value
        (an element of a Series), not a DataFrame row.

    Returns
    -------
    pandas.Series
        Indexed by ``["address", "geometry"]``: the matched address text and a
        ``shapely.geometry.Point(longitude, latitude)``. Both entries are
        ``None`` when the input is missing/blank, when no match is found, or
        when the lookup fails.
    """
    no_match = pandas.Series([None, None], index=["address", "geometry"])

    # Nothing to look up: skip the network round trip entirely.
    if not isinstance(row, str) or not row.strip():
        return no_match

    try:
        loc = _rate_limited_lookup(row)
    except Exception:
        # Best effort: one bad address must not abort an hour-long run.
        # `Exception` (not a bare `except`) so that Ctrl-C / KeyboardInterrupt
        # still stops the loop instead of being silently swallowed.
        return no_match

    if loc is None:
        return no_match

    return pandas.Series(
        [loc.address, shapely.geometry.Point(loc.longitude, loc.latitude)],
        index=["address", "geometry"],
    )

In [22]:
# Geocode every PEP project location address with a progress bar.
# Slow: requests are throttled to about one per second.
pep_addr = grnt_pep.pep_proj_loc_addr.progress_apply(geocode)

100%|██████████| 2863/2863 [48:25<00:00,  1.01s/it]  


In [23]:
# Put the geocoded address/geometry columns alongside the original PEP rows
# and write the combined frame as GeoJSON under BUCKET.
geopandas.GeoDataFrame(
    pandas.concat([grnt_pep, pep_addr], axis=1)
).to_file(f"{BUCKET}/pep_geocode.geojson", driver="GeoJSON")

In [24]:
def combine_address(row):
    """Join the split address fields of one record into a single string.

    Parameters
    ----------
    row : object
        Any object exposing the attributes ``addr_nbr``, ``addr_frctn``,
        ``addr_dir``, ``addr_nm``, ``addr_sfx``, ``addr_apt``, ``addr_cty``,
        ``addr_st`` and ``addr_zip`` (for example a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The non-empty parts, stripped and separated by single spaces, in the
        order listed above. Empty string when every part is missing.
    """
    field_names = (
        "addr_nbr",
        "addr_frctn",
        "addr_dir",
        "addr_nm",
        "addr_sfx",
        "addr_apt",
        "addr_cty",
        "addr_st",
        "addr_zip",
    )
    parts = []
    for field_name in field_names:
        value = getattr(row, field_name)
        # Check for nulls first: NaN is truthy and pandas.NA has no truth value,
        # so neither can be left to the plain falsiness test below.
        if pandas.isna(value) or not value:
            continue
        # Numeric columns that contain nulls are read back as floats; render
        # whole numbers without the trailing ".0" (e.g. 90033.0 -> "90033").
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        text = str(value).strip()
        if text:
            parts.append(text)
    return " ".join(parts)

In [25]:
# Load the project location table, build one address string per row from its
# split address fields, then geocode each string with a progress bar.
# Slow: requests are throttled to about one per second.
proj_loc = con.table("grnt_prjct_loctn").execute()
proj_addr = proj_loc.apply(combine_address, axis=1).progress_apply(geocode)

100%|██████████| 4262/4262 [1:11:07<00:00,  1.00s/it]


In [26]:
# Put the geocoded address/geometry columns alongside the original project
# location rows and write the combined frame as GeoJSON under BUCKET.
geopandas.GeoDataFrame(
    pandas.concat([proj_loc, proj_addr], axis=1)
).to_file(f"{BUCKET}/project_location_geocode.geojson", driver="GeoJSON")

In [27]:
# Load the GPR table and geocode its free-text `proj_addr` column with a
# progress bar. Slow: requests are throttled to about one per second.
grnt_gpr = con.table("grnt_gpr").execute()
gpr_addr = grnt_gpr.proj_addr.progress_apply(geocode)

100%|██████████| 4078/4078 [1:08:02<00:00,  1.00s/it]


In [28]:
# Put the geocoded address/geometry columns alongside the original GPR rows
# and write the combined frame as GeoJSON under BUCKET, matching the
# destination of the other two geocoded outputs (the path previously had no
# bucket prefix, so the file was written to the local working directory).
geopandas.GeoDataFrame(
    pandas.concat([grnt_gpr, gpr_addr], axis=1)
).to_file(f"{BUCKET}/grant_gpr_geocode.geojson", driver="GeoJSON")