In [41]:
import os
import sys
import getpass
import numpy as np
import pandas as pd

user = getpass.getuser()

DVUTILS_LOCAL_CLONE_PATH = f"/Users/{user}/Documents/GitHub/dvutils"
sys.path.insert(0, DVUTILS_LOCAL_CLONE_PATH)
from utils_io import *

from arcgis import GIS

In [42]:
analysis_crs = "EPSG:26910"

In [43]:
def geo_assign_fields(
    id_df,
    id_field,
    overlay_df,
    overlay_fields,
    return_intersection_area=False,
    overlay_within_pct=None,
):
    """Given an id_df and an overlay_df, assigns the overlay fields.

    Methodology:
    Assigns based on the area with the largest intersection with each id_field (where there are
    duplicate assignments).

    Notes:
    - This is primarily used for generating correspondences, such as new parcel id : old parcel id
    - If any overlay_fields also occur in the id_df, append a _y suffix to the overlay field

    Args:
        id_df (geopandas GeoDataFrame): The ID GeoDataFrame
        id_field (str): The name of the ID column in the ID GeoDataFrame
        overlay_df (geopandas GeoDataFrame): The overlay GeoDataFrame
        overlay_fields (list): A list of overlay fields to assign to the ID GeoDataFrame
        return_intersection_area (bool, optional): Flag for whether to return the intersection area
            of the overlay. Defaults to False.
        overlay_within_pct (float, optional): Value between 0 and 1. If provided, will only assign ID
            field if the overlay intersection area is at least this percentage of the ID field area. Defaults to None.
    Returns:
        geopandas GeoDataFrame: The ID GeoDataFrame with the overlay fields assigned by largest
            intersection area
    """
    a = time.time()
    if id_df.crs != analysis_crs or overlay_df.crs != analysis_crs:
        logger.debug("base geo crs: {}".format(id_df.crs))
        logger.debug("overlay geo crs: {}".format(overlay_df.crs))
        logger.debug("Both GeoDataFrames must be in EPSG:26910. Reprojecting:")
        id_df = project_to_analysis_crs(id_df)
        overlay_df = project_to_analysis_crs(overlay_df)

    join_df = gpd.overlay(id_df, overlay_df, how="intersection")
    join_df["intersection_sq_m"] = join_df.geometry.area
    join_df["idx"] = join_df.index

    max_idxs = (
        join_df.groupby(id_field, as_index=False)
        .agg({"intersection_sq_m": "idxmax"})
        .rename(columns={"intersection_sq_m": "idx"})
    )
    join_df = join_df.merge(max_idxs)

    final_fields = [id_field] + overlay_fields

    # calculate intersection area and share of id_df in intersection
    id_df["base_sq_m"] = id_df.geometry.area
    final_assignment = id_df[[id_field, "base_sq_m"]].merge(
        join_df[final_fields + ["intersection_sq_m"]], how="left"
    )
    final_assignment["area_share"] = (
        final_assignment["intersection_sq_m"] / final_assignment["base_sq_m"]
    )

    # set the assignment to None if no more than overlay_within_pct of the overlay_df is in the intersection
    if overlay_within_pct is not None:
        final_assignment.loc[
            final_assignment["area_share"] < overlay_within_pct, overlay_fields
        ] = None

    b = time.time()
    print(f"took {print_runtime(b-a)}")
    if return_intersection_area:
        return final_assignment[final_fields + ["base_sq_m", "intersection_sq_m", "area_share"]]
    else:
        return final_assignment

In [44]:
# authenticate to agol
password = os.getenv("AGOL_CONTENT_PASSWORD")
gis = GIS(url="https://mtc.maps.arcgis.com/home", username="content_MTC", password=password)

In [45]:
base_url = "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/TOC_Station_Area_Buffers/FeatureServer/49"
p10_parcels = pull_geotable_redshift(
    "SELECT * FROM urbansim_2022_p10.urbansim_parcels_topo_fix WHERE county = 'Alameda'",
    crs=analysis_crs,
    output_crs=analysis_crs,
)
toc = pull_geotable_agol(base_url=base_url, client=gis)

took 22.6322 seconds
Breaking feature service layer IDs into 50 chunks


In [46]:
# try function without overlay within percent first to see how many parcels are assigned
p10_toc = geo_assign_fields(
    id_df=p10_parcels,
    id_field="parcel_id",
    overlay_df=toc,
    overlay_fields=["corridor_id", "station_name"],
)

took 8.5105 seconds


In [47]:
p10_toc_assigned = p10_toc.query("corridor_id.notnull()")
p10_toc_assigned.shape

(57098, 6)

In [48]:
# merge with p10_parcels
p10_toc_gdf = pd.merge(p10_parcels, p10_toc_assigned, on="parcel_id", how="right")

In [1]:
# m = toc.explore(color="red")
# p10_toc_gdf.explore(m=m)

In [50]:
# now test the geo_assign_fields function with an overlay within percentage
p10_toc_ow = geo_assign_fields(
    id_df=p10_parcels,
    id_field="parcel_id",
    overlay_df=toc,
    overlay_fields=["corridor_id", "station_name"],
    overlay_within_pct=0.5,
)

took 8.4923 seconds


In [51]:
p10_toc_assigned_ow = p10_toc_ow.query("corridor_id.notnull()")
p10_toc_assigned_ow.shape

(55035, 6)

In [52]:
p10_toc_ow_gdf = pd.merge(p10_parcels, p10_toc_assigned_ow, on="parcel_id", how="right")

In [2]:
# m = toc.explore(color="red")
# p10_toc_ow_gdf.explore(m=m)