In [18]:
import getpass
import logging
import os
import pathlib

import geopandas as gpd
import pandas as pd

from dvutils.geospatial import google_geocode_batch
from dvutils.miscio import log_or_print

# Login name of whoever is running the notebook; interpolated into the
# per-user Box sync path (work_dir) defined further down.
user = getpass.getuser()



In [19]:
def setup_logger(logger_name, output_dir):
    """Return a file-backed INFO logger, creating the log directory if needed.

    Safe to call more than once with the same arguments (for example when a
    notebook cell is re-run): a handler for a given log file is attached only
    once, so records are not written to the file multiple times.

    Args:
        logger_name (str): Name of the logger; also used as the stem of the
            log file (``<logger_name>.log``).
        output_dir (str | os.PathLike): Directory that receives the log file.
            Created, including parents, when it does not exist.

    Returns:
        logging.Logger: The named logger, set to INFO, writing to the log file.
    """
    log_dir = pathlib.Path(output_dir)
    # logging.FileHandler does not create missing directories; without this a
    # fresh working directory fails with FileNotFoundError.
    log_dir.mkdir(parents=True, exist_ok=True)
    # FileHandler stores its target as an absolute path in ``baseFilename``,
    # so normalise the same way to make the comparison below exact.
    log_file = os.path.abspath(log_dir / f"{logger_name}.log")

    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # logging.getLogger returns the same object for the same name, so handlers
    # accumulate across calls unless we check for an existing one.
    already_attached = any(
        isinstance(handler, logging.FileHandler) and handler.baseFilename == log_file
        for handler in logger.handlers
    )
    if not already_attached:
        file_handler = logging.FileHandler(log_file)
        file_handler.setFormatter(
            logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        )
        logger.addHandler(file_handler)

    return logger

In [20]:
# Root of the Box-synced FasTrak project folder under /Users/<user>.
work_dir = pathlib.Path(
    "/Users",
    user,
    "Library",
    "CloudStorage",
    "Box-Box",
    "DataViz Projects",
    "Data Services",
    "FasTrak Data",
)

# Cleaned input accounts, the intermediate geocoder output, and the final
# joined layer all sit in the same "Fastrak Accounts Cleaned" subfolder.
ft_data = work_dir.joinpath("Fastrak Accounts Cleaned", "bay_area_fastrak_accounts_cleaned.csv")
gc_data = work_dir.joinpath("Fastrak Accounts Cleaned", "bay_area_fastrak_accounts_geocoded.csv")
final_gc_data = work_dir.joinpath(
    "Fastrak Accounts Cleaned", "bay_area_fastrak_accounts_geocoded_final.geojson"
)

# ArcGIS FeatureServer queries requesting all fields (outFields=*) for all
# features (where=1=1) as GeoJSON.
epc_data = (
    "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services"
    + "/draft_equity_priority_communities_pba2050plus_acs2022a/FeatureServer/0/query"
    + "?outFields=*&where=1%3D1&f=geojson"
)
tract_data = (
    "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services"
    + "/region_2020_censustract/FeatureServer/0/query"
    + "?outFields=*&where=1%3D1&f=geojson"
)

In [21]:
# Shared logger passed to log_or_print by every helper in this notebook.
# setup_logger points it at Logs/fastrak_geocoding.log, resolved relative to
# the current working directory.
LOGGER = setup_logger(logger_name="fastrak_geocoding", output_dir="Logs")

In [22]:
def read_fastrak_data(file_path):
    """Load the Fastrak accounts CSV and log how many rows were read.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        pd.DataFrame: The parsed CSV contents.
    """
    log_or_print(f"Reading Fastrak data from {file_path}", LOGGER)
    accounts = pd.read_csv(file_path)
    record_count = len(accounts)
    log_or_print(f"Read {record_count} records from Fastrak data", LOGGER)
    return accounts

In [23]:
def read_epc_data(file_path):
    """Load the EPC layer with geopandas and log the feature count.

    Args:
        file_path: Path or URL readable by ``gpd.read_file``.

    Returns:
        The object returned by ``gpd.read_file`` for ``file_path``.
    """
    log_or_print(f"Reading EPC data from {file_path}", LOGGER)
    epc_layer = gpd.read_file(file_path)
    feature_count = len(epc_layer)
    log_or_print(f"Read {feature_count} records from {file_path}", LOGGER)
    return epc_layer

In [24]:
def read_tract_data(file_path):
    """Load the census tract layer with geopandas and log the feature count.

    Args:
        file_path: Path or URL readable by ``gpd.read_file``.

    Returns:
        The object returned by ``gpd.read_file`` for ``file_path``.
    """
    log_or_print(f"Reading tract data from {file_path}", LOGGER)
    tract_layer = gpd.read_file(file_path)
    feature_count = len(tract_layer)
    log_or_print(f"Read {feature_count} records from {file_path}", LOGGER)
    return tract_layer

In [25]:
def write_geocoded_data(df, file_path):
    """Persist the geocoder output as CSV (without the index) and log the row count.

    Args:
        df: Frame to write; only ``to_csv`` and ``len`` are used on it.
        file_path: Destination path of the CSV file.
    """
    log_or_print(f"Writing geocoded data to {file_path}", LOGGER)
    df.to_csv(file_path, index=False)
    rows_written = len(df)
    log_or_print(f"Wrote {rows_written} records to {file_path}", LOGGER)

In [26]:
def sjoin_geocoded_data(ft_gdf, tracts_gdf):
    """Attach tract attributes to each geocoded Fastrak record.

    Performs a left spatial join using the ``intersects`` predicate, so every
    row of ``ft_gdf`` is kept whether or not it falls inside a tract. When the
    two inputs use different CRSs, the tracts are reprojected to the Fastrak
    CRS before joining.

    Args:
        ft_gdf: GeoDataFrame of geocoded Fastrak records (left side).
        tracts_gdf: GeoDataFrame of census tracts (right side).

    Returns:
        The result of ``gpd.sjoin`` for the two inputs.
    """
    n_fastrak, n_tracts = len(ft_gdf), len(tracts_gdf)
    log_or_print(
        f"Spatially joining geocoded Fastrak data with tract data. Fastrack gdf len: {n_fastrak} Tract gdf len: {n_tracts}",
        LOGGER,
    )

    # The join is only meaningful when both layers share a CRS.
    target_crs = ft_gdf.crs
    tracts_aligned = tracts_gdf
    if target_crs != tracts_gdf.crs:
        log_or_print("CRS do not match. Reprojecting tracts data to match Fastrak data", LOGGER)
        tracts_aligned = tracts_gdf.to_crs(target_crs)

    result = gpd.sjoin(ft_gdf, tracts_aligned, how="left", predicate="intersects")
    log_or_print(f"Joined {len(result)} records", LOGGER)
    return result

In [27]:
def read_geocoded_data(file_path):
    """Read previously geocoded results from CSV into a GeoDataFrame.

    Args:
        file_path: Path of the CSV file to read. It must contain a
            ``geometry`` column holding WKT strings.

    Returns:
        gpd.GeoDataFrame: The CSV rows with ``geometry`` parsed from WKT and
        the CRS set to EPSG:4326.
    """
    log_or_print(f"Reading geocoded data from {file_path}", LOGGER)
    # Read the file the caller asked for. This previously read the
    # module-level gc_data path and silently ignored the argument.
    df = pd.read_csv(file_path)
    # The CSV stores geometries as WKT text; parse them back into shapes.
    g = gpd.GeoSeries.from_wkt(df["geometry"])
    gdf = gpd.GeoDataFrame(df, geometry=g, crs="EPSG:4326")
    log_or_print(f"Read {len(df)} records from geocoded data", LOGGER)
    return gdf

In [28]:
def create_required_cols(df):
    """Return a copy of ``df`` with a ``FULL_ADDRESS`` column added.

    ``FULL_ADDRESS`` is built as ``"<ADDR>, <CITY>, <STATE> <ZIP_CODE>"``, with
    ``ZIP_CODE`` converted to string first. The input frame is not modified.

    Args:
        df (pd.DataFrame): Frame with ``ADDR``, ``CITY``, ``STATE`` and
            ``ZIP_CODE`` columns.

    Returns:
        pd.DataFrame: Copy of ``df`` including ``FULL_ADDRESS``.

    Raises:
        ValueError: If any of the four source columns is missing.
    """
    log_or_print("Creating required columns for geocoding", LOGGER)

    required_columns = ("ADDR", "CITY", "STATE", "ZIP_CODE")

    # Fail early, naming every absent column at once.
    absent = [name for name in required_columns if name not in df.columns]
    if absent:
        error_message = f"Missing required columns: {', '.join(absent)}"
        log_or_print(error_message, LOGGER)
        raise ValueError(error_message)

    with_address = df.copy()
    zip_text = with_address["ZIP_CODE"].astype(str)
    with_address["FULL_ADDRESS"] = (
        with_address["ADDR"] + ", " + with_address["CITY"] + ", " + with_address["STATE"] + " " + zip_text
    )

    log_or_print("Created FULL_ADDRESS column", LOGGER)
    return with_address

In [29]:
def batch_geocode_addresses(df, out_file_path, overwrite_local=False):
    """Batch geocode addresses, reusing a cached result file when available.

    Wraps ``dvutils.geospatial.google_geocode_batch``. When ``out_file_path``
    already exists and ``overwrite_local`` is False, the cached file is read
    and returned without calling the geocoder. Otherwise the addresses in
    ``df["FULL_ADDRESS"]`` are geocoded, the selected output columns are
    written to ``out_file_path``, and that file is read back so both code
    paths return the same kind of object.

    Args:
        df (pd.DataFrame): DataFrame containing a ``FULL_ADDRESS`` column.
        out_file_path (str | pathlib.Path): Path of the geocoded CSV file.
        overwrite_local (bool, optional): When True, geocode again even if
            ``out_file_path`` exists. Defaults to False.

    Returns:
        gpd.GeoDataFrame: Geocoded addresses as produced by
        ``read_geocoded_data``, with the following columns:
            - address_orig: Original address
            - formatted_address: Formatted address
            - geometry_location_type: Location type of the geocoded address
            - types: Types of the geocoded address
            - partial_match: Whether the geocoded address is a partial match
            - geometry: Geometric information of the geocoded address

    Raises:
        Exception: Any error raised by the geocoder, or while reading the bad
            addresses file, is logged and re-raised unchanged.
    """

    log_or_print(f"Starting batch address geocoding on {len(df)} records", LOGGER)

    # If out file exists, and overwrite_local is False, read the geocoded data
    if not overwrite_local and pathlib.Path(out_file_path).exists():
        log_or_print(f"Local geocoded data file exists at {out_file_path}. Reading local data file", LOGGER)
        results_df = read_geocoded_data(out_file_path)
        log_or_print(f"Read {len(results_df)} records from geocoded data", LOGGER)
        return results_df

    out_cols = [
        "address_orig",
        "formatted_address",
        "geometry_location_type",
        "types",
        "partial_match",
        "geometry",
    ]

    try:
        results_df = google_geocode_batch(
            address_list=df["FULL_ADDRESS"].tolist(),
            include_details=True,
            allowed_location_types=["ROOFTOP", "RANGE_INTERPOLATED"],
        )
        log_or_print(f"Finished batch address geocoding. {len(results_df)} geocoded", LOGGER)
    except Exception as e:
        log_or_print(f"Error during batch geocoding: {e}", LOGGER)
        raise

    # check for bad results by checking if bad_addresses.txt file exists
    # NOTE(review): presumably written by google_geocode_batch into the current
    # working directory - confirm; a stale file from an earlier run would also
    # be picked up here.
    bad_address_file = "bad_addresses.txt"
    if pathlib.Path(bad_address_file).exists():
        log_or_print(f"Bad addresses file found at {bad_address_file}", LOGGER)

        try:
            with open(bad_address_file, "r") as f:
                bad_addresses = f.read().splitlines()
            log_or_print(f"Found {len(bad_addresses)} bad addresses", LOGGER)
        except Exception as e:
            log_or_print(f"Error reading bad addresses file: {e}", LOGGER)
            raise

    # write the geocoded data
    write_geocoded_data(results_df[out_cols], out_file_path)

    # Previously this path fell off the end and returned None, so callers only
    # received data when the cache already existed. Reading the file back gives
    # the same GeoDataFrame a cached run returns.
    return read_geocoded_data(out_file_path)

In [30]:
# join the EPC data
def join_epc_data(joined_gdf, epc_gdf, epc_cols=None):
    """Join EPC attributes onto the tract-joined Fastrak data.

    Performs a left merge of ``joined_gdf.geoid`` against
    ``epc_gdf.tract_geoid``, so every row of ``joined_gdf`` is kept.

    Args:
        joined_gdf: Fastrak records already joined to tracts; must contain a
            ``geoid`` column.
        epc_gdf: EPC layer; must contain the columns listed in ``epc_cols``.
        epc_cols (list[str], optional): Columns taken from ``epc_gdf``. Must
            include ``tract_geoid``, which is the merge key. Defaults to
            ``["tract_geoid", "epc_2050p"]``.

    Returns:
        The merged frame returned by ``pd.merge``.
    """
    # Use a None sentinel rather than a list default, which would be one
    # object shared by every call.
    if epc_cols is None:
        epc_cols = ["tract_geoid", "epc_2050p"]

    log_or_print(f"Joining the sjoin tract and Fastrak data to EPC data. Sjoin Fastrak data: {len(joined_gdf)} EPC data: {len(epc_gdf)}", LOGGER)
    joined_gdf = pd.merge(joined_gdf, epc_gdf[epc_cols], left_on="geoid", right_on="tract_geoid", how="left")
    log_or_print(f"Joined {len(joined_gdf)} records", LOGGER)
    return joined_gdf

In [31]:
# join to the original ft data
def join_original_data(joined_gdf, ft_data):
    """Merge the geocoded results back onto the full Fastrak account table.

    ``FULL_ADDRESS`` in ``ft_data`` is renamed to ``address_orig`` and used as
    the key of a right merge, so every row of ``ft_data`` is kept even when it
    has no geocoded counterpart.

    Args:
        joined_gdf: Geocoded records containing an ``address_orig`` column.
        ft_data: Original Fastrak frame containing a ``FULL_ADDRESS`` column.

    Returns:
        The merged frame returned by ``pd.merge``.
    """
    n_joined, n_original = len(joined_gdf), len(ft_data)
    log_or_print(f"Joining the original Fastrak data to the joined data. Joined data: {n_joined} Original Fastrak data: {n_original}", LOGGER)

    # Align the key name with the geocoder output before merging.
    accounts = ft_data.rename(columns={"FULL_ADDRESS": "address_orig"})
    merged = pd.merge(joined_gdf, accounts, on="address_orig", how="right")

    log_or_print(f"Joined {len(merged)} records", LOGGER)
    return merged

In [32]:
# final geocoding post processing


def geocode_post_processing(gdf):
    """Flag geocode quality and region membership, then drop helper columns.

    Adds two 0/1 integer columns:
        - ``match``: 1 when ``geometry_location_type`` is ROOFTOP or
          RANGE_INTERPOLATED and ``partial_match`` is null.
        - ``in_region``: 1 when ``index_right`` is not null.

    The input frame is left untouched; all changes are made on a copy.

    Args:
        gdf: Frame containing ``geometry_location_type``, ``types``,
            ``partial_match``, ``index_right`` and ``tract_geoid`` columns.

    Returns:
        A copy of ``gdf`` with ``match`` and ``in_region`` added and the five
        helper columns listed above removed.
    """
    # Work on a copy so the caller's frame is not left half-modified (new flag
    # columns added, helper columns still present) after this returns.
    gdf = gdf.copy()

    # classify geocode accuracy
    log_or_print(
        "Flagging matches. Only a match if geometry_location_type in: [ROOFTOP, 'RANGE_INTERPOLATED] and partial_match = False",
        LOGGER,
    )
    # NOTE(review): a null partial_match is treated as "not a partial match";
    # presumably the geocoder only fills the field for partial hits - confirm.
    gdf["match"] = (
        (gdf["geometry_location_type"].isin(["ROOFTOP", "RANGE_INTERPOLATED"]))
        & (gdf["partial_match"].isnull())
    ).astype(int)
    # log true/false counts
    log_or_print(f"Flagged {gdf['match'].value_counts().to_dict()} records as matches", LOGGER)

    # Flag data within the region: a populated index_right is used as the
    # signal that the left spatial join found a tract for the row.
    log_or_print("Flagging data within the region", LOGGER)
    gdf["in_region"] = (gdf["index_right"].notnull()).astype(int)
    log_or_print(
        f"Flagged {gdf['in_region'].value_counts().to_dict()} records within the region", LOGGER
    )

    # drop unnecessary columns
    drop_cols = ["geometry_location_type", "types", "partial_match", "index_right", "tract_geoid"]
    log_or_print(f"Dropping unnecessary columns: {drop_cols}", LOGGER)
    gdf = gdf.drop(columns=drop_cols)

    return gdf

In [33]:
# final post processing

def final_post_processing(gdf, out_file_path):
    """Fill the flag columns, log summary counts, and write the result as GeoJSON.

    Nulls in ``epc_2050p``, ``match`` and ``in_region`` are replaced by 0 and
    the columns cast to int (this assignment is applied to ``gdf`` itself).
    The ``geometry`` column is moved to the last position before writing.

    Args:
        gdf: Frame containing ``epc_2050p``, ``match``, ``in_region`` and
            ``geometry`` columns, and providing ``to_file``.
        out_file_path: Destination path of the GeoJSON file.
    """
    # update epc_2050p, match, and in_region columns so they are not null
    flag_cols = ["epc_2050p", "match", "in_region"]
    log_or_print(f"Updating columns {flag_cols} to not null", LOGGER)
    gdf[flag_cols] = gdf[flag_cols].fillna(0).astype(int)

    # move geometry column to the end
    log_or_print("Moving geometry column to the end", LOGGER)
    column_order = [name for name in gdf.columns if name != "geometry"]
    column_order.append("geometry")
    ordered = gdf[column_order]

    # provide log summary statistics, one line per flag
    summaries = (("geocoded", "match"), ("in region", "in_region"), ("EPC", "epc_2050p"))
    for label, column in summaries:
        counts = ordered[column].value_counts().to_dict()
        log_or_print(f"Fastrak data {label} results: {counts}", LOGGER)

    # write the final geocoded data to geojson file
    log_or_print(f"Writing final geocoded data to {out_file_path} of length {len(ordered)}", LOGGER)
    ordered.to_file(out_file_path, driver="GeoJSON")

In [34]:
def main():
    """Run the full pipeline: read, geocode, join to tracts and EPCs, and export.

    Steps, in order: read the Fastrak accounts, build ``FULL_ADDRESS``,
    de-duplicate on it, geocode (or load cached results), read the tract and
    EPC layers, spatially join to tracts, merge EPC attributes, flag match
    quality and region membership, merge back onto the full account table, and
    write the final GeoJSON.
    """
    # read in the data and build the address key
    accounts_df = create_required_cols(read_fastrak_data(ft_data))

    # drop duplicated addresses so each distinct address is geocoded once
    duplicate_count = accounts_df.duplicated(subset=["FULL_ADDRESS"]).sum()
    log_or_print(f"Dropping {duplicate_count} duplicated addresses", LOGGER)
    unique_addresses_df = accounts_df.drop_duplicates(subset=["FULL_ADDRESS"])

    # geocode the addresses (uses the cached file at gc_data when present)
    geocoded_gdf = batch_geocode_addresses(unique_addresses_df, gc_data, overwrite_local=False)

    # read the reference layers: tracts first, then EPCs
    tracts = read_tract_data(tract_data)
    epcs = read_epc_data(epc_data)

    # spatially join the geocoded data with the tract data, then add EPC fields
    tract_cols = ["geoid", "geometry"]
    enriched_gdf = sjoin_geocoded_data(geocoded_gdf, tracts[tract_cols])
    enriched_gdf = join_epc_data(enriched_gdf, epcs)

    # geocode post processing
    enriched_gdf = geocode_post_processing(enriched_gdf)

    # join to the original (non-deduplicated) ft data and export
    output_gdf = join_original_data(enriched_gdf, accounts_df)
    final_post_processing(output_gdf, final_gc_data)

# Run the full pipeline when this module is the entry point.
if __name__ == "__main__":
    main()