In [1]:
import getpass
import logging
import pathlib
import pandas as pd
import geopandas as gpd
from dvutils.miscio import log_or_print
from dvutils.geospatial import google_geocode_batch

user = getpass.getuser()

Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json


In [2]:
def setup_logger(logger_name, output_dir):
    """Set up a logger with the specified name and output directory."""
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    
    # Create a file handler for logging
    log_file = f"{output_dir}/{logger_name}.log"
    file_handler = logging.FileHandler(log_file)
    
    # Create a formatter and set it for the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    
    # Add the handler to the logger
    logger.addHandler(file_handler)
    
    return logger

In [3]:
work_dir = pathlib.Path(f"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Data Services/FasTrak Data")
ft_data = work_dir / "Fastrak Accounts Cleaned" / "bay_area_fastrak_accounts_cleaned.csv"

In [4]:
LOGGER = setup_logger(logger_name="fastrak_geocoding", output_dir="Logs")

In [5]:
def read_fastrak_data(file_path):
    """Read the Fastrak data from the specified file path."""
    log_or_print(f"Reading Fastrak data from {file_path}", LOGGER)
    df = pd.read_csv(file_path)
    log_or_print(f"Read {len(df)} records from Fastrak data", LOGGER)
    return df

In [6]:
def create_required_cols(df):
    """Create the required columns for geocoding."""
    log_or_print("Creating required columns for geocoding", LOGGER)

    required_columns = ["ADDR", "CITY", "STATE", "ZIP_CODE"]
    
    # Check if all required columns exist in the DataFrame
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        error_message = f"Missing required columns: {', '.join(missing_columns)}"
        log_or_print(error_message, LOGGER)
        raise ValueError(error_message)

    df = df.copy()
    df["FULL_ADDRESS"] = df["ADDR"] + ", " + df["CITY"] + ", " + df["STATE"] + " " + df["ZIP_CODE"].astype(str)

    log_or_print("Created FULL_ADDRESS column", LOGGER)
    return df

In [7]:
def batch_geocode_addresses(df):
    """_summary_

    Args:
        df (pd.DataFrame): DataFrame containing the addresses to geocode.

    Returns:
        _type_: _description_
    """

    log_or_print(f"Starting batch address geocoding on {len(df)} records", LOGGER)

    out_cols = [
        "address_orig",
        "formatted_address",
        "geometry_location_type",
        "types",
        "partial_match",
        "geometry",
    ]

    try:
        results_df = google_geocode_batch(
            address_list=df["FULL_ADDRESS"].tolist(),
            include_details=True,
            allowed_location_types=["ROOFTOP", "RANGE_INTERPOLATED"],
        )
        log_or_print(f"Finished batch address geocoding. {len(results_df)} geocoded", LOGGER)
    except Exception as e:
        log_or_print(f"Error during batch geocoding: {e}", LOGGER)
        raise

    # check for bad results by checking if bad_addresses.txt file exists
    bad_address_file = "bad_addresses.txt"
    if pathlib.Path(bad_address_file).exists():
        log_or_print(f"Bad addresses file found at {bad_address_file}", LOGGER)

        try:
            with open(bad_address_file, "r") as f:
                bad_addresses = f.read().splitlines()
            log_or_print(f"Found {len(bad_addresses)} bad addresses", LOGGER)
        except Exception as e:
            log_or_print(f"Error reading bad addresses file: {e}", LOGGER)
            raise

    return results_df[out_cols]

In [8]:
# read in the data
df = read_fastrak_data(ft_data)

In [9]:
# create the required columns
df = create_required_cols(df)

In [10]:
# batch geocode the addresses
results_df = batch_geocode_addresses(df)

Unable to geocode the following address: PO BOX 925, Diablo, CA 94528
Unable to geocode the following address: PO BOX 650, Pescadero, CA 94060
Unable to geocode the following address: 916 SOUTHLAND DR, Vallejo, CA 94589
Unable to geocode the following address: PO BOX 916, Stinson Beach, CA 94970
Unable to geocode the following address: PO BOX 916, Clayton, CA 94517
Error geocoding address: 901 BRODERICK ST, San Francisco, CA 94115: 500 Server Error: Internal Server Error for url: https://maps.googleapis.com/maps/api/geocode/json?address=901+BRODERICK+ST%2C+San+Francisco%2C+CA+94115&key=AIzaSyBzgPmkRUprYzmsMokf90ll6FeQoVp09MA
Unable to geocode the following address: 901 BRODERICK ST, San Francisco, CA 94115
Error geocoding address: 614 28TH ST, San Francisco, CA 94131: 400 Client Error: Bad Request for url: https://maps.googleapis.com/maps/api/geocode/json?address=%7F614+28TH+ST%2C+San+Francisco%2C+CA+94131&key=AIzaSyBzgPmkRUprYzmsMokf90ll6FeQoVp09MA
Unable to geocode the following add