In [1]:
import getpass
import logging
import pathlib
import pandas as pd
import geopandas as gpd
from dvutils.miscio import log_or_print
from dvutils.geospatial import google_geocode_batch

user = getpass.getuser()

Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json


In [2]:
def setup_logger(logger_name, output_dir):
    """Set up a logger with the specified name and output directory."""
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)
    
    # Create a file handler for logging
    log_file = f"{output_dir}/{logger_name}.log"
    file_handler = logging.FileHandler(log_file)
    
    # Create a formatter and set it for the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    
    # Add the handler to the logger
    logger.addHandler(file_handler)
    
    return logger

In [10]:
work_dir = pathlib.Path(
    f"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Data Services/FasTrak Data"
)
ft_data = work_dir / "Fastrak Accounts Cleaned" / "bay_area_fastrak_accounts_cleaned.csv"
gc_data = work_dir / "Fastrak Accounts Cleaned" / "bay_area_fastrak_accounts_geocoded.csv"
epc_data = (
    "https://services3.arcgis.com/i2dkYWmb4wHvYPda/arcgis/rest/services/"
    "draft_equity_priority_communities_pba2050plus_acs2022a/FeatureServer/0/query?outFields=*&where=1%3D1&f=geojson"
)

In [4]:
LOGGER = setup_logger(logger_name="fastrak_geocoding", output_dir="Logs")

In [5]:
def read_fastrak_data(file_path):
    """Read the Fastrak data from the specified file path."""
    log_or_print(f"Reading Fastrak data from {file_path}", LOGGER)
    df = pd.read_csv(file_path)
    log_or_print(f"Read {len(df)} records from Fastrak data", LOGGER)
    return df

In [None]:
def read_epc_data(file_path):
    """Read the EPC data from the specified file path."""
    log_or_print(f"Reading EPC data from {file_path}", LOGGER)
    df = pd.read_csv(file_path)
    log_or_print(f"Read {len(df)} records from EPC data", LOGGER)
    return df

In [6]:
def create_required_cols(df):
    """Create the required columns for geocoding."""
    log_or_print("Creating required columns for geocoding", LOGGER)

    required_columns = ["ADDR", "CITY", "STATE", "ZIP_CODE"]
    
    # Check if all required columns exist in the DataFrame
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        error_message = f"Missing required columns: {', '.join(missing_columns)}"
        log_or_print(error_message, LOGGER)
        raise ValueError(error_message)

    df = df.copy()
    df["FULL_ADDRESS"] = df["ADDR"] + ", " + df["CITY"] + ", " + df["STATE"] + " " + df["ZIP_CODE"].astype(str)

    log_or_print("Created FULL_ADDRESS column", LOGGER)
    return df

In [7]:
def batch_geocode_addresses(df):
    """Batch geocode addresses using Google Maps Geocoding API.

    Function to batch geocode wraps dvutils.geospatial.google_geocode_batch. 

    Args:
        df (pd.DataFrame): DataFrame containing the addresses to geocode.

    Returns:
        pd.DataFrame: DataFrame containing the geocoded addresses with the following columns:
            - address_orig: Original address
            - formatted_address: Formatted address
            - geometry_location_type: Location type of the geocoded address
            - types: Types of the geocoded address
            - partial_match: Whether the geocoded address is a partial match
            - geometry: Geometric information of the geocoded address
    """

    log_or_print(f"Starting batch address geocoding on {len(df)} records", LOGGER)

    out_cols = [
        "address_orig",
        "formatted_address",
        "geometry_location_type",
        "types",
        "partial_match",
        "geometry",
    ]

    try:
        results_df = google_geocode_batch(
            address_list=df["FULL_ADDRESS"].tolist(),
            include_details=True,
            allowed_location_types=["ROOFTOP", "RANGE_INTERPOLATED"],
        )
        log_or_print(f"Finished batch address geocoding. {len(results_df)} geocoded", LOGGER)
    except Exception as e:
        log_or_print(f"Error during batch geocoding: {e}", LOGGER)
        raise

    # check for bad results by checking if bad_addresses.txt file exists
    bad_address_file = "bad_addresses.txt"
    if pathlib.Path(bad_address_file).exists():
        log_or_print(f"Bad addresses file found at {bad_address_file}", LOGGER)

        try:
            with open(bad_address_file, "r") as f:
                bad_addresses = f.read().splitlines()
            log_or_print(f"Found {len(bad_addresses)} bad addresses", LOGGER)
        except Exception as e:
            log_or_print(f"Error reading bad addresses file: {e}", LOGGER)
            raise

    return results_df[out_cols]

In [8]:
# read in the data
df = read_fastrak_data(ft_data)
    
# create the required columns
df = create_required_cols(df)

# drop duplicated addresses
log_or_print(f"Dropping {df.duplicated(subset=['FULL_ADDRESS']).sum()} duplicated addresses", LOGGER)
df = df.drop_duplicates(subset=["FULL_ADDRESS"])

# geocode the addresses
results_df = batch_geocode_addresses(df)

Geocoding addresses:   2%|▏         | 14383/784942 [33:51<28:23:15,  7.54it/s] 

Unable to geocode the following address: PO BOX 925, Diablo, CA 94528


Geocoding addresses:   5%|▌         | 40939/784942 [1:36:33<458:16:38,  2.22s/it]

Error geocoding address: 2133 166TH AVE, San Leandro, CA 94578: 500 Server Error: Internal Server Error for url: https://maps.googleapis.com/maps/api/geocode/json?address=2133+166TH+AVE%2C+San+Leandro%2C+CA+94578&key=AIzaSyBzgPmkRUprYzmsMokf90ll6FeQoVp09MA
Unable to geocode the following address: 2133 166TH AVE, San Leandro, CA 94578


Geocoding addresses:   7%|▋         | 56521/784942 [2:25:20<29:57:18,  6.75it/s]    

Unable to geocode the following address: PO BOX 650, Pescadero, CA 94060


Geocoding addresses:   9%|▉         | 74473/784942 [3:22:34<27:22:52,  7.21it/s]    

Unable to geocode the following address: 916 SOUTHLAND DR, Vallejo, CA 94589


Geocoding addresses:  15%|█▍        | 117506/784942 [5:22:57<25:24:47,  7.30it/s]   

Unable to geocode the following address: 568 MONTEREY RD, Morgan Hill, CA 95037


Geocoding addresses:  18%|█▊        | 138649/784942 [6:14:09<25:40:08,  6.99it/s]  

Unable to geocode the following address: PO BOX 916, Stinson Beach, CA 94970


Geocoding addresses:  21%|██        | 161794/784942 [7:08:16<22:29:59,  7.69it/s] 

Unable to geocode the following address: PO BOX 916, Clayton, CA 94517


Geocoding addresses:  21%|██        | 164155/784942 [7:13:39<20:32:42,  8.39it/s]

Error geocoding address: 614 28TH ST, San Francisco, CA 94131: 400 Client Error: Bad Request for url: https://maps.googleapis.com/maps/api/geocode/json?address=%7F614+28TH+ST%2C+San+Francisco%2C+CA+94131&key=AIzaSyBzgPmkRUprYzmsMokf90ll6FeQoVp09MA
Unable to geocode the following address: 614 28TH ST, San Francisco, CA 94131


Geocoding addresses:  27%|██▋       | 209997/784942 [9:34:51<21:20:03,  7.49it/s]    

Unable to geocode the following address: 510 514 6712, Hayward, CA 94544


Geocoding addresses:  27%|██▋       | 212835/784942 [9:41:33<22:28:59,  7.07it/s]

Unable to geocode the following address: 916 H PARK BLVD, San Mateo, CA 94404


Geocoding addresses:  31%|███       | 241135/784942 [10:47:53<20:43:12,  7.29it/s]

Unable to geocode the following address:  PO BOX 916, Suisun City, CA 94585


Geocoding addresses:  38%|███▊      | 297144/784942 [13:02:35<17:18:18,  7.83it/s]  

Unable to geocode the following address: PO BOX 916, Alameda, CA 94501


Geocoding addresses:  38%|███▊      | 298530/784942 [13:05:48<17:10:09,  7.87it/s]

Unable to geocode the following address: PO BOX 916, Danville, CA 94526


Geocoding addresses:  54%|█████▍    | 427720/784942 [18:20:02<13:17:16,  7.47it/s]  

Unable to geocode the following address: 650 LIST SPAWN ST, San Francisco, CA 94122


Geocoding addresses:  61%|██████▏   | 481934/784942 [20:43:19<11:05:04,  7.59it/s]  

Unable to geocode the following address: PO BOX 415, Inverness, CA 94937


Geocoding addresses:  62%|██████▏   | 482853/784942 [20:45:23<11:50:15,  7.09it/s]

Unable to geocode the following address: 71 KELLY CT, Novato, CA 94949


Geocoding addresses:  66%|██████▌   | 515670/784942 [22:15:55<10:49:07,  6.91it/s]    

Unable to geocode the following address: PO BOX 707, Fairfield, CA 94533


Geocoding addresses:  68%|██████▊   | 531493/784942 [22:55:27<9:22:02,  7.52it/s]   

Unable to geocode the following address: PO BOX 209, Oakley, CA 94561


Geocoding addresses:  76%|███████▌  | 595628/784942 [25:38:15<8:50:14,  5.95it/s]   

Unable to geocode the following address: 73 KELLY CT, Novato, CA 94949


Geocoding addresses:  80%|████████  | 629079/784942 [27:02:00<7:17:32,  5.94it/s]   

Unable to geocode the following address: PO BOX 916, Ross, CA 94957


Geocoding addresses:  85%|████████▍ | 664889/784942 [28:56:39<14:41:24,  2.27it/s]    

Unable to geocode the following address: 209 BLACKSTONE CMN, Livermore, CA 94550


Geocoding addresses:  85%|████████▌ | 669651/784942 [29:09:00<4:20:04,  7.39it/s] 

Unable to geocode the following address: 916 APPLE CREEK LN, Santa Rosa, CA 95401


Geocoding addresses:  92%|█████████▏| 719143/784942 [31:17:11<2:17:40,  7.97it/s]   

Unable to geocode the following address: 415 ALAMEDA, Vallejo, CA 94590


Geocoding addresses:  95%|█████████▍| 745608/784942 [32:20:59<1:33:31,  7.01it/s] 

Error geocoding address: 244 ISLETON AVE, Oakland, CA 94603: 400 Client Error: Bad Request for url: https://maps.googleapis.com/maps/api/geocode/json?address=%7F244+ISLETON+AVE%2C+Oakland%2C+CA+94603&key=AIzaSyBzgPmkRUprYzmsMokf90ll6FeQoVp09MA
Unable to geocode the following address: 244 ISLETON AVE, Oakland, CA 94603


Geocoding addresses:  97%|█████████▋| 758007/784942 [32:50:57<1:00:26,  7.43it/s]

Unable to geocode the following address: 925 FILLIMAN ST, San Francisco, CA 94134


Geocoding addresses: 100%|██████████| 784942/784942 [34:04:13<00:00,  6.40it/s]   


Writing bad addresses to bad_addresses.txt


In [11]:
results_df.to_csv(gc_data, index=False)

In [12]:
results_df

Unnamed: 0,address_orig,formatted_address,geometry_location_type,types,partial_match,geometry
0,"15 PIGEON HOLLOW RD, San Rafael, CA 94901","15 Pigeon Hollow Rd, San Rafael, CA 94901, USA",ROOFTOP,premise,,POINT (-122.50501 37.97307)
1,"49 FOREST LANE, San Rafael, CA 94903","49 Forest Ln, San Rafael, CA 94903, USA",ROOFTOP,street_address,,POINT (-122.56248 38.00931)
2,"2821 PINE ST, San Francisco, CA 94115","2821 Pine St, San Francisco, CA 94115, USA",ROOFTOP,premise,,POINT (-122.44299 37.78655)
3,"1057 GALLEY LN, San Mateo, CA 94404","1057 Galley Ln, Foster City, CA 94404, USA",ROOFTOP,street_address,,POINT (-122.25455 37.54722)
4,"10029 MINNESOTA AVE, Penngrove, CA 94951","10029 Minnesota Ave, Penngrove, CA 94951, USA",ROOFTOP,premise,,POINT (-122.68725 38.30362)
...,...,...,...,...,...,...
784912,"313 CORTEZ DR, Petaluma, CA 94954","313 Cortez Dr, Petaluma, CA 94954, USA",ROOFTOP,premise,,POINT (-122.62425 38.24763)
784913,"28045 THORUP LN, Hayward, CA 94542","28045 Thorup Ln, Hayward, CA 94542, USA",ROOFTOP,premise,,POINT (-122.03852 37.64856)
784914,"3731 MARKET ST, San Francisco, CA 94131","3731 Market St, San Francisco, CA 94131, USA",ROOFTOP,premise,,POINT (-122.44171 37.75418)
784915,"446 MORAGA ST, San Francisco, CA 94122","446 Moraga St, San Francisco, CA 94122, USA",ROOFTOP,street_address,,POINT (-122.4677 37.75668)


In [None]:
# read in the geocoded data to a GeoDataFrame, which has a geometry column
a
