In [1]:
import getpass
import logging
import pathlib
import pandas as pd
import geopandas as gpd
from dvutils.miscio import log_or_print
from dvutils.geospatial import google_geocode, google_geocode_batch

# Login name of the person running the notebook; used later to build the
# per-user Box sync folder path.
user = getpass.getuser()

Info: Found credentials at: /Users/jcroff/Library/CloudStorage/Box-Box/dvutils-creds-jcroff.json


In [2]:
def setup_logger(logger_name, output_dir):
    """Set up a file-backed logger with the specified name and output directory.

    The function is safe to call repeatedly (for example when the notebook
    cell is re-run): a file handler for the same log file is attached only
    once, so messages are not written multiple times.

    Args:
        logger_name: Name passed to ``logging.getLogger``; also used as the
            stem of the log file.
        output_dir: Directory that holds ``<logger_name>.log``. It is created
            (including parents) when it does not exist yet.

    Returns:
        logging.Logger: Logger set to INFO level that writes to
        ``<output_dir>/<logger_name>.log``.
    """
    logger = logging.getLogger(logger_name)
    logger.setLevel(logging.INFO)

    # FileHandler does not create missing directories, so make sure it exists.
    log_dir = pathlib.Path(output_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    log_file = log_dir / f"{logger_name}.log"

    # logging.getLogger returns the same object for the same name, so a second
    # call would otherwise stack another handler and duplicate every record.
    target = log_file.resolve()
    for handler in logger.handlers:
        if (
            isinstance(handler, logging.FileHandler)
            and pathlib.Path(handler.baseFilename).resolve() == target
        ):
            return logger

    # Create a file handler with a timestamped record format
    file_handler = logging.FileHandler(log_file)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)

    # Add the handler to the logger
    logger.addHandler(file_handler)

    return logger

In [3]:
# Root of the FasTrak project inside the current user's Box sync folder.
work_dir = pathlib.Path(f"/Users/{user}/Library/CloudStorage/Box-Box/DataViz Projects/Data Services/FasTrak Data")
# Cleaned account export that is read in as the geocoding input.
ft_data = work_dir.joinpath("Fastrak Accounts Cleaned", "bay_area_fastrak_accounts_cleaned.csv")

In [4]:
# Notebook-wide logger; with these arguments setup_logger attaches a file
# handler for "Logs/fastrak_geocoding.log" (relative to the working directory).
LOGGER = setup_logger(logger_name="fastrak_geocoding", output_dir="Logs")

In [5]:
def read_fastrak_data(file_path):
    """Load the Fastrak accounts CSV into a DataFrame.

    Logs the source path before reading and the number of rows afterwards.

    Args:
        file_path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The parsed contents of the file.
    """
    log_or_print(f"Reading Fastrak data from {file_path}", LOGGER)
    accounts = pd.read_csv(file_path)
    log_or_print(f"Read {len(accounts)} records from Fastrak data", LOGGER)
    return accounts

In [6]:
def _zip_to_text(value):
    """Render one ZIP_CODE cell as text, dropping a spurious trailing '.0'."""
    # Whole-number floats (e.g. 94901.0) are printed as integers; NaN and every
    # non-float value keep their plain str() form.
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value)


def create_required_cols(df):
    """Create the required columns for geocoding.

    Builds ``FULL_ADDRESS`` as ``"<ADDR>, <CITY>, <STATE> <ZIP_CODE>"`` on a
    copy of ``df``; the input frame is not modified.

    Args:
        df: DataFrame containing the columns ADDR, CITY, STATE and ZIP_CODE.

    Returns:
        pandas.DataFrame: Copy of ``df`` with the added ``FULL_ADDRESS`` column.
        Rows where ADDR, CITY or STATE is missing end up with a missing
        ``FULL_ADDRESS``; the number of such rows is logged.

    Raises:
        ValueError: If any of the required columns is absent.
    """
    log_or_print("Creating required columns for geocoding", LOGGER)

    required_columns = ["ADDR", "CITY", "STATE", "ZIP_CODE"]

    # Check if all required columns exist in the DataFrame
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        error_message = f"Missing required columns: {', '.join(missing_columns)}"
        log_or_print(error_message, LOGGER)
        raise ValueError(error_message)

    df = df.copy()

    # When ZIP_CODE is a float column (pandas uses float64 for numeric columns
    # that contain blanks), astype(str) alone would yield "94901.0" and corrupt
    # the address sent to the geocoder.
    zip_text = df["ZIP_CODE"].map(_zip_to_text).astype(str)
    df["FULL_ADDRESS"] = df["ADDR"] + ", " + df["CITY"] + ", " + df["STATE"] + " " + zip_text

    # String concatenation propagates missing values, so surface how many rows
    # cannot be geocoded instead of letting them pass through silently.
    incomplete = int(df["FULL_ADDRESS"].isna().sum())
    if incomplete:
        log_or_print(
            f"{incomplete} records have an incomplete address (missing ADDR, CITY or STATE)",
            LOGGER,
        )

    log_or_print("Created FULL_ADDRESS column", LOGGER)
    return df

In [7]:
# Read the cleaned FasTrak accounts CSV (ft_data) into a DataFrame; the read
# is logged through LOGGER by read_fastrak_data.
df = read_fastrak_data(ft_data)

In [10]:
# The batch geocoder is fed from FULL_ADDRESS. Build that column here when it
# is absent so the cell also works after Restart Kernel -> Run All instead of
# depending on a cell that is no longer in the notebook.
if "FULL_ADDRESS" not in df.columns:
    df = create_required_cols(df)

log_or_print(f"Starting batch address geocoding on {len(df)} records", LOGGER)
results_df = google_geocode_batch(
    address_list=df["FULL_ADDRESS"].tolist(),
    include_details=True,
    # Keep only precise matches; other location types are not accepted.
    allowed_location_types=["ROOFTOP", "RANGE_INTERPOLATED"],
)
log_or_print(f"Finished batch address geocoding. {len(results_df)} geocoded", LOGGER)

# check for bad results by checking if bad_addresses.txt file exists
# NOTE(review): the file is looked up in the working directory and may be left
# over from an earlier run -- confirm how google_geocode_batch writes it.
bad_address_file = "bad_addresses.txt"
if pathlib.Path(bad_address_file).exists():
    log_or_print(f"Bad addresses written to {bad_address_file}", LOGGER)
    with open(bad_address_file, "r") as f:
        bad_addresses = f.read().splitlines()
    log_or_print(f"Found {len(bad_addresses)} bad addresses", LOGGER)


In [21]:
# Columns shown when inspecting geocoding output: the submitted address, the
# matched address, match-quality fields and the point geometry.
out_cols = [
    "address_orig", "formatted_address",
    "geometry_location_type", "place_id", "types", "partial_match",
    "geometry",
]
# NOTE(review): test_geocode_results is not defined in any visible cell --
# confirm where it comes from before relying on a fresh-kernel run.
test_geocode_results.loc[:, out_cols]

Unnamed: 0,address_orig,formatted_address,geometry_location_type,place_id,types,partial_match,geometry
0,"15 PIGEON HOLLOW RD, San Rafael, CA 94901","15 Pigeon Hollow Rd, San Rafael, CA 94901, USA",ROOFTOP,ChIJKZ1plCeahYARoUbo8184hA8,premise,,POINT (-122.50501 37.97307)
1,"49 FOREST LANE, San Rafael, CA 94903","49 Forest Ln, San Rafael, CA 94903, USA",ROOFTOP,ChIJ11KVY76XhYARHLGgctWqis0,street_address,,POINT (-122.56248 38.00931)
2,"2821 PINE ST, San Francisco, CA 94115","2821 Pine St, San Francisco, CA 94115, USA",ROOFTOP,ChIJeQzPs8uAhYARQbnIeZBvm3Q,premise,,POINT (-122.44299 37.78655)
3,"1057 GALLEY LN, San Mateo, CA 94404","1057 Galley Ln, Foster City, CA 94404, USA",ROOFTOP,ChIJ-9KmcdCYj4ARhi5GxlF92S4,street_address,,POINT (-122.25455 37.54722)
4,"10029 MINNESOTA AVE, Penngrove, CA 94951","10029 Minnesota Ave, Penngrove, CA 94951, USA",ROOFTOP,ChIJ9dqyAM9KhIARde_8x5hc6tY,premise,,POINT (-122.68725 38.30362)
5,"3049 GOLDEN RAIN RD, Walnut Creek, CA 94595","3049 Golden Rain Rd, Walnut Creek, CA 94595, USA",ROOFTOP,ChIJjbEQDexjhYARlgZoBpS9O9k,street_address,,POINT (-122.08522 37.87965)
6,"PO BOX 332, Vallejo, CA 94590","Vallejo, CA 94590, USA",APPROXIMATE,ChIJJ0YjG2xzhYARajO53cKJbEo,postal_code,True,POINT (-122.24837 38.10525)
7,"286 SAN RAFAEL AVE., Belvedere, CA 94920","286 San Rafael Ave, Belvedere Tiburon, CA 9492...",ROOFTOP,ChIJ7d9OMvOEhYARJI7E7obQMeE,premise,,POINT (-122.46963 37.87727)
8,"45 DEER RUN, Corte Madera, CA 94925","45 Deer Run, Corte Madera, CA 94925, USA",ROOFTOP,ChIJbyoI25KahYARAFwx-PWZf7s,premise,,POINT (-122.51554 37.92137)
9,"808 SMITH RD, Mill Valley, CA 94941","808 Smith Rd, Mill Valley, CA 94941, USA",ROOFTOP,ChIJ9RB2JPePhYARmReD8ByAQDU,premise,,POINT (-122.54191 37.87515)
