# State Lookup Benchmark

Loads the full-detail TIGER/Line shapefile for the 50 U.S. states + DC, derives a simplified version of their boundaries, generates random points within the contiguous U.S. bounding box, and compares lookup speed and accuracy between the two.

In [149]:
# Imports and file paths
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
import time

# Shapefile to read, and the USPS codes to keep (50 states plus DC).
full_shapefile = 'tl_2024_us_state.shp'
VALID_CODES = {
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    'DC',
}

In [150]:
# 1. Read the full-detail shapefile, reproject it to EPSG:4326, then keep only
#    the rows whose STUSPS code is listed in VALID_CODES (50 states + DC).
all_regions = gpd.read_file(full_shapefile).to_crs(epsg=4326)
keep_mask = all_regions['STUSPS'].isin(VALID_CODES)
states_full = all_regions[keep_mask]
print(f"Loaded and filtered to {len(states_full)} state geometries (50 + DC).")

Loaded and filtered to 51 state geometries (50 + DC).


In [None]:
# 2. Simplify geometries
# The tolerance is expressed in the units of the layer's CRS. The layer was
# reprojected to EPSG:4326, so the unit is degrees: 0.01 degree of latitude is
# roughly 1.1 km (1 degree ~ 111 km), and somewhat less east-west at U.S.
# latitudes.
tol = 0.01  # degrees (~1.1 km of latitude)
simplified_shapes = states_full.geometry.simplify(
    tolerance=tol, preserve_topology=True
)
# Work on a copy so the full-detail frame keeps its original geometries.
states_simp = states_full.copy()
states_simp['geometry'] = simplified_shapes
print(f"Created simplified geometries with tolerance {tol}.")

Created simplified geometries with tolerance 0.1.


In [152]:
# 3. Take the spatial index of each frame (full-detail first, then simplified)
#    and keep a reference to it for the lookups.
states_full_sindex, states_simp_sindex = (
    frame.sindex for frame in (states_full, states_simp)
)

In [153]:
# 4. Generate random points in contiguous U.S. bounding box
# A fixed seed makes the sampled points, and therefore the reported match
# rate, reproducible across kernel restarts.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

min_lon, max_lon = -125.0, -66.9
min_lat, max_lat =   24.5,   49.5
n_points = 1_000_000

# Longitudes are drawn first, then latitudes, each uniformly within its range.
lons = rng.uniform(min_lon, max_lon, n_points)
lats = rng.uniform(min_lat, max_lat, n_points)

# Shape (n_points, 2): one (lon, lat) pair per row.
points = np.vstack((lons, lats)).T

In [154]:
# Lookup function using spatial index
def lookup_state(idx, gdf, lon, lat):
    """Return the ``NAME`` of the first geometry in ``gdf`` containing a point.

    Parameters
    ----------
    idx
        Spatial index whose ``intersection`` method accepts a
        ``(minx, miny, maxx, maxy)`` tuple and yields positional row numbers
        into ``gdf``.
    gdf
        Frame exposing a ``geometry`` accessor and a ``NAME`` column.
    lon, lat
        Coordinates of the query point, passed as x and y respectively.

    Returns
    -------
    The ``NAME`` value of the first candidate whose geometry ``contains`` the
    point, or ``None`` when no candidate does.
    """
    # The index only narrows the search to candidate rows; each candidate
    # still needs an exact containment test.
    candidates = list(idx.intersection((lon, lat, lon, lat)))
    if not candidates:
        return None

    # Build the point once instead of once per candidate.
    point = Point(lon, lat)
    for i in candidates:
        if gdf.geometry.iloc[i].contains(point):
            # Column-first access avoids materialising the whole row.
            return gdf['NAME'].iloc[i]
    return None


In [155]:
# 5. Benchmark full-detail lookup
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# clock is adjusted during the run.
start_full = time.perf_counter()
results_full = [lookup_state(states_full_sindex, states_full, lon, lat) for lon, lat in points]
time_full = time.perf_counter() - start_full
print(f"Full-detail: {time_full:.2f}s total, {time_full/n_points*1e6:.2f}µs per lookup")

Full-detail: 44.62s total, 44.62µs per lookup


In [156]:
# 6. Benchmark simplified lookup
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# clock is adjusted during the run.
start_simp = time.perf_counter()
results_simp = [lookup_state(states_simp_sindex, states_simp, lon, lat) for lon, lat in points]
time_simp = time.perf_counter() - start_simp
print(f"Simplified: {time_simp:.2f}s total, {time_simp/n_points*1e6:.2f}µs per lookup")

Simplified: 21.65s total, 21.65µs per lookup


In [157]:
# 7. Compare accuracy
# A pair counts as a match whenever the two lookups returned equal values.
# That includes pairs where both lookups returned None.
matches = sum(
    name_full == name_simp
    for name_full, name_simp in zip(results_full, results_simp)
)
match_rate = matches / n_points * 100
print(f"Match rate: {match_rate:.6f}% ({matches}/{n_points})")

Match rate: 99.347800% (993478/1000000)
