# Contact Geocode Search - Optimized Version

---

***Using Nominatim/OpenStreetMap to collect geocodes (latitude/longitude) for contact records, with on-disk caching, rate limiting, retries, and fallback address formats.***

---

In [13]:
import json
import numpy as np
import pandas as pd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
from tqdm.notebook import tqdm
import hashlib
import os

In [14]:
# Load the cleaned contact records that the cells below geocode.
contacts_path = '../data/processed/cleaned_contacts.parquet'
df = pd.read_parquet(contacts_path)
df

Unnamed: 0,Ind_PAC_ID,Provider Last Name,Provider First Name,gndr,Cred,pri_spec,sec_spec_1,sec_spec_2,sec_spec_3,sec_spec_4,...,Telehlth,Facility Name,org_pac_id,adr_ln_1,adr_ln_2,City/Town,State,ZIP Code,Telephone Number,Full Address
0,3971628603,OKETOKUN,ADEFOLAJU,F,MD,GENERAL PRACTICE,,,,,...,,,,12321 MIDDLEBROOK RD,SUITE 102,GERMANTOWN,MD,20874,,"12321 MIDDLEBROOK RD, GERMANTOWN, MD 20874"
1,7618130956,RASER-SCHRAMM,JONATHAN,M,MD,NEUROLOGY,,,,,...,Y,UNION HOSPITAL OF CECIL COUNTY,6.901720e+09,106 BOW ST,,ELKTON,MD,21921,4.103984e+09,"106 BOW ST, ELKTON, MD 21921"
2,7315047990,NGUYEN,VAN ANH,F,MD,FAMILY PRACTICE,,,,,...,Y,"PRIVIA MEDICAL GROUP, LLC",4.385682e+09,9470 ANNAPOLIS RD,SUITE 309,LANHAM,MD,20706,2.028322e+09,"9470 ANNAPOLIS RD, LANHAM, MD 20706"
3,3870748189,JIMENEZ-TOSADO,ZAHIR,F,MD,FAMILY PRACTICE,,,,,...,,KAISER FOUNDATION HEALTH PLAN OF THE MID ATLAN...,3.779496e+09,1221 MERCANTILE LN,,LARGO,MD,20774,3.016186e+09,"1221 MERCANTILE LN, LARGO, MD 20774"
4,143665364,UENO,ANN,F,,MENTAL HEALTH COUNSELOR,,,,,...,,CALVERT COUNTY HEALTH DEPARTMENT,7.214188e+09,975 SOLOMONS ISLAND RD N,SUITE 119,PRINCE FREDERICK,MD,20678,4.105355e+12,"975 SOLOMONS ISLAND RD N, PRINCE FREDERICK, MD..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7175,6103986922,BERGER,YANIV,M,DO,FAMILY PRACTICE,,,,,...,Y,,,300 FREDERICK RD,SUITE 102,CATONSVILLE,MD,21228,4.106510e+09,"300 FREDERICK RD, CATONSVILLE, MD 21228"
7176,6103986922,BERGER,YANIV,M,DO,FAMILY PRACTICE,,,,,...,Y,LIFEBRIDGE COMMUNITY PHYSICIANS INC,3.678751e+09,2700 QUARRY LAKE DR,SUITE 280,BALTIMORE,MD,21209,4.434713e+09,"2700 QUARRY LAKE DR, BALTIMORE, MD 21209"
7177,6103986922,BERGER,YANIV,M,DO,FAMILY PRACTICE,,,,,...,Y,AGAPE PHYSICAL THERAPY AND SPORTS REHABILITATI...,3.870741e+09,12 NEWPORT DR,SUITE A,FOREST HILL,MD,21050,,"12 NEWPORT DR, FOREST HILL, MD 21050"
7178,3173967031,SWETLOW,TRACY,F,,MENTAL HEALTH COUNSELOR,,,,,...,Y,QUINCE ORCHARD PSYCHOTHERAPY LLC,5.395194e+09,10 N JEFFERSON ST,SUITE 202,FREDERICK,MD,21701,2.407506e+09,"10 N JEFFERSON ST, FREDERICK, MD 21701"


In [15]:
# Cache file setup
CACHE_FILE = '../data/processed/geocode_cache.json'

# Load existing cache or create new one
if os.path.exists(CACHE_FILE):
    with open(CACHE_FILE, 'r') as f:
        geocode_cache = json.load(f)
    print(f"Loaded {len(geocode_cache)} cached addresses")
else:
    geocode_cache = {}
    print("Starting with empty cache")

# Initialize Nominatim geocoder with rate limiter
print("Initializing Nominatim geocoder...")
geolocator = Nominatim(user_agent="MedMatch_Geocoder/1.0 (bmccarty505@gmail.com)", timeout=10)
# Use 1.5 second delay to be more conservative and avoid 403 errors.
# swallow_exceptions=False matters: by default RateLimiter swallows service
# errors after its own retries and returns None, which is indistinguishable
# from "address not found". Re-raising lets the caller tell a transient
# failure (timeout, 403, outage) apart from a genuine miss and handle it.
geocode_api = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=1.5,
    swallow_exceptions=False,
)

# Test API connection (one direct call, not routed through the rate limiter)
print("Testing API connection...")
try:
    test_location = geolocator.geocode("Baltimore, MD")
    if test_location:
        print(f"✓ API test successful! Found: {test_location.address}")
        print(f"  Coordinates: ({test_location.latitude}, {test_location.longitude})")
    else:
        print("⚠ API test returned no results")
except Exception as e:
    # Report and continue: the connection test is informational only.
    print(f"✗ API test failed: {type(e).__name__}: {str(e)}")
    print("Please check your internet connection and try again.")

# Function to create hash for address (for cache key)
def hash_address(address):
    """Return the MD5 hex digest of the lower-cased, whitespace-trimmed address."""
    normalized = address.lower().strip()
    digest = hashlib.md5(normalized.encode())
    return digest.hexdigest()

# Function to format address for geocoding with fallback options
def format_address(row, fallback_level=0):
    """
    Build a comma-separated address string from a contact row.

    fallback_level 0: adr_ln_1, adr_ln_2, City/Town, State, ZIP Code
    fallback_level 1: same as level 0 but without adr_ln_2
    any other level:  ZIP Code followed by State

    Components that are missing/NaN are skipped; an empty adr_ln_2 is skipped
    as well. Returns '' when nothing is available.
    """
    pieces = []

    def add_if_present(value):
        # Append the stringified value unless pandas considers it missing.
        if pd.notna(value):
            pieces.append(str(value))

    if fallback_level == 0 or fallback_level == 1:
        add_if_present(row['adr_ln_1'])
        if fallback_level == 0:
            # Second address line (suite/apt) is only used on the primary attempt.
            suite = row.get('adr_ln_2', None)
            if pd.notna(suite) and str(suite) != '':
                pieces.append(str(suite))
        add_if_present(row.get('City/Town', None))
        add_if_present(row['State'])
        add_if_present(row['ZIP Code'])
    else:
        # Coarsest form: general area only.
        add_if_present(row['ZIP Code'])
        add_if_present(row['State'])

    return ', '.join(pieces)

# Function to geocode a single address with caching and retry logic
def geocode_address(address, max_retries=3, show_errors=False):
    """
    Geocode an address string, consulting and updating the in-memory cache.

    Parameters:
        address: address string to look up.
        max_retries: maximum number of API attempts when the call raises.
        show_errors: when True, print retry/failure diagnostics.

    Returns:
        (lat, lon) on success, otherwise (None, None).

    Cache behaviour:
        - Successful lookups and genuine "not found" answers are cached and
          served from the cache on later calls.
        - A lookup that failed with an exception is recorded with an 'error'
          key for diagnostics, but such entries are NOT treated as cache hits:
          the address is looked up again, so a transient outage does not
          permanently mark an address as un-geocodable.
        - Blank addresses return (None, None) without an API call or a cache
          entry.
    """
    # Nothing to look up: skip the API round-trip (and its retries) entirely.
    if not address or not address.strip():
        return None, None

    # Check cache first; entries that only record an earlier exception are retried.
    addr_hash = hash_address(address)
    cached_result = geocode_cache.get(addr_hash)
    if cached_result is not None and 'error' not in cached_result:
        return cached_result.get('lat'), cached_result.get('lon')

    # Not in cache (or only a stale error), make API call with retries
    for attempt in range(max_retries):
        try:
            location = geocode_api(address)
        except Exception as e:
            error_msg = f"{type(e).__name__}: {str(e)}"

            if attempt >= max_retries - 1:
                # Out of attempts: record the failure and give up without
                # sleeping, since no further request will follow.
                if show_errors:
                    print(f"  Failed after {max_retries} attempts: {address[:50]}... - {error_msg}")
                geocode_cache[addr_hash] = {
                    'lat': None,
                    'lon': None,
                    'address': address,
                    'error': error_msg,
                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                }
                return None, None

            # Check if it's a 403 error (rate limiting/blocking)
            if "403" in str(e) or "GeocoderInsufficientPrivileges" in type(e).__name__:
                if show_errors:
                    print("  ⚠ Rate limit detected, waiting longer before retry...")
                time.sleep(5)  # Extra back-off on top of the normal retry wait

            if show_errors:
                print(f"  Retry {attempt + 1}/{max_retries} for: {address[:50]}... - {error_msg}")
            time.sleep(3)  # Wait before the next attempt
        else:
            if location:
                lat, lon = location.latitude, location.longitude
                # Save to cache
                geocode_cache[addr_hash] = {
                    'lat': lat,
                    'lon': lon,
                    'address': address,
                    'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                }
                return lat, lon

            # Cache the null result too to avoid repeated lookups
            geocode_cache[addr_hash] = {
                'lat': None,
                'lon': None,
                'address': address,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
                'reason': 'not_found'
            }
            return None, None

    # Only reached when max_retries <= 0 (no attempt was made).
    return None, None

# Function to save cache quietly
def save_cache(silent=True):
    """
    Save the current cache to disk.

    The JSON is written to a sibling temporary file first and then moved over
    CACHE_FILE with os.replace, so an interruption or serialization error
    part-way through the dump leaves the previous cache file intact instead of
    a truncated one.

    Parameters:
        silent: when False, print a confirmation with the number of entries.
    """
    tmp_path = f"{CACHE_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        json.dump(geocode_cache, f, indent=2)
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, CACHE_FILE)
    if not silent:
        print(f"Cache saved ({len(geocode_cache)} entries)")

# Build the full address per row, then de-duplicate so each distinct address
# is geocoded only once.
print("\nIdentifying unique addresses...")
df['full_address'] = df.apply(format_address, axis=1)
unique_addresses = df['full_address'].unique()
print(f"Found {len(unique_addresses)} unique addresses out of {len(df)} total records")

# Make sure the coordinate columns exist (existing columns are left untouched)
if 'latitude' not in df:
    df['latitude'] = None
if 'longitude' not in df:
    df['longitude'] = None

# original address string -> (lat, lon) result
address_to_coords = dict()

# Announce the run, then pause briefly before the first request
print(
    f"\nStarting geocoding for {len(unique_addresses)} unique addresses...",
    "(Errors will be shown for the first 5 failed addresses)",
    "Waiting 2 seconds before starting to respect rate limits...\n",
    sep="\n",
)
time.sleep(2)  # Initial delay to avoid immediate rate limiting

# Run statistics, all starting from zero
cache_hits = api_calls = 0
fallback_1_used = fallback_2_used = 0
geocoded_so_far = error_count = 0
failed_addresses = []

# Create progress bar with detailed statistics
pbar = tqdm(unique_addresses, desc="Geocoding addresses", unit="addr")

# Index label of the first row carrying each formatted address, computed once
# so a failed lookup does not have to rescan the whole frame to find its row.
first_rows = df.drop_duplicates(subset='full_address', keep='first')
first_index_by_address = dict(zip(first_rows['full_address'], first_rows.index))

try:
    for i, address in enumerate(pbar):
        addr_hash = hash_address(address)

        # Track cache hits vs API calls (based on the primary address only)
        if addr_hash in geocode_cache:
            cache_hits += 1
        else:
            api_calls += 1

        # Show errors for first few failed attempts
        show_errors = error_count < 5

        # Try primary address
        lat, lon = geocode_address(address, show_errors=show_errors)

        # If geocoding failed, try progressively coarser address forms:
        # level 1 drops adr_ln_2, level 2 is ZIP + State only.
        if lat is None and lon is None:
            error_count += 1
            fallback_row = df.loc[first_index_by_address[address]]
            tried = [address]

            for fallback_level in (1, 2):
                fallback_address = format_address(fallback_row, fallback_level=fallback_level)
                # Skip blank forms and forms identical to one already attempted.
                if not fallback_address or fallback_address in tried:
                    continue
                tried.append(fallback_address)

                lat, lon = geocode_address(fallback_address, show_errors=show_errors)
                if lat is not None:
                    if fallback_level == 1:
                        fallback_1_used += 1
                    else:
                        fallback_2_used += 1
                    # Cache the result under the ORIGINAL address too
                    geocode_cache[addr_hash] = {
                        'lat': lat,
                        'lon': lon,
                        'address': address,
                        'fallback_used': fallback_level,
                        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
                    }
                    break

        # Store result mapped to original address
        address_to_coords[address] = (lat, lon)

        if lat is not None:
            geocoded_so_far += 1
        else:
            failed_addresses.append(address)

        # Update progress bar with statistics
        success_rate = geocoded_so_far / (i + 1) * 100
        pbar.set_postfix({
            'Success': f'{success_rate:.1f}%',
            'Cache': cache_hits,
            'API': api_calls,
            'Failed': len(failed_addresses)
        })

        # Save cache every 50 addresses (silently)
        if (i + 1) % 50 == 0:
            save_cache(silent=True)
finally:
    # Runs on normal completion, on an error, and on a manual interrupt, so
    # results gathered since the last periodic save are not lost.
    pbar.close()
    save_cache(silent=True)

# Map geocoded results back to original dataframe
print("\nMapping results back to dataframe...")

# One lookup table per coordinate so each column is filled with a single
# vectorized map instead of one .loc write per row.
lat_by_address = {addr: coords[0] for addr, coords in address_to_coords.items()}
lon_by_address = {addr: coords[1] for addr, coords in address_to_coords.items()}

# Rows whose address has no entry in address_to_coords keep their current value.
has_result = df['full_address'].isin(list(address_to_coords))
matched_addresses = df.loc[has_result, 'full_address']
# .to_numpy() makes the assignment positional, so it does not depend on the
# frame's index labels being unique.
df.loc[has_result, 'latitude'] = matched_addresses.map(lat_by_address).to_numpy()
df.loc[has_result, 'longitude'] = matched_addresses.map(lon_by_address).to_numpy()

# Final cache save
save_cache(silent=False)

# Summary statistics: record-level success plus cache/API/fallback counters
geocoded_count = df['latitude'].notna().sum()
total_records = len(df)
unique_total = len(unique_addresses)

summary_lines = [
    f"\n{'='*60}",
    "Geocoding complete!",
    f"{'='*60}",
    f"Total records: {total_records}",
    f"Unique addresses: {unique_total}",
    f"Successfully geocoded: {geocoded_count}/{total_records} records ({geocoded_count/total_records*100:.1f}%)",
    f"Failed to geocode: {total_records - geocoded_count} records",
    f"Cache hits: {cache_hits}/{unique_total} ({cache_hits/unique_total*100:.1f}%)",
    f"API calls made: {api_calls}/{unique_total} ({api_calls/unique_total*100:.1f}%)",
    f"Fallback 1 successes (no adr_ln_2): {fallback_1_used}",
    f"Fallback 2 successes (ZIP+State): {fallback_2_used}",
    f"Total cached addresses: {len(geocode_cache)}",
    f"Cache saved to: {CACHE_FILE}",
]
print("\n".join(summary_lines))

# List at most the first 10 addresses that could not be geocoded
failed_total = len(failed_addresses)
if failed_total > 0:
    print(f"\n{failed_total} addresses failed to geocode:")
    shown, hidden = failed_addresses[:10], failed_addresses[10:]
    for addr in shown:
        print(f"  - {addr}")
    if hidden:
        print(f"  ... and {len(hidden)} more")

# Remove temporary column
df = df.drop(columns='full_address')

df

Starting with empty cache
Initializing Nominatim geocoder...
Testing API connection...
✓ API test successful! Found: Baltimore, Maryland, United States
  Coordinates: (39.2908816, -76.610759)

Identifying unique addresses...
Found 2168 unique addresses out of 7180 total records

Starting geocoding for 2168 unique addresses...
(Errors will be shown for the first 5 failed addresses)
Waiting 2 seconds before starting to respect rate limits...



Geocoding addresses:   0%|          | 0/2168 [00:00<?, ?addr/s]


Mapping results back to dataframe...


Populating dataframe:   0%|          | 0/7180 [00:00<?, ?row/s]

Cache saved (3094 entries)

Geocoding complete!
Total records: 7180
Unique addresses: 2168
Successfully geocoded: 7180/7180 records (100.0%)
Failed to geocode: 0 records
Cache hits: 55/2168 (2.5%)
API calls made: 2113/2168 (97.5%)
Fallback 1 successes (no adr_ln_2): 982
Fallback 2 successes (ZIP+State): 337
Total cached addresses: 3094
Cache saved to: ../data/processed/geocode_cache.json


Unnamed: 0,Ind_PAC_ID,Provider Last Name,Provider First Name,gndr,Cred,pri_spec,sec_spec_1,sec_spec_2,sec_spec_3,sec_spec_4,...,org_pac_id,adr_ln_1,adr_ln_2,City/Town,State,ZIP Code,Telephone Number,Full Address,latitude,longitude
0,3971628603,OKETOKUN,ADEFOLAJU,F,MD,GENERAL PRACTICE,,,,,...,,12321 MIDDLEBROOK RD,SUITE 102,GERMANTOWN,MD,20874,,"12321 MIDDLEBROOK RD, GERMANTOWN, MD 20874",39.176383,-77.255398
1,7618130956,RASER-SCHRAMM,JONATHAN,M,MD,NEUROLOGY,,,,,...,6.901720e+09,106 BOW ST,,ELKTON,MD,21921,4.103984e+09,"106 BOW ST, ELKTON, MD 21921",39.609262,-75.832051
2,7315047990,NGUYEN,VAN ANH,F,MD,FAMILY PRACTICE,,,,,...,4.385682e+09,9470 ANNAPOLIS RD,SUITE 309,LANHAM,MD,20706,2.028322e+09,"9470 ANNAPOLIS RD, LANHAM, MD 20706",38.965486,-76.845115
3,3870748189,JIMENEZ-TOSADO,ZAHIR,F,MD,FAMILY PRACTICE,,,,,...,3.779496e+09,1221 MERCANTILE LN,,LARGO,MD,20774,3.016186e+09,"1221 MERCANTILE LN, LARGO, MD 20774",38.907034,-76.835604
4,143665364,UENO,ANN,F,,MENTAL HEALTH COUNSELOR,,,,,...,7.214188e+09,975 SOLOMONS ISLAND RD N,SUITE 119,PRINCE FREDERICK,MD,20678,4.105355e+12,"975 SOLOMONS ISLAND RD N, PRINCE FREDERICK, MD...",38.558067,-76.597822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7175,6103986922,BERGER,YANIV,M,DO,FAMILY PRACTICE,,,,,...,,300 FREDERICK RD,SUITE 102,CATONSVILLE,MD,21228,4.106510e+09,"300 FREDERICK RD, CATONSVILLE, MD 21228",39.273545,-76.725737
7176,6103986922,BERGER,YANIV,M,DO,FAMILY PRACTICE,,,,,...,3.678751e+09,2700 QUARRY LAKE DR,SUITE 280,BALTIMORE,MD,21209,4.434713e+09,"2700 QUARRY LAKE DR, BALTIMORE, MD 21209",39.384434,-76.69151
7177,6103986922,BERGER,YANIV,M,DO,FAMILY PRACTICE,,,,,...,3.870741e+09,12 NEWPORT DR,SUITE A,FOREST HILL,MD,21050,,"12 NEWPORT DR, FOREST HILL, MD 21050",39.578223,-76.381403
7178,3173967031,SWETLOW,TRACY,F,,MENTAL HEALTH COUNSELOR,,,,,...,5.395194e+09,10 N JEFFERSON ST,SUITE 202,FREDERICK,MD,21701,2.407506e+09,"10 N JEFFERSON ST, FREDERICK, MD 21701",39.41283,-77.420984


In [17]:
# Persist the geocoded contacts as zstd-compressed parquet, without the index.
geocoded_path = '../data/processed/Geocoded_Contacts.parquet'
df.to_parquet(geocoded_path, index=False, compression='zstd')