In [2]:
import json
import os
import time
import warnings

import pandas as pd
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from sklearn.datasets import fetch_california_housing
from tqdm import tqdm

# Load the California Housing dataset into a DataFrame. The feature columns
# already include 'Latitude' and 'Longitude', so only the target is added.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['Target'] = data.target

# Setup geolocator with rate limiting (min_delay_seconds=1 between calls).
# NOTE(review): Nominatim's usage policy asks for a user agent that identifies
# the application; "geoapi" is generic - consider replacing it.
geolocator = Nominatim(user_agent="geoapi")
reverse = RateLimiter(geolocator.reverse, min_delay_seconds=1)

# Load previously looked-up results so a re-run does not repeat requests.
cache_file = 'geocode_cache.json'
try:
    with open(cache_file, 'r') as f:
        cache = json.load(f)
except FileNotFoundError:
    # First run: nothing has been cached yet.
    cache = {}
except json.JSONDecodeError as exc:
    # A write that was interrupted part-way can leave invalid JSON behind;
    # start from an empty cache instead of failing the whole cell.
    print(f"Ignoring unreadable cache file {cache_file!r}: {exc}")
    cache = {}

# Function to get full address with caching
def get_full_address(lat, lon):
    """Return the reverse-geocoded address for a coordinate pair.

    Results are memoised in the module-level ``cache`` dict under the key
    ``"<lat>,<lon>"``. Each time the cache size reaches a multiple of 100 it
    is written to ``cache_file`` so an interrupted run can resume without
    repeating requests.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude, passed as a tuple to the module-level
        rate-limited ``reverse`` callable.

    Returns
    -------
    str
        The ``address`` attribute of the geocoder result, or the sentinel
        string ``"Error"`` when the lookup raises or yields no result.
        Failures are not cached, so they are retried on the next call.
    """
    key = f"{lat},{lon}"

    # Serve repeated coordinates from the cache without a network request.
    if key in cache:
        return cache[key]

    try:
        location = reverse((lat, lon), exactly_one=True)
    except Exception as exc:
        # Best-effort: keep processing the remaining rows, but surface the
        # cause instead of discarding it silently.
        warnings.warn(
            f"Reverse geocoding failed for {key}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return "Error"

    # No usable result (e.g. the geocoder found nothing for this point).
    if location is None:
        return "Error"

    full_address = location.address
    cache[key] = full_address

    # Persist periodically. This is kept outside the lookup's try block so a
    # failed write cannot turn a successful lookup into "Error".
    if len(cache) % 100 == 0:
        tmp_path = f"{cache_file}.tmp"
        try:
            # Write to a sibling file and swap it in, so an interruption
            # mid-write never leaves a truncated cache file behind.
            with open(tmp_path, 'w') as f:
                json.dump(cache, f)
            os.replace(tmp_path, cache_file)
        except OSError as exc:
            warnings.warn(
                f"Could not write geocode cache to {cache_file!r}: {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )

    return full_address

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()

# Reverse-geocode every row and store the result in a new column 'Full_Address'.
try:
    df['Full_Address'] = df.progress_apply(
        lambda row: get_full_address(row['Latitude'], row['Longitude']),
        axis=1,
    )
finally:
    # get_full_address only writes the cache at every 100th entry. Flush it
    # here as well - including when the run is interrupted - so the most
    # recent lookups are not lost and a re-run can resume from them.
    cache_tmp = f"{cache_file}.tmp"
    with open(cache_tmp, 'w') as f:
        json.dump(cache, f)
    # Swap the finished file in so an interruption cannot truncate the cache.
    os.replace(cache_tmp, cache_file)

# Save the DataFrame with the new address column.
df.to_csv('geocoded_full_address.csv', index=False)

print(df.head())  # To check the first few rows of the dataset

  4%|▍         | 845/20640 [05:00<2:51:32,  1.92it/s]RateLimiter caught an error, retrying (0/2 tries). Called with (*((np.float64(37.58), np.float64(-122.07)),), **{'exactly_one': True}).
Traceback (most recent call last):
  File "/opt/miniconda3/envs/llm_training/lib/python3.13/site-packages/geopy/adapters.py", line 298, in get_text
    page = self.urlopen(req, timeout=timeout)
  File "/opt/miniconda3/envs/llm_training/lib/python3.13/urllib/request.py", line 489, in open
    response = self._open(req, data)
  File "/opt/miniconda3/envs/llm_training/lib/python3.13/urllib/request.py", line 506, in _open
    result = self._call_chain(self.handle_open, protocol, protocol +
                              '_open', req)
  File "/opt/miniconda3/envs/llm_training/lib/python3.13/urllib/request.py", line 466, in _call_chain
    result = func(*args)
  File "/opt/miniconda3/envs/llm_training/lib/python3.13/urllib/request.py", line 1367, in https_open
    return self.do_open(http.client.HTTPSConnec

KeyboardInterrupt: 