In [1]:
from google import genai
from dotenv import load_dotenv
import os
import pandas as pd

In [None]:
load_dotenv()


In [None]:
prompt_rules = ""

In [2]:
%pip install geopy

Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.1-py3-none-any.whl.metadata (1.6 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
Downloading geographiclib-2.1-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.1 geopy-2.4.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

# --- 1. Load your data ---
# Make sure 'unique_event_locations.csv' is in the same directory as your script.
try:
    df = pd.read_csv('../../data/raw/unique_event_locations.csv')
    print("Successfully loaded 'unique_event_locations.csv'.")
    print(f"Found {len(df)} locations to geocode.")
except FileNotFoundError:
    print("Error: Could not find 'unique_event_locations.csv'.")
    print("Please make sure the file is in the correct directory and run the script again.")
    exit() # Exit the script if the file is not found.

# --- 2. Set up the Geocoder ---
# We use Nominatim, a free service based on OpenStreetMap.
# The user_agent can be any name you choose for your app.
geolocator = Nominatim(user_agent="my-location-geocoder-app")

# IMPORTANT: To avoid overwhelming the free service, we must not send requests too quickly.
# This RateLimiter automatically adds a 1-second delay between each request.
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# --- 3. Define the Geocoding Function ---
# This function creates a clean, specific address to get the best results.
def get_coordinates(row):
    try:
        # For better accuracy, combine the main street, borough, and state.
        # Takes the part of the address before "between" or "and" for a cleaner query.
        main_location = str(row['event_location']).split(' between ')[0].split(' and ')[0].strip()
        full_address = f"{main_location}, {row['event_borough']}, NY"
        
        print(f"Processing address: {full_address}") # Lets you watch the progress
        
        location_data = geocode(full_address)
        
        # If the geocoder finds a location, return its lat/lon.
        if location_data:
            return pd.Series([location_data.latitude, location_data.longitude])
        # If no location is found, return empty values.
        else:
            return pd.Series([None, None])
    except Exception as e:
        # Handle potential errors in the data or during the geocoding call
        print(f"An error occurred for row {row.name}: {e}")
        return pd.Series([None, None])

# --- 4. Apply the function to all rows ---
# This creates two new columns to hold the latitude and longitude.
# This step can take a while for a large number of addresses.
print("\nStarting the geocoding process. This may take some time...")
df[['latitude', 'longitude']] = df.apply(get_coordinates, axis=1)

# --- 5. Save the final result ---
output_filename = 'locations_with_coordinates.csv'
df.to_csv(output_filename, index=False)

print("\n------------------")
print("Geocoding complete!")
print(f"Results saved to '{output_filename}'")
print("\nHere is a preview of your new data:")
print(df.head())

In [None]:
import pandas as pd
import os
import json
import time
from geopy.geocoders import Nominatim
from google import genai
from dotenv import load_dotenv
from geopy.extra.rate_limiter import RateLimiter

# --- 1. SETUP ---
try:
    # genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    # ai_client = genai.GenerativeModel('gemini-pro')
    ai_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))


    print("Gemini AI client configured successfully.")
except (KeyError, TypeError):
    print("Error: GEMINI_API_KEY not found in environment variables.")
    exit()

# Configure geopy
geolocator = Nominatim(user_agent="geopy-batch-geocoder")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# --- 2. LOAD DATA ---
try:
    df = pd.read_csv('../../data/raw/unique_event_locations.csv')
    # Initialize lat/lon columns
    df['latitude'] = None
    df['longitude'] = None
    print(f"Successfully loaded 'unique_event_locations.csv' with {len(df)} rows.")
except FileNotFoundError:
    print("Error: Could not find 'unique_event_locations.csv'.")
    exit()

# --- 3. FIRST PASS: GEOCODE WITH GEOPY ---
print("\n--- Starting Pass 1: Geocoding with Geopy ---")
failed_locations = []

for index, row in df.iterrows():
    try:
        main_location = str(row['event_location']).split(' between ')[0].split(' and ')[0].strip()
        full_address = f"{main_location}, {row['event_borough']}, NY"
        
        location_data = geocode(full_address)
        
        if location_data:
            df.at[index, 'latitude'] = location_data.latitude
            df.at[index, 'longitude'] = location_data.longitude
            print(f"✅ Geopy SUCCESS for: {full_address}")
        else:
            print(f"🤔 Geopy FAILED for: {full_address}")
            # Add the full address and original index to the list of failures
            failed_locations.append({'index': index, 'address': full_address})
            
    except Exception as e:
        print(f"Geopy error for row {index}: {e}")
        failed_locations.append({'index': index, 'address': full_address})

print(f"\nGeopy pass complete. {len(failed_locations)} locations failed and will be sent to Gemini.")

# --- 4. SECOND PASS: BATCH GEOCODE FAILED LOCATIONS WITH GEMINI ---
if failed_locations:
    print("\n--- Starting Pass 2: Batch Geocoding with Gemini ---")

    # Create a numbered list of addresses for the prompt
    address_list_for_prompt = "\n".join([f"{i+1}. \"{item['address']}\"" for i, item in enumerate(failed_locations)])

    # This prompt is critical for getting structured JSON back
    batch_prompt = f"""
    You are an expert geocoding assistant.
    For each numbered location in the list below, provide its latitude and longitude.
    Return your response as a single, valid JSON object.
    The JSON object should have the original address string as the key, and the value should be another object containing "latitude" and "longitude".
    If you cannot find coordinates for an address, use a value of null.

    Here are the locations:
    {address_list_for_prompt}

    Do not include any text, explanations, or markdown formatting like ```json before or after the JSON object.
    """

    try:
        print("Sending single batch request to Gemini API...")
        # response = ai_client.generate_content(batch_prompt)
        response = ai_client.models.generate_content(
            model = 'gemini-2.5-flash',
            contents = batch_prompt
        )

        
        # Clean up the response to ensure it's valid JSON
        json_text = response.text.strip().replace('```json', '').replace('```', '')
        
        # Parse the JSON response
        coordinates_map = json.loads(json_text)
        print("Successfully parsed JSON response from Gemini.")

        # Update the DataFrame with the results from Gemini
        for item in failed_locations:
            address = item['address']
            if address in coordinates_map and coordinates_map[address]:
                lat = coordinates_map[address].get('latitude')
                lon = coordinates_map[address].get('longitude')
                df.at[item['index'], 'latitude'] = lat
                df.at[item['index'], 'longitude'] = lon
                print(f"🤖 Gemini SUCCESS for: {address}")
            else:
                print(f"❌ Gemini could not find coordinates for: {address}")

    except Exception as e:
        print(f"An error occurred during the Gemini API call or parsing: {e}")

# --- 5. SAVE THE FINAL RESULT ---
output_filename = 'locations_fully_geocoded.csv'
df.to_csv(output_filename, index=False)

print("\n------------------")
print("✅ Hybrid geocoding complete!")
print(f"Results saved to '{output_filename}'")
print("\nPreview of the final data:")
# Display rows that had missing data to see if they were filled
print(df[df.latitude.notna()].head())

In [None]:
import pandas as pd
import os
import json
import time
from geopy.geocoders import Nominatim
from google import genai
from dotenv import load_dotenv
from geopy.extra.rate_limiter import RateLimiter


# --- 1. SETUP ---
try:
    # genai.configure(api_key=os.environ["GEMINI_API_KEY"])
    # ai_client = genai.GenerativeModel('gemini-pro')
    ai_client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
    print("Gemini AI client configured successfully.")
except (KeyError, TypeError):
    print("Error: GEMINI_API_KEY not found in environment variables.")
    exit()

geolocator = Nominatim(user_agent="advanced-geocoder")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# --- 2. LOAD DATA ---
try:
    df = pd.read_csv('../../data/raw/unique_event_locations.csv')
    df['latitude'] = None
    df['longitude'] = None
    print(f"Successfully loaded 'unique_event_locations.csv' with {len(df)} rows.")
except FileNotFoundError:
    print("Error: Could not find 'unique_event_locations.csv'.")
    exit()

# --- 3. ADVANCED GEOPY PASS (MIDPOINT CALCULATION) ---
print("\n--- Starting Pass 1: Advanced Geocoding with Geopy ---")
failed_locations = []

for index, row in df.iterrows():
    location_str = str(row['event_location'])
    borough = row['event_borough']
    
    # ADVANCED LOGIC: Try to parse "between" format
    match = re.search(r'(.+?) between (.+?) and (.+)', location_str, re.IGNORECASE)
    
    lat, lon = None, None
    
    if match:
        main_street, cross_street1, cross_street2 = match.groups()
        main_street = main_street.strip()
        
        try:
            # Geocode both intersections
            addr1 = f"{main_street} & {cross_street1.strip()}, {borough}, NY"
            loc1 = geocode(addr1)
            
            addr2 = f"{main_street} & {cross_street2.strip()}, {borough}, NY"
            loc2 = geocode(addr2)

            if loc1 and loc2:
                # Calculate the midpoint
                lat = (loc1.latitude + loc2.latitude) / 2
                lon = (loc1.longitude + loc2.longitude) / 2
                print(f"✅ Midpoint SUCCESS for: {location_str}")
        except Exception as e:
            print(f"Midpoint geocoding error: {e}")
            # Continue to fallback...
    
    # FALLBACK LOGIC: If midpoint failed or format was different
    if lat is None or lon is None:
        try:
            main_location = location_str.split(' between ')[0].split(' and ')[0].strip()
            full_address = f"{main_location}, {borough}, NY"
            location_data = geocode(full_address)
            
            if location_data:
                lat, lon = location_data.latitude, location_data.longitude
                print(f"✅ Fallback SUCCESS for: {full_address}")
        except Exception as e:
            print(f"Fallback geocoding error: {e}")

    # Final check and assignment
    if lat and lon:
        df.at[index, 'latitude'] = lat
        df.at[index, 'longitude'] = lon
    else:
        full_address = f"{location_str}, {borough}, NY"
        print(f"🤔 Geopy FAILED for: {full_address}")
        failed_locations.append({'index': index, 'address': full_address})

print(f"\nGeopy pass complete. {len(failed_locations)} locations failed and will be sent to Gemini.")

# --- 4. BATCH GEOCODE FAILED LOCATIONS WITH GEMINI ---
if failed_locations:
    print("\n--- Starting Pass 2: Batch Geocoding with Gemini ---")
    
    address_list_for_prompt = "\n".join([f"{i+1}. \"{item['address']}\"" for i, item in enumerate(failed_locations)])
    
    batch_prompt = f"""
    You are an expert geocoding assistant. For each numbered location in the list below, provide its latitude and longitude.
    Return your response as a single, valid JSON object where the key is the original address string and the value is an object with "latitude" and "longitude".
    If you cannot find coordinates, use a value of null.

    Here are the locations:
    {address_list_for_prompt}

    Provide ONLY the JSON object in your response.
    """
    
    try:
        print("Sending single batch request to Gemini API...")
        # response = ai_client.generate_content(batch_prompt)
        response = ai_client.models.generate_content(
            model = 'gemini-2.5-flash',
            contents = batch_prompt
        )
        json_text = response.text.strip().replace('```json', '').replace('```', '')
        coordinates_map = json.loads(json_text)
        print("Successfully parsed JSON response from Gemini.")

        for item in failed_locations:
            address = item['address']
            if address in coordinates_map and coordinates_map[address]:
                lat = coordinates_map[address].get('latitude')
                lon = coordinates_map[address].get('longitude')
                df.at[item['index'], 'latitude'] = lat
                df.at[item['index'], 'longitude'] = lon
                print(f"🤖 Gemini SUCCESS for: {address}")
            else:
                print(f"❌ Gemini could not find coordinates for: {address}")
    except Exception as e:
        print(f"An error occurred during the Gemini API call or parsing: {e}")

# --- 5. SAVE THE FINAL RESULT ---
output_filename = 'locations_geocoded_with_midpoints.csv'
df.to_csv(output_filename, index=False)

print("\n------------------")
print("✅ Advanced geocoding complete!")
print(f"Results saved to '{output_filename}'")
print("\nPreview of the final data:")
print(df.head())

Gemini AI client configured successfully.
Error: Could not find 'unique_event_locations.csv'.

--- Starting Pass 1: Advanced Geocoding with Geopy ---


NameError: name 're' is not defined

: 