# OpenCage Geocoding API
- https://opencagedata.com/pricing
- https://opencagedata.com/tutorials/geocode-in-python


In [1]:
# Import required libraries
import pandas as pd 
import overpy
import json
import os
from sqlalchemy import create_engine
import pymssql
import requests
import time

from opencage.geocoder import OpenCageGeocode
from pprint import pprint

In [2]:
# Read the database connection settings stored in config/db_config.json
with open('../config/db_config.json', 'r') as f:
    db_config = json.loads(f.read())

# Pull out the individual credentials needed to reach the SQL server
server, database, db_user, db_password = (
    db_config[field]
    for field in ('server', 'database', 'db_user', 'db_password')
)

In [4]:
# The credentials must be percent-encoded before they are placed in a URL
from urllib.parse import quote

# Connect to the SQL database directly through pymssql
# NOTE(review): no cell visible in this notebook uses `conn`; call
# conn.close() once it is no longer needed — confirm before removing it.
conn = pymssql.connect(server, db_user, db_password, database)

# Create the connection string for SQLAlchemy.
# User name and password are the raw values (they are passed unmodified to
# pymssql above), so characters such as '@', ':' or '/' would otherwise be
# read as URL delimiters and corrupt the connection string.
connection_string = (
    f"mssql+pymssql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{server}/{database}"
)
engine = create_engine(connection_string)

In [5]:
# Read the whole 'wanderwege' table into a DataFrame and show it
df_coords = pd.read_sql_table('wanderwege', engine)
print(df_coords)

         timestamp_apicall        id  \
0      2024-09-20 10:42:31     22614   
1      2024-09-20 10:42:31    103607   
2      2024-09-20 10:42:31    112830   
3      2024-09-20 10:42:31    112831   
4      2024-09-20 10:42:31    112833   
...                    ...       ...   
15191  2024-09-20 10:42:31  18057730   
15192  2024-09-20 10:42:31  18057731   
15193  2024-09-20 10:42:31  18057943   
15194  2024-09-20 10:42:31  18058034   
15195  2024-09-20 10:42:31  18058036   

                                               name         lat         lon  
0      Nationalpark Wanderroute 15 (Munt la Schera)  46.6501430  10.2301992  
1                                     Wanderwege SG  47.4309774   9.6201700  
2                    Uetliberg - Uetliberg Uto Kulm  47.3511680   8.4897796  
3                               Folenweid - Baldern  47.3291235   8.5007261  
4                              Felsenegg - Balderen  47.3152439   8.5050559  
...                                             ...

In [8]:
# Print the current working directory; the relative '../config' and '../data'
# paths used in this notebook are resolved against it
current_dir = os.getcwd()
print(current_dir)

# c:\Users\etien\OneDrive\02_Progression\CAS_DataEngineering_ZHAW\03_Leistungsnachweis\Wanderwege\notebooks

c:\Users\etien\OneDrive\02_Progression\CAS_DataEngineering_ZHAW\03_Leistungsnachweis\Wanderwege\notebooks


In [12]:
# Read the API settings stored in config/api_config.json
with open("../config/api_config.json", 'r') as f:
    api_config = json.loads(f.read())

# Keep the OpenCage key under its own name
api_key = api_config["api_key_opencage"]

In [13]:
# The API key comes from config/api_config.json (variable `api_key`); never
# hard-code it in the notebook.
# NOTE(review): a literal key used to be committed in a comment at this spot —
# treat it as leaked and rotate it.

# Initialize the geocoder client with the configured key
geocoder = OpenCageGeocode(api_key)

# Request the address for one coordinate (latitude, longitude)
results = geocoder.reverse_geocode(46.6501430, 10.2301992)




In [14]:
# Pretty-print the raw response of the reverse-geocoding request
pprint(results)

[{'annotations': {'DMS': {'lat': "46° 39' 46.96092'' N",
                          'lng': "10° 14' 28.85964'' E"},
                  'MGRS': '32TNS9496568468',
                  'Maidenhead': 'JN56cp89xd',
                  'Mercator': {'x': 1140061.856, 'y': 5856154.734},
                  'NUTS': {'NUTS0': {'code': 'CH'},
                           'NUTS1': {'code': 'CH0'},
                           'NUTS2': {'code': 'CH05'},
                           'NUTS3': {'code': 'CH056'}},
                  'OSM': {'edit_url': 'https://www.openstreetmap.org/edit?node=336132069#map=17/46.66304/10.24135',
                          'note_url': 'https://www.openstreetmap.org/note/new#map=17/46.66304/10.24135&layers=N',
                          'url': 'https://www.openstreetmap.org/?mlat=46.66304&mlon=10.24135#map=17/46.66304/10.24135'},
                  'UN_M49': {'regions': {'CH': '756',
                                         'EUROPE': '150',
                                         'WESTER

In [49]:
# Extracting relevant attributes for one coordinate
# Not every result carries every address component (e.g. a place without a
# 'village'), so each component is read with an 'N/A' fallback instead of
# direct indexing, which would raise KeyError. This matches the handling in
# the multi-coordinate extraction further down.
extracted_data = []
for entry in results:
    components = entry['components']
    lat = entry['geometry']['lat']
    lon = entry['geometry']['lng']
    extracted_data.append({
        'lat': lat,
        'lon': lon,
        'country': components.get('country', 'N/A'),
        'county': components.get('county', 'N/A'),
        'local_administrative_area': components.get('local_administrative_area', 'N/A'),
        'locality': components.get('locality', 'N/A'),
        'postcode': components.get('postcode', 'N/A'),
        'state': components.get('state', 'N/A'),
        'state_code': components.get('state_code', 'N/A'),
        'village': components.get('village', 'N/A'),
    })

# Creating a DataFrame (one row per result entry)
df = pd.DataFrame(extracted_data)

# Display the DataFrame
print(df)

         lat       lon      country                              county  \
0  46.663045  10.24135  Switzerland  Region Engiadina Bassa/Val Müstair   

  local_administrative_area    locality postcode    state state_code village  
0                    Zernez  Stabelchod     7530  Grisons         GR  Zernez  


### Access multiple coordinates in a row

In [59]:
# Function to get detailed address information from latitude and longitude
def get_address_from_coordinates(lat, lon, key=None):
    """Reverse-geocode one coordinate through the OpenCage HTTP API.

    Args:
        lat: Latitude in decimal degrees (number or numeric string).
        lon: Longitude in decimal degrees (number or numeric string).
        key: OpenCage API key. When omitted, the notebook-level ``api_key``
            loaded from config/api_config.json is used.

    Returns:
        The first entry of the response's ``results`` list, or ``None`` when
        the request fails, the status code is not 200, the body is not valid
        JSON, or no result was found.
    """
    if key is None:
        # The original code referenced an undefined name `key`; the variable
        # that actually holds the configured key is `api_key`.
        key = api_key

    try:
        response = requests.get(
            'https://api.opencagedata.com/geocode/v1/json',
            # "lat,lon" is the documented query form; requests URL-encodes it
            params={'q': f'{lat},{lon}', 'key': key},
            timeout=10,  # never hang the whole run on one stalled request
        )
        if response.status_code != 200:
            return None  # Error in request
        data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        return None  # Network failure or unparsable response body

    found = data.get('results')
    if not found:
        return None  # No results found
    return found[0]  # Return the first result entry

In [60]:
# Number of rows, counted from the top of df_coords, to geocode in this run
nrows = 1000

# Pair latitude and longitude of those rows as (lat, lon) tuples
coordinates = list(
    df_coords[['lat', 'lon']].iloc[:nrows].itertuples(index=False, name=None)
)
print(coordinates)

[('46.6501430', '10.2301992'), ('47.4309774', '9.6201700'), ('47.3511680', '8.4897796'), ('47.3291235', '8.5007261'), ('47.3152439', '8.5050559'), ('47.3164826', '8.5186003'), ('46.6864945', '8.5941959'), ('46.7582930', '8.6574212'), ('47.5352432', '8.0967558'), ('46.8255681', '6.5042809'), ('46.8326800', '6.5145084'), ('46.8356755', '6.5227180'), ('46.8435439', '6.5313614'), ('46.8556668', '6.5501737'), ('46.8634566', '6.5658229'), ('46.8609414', '6.6144089'), ('46.8964957', '6.6098089'), ('47.3265932', '9.0257357'), ('47.3204765', '9.0322050'), ('47.3139131', '9.0340285'), ('47.3096563', '9.0309978'), ('47.3005555', '9.0197317'), ('47.2960901', '9.0220985'), ('47.2915794', '9.0283607'), ('47.2808333', '9.0325995'), ('47.2717847', '9.0363372'), ('47.2671226', '9.0409020'), ('47.3028875', '8.5080851'), ('47.2857101', '8.5134809'), ('47.2712226', '8.5262202'), ('47.2635986', '8.5335671'), ('47.2647956', '8.5554930'), ('47.5490567', '8.1111345'), ('47.3623550', '8.4906313'), ('47.3552026

In [61]:
###############
# Extracting addresses and relevant attributes
# The free OpenCage plan is limited to 1 request per second, so the loop
# pauses between consecutive calls instead of firing them back to back.
request_delay = 1  # seconds between two requests

extracted_data = []
skipped = 0  # coordinates for which no address could be retrieved
for position, (lat, lon) in enumerate(coordinates):
    if position:
        time.sleep(request_delay)

    result = get_address_from_coordinates(lat, lon)
    if not result:
        skipped += 1
        continue

    # Components missing from a result are reported as 'N/A'
    components = result['components']
    extracted_data.append({
        'lat': lat,
        'lon': lon,
        'country': components.get('country', 'N/A'),
        'county': components.get('county', 'N/A'),
        'local_administrative_area': components.get('local_administrative_area', 'N/A'),
        'locality': components.get('locality', 'N/A'),
        'postcode': components.get('postcode', 'N/A'),
        'state': components.get('state', 'N/A'),
        'state_code': components.get('state_code', 'N/A'),
        'village': components.get('village', 'N/A'),
    })

# Make dropped lookups visible; those rows are absent from df_addresses
if skipped:
    print(f"No address retrieved for {skipped} of {len(coordinates)} coordinates.")

# Creating a DataFrame from the extracted data
df_addresses = pd.DataFrame(extracted_data)

# Display the DataFrame
print(df_addresses)

            lat         lon      country                              county  \
0    46.6501430  10.2301992  Switzerland  Region Engiadina Bassa/Val Müstair   
1    47.4309774   9.6201700  Switzerland                  Wahlkreis Rheintal   
2    47.3511680   8.4897796  Switzerland                     District Zurich   
3    47.3291235   8.5007261  Switzerland                    Bezirk Affoltern   
4    47.3152439   8.5050559  Switzerland                    Bezirk Affoltern   
..          ...         ...          ...                                 ...   
995  47.0883300   8.8521432  Switzerland                          Einsiedeln   
996  47.0757646   8.8374605  Switzerland                              Schwyz   
997  47.0901801   8.8160090  Switzerland                          Einsiedeln   
998  47.0908527   8.8031424  Switzerland                          Einsiedeln   
999  47.0811989   8.8364843  Switzerland                          Einsiedeln   

    local_administrative_area    locali

In [64]:
# Save df_addresses to CSV in folder data/processed
# Note Etienne - This saves the first 1000 addresses as CSV. The free API version only allows 2500 requests per day at 1 request per second
# https://opencagedata.com/pricing#
df_addresses.to_csv('../data/processed/addresses.csv', index=False)

# OLD STUFF

In [21]:


# Function to get address from a single latitude and longitude with retry logic
def get_address_from_coordinates(lat, lon, retries=3,
                                 user_agent="wanderwege-geocoding-notebook/1.0"):
    """Reverse-geocode one coordinate with Nominatim, retrying on request errors.

    Args:
        lat: Latitude in decimal degrees (number or numeric string).
        lon: Longitude in decimal degrees (number or numeric string).
        retries: Maximum number of attempts.
        user_agent: Value sent as the ``User-Agent`` header. Nominatim's usage
            policy requires an application-specific identifier; the default
            User-Agent of HTTP libraries is rejected.

    Returns:
        The ``display_name`` of the match, ``"Address not found"`` when the
        response has no such field, ``"Error in request"`` for a non-200
        status, or ``"Error in request after retries"`` when every attempt
        raised a request exception (also when ``retries`` is 0).
    """
    url = "https://nominatim.openstreetmap.org/reverse"
    params = {"format": "jsonv2", "lat": lat, "lon": lon}
    headers = {"User-Agent": user_agent}

    for attempt in range(retries):
        try:
            response = requests.get(url, params=params, headers=headers, timeout=5)
            if response.status_code != 200:
                return "Error in request"
            return response.json().get('display_name', "Address not found")
        except requests.exceptions.RequestException as e:
            if attempt < retries - 1:
                print(f"Error: {e}. Retrying ({attempt + 1}/{retries})...")
                time.sleep(3)  # Short wait before retrying

    # Reached when all attempts failed or no attempt was made; always a string
    return "Error in request after retries"

# df_coords is expected to provide the columns 'lat' and 'lon'
# (load it first if needed, e.g. df_coords = pd.read_csv('coordinates.csv'))

# Every row of df_coords as a two-element [lat, lon] list
coordinates = df_coords[['lat', 'lon']].to_numpy().tolist()

# Throttling of the API requests:
# coordinates handled per batch, and seconds to pause between two batches
batch_size, delay_between_batches = 100, 10

# Collects the lookup results
addresses = []

In [22]:
# Process coordinates in batches
# Start from an empty result list so that re-running this cell (e.g. after an
# interrupted run) does not append duplicates to earlier partial results.
addresses = []
total = len(coordinates)

for i in range(0, total, batch_size):
    batch = coordinates[i:i + batch_size]
    batch_number = i // batch_size + 1
    processed = i + len(batch)  # coordinates handled once this batch is done
    print(f"Processing batch {batch_number}: Coordinates {i + 1} to {processed}")

    # Fetch addresses for each coordinate in the batch
    for lat, lon in batch:
        print(f"Fetching address for Latitude: {lat}, Longitude: {lon}...")
        address = get_address_from_coordinates(lat, lon)
        addresses.append((lat, lon, address))
        print(f"Address for ({lat}, {lon}): {address}")
        time.sleep(2)  # Small delay between requests within the batch to avoid overwhelming the server

    # Introduce delay between batches; nothing to wait for after the last one
    if processed < total:
        print(f"Completed batch {batch_number}. Waiting for {delay_between_batches} seconds before the next batch...")
        time.sleep(delay_between_batches)

    print(f"Processed {processed} coordinates out of {total}")

# Create a new DataFrame to store the results
print("Creating DataFrame with results...")
df_results = pd.DataFrame(addresses, columns=['lat', 'lon', 'address'])

# Save results to a CSV file
df_results.to_csv('coordinates_with_addresses.csv', index=False)
print("Results saved to 'coordinates_with_addresses.csv'.")

# Optional: update the original DataFrame with the address information.
# Results are in the same row order as df_coords, so assign by position
# rather than relying on the two indexes happening to match.
df_coords['address'] = df_results['address'].to_numpy()
print("Original DataFrame updated with address information.")


Processing batch 1: Coordinates 1 to 100
Fetching address for Latitude: 46.6501430, Longitude: 10.2301992...
Address for (46.6501430, 10.2301992): Error in request


KeyboardInterrupt: 

In [2]:
def get_address_from_coordinates(lat, lon,
                                 user_agent="wanderwege-geocoding-notebook/1.0"):
    """Reverse-geocode one coordinate with the Nominatim API.

    Args:
        lat: Latitude in decimal degrees (number or numeric string).
        lon: Longitude in decimal degrees (number or numeric string).
        user_agent: Value sent as the ``User-Agent`` header. Nominatim's usage
            policy requires an application-specific identifier; the default
            User-Agent of HTTP libraries is rejected.

    Returns:
        The ``display_name`` of the match, ``"Address not found"`` when the
        response reports an error or has no ``display_name``, or
        ``"Error in request"`` when the HTTP status is not 200.
    """
    response = requests.get(
        "https://nominatim.openstreetmap.org/reverse",
        params={"format": "jsonv2", "lat": lat, "lon": lon},
        headers={"User-Agent": user_agent},
        timeout=10,  # do not block forever on a stalled connection
    )
    # Error pages are not guaranteed to be JSON, so check the status first
    if response.status_code != 200:
        return "Error in request"

    data = response.json()
    if 'error' in data:
        return "Address not found"
    return data.get('display_name', "Address not found")



In [3]:
# Look up the address of a single sample coordinate and print it
latitude, longitude = 46.6501430, 10.2301992
address = get_address_from_coordinates(latitude, longitude)
print(address)

Stabelchod, Zernez, Region Engiadina Bassa/Val Müstair, Graubünden/Grischun/Grigioni, 7530, Schweiz/Suisse/Svizzera/Svizra


In [None]:
import requests
import time

# Function to get address from a single latitude and longitude
def get_address_from_coordinates(lat, lon,
                                 user_agent="wanderwege-geocoding-notebook/1.0"):
    """Reverse-geocode one coordinate with the Nominatim API.

    Args:
        lat: Latitude in decimal degrees (number or numeric string).
        lon: Longitude in decimal degrees (number or numeric string).
        user_agent: Value sent as the ``User-Agent`` header. Nominatim's usage
            policy requires an application-specific identifier; the default
            User-Agent of HTTP libraries is rejected.

    Returns:
        The ``display_name`` of the match, ``"Address not found"`` when the
        response has no such field, or ``"Error in request"`` when the HTTP
        status is not 200.
    """
    response = requests.get(
        "https://nominatim.openstreetmap.org/reverse",
        params={"format": "jsonv2", "lat": lat, "lon": lon},
        headers={"User-Agent": user_agent},
        timeout=10,  # do not block the whole batch run on one stalled request
    )
    if response.status_code != 200:
        return "Error in request"
    return response.json().get('display_name', "Address not found")

# Coordinates to look up as (lat, lon) tuples. Three sample points are listed;
# extend the list with the real data (up to 15,000 coordinates). A literal
# `...` placeholder must not be left in the list: it cannot be unpacked into
# (lat, lon) and would make the loop below fail.
coordinates = [(47.3769, 8.5417), (40.7128, -74.0060), (48.8566, 2.3522)]

# Batch size to control API requests
batch_size = 100  # For example, process 100 coordinates per batch
delay_between_batches = 10  # Delay in seconds between batches to avoid rate limits

# One (lat, lon, address) tuple per processed coordinate
addresses = []
total = len(coordinates)

# Process coordinates in batches
for i in range(0, total, batch_size):
    batch = coordinates[i:i + batch_size]
    processed = i + len(batch)  # coordinates handled once this batch is done

    # Fetch addresses for each coordinate in the batch
    for lat, lon in batch:
        address = get_address_from_coordinates(lat, lon)
        addresses.append((lat, lon, address))
        time.sleep(1)  # Small delay between requests within the batch to avoid overwhelming the server

    # Introduce delay between batches; nothing to wait for after the last one
    if processed < total:
        time.sleep(delay_between_batches)

    print(f"Processed {processed} coordinates out of {total}")

# You can save addresses to a file or database for further use

# You can save addresses to a file or database for further use
