# Open Cage Geocoding API

Notebook purpose:
- Get address data for coordinates of weather forecast
- Ingest address data into sql database

Important note:
- The free api version is limited to 2500 calls per day

- https://opencagedata.com/pricing
- https://opencagedata.com/tutorials/geocode-in-python


In [1]:
# Import required libraries
import pandas as pd 
#import overpy
import json
import os
from sqlalchemy import create_engine, text, Column, Float, String, DateTime
import pymssql
import requests
#import time

from opencage.geocoder import OpenCageGeocode
from pprint import pprint

In [2]:
# Load database access configuration from config/db_config.json
with open('../config/db_config.json', 'r') as f:
    db_config = json.load(f)

# Access db credentials
server = db_config['server']
database = db_config['database']
db_user = db_config['db_user']
db_password = db_config['db_password']

In [3]:
# Connect to SQL Database
conn = pymssql.connect(server, db_user, db_password, database)

# Create connection string for SQLAlchemy
connection_string = f"mssql+pymssql://{db_user}:{db_password}@{server}/{database}"
engine = create_engine(connection_string)

In [6]:
df_weatherForecast = pd.read_sql_table(table_name='OPNM_WeatherForecast_1d_H', con=engine)
print(df_weatherForecast.head(8))

      id                date        lat        lon  temperature_2m  \
0  22614 2024-09-23 00:00:00  46.650143  10.230199            3.83   
1  22614 2024-09-23 01:00:00  46.650143  10.230199            3.58   
2  22614 2024-09-23 02:00:00  46.650143  10.230199            3.53   
3  22614 2024-09-23 03:00:00  46.650143  10.230199            3.03   
4  22614 2024-09-23 04:00:00  46.650143  10.230199            3.28   
5  22614 2024-09-23 05:00:00  46.650143  10.230199            3.43   
6  22614 2024-09-23 06:00:00  46.650143  10.230199            4.08   
7  22614 2024-09-23 07:00:00  46.650143  10.230199            4.63   

   relative_humidity_2m  dew_point_2m  apparent_temperature  precipitation  \
0                  94.0      2.952793              1.619223            0.0   
1                  97.0      3.148276              1.358339            0.0   
2                  96.0      2.951973              1.307380            0.0   
3                 100.0      3.030000              0.9687

In [7]:
df_uniqueCoords = df_weatherForecast[['lat', 'lon']].drop_duplicates(subset=['lat', 'lon'], keep='first')
print(df_uniqueCoords.head(5))
print("------------------------")
print(df_uniqueCoords.info())

           lat        lon
0    46.650143  10.230199
48   47.430977   9.620170
96   47.351168   8.489780
144  47.329124   8.500726
192  47.315244   8.505056
------------------------
<class 'pandas.core.frame.DataFrame'>
Index: 299 entries, 0 to 14352
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lat     299 non-null    float64
 1   lon     299 non-null    float64
dtypes: float64(2)
memory usage: 7.0 KB
None


In [8]:
# Get current working directory
current_dir = os.getcwd()
print(current_dir)

# c:\Users\etien\OneDrive\02_Progression\CAS_DataEngineering_ZHAW\03_Leistungsnachweis\Wanderwege\notebooks

c:\Users\etien\OneDrive\02_Progression\CAS_DataEngineering_ZHAW\03_Leistungsnachweis\Wanderwege\notebooks


In [9]:
# Load API key from config/api_config.json file
with open("../config/api_config.json", 'r') as f:
    api_config = json.load(f)

api_key = api_config["api_key_opencage"]
#print(api_config)

In [13]:
# Initializing geocoder
geocoder = OpenCageGeocode(api_key)


### Access multiple coordinates

In [14]:
# Function to get detailed address information from latitude and longitude
def get_address_from_coordinates(lat, lon):
    url = f'https://api.opencagedata.com/geocode/v1/json?q={lat}+{lon}&key={api_key}'
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        if data['results']:
            return data['results'][0]  # Return the first result entry
        else:
            return None  # No results found
    else:
        return None  # Error in request

In [15]:
# List of coordinates to extract from API
coordinates = list(zip(df_uniqueCoords['lat'], df_uniqueCoords['lon']))
print(coordinates)
print(len(coordinates))

[(46.650143, 10.2301992), (47.4309774, 9.62017), (47.351168, 8.4897796), (47.3291235, 8.5007261), (47.3152439, 8.5050559), (47.3164826, 8.5186003), (46.6864945, 8.5941959), (46.758293, 8.6574212), (47.5352432, 8.0967558), (46.8255681, 6.5042809), (46.83268, 6.5145084), (46.8356755, 6.522718), (46.8435439, 6.5313614), (46.8556668, 6.5501737), (46.8634566, 6.5658229), (46.8609414, 6.6144089), (46.8964957, 6.6098089), (47.3265932, 9.0257357), (47.3204765, 9.032205), (47.3139131, 9.0340285), (47.3096563, 9.0309978), (47.3005555, 9.0197317), (47.2960901, 9.0220985), (47.2915794, 9.0283607), (47.2808333, 9.0325995), (47.2717847, 9.0363372), (47.2671226, 9.040902), (47.3028875, 8.5080851), (47.2857101, 8.5134809), (47.2712226, 8.5262202), (47.2635986, 8.5335671), (47.2647956, 8.555493), (47.5490567, 8.1111345), (47.362355, 8.4906313), (47.3552026, 8.4892329), (47.3489275, 8.4994754), (47.2887772, 8.8581679), (47.295046, 8.850249), (47.285254, 8.8695647), (47.2831134, 8.8494622), (47.2750342, 

In [16]:
# Extract address data from API for each coordinate
# Expected runtime: 90 seconds
extracted_data = []
for lat, lon in coordinates:
    result = get_address_from_coordinates(lat, lon)
    if result:
        components = result['components']
        extracted_data.append({
            'lat': lat,
            'lon': lon,
            'country': components.get('country', 'N/A'),
            'county': components.get('county', 'N/A'),
            'local_administrative_area': components.get('local_administrative_area', 'N/A'),
            'locality': components.get('locality', 'N/A'),
            'postcode': components.get('postcode', 'N/A'),
            'state': components.get('state', 'N/A'),
            'state_code': components.get('state_code', 'N/A'),
            'village': components.get('village', 'N/A')
        })

# Creating a DataFrame from the extracted data
df_addresses = pd.DataFrame(extracted_data)


In [17]:
# Add time and datestamp of API call to dataframe
timestamp_apicall = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")
df_addresses['timestamp_apicall'] = timestamp_apicall

In [24]:
# Display the DataFrame
print(df_addresses.info())
print("------------------------")
print(df_addresses.head(10))    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   lat                        299 non-null    float64
 1   lon                        299 non-null    float64
 2   country                    299 non-null    object 
 3   county                     299 non-null    object 
 4   local_administrative_area  299 non-null    object 
 5   locality                   299 non-null    object 
 6   postcode                   299 non-null    object 
 7   state                      299 non-null    object 
 8   state_code                 299 non-null    object 
 9   village                    299 non-null    object 
 10  timestamp_apicall          299 non-null    object 
dtypes: float64(2), object(9)
memory usage: 25.8+ KB
None
------------------------
         lat        lon      country                              county  \
0  46.65

In [19]:
## Test - Save df_addresses to CSV in folder data/raw
# df_addresses.to_csv('../data/processed/addresses_subset.csv', index=False)

In [20]:
df_addresses.columns

Index(['lat', 'lon', 'country', 'county', 'local_administrative_area',
       'locality', 'postcode', 'state', 'state_code', 'village',
       'timestamp_apicall'],
      dtype='object')

In [None]:
# Write address dataframe to SQL table

""" 
df_addresses.to_sql(
    name='OPNC_Addresses',
    con=engine,
    if_exists='replace',
    index=False,
    dtype={
        'lat': Column(Float, nullable=False),  # lat must be NOT NULL
        'lon': Column(Float, nullable=False),  # lon must be NOT NULL
        'country': Column(String(255), nullable=True),  
        'county': Column(String(255), nullable=True),  
        'local_administrative_area': Column(String(255), nullable=True),  
        'locality': Column(String(255), nullable=True),  
        'postcode': Column(String(255), nullable=True),  
        'state': Column(String(255), nullable=True),  
        'state_code': Column(String(255), nullable=True),  
        'village': Column(String(255), nullable=True),  
        'timestamp_apicall': Column(DateTime, nullable=False) 
    }
)
"""

In [21]:
# Create table if it doesn't exist and ingest data
table_name = "OPNC_Addresses"
query = f"""
    IF OBJECT_ID(N'dbo.{table_name}', N'U') IS NULL
    BEGIN
        CREATE TABLE {table_name} (
            lat                         FLOAT               NOT NULL,
            lon                         FLOAT               NOT NULL,
            country                     VARCHAR(255)        NULL,
            county                      VARCHAR(255)        NULL,
            local_administrative_area   VARCHAR(255)        NULL,
            locality                    VARCHAR(255)        NULL,
            postcode                    VARCHAR(255)        NULL,
            state                       VARCHAR(255)        NULL,
            state_code                  VARCHAR(255)        NULL,
            village                     VARCHAR(255)        NULL,
            timestamp_apicall           DATETIME            NULL,
            
            PRIMARY KEY (lat, lon)
        );
    END
    """

conn = pymssql.connect(server, db_user, db_password, database)
cursor = conn.cursor()
cursor.execute(query)

conn.commit()
conn.close()

In [22]:
# Create connection string for SQLAlchemy
connection_string = f"mssql+pymssql://{db_user}:{db_password}@{server}/{database}"
engine = create_engine(connection_string)

In [26]:
# Write data to database and append table
df_addresses.to_sql(table_name, con=engine, if_exists='replace', index=False)

109

### API Example - How to access only one address through the geocode API

In [11]:
# Printing results
# Requesting address for one coordinate
results = geocoder.reverse_geocode(46.6501430, 10.2301992)
pprint(results)

[{'annotations': {'DMS': {'lat': "46° 39' 46.96092'' N",
                          'lng': "10° 14' 28.85964'' E"},
                  'MGRS': '32TNS9496568468',
                  'Maidenhead': 'JN56cp89xd',
                  'Mercator': {'x': 1140061.856, 'y': 5856154.734},
                  'NUTS': {'NUTS0': {'code': 'CH'},
                           'NUTS1': {'code': 'CH0'},
                           'NUTS2': {'code': 'CH05'},
                           'NUTS3': {'code': 'CH056'}},
                  'OSM': {'edit_url': 'https://www.openstreetmap.org/edit?node=336132069#map=17/46.66304/10.24135',
                          'note_url': 'https://www.openstreetmap.org/note/new#map=17/46.66304/10.24135&layers=N',
                          'url': 'https://www.openstreetmap.org/?mlat=46.66304&mlon=10.24135#map=17/46.66304/10.24135'},
                  'UN_M49': {'regions': {'CH': '756',
                                         'EUROPE': '150',
                                         'WESTER

In [12]:
# Extracting relevant attributes for one coordinate
extracted_data = []
for entry in results:
    components = entry['components']
    lat = entry['geometry']['lat']
    lon = entry['geometry']['lng']
    extracted_data.append({
        'lat': lat,
        'lon': lon,
        'country': components['country'],
        'county': components['county'],
        'local_administrative_area': components['local_administrative_area'],
        'locality': components['locality'],
        'postcode': components['postcode'],
        'state': components['state'],
        'state_code': components['state_code'],
        'village': components['village']
    })

# Creating a DataFrame
df_example = pd.DataFrame(extracted_data)

# Display the DataFrame
print(df_example)

         lat       lon      country                              county  \
0  46.663045  10.24135  Switzerland  Region Engiadina Bassa/Val Müstair   

  local_administrative_area    locality postcode    state state_code village  
0                    Zernez  Stabelchod     7530  Grisons         GR  Zernez  
