# Open Street Map Data

This notebook is used to extract hiking route data from the Overpass API ([Link](https://overpass-turbo.eu/)).

First, we request hiking routes from the API using Overpass QL (short for "Overpass Query Language").
In OpenStreetMap, hiking routes are defined as relations. We search for relations with specific signage and the tags "hiking routes," "local walking network",
within an area slightly larger than Switzerland. Using "Center" as Output, OpenStreetMap calculates the central location of each route.
Since the "name" tag is often missing, we interpolate the name by concatenating the start and end points of each hiking route.
Finally, we retrieve the ID, name, latitude, and longitude as data points. 

The data is then converted into a DataFrame object, and a table is created in an SQL database (hosted on Microsoft Azure).

In [97]:
# Import required libraries
import os
import json
import overpy
import pyodbc
import urllib
import pymssql
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from sqlalchemy import Integer, String, Float, DATETIME, create_engine

### Connect to API

In [117]:
# Initialize the Overpass API with a custom URL
api = overpy.Overpass(url="http://overpass.osm.ch/api/interpreter")

# Overpass query for hiking trails within Switzerland. Using 'center', we obtain the coordinates in the middle of a hiking trail
query = """
[out:json];
relation
["route"="hiking"]
//["name"!~"fixme", i]
["network"="lwn"]
["osmc:symbol"~"yellow::yellow_diamond|red:white:red_bar|yellow:white:yellow_diamond|blue:white:blue_bar"]
(45.8899, 6.0872, 47.8085, 10.4921);
out center tags;
"""

# Execute the request
result = api.query(query)

### Save Data to DataFrame

In [118]:
# Add time and datestamp of API call to dataframe
timestamp_apicall = pd.Timestamp.now().strftime("%Y-%m-%d %H:%M:%S")

# List to store the extracted information
list = []

# Iterate over all relations
for relation in result.relations:

    # Extract relevant data
    name = relation.tags.get('name')
    von = relation.tags.get('from')
    bis = relation.tags.get('to')
    symbol = relation.tags.get('osmc:symbol')
    lat = getattr(relation, 'center_lat')
    lon = getattr(relation, 'center_lon')
    
    dict = {    
    'id': relation.id,
    'name': name,
    'symbol': symbol,
    'von': von,
    'bis': bis,
    'lat': lat,
    'lon': lon,
    'timestamp_apicall': timestamp_apicall}

    # Each tuple is now saved in the list as a new row
    list.append(dict)

# Once all data is processed, create the DataFrame
df_wanderwege = pd.DataFrame(list)

# Convert lat and lon to numeric, timestamp to datetime
df_wanderwege['lat'] = pd.to_numeric(df_wanderwege['lat'], errors='coerce')
df_wanderwege['lon'] = pd.to_numeric(df_wanderwege['lon'], errors='coerce')
df_wanderwege['timestamp_apicall'] = pd.to_datetime(df_wanderwege['timestamp_apicall'], errors='coerce')

# Print the DataFrame
print(df_wanderwege.head())



       id                                          name  \
0   22614  Nationalpark Wanderroute 15 (Munt la Schera)   
1  103607                                 Wanderwege SG   
2  112830                                          None   
3  112831                                          None   
4  112833                                          None   

                   symbol        von                 bis        lat  \
0       red:white:red_bar       None                None  46.650143   
1  yellow::yellow_diamond       None                None  47.430977   
2  yellow::yellow_diamond  Uetliberg  Uetliberg Uto Kulm  47.351168   
3  yellow::yellow_diamond  Folenweid             Baldern  47.329124   
4  yellow::yellow_diamond  Felsenegg            Balderen  47.315244   

         lon   timestamp_apicall  
0  10.230198 2024-11-20 14:53:12  
1   9.620170 2024-11-20 14:53:12  
2   8.489780 2024-11-20 14:53:12  
3   8.500726 2024-11-20 14:53:12  
4   8.505056 2024-11-20 14:53:12  


### Identifying missing data

In [None]:
print("\nMissing Data:")
print(df_wanderwege.isnull())

### Remove rows with any missing data


In [119]:
df_wanderwege = df_wanderwege.dropna(subset=["name"])

### Input Missing Data

In [None]:
for index, row in df_wanderwege.iterrows():
    if pd.isnull(row["name"]) and not pd.isnull(row["von"]) and not pd.isnull(row["bis"]):
        df_wanderwege.at[index, "name"] = f"{row['von']} - {row['bis']}"

# Resultierender DataFrame anzeigen
print(df_wanderwege)


### Identifying duplicate rows


In [None]:
print("\nDuplicate rows:")
print(df_wanderwege.duplicated())

### Identifying similar rows

In [111]:
from rapidfuzz import fuzz, process
import pandas as pd

# Namen auf Ähnlichkeit prüfen
threshold = 80  # Ähnlichkeitsschwelle (z. B. 80%)
similar_names = []

# Iteriere über alle Namen
for index, name in enumerate(df_wanderwege["name"]):
    for other_index, other_name in enumerate(df_wanderwege["name"]):
        if index != other_index:  # Nicht mit sich selbst vergleichen
            score = fuzz.ratio(name, other_name)
            if score >= threshold:  # Wenn Ähnlichkeit über Schwelle liegt
                similar_names.append((name, other_name, score))

# Ähnliche Namen anzeigen
print("Ähnliche Namen:")
for name1, name2, score in similar_names:
    print(f"'{name1}' und '{name2}' haben eine Ähnlichkeit von {score}%.")


Ähnliche Namen:


In [124]:
print(f"Anzahl der Zeilen im DataFrame: {df_wanderwege.shape[0]}")


Anzahl der Zeilen im DataFrame: 2824


### Remove similar Names

In [123]:
threshold = 80

# List of indices to drop
indices_to_drop = set()

# Iterate over all names
for index, name in enumerate(df_wanderwege["name"]):
    if index not in indices_to_drop: 
        matches = process.extract(
            name, df_wanderwege["name"], scorer=fuzz.ratio, limit=len(df_wanderwege)
        )
        # Iterate over all matches
        for match_name, score, match_index in matches:
            if score >= threshold and index != match_index:
                indices_to_drop.add(match_index)

# Drop the indices
df_wanderwege = df_wanderwege.drop(indices_to_drop)

In [121]:
df_wanderwege = df_wanderwege[~df_wanderwege["name"].str.contains("fixme", case=False, na=False)]

### Analyze latitude and loninude

In [None]:
# DataFrame für die relevanten Spalten vorbereiten
df_melted = df_wanderwege[["lat", "lon"]].melt(var_name="Variable", value_name="Wert")

# Boxplot zeichnen
plt.figure(figsize=(8, 6))
sns.boxplot(x="Variable", y="Wert", data=df_melted)
plt.title("Boxplot der Verteilung von Latitude und Longitude")
plt.ylabel("Wert")
plt.xlabel("Variable")
plt.show()


### Remove invalid latitude and longitude

In [112]:
# Prüfen und Entfernen von Zeilen mit negativen Werten in 'lan' und 'lot'
df_wanderwege = df_wanderwege[(df_wanderwege["lat"] > 0) & (df_wanderwege["lon"] > 0)]

# Ergebnis anzeigen
print(df_wanderwege)


             id                                        name  \
1529    1230658           Carouge place Sigismond - Pinchat   
1663    1423938                           Kempten - Rosinli   
1716    1482730                            Carrera-Valendas   
1868    1622282                       Nante-Alpe di Pesciüm   
2734    2411408               Santeberg Nord - Dagmersellen   
2834    2516538                        Mühlebach-Niederwald   
2888    2594902                    Chli Hüenliwald-Murmösli   
3649    3047880                             Äbnet - Schlatt   
5692    5365971                      Sämtisersee - Staubern   
6712    7617970                        Schachen - Rotenflue   
6728    7651838                         Laubenalp - Oberalp   
6740    7665160                            Tenero - Gordemo   
6931    8049607                    Oberurdorf - Reppischhof   
7604    8491639                 Hohliebi - Wallbachschlucht   
7685    8531293                       Zweisimmen - Ober

### optional: Store data in csv


In [113]:
df_wanderwege.to_csv("../data/processed/overpass.csv")

### Connect to SQL Server

In [114]:
# Load configuration from config/db_config.json
with open('../config/db_config.json', 'r') as f:
    db_config = json.load(f)

# Get database credentials
server = db_config['server']
database = db_config['database']
db_user = db_config['db_user']
db_password = db_config['db_password']

### Create empty SQL table

In [115]:
# Create table if it doesn't exist
table_name = "OVRP_HikingRoutes"
query = f"""
    IF OBJECT_ID(N'dbo.{table_name}', N'U') IS NULL
    BEGIN
        CREATE TABLE {table_name} (
            id                      INT         NOT NULL,
            name                    VARCHAR(255) NULL,
            von                     VARCHAR(255) NULL,
            bis                     VARCHAR(255) NULL,
            lat                     FLOAT       NOT NULL,
            lon                     FLOAT       NOT NULL,
            symbol                  VARCHAR(255) NULL,
            timestamp_apicall       DATETIME    NULL,
            PRIMARY KEY (id)
        );
    END
    """

conn = pymssql.connect(server, db_user, db_password, database)
cursor = conn.cursor()
cursor.execute(query)

conn.commit()
conn.close()

### Upload to SQL Server

In [125]:
# Create connection string for SQLAlchemy
connection_string = f"mssql+pymssql://{db_user}:{db_password}@{server}/{database}"
engine = create_engine(connection_string)

# Ingest data to tabledatabase table
df_wanderwege.to_sql(table_name, con=engine, if_exists='replace', index=False)
print("DataFrame erfolgreich in die MSSQL-Datenbank geladen!")

DataFrame erfolgreich in die MSSQL-Datenbank geladen!
