In [1]:
# Import necessary libraries
import pathlib  # For working with file paths
import re  # For regular expressions
import pickle  # For serializing Python objects

import networkx as nx  # Network analysis library
import pandas as pd  # Data manipulation library
import geopandas as gpd  # Geospatial data manipulation library
import pyproj  # For coordinate transformations
import shapely  # Geometric objects library

import numpy as np  # Numerical computing library

# Define the version of the code
version = 'v0.1'



In [2]:
# Define the data directory path
# This variable points to the path '~/data/euris', which will be used for working with data files.
# The `expanduser()` method ensures that the tilde (`~`) is expanded to the user's home directory.
data_dir = pathlib.Path('~/data/euris').expanduser()



In [3]:
node_paths = list(data_dir.glob('Node*'))
fairway_section_paths = list(data_dir.glob('FairwaySection*'))

In [4]:
# Define a regular expression pattern for parsing filenames
# Explanation:
# - (?P<dataset>[\w]+): Captures the dataset name (one or more word characters).
# - _(?P<country>[A-Z]{2})_: Captures the country code (two uppercase letters).
# - _(?P<date>[\d]+)\.geojson: Captures the date (one or more digits) followed by '.geojson'.

path_re = re.compile(r'(?P<dataset>[\w]+)_(?P<country>[A-Z]{2})_(?P<date>[\d]+)\.geojson')



In [5]:
# Initialize an empty list to store GeoDataFrames
node_gdfs = []

# Iterate over each node path
for node_path in node_paths:
    # Read the GeoDataFrame from the given path
    node_gdf_i = gpd.read_file(node_path)
    
    # Check if the filename matches the expected pattern
    match = path_re.search(str(node_path))
    if match:
        # Extract relevant information from the filename
        path_data = match.groupdict()
        node_gdf_i['country'] = path_data['country']
        node_gdf_i['dataset'] = path_data['dataset']
        node_gdf_i['date'] = path_data['date']
        
    # Append the processed GeoDataFrame to the list
    node_gdfs.append(node_gdf_i)

# Concatenate all GeoDataFrames into a single DataFrame
node_gdf = pd.concat(node_gdfs)


In [6]:
# Initialize an empty list to store GeoDataFrames
fairway_section_gdfs = []

# Iterate over each fairway section path
for fairway_section_path in fairway_section_paths:
    # Read the GeoDataFrame from the given path
    fairway_section_gdf_i = gpd.read_file(fairway_section_path)
    
    # Check if the filename matches the expected pattern
    match = path_re.search(str(fairway_section_path))
    if match:
        # Extract relevant information from the filename
        path_data = match.groupdict()
        fairway_section_gdf_i['country'] = path_data['country']
        fairway_section_gdf_i['dataset'] = path_data['dataset']
        fairway_section_gdf_i['date'] = path_data['date']
        
    # Append the processed GeoDataFrame to the list
    fairway_section_gdfs.append(fairway_section_gdf_i)

# Concatenate all GeoDataFrames into a single DataFrame
fairway_section_gdf = pd.concat(fairway_section_gdfs)

# Set a multi-level index using columns 'country' and 'code'
fairway_section_gdf = fairway_section_gdf.set_index(['country', 'code'])


  fairway_section_gdf = pd.concat(fairway_section_gdfs)


In [7]:
node_gdf =  node_gdf.set_index(['country', 'objectcode'])
nodes = node_gdf.index.tolist()
node_sections_gdf = node_gdf.reset_index().set_index(['country', 'sectionref'])

In [8]:
# Create a boolean index where both 'borderpoint' and 'locode' are trueish
border_idx = node_gdf[['borderpoint', 'locode']].astype('bool').all(axis=1)

# Filter the GeoDataFrame to include only rows where both 'borderpoint' and 'locode' are true
border_nodes = node_gdf.loc[border_idx].reset_index(names=['country', 'objectcode'])

# Merge the filtered GeoDataFrame with itself based on matching 'locode' and 'borderpoint' with the opposite 'borderpoint' and 'locode'
border_edges = pd.merge(border_nodes, border_nodes, left_on=['locode', 'borderpoint'], right_on=['borderpoint', 'locode'], how='inner')

# Select specific columns for the resulting edges GeoDataFrame
border_edges = border_edges[['country_x', 'objectcode_x', 'country_y', 'objectcode_y', 'geometry_x', 'geometry_y', 'sectionref_x', 'sectionref_y']]

# Create a new 'geometry' column by constructing LineString geometries
border_edges['geometry'] = border_edges.apply(lambda row: shapely.LineString([row['geometry_x'], row['geometry_y']]), axis=1)

# Save the processed GeoDataFrame to a GeoPackage file
border_edges.drop(columns=['geometry_x', 'geometry_y']).to_file(f'border_edges_{version}.gpkg')


In [9]:
border_nodes

Unnamed: 0,country,objectcode,objectcode_cb,hectom_cb,sectionref_cb,locode_cb,function,ww_name,ww_name_cb,rt_name,...,borderpoint,remark,locode,objectname,sectionref,hectom,vplnpoint,geometry,dataset,date
0,RO,84540,08454,08454,RS0000108578,RSXXX000010000008454,dismar,Dunărea,Dunav,Route Danube,...,BG,,ROXXX000010000008454,Distance Mark Along Waterway Axis,RO0000185780,08454,1.0,POINT (22.67717 44.21517),Node,20240527
1,RO,84540,84540,08454,BG0000184540,BGXXX000010000008450,dismar,Dunărea,Dunav,Route Danube,...,RS,,ROXXX000010000008454,Distance Mark Along Waterway Axis,RO0000184540,08454,1.0,POINT (22.67717 44.21517),Node,20240527
2,RO,37410,,,,,dismar,Dunărea,,Route Danube,...,BG,,ROXXX000010000003741,Distance Mark Along Waterway Axis,RO0000137410,03741,,POINT (27.27835 44.13270),Node,20240527
3,RO,D7500,10750,10750,RS0000110750,RSXXX000010000010750,dismar,Dunărea,Dunav,Route Danube,...,RSXXX000010000010750,,ROXXX000010000010750,Distance Mark Along Waterway Axis,RO00001D7500,10750,1.0,POINT (21.35978 44.82056),Node,20240527
4,FR,F1836,J0155,00477,FRVN21900004,FRXXXVN219J015500477,junction,Grensleie,Lys,Grensleie,...,FRXXXVN219J015500477,,BECOM185010000000000,Borderpoint France - Lys,BE1850100000,00000,1.0,POINT (2.94408 50.73142),Node,20240527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,BE,J3524,F5570,00624,BE0440500000,BEKRO04404F557000624,junction,Maas van Wessem naar Ohe en Laak,Gemeenschappelijke Maas,Maas van Wessem naar Ohe en Laak,...,BEKRO04404F557000624,,NLSVW0150CJ352400056,junction : Maas - Jachthaven Stevensweert,NL0150C00564,00056,0.0,POINT (5.84441 51.13834),Node,20240527
111,BE,J4769,F5569,00620,BE0440300000,BEKRO04404F556900620,junction,Maas van Wessem naar Ohe en Laak,Gemeenschappelijke Maas,Maas van Wessem naar Ohe en Laak,...,BEKRO04404F556900620,,NLSVW0150CJ476900059,ongoing: Maas - Maas,NL0150C00000,00053,0.0,POINT (5.84339 51.13536),Node,20240527
112,BE,J4769,F5569,00620,BE0440400000,BEKRO04404F556900620,junction,Maas van Wessem naar Ohe en Laak,Gemeenschappelijke Maas,Maas van Wessem naar Ohe en Laak,...,BEKRO04404F556900620,,NLSVW0150CJ476900059,ongoing: Maas - Maas,NL0150C00530,00053,0.0,POINT (5.84339 51.13536),Node,20240527
113,BE,J4770,F5572,00636,BE0440500000,BEKRO04405F557200636,junction,Maas van Wessem naar Ohe en Laak,Gemeenschappelijke Maas,Maas van Wessem naar Ohe en Laak,...,BEKRO04405F557200636,,NLMSB0150CJ477000046,ongoing: Maas - Maas,NL0150C00564,00067,0.0,POINT (5.85533 51.14439),Node,20240527


In [10]:
graph = nx.Graph()
for n, row in node_gdf.iterrows():
    graph.add_node(n, **row)

In [11]:
all_sections = set(node_sections_gdf.index)

In [12]:
failed_sections = []

for i, (country, sectionref) in enumerate(all_sections):
    rows = node_sections_gdf.loc[(country, sectionref)]
    if not rows.shape[0] == 2:
        failed_sections.append((country, sectionref))
        continue

    # get object code
    a = rows['objectcode'].iloc[0]
    b = rows['objectcode'].iloc[1]

    # add country code
    a = (country, a)
    b = (country, b)

    section_info = fairway_section_gdf.loc[(country, sectionref)]

    graph.add_edge(a, b, **section_info)

# this section is not properly connected
failed_sections

  rows = node_sections_gdf.loc[(country, sectionref)]


[('DE', 'AT0000100003')]

In [14]:
for i, row in border_edges.iterrows():
    section_info = fairway_section_gdf.loc[(row['country_x'], row['sectionref_x'])]
    section_info = section_info.copy()
    section_info['geometry'] = row['geometry']
    section_info['tot_length'] = np.nan
    section_info['comment'] = 'added by border edge connection method'
    a = row['country_x'], row['objectcode_x']
    b = row['country_y'], row['objectcode_y']
    graph.add_edge(a, b, **section_info)
    


In [15]:
# add length_m to each edge, based on spherical distance
wgs84 = pyproj.Geod(ellps='WGS84')

for e, edge in graph.edges.items():
    length_m = wgs84.geometry_length(edge['geometry'])
    edge['length_m'] = length_m
    

In [16]:
# create a dataset of node information in the graph
nodes_df = pd.DataFrame(graph.nodes.values(), index=graph.nodes.keys())
nodes_gdf = gpd.GeoDataFrame(nodes_df, geometry='geometry', crs='EPSG:4326')
nodes_gdf = nodes_gdf.reset_index(names=['country', 'objectcode'])
nodes_gdf.to_file(f'euris_graph_nodes_{version}.gpkg')


In [17]:
# create a dataset of edge information in the graph

edges_df = nx.to_pandas_edgelist(graph)
edges_gdf = gpd.GeoDataFrame(edges_df, geometry='geometry', crs='EPSG:4326')
# je weet toch...
edges_gdf['source'] = edges_gdf['source'].apply(lambda x: "_".join(x))
edges_gdf['target'] = edges_gdf['target'].apply(lambda x: "_".join(x))
edges_gdf.to_file(f'euris_graph_edges_{version}.gpkg')


In [18]:
# store the whole graph
with open(f'euris_graph_{version}.pickle', 'wb') as f:
    pickle.dump(graph, f)