In [None]:
# Sampling origins across different cities and creating origin-destination pairs.

## Prerequisites
This repository builds on the python package OSMNx (v.2.0.1, https://osmnx.readthedocs.io/en/stable/). I recommend installing it via conda:
```
conda create -n ox -c conda-forge --strict-channel-priority osmnx
```
For sampling nodes based on city names two additional packages are required, namely geopy (v.2.3.1, https://geopy.readthedocs.io/en/stable/) and overpy (v.0.7, https://python-overpy.readthedocs.io/en/latest/)

```
pip install geopy
pip install overpy
```

For visualizing routes and geometry on maps I use the folium package (v.0.19.4, https://python-visualization.github.io/folium/latest/) that is included in the OSMNx package, but for creating static images of these visualizations the Selenium package is required (v.4.28.0, https://www.selenium.dev/documentation/)

```
pip install selenium
```

## This example
Cities are used as the basis to find random samples of intersections. The region and country names are nice to have, but they are not necessary.

In [1]:
sample_size = 3
min_distance = 3
random_seed = 3
network_type = 'drive'
point_distance_size = 10000
experiment_name = "2025-03-A"
base_path=f"/proj/nobackup/streetnetwork-alignment/{experiment_name}"

import os

if not os.path.exists(base_path):
    os.makedirs(base_path)
    
parameters_file_path = os.path.join(base_path, f"{experiment_name}_parameters.csv")
city_sample_nodes_path = os.path.join(base_path, f'city_sample_nodes_{experiment_name}.csv')
local_graph_folder = os.path.join(base_path,'local_origin_graphs')
import csv

with open(parameters_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Parameter", "Value"])
    writer.writerow(["sample_size", sample_size])
    writer.writerow(["min_distance", min_distance])
    writer.writerow(["random_seed", random_seed])
    writer.writerow(["network_type", network_type])
    writer.writerow(["point_distance_size", point_distance_size])
    writer.writerow(["base_path", base_path])
    writer.writerow(["experiment_name", experiment_name])

In [None]:
# The workflow for analyzing the routes begins with coordinate points used as origin locations.
import pandas as pd

df = pd.read_csv("100_city_sample.csv")
display(df)


# To sample nodes from the cities we call use a method from sample_nodes.py
import node_sampling

df = node_sampling.get_random_nodes_for_all_cities(df,min_distance_km=min_distance,sample_size=sample_size,random_seed=random_seed)


display(df)

import os # for file operations
city_sample_nodes_path = os.path.join(base_path, f'city_sample_nodes_{experiment_name}.csv')
df.to_csv(os.path.join(base_path,city_sample_nodes_path))


In [None]:
from origin_graph import origin_graph # for creating the origin graph object
import os # for file operations
import ast # for parsing string to tuple
import pandas as pd # for reading the csv file
import osmnx as ox # for plotting the graph
import matplotlib.pyplot as plt # for plotting the graphs
import multiprocessing
"""
The origin graph object requires the following parameters for instantiation:
# Required parameters:
- origin_point: The coordinate point of the city as a tuple (latitude, longitude)
- distance_from_point: The distance from the origin point in meters to the sides of a bounding box.
- city_name: The name of the city
- network_type: The type of network to use. The default is 'drive'.

# The following parameters are optional:
- remove_parallel: A boolean to remove parallel edges in the graph. The default is False.
- simplify: A boolean to simplify the graph. The default is False.
- edge_attr_diff: The attribute to differentiate edges in the graph when simplifying the graph. The default is None.

# Downloading a graph and weighing it can take some time. To speed up the process, you can load a graph from a file.
- save_graphml: The path to save the graphml file.
- load_graphml: The path to a graphml file to load.
"""

sub_folder = "local_origin_graphs"
local_graph_folder = os.path.join(base_path, sub_folder)
if not os.path.exists(folder):
    os.makedirs(folder)
print(city_sample_nodes_path)
df = pd.read_csv(city_sample_nodes_path)

df['graph_path'] = None

def process_row(row, folder):
    latlon_point = ast.literal_eval(row['node_latlon'])
    og = origin_graph(origin_point=latlon_point, distance_from_point=point_distance_size,
                      city_name=row["city_name"], network_type=network_type, remove_parallel=True, simplify=True)

    graph_path = os.path.join(folder, f"{row['city_name']}_{row['node_id']}.graphml")
    og.save_graph(graph_path)
    
    # Plot the origin graph to see if something is obviously wrong
    ox.plot_graph(og.graph, node_color='blue', node_size=10, edge_linewidth=1, edge_color='black', bgcolor='white',
                   save=True, filepath=os.path.join(local_graph_folder, f"{row['city_name']}_{row['node_id']}.png"),show=False)

    return graph_path

# Number of processes to use
num_processes = multiprocessing.cpu_count()  # Adjust based on your system's capabilities
print(f"Number of processes to use:{num_processes}")
# Use multiprocessing Pool for parallel processing
with multiprocessing.Pool(processes=num_processes) as pool:
    # Use pool.apply_async for asynchronous processing
    results = [pool.apply_async(process_row, args=(row, local_graph_folder)) for index, row in df.iterrows()]

    # Wait for all processes to complete and update the DataFrame
    for index, result in enumerate(results):
        try:
            graph_path = result.get()
            df.at[index, 'graph_path'] = graph_path
        except Exception as e:
            print(f"Error processing row {index}: {e}")

# Save the updated DataFrame
df.to_csv(city_sample_nodes_path, index=False)

/proj/nobackup/streetnetwork-alignment/2025-03-A/city_sample_nodes_2025-03-A.csv
Number of processes to use:28


In [6]:
import pandas as pd
local_graph_folder = os.path.join(base_path, "local_origin_graphs")
df = pd.read_csv(city_sample_nodes_path)
df['graph_path'] = df.apply(lambda row: os.path.join(local_graph_folder, f"{row['city_name']}_{row['node_id']}.graphml"), axis=1)
df['random_seed'] = random_seed
df['min_distance'] = min_distance
df['point_distance_size'] = point_distance_size
display(df)
df.to_csv(city_sample_nodes_path, index=False)

Unnamed: 0.1,Unnamed: 0,city_name,country,region,network_type,node_id,node_latlon,graph_path,random_seed,min_distance,point_distance_size
0,0,Amsterdam,Nederland,Europe,drive,46394761,"(52.3733989, 4.8780385)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
1,1,Amsterdam,Nederland,Europe,drive,6466683449,"(52.3043889, 5.0244272)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
2,2,Amsterdam,Nederland,Europe,drive,4004330717,"(52.4244714, 4.8276361)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
3,3,Atlanta,United States,US/Canada,drive,69162374,"(33.677683, -84.4356051)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
4,4,Atlanta,United States,US/Canada,drive,69475361,"(33.7176867, -84.4957314)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
...,...,...,...,...,...,...,...,...,...,...,...
268,268,Warsaw,Polska,Europe,drive,1747133057,"(52.191898, 20.9598302)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
269,269,Warsaw,Polska,Europe,drive,1250996553,"(52.3068401, 20.9254515)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
270,270,Washington,United States,US/Canada,drive,50186470,"(38.9525865, -76.9411932)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000
271,271,Washington,United States,US/Canada,drive,2019253022,"(38.8199764, -77.1515994)",/proj/nobackup/streetnetwork-alignment/2025-03...,3,3,10000


In [3]:
# The next step is to add weights to the edges of the graph.

from origin_graph import origin_graph # for creating the origin graph object
import pandas as pd # for reading the csv file
import multiprocessing


local_graph_folder = local_graph_folder
df = pd.read_csv(city_sample_nodes_path)
api_keys = pd.read_csv("api_keys.csv")
google_key = api_keys.loc[api_keys['service'] == 'Gmaps', 'key'].values[0]

def add_graph_weights(row,folder):
    og = origin_graph.from_graphml(graphml_path=row['graph_path'])
    og.add_simplest_paths_from_origin()
    og.add_node_elevation(api_key=google_key)
    og.add_weights('deviation_from_prototypical')
    og.add_weights('node_degree')
    og.add_weights('instruction_equivalent')
    og.save_graph(row['graph_path'])
    print(f"Finished with graph: {row['city_name']} node: {row['node_id']}")
    

num_processes = multiprocessing.cpu_count()  # Adjust based on your system's capabilities
print(f"Number of processes to use:{num_processes}")
# Use multiprocessing Pool for parallel processing
with multiprocessing.Pool(processes=num_processes) as pool:
    # Use pool.apply_async for asynchronous processing
    for index, row in df.iterrows():
        pool.apply_async(add_graph_weights, args=(row,local_graph_folder))

    # Close the pool and wait for all processes to complete
    pool.close()
    pool.join()





Number of processes to use:28
Finished with graph: Beijing node: 8100695142
Finished with graph: Beijing node: 8571957386
Finished with graph: Beijing node: 4571627076
Finished with graph: Atlanta node: 69475361
Finished with graph: Amsterdam node: 6466683449
Finished with graph: Budapest node: 3445703817
Finished with graph: Berlin node: 28253316
Finished with graph: Cape Town node: 266715705
Finished with graph: Barcelona node: 601147833
Finished with graph: Budapest node: 8199080326
Finished with graph: Berlin node: 30815443
Finished with graph: Baltimore node: 49524262
Finished with graph: Atlanta node: 69427859
Finished with graph: Barcelona node: 10182303354
Finished with graph: Atlanta node: 69162374
Finished with graph: Baltimore node: 49379039
Finished with graph: Caracas node: 6828018578
Finished with graph: Caracas node: 4880519633
Finished with graph: Baltimore node: 49475152
Finished with graph: Caracas node: 1969824700
Finished with graph: Boston node: 61369399
Finished w

In [None]:
from origin_graph import origin_graph  # for creating the origin graph object
import pandas as pd  # for reading the csv file
import multiprocessing

local_graph_folder = local_graph_folder
df = pd.read_csv(city_sample_nodes_path)

def get_od_pairs(row):
    og = origin_graph.from_graphml(graphml_path=row['graph_path'])
    og.create_od_pairs(min_radius=1000, max_radius=1500, sample_size=144)
    od_pair_data = og.get_od_pair_data()
    return od_pair_data

num_processes = multiprocessing.cpu_count()
print(f"Number of processes to use: {num_processes}")

# List to collect results
results = []

with multiprocessing.Pool(processes=num_processes) as pool:
    # Apply the function asynchronously and collect the results
    for index, row in df.iterrows():
        results.append(pool.apply_async(get_od_pairs, args=(row,)))

    pool.close()
    pool.join()

# Collect the results from the async calls
origins_od_pair_data = [result.get() for result in results]

# Concatenate all the dataframes into one
origin_od_pairs = pd.concat(origins_od_pair_data, ignore_index=True)

# Save the concatenated dataframe to a JSON file
origin_od_pairs.to_json("example/origin_od_pairs.json", orient="records", default_handler=str, indent=2)




In [None]:
import post_processing
import pandas as pd
od_pair_data = pd.read_json("example/origin_od_pairs.json")

od_pair_data = post_processing.label_length_outliers(od_pair_data)
od_pair_data = post_processing.label_gridlike_groups(od_pair_data)

# Before normalizing the complexity, we need to remove the length outliers.
print(f"od-pairs before removing length outliers {len(od_pair_data)}")
od_pair_data = od_pair_data[od_pair_data['length_outliers'] == False]
print(f"od-pairs after removing length outliers {len(od_pair_data)}")
od_pair_data = post_processing.normalize_complexity(od_pair_data)

# The od-pair data contains lists and dictionaries that are not easily saved to a csv file, so we store it as a json file.
# Still, there some columns that need to be serialized to strings such as shapely polygon objects.
od_pair_data.to_json("example/origin_od_pairs.json",orient="records",default_handler=str,indent=2)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

od_pair_data = pd.read_json("example/origin_od_pairs.json")


od_pair_data['closest_strongest_lag'] = abs(od_pair_data['closest_strongest_lag'])

od_pair_data = od_pair_data.sort_values(by="closest_strongest_lag", ascending=True)


city_counts = od_pair_data['city_name'].value_counts()
city_counts.plot(kind='bar')
plt.xlabel('City Name')
plt.ylabel('Number of od-pairs')
plt.title('Number of od-pairs in Each city')
plt.show()