# Sampling origins across different cities and creating origin-destination pairs.

## Prerequisites
This repository builds on the python package OSMNx (v.2.0.1, https://osmnx.readthedocs.io/en/stable/). I recommend installing it via conda:
```
conda create -n ox -c conda-forge --strict-channel-priority osmnx
```
For sampling nodes based on city names two additional packages are required, namely geopy (v.2.3.1, https://geopy.readthedocs.io/en/stable/) and overpy (v.0.7, https://python-overpy.readthedocs.io/en/latest/)

```
pip install geopy
pip install overpy nodes run Ubuntu Jammy 22.04 LTS.
There is local scratch space on each node, which is shared between the jobs currently running. Connected to Kebnekaise is also our parallel file system Ransarn (where your project storage is located), which provide quick access to files regardless of which node they run on. For more information about the different file systems that are available on our systems, read the Filesystems and Storage page.
```

For visualizing routes and geometry on maps I use the folium package (v.0.19.4, https://python-visualization.github.io/folium/latest/) that is included in the OSMNx package, but for creating static images of these visualizations the Selenium package is required (v.4.28.0, https://www.selenium.dev/documentation/)

```
pip install selenium
```

## This exampleCities are used as the basis to find random samples of intersections. The region and country names are nice to have, but they are not necessary.

In [None]:
sample_size = 3
min_distance = 3
random_seed = 3
network_type = 'drive'
point_distance_size = 10000
experiment_name = "2025-03-A"
base_path=f"/proj/nobackup/streetnetwork-alignment/{experiment_name}"
min_od_distance = 4750
max_od_distance = 5250
sample_size = 144
import os

if not os.path.exists(base_path):
    os.makedirs(base_path)
    
parameters_file_path = os.path.join(base_path, f"{experiment_name}_parameters.csv")
city_sample_nodes_path = os.path.join(base_path, f'city_sample_nodes_{experiment_name}.csv')
local_graph_folder = os.path.join(base_path,'local_origin_graphs')
import csv

with open(parameters_file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Parameter", "Value"])
    writer.writerow(["sample_size", sample_size])
    writer.writerow(["min_distance", min_distance])
    writer.writerow(["random_seed", random_seed])
    writer.writerow(["network_type", network_type])
    writer.writerow(["point_distance_size", point_distance_size])
    writer.writerow(["min_od_distance", min_od_distance])
    writer.writerow(["max_od_distance", max_od_distance])
    writer.writerow(["base_path", base_path])
    writer.writerow(["city_sample_nodes_path", city_sample_nodes_path])
    writer.writerow(["local_graph_folder", local_graph_folder])
    writer.writerow(["base_path", base_path])
    writer.writerow(["experiment_name", experiment_name])

import pandas as pd
param = pd.read_csv(parameters_file_path)
display(param)

import multiprocessing

num_processes = multiprocessing.cpu_count()  # Adjust based on your system's capabilities
print(f"Number of processes to use: {num_processes}")



In [None]:
# The workflow for analyzing the routes begins with coordinate points used as origin locations.
import pandas as pd

df = pd.read_csv("100_city_sample.csv")
display(df)


# To sample nodes from the cities we call use a method from sample_nodes.py
import node_sampling

df = node_sampling.get_random_nodes_for_all_cities(df,min_distance_km=min_distance,sample_size=sample_size,random_seed=random_seed)


display(df)

import os # for file operations
city_sample_nodes_path = os.path.join(base_path, f'city_sample_nodes_{experiment_name}.csv')
df.to_csv(os.path.join(base_path,city_sample_nodes_path))


In [None]:
from origin_graph import origin_graph  # for creating the origin graph object
import os  # for file operations
import ast  # for parsing string to tuple
import pandas as pd  # for reading the csv file
import osmnx as ox  # for plotting the graph
import matplotlib.pyplot as plt  # for plotting the graphs
import multiprocessing
import logging
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s',filename='jupyter.log', filemode='w')


sub_folder = "local_origin_graphs"
local_graph_folder = os.path.join(base_path, sub_folder)
if not os.path.exists(local_graph_folder):
    os.makedirs(local_graph_folder)

print(city_sample_nodes_path)
df = pd.read_csv(city_sample_nodes_path)

def process_row(row):
    if os.path.exists(row['graph_path']):
        #print(f"Graph exists: {row['graph_path']}")
        return f"Graph already exists {row['graph_path']}"
    else:
        print(f"---Graph missing: {row['graph_path']}---")
        # Apply the function asynchronously
    try:
        latlon_point = ast.literal_eval(row['node_latlon'])
        og = origin_graph(origin_point=latlon_point, distance_from_point=point_distance_size,
                          city_name=row["city_name"], network_type=network_type, remove_parallel=True, simplify=True)
    
        og.save_graph(row['graph_path'])
    
        # Plot the origin graph to see if something is obviously wrong
        ox.plot_graph(og.graph, node_color='blue', node_size=10, edge_linewidth=1, edge_color='black', bgcolor='white',
                       save=True, filepath=os.path.join(local_graph_folder, f"{row['city_name']}_{row['node_id']}.png"), show=False)
        logging.error(f"Finished with graph: {row['graph_path']}")
        return f"Finished with graph: {row['graph_path']}"

    except Exception as e:
        logging.error(f"error {e} creating {row['graph_path']}")
        return f"Failed with graph: {row['graph_path']} error {e}"
        
num_processes = multiprocessing.cpu_count()  # Adjust based on your system's capabilities
print(f"Number of processes to use: {num_processes}")

# Collect results from multiprocessing
try:
    # Use multiprocessing Pool for parallel processing
    with multiprocessing.Pool(processes=num_processes) as pool:
        # Use pool.map instead of apply_async for better control and result handling
        results = pool.map(process_row, [row for _, row in df.iterrows()])
    
    # Optional: Process and log the results
    for result in results:
        print(result)

except Exception as e:
    logging.error(f"Multiprocessing error: {e}")

In [None]:
import pandas as pd 
import os 
df = pd.read_csv(city_sample_nodes_path)

for index, row in df.iterrows():
        if os.path.exists(row['graph_path']):
            #print(f"Graph exists: {row['city_name']}_{row['node_id']}")
            continue
        else:
            print(row['node_id'])
            print(f"---Graph missing: {row['graph_path']}---")


print(df["graph_path"][0])

In [None]:
import pandas as pd
local_graph_folder = os.path.join(base_path, "local_origin_graphs")
df = pd.read_csv(city_sample_nodes_path)
df['graph_path'] = df.apply(lambda row: os.path.join(local_graph_folder, f"{row['city_name']}_{row['node_id']}.graphml"), axis=1)
df['random_seed'] = random_seed
df['min_distance'] = min_distance
df['point_distance_size'] = point_distance_size
display(df)
df.to_csv(city_sample_nodes_path, index=False)

In [None]:
import os
import joblib
num_processes = joblib.cpu_count()
print(f"Number of processes to use: {num_processes}")

In [None]:
# The next step is to add weights to the edges of the graph.
from origin_graph import origin_graph # for creating the origin graph object
import pandas as pd # for reading the csv file
import joblib # replacing multiprocessing with joblib

df = pd.read_csv(city_sample_nodes_path)

if 'weights_added' not in df.columns:
    df['weights_added'] = None

def add_graph_weights(row):
    try:
        og = origin_graph.from_graphml(graphml_path=row['graph_path'])
        og.add_simplest_paths_from_origin()
        og.add_weights('deviation_from_prototypical')
        og.add_weights('node_degree')
        og.add_weights('instruction_equivalent')
        og.save_graph(row['graph_path'])
        print(f"Finished with graph: {row['city_name']} node: {row['node_id']}")
        return True,row['city_name'],row['node_id']
    except Exception as e:
        print(f"Failed with graph: {row['graph_path']} error {e}")
        return False,row['city_name'],row['node_id']

# Number of processes to use
num_processes = joblib.cpu_count()
print(f"Number of processes to use: {num_processes}")

# Collect results from joblib
try:
    # Use joblib's Parallel and delayed for parallel processing
    results = joblib.Parallel(n_jobs=num_processes,backend='loky')(
        joblib.delayed(add_graph_weights)(row) for _, row in df.iterrows()
    )
    for result in results:
        if result[0]:  # Check if the value in results[0] is True
            # Find the row in df where both 'city_name' and 'start_node' match
            mask = (df['city_name'] == result[1]) & (df['start_node'] == result[2])
            df.loc[mask, 'weights_added'] = True  # Update 'weights_added' to True


except Exception as e:
    logging.error(f"Joblib parallel processing error: {e}")

In [2]:
import pandas as pd # for reading the csv file
import joblib # replacing multiprocessing with joblib
from origin_graph import origin_graph
df = pd.read_csv(city_sample_nodes_path)

local_odpair_folder = os.path.join(base_path, "od_pair_data")
print(f"odpair data will be stored at {local_odpair_folder}")
os.makedirs(local_odpair_folder, exist_ok=True)


if 'od_pairs_added' not in df.columns:
    df['od_pairs_added'] = False


def get_od_pairs(row):
    try:
        og = origin_graph.from_graphml(graphml_path=row['graph_path'])
        og.create_od_pairs(min_radius=min_od_distance, max_radius=max_od_distance, sample_size=144)
        od_pair_data = og.get_od_pair_data()
        json_path = os.path.join(local_odpair_folder, f"od_pair_{row['city_name']}_{row['node_id']}.json")
        od_pair_data.to_json(json_path, orient="records", default_handler=str, indent=2)
        print(f"Finished finding OD_pairs for graph: {row['city_name']} node: {row['node_id']}")
        return True,row['city_name'],row['node_id']
    except Exception as e:
        print(f"Failed finding OD_pairs for  graph: {row['graph_path']} error {e}")
        return False,row['city_name'],row['node_id']


num_processes = (joblib.cpu_count() - 1)
print(f"Number of processes to use: {num_processes}")



rows_to_process = []
for idx, row in df.iterrows():
    if not row['od_pairs_added']:
        rows_to_process.append(row)

results = joblib.Parallel(n_jobs=num_processes, backend='loky')(
    joblib.delayed(get_od_pairs)(row) for row in rows_to_process
)

for result in results:
    if result[0]:
        mask = (df['city_name'] == result[1]) & (df['start_node'] == result[2])
        df.loc[mask, 'od_pairs_added'] = True

df.to_csv(city_sample_nodes_path)
    

odpair data will be stored at /proj/nobackup/streetnetwork-alignment/2025-03-A/od_pair_data
Number of processes to use: 63
Finished finding OD_pairs for graph: Copenhagen node: 272274819
Finished finding OD_pairs for graph: Dubai node: 12504312388




Finished finding OD_pairs for graph: Beijing node: 4571627076
Finished finding OD_pairs for graph: Dubai node: 1039038802


  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)


Failed finding OD_pairs for  graph: /proj/nobackup/streetnetwork-alignment/2025-03-A/local_origin_graphs/Damascus_1566546477.graphml error 'origin_graph' object has no attribute 'od_pairs'
Finished finding OD_pairs for graph: Detroit node: 62679624


  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)


Failed finding OD_pairs for  graph: /proj/nobackup/streetnetwork-alignment/2025-03-A/local_origin_graphs/Cape Town_34996614.graphml error unclosed token: line 239835, column 2
Finished finding OD_pairs for graph: Denver node: 176081390
Finished finding OD_pairs for graph: Cape Town node: 266715705
Finished finding OD_pairs for graph: Dubai node: 3151490641
Failed finding OD_pairs for  graph: /proj/nobackup/streetnetwork-alignment/2025-03-A/local_origin_graphs/Bangkok_5919687081.graphml error unclosed token: line 160139, column 2
Finished finding OD_pairs for graph: Denver node: 4436004856


  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)
  normalized_circ_cross_corr = circ_cross_corr / np.max(circ_cross_corr)


Failed finding OD_pairs for  graph: /proj/nobackup/streetnetwork-alignment/2025-03-A/local_origin_graphs/Damascus_49355838.graphml error 'origin_graph' object has no attribute 'od_pairs'
Finished finding OD_pairs for graph: Detroit node: 5052743506
Finished finding OD_pairs for graph: Beijing node: 8100695142
Finished finding OD_pairs for graph: Dublin node: 244062730
Failed finding OD_pairs for  graph: /proj/nobackup/streetnetwork-alignment/2025-03-A/local_origin_graphs/Cairo_6660416815.graphml error no element found: line 669454, column 459
Finished finding OD_pairs for graph: Detroit node: 62815449
Failed finding OD_pairs for  graph: /proj/nobackup/streetnetwork-alignment/2025-03-A/local_origin_graphs/Boston_61369399.graphml error no element found: line 1, column 0
Finished finding OD_pairs for graph: Denver node: 176096390
Finished finding OD_pairs for graph: Beijing node: 8571957386
Finished finding OD_pairs for graph: Dublin node: 1426962142
Finished finding OD_pairs for graph: B

KeyError: 'start_node'

Finished finding OD_pairs for graph: Buenos Aires node: 206163511
Finished finding OD_pairs for graph: London node: 31200012
Finished finding OD_pairs for graph: Sarajevo node: 9572343913
Finished finding OD_pairs for graph: Tokyo node: 1503780971


In [None]:
import post_processing
import pandas as pd
od_pair_data = pd.read_json("example/origin_od_pairs.json")

od_pair_data = post_processing.label_length_outliers(od_pair_data)
od_pair_data = post_processing.label_gridlike_groups(od_pair_data)

# Before normalizing the complexity, we need to remove the length outliers.
print(f"od-pairs before removing length outliers {len(od_pair_data)}")
od_pair_data = od_pair_data[od_pair_data['length_outliers'] == False]
print(f"od-pairs after removing length outliers {len(od_pair_data)}")
od_pair_data = post_processing.normalize_complexity(od_pair_data)
print(len(od_pair_data))
# The od-pair data contains lists and dictionaries that are not easily saved to a csv file, so we store it as a json file.
# Still, there some columns that need to be serialized to strings such as shapely polygon objects.
od_pair_data.to_json("example/origin_od_pairs.json",orient="records",default_handler=str,indent=2)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

od_pair_data = pd.read_json("example/origin_od_pairs.json")


od_pair_data['closest_strongest_lag'] = abs(od_pair_data['closest_strongest_lag'])

od_pair_data = od_pair_data.sort_values(by="closest_strongest_lag", ascending=True)


city_counts = od_pair_data['city_name'].value_counts()
city_counts.plot(kind='bar')
plt.xlabel('City Name')
plt.ylabel('Number of od-pairs')
plt.title('Number of od-pairs in Each city')
plt.show()