# Notebook for the download of street network indices

As an initial step, the street networks for 176 capital cities were downloaded. A CSV file with the list of cities is provided in the data folder.

The download notebook uses Ray as the library for parallel processing. It is advised to run the notebook on a Linux machine.

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import ray
import modules.network_extractor as net_extractor
import geopandas as geopd
import pandas as pd
from shapely import ops
from osmnx import settings
import pyproj
from pathlib import Path

In [None]:
# Notebook configuration: data location, the network extractor, OSMnx settings and Ray.

# Add here the absolute path to the data folder.
# Every input and output path in the following cells is built as f"{data_base_path}/...".
data_base_path = ""

# The extractor instance
# (project-local class from modules.network_extractor; it is used in the download
# cell to fetch and save the networks)
extractor = net_extractor.NetworkExtractor(base_path=data_base_path)

# Custom OSMnx settings
settings.default_crs = "epsg:4326"

# Initialize Ray
# First shut down Ray if it was already initialized.
# This makes the cell safe to re-run within the same kernel session.
ray.shutdown()
# This will initialize Ray with the default settings, which uses all available CPUs.
ray.init()

# Instead, use the following line for specific resource allocation
# ray.init(num_cpus=2)

In [None]:
# Load the urban centres from the GHS dataset and prepare the reprojection
# function used by the following cells.
urban_centers: geopd.GeoDataFrame

ghs_dataset_path = f"{data_base_path}/GHS_URBAN_CENTERS/GHS_URBAN_SIMPLIFIED_fixed.gpkg"

# GHS attribute name -> short column name used in the rest of the notebook.
ghs_column_names = {
    "GC_POP_TOT_2025": "population",
    "GC_UCA_KM2_2025": "area",
    "GC_DEV_USR_2025": "continent",
    "GC_UCN_MAI_2025": "name",
    "GC_CNT_GAD_2025": "country",
}

# Read the single layer of the GeoPackage and rename the columns in one go.
urban_centers = geopd.read_file(
    ghs_dataset_path, layer="GHS_URBAN_SIMPLIFIED_fixed"
).rename(columns=ghs_column_names)

# Coordinate transformation from ESRI:54009 to EPSG:4326. The bound method is
# passed to shapely's ops.transform later on to reproject the city geometries.
# always_xy=True keeps the axis order as (x, y) for both CRSs.
ghs_to_wgs84 = pyproj.Transformer.from_crs("ESRI:54009", "EPSG:4326", always_xy=True)
transform = ghs_to_wgs84.transform

In [None]:
# Read the file with the capital cities that are used for the analysis.
# The CSV has no header row: column 0 is matched against the "country" column
# of the GHS dataset and column 1 against its "name" column.
capital_cities = f"{data_base_path}/capital_cities.csv"
capital_df = pd.read_csv(capital_cities, delimiter=",", header=None)
countries = list(capital_df[0])
city_names = list(capital_df[1])

# Collect the GHS rows of every capital and concatenate them once at the end.
# Concatenating inside the loop would copy the accumulated frame on every
# iteration.
matches = []
for capital_country, capital_name in zip(countries, city_names):
    capital_city = urban_centers.loc[
        (urban_centers["name"] == capital_name) &
        (urban_centers["country"] == capital_country)
    ]
    if capital_city.empty:
        # Report capitals that are missing from the GHS dataset instead of
        # dropping them silently.
        print(f"City {capital_name}, {capital_country} not found")
    matches.append(capital_city)

# pd.concat raises on an empty list (empty CSV), so fall back to an empty
# frame that still has the GHS columns.
capitals = pd.concat(matches) if matches else urban_centers.iloc[0:0]
capitals = capitals.reset_index(drop=True)


In [None]:
# Fix the names of the capitals to have them all in lowercase and slug_case without special characters.
# The result is stored in the "display_name" column and is used by the download
# cell as the folder and file name of each city.

# Translation table applied in a single pass: every character of the first
# string is replaced by the character at the same position in the second
# string, and the characters of the third string ("[" and "]") are removed.
slug_table = str.maketrans("áéíóú'șăŏã", "aeiou_saoa", "[]")

# The city name is read by column label ("name"); integer keys on a
# label-indexed row are deprecated in pandas. Iterating over the rows (rather
# than over the column) keeps the cell working when `capitals` is empty.
capitals["display_name"] = [
    row["name"].replace(" ", "_").lower().translate(slug_table)
    for _, row in capitals.iterrows()
]

# A geometry is built for the download of the DEM from Google Earth Engine.
# Uncomment the next 4 lines for printing the geometry in the format for Google Earth Engine.
# for _, row in capitals.iterrows():
#     bounds = ops.transform(transform, row["geometry"]).bounds
#     ee_str = f"{{'geometry': ee.Geometry.BBox{bounds}, 'name': '{row['display_name']}' }},"
#     print(ee_str)


In [None]:
# Download the street networks for each capital city.
# It uses Ray (https://pypi.org/project/ray/) for parallelizing the download of the networks.

# Warning: This part requires massive computational resources. For large cities, such as Tokyo,
# do not use parallelization, as it may deplete the memory fast.

# Variables for the elimination of duplicate pedestrian/driving streets
dist_threshold = 20
slope_threshold = 15

# Assessment = False, so duplicate pedestrian/driving streets will be eliminated.
assess = False

# Extra keyword arguments of download_network for each of the 4 networks.
# The insertion order defines the order of the results returned by ray.get.
network_options = {
    # the pedestrian network
    "walk": {
        "dist_threshold": dist_threshold,
        "slope_threshold": slope_threshold,
        "add_elevation": True,
    },
    # the cycling network
    "bike": {"add_elevation": True},
    # the driving network
    "drive": {"add_elevation": True},
    # the public transport network
    "public_transport": {"add_elevation": False},
}

# Display names of the cities that could not be processed.
errors = []
for _, capital in capitals.iterrows():
    # Lowercase slug of the city; used for the folder and file names.
    city_name = capital["display_name"]

    # The name to search the city in the GHS dataset. Capital case. Also used for the DEM.
    search_name = capital["name"]

    # The country in which the city is located for searching the GHS dataset. Capital case.
    country = capital["country"]

    print(f"starting {city_name}, {search_name}, {country}")

    try:
        # extract info from GHS with search_name and country
        city_info = urban_centers.loc[
            (urban_centers["name"] == search_name) &
            (urban_centers["country"] == country)
        ]

        if city_info.empty:
            # Skip the city: continuing here would reuse the geometry of the
            # previously processed city and save its network under this name.
            print(f"City {search_name}, {country} not found")
            errors.append(city_name)
            continue

        # the geometry from which to extract the network. Is given by the GHS dataset
        # and reprojected with the transform function defined when loading it.
        geometry = ops.transform(transform, city_info["geometry"].values[0])
        print(geometry.bounds)

        # Create graph and shapefile folders if they do not exist
        Path(f"{data_base_path}/{city_name}/graph").mkdir(parents=True, exist_ok=True) # graphml folder
        Path(f"{data_base_path}/{city_name}/shp").mkdir(parents=True, exist_ok=True) # shapefiles folder

        # Process the 4 networks in parallel using the parallelized download_network function.
        g_promises = [
            extractor.download_network.remote(
                extractor,
                network_type,
                geometry,
                city_name,
                assessment=assess,
                **options,
            )
            for network_type, options in network_options.items()
        ]

        # Blocks until all 4 downloads have finished.
        g_walk, g_bike, g_drive, g_public = ray.get(g_promises)

        # Save graphs
        extractor.save_as_graph(g_walk, f'{city_name}/graph/walk_{city_name}')
        extractor.save_as_graph(g_bike, f'{city_name}/graph/bike_{city_name}')
        extractor.save_as_graph(g_drive, f'{city_name}/graph/drive_{city_name}')
        extractor.save_as_graph(g_public, f'{city_name}/graph/public_{city_name}')

        # Save shapefiles
        extractor.save_as_shp(g_walk, f'{city_name}/shp/walk_{city_name}')
        extractor.save_as_shp(g_bike, f'{city_name}/shp/bike_{city_name}')
        extractor.save_as_shp(g_drive, f'{city_name}/shp/drive_{city_name}')
        has_edges = g_public.number_of_edges() > 0 # save only if edges exist
        has_nodes = g_public.number_of_nodes() > 0 # save only if nodes exist
        extractor.save_as_shp(g_public, f'{city_name}/shp/public_{city_name}', save_edges=has_edges, save_nodes=has_nodes)

    except Exception as ex:
        # Best effort: a failure for one city must not stop the remaining
        # downloads. The city is recorded so that it can be retried later.
        errors.append(city_name)
        print(ex)

In [None]:
# Print the errors in case there were any.
# `errors` holds the display names of the cities for which the download cell
# raised an exception; an empty list means that no exception was recorded.
errors