# Fast OSM XML parsing
Custom implementation of an XML -> networkx.MultiDiGraph parser for OSM files, based on the osmnx approach.

In [17]:
import xml
from dataclasses import dataclass, field
from typing import Optional
from xml.sax import xmlreader
from xml.sax.handler import ContentHandler

import networkx as nx


# Tag keys that are kept when parsing, grouped by the kind of OSM core
# element ("node" or "way") the tag belongs to. Tags with other keys are
# not copied onto the graph.
TAGS_TO_KEEP = {
    "node": {
        "ref",
        "highway",
    },
    "way": {
        "bridge",
        "tunnel",
        "oneway",
        "lanes",
        "ref",
        "name",
        "highway",
        "maxspeed",
        "service",
        "access",
        "area",
        "landuse",
        "width",
        "est_width",
        "junction",
    },
}
# Values of the "oneway" tag that mark a way as one-directional.
# https://wiki.openstreetmap.org/wiki/Key:oneway
ONE_WAY_PATH_OPTIONS = {"yes", "true", "1", "-1", "reverse"}
# One-way values whose direction of travel is opposite to the node order of
# the way. Must stay a subset of ONE_WAY_PATH_OPTIONS: a value listed only
# here would reverse a way that is still treated as bidirectional.
REVERSED_PATH_OPTIONS = {"-1", "reverse"}

In [18]:
@dataclass(slots=True)
class CoreElementBuffer:
    """Class for storing parsed parameters of XML element (osm node or way)."""

    type: str  # either "node" or "way"
    osmid: int

    # Parameters to add to node or edges, filled while parsing tag elements
    attrs: dict[str, str | int | float | bool] = field(default_factory=dict, init=False)

    # List of osmids of nodes in a way, filled while parsing nd elements
    path: list[int] = field(default_factory=list, init=False)

In [19]:
class OSMParser(ContentHandler):
    """
    SAX handler for parsing OSM XML file to networkx.MultiDiGraph.

    Every OSM node becomes a graph node keyed by its OSM id, carrying "x"
    (longitude) and "y" (latitude) plus any whitelisted tags. Every way
    becomes a chain of edges between consecutive node refs, carrying the
    way's whitelisted tags and a boolean "oneway" attribute; ways that are
    not one-way also get the edges in the opposite direction.

    Relations (and relation members) are ignored while building graph.
    OSM ids are ids used in the graph (not node attributes); the id of a way
    is not stored on its edges.

    The graph is available as ``g`` after parsing (``None`` before a document
    has been started).
    """

    def __init__(self):
        # Let the base handler set up its own state before adding ours.
        super().__init__()
        self.g: Optional[nx.MultiDiGraph] = None
        # Buffer for currently processed core element (node or way);
        # None while outside of any node/way (e.g. inside a relation).
        self._element: Optional[CoreElementBuffer] = None

    def startDocument(self) -> None:
        """Create a fresh, empty graph for the document being parsed."""

        metadata = {"crs": "epsg:4326"}  # coordinate system (osmnx default)
        self.g = nx.MultiDiGraph(**metadata)

    def startElement(self, element_type: str, attrs: xmlreader.AttributesImpl) -> None:
        """
        Buffer the data of an opening XML element.

        ``node`` and ``way`` open a new buffer, ``tag`` and ``nd`` children
        add to the open buffer, ``relation`` discards it. Any other element
        is ignored.
        """

        if element_type == "node":
            self._element = CoreElementBuffer(element_type, int(attrs["id"]))
            self._element.attrs["x"] = float(attrs["lon"])
            self._element.attrs["y"] = float(attrs["lat"])

        elif element_type == "way":
            self._element = CoreElementBuffer(element_type, int(attrs["id"]))

        # Don't parse relations
        elif element_type == "relation":
            self._element = None

        # Parse tags (ignore tags outside of a node/way, e.g. in relations)
        elif element_type == "tag" and self._element is not None:
            tag = attrs["k"]
            if tag in TAGS_TO_KEEP[self._element.type]:
                # TODO: Cast to bool/float/int based on attr type
                self._element.attrs[tag] = attrs["v"]

        # Parse paths inside way elements only; an nd seen anywhere else is
        # skipped instead of failing on a missing buffer.
        elif (
            element_type == "nd"
            and self._element is not None
            and self._element.type == "way"
        ):
            self._element.path.append(int(attrs["ref"]))

    def endElement(self, element_type: str) -> None:
        """
        Add the buffered element to the graph when a node or way closes.

        Closing tags of any other element are ignored.
        """

        if element_type == "node":
            self.g.add_node(self._element.osmid, **self._element.attrs)
            # The node is committed; drop the buffer so later stray children
            # cannot touch it.
            self._element = None

        elif element_type == "way":
            path = self._element.path
            attrs = self._element.attrs

            # A missing "oneway" tag yields None, which is in neither set.
            oneway_value = attrs.get("oneway")
            is_one_way = oneway_value in ONE_WAY_PATH_OPTIONS
            is_reversed = oneway_value in REVERSED_PATH_OPTIONS

            # Edges carry a boolean flag rather than the raw tag value.
            attrs["oneway"] = is_one_way

            if is_reversed:
                path.reverse()

            # Consecutive node pairs; empty for paths shorter than two nodes.
            edges = list(zip(path, path[1:]))

            self.g.add_edges_from(edges, **attrs)
            if not is_one_way:
                # Bidirectional way: add the opposite direction as well.
                self.g.add_edges_from([(v, u) for u, v in edges], **attrs)

            self._element = None

    def endDocument(self) -> None:
        """Drop the element buffer once the whole document has been read."""
        self._element = None

## Compare custom vs osmnx

In [20]:
# Path of the OSM XML file that both parsers below are run on.
DATA_PATH = "data/slovakia_borders.osm"

In [21]:
def parse_custom(file_path: str) -> nx.MultiDiGraph:
    """Parse the OSM XML file at *file_path* with OSMParser and return its graph."""
    handler = OSMParser()
    xml.sax.parse(file_path, handler)
    graph = handler.g
    return graph


# Build the graph with the custom parser and report its size.
g = parse_custom(DATA_PATH)
print("Number of nodes:", g.number_of_nodes())
print("Number of edges:", g.number_of_edges())


Number of nodes: 84368
Number of edges: 168734


In [22]:
import osmnx as ox


def parse_ox(file_path: str) -> nx.MultiDiGraph:
    """Load the OSM XML file at *file_path* through osmnx.

    All components are retained and the graph is left unsimplified.
    """
    graph = ox.graph_from_xml(file_path, retain_all=True, simplify=False)
    return graph


# Build the graph with osmnx and report its size.
oxg = parse_ox(DATA_PATH)
print("Number of nodes:", oxg.number_of_nodes())
print("Number of edges:", oxg.number_of_edges())


Number of nodes: 84368
Number of edges: 168734


In [23]:
%%timeit
parse_custom(DATA_PATH)

900 ms ± 53.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
%%timeit
parse_ox(DATA_PATH)

1.67 s ± 113 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
