In [7]:
# preprocess_svg.py
import xml.etree.ElementTree as ET
from xml.dom import minidom

# ——— CONFIG ———
# INPUT_SVG is read, OUTPUT_SVG is (over)written; TARGET_SEMANTIC is the
# value the "semantic-id" attribute of a <path> must equal to be kept.
INPUT_SVG     = "data/cad/images/train-01/0001-0040.svg"
OUTPUT_SVG    = "0001-0040-clean.svg"
TARGET_SEMANTIC = "1"
SVG_NS          = "http://www.w3.org/2000/svg"


def _local_name(tag):
    """Return *tag* without its ``{namespace}`` prefix ('' for non-string tags)."""
    # ElementTree spells qualified names as "{uri}local"; comment / PI nodes
    # (if a parser ever keeps them) carry a callable instead of a string.
    if not isinstance(tag, str):
        return ""
    return tag.rpartition("}")[2]


# parse the original SVG
ET.register_namespace("", SVG_NS)  # preserve xmlns in output
tree = ET.parse(INPUT_SVG)
root = tree.getroot()

# collect all <path> elements regardless of namespace; the local name is
# compared exactly so that look-alikes ending in "path" (e.g. <mpath>) are
# not picked up by accident
all_paths = [el for el in root.iter() if _local_name(el.tag) == "path"]

# filter by semantic-id
keep_paths = [p for p in all_paths if p.get("semantic-id") == TARGET_SEMANTIC]

# build a new <svg> root (same tag & attributes, but no children yet)
new_root = ET.Element(root.tag, root.attrib)

# clone & append just the paths we want; only the tag and attributes are
# copied, so any text or child nodes of a path are not carried over
for p in keep_paths:
    ET.SubElement(new_root, p.tag, p.attrib)

# convert to a “rough” byte string
rough_xml = ET.tostring(new_root, encoding="utf-8")

# parse with minidom and pretty-print (bytes, because an encoding is given)
parsed = minidom.parseString(rough_xml)
pretty_xml = parsed.toprettyxml(indent="  ", encoding="utf-8")

# write out the cleaned & pretty SVG
with open(OUTPUT_SVG, "wb") as f:
    # strip any blank lines that minidom might introduce
    f.writelines(
        line + b"\n" for line in pretty_xml.splitlines() if line.strip()
    )

print(f"Kept {len(keep_paths)} paths with semantic-id={TARGET_SEMANTIC}")
print(f"Wrote pretty-printed SVG → {OUTPUT_SVG}")

Kept 839 paths with semantic-id=1
Wrote pretty-printed SVG → 0001-0040-clean.svg
