In [2]:
#!/usr/bin/env python3
"""
preprocess_svg.py
Parses an SVG and extracts line segments by semantic-id,
outputting structured JSON ready for Blender import.
"""
import xml.etree.ElementTree as ET
import json
import re
import os

# ——— CONFIG ———
# Path of the SVG drawing to read (relative to the current working directory).
INPUT_SVG   = "data/cad/images/train-01/0128-0104.svg"
# Path of the JSON file the extracted segments are written to.
# NOTE(review): the suffix "0140" differs from the input file's "0104" —
# possibly a transposition; confirm the intended output name.
OUTPUT_JSON = "segments-128-0140.json"
# Maps the value of a <path> element's `semantic-id` attribute to the category
# label used as a key in the output JSON. Paths whose id is not listed here
# are ignored by the extraction loop.
TARGET_IDS  = {
    "1":  "wall",
    "2":  "curtain wall",
    "3":  "single door",
    "4":  "double door",
    "5":  "sliding door",
    "9":  "window",
    "11": "sliding window"
}

# Ensure input file exists
# Fail early, with a message naming the path, before any parsing is attempted.
if not os.path.isfile(INPUT_SVG):
    raise FileNotFoundError(f"SVG file not found: {INPUT_SVG}")

# Helper to drop the '{namespace-uri}' prefix ElementTree puts on tags
def strip_ns(tag):
    """Return the local part of an ElementTree tag name.

    Everything up to and including the last '}' is discarded; a tag that
    contains no '}' is returned unchanged.
    """
    _, _, local_name = tag.rpartition('}')
    return local_name

# Parse SVG
tree = ET.parse(INPUT_SVG)
root = tree.getroot()

# Prepare output: one (initially empty) list per category label, so every
# category appears in the JSON even when no path matched it.
output = {label: [] for label in TARGET_IDS.values()}
total = 0

# Regex to match a path that begins with an absolute 'M x1,y1 L x2,y2'.
#
# _NUM accepts only text that float() can parse (optional sign, digits with an
# optional fraction, optional exponent). A loose class such as [-\d.]+ would
# also capture strings like "1-2" or "1.2.3" and make float() raise, aborting
# the whole run.
# _SEP accepts the coordinate separators SVG path data allows here: a comma
# (optionally surrounded by whitespace) or plain whitespace.
# Whitespace around the 'L' command letter is optional, so "M0,0L1,1" matches.
_NUM = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
_SEP = r"(?:\s*,\s*|\s+)"
pattern = re.compile(
    rf"M\s*({_NUM}){_SEP}({_NUM})\s*L\s*({_NUM}){_SEP}({_NUM})"
)

for elem in root.iter():
    # Tags carry a '{namespace}' prefix when the SVG declares one.
    if strip_ns(elem.tag) != 'path':
        continue

    # Keep only paths whose semantic-id is one of the configured categories.
    sid = elem.get('semantic-id')
    label = TARGET_IDS.get(sid)
    if label is None:
        continue

    d = elem.get('d', '').strip()
    if not d:
        continue

    # Anchored at the start of the path data; only the first M/L segment is
    # extracted. Paths starting with anything else (e.g. an arc after the
    # move-to) are skipped.
    m = pattern.match(d)
    if not m:
        continue

    x1, y1, x2, y2 = map(float, m.groups())
    output[label].append({'x1': x1, 'y1': y1, 'x2': x2, 'y2': y2})
    total += 1

# Write JSON (explicit encoding so the result does not depend on the locale)
with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
    json.dump(output, f, indent=2)

print(f"Wrote {total} segments across {len(output)} categories to {OUTPUT_JSON}")

Wrote 424 segments across 7 categories to segments-128-0140.json
