In [None]:
import json
from pathlib import Path

import requests

try:
    from graphql import GraphQLList, GraphQLNonNull, GraphQLObjectType, build_schema
except ImportError as exc:
    raise ImportError(
        "Missing dependency 'graphql-core'. Install it via `pip install graphql-core`."
    ) from exc


class SemanticScraper:
    """Downloads GraphQL SDL schemas per register and stores a JSON summary of each.

    For every object type in a schema the summary records its description and,
    per field, the description, the unwrapped GraphQL type name and a
    simplified primitive type name.
    """

    SCHEMA_URL_TEMPLATE = "https://graphql.datafordeler.dk/{register}/v1/schema"

    def __init__(self, api_key, output_dir="."):
        """Store the API key and make sure the output directory exists.

        Args:
            api_key: Key sent as the ``apiKey`` query parameter on downloads.
            output_dir: Directory for the generated JSON files; created
                (including parents) when missing.
        """
        self.api_key = api_key
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # GraphQL scalar names grouped by the semanticGIS pipeline primitive
        # they are mapped to.
        scalars_by_primitive = (
            ("str", ("CharacterString", "String", "UUID", "ID")),
            ("int", ("Int", "Long")),
            ("float", ("Float", "Decimal")),
            ("bool", ("Boolean",)),
            ("datetime", ("DafDateTime", "DateTime")),
            ("date", ("Date",)),
        )
        self.type_map = {}
        for primitive, scalar_names in scalars_by_primitive:
            for scalar_name in scalar_names:
                self.type_map[scalar_name] = primitive

    def fetch_schema_sdl(self, register):
        """Download the GraphQL SDL text of one register.

        Args:
            register: Register name inserted into ``SCHEMA_URL_TEMPLATE``.

        Returns:
            The response body as text.

        Raises:
            RuntimeError: If the response status code is not 200.
        """
        endpoint = self.SCHEMA_URL_TEMPLATE.format(register=register)
        print(f"[*] Downloading SDL for {register}...")
        reply = requests.get(
            endpoint,
            params={"apiKey": self.api_key},
            timeout=60,
        )
        if reply.status_code == 200:
            return reply.text
        raise RuntimeError(
            f"Schema download for {register} failed ({reply.status_code}): {reply.text}"
        )

    def resolve_base_type(self, gql_type):
        """Strip list / non-null wrappers and return the innermost type's name.

        Falls back to ``str()`` of the innermost type when it has no ``name``.
        """
        innermost = gql_type
        while isinstance(innermost, (GraphQLList, GraphQLNonNull)):
            innermost = innermost.of_type
        return getattr(innermost, "name", str(innermost))

    def build_registry(self, register, schema):
        """Summarise every object type of ``schema`` as a plain dictionary.

        Introspection types (``__`` prefix), the root operation types and
        everything that is not an object type are left out.

        Args:
            register: Register name stored under the ``"register"`` key.
            schema: Schema object exposing ``type_map`` (name -> type).

        Returns:
            ``{"register": ..., "entities": {type_name: {...}}}``.
        """
        root_operations = ("Query", "Mutation", "Subscription")
        entities = {}

        for type_name, gql_type in schema.type_map.items():
            is_reserved = type_name.startswith("__") or type_name in root_operations
            if is_reserved or not isinstance(gql_type, GraphQLObjectType):
                continue

            attributes = {}
            for field_name, field in gql_type.fields.items():
                graphql_name = self.resolve_base_type(field.type)
                attributes[field_name] = {
                    "da_description": field.description or "",
                    # Anything that is not a known scalar is treated as "str".
                    "tech_type": self.type_map.get(graphql_name, "str"),
                    "graphql_type": graphql_name,
                }

            entities[type_name] = {
                "da_description": gql_type.description or "",
                "attributes": attributes,
            }

        return {"register": register, "entities": entities}

    def process_register(self, register):
        """Download, parse and summarise one register, then write it as JSON.

        A failure while downloading or parsing the schema is reported on
        stdout and the register is skipped; nothing is raised in that case.
        """
        try:
            schema = build_schema(self.fetch_schema_sdl(register))
        except Exception as exc:
            print(f"[!] Failed to prepare schema for {register}: {exc}")
            return

        registry = self.build_registry(register, schema)

        # "/" in nested register names would otherwise create a sub-directory.
        file_name = register.lower().replace('/', '_') + ".json"
        output_file = self.output_dir / file_name
        with output_file.open("w", encoding="utf-8") as handle:
            json.dump(registry, handle, indent=2, ensure_ascii=False)

        print(f"[+] Saved {len(registry['entities'])} entities to {output_file}")


# --- EXECUTION ---
# Registers to download. Each name is listed once: a repeated entry would
# fetch the same schema again and rewrite the same output file.
REGISTERS = [
    "DAR",
    "BBR",
    "DHMOprindelse",
    "DHMHoejdekurver",
    "MAT",
    "ESB",
    "FIKSPUNKT",
    "DS",
    "GEODKV",
    "CVR",
    "CVR/custom",
    "EJF",
    "CPR",
    "SVR",
    "VUR",
    "HISTKORT",
]
MY_API_KEY = "YOUR_API_KEY_HERE"
OUTPUT_DIR = "schemas"

scraper = SemanticScraper(MY_API_KEY, output_dir=OUTPUT_DIR)

# process_register reports failures itself, so one bad register does not
# stop the remaining downloads.
for reg in REGISTERS:
    scraper.process_register(reg)

[*] Downloading SDL for DAR...
[+] Saved 36 entities to schemas/dar.json
[*] Downloading SDL for BBR...
[+] Saved 48 entities to schemas/bbr.json
[*] Downloading SDL for DHMOprindelse...
[+] Saved 17 entities to schemas/dhmoprindelse.json
[*] Downloading SDL for DHMHoejdekurver...
[+] Saved 18 entities to schemas/dhmhoejdekurver.json
[*] Downloading SDL for MAT...
[+] Saved 117 entities to schemas/mat.json
[*] Downloading SDL for ESB...
[!] Failed to prepare schema for ESB: Schema download for ESB failed (404): 
[*] Downloading SDL for FIKSPUNKT...
[+] Saved 124 entities to schemas/fikspunkt.json
[*] Downloading SDL for DS...
[+] Saved 107 entities to schemas/ds.json
[*] Downloading SDL for GEODKV...
[+] Saved 218 entities to schemas/geodkv.json
[*] Downloading SDL for CVR...
[+] Saved 56 entities to schemas/cvr.json
[*] Downloading SDL for CVR/custom...
[+] Saved 4 entities to schemas/cvr_custom.json
[*] Downloading SDL for EJF...
[+] Saved 32 entities to schemas/ejf.json
[*] Download

In [6]:
import json
import re
import unicodedata
from pathlib import Path

# Type names treated as plain scalars: an attribute whose type clue matches
# one of these is neither a foreign key nor a lookup.
STANDARD_SCALARS = {
    # text-like
    "CharacterString", "String", "UUID", "ID",
    # numeric
    "Int", "Integer", "Long", "Float", "Decimal",
    # boolean
    "Boolean",
    # temporal
    "DafDateTime", "DateTime", "Date",
}

# Entities whose name contains any of these substrings get no page.
EXCLUDED_ENTITY_TOKENS = [
    "Connection", "Edge", "Payload",
    "Spatial", "Geometry",
    "PageInfo", "Events",
]

# Root directory below which one directory per register is created.
OUTPUT_BASE = Path("output") / "dk"


def slugify(value, allow_non_ascii=False):
    """Turn ``value`` into a lowercase slug with "_" between word runs.

    Args:
        value: Text to convert; ``None`` is treated as an empty string.
        allow_non_ascii: When true, the Danish letters æ, ø and å are kept.
            Otherwise the text is decomposed and every non-ASCII character
            is dropped.

    Returns:
        The slug, or the lowercased normalised text when the slug would be
        empty.
    """
    text = "" if value is None else value

    if allow_non_ascii:
        text = unicodedata.normalize("NFKC", text)
        separator_run = r"[^0-9a-zæøå]+"
    else:
        decomposed = unicodedata.normalize("NFKD", text)
        text = decomposed.encode("ascii", "ignore").decode("ascii")
        separator_run = r"[^0-9a-z]+"

    text = text.lower()
    # Splitting on separator runs and re-joining the non-empty pieces gives
    # single underscores between words and none at either end.
    words = [piece for piece in re.split(separator_run, text) if piece]
    joined = "_".join(words)
    return joined if joined else text


def canonical_entity_name(entity_name, register_name):
    """Return the slug of ``entity_name`` without its register prefix.

    The prefix is ``register_name`` followed by "_" (e.g. "DAR_") and is
    matched case-insensitively.
    """
    folded_name = entity_name.casefold()
    folded_prefix = register_name.casefold() + "_"
    # Skip past the prefix only when the name actually starts with it.
    start = len(folded_prefix) if folded_name.startswith(folded_prefix) else 0
    return slugify(folded_name[start:])


def display_entity_title(entity_name, register_name):
    """Return a human-readable title for an entity, keeping its casing.

    A leading ``<register_name>_`` prefix (matched case-insensitively) is
    removed and remaining underscores become spaces. If nothing is left,
    the original entity name is returned unchanged.
    """
    marker = f"{register_name}_"
    has_prefix = entity_name.lower().startswith(marker.lower())
    remainder = entity_name[len(marker):] if has_prefix else entity_name
    title = remainder.replace('_', ' ').strip()
    return title if title else entity_name


def clean_info(info_str):
    """Extract the important fields from a "Grunddatamodel info" string.

    Text without the "Grunddatamodel info:" marker (or empty/None input) is
    returned as a single-line definition only. Otherwise each line holding a
    ":" is read as ``key: value`` and the definition, type and legal source
    are picked out.

    Returns:
        A dict with "definition" and, for marked strings, also "type_clue"
        and "legal" (each None when absent).
    """
    if not info_str:
        return {"definition": ""}
    if "Grunddatamodel info:" not in info_str:
        return {"definition": info_str.replace('\n', ' ').strip()}

    fields = {}
    for raw_line in info_str.split('\n'):
        key, colon, value = raw_line.partition(':')
        if not colon:
            # Lines without a ":" carry no key/value pair.
            continue
        # Drop the memo/notes markers that surround some values.
        value = value.replace('<memo>#NOTES#', '').replace('#NOTES#', '')
        fields[key.strip()] = value.strip()

    definition = fields.get("definition (da)", "Ingen definition fundet")
    return {
        "definition": definition.strip(),
        "type_clue": fields.get("type", None),
        "legal": fields.get("legalSource", None)
    }


def should_skip_entity(entity_name):
    """Return True when the name contains any token in EXCLUDED_ENTITY_TOKENS."""
    for token in EXCLUDED_ENTITY_TOKENS:
        if token in entity_name:
            return True
    return False


def register_output_dir(register_name):
    """Return the output directory of a register below OUTPUT_BASE.

    Slash-separated register names (e.g. "CVR/custom") map to nested
    directories; empty path segments are ignored.

    Args:
        register_name: Register name, optionally containing "/" separators.

    Returns:
        A path below OUTPUT_BASE. A name without any usable segment (empty,
        or made only of slashes) maps to the directory name "register".
    """
    parts = [part for part in register_name.split('/') if part]
    if not parts:
        # Do not fall back to the raw name: for input such as "/" it is an
        # absolute segment, and joining it would discard OUTPUT_BASE.
        parts = ["register"]
    return OUTPUT_BASE.joinpath(*parts)


def clean_register_dir(reg_dir):
    """Delete the ``.md`` files directly inside ``reg_dir``.

    Sub-directories and files with other suffixes are left alone; a missing
    directory is a no-op.
    """
    if reg_dir.exists():
        markdown_files = [
            entry
            for entry in reg_dir.iterdir()
            if entry.suffix == ".md" and entry.is_file()
        ]
        for markdown_file in markdown_files:
            markdown_file.unlink()


def generate_entity_markdown(register_name, entity_name, entity_data, entity_slug_set):
    """Render one entity as a Markdown page containing a `declare_input` template.

    Args:
        register_name: Name of the register the entity belongs to; may contain
            "/" for nested registers (e.g. "CVR/custom").
        entity_name: Entity name as it appears in the schema JSON.
        entity_data: Mapping that may hold "da_description" (str) and
            "attributes" (attribute name -> mapping with "da_description"
            and "tech_type").
        entity_slug_set: Entity slugs of the register; an attribute whose type
            clue slugifies to one of these is rendered as a foreign key.

    Returns:
        The Markdown document as a string, or None when the entity name
        contains one of the excluded tokens.
    """
    if should_skip_entity(entity_name):
        return None

    entity_info = clean_info(entity_data.get('da_description', ''))
    # Nested registers ("CVR/custom") must become a dotted path ("cvr.custom"):
    # a raw "/" inside the generated reference would read as a division.
    reg_path = ".".join(part for part in register_name.lower().split('/') if part)
    entity_slug = canonical_entity_name(entity_name, register_name)
    entity_title = display_entity_title(entity_name, register_name)
    standard_slugs = {slugify(s, allow_non_ascii=True) for s in STANDARD_SCALARS}

    # Fragments are collected and joined once instead of growing a string.
    chunks = [
        f"---\ntitle: {entity_title}\ndraft: false\ntype: entity\n---\n\n",
        f"# {entity_title}\n\n{entity_info.get('definition', '')}\n\n",
        "### Semantic Template\n",
        "```python\n",
        "p.io.declare_input(\n",
        f"    output_name=\"{entity_slug}\",\n",
        "    attributes={\n",
    ]

    for attr_name, attr_data in entity_data.get('attributes', {}).items():
        attr_info = clean_info(attr_data.get('da_description', ''))
        type_clue = attr_info.get('type_clue')
        type_slug = slugify(type_clue, allow_non_ascii=True) if type_clue else None
        t_type = attr_data.get('tech_type', 'str')

        if attr_name == "id_lokalId":
            # The local id is the primary key of every entity.
            role = f"sg.PK({t_type})"
            scale = "MeasurementScale.NOMINAL"
        elif type_slug and type_slug in entity_slug_set:
            # The type clue names another entity of this register.
            role = f"sg.FK(dk.{reg_path}.{type_slug}, {t_type})"
            scale = "MeasurementScale.NOMINAL"
        elif type_slug and type_slug not in standard_slugs:
            # Neither an entity nor a plain scalar: treated as a lookup table.
            role = f"sg.LOOKUP(dk.{reg_path}.lookups.{type_slug}, {t_type})"
            scale = "MeasurementScale.NOMINAL"
        else:
            role = f"sg.DES({t_type})"
            if t_type in ('int', 'float'):
                scale = "MeasurementScale.RATIO"
            elif t_type in ('datetime', 'date'):
                # Dates are interval-scaled just like timestamps.
                scale = "MeasurementScale.INTERVAL"
            else:
                scale = "MeasurementScale.NOMINAL"

        # Keep the description on one line and free of double quotes so it
        # fits inside the generated string literal.
        desc = attr_info.get('definition', '').replace('"', "'").replace('\n', ' ')
        attr_slug = slugify(attr_name)
        chunks.append(
            f"        \"{attr_slug}\": {{\n"
            f"            \"scale\": {scale},\n"
            f"            \"role\": {role},\n"
            f"            \"description\": \"{desc}\"\n"
            "        },\n"
        )

    chunks.append("    }\n)\n```\n")
    return "".join(chunks)


def build_catalog(schema_folder):
    """Generate one Markdown page per entity for every schema JSON file.

    For each ``*.json`` file in ``schema_folder`` (in sorted order) the
    register's output directory is created, cleared of existing ``.md``
    files, and refilled with the pages of its entities.

    Args:
        schema_folder: Directory holding the per-register schema JSON files.
    """
    for schema_file in sorted(Path(schema_folder).glob("*.json")):
        with schema_file.open('r', encoding='utf-8') as handle:
            payload = json.load(handle)

        register = payload['register']
        entities = payload['entities']

        # Slugs of all entities that are not excluded; used to recognise
        # foreign-key attributes.
        known_slugs = set()
        for name in entities:
            if should_skip_entity(name):
                continue
            known_slugs.add(canonical_entity_name(name, register))

        target_dir = register_output_dir(register)
        target_dir.mkdir(parents=True, exist_ok=True)
        clean_register_dir(target_dir)

        for name, entity in entities.items():
            # Entities with the "DAF_" prefix get no page of their own.
            if name.startswith("DAF_"):
                continue

            page = generate_entity_markdown(register, name, entity, known_slugs)
            if page is not None:
                file_name = f"{canonical_entity_name(name, register)}.md"
                (target_dir / file_name).write_text(page, encoding="utf-8")


# Build the Markdown catalog from the schema JSON files in this folder.
build_catalog(
    schema_folder='/Users/holmes/local_dev/semanticGIS/tests/semantictest/schemas',
)