In [None]:
import xml.etree.ElementTree as ET
import networkx as nx
import matplotlib.pyplot as plt


# Function to extract ontology from XML
def extract_xml_ontology(xml_file):
    """Collect tag names, parent/child tag pairs and attributes from an XML file.

    The document is walked in pre-order (an element before its children,
    children in document order).

    Args:
        xml_file: Anything ``ET.parse`` accepts (a path or a file object).

    Returns:
        A ``(nodes, edges, attributes)`` tuple where ``nodes`` is the set of
        distinct element tags, ``edges`` is a list of ``(parent_tag,
        child_tag)`` pairs (one per child element, duplicates kept) and
        ``attributes`` is a list of ``(tag, attribute_name, value)`` triples.
    """
    document_root = ET.parse(xml_file).getroot()

    tag_names = set()
    parent_child_pairs = []
    attribute_records = []

    # Explicit stack of (element, tag of its parent); the root has no parent.
    pending = [(document_root, None)]
    while pending:
        element, parent_tag = pending.pop()
        tag = element.tag
        tag_names.add(tag)

        if parent_tag:
            parent_child_pairs.append((parent_tag, tag))

        attribute_records.extend(
            (tag, key, val) for key, val in element.attrib.items()
        )

        # Push children reversed so they pop in document order (pre-order).
        pending.extend((child, tag) for child in reversed(list(element)))

    return tag_names, parent_child_pairs, attribute_records


# Function to visualize the ontology as a graph
def plot_graph(edges):
    """Draw the given edges as a directed graph in a matplotlib window.

    Args:
        edges: Iterable of edges accepted by ``DiGraph.add_edges_from``
            (e.g. ``(parent, child)`` pairs).
    """
    graph = nx.DiGraph()
    graph.add_edges_from(edges)

    # Appearance settings forwarded to nx.draw.
    draw_options = {
        "with_labels": True,
        "node_color": "lightblue",
        "edge_color": "gray",
        "node_size": 2000,
        "font_size": 10,
    }

    plt.figure(figsize=(10, 6))
    layout = nx.spring_layout(graph)
    nx.draw(graph, layout, **draw_options)
    plt.title("XML Ontology Graph")
    plt.show()


# Example usage: parse the schema, print what was extracted, then plot it.
xml_file = r"C:\Users\bmills\Downloads\DoDAF-digitized-master\DoDAF-digitized-master\DM2Foundation_v2.02.xsd"  # Replace with your XML file path
nodes, edges, attributes = extract_xml_ontology(xml_file)

# Display extracted ontology.
# The section markers and arrow were mis-encoded (UTF-8 bytes decoded as
# cp1252); they are restored to the intended characters here.
print("\n📌 Unique Elements (Nodes):")
print(nodes)

print("\n📌 Parent-Child Relationships (Edges):")
for parent_tag, child_tag in edges:
    print(f"{parent_tag} → {child_tag}")

print("\n📌 Attributes Extracted:")
for element_tag, attr_name, attr_value in attributes:
    print(f"Element: {element_tag}, Attribute: {attr_name}, Value: {attr_value}")

# Plot the ontology graph
plot_graph(edges)

In [None]:
import xml.etree.ElementTree as ET
from pyvis.network import Network
from IPython.display import IFrame, display, HTML
import re
import traceback


def camel_case_split(identifier):
    """Break a camelCase / PascalCase identifier into its component words.

    A word ends where a lowercase letter is followed by an uppercase one,
    where a run of capitals is followed by a capitalised word (so
    ``"XMLParser"`` gives ``["XML", "Parser"]``), or at the end of the
    string. An empty string gives an empty list.
    """
    word_pattern = ".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)"
    # The pattern has no capturing groups, so findall returns whole matches.
    return re.findall(word_pattern, identifier)


def extract_relationship(element_name):
    """Interpret a camelCase element name as a subject/relationship/object triple.

    The name is split into words with ``camel_case_split``. The first word is
    taken as the subject, the last word as the object, and the words in
    between (lower-cased and space-joined) as the relationship, e.g.
    ``activityConsumesResource`` -> ``("activity", "Resource", "consumes")``
    and ``activityPerformedByPerformer`` ->
    ``("activity", "Performer", "performed by")``.

    Args:
        element_name: The element name to interpret.

    Returns:
        ``(subject, object_name, relationship)`` or ``None`` when the name has
        fewer than three words. Subject and object keep their original case.
    """
    parts = camel_case_split(element_name)

    # A relationship needs a subject, at least one relationship word and an
    # object. Two-word names (e.g. "NamingScheme") previously slipped through
    # and produced a triple with an empty relationship label.
    if len(parts) < 3:
        return None

    subject, *relationship_words, object_name = parts
    relationship = " ".join(relationship_words).lower()

    return subject, object_name, relationship


def extract_semantic_ontology(xml_file):
    """Extract ontology nodes and labelled edges from an XML Schema file.

    Every named ``xs:element`` is examined. If ``extract_relationship`` can
    read its name as subject/relationship/object, the subject and object
    become nodes joined by an edge labelled with the relationship; otherwise
    the element name itself becomes a node. When the element contains an
    ``xs:extension`` whose base type is not one of the ignored generic types,
    an ``"is a"`` edge from the element name to that base type is added as
    well.

    Every edge endpoint is also added to the node set, so the result can be
    handed to a graph library that requires nodes to exist before edges.

    Args:
        xml_file: Anything ``ET.parse`` accepts (a path or a file object).

    Returns:
        ``(nodes, edges)`` where ``nodes`` is a set of names and ``edges`` is
        a set of ``(source, target, label)`` tuples. On any error the
        traceback is printed and ``(set(), set())`` is returned.
    """
    # Base types for which no "is a" edge is emitted.
    ignored_base_types = {"Individual", "IndividualType", "typeInstance"}

    try:
        print("\n=== Starting Programmatic Ontology Extraction ===")

        ns = {
            "xs": "http://www.w3.org/2001/XMLSchema",
            "ideas": "http://www.ideasgroup.org/xsd",
        }

        root = ET.parse(xml_file).getroot()

        nodes = set()
        edges = set()

        for element in root.findall(".//xs:element", ns):
            name = element.get("name", "")
            if not name:
                continue

            # Base type, with any namespace prefix ("ns:Type") stripped.
            base_type = None
            extension = element.find(".//xs:extension", ns)
            if extension is not None:
                base_type = extension.get("base", "").split(":")[-1]

            relationship = extract_relationship(name)
            if relationship is None:
                # Primary concept (not a relationship): the name is the node.
                nodes.add(name)
                print(f"Added node: {name}")
            else:
                subject, object_name, rel_type = relationship
                # Capitalize the first letter so a subject such as "activity"
                # refers to the same node as the "Activity" concept.
                subject = subject[:1].upper() + subject[1:]
                nodes.update((subject, object_name))
                edges.add((subject, object_name, rel_type))
                print(f"Added relationship: {subject} {rel_type} {object_name}")

            if base_type and base_type not in ignored_base_types:
                # Register both endpoints: previously neither the element
                # name (for relationship elements) nor the base type was a
                # node, which left edges pointing at non-existent nodes.
                nodes.update((name, base_type))
                edges.add((name, base_type, "is a"))
                print(f"Added inheritance: {name} is a {base_type}")

        print(f"\nExtracted {len(nodes)} nodes and {len(edges)} relationships")
        return nodes, edges

    except Exception as e:
        # Best-effort boundary: report the failure and hand back empty
        # results so the caller can skip visualization.
        print(f"Error in extraction: {str(e)}")
        traceback.print_exc()
        return set(), set()


def visualize_ontology(xml_file, output_file="ontology.html"):
    """Render the ontology extracted from ``xml_file`` as an interactive graph.

    The graph is built with pyvis, written to ``output_file`` and displayed
    inline through IPython. Nothing is written or displayed when extraction
    yields no nodes.

    Args:
        xml_file: Schema file passed to ``extract_semantic_ontology``.
        output_file: Path of the HTML file to write.
    """
    nodes, edges = extract_semantic_ontology(xml_file)

    if not nodes:
        return

    net = Network(
        height="900px",
        width="100%",
        bgcolor="#ffffff",
        font_color="#000000",
        directed=True,
        notebook=True,
        cdn_resources="remote",
    )

    # pyvis asserts that both endpoints of an edge already exist, so make
    # sure every edge endpoint is registered as a node, not just the names
    # reported in ``nodes``.
    all_nodes = set(nodes)
    for source, target, _ in edges:
        all_nodes.update((source, target))

    # Add nodes (sorted so insertion order is reproducible between runs)
    for node in sorted(all_nodes):
        net.add_node(node, label=node, title=node, shape="box")

    # Add edges
    for source, target, relationship in sorted(edges):
        net.add_edge(source, target, label=relationship, title=relationship)

    # Configure physics for hierarchical layout
    net.set_options(
        """
    {
      "physics": {
        "hierarchicalRepulsion": {
          "centralGravity": 0.1,
          "springLength": 200,
          "springConstant": 0.01,
          "nodeDistance": 200
        },
        "solver": "hierarchicalRepulsion"
      },
      "layout": {
        "hierarchical": {
          "enabled": true,
          "direction": "LR",
          "sortMethod": "hubsize",
          "levelSeparation": 250
        }
      }
    }
    """
    )

    net.save_graph(output_file)
    display(HTML(filename=output_file))


# Run visualization: build and display the interactive ontology graph for the
# schema below. The path is machine-specific; point it at your own .xsd file.
# With the default output_file this writes "ontology.html" to the working
# directory.
xml_file = r"C:\Users\bmills\Downloads\DoDAF-digitized-master\DoDAF-digitized-master\DM2Foundation_v2.02.xsd"
visualize_ontology(xml_file)


=== Starting Programmatic Ontology Extraction ===
Added relationship: Information  Pedigree
Added inheritance: InformationPedigree is a NamingScheme
Added relationship: Naming  Scheme
Added inheritance: NamingScheme is a NamingScheme
Added relationship: Pedigree  Information
Added relationship: Pedigree information Type
Added inheritance: PedigreeInformationType is a IndividualTypeType
Added node: Activity
Added relationship: activity consumes Resource
Added inheritance: activityConsumesResource is a WholePartType
Added relationship: activity produces Resource
Added inheritance: activityProducesResource is a WholePartType
Added relationship: rule constrains Activity
Added inheritance: ruleConstrainsActivity is a superSubtype
Added relationship: measure of type Activity
Added inheritance: measureOfTypeActivity is a superSubtype
Added node: Measure
Added relationship: Location  Type
Added node: Rule
Added relationship: Measure type units of Measure
Added inheritance: MeasureTypeUnitsOfM

AssertionError: non existent node 'activityPerformedByPerformer'