In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

pd.set_option('display.width', 2000)


### Define the project that you want to do analysis on

In [258]:
# Name of the case study (dataset) this notebook analyses.
case_study = 'bpic14'
# NOTE(review): presumably selects a sampled vs. the full dataset (the load cell's
# timing note refers to use_sample = False) — confirm against the configuration loader.
use_sample = False

### Prepare so we can use PromG to load the data and execute queries

In [261]:
def reset_pbar(pbar=perf.pbar, total=None):
    """
    Reset a progress bar and assign its expected total.

    :param pbar: progress bar to reset. Defaults to ``perf.pbar``; the default is
        evaluated once, when this cell is executed, so ``perf`` must already exist.
    :param total: value assigned to ``pbar.total`` (``None`` by default).
    """
    # reset the progress bar itself (this does not touch the database)
    pbar.reset()
    # TODO update dragons in PromG, #update method to set total for pbar
    pbar.total = total
    # called without arguments — presumably clears the postfix text; TODO confirm
    pbar.set_postfix_str()

In [262]:
# read the semantic header --> this details how the data should be structured
semantic_header = SemanticHeader.create_semantic_header(config=config)

In [264]:
# Set constraints in DB
db_manager.set_constraints();

14it [1:12:47, 672.84s/it, set_constraints: took 0.24 seconds]

### Load the data

In [267]:
# first, we load all records
# (if use_sample = False, this should take less than 2 minutes)
reset_pbar(total=11)
oced_pg.load()

100%|██████████| 11/11 [00:16<00:00,  1.66s/it, _filter_nodes for BPIC14Interaction: took 0.0 seconds]                     

In [269]:
# Sanity check of the import/transformation: count every node and relation in the database.
node_count = db_connection.exec_query(check_nodes)
rel_count = db_connection.exec_query(check_relations)
print(
    f"In total, there are {sum(node['count'] for node in node_count)} nodes "
    f"and {sum(rel['count'] for rel in rel_count)} relations."
)

# The record layer (Record / RecordType / Log nodes and the relations attached to them)
# is not part of our schema, so it is reported separately by filtering it out.
print(
    f"In total (excluding record layer), there are "
    f"{sum(node['count'] for node in node_count if node['label'] not in [['Record'], ['RecordType'], ['Log']])} nodes "
    f"and {sum(rel['count'] for rel in rel_count if rel['label'] not in ['IS_OF_TYPE', 'CONTAINS', 'EXTRACTED_FROM'])} relations."
)


In total, there are 690626 nodes and 690622 relations.
In total (excluding record layer), there are 0 nodes and 0 relations.


In [271]:
# Print relationship statistics
print("Relations")
print(tabulate(rel_count))

Relations
--------  ------
CONTAINS  690622
--------  ------


Pipeline: Object nodes (simple & complex objects) + Resource nodes -> relations between objects and resources -> Event nodes -> relations between events, and between events and objects

"Simple objects" are object nodes that can be extracted directly from Record nodes as is. These are Incident, Interaction, Change and Knowledge Document. Resource nodes are created in the same way as simple objects.

The name "Simple" is used as a placeholder.

In [273]:
def build_simple_objects_and_resources(db_connection):
    """
    Create simple Object and Resource nodes. Includes indexing.

    For every label in SIMPLE_OBJECTS and RESOURCES an index on ``sysId`` is
    ensured first; afterwards the nodes are merged from the Record nodes in
    batches via ``apoc.periodic.iterate`` and linked back to their records with
    EXTRACTED_FROM. A failure for one label is printed and does not stop the
    remaining labels from being processed.

    :param db_connection: connection object exposing
        ``driver.get_session(database=...)`` and ``db_name``.
    """

    def create_index(session, label):
        """Ensure an index on ``sysId`` exists for nodes carrying ``label``."""
        index_query = f"""
        CREATE INDEX {label.lower()}_sysId_index IF NOT EXISTS
        FOR (n:{label})
        ON (n.sysId)
        """
        # consume() forces execution now, so an error is raised here (inside the
        # caller's try block) instead of surfacing on a later, unrelated query.
        session.run(index_query).consume()
        print(f"Index for :{label}(sysId)")

    def build_entity(tx, label, config):
        """
        Merge one ``label`` node per distinct sysId value found on the Record nodes.

        :param tx: transaction used to run the APOC call.
        :param label: node label to create.
        :param config: dict with the keys ``log`` (log name or a falsy value),
            ``sysId`` (record property holding the identifier) and ``attributes``
            (record properties copied onto the node).
        :raises RuntimeError: if ``apoc.periodic.iterate`` reports failed batches.
        """
        log_name = config["log"]
        sysId_field = config["sysId"]
        attributes = config["attributes"]

        attr_updates = ", ".join(f"n.{attr} = r.{attr}" for attr in attributes)
        # A bare "SET" is a Cypher syntax error, so drop the clause when nothing is copied.
        set_clause = f"SET {attr_updates}" if attr_updates else ""

        # Restrict the scan to the records of one log when a log name is configured.
        if log_name:
            record_match = f"MATCH (l:Log {{name: '{log_name}'}})-[:CONTAINS]->(r:Record)"
        else:
            record_match = "MATCH (r:Record)"

        source_query = f"""
            {record_match}
            WHERE r.{sysId_field} IS NOT NULL
            RETURN r.{sysId_field} AS sysId, r
            """

        iterate_query = f"""
        CALL apoc.periodic.iterate(
            "{source_query}",
            "MERGE (n:{label} {{sysId: sysId}})
             {set_clause}
             MERGE (n)-[:EXTRACTED_FROM]->(r)",
            {{batchSize: 1000, parallel: false}}
        )
        YIELD total, batches, failedBatches, errorMessages
        RETURN total, failedBatches, errorMessages
        """

        summary = tx.run(iterate_query).single()
        # apoc.periodic.iterate does not raise when a batch fails; it only reports
        # the failure in its result row, so it has to be checked explicitly.
        if summary is not None and summary["failedBatches"]:
            raise RuntimeError(
                f"{summary['failedBatches']} batch(es) failed: {summary['errorMessages']}"
            )
        print(f"→ {label} nodes created.")

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== INDEXES ===")
        for dictionary in (SIMPLE_OBJECTS, RESOURCES):
            for label in dictionary:
                try:
                    create_index(session, label)
                except Exception as e:
                    print(f"Failed to create index for {label}: {e}")

        for collection, dictionary in [("Object", SIMPLE_OBJECTS), ("Resource", RESOURCES)]:
            print(f"\n=== Building {collection.upper()} NODES ===")
            for label, config in dictionary.items():
                try:
                    # Bind label/config as defaults so the lambda does not depend on the loop variables.
                    session.execute_write(lambda tx, lbl=label, cfg=config: build_entity(tx, lbl, cfg))
                except Exception as e:
                    print(f"Failed for {label}: {e}")


"Complex objects" are objects that require transformation during the extraction from Record nodes. Configuration Item (CI), Service Component (SC) and the combined object CI_SC are complex objects. CI_SC includes the additional attributes affected & caused (boolean values) to identify whether the CI_SC node was the affected entity or the entity that caused the disruption.

The name "Complex" is used as a placeholder.


In [276]:
def build_complex_objects(tx, label):
    """
    Create complex object nodes (ConfigurationItem or ServiceComponent) from Record nodes.

    The configuration is read from ``COMPLEX_OBJECTS[label]``. For every role in
    its ``attributes`` mapping, one ``apoc.periodic.iterate`` call merges a node
    per distinct name found on the records, flags the node with ``<role> = true``
    and links it to the record with EXTRACTED_FROM. Indexes are not created here.

    :param tx: transaction used to run the APOC calls.
    :param label: ``"ConfigurationItem"`` or ``"ServiceComponent"``.
    :raises ValueError: if ``label`` is not one of the two supported labels.
    :raises RuntimeError: if ``apoc.periodic.iterate`` reports failed batches.
    """
    config = COMPLEX_OBJECTS[label]
    attributes = config["attributes"]
    sysId_field = config["sysId"]

    # Fail early with a clear message; otherwise the query variables below would be
    # unbound (or silently reused from a previous iteration) for an unknown label.
    if label not in ("ConfigurationItem", "ServiceComponent"):
        raise ValueError(f"build_complex_objects does not support label '{label}'")

    for role, attr in attributes.items():
        if label == "ConfigurationItem":
            name_attr = attr["ciName"]
            type_attr = attr["ciType"]
            subtype_attr = attr["ciSubtype"]

            source_query = f"""
            MATCH (r:Record)
            WHERE r.{name_attr} IS NOT NULL
            RETURN r.{name_attr} AS sysId,
                   r.{type_attr} AS type,
                   r.{subtype_attr} AS subtype,
                   r
            """

            iterate_query = f"""
            MERGE (ci:{label} {{sysId: sysId}})
            SET ci.{sysId_field} = sysId,
                ci.ciType = type,
                ci.ciSubtype = subtype,
                ci.{role} = true
            MERGE (ci)-[:EXTRACTED_FROM]->(r)
            """

        else:  # label == "ServiceComponent"
            name_attr = attr["scName"]

            source_query = f"""
            MATCH (r:Record)
            WHERE r.{name_attr} IS NOT NULL
            RETURN r.{name_attr} AS sysId, r
            """

            iterate_query = f"""
            MERGE (sc:{label} {{sysId: sysId}})
            SET sc.{sysId_field} = sysId,
                sc.{role} = true
            MERGE (sc)-[:EXTRACTED_FROM]->(r)
            """

        apoc_query = f"""
        CALL apoc.periodic.iterate(
            "{source_query}",
            "{iterate_query}",
            {{batchSize: 1000, parallel: false}}
        )
        YIELD total, batches, failedBatches, errorMessages
        RETURN total, failedBatches, errorMessages
        """

        summary = tx.run(apoc_query).single()
        # apoc.periodic.iterate reports failed batches in its result row instead of raising.
        if summary is not None and summary["failedBatches"]:
            raise RuntimeError(
                f"{summary['failedBatches']} batch(es) failed for role '{role}': "
                f"{summary['errorMessages']}"
            )

    print(f"→ {label} nodes created.")

def build_ci_sc(tx):
    """
    Create combined CI_SC nodes (can be both affected and caused).
    Handles nulls with 'ci_null' and 'sc_null'.

    For every role in ``COMPLEX_OBJECTS["CI_SC"]["attributes"]`` the records that
    carry a CI name or an SC name are merged into CI_SC nodes keyed by
    ``<ciName>_<scName>``, flagged with ``<role> = true`` and linked to the record
    with EXTRACTED_FROM.

    :param tx: transaction used to run the APOC calls.
    :raises RuntimeError: if ``apoc.periodic.iterate`` reports failed batches.
    """

    config = COMPLEX_OBJECTS["CI_SC"]
    attributes = config["attributes"]

    for role, attr in attributes.items():
        ci_field = attr["ciName"]
        sc_field = attr["scName"]

        # A record qualifies when at least one of the two names is present; the
        # missing one is replaced by a placeholder so the merge key is never null.
        source_query = f"""
        MATCH (r:Record)
        WHERE r.{ci_field} IS NOT NULL OR r.{sc_field} IS NOT NULL
        RETURN
            coalesce(r.{ci_field}, 'ci_null') AS ciName,
            coalesce(r.{sc_field}, 'sc_null') AS scName,
            r
        """

        iterate_query = f"""
        MERGE (cs:CI_SC {{sysId: ciName + '_' + scName}})
        SET cs.ciName = ciName,
            cs.scName = scName,
            cs.{role} = true
        MERGE (cs)-[:EXTRACTED_FROM]->(r)
        """

        apoc_query = f"""
        CALL apoc.periodic.iterate(
            "{source_query}",
            "{iterate_query}",
            {{batchSize: 1000, parallel: false}}
        )
        YIELD total, batches, failedBatches, errorMessages
        RETURN total, failedBatches, errorMessages
        """

        summary = tx.run(apoc_query).single()
        # apoc.periodic.iterate reports failed batches in its result row instead of raising.
        if summary is not None and summary["failedBatches"]:
            raise RuntimeError(
                f"{summary['failedBatches']} batch(es) failed for role '{role}': "
                f"{summary['errorMessages']}"
            )

    print("→ CI_SC nodes created.")

def build_all_complex_objects(db_connection):
    """
    Create indexes and build complex objects.

    First ensures an index on ``sysId`` for every label in COMPLEX_OBJECTS, then
    builds the nodes label by label: CI_SC through ``build_ci_sc``, every other
    label through ``build_complex_objects``. A failure for one label is printed
    and does not stop the remaining labels.

    :param db_connection: connection object exposing
        ``driver.get_session(database=...)`` and ``db_name``.
    """
    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== INDEXES for COMPLEX OBJECTS ===")
        for label in COMPLEX_OBJECTS:
            try:
                # consume() forces execution now, so an error is raised inside this
                # try block instead of surfacing on a later, unrelated query.
                session.run(
                    f"CREATE INDEX {label.lower()}_sysId_index IF NOT EXISTS "
                    f"FOR (n:{label}) ON (n.sysId)"
                ).consume()
                print(f"Index for :{label}(sysId)")
            except Exception as e:
                print(f" Failed to create index for {label}: {e}")

        print("\n=== BUILDING COMPLEX OBJECTS ===")
        for label in COMPLEX_OBJECTS:
            try:
                if label == "CI_SC":
                    session.execute_write(build_ci_sc)
                else:
                    # Bind the label as a default so the lambda does not depend on the loop variable.
                    session.execute_write(lambda tx, lbl=label: build_complex_objects(tx, lbl))
            except Exception as e:
                print(f"Failed for {label}: {e}")


### Objects and Resources Relations


There are interactions that have foreign keys to incidents that are missing in our data. For these cases I created new Incident nodes that only have a sysId and a new attribute derivedFromInteraction; these nodes do not have relations to Record nodes.

In [279]:
# Build the relations between the object nodes; the log below lists the indexes
# that are ensured first and the relation types that are created.
build_object_Relations(db_connection)


=== INDEXES for RELATIONSHIP BUILDING ===
Index ensured for :Record(changeId)
Index ensured for :Record(kmNumber)
Index ensured for :Record(relatedIncident)
Index ensured for :Record(relatedInteraction)
Index ensured for :Record(ciNameAff)
Index ensured for :Record(serviceComponentAff)
Index ensured for :Record(ciNameCby)
Index ensured for :Record(serviceComponentCBy)
Index ensured for :CI_SC(ciName)
Index ensured for :CI_SC(scName)

=== BUILDING OBJECT Relations ===
Created  USED_KM Relations for Incident.
Created  USED_KM Relations for Interaction.
Created RELATED_CHANGE Relations (Incident → Change).
 Created RELATED_CI Relations.
 Created RELATED_SC Relations.
Created RELATED_INCIDENT Relations and new Incident nodes
 Created AFFECTED_CI_SC Relations (Incident → CI_SC).
 Created AFFECTED_CI_SC Relations (Interaction → CI_SC).
 Created AFFECTED_CI_SC Relations (Change → CI_SC).
 Created CAUSED_BY_CI_SC Relations (Incident → CI_SC).
 Created CAUSED_BY_CI_SC Relations (Interaction → 

Add a generic `Event` label

In [280]:
# Configuration of the event node types, keyed by the event node label.
# NOTE(review): the code that consumes this dict is not visible in this part of the
# notebook, so the key meanings below are inferred from their names — confirm them.
#   log          – presumably the name of the source log (CSV file)
#   source_label – presumably the label of the object node the events belong to
#   id_field     – presumably the property holding that object's identifier
#   mappings     – presumably activity name -> property holding its timestamp
EVENTS = {
    "IncidentEvent": {
        "log": "BPIC14Incident.csv",
        "source_label": "Incident",
        "id_field": "incidentId",
        "mappings": {
            "Opened": "openTime",
            "Resolved": "resolvedTime",
            "Closed": "closeTime",
        },
    },
    "ChangeEvent": {
        "log": "Detail_Change.csv",
        "source_label": "Change",
        "id_field": "changeId",
        "mappings": {
            "Start": "actualStart",
            "End": "actualEnd",
        },
    },
    "InteractionEvent": {
        "log": "BPIC14Interaction.csv",
        "source_label": "Interaction",
        "id_field": "interactionId",
        "mappings": {
            "Open": "openTime",
            "Close": "closeTime",
        },
    },
    # This entry has a different shape (no "source_label"/"mappings"); the build log
    # further down reports that IncidentActivityEvent is skipped and handled separately.
    "IncidentActivityEvent": {
        "log": "Detail_Incident_Activity.csv",
        "id_field": "incidentId",
        "activityNumber": "activityNumber",
        "attributes": ["activity", "timestamp"],
    }
}

In [282]:
# Create the event nodes for the objects; per the log below, IncidentActivityEvent
# is skipped in this step and handled separately.
build_events_for_objects(db_connection)


=== INDEXES FOR EVENT ===
Index for :IncidentEvent(sysId)
Index for :ChangeEvent(sysId)
Index for :InteractionEvent(sysId)
Index for :IncidentActivityEvent(sysId)

=== BUILDING EVENT OBJECTS ===
      Created Opened IncidentEvent nodes
      Created Resolved IncidentEvent nodes
      Created Closed IncidentEvent nodes
      Created Start ChangeEvent nodes
      Created End ChangeEvent nodes
      Created Open InteractionEvent nodes
      Created Close InteractionEvent nodes
 Skipping IncidentActivityEvent, handled separately.


In [284]:
# Create the IncidentActivityEvent nodes and their relations (the step skipped by the previous cell).
build_incidentactivityevent_nodes(db_connection)


=== BUILDING INCIDENT ACTIVITY EVENTS ===
Created IncidentActivityEvent nodes and Relations.


- (Resource)-[:RELATED]->(Incident) through IncidentActivityEvent
- (IncidentEvent|IncidentActivityEvent)-[:CORR]->(CI_SC) through Incident
- (InteractionEvent)-[:CORR]->(CI_SC) through Incident
- (ChangeEvent)-[:CORR]->(CI_SC) through Change
- (IncidentEvent|IncidentActivityEvent|ChangeEvent|InteractionEvent) -[:CORR]-> (CI|SC) through CI_SC

In [286]:
# Create the :RELATED and :CORR relations described in the list above.
build_cross_object_Relations(db_connection)


=== BUILDING CROSS-OBJECT Relations ===
 Created :RELATED Relations between Resource and Incident
 Created :CORR edges between IncidentEvents/IncidentActivityEvents and CI_SC
 Created :CORR edges from ChangeEvent to CI_SC
 Created :CORR edges from InteractionEvent to CI_SC
 Created :CORR edges from all Events to ConfigurationItem and ServiceComponent (via CI_SC)


In [287]:
def build_related_activity_edges(db_connection):
    """
    Create RELATED_ACTIVITY edges between IncidentEvent and IncidentActivityEvent
    based on activity mapping (Opened → Open, Resolved → Resolved, Closed → Closed)
    for events correlated to the same Incident.

    Each mapping is processed with one ``apoc.periodic.iterate`` call. A failure
    for one mapping is printed and does not stop the remaining mappings.

    :param db_connection: connection object exposing
        ``driver.get_session(database=...)`` and ``db_name``.
    """
    activity_map = {
        "Opened": "Open",
        "Resolved": "Resolved",
        "Closed": "Closed"
    }

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== BUILDING RELATED_ACTIVITY EDGES ===")

        for inc_act, act_act in activity_map.items():
            # parallel is off on purpose: creating a relationship locks both end
            # nodes, and the same IncidentEvent can occur in several batches, so
            # parallel batches can deadlock (APOC then only counts them as failed).
            query = f"""
            CALL apoc.periodic.iterate(
                '
                MATCH (e:IncidentEvent)-[:CORR]->(i:Incident)
                MATCH (ae:IncidentActivityEvent)-[:CORR]->(i)
                WHERE e.activity = "{inc_act}" AND ae.activity = "{act_act}"
                RETURN e, ae
                ',
                '
                MERGE (e)-[:RELATED_ACTIVITY]->(ae)
                ',
                {{batchSize: 2000, parallel: false}}
            )
            YIELD total, failedBatches, errorMessages
            RETURN total, failedBatches, errorMessages
            """
            try:
                # single() consumes the result, so the call has really finished
                # (and any error has been raised) before success is reported.
                summary = session.run(query).single()
                # apoc.periodic.iterate reports failed batches instead of raising.
                if summary is not None and summary["failedBatches"]:
                    raise RuntimeError(
                        f"{summary['failedBatches']} batch(es) failed: {summary['errorMessages']}"
                    )
                print(f" Created RELATED_ACTIVITY edges for {inc_act} → {act_act}")
            except Exception as e:
                print(f" Failed for {inc_act} → {act_act}: {e}")


### DF edges

In [289]:
def _df_iterate_query(match_pattern, object_type, id_property, df_type=None):
    """
    Return the apoc.periodic.iterate call that MERGEs DF edges between
    consecutive events (ordered by timestamp, then node id) of every object
    matched by ``match_pattern``. The pattern must bind the event as ``e`` and
    the object as ``o``.
    """
    df_props = f'objectType:"{object_type}", id:objSysId'
    if df_type is not None:
        df_props += f', dfType:"{df_type}"'

    return f"""
    CALL apoc.periodic.iterate(
        '
        MATCH {match_pattern}
        WHERE e.timestamp IS NOT NULL
        WITH o, e ORDER BY e.timestamp, ID(e)
        WITH o, collect(e) AS events
        UNWIND range(0, size(events)-2) AS idx
        WITH events[idx] AS fromEv, events[idx+1] AS toEv, o.{id_property} AS objSysId
        RETURN fromEv, toEv, objSysId
        ',
        '
        WITH fromEv, toEv, objSysId
        MERGE (fromEv)-[rel:DF {{{df_props}}}]->(toEv)
        ',
        {{batchSize:1000, parallel:false}}
    ) YIELD total, failedBatches, errorMessages
    RETURN total, failedBatches, errorMessages
    """


def _report_failed_batches(description, rows):
    """Print a warning for every result row in which APOC reported failed batches."""
    for row in rows:
        if row.get("failedBatches"):
            print(
                f"WARNING: {description}: {row['failedBatches']} batch(es) failed: "
                f"{row.get('errorMessages')}"
            )


def build_df_edges(db_connection):
    """
    Build :DF edges for all objects except KnowledgeDocument, ConfigurationItem
    and ServiceComponent.

    - Incident: one set of DF edges per incident-related event label (stored in
      the edge property ``dfType``), following CORR relations.
    - Resource: DF edges between the events linked through EXECUTED_BY.
    - Every other object type: DF edges between the events linked through CORR.

    The APOC result of every call is printed; failed batches are reported as a
    warning, because apoc.periodic.iterate does not raise on them.

    :param db_connection: connection object exposing
        ``driver.get_session(database=...)`` and ``db_name``.
    """

    all_objects = {**SIMPLE_OBJECTS, **RESOURCES, **COMPLEX_OBJECTS}
    excluded_objects = {"KnowledgeDocument", "ConfigurationItem", "ServiceComponent"} # add here objects for which DF are not needed
    for key in excluded_objects:
        all_objects.pop(key, None)

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("=== BUILDING DIRECTLY_FOLLOWS (DF) EDGES ===")

        for obj_label, obj_data in all_objects.items():
            # Without a configured id property the query would read "o.None" (always
            # null), and MERGE rejects null property values. Fall back to the sysId
            # property that the object builders in this notebook use as merge key.
            # NOTE(review): confirm that sysId is the intended DF id in that case.
            sys_id_attr = obj_data.get("sysId") or "sysId"

            if obj_label == "Incident":
                # Discover incident-related event labels
                query_labels = """
                MATCH (n:Event)-[:CORR]->(:Incident)
                UNWIND labels(n) AS lbl
                WITH DISTINCT lbl
                WHERE lbl <> 'Event'
                RETURN collect(lbl) AS incident_event_labels
                """
                result = session.run(query_labels).single()
                labels = result["incident_event_labels"] if result else []
                print(f"Found incident event labels: {labels}")

                for event_label in labels:
                    query = _df_iterate_query(
                        match_pattern=f"(e:Event:`{event_label}`)-[:CORR]->(o:Incident)",
                        object_type="Incident",
                        id_property="incidentId",
                        df_type=event_label,
                    )
                    res = session.run(query).data()
                    print(f"Incident {event_label} result:", res)
                    _report_failed_batches(f"Incident {event_label}", res)
                continue

            # Resources are linked to their events through EXECUTED_BY, all other objects through CORR.
            rel_type = "EXECUTED_BY" if obj_label == "Resource" else "CORR"
            query = _df_iterate_query(
                match_pattern=f"(e:Event)-[:{rel_type}]->(o:{obj_label})",
                object_type=obj_label,
                id_property=sys_id_attr,
            )
            res = session.run(query).data()
            print(f"{obj_label} DF creation result:", res)
            _report_failed_batches(obj_label, res)


### Statistics

In [292]:
# Print the node counts per label and the relationship counts per type.
get_graph_statistics(db_connection)


=== GRAPH STATISTICS ===

--- Node counts ---
Event                          932164
Record                         690622
IncidentActivityEvent          466737
InteractionEvent               294008
Interaction                    147004
IncidentEvent                  138038
Incident                       47057
ChangeEvent                    33381
Change                         18000
CI_SC                          15356
ConfigurationItem              15134
KnowledgeDocument              2373
ServiceComponent               340
Resource                       242
Log                            4
Entity                         0
Activity                       0
RecordType                     0

--- Relationship counts ---
CORR                           3771566
EXTRACTED_FROM                 2993334
DF                             2071228
CONTAINS                       690622
EXECUTED_BY                    466737
AFFECTED_CI_SC                 223734
USED_KM                        193610
RELA