In [2]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

pd.set_option('display.width', 2000)

### Define the project that you want to do analysis on

In [3]:
case_study = 'bpic14'
use_sample = False

In [4]:
# retrieve configuration for case_study
conf_path = Path(case_study, 'config.yaml')
config = yaml.safe_load(open(conf_path))

print(f"These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


### Prepare so we can use PromG to load the data and execute queries

In [5]:
config = Configuration.init_conf_with_config_file(conf_path)
db_connection = DatabaseConnection.set_up_connection(config=config)
perf = Performance.set_up_performance(config=config)
dataset_descriptions = DatasetDescriptions(config=config)

0it [00:00, ?it/s]

In [6]:
def reset_pbar(pbar=perf.pbar, total=None):
    # clear db
    pbar.reset()
    # TODO update dragons in PromG, #update method to set total for pbar
    pbar.total = total
    pbar.set_postfix_str()

#### Prepare the DB

In [7]:
# read the semantic header --> this details how the data should be structured
semantic_header = SemanticHeader.create_semantic_header(config=config)

In [8]:
# Clear the DB (if use_sample = False, this should not take long on a loaded database)
db_manager = DBManagement(db_connection=db_connection, semantic_header=None)
db_manager.clear_db(replace=True);  # in the community version of neo4j, replace is not allowed. In that case, set replace=False

1it [00:07,  7.70s/it, clear_db: took 7.69 seconds]

  perf.perf = pd.concat([perf.perf, pd.DataFrame.from_records([


### Statistics

In [9]:
def get_graph_statistics(db_connection):
    """
    Statistics about nodes and relations.
    """

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== GRAPH STATISTICS ===")

        try:
            node_query = """
            MATCH (n)
            WITH n, labels(n) as labels
            RETURN reduce(label_str = "(", l in labels | label_str + ":" + l) + ")" as label, count(n) as count ORDER BY count DESC
            """
            node_counts = session.run(node_query)
            print("\n--- Node counts ---")
            for record in node_counts:
                print(f"{record['label']:<30} {record['count']}")

            rel_query = """
            MATCH (n) - [r] -> (n2)
            RETURN "[:" + type(r) + "]" as  type, count(r) as count ORDER BY count DESC
            """
            rel_counts = session.run(rel_query)
            print("\n--- Relationship counts ---")
            for record in rel_counts:
                print(f"{record['type']:<30} {record['count']}")

            total_nodes = session.run("MATCH (n) RETURN count(n) AS total").single()["total"]
            total_rels = session.run("MATCH ()-[r]->() RETURN count(r) AS total").single()["total"]

            print("\n--- Totals ---")
            print(f"Total nodes: {total_nodes}")
            print(f"Total relationships: {total_rels}")

        except Exception as e:
            print(f"Failed to get graph statistics: {e}")


### Load the data

In [10]:
use_sample = False
oced_pg = OcedPg(database_connection=db_connection,
                 dataset_descriptions=dataset_descriptions,
                 semantic_header=semantic_header,
                 use_sample=use_sample)


In [11]:
# first, we load all records
# (if use_sample = False, this should take less than 2 minutes)
reset_pbar(total=11)
oced_pg.load();

  0%|          | 0/11 [00:00<?, ?it/s]             



100%|██████████| 11/11 [00:34<00:00,  2.95s/it, _filter_nodes for BPIC14Interaction: took 0.0 seconds]                     

In [12]:
get_graph_statistics(db_connection)


=== GRAPH STATISTICS ===

--- Node counts ---
(:Record)                      690622
(:Log)                         4

--- Relationship counts ---
[:CONTAINS]                    690622

--- Totals ---
Total nodes: 690626
Total relationships: 690622


# EKG creation

Pipeline: Objects (simple & complex objects) + Resources Nodes-> Relations between objects and Resources -> Events nodes -> Relations between events and events & objects

### Objects Nodes

Create objects out of the Record nodes directly.
Those are I
- Incident
- Interaction
- Change
- Knowledge Document
- Resource.

In [13]:
objects = {
    "Incident": {
        "log": "BPIC14Incident.csv",
        "sysId": "incidentId",
        "attributes": ["incidentId",
                       "status",
                       "impact",
                       "priority",
                       "category",
                       "handleTimeHours",
                       "closureCode",
                       "alertStatus",
                       "numReassignments",
                       "numRelatedInteractions",
                       "numRelatedIncidents",
                       "numRelatedChanges"
        ],
    },
    "Interaction": {
        "log": "BPIC14Interaction.csv",
        "sysId": "interactionId",
        "attributes": ["interactionId",
                       "status",
                       "impact",
                       "priority",
                       "category",
                       "handleTimeSecs",
                       "closureCode",
                       "firstCallResolution"],
    },
    "Change":{
        "log": "Detail_Change.csv",
        "sysId": "changeId",
        "attributes": ["changeId",
                       "changeType",
                       "riskAssessment",
                       "cabApprovalNeeded",
                       "plannedStart",
                       "plannedEnd",
                       "scheduledDowntimeStart",
                       "scheduledDowntimeEnd",
                       "requestedEndDate",
                       "originatedFrom",
                       "numRelatedInteractions",
                       "numRelatedIncidents"
        ],
    },
    "KnowledgeDocument": {
        "log": None,
        "sysId": "kmNumber",
        "attributes": ["kmNumber"]
    },
    "Resource":{
        "log": "Detail_Incident_Activity.csv",
        "sysId": "assignmentGroup",
        "attributes": ["assignmentGroup"]
    }
}

In [14]:
def create_index(_db_connection, _label):
    index_query_str = f"""
        CREATE INDEX $index_name IF NOT EXISTS
        FOR (n:$label)
        ON (n.sysId)
    """

    index_query = Query(query_str=index_query_str,
                  parameters={
                      "index_name": f"{_label.lower()}_sysId_index"
                  },
                  template_string_parameters={
                      "label": _label
                  })

    _db_connection.exec_query(index_query)
    print(f"Index for :{_label}(sysId)")

def build_object(_label, _config):
    iterate_query = """
        :auto
        MATCH (l:Log)-[:CONTAINS]->(r:Record)
        WHERE r.$sysId_field IS NOT NULL $log_name_condition
        WITH r.$sysId_field AS sysId, r
        CALL (sysId, r) {
             MERGE (n:$label {sysId: sysId})
             MERGE (n)-[:EXTRACTED_FROM]->(r)
             SET $attr_updates
        } IN TRANSACTIONS
    """

    query = Query(
        query_str=iterate_query,
        parameters={
            "log_name": _config["log"],
        },
        template_string_parameters={
            "label": _label,
            "sysId_field":  _config["sysId"],
            "log_name_condition": "AND l.name = $log_name" if _config["log"] else "",
            "attr_updates": ", ".join([f"n.{attr} = r.{attr}" for attr in _config["attributes"]])
        }
    )
    db_connection.exec_query(query)
    print(f"→ {_label} nodes created.")

def build_objects(_db_connection):
    """
    Create simple Object. Includes indexing.
    """
    print("\n=== INDEXES ===")
    for _label in objects.keys():
        try:
            create_index(_db_connection=_db_connection,
                         _label = _label)
        except Exception as e:
            print(f"Failed to create index for {_label}: {e}")

    print(f"\n=== Building OBJECT NODES ===")

    for _label, _config in objects.items():
        try:
            build_object(_label=_label,
                     _config=_config)
        except Exception as e:
            print(f"Failed for {_label}: {e}")





In [15]:
build_objects(db_connection)


=== INDEXES ===
Index for :Incident(sysId)
Index for :Interaction(sysId)
Index for :Change(sysId)
Index for :KnowledgeDocument(sysId)
Index for :Resource(sysId)

=== Building OBJECT NODES ===
→ Incident nodes created.
→ Interaction nodes created.
→ Change nodes created.
→ KnowledgeDocument nodes created.
→ Resource nodes created.


In [16]:
get_graph_statistics(db_connection)


=== GRAPH STATISTICS ===

--- Node counts ---
(:Record)                      690622
(:Interaction)                 147004
(:Incident)                    46606
(:Change)                      18000
(:KnowledgeDocument)           2373
(:Resource)                    242
(:Log)                         4

--- Relationship counts ---
[:EXTRACTED_FROM]              1350969
[:CONTAINS]                    690622

--- Totals ---
Total nodes: 904851
Total relationships: 2041591


"Complex objects" are objects that require transformation during the extraction from Record nodes. Configuration Item (CI), Service Component (SC) and combined object CI_SC are complex objects. CI_SC includes additional attributes affected & caused (boolean values) to identify whether CI_SC node was affected entity or the entity that cause disruption.

The name "Complex is used as a placeholder.


In [23]:
COMPLEX_OBJECTS = {
    "ConfigurationItem": {
        "log": None,
        "sysId": "ciName",
        "attributes": {
            "affected": {
                "ciName": "ciNameAff",
                "ciType": "ciTypeAff",
                "ciSubtype": "ciSubtypeAff",
            },
            "caused": {
                "ciName": "ciNameCby",
                "ciType": "ciTypeCby",
                "ciSubtype": "ciSubtypeCby",
            },
        },
    },
    "ServiceComponent": {
        "log": None,
        "sysId": "scName",
        "attributes": {
            "affected": {"scName": "serviceComponentAff"},
            "caused": {"scName": "serviceComponentCBy"}
        },
    },

}


In [75]:
def build_complex_objects(tx, label):
    """
    Create complex nodes. Includes indexing
    """
    config = COMPLEX_OBJECTS[label]
    attributes = config["attributes"]
    sysId_field = config["sysId"]

    for role, attr in attributes.items():
        if label == "ConfigurationItem":
            name_attr = attr["ciName"]
            type_attr = attr["ciType"]
            subtype_attr = attr["ciSubtype"]

            source_query = f"""
            MATCH (r:Record)
            WHERE r.{name_attr} IS NOT NULL
            RETURN r.{name_attr} AS sysId,
                   r.{type_attr} AS type,
                   r.{subtype_attr} AS subtype,
                   r
            """

            iterate_query = f"""
            MERGE (ci:{label} {{sysId: sysId}})
            SET ci.{sysId_field} = sysId,
                ci.ciType = type,
                ci.ciSubtype = subtype,
                ci.{role} = true
            MERGE (ci)-[:EXTRACTED_FROM]->(r)
            """

        elif label == "ServiceComponent":
            name_attr = attr["scName"]

            source_query = f"""
            MATCH (r:Record)
            WHERE r.{name_attr} IS NOT NULL
            RETURN r.{name_attr} AS sysId, r
            """

            iterate_query = f"""
            MERGE (sc:{label} {{sysId: sysId}})
            SET sc.{sysId_field} = sysId,
                sc.{role} = true
            MERGE (sc)-[:EXTRACTED_FROM]->(r)
            """

        apoc_query = f"""
        CALL apoc.periodic.iterate(
            "{source_query}",
            "{iterate_query}",
            {{batchSize: 1000, parallel: false}}
        )
        YIELD total, batches, failedBatches
        RETURN total
        """

        tx.run(apoc_query)

    print(f"→ {label} nodes created.")



def build_all_complex_objects(db_connection):
    """
    Create indexes and build complex objects
    """
    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== INDEXES for COMPLEX OBJECTS ===")
        for label in COMPLEX_OBJECTS.keys():
            try:
                session.run(
                    f"CREATE INDEX {label.lower()}_sysId_index IF NOT EXISTS "
                    f"FOR (n:{label}) ON (n.sysId)"
                )
                print(f"Index for :{label}(sysId)")
            except Exception as e:
                print(f" Failed to create index for {label}: {e}")

        print("\n=== BUILDING COMPLEX OBJECTS ===")
        for label in COMPLEX_OBJECTS.keys():
            try:
                if label == "CI_SC":
                    session.execute_write(build_ci_sc)
                else:
                    session.execute_write(lambda tx: build_complex_objects(tx, label))
            except Exception as e:
                print(f"Failed for {label}: {e}")


In [None]:
COMPLEX_OBJECTS = {
    "CI_SC": {
        "log": None,
        "sysId": "sysId",  # CI_SC sysId = ciName + '_' + scName
        "attributes": {
            "affected": {"ciName": "ciNameAff", "scName": "serviceComponentAff"},
            "caused": {"ciName": "ciNameCby", "scName": "serviceComponentCBy"},
        },
    },
}
def build_ci_sc(tx):
    """
    Create combined CI_SC nodes (can be both affected and caused).
    Handles nulls with 'ci_null' and 'sc_null'.
    """

    config = COMPLEX_OBJECTS["CI_SC"]
    attributes = config["attributes"]

    for role, attr in attributes.items():
        ci_field = attr["ciName"]
        sc_field = attr["scName"]


        source_query = f"""
        MATCH (r:Record)
        WHERE r.{ci_field} IS NOT NULL OR r.{sc_field} IS NOT NULL
        RETURN
            coalesce(r.{ci_field}, 'ci_null') AS ciName,
            coalesce(r.{sc_field}, 'sc_null') AS scName,
            r
        """

        iterate_query = f"""
        MERGE (cs:CI_SC {{sysId: ciName + '_' + scName}})
        SET cs.ciName = ciName,
            cs.scName = scName,
            cs.{role} = true
        MERGE (cs)-[:EXTRACTED_FROM]->(r)
        """

        apoc_query = f"""
        CALL apoc.periodic.iterate(
            "{source_query}",
            "{iterate_query}",
            {{batchSize: 1000, parallel: false}}
        )
        YIELD total, batches, failedBatches
        RETURN total
        """

        res = tx.run(apoc_query)
        record = res.single()

    print("→ CI_SC nodes created.")

In [76]:
build_all_complex_objects(db_connection)


=== INDEXES for COMPLEX OBJECTS ===
Index for :ConfigurationItem(sysId)
Index for :ServiceComponent(sysId)
Index for :CI_SC(sysId)

=== BUILDING COMPLEX OBJECTS ===
→ ConfigurationItem nodes created.
→ ServiceComponent nodes created.
→ CI_SC nodes created.


In [77]:
for dictionary in [COMPLEX_OBJECTS]:
    for label in dictionary.keys():
        add_object_type_node(object_type=label)
        print(f'-> (:ObjectType {{objectType: "{label}"}} created.')

-> (:ObjectType {objectType: "ConfigurationItem"} created.
-> (:ObjectType {objectType: "ServiceComponent"} created.
-> (:ObjectType {objectType: "CI_SC"} created.


### Objects and Resources Relations

- (Incident|Interaction)-[:USED_KM]->(KnowledgeDocument)
- (Incident)-[:RELATED_CHANGE]->(Change)
- (CI_SC)-[:RELATED_SC]->(ServiceComponent)
- (CI_SC)-[:RELATED_CI]->(ConfigurationItem)
- (Interaction)-[:RELATED_INCIDENT]->(Incident)
- (Incident|Interaction|Change)-[:AFFECTED_CI_SC]->(CI_SC)
- (Incident|Interaction|Change)<-[:CAUSED_BY_CI_SC]-(CI_SC)


There are interactions that have foregn keys to incidents that are missing in out data. For these cases i created new Incident nodes that only have sysId and a new attribute derivedFromInteraction and the nodes do not have relations to Record nodes.

In [78]:
def build_object_Relations(db_connection):
    """
    Build Relations between Object nodes based on Record data:

    """

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== INDEXES for RELATIONSHIP BUILDING ===")

        indexes = [
            ("Record", "changeId"),
            ("Record", "kmNumber"),
            ("Record", "relatedIncident"),
            ("Record", "relatedInteraction"),
            ("Record", "ciNameAff"),
            ("Record", "serviceComponentAff"),
            ("Record", "ciNameCby"),
            ("Record", "serviceComponentCBy"),
            ("CI_SC", "ciName"),
            ("CI_SC", "scName"),
        ]

        for label, prop in indexes:
            try:
                session.run(
                    f"CREATE INDEX {label.lower()}_{prop}_index IF NOT EXISTS "
                    f"FOR (n:{label}) ON (n.{prop})"
                )
                print(f"Index ensured for :{label}({prop})")
            except Exception as e:
                print(f"Failed to create index for {label}.{prop}: {e}")

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== BUILDING OBJECT Relations ===")

        # INCIDENT / INTERACTION → KNOWLEDGE DOCUMENT
        for src_label in ["Incident", "Interaction"]:
            used_km_query = f"""
            CALL apoc.periodic.iterate(
                "
                MATCH (n:{src_label})-[:EXTRACTED_FROM]->(r)
                WHERE r.kmNumber IS NOT NULL
                RETURN n, r.kmNumber AS km
                ",
                "
                MATCH (m:KnowledgeDocument {{sysId: km}})
                MERGE (n)-[:USED_KM]->(m)
                ",
                {{batchSize: 2000, parallel: false}}
            )
            YIELD total
            RETURN total
            """
            res = session.run(used_km_query)
            record = res.single()
            print(f"Created  USED_KM Relations for {src_label}.")

        # :RELATED_CHANGE Relations (Incident → Change)
        related_change_query = """
        CALL apoc.periodic.iterate(
            "
            MATCH (n:Incident)-[:EXTRACTED_FROM]->(r)
            WHERE r.relatedChange IS NOT NULL
            RETURN n, r.relatedChange AS ch
            ",
            "
            MATCH (c:Change {sysId: ch})
            MERGE (n)-[:RELATED_CHANGE]->(c)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        res = session.run(related_change_query)
        record = res.single()
        print(f"Created RELATED_CHANGE Relations (Incident → Change).")

        # :RELATED_CI Relations (ConfigurationItem ← CI_SC)
        related_ci_query = """
        CALL apoc.periodic.iterate(
            "
            MATCH (cs:CI_SC)
            WHERE cs.ciName IS NOT NULL AND cs.ciName <> 'ci_null'
            WITH cs, cs.ciName AS ci
            MATCH (ciNode:ConfigurationItem {sysId: ci})
            RETURN cs, ciNode
            ",
            "
            MERGE (ciNode)<-[:RELATED_CI]-(cs)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            res = session.run(related_ci_query)
            rec = res.single()
            print(f" Created RELATED_CI Relations.")
        except Exception as e:
            print(f"Failed to create RELATED_CI: {e}")

        # :RELATED_SC Relations (ServiceComponent ← CI_SC)
        related_sc_query = """
        CALL apoc.periodic.iterate(
            "
            MATCH (cs:CI_SC)
            WHERE cs.scName IS NOT NULL AND cs.scName <> 'sc_null'
            WITH cs, cs.scName AS sc
            MATCH (scNode:ServiceComponent {sysId: sc})
            RETURN cs, scNode
            ",
            "
            MERGE (scNode)<-[:RELATED_SC]-(cs)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            res = session.run(related_sc_query)
            rec = res.single()
            print(f" Created RELATED_SC Relations.")
        except Exception as e:
            print(f" Failed to create RELATED_SC: {e}")

        # :RELATED_INCIDENT Relations (Interaction → Incident). On creation we add attribute "primary" to mark a pair
        # of Interaction(relatedIncident)->Incident(relatedInteraction) that refer each other using foreign keys (mentioned in ()). In addition, there are interactions
        # that refer to incidents that do not exist in the data, those incidents are created only with sysId and derivedFromInteraction = true attributes.
        related_incident_query = """
        CALL apoc.periodic.iterate(
            "
            MATCH (inter:Interaction)-[:EXTRACTED_FROM]->(r)
            WHERE r.relatedIncident IS NOT NULL
            RETURN inter, r.relatedIncident AS incId, r
            ",
            "
            MERGE (inc:Incident {sysId: incId})
                ON CREATE SET inc.derivedFromInteraction = true
            WITH inter, inc, r
            OPTIONAL MATCH (r2:Record)<-[:EXTRACTED_FROM]-(inc)
            MERGE (inter)-[rel:RELATED_INCIDENT]->(inc)
            SET rel.primary = CASE WHEN r2 IS NOT NULL AND r2.relatedInteraction = inter.sysId THEN true ELSE false END
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            res = session.run(related_incident_query)
            rec = res.single()
            print(f"Created RELATED_INCIDENT Relations and new Incident nodes")
        except Exception as e:
            print(f" Failed to create RELATED_SC: {e}")

        # :AFFECTED_CI_SC Relations
        for src_label in ["Incident", "Interaction", "Change"]:
            affected_query = f"""
            CALL apoc.periodic.iterate(
                "
                MATCH (src:{src_label})-[:EXTRACTED_FROM]->(r)
                WHERE r.ciNameAff IS NOT NULL OR r.serviceComponentAff IS NOT NULL
                WITH src,
                     coalesce(r.ciNameAff, 'ci_null') AS ciName,
                     coalesce(r.serviceComponentAff, 'sc_null') AS scName
                RETURN src, ciName, scName
                ",
                "
                MATCH (cs:CI_SC {{ciName: ciName, scName: scName}})
                MERGE (src)-[:AFFECTED_CI_SC]->(cs)
                ",
                {{batchSize: 2000, parallel: false}}
            )
            YIELD total
            RETURN total
            """
            try:
                res = session.run(affected_query)
                rec = res.single()
                print(f" Created AFFECTED_CI_SC Relations ({src_label} → CI_SC).")
            except Exception as e:
                print(f" Failed to create AFFECTED_CI_SC: {e}")

        # :CAUSED_CI_SC Relations
        for src_label in ["Incident", "Interaction", "Change"]:
            caused_query = f"""
            CALL apoc.periodic.iterate(
                "
                MATCH (src:{src_label})-[:EXTRACTED_FROM]->(r)
                WHERE r.ciNameCby IS NOT NULL OR r.serviceComponentCBy IS NOT NULL
                WITH src,
                     coalesce(r.ciNameCby, 'ci_null') AS ciName,
                     coalesce(r.serviceComponentCBy, 'sc_null') AS scName
                RETURN src, ciName, scName
                ",
                "
                MATCH (cs:CI_SC {{ciName: ciName, scName: scName}})
                MERGE (src)-[:CAUSED_BY_CI_SC]->(cs)
                ",
                {{batchSize: 2000, parallel: false}}
            )
            YIELD total
            RETURN total
            """
            try:
                res = session.run(caused_query)
                rec = res.single()
                print(f" Created CAUSED_BY_CI_SC Relations ({src_label} → CI_SC).")
            except Exception as e:
                print(f" Failed to create CAUSED_BY_CI_SC: {e}")

In [79]:
build_object_Relations(db_connection)


=== INDEXES for RELATIONSHIP BUILDING ===
Index ensured for :Record(changeId)
Index ensured for :Record(kmNumber)
Index ensured for :Record(relatedIncident)
Index ensured for :Record(relatedInteraction)
Index ensured for :Record(ciNameAff)
Index ensured for :Record(serviceComponentAff)
Index ensured for :Record(ciNameCby)
Index ensured for :Record(serviceComponentCBy)
Index ensured for :CI_SC(ciName)
Index ensured for :CI_SC(scName)

=== BUILDING OBJECT Relations ===
Created  USED_KM Relations for Incident.
Created  USED_KM Relations for Interaction.
Created RELATED_CHANGE Relations (Incident → Change).
 Created RELATED_CI Relations.
 Created RELATED_SC Relations.
Created RELATED_INCIDENT Relations and new Incident nodes
 Created AFFECTED_CI_SC Relations (Incident → CI_SC).
 Created AFFECTED_CI_SC Relations (Interaction → CI_SC).
 Created AFFECTED_CI_SC Relations (Change → CI_SC).
 Created CAUSED_BY_CI_SC Relations (Incident → CI_SC).
 Created CAUSED_BY_CI_SC Relations (Interaction → 

### Events and Relations

Add a generic label event

There are four types of events Incident Events, Incident Activity Events, Change Events and Interaction Events

In [80]:
EVENTS = {
    "IncidentEvent": {
        "log": "BPIC14Incident.csv",
        "source_label": "Incident",
        "id_field": "incidentId",
        "mappings": {
            "Opened": "openTime",
            "Resolved": "resolvedTime",
            "Closed": "closeTime",
        },
    },
    "ChangeEvent": {
        "log": "Detail_Change.csv",
        "source_label": "Change",
        "id_field": "changeId",
        "mappings": {
            "Start": "actualStart",
            "End": "actualEnd"
            #"OpenRecord": "changeRecordOpenTime",       #Added OpenRecord as additional event
            #"ClosedRecord": "changeRecordCloseTime",    #Added CloseRecord as additional event
        },
    },
    "InteractionEvent": {
        "log": "BPIC14Interaction.csv",
        "source_label": "Interaction",
        "id_field": "interactionId",
        "mappings": {
            "Open": "openTime",
            "Close": "closeTime",
        },
    },
    "IncidentActivityEvent": {
        "log": "Detail_Incident_Activity.csv",
        "id_field": "incidentId",
        "activityNumber": "activityNumber",
        "attributes": ["activity", "timestamp"],
    }
}

In [81]:
def build_events_for_objects(db_connection):
    """
    Event nodes for Interaction, Change, Incident, IncidentActivity handled separately
    """

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== INDEXES FOR EVENT ===")
        for label in EVENTS.keys():
            try:
                session.run(
                    f"CREATE INDEX {label.lower()}_sysid_index IF NOT EXISTS "
                    f"FOR (n:{label}) ON (n.sysId)"
                )
                print(f"Index for :{label}(sysId)")
            except Exception as e:
                print(f"Failed to create index for {label}: {e}")

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== BUILDING EVENT OBJECTS ===")

        for label, cfg in EVENTS.items():
            if "mappings" not in cfg:
                print(f" Skipping {label}, handled separately.")
                continue
            log_name = cfg["log"]
            id_field = cfg["id_field"]
            mappings = cfg["mappings"]
            source_label = cfg["source_label"]


            for activity, time_field in mappings.items():

                event_query = f"""
                CALL apoc.periodic.iterate(
                    "
                    MATCH (l:Log {{name: '{log_name}'}})-[:CONTAINS]->(r:Record)
                    WHERE r.{id_field} IS NOT NULL AND r.{time_field} IS NOT NULL
                    RETURN r.{id_field} AS sysIdrec, r.{time_field} AS ts, r
                    ",
                    "
                    MERGE (e:{label}:Event {{sysId: sysIdrec + '_{activity}'}})
                    SET e.activity = '{activity}', e.timestamp = ts, e.{id_field} = sysIdrec
                    MERGE (e)-[:EXTRACTED_FROM]->(r)
                    WITH e, r
                    MATCH (source:{source_label})-[:EXTRACTED_FROM]->(r)
                    MERGE (e)-[:CORR]->(source)

                    ",
                    {{batchSize: 2000, parallel: false}}
                )
                YIELD total
                RETURN total
                """
                try:
                    res = session.run(event_query)
                    rec = res.single()
                    print(f"      Created {activity} {label} nodes")
                except Exception as e:
                    print(f"      Failed for {label} {activity}: {e}")


In [82]:
build_events_for_objects(db_connection)


=== INDEXES FOR EVENT ===
Index for :IncidentEvent(sysId)
Index for :ChangeEvent(sysId)
Index for :InteractionEvent(sysId)
Index for :IncidentActivityEvent(sysId)

=== BUILDING EVENT OBJECTS ===
      Created Opened IncidentEvent nodes
      Created Resolved IncidentEvent nodes
      Created Closed IncidentEvent nodes
      Created Start ChangeEvent nodes
      Created End ChangeEvent nodes
      Created OpenRecord ChangeEvent nodes
      Created ClosedRecord ChangeEvent nodes
      Created Open InteractionEvent nodes
      Created Close InteractionEvent nodes
 Skipping IncidentActivityEvent, handled separately.


In [83]:
def build_incidentactivityevent_nodes(db_connection):
    """
    Event nodes for IncidentActivity.
    """

    with db_connection.driver.get_session(database=db_connection.db_name) as session:

        print("\n=== BUILDING INCIDENT ACTIVITY EVENTS ===")

        conf = EVENTS["IncidentActivityEvent"]
        log_name = conf["log"]
        incident_field = conf["id_field"]
        activity_number_field = conf["activityNumber"]
        attributes = conf["attributes"]

        attr_return = ", ".join([f"r.{a} AS {a}" for a in attributes])
        attr_set = ", ".join([f"e.{a} = r.{a}" for a in attributes])

        source_query = f"""
        MATCH (l:Log {{name: '{log_name}'}})-[:CONTAINS]->(r:Record)
        WHERE r.{incident_field} IS NOT NULL AND r.{activity_number_field} IS NOT NULL
        RETURN r.{incident_field} AS incidentId,
               r.{activity_number_field} AS activityNumber,
               r.assignmentGroup AS assignmentGroup,
               r.activity AS activity_name,
               {attr_return},
               r
        """

        iterate_query = f"""
        MERGE (e:IncidentActivityEvent:Event {{sysId: incidentId + '_' + activity_name+ '_' + activityNumber}})
        SET {", ".join([f"e.{a} = {a}" for a in attributes])},
            e.activityNumber = activityNumber,
            e.{incident_field} = incidentId
        MERGE (e)-[:EXTRACTED_FROM]->(r)
        WITH e, incidentId AS incId, assignmentGroup
        MATCH (i:Incident {{sysId: incId}})
        MERGE (e)-[:CORR]->(i)
        WITH e, assignmentGroup
        MATCH (res:Resource {{sysId: assignmentGroup}})
        MERGE (e)-[:EXECUTED_BY]->(res)
        """

        apoc_query = f"""
        CALL apoc.periodic.iterate(
            "{source_query}",
            "{iterate_query}",
            {{batchSize: 2000, parallel: false}}
        )
        YIELD total
        RETURN total
        """

        try:
            res = session.run(apoc_query)
            rec = res.single()
            print(f"Created IncidentActivityEvent nodes and Relations.")
        except Exception as e:
            print(f"Failed to build IncidentActivityEvent: {e}")


In [84]:
build_incidentactivityevent_nodes(db_connection)


=== BUILDING INCIDENT ACTIVITY EVENTS ===
Created IncidentActivityEvent nodes and Relations.


### Relations between Events and Events & Objects

- (Resource)-[:RELATED]->(Incident) through IncidentActivityEvent
- (IncidentEvent|IncidentActivityEvent)-[:CORR]->(CI_SC) through Incident
- (InteractionEvent)-[:CORR]->(CI_SC) through Interaction
- (ChangeEvent)-[:CORR]->(CI_SC) through Change
- (IncidentEvent|IncidentActivityEvent|ChangeEvent|InteractionEvent) -[:CORR]-> (CI|SC) through CI_SC

In [85]:
def build_cross_object_Relations(db_connection):
    """
    Relations between Events and Events & Objects
    """

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== BUILDING CROSS-OBJECT Relations ===")

        # Resource -[:RELATED]→ Incident (through IncidentActivityEvent)
        query_1 = """
        CALL apoc.periodic.iterate(
            "
            MATCH (res:Resource)<-[:EXECUTED_BY]-(e:IncidentActivityEvent)-[:CORR]->(inc:Incident)
            RETURN DISTINCT res, inc
            ",
            "
            MERGE (res)-[:RELATED]->(inc)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            session.run(query_1)
            print(" Created :RELATED Relations between Resource and Incident")
        except Exception as e:
            print(f" Failed creating Resource→Incident: {e}")

        # IncidentActivityEvent & IncidentEvent -[:CORR]→ CI_SC (through Incident)
        query_2 = """
        CALL apoc.periodic.iterate(
        '
        MATCH (e)-[:CORR]->(i:Incident)
        WHERE e:IncidentEvent OR e:IncidentActivityEvent
        OPTIONAL MATCH (i)-[:AFFECTED_CI_SC]->(cs1:CI_SC)
        OPTIONAL MATCH (i)<-[:CAUSED_BY_CI_SC]-(cs2:CI_SC)
        WITH e, collect(DISTINCT cs1) + collect(DISTINCT cs2) AS all_cs
        UNWIND all_cs AS cs
        WITH DISTINCT e, cs WHERE cs IS NOT NULL
        RETURN e, cs
        ',
        '
        MERGE (e)-[:CORR]->(cs)
        ',
        {batchSize: 2000, parallel: false}
    )
    YIELD total
    RETURN total
    """
        try:
            session.run(query_2)
            print(" Created :CORR edges between IncidentEvents/IncidentActivityEvents and CI_SC")
        except Exception as e:
            print(f" Failed creating IncidentEvents→CI_SC: {e}")


        # ChangeEvent-[:CORR]→ CI_SC (through Change)
        query_3 = """
        CALL apoc.periodic.iterate(
            "
            MATCH (e:ChangeEvent)-[:CORR]->(c:Change)-[:AFFECTED_CI_SC]->(cs:CI_SC)
            RETURN DISTINCT e, cs
            ",
            "
            MERGE (e)-[:CORR]->(cs)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            session.run(query_3)
            print(" Created :CORR edges from ChangeEvent to CI_SC")
        except Exception as e:
            print(f"   Failed creating ChangeEvent→CI_SC: {e}")


        # InteractionEvent-[:CORR]→CI_SC (through Interaction)
        query_4 = """
        CALL apoc.periodic.iterate(
            "
            MATCH (ie:InteractionEvent)-[:CORR]->(i:Interaction)-[:AFFECTED_CI_SC]->(cs:CI_SC)
            RETURN DISTINCT ie, cs
            ",
            "
            MERGE (ie)-[:CORR]->(cs)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            session.run(query_4)
            print(" Created :CORR edges from InteractionEvent to CI_SC")
        except Exception as e:
            print(f"   Failed creating InteractionEvent→CI_SC: {e}")

        #  All Event Types -[:CORR]-> CI and SC (through CI_SC)
        query_5 = """
        CALL apoc.periodic.iterate(
            "
            MATCH (e)-[:CORR]->(cs:CI_SC)
            OPTIONAL MATCH (cs)-[:RELATED_CI]->(ci:ConfigurationItem)
            OPTIONAL MATCH (cs)-[:RELATED_SC]->(sc:ServiceComponent)
            WITH e, collect(DISTINCT ci) + collect(DISTINCT sc) AS targets
            UNWIND targets AS t
            WITH DISTINCT e, t WHERE t IS NOT NULL
            RETURN e, t
            ",
            "
            MERGE (e)-[:CORR]->(t)
            ",
            {batchSize: 2000, parallel: false}
        )
        YIELD total
        RETURN total
        """
        try:
            session.run(query_5)
            print(" Created :CORR edges from all Events to ConfigurationItem and ServiceComponent (via CI_SC)")
        except Exception as e:
            print(f"  Failed creating Events→CI/SC: {e}")



In [86]:
build_cross_object_Relations(db_connection)


=== BUILDING CROSS-OBJECT Relations ===
 Created :RELATED Relations between Resource and Incident
 Created :CORR edges between IncidentEvents/IncidentActivityEvents and CI_SC
 Created :CORR edges from ChangeEvent to CI_SC
 Created :CORR edges from InteractionEvent to CI_SC
 Created :CORR edges from all Events to ConfigurationItem and ServiceComponent (via CI_SC)


RELATED_ACTIVITY edges between IncidentEvent and IncidentActivityEvent for activities (Opened → Open, Resolved → Resolved, Closed → Closed)

In [87]:
def build_related_activity_edges(db_connection):
    """
    Create RELATED_ACTIVITY edges between IncidentEvent and IncidentActivityEvent
    based on activity mapping (Opened → Open, Resolved → Resolved, Closed → Closed)
    for events correlated to the same Incident.
    """
    activity_map = {
        "Opened": "Open",
        "Resolved": "Resolved",
        "Closed": "Closed"
    }

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("\n=== BUILDING RELATED_ACTIVITY EDGES ===")

        for inc_act, act_act in activity_map.items():
            query = f"""
            CALL apoc.periodic.iterate(
                '
                MATCH (e:IncidentEvent)-[:CORR]->(i:Incident)
                MATCH (ae:IncidentActivityEvent)-[:CORR]->(i)
                WHERE e.activity = "{inc_act}" AND ae.activity = "{act_act}"
                RETURN e, ae
                ',
                '
                MERGE (e)-[:RELATED_ACTIVITY]->(ae)
                ',
                {{batchSize: 2000, parallel: true}}
            )
            YIELD total
            RETURN total
            """
            try:
                session.run(query)
                print(f" Created RELATED_ACTIVITY edges for {inc_act} → {act_act}")
            except Exception as e:
                print(f" Failed for {inc_act} → {act_act}: {e}")


In [88]:
build_related_activity_edges(db_connection)


=== BUILDING RELATED_ACTIVITY EDGES ===
 Created RELATED_ACTIVITY edges for Opened → Open
 Created RELATED_ACTIVITY edges for Resolved → Resolved
 Created RELATED_ACTIVITY edges for Closed → Closed


# Filter out all events of objects that have at least one event before cutoff
Cutoff = 2013-08-19T09:59:53.000000000+01:00

In [89]:
results = []
labels = ['Incident', 'Interaction', 'Change']

count_query = '''
    MATCH (ot:ObjectType) <- [:IS_OF_TYPE] - (o) - [] - (e:Event)
    WHERE ot.objectType in $labels
    WITH e, ot, o, e.timestamp < dateTime("2013-08-19T09:59:53.000000000+01:00") as before_cutoff
    WITH ot, o, collect(distinct before_cutoff) as before_cutoffs
    WHERE True in before_cutoffs
    MATCH (o) - [] - (all_e:Event)
    RETURN ot.objectType as _label, count(distinct o) as object_deleted, count(distinct all_e) as events_deleted
'''
query = Query(
    query_str=count_query,
    parameters={'labels': labels}
)
result = pd.DataFrame(db_connection.exec_query(query))
results.append(result)

delete_query_str = '''
    MATCH (ot:ObjectType) <- [:IS_OF_TYPE] - (o) - [] - (e:Event)
    WHERE ot.objectType in $labels
    WITH e, o, e.timestamp < dateTime("2013-08-19T09:59:53.000000000+01:00") as before_cutoff
    WITH o, collect(distinct before_cutoff) as before_cutoffs
    WHERE True in before_cutoffs
    MATCH (o) - [] - (all_e:Event)
    DETACH DELETE o
    DETACH DELETE all_e
'''

delete_query = Query(
    query_str=delete_query_str,
    parameters={'labels': labels}
)
db_connection.exec_query(delete_query)

df_result = pd.concat(results)
df_result

[]

Unnamed: 0,_label,object_deleted,events_deleted
0,Incident,238,8691
1,Interaction,451,902
2,Change,682,2378


In [90]:

query = '''
    MATCH (ot:ObjectType) <- [:IS_OF_TYPE] - (o) - [] - (e:Event)
    WHERE ot.objectType in $labels
    RETURN ot.objectType as _label, count(distinct o) as objects_kept, count(distinct e) as events_kept
'''
query = Query(
    query_str=query,
    parameters={'labels': labels}
)
df_result_kept = pd.DataFrame(db_connection.exec_query(query))
print(df_result_kept)

        _label  objects_kept  events_kept
0     Incident         46368       596012
1  Interaction        146553       293106
2       Change         17318        67003


In [91]:
df_both = df_result.set_index('_label').join(df_result_kept.set_index('_label'))
df_both['total_objects'] = df_both['objects_kept'] + df_both['object_deleted']
df_both['total_events'] = df_both['events_kept'] + df_both['events_deleted']
df_both['% events deleted'] = round(df_both['events_deleted'] / df_both['total_events'] * 100, 2)
df_both['% objects deleted'] = round(df_both['object_deleted'] / df_both['total_objects'] * 100, 2)
df_both = df_both[
    ['object_deleted', 'objects_kept', 'total_objects', '% objects deleted', 'events_deleted', 'events_kept',
     'total_events', '% events deleted']]
print(df_both)

             object_deleted  objects_kept  total_objects  % objects deleted  events_deleted  events_kept  total_events  % events deleted
_label                                                                                                                                  
Incident                238         46368          46606               0.51            8691       596012        604703              1.44
Interaction             451        146553         147004               0.31             902       293106        294008              0.31
Change                  682         17318          18000               3.79            2378        67003         69381              3.43


## Delete non-referred CI_SC

In [92]:
query = '''MATCH (ci_sc:CI_SC)
WHERE NOT EXISTS((ci_sc) <- [] - (:Event)) AND NOT EXISTS ((ci_sc) -- (:Incident|Change|Interaction))
DETACH DELETE ci_sc
RETURN count(ci_sc) as cnt'''

pd.DataFrame(db_connection.exec_query(query))

Unnamed: 0,cnt
0,323


### DF edges

Each DF has attributes:
- object (which object it correlates Incident, Interactions, Change, etc.)
- object identifier (sysid) (which object it belongs incidentId, interactionsId, changeId, etc.)
- dftype is an attribute that only exists for events of Incident objects that specifyes whether the DF relation connects IncidentActivityEvent or IncidentEvent Event nodes

In [93]:
def build_df_edges(db_connection):
    """
    Build :DF:* edges for all objects except KnowledgeDocument.
    Creates separate DF edges for each object type and incident event type.
    """

    all_objects = {**objects, **RESOURCES, **COMPLEX_OBJECTS}
    excluded_objects = {"KnowledgeDocument", "ConfigurationItem", "ServiceComponent"} # add here objects for which DF are not needed
    for key in excluded_objects:
        all_objects.pop(key, None)

    with db_connection.driver.get_session(database=db_connection.db_name) as session:
        print("=== BUILDING DIRECTLY_FOLLOWS (DF) EDGES (excluding Incident) ===")

        for obj_label, obj_data in all_objects.items():
            sys_id_attr = obj_data.get("sysId")

            query = f"""
            CALL apoc.periodic.iterate(
                '
                MATCH (e:Event)-[]-> (o) - [:IS_OF_TYPE] -> (ot:ObjectType {{objectType:"{obj_label}" }})
                WHERE e.timestamp IS NOT NULL
                WITH o, ot.objectType as oType, e ORDER BY e.timestamp, ID(e)
                WITH o, oType, collect(e) AS events
                UNWIND range(0, size(events)-2) AS idx
                WITH events[idx] AS fromEv, events[idx+1] AS toEv, o.{sys_id_attr} AS objSysId, oType
                RETURN fromEv, toEv, objSysId, oType
                ',
                '
                WITH fromEv, toEv, objSysId, oType
                MERGE (fromEv)-[rel:DF {{objectType:oType, id:objSysId}}]->(toEv)
                ',
                {{batchSize:1000, parallel:false}}
            ) YIELD total
            RETURN total
            """

            res = session.run(query).data()
            print(f"{obj_label} DF creation result:", res)


In [94]:
build_df_edges(db_connection)

=== BUILDING DIRECTLY_FOLLOWS (DF) EDGES (excluding Incident) ===
Incident DF creation result: [{'total': 549644}]
Interaction DF creation result: [{'total': 146553}]
Change DF creation result: [{'total': 49685}]
Resource DF creation result: [{'total': 458508}]
CI_SC DF creation result: [{'total': 984382}]


### Statistics

In [96]:
get_graph_statistics(db_connection)


=== GRAPH STATISTICS ===

--- Node counts ---
(:Record)                      690622
(:IncidentActivityEvent:Event) 458748
(:InteractionEvent:Event)      293106
(:Interaction)                 146553
(:IncidentEvent:Event)         137336
(:ChangeEvent:Event)           67003
(:Incident)                    46819
(:Change)                      17318
(:ConfigurationItem)           15134
(:CI_SC)                       15033
(:KnowledgeDocument)           2373
(:ServiceComponent)            340
(:Resource)                    242
(:ObjectType)                  8
(:Log)                         4

--- Relationship counts ---
[:EXTRACTED_FROM]              3036967
[:DF]                          2188772
[:CORR]                        1954578
[:CONTAINS]                    690622
[:EXECUTED_BY]                 458748
[:IS_OF_TYPE]                  243361
[:AFFECTED_CI_SC]              221660
[:USED_KM]                     192921
[:RELATED]                     113425
[:RELATED_ACTIVITY]            9

In [None]:
def add_object_type_node(object_type):
    query_str = '''
        MERGE (ot:ObjectType {objectType: '$label' })
        WITH ot
        MATCH (o:$label)
        WITH o, ot
        MERGE (o) - [:IS_OF_TYPE] -> (ot)
    '''

    query = Query(
        query_str=query_str,
        template_string_parameters={"label": object_type}
    )

    db_connection.exec_query(query)

for dictionary in [objects, RESOURCES]:
    for label in dictionary.keys():
        add_object_type_node(object_type=label)
        print(f'-> (:ObjectType {{objectType: "{label}"}} created.')