In [22]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one.
# This is why later cells end calls with `;` when their return value should not be shown.
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

# Use a very wide display width (2000 characters) so wide pandas output is not wrapped
pd.set_option('display.width', 2000)

### Define the project that you want to analyze

In [23]:
# Name of the case study; it is also the folder that holds config.yaml.
# Later cells branch on the values 'bpic14' and 'bpic17'.
case_study = 'bpic14'
# Initial value only: the bpic17 trial import sets it to True and the "Load the data" cell sets it back to False
use_sample = False

In [24]:
# retrieve configuration for case_study
conf_path = Path(case_study, 'config.yaml')
# Read the YAML inside a context manager so the file handle is closed right away
# (the bare open(...) call left it open until garbage collection).
with open(conf_path) as conf_file:
    # Raw dict of settings; `config` is rebound to a PromG Configuration object in the setup cell below
    config = yaml.safe_load(conf_file)

# Show the connection settings the notebook expects, so a mismatch with the local database is easy to spot.
# Single quotes inside the f-strings keep this cell valid on Python versions before 3.12.
# NOTE(review): the password is echoed into the saved cell output - clear outputs before sharing the notebook.
print("These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


### Set up PromG so we can load the data and execute queries

In [25]:
# Rebind `config`: from here on it is the PromG Configuration built from the config file,
# no longer the raw dict read with yaml above.
config = Configuration.init_conf_with_config_file(conf_path)
# The objects below are all created from that same configuration
db_connection = DatabaseConnection.set_up_connection(config=config)
perf = Performance.set_up_performance(config=config)  # its `pbar` attribute is the progress bar used by reset_pbar
dataset_descriptions = DatasetDescriptions(config=config)

In [26]:
def reset_pbar(pbar=None, total=None):
    """Reset the progress bar before starting a new PromG stage.

    Args:
        pbar: Progress bar to reset. When omitted, ``perf.pbar`` is looked up at
            call time, so re-creating ``perf`` does not leave this helper bound
            to the bar of an earlier ``perf`` object.
        total: Expected number of steps for the upcoming stage. It is assigned
            to ``pbar.total`` as-is, so ``None`` also overwrites the previous total.
    """
    if pbar is None:
        pbar = perf.pbar
    pbar.reset()
    # TODO: PromG has no method yet to set the total of its progress bar, so assign it directly
    pbar.total = total
    # presumably clears the status text left by the previous stage (tqdm-style API) - confirm
    pbar.set_postfix_str()

#### Prepare the DB

In [27]:
# Read the semantic header: it details how the data should be structured in the graph.
# It is created from the same PromG configuration as the database connection.
semantic_header = SemanticHeader.create_semantic_header(config=config)

In [28]:
# Clear the DB before loading.
# NOTE(review): the earlier note claimed this "should not take long on a loaded database", but the
# recorded run of this cell reports clear_db taking 2945.69 seconds (about 49 minutes) - expect a long wait.
db_manager = DBManagement(db_connection=db_connection, semantic_header=semantic_header)
db_manager.clear_db(
    replace=True);  # replace is not allowed in the community version of neo4j; set replace=False there. The `;` hides the return value in the cell output

42it [56:35, 765.20s/it, clear_db: took 2945.69 seconds]                                                    

In [29]:
# Set constraints in DB (the trailing `;` keeps the return value out of the cell output)
db_manager.set_constraints();

43it [56:36, 558.44s/it, set_constraints: took 0.76 seconds]

In [30]:
# Show the constraints that are now set, as a check on the previous cell.
# ignore_defaults=True presumably leaves out constraints that exist by default - confirm against PromG.
db_manager.get_constraints(ignore_defaults=True)

['activity_index',
 'record_id_range',
 'record_type_range',
 'unique_change_ids',
 'unique_configurationitem_ids',
 'unique_entity_ids',
 'unique_event_ids',
 'unique_eventtype_ids',
 'unique_incident_ids',
 'unique_interaction_ids',
 'unique_knowledgedocument_ids',
 'unique_objecttype_ids',
 'unique_resource_ids',
 'unique_servicecomponent_ids']

# For BPIC17 (a large dataset), we first test on a sample whether the data can be imported

In [31]:
# Trial import, only executed for bpic17: load a sample first to check that the import works.
if case_study == "bpic17":
    # Overrides the notebook-level setting; the "Load the data" cell sets it back to False
    use_sample = True
    oced_pg = OcedPg(database_connection=db_connection,
                     dataset_descriptions=dataset_descriptions,
                     semantic_header=semantic_header,
                     use_sample=use_sample)

    # first, we load all records (of the sample, since use_sample is True in this branch)
    # NOTE(review): the earlier timing note here ("if use_sample = False, less than 2 minutes") describes
    # the full load, not this sample run - confirm the expected duration for the sample.
    oced_pg.load()

In [32]:
# Sanity-check queries, reused later in the notebook: they count what is in the database,
# grouped by node label set and by relationship type respectively.
check_nodes = '''
    MATCH (n) 
    RETURN labels(n) as label, count(n) as count
'''

check_relations = '''
    MATCH () - [r] -> ()
    RETURN type(r) as label, count(r) as count
'''

# For the bpic17 trial import: report how many nodes and relations the sample produced
if case_study == "bpic17":
    node_count = db_connection.exec_query(check_nodes)
    rel_count = db_connection.exec_query(check_relations)
    sample_nodes_total = sum(entry['count'] for entry in node_count)
    sample_rels_total = sum(entry['count'] for entry in rel_count)
    print(f"In total, there are {sample_nodes_total} nodes and {sample_rels_total} relations.")


In [33]:
# Only for bpic17: remove the sample data again so the full load below starts from an empty database.
# A fresh DBManagement is created here, replacing the `db_manager` defined earlier.
if case_study == "bpic17":
    db_manager = DBManagement(db_connection=db_connection, semantic_header=semantic_header)
    db_manager.clear_db(
        replace=True);  # replace is not allowed in the community version of neo4j; set replace=False there

In [34]:
# Only for bpic17: the database was cleared after the trial import, so set the constraints again
if case_study == "bpic17":
    # Set constraints in DB
    db_manager.set_constraints();

### Load the data

In [35]:
# The real import always uses the full dataset: this resets use_sample after the bpic17 trial import
use_sample = False
# Importer object; the next cells call its load() and transform() methods
oced_pg = OcedPg(database_connection=db_connection,
                 dataset_descriptions=dataset_descriptions,
                 semantic_header=semantic_header,
                 use_sample=use_sample)


In [36]:
# first, we load all records
# (if use_sample = False, this should take less than 2 minutes)
# The progress bar total is hard-coded per case study: 12 steps for bpic14, 2 for any other case study
reset_pbar(total=12 if case_study=="bpic14" else 2)
oced_pg.load()

 92%|█████████▏| 11/12 [01:48<00:10, 10.17s/it, _filter_nodes for BPIC14Interaction: took 0.0 seconds]                      Failed to delete C:\Users\s156229\.Neo4jDesktop\relate-data\dbmss\dbms-c8928c7a-a8f6-468f-b6e5-cfd4d6bb5d76\import\BPIC14Incident__ServiceComponentCByRecord_RelatedInteractionRecord.csv. Reason: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\s156229\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-c8928c7a-a8f6-468f-b6e5-cfd4d6bb5d76\\import\\BPIC14Incident__ServiceComponentCByRecord_RelatedInteractionRecord.csv'
Failed to delete C:\Users\s156229\.Neo4jDesktop\relate-data\dbmss\dbms-c8928c7a-a8f6-468f-b6e5-cfd4d6bb5d76\import\BPIC14Interaction.csv. Reason: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\s156229\\.Neo4jDesktop\\relate-data\\dbmss\\dbms-c8928c7a-a8f6-468f-b6e5-cfd4d6bb5d76\\import\\BPIC14Interaction.csv'
Failed to delete C:\Users\s156229\.Neo

In [37]:
# (if use_sample = False, this should take less than 8 minutes)
# The progress bar total is hard-coded per case study: 133 steps for bpic14, 41 for any other case study.
# NOTE(review): the recorded bpic14 run reports 140 iterations, more than the 133 set here -
# confirm the expected number of steps.
reset_pbar(total=133 if case_study=="bpic14" else 41)
oced_pg.transform()  # transform the data using the semantic header

140it [04:08,  2.92it/s, _create_relations_using_nodes for (ot_from ) - [:RELATED_INTERACTION] -> (ot_to ): took 0.01 seconds]                                                                                   

In [38]:
# PromG does not have this capability yet, so we have to do it manually
# we have to lift the E2O relationships from the instance layer to the type layer
#
# What the query does:
#   1. It finds every (EventType, ObjectType) pair for which some Event that is INSTANCE_OF the event type
#      has a relationship to a node that is INSTANCE_OF the object type.
#   2. For each distinct (event type, object type, relationship type) combination it merges a relationship
#      of that same type from the EventType node to the ObjectType node (apoc.merge.relationship, so this
#      cell requires the APOC plugin).
#   3. It returns, per event type and relationship type, the collected object types.

# (if use_sample = False, this should take less than 3 minutes)
lift_e2o = '''
    MATCH (et:EventType)
    MATCH (ot:ObjectType)
    MATCH (et) <- [:INSTANCE_OF] - (:Event) - [e2o] -> () - [:INSTANCE_OF] -> (ot)
    WITH distinct et, ot, type(e2o) as e2o_type
    CALL apoc.merge.relationship(et, e2o_type,{},{},ot,{})
    YIELD rel as e2o_rel
    RETURN DISTINCT et.eventType as eType, e2o_type, collect(distinct ot.objectType) as oTypes
'''

results = db_connection.exec_query(lift_e2o)
# The table is printed without a header row; the columns are eType, e2o_type, oTypes (the RETURN order)
print(tabulate(results))

----------------------------  -------------  ---------------
Open Change Record            OPENS_RECORD   ['Change']
Close Change Record           CLOSES_RECORD  ['Change']
Planned Start                 CORR           ['Change']
Requested End                 CORR           ['Change']
Planned End                   CORR           ['Change']
Start Change Implementation   START          ['Change']
End Change Implementation     END            ['Change']
Open Incident                 OPENS          ['Incident']
Close Incident                CLOSES         ['Incident']
Reopen Incident               REOPENS        ['Incident']
Open Interaction              OPENS          ['Interaction']
Open Interaction              CLOSES         ['Interaction']
Close Interaction             CLOSES         ['Interaction']
Assignment                    ACTS_ON        ['Interaction']
Reassignment                  ACTS_ON        ['Interaction']
Operator Update               ACTS_ON        ['Interaction']
Descrip

In [39]:
# Verify the import and transformation by counting the nodes and relations now in the database
node_count = db_connection.exec_query(check_nodes)
rel_count = db_connection.exec_query(check_relations)

nodes_total = sum(entry['count'] for entry in node_count)
rels_total = sum(entry['count'] for entry in rel_count)
print(f"In total, there are {nodes_total} nodes and {rels_total} relations.")

# The record layer is not part of our schema, so the totals are also reported without it.
# Node labels come back as lists, hence the single-element lists in the exclusion list.
record_layer_labels = [['Record'], ['RecordType'], ['Log']]
record_layer_rel_types = ['IS_OF_TYPE', 'CONTAINS', 'EXTRACTED_FROM']
schema_nodes_total = sum(
    entry['count'] for entry in node_count if entry['label'] not in record_layer_labels)
schema_rels_total = sum(
    entry['count'] for entry in rel_count if entry['label'] not in record_layer_rel_types)
print(
    f"In total (excluding record layer), there are {schema_nodes_total} nodes "
    f"and {schema_rels_total} relations.")


In total, there are 1981224 nodes and 15647590 relations.
In total (excluding record layer), there are 1290575 nodes and 3441519 relations.


In [40]:
# Print node statistics: one row per label set with its node count.
# Reuses `node_count` from the previous cell; no header row is printed (columns: label, count).
print("Nodes")
print(tabulate(node_count))

Nodes
---------------------  -------
['RecordType']              23
['Log']                      4
['Record']              690622
['Event']              1061604
['EventType']               51
['ObjectType']               7
['ConfigurationItem']    14143
['ServiceComponent']       340
['Change']               18026
['Incident']             46616
['Interaction']         147173
['KnowledgeDocument']     2373
['Resource']               242
---------------------  -------


In [41]:
# Print relationship statistics: one row per relationship type with its count.
# Reuses `rel_count` from the count cell above; no header row is printed (columns: label, count).
print("Relationships")
print(tabulate(rel_count))

Relationships
-------------------------  -------
CONTAINS                    690622
IS_OF_TYPE                 3880534
EXTRACTED_FROM             7634915
OPENS_RECORD                 30276
INSTANCE_OF                1290517
CLOSES_RECORD                30276
START                        27018
END                          27015
CORR                         90785
RELATED_SERVICE_COMPONENT    14333
CHANGED_BY                   30125
OPENS                       193612
REOPENS                       2285
CLOSES                      340617
RELATED_CHANGE                 537
CAUSED_BY                        1
AFFECTED_CI                 193612
USED_KM                     193899
RELATED_INTERACTION          43059
ACTS_ON                     466776
EXECUTED_BY                 466776
-------------------------  -------
