In [12]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one
# (IPython's `ast_node_interactivity` setting).
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

# Widen the pandas text display so wide frames are not wrapped over several lines.
pd.set_option('display.width', 2000)

### Define the project that you want to do analysis on

In [13]:
# Case study to analyse. The name doubles as the directory that holds its config.yaml
# and selects the object types used further down.
case_study = 'bpic14'
# case_study = 'bpic17'
# When True, the DF aggregation below uses a much lower frequency threshold.
use_sample = False

In [14]:
# Retrieve the configuration for the selected case_study.
conf_path = Path(case_study, 'config.yaml')
# Read the YAML through a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with conf_path.open() as conf_file:
    config = yaml.safe_load(conf_file)

# NOTE(review): this echoes the database password into the saved notebook output.
# Fine for a local demo database, but clear the output before sharing real credentials.
# Single quotes are used inside the f-strings so the cell also parses on Python < 3.12.
print("These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


### Prepare so we can use PromG to load the data and execute queries

In [15]:
# Build the PromG configuration from the same config file.
# NOTE: this rebinds `config`, which held the raw YAML dict up to this point.
config = Configuration.init_conf_with_config_file(conf_path)
# Connection used by all queries below (db_connection.exec_query).
db_connection = DatabaseConnection.set_up_connection(config=config)

### Load the data

# Start Analysis
Make sure the data has been imported using import_data.ipynb. The imported data follows the schema defined in the paper.

# Analysis Goals = Process Discovery
Let's discover the process

In [16]:
# Object types analysed per supported case study; any other case study yields an empty list.
object_types = {
    "bpic17": ['Application', 'Offer', 'Workflow', 'CaseAO', 'CaseAW', 'CaseWO'],
    "bpic14": ["ConfigurationItem", "Incident", "Interaction", "Change"],
}.get(case_study, [])

# Every object type starts out with the same batch size.
batch_sizes = dict.fromkeys(object_types, 5000)

In [17]:
# To ensure we don't overload the database, we run the query for each object_type separately.
# The batch size is looked up per object type in `batch_sizes`, so it can be adapted for object
# types with few objects but many events each (or many objects with few events each).
# NOTE(review): the earlier remark quoted 149 resources and 31509 cases; those counts are not
# verified here, and `batch_sizes` is filled with one uniform value where it is defined.


# Discover DF for the objects in our data model
df_instance_query_str = '''
    CALL apoc.periodic.iterate('
        MATCH (o) - [:INSTANCE_OF] -> (ot:ObjectType {objectType: $object_type})
        RETURN o, ot
    ','
        MATCH (e:Event) - [e2o_instance] -> (o) // we remove the check on the event type level for performance increase, and we have no roles so check is redundant
        WITH o.id as oId, ot.objectType as oType, e order by e.timestamp, id(e)
        WITH oId, oType, collect(e) as events
        WITH oId, oType, events
        UNWIND range(0, size(events)-2) AS i
            WITH oId, oType, events[i] AS e1, events[i+1] AS e2
            MERGE (e1) - [df:DF {objectType:oType, id:oId}] -> (e2)',
        {batchSize:$batch_size, params:{object_type:$object_type}})
'''

for object_type in object_types:
    print(f"Discovering df for object_type: {object_type}")

    df_instance_query = Query(
        query_str=df_instance_query_str,
        parameters={
            "object_type": object_type,
            "batch_size": batch_sizes[object_type]
        }
    )

    results = db_connection.exec_query(df_instance_query)
    print(results)

    # apoc.periodic.iterate reports failures in its result row instead of raising, so check
    # explicitly rather than relying on someone reading the printed dictionary.
    failed_batches = sum((row.get('failedBatches') or 0) for row in (results or []))
    if failed_batches:
        print(
            f"WARNING: {failed_batches} failed batch(es) for object_type {object_type}; "
            f"consider increasing the memory of the db or lowering the batch size"
        )

Discovering df for object_type: ConfigurationItem
[{'batches': 3, 'total': 14143, 'timeTaken': 0, 'committedOperations': 14143, 'failedOperations': 0, 'failedBatches': 0, 'retries': 0, 'errorMessages': {}, 'batch': {'total': 3, 'errors': {}, 'committed': 3, 'failed': 0}, 'operations': {'total': 14143, 'errors': {}, 'committed': 14143, 'failed': 0}, 'wasTerminated': False, 'failedParams': {}, 'updateStatistics': {'relationshipsDeleted': 0, 'relationshipsCreated': 0, 'nodesDeleted': 0, 'nodesCreated': 0, 'labelsRemoved': 0, 'labelsAdded': 0, 'propertiesSet': 0}}]
Discovering df for object_type: Incident
[{'batches': 10, 'total': 46616, 'timeTaken': 1, 'committedOperations': 46616, 'failedOperations': 0, 'failedBatches': 0, 'retries': 0, 'errorMessages': {}, 'batch': {'total': 10, 'errors': {}, 'committed': 10, 'failed': 0}, 'operations': {'total': 46616, 'errors': {}, 'committed': 46616, 'failed': 0}, 'wasTerminated': False, 'failedParams': {}, 'updateStatistics': {'relationshipsDeleted'

For BPIC17, we have reified entities, so we will find duplicate DFs.

As CaseAO, CaseAW and CaseWO are reified entities, we will find duplicate DFs.

That is, if we have `(e1:Event) - [:DF {objectType: 'Offer'}] -> (e2:Event)` and there is no Application event `e3` in between,

then we will also have `(e1:Event) - [:DF {objectType: 'CaseAO'}] -> (e2:Event)`.

Let's get rid of these duplicate DFs for the reified entities.

For BPIC14, running the query does not change anything.


In [18]:
# Delete DF relationships of a reified object type that duplicate a DF relationship of one of
# the two object types it connects (same e1 -> e2 pair), and report how many were deleted.
delete_duplicate_df = '''
    MATCH (from_ot:ObjectType) - [:FROM] -> (reified_ot:ObjectType) - [:TO] -> (to_ot:ObjectType)
    WITH from_ot.objectType as from_oType, reified_ot.objectType as reified_oType, to_ot.objectType as to_oType
    WITH [from_oType, to_oType] as original_oTypes, reified_oType
    UNWIND original_oTypes AS original_oType
    MATCH (e1:Event) - [:DF {objectType:original_oType}] -> (e2:Event)
    MATCH (e1) - [duplicate_df:DF {objectType:reified_oType}] -> (e2) // check whether there is a duplicate df between e1 and e2
    DELETE duplicate_df
    RETURN original_oType, reified_oType, count(duplicate_df) as deleted_df
'''

# Every other cell hands exec_query a Query object; do the same here instead of relying on
# exec_query also accepting a plain string. The query itself takes no parameters.
delete_duplicate_df_query = Query(
    query_str=delete_duplicate_df,
    parameters={}
)

result = db_connection.exec_query(delete_duplicate_df_query)
# An empty table is printed when nothing was deleted (or nothing was returned).
print(tabulate(result or []))




Next, we can lift the DF relationships to the event type level to discover a process model.
For this, we use a filtered DF-Graph discovery: a DF relationship is only lifted if it has been observed often enough, i.e. more often than a frequency threshold (`df_threshold`).

In [19]:
# Derive a Multi-Entity DF-Graph: lift instance-level DF relationships to the event type level,
# keeping only those observed more often than the threshold.
df_aggregation_query_str = '''\
    CALL apoc.periodic.iterate('
        // find all consecutive event types for specific object types
        MATCH (e1:Event) - [e2e:DF] -> (e2:Event)
        MATCH (e1) - [:INSTANCE_OF] -> (et1:EventType  WHERE et1.agg is null) //ensure not to take the task instances (if they were created with the task analysis)
        MATCH (e2) - [:INSTANCE_OF] -> (et2:EventType  WHERE et2.agg is null) //ensure not to take the task instances (if they were created with the task analysis)
        WITH e2e.objectType as oType, et1, et2, count(e2e) as df_freq // count for each oType, how often we have observed DF between events that are an instance of et1 and et2
        WHERE df_freq > $df_threshold AND oType in $object_types
        RETURN oType, et1, et2
    ','
        WITH oType, et1, et2
        MERGE (et1) - [:DF {objectType:oType}] -> (et2)
    ', 
    {batchSize:$batch_size, params:{df_threshold:$df_threshold, object_types:$object_types}})
'''

# A sample contains far fewer events, so it gets a much lower frequency threshold.
if use_sample:
    df_threshold = 10
else:
    df_threshold = 5_000

# NOTE(review): the query references $batch_size, which is not supplied here —
# presumably exec_query provides it; confirm.
df_aggregation_parameters = {"df_threshold": df_threshold, "object_types": object_types}
df_aggregation_query = Query(query_str=df_aggregation_query_str, parameters=df_aggregation_parameters)

results = db_connection.exec_query(df_aggregation_query)
print(tabulate(results))

-  --  -  --  -  -  -  --  -------------------------------------------------------  ---------------------------------------------------------  -----  --  --------------------------------------------------------------------------------------------------------------------------------------------------------
1  47  2  47  0  0  0  {}  {'total': 1, 'errors': {}, 'committed': 1, 'failed': 0}  {'total': 47, 'errors': {}, 'committed': 47, 'failed': 0}  False  {}  {'relationshipsDeleted': 0, 'relationshipsCreated': 47, 'nodesDeleted': 0, 'nodesCreated': 0, 'labelsRemoved': 0, 'labelsAdded': 0, 'propertiesSet': 47}
-  --  -  --  -  -  -  --  -------------------------------------------------------  ---------------------------------------------------------  -----  --  --------------------------------------------------------------------------------------------------------------------------------------------------------


In [20]:
# Add artificial START and END event types to the Multi-Entity DFG: START precedes the event type
# of every event without an incoming DF for its object, END follows every event without an outgoing one.
df_add_start_and_end_places = '''
    WITH $object_types as object_types
    CALL (object_types) {
    
    // find all starting events for a specific object type
    MATCH (start_e) - [e2o_instance] -> (object) - [:INSTANCE_OF] -> (ot:ObjectType)
    WHERE ot.objectType IN object_types
    AND NOT EXISTS ((:Event) - [:DF {objectType:ot.objectType, id:object.id}] -> (start_e))
    MATCH (start_e) - [:INSTANCE_OF] -> (et2:EventType WHERE et2.agg is null) //ensure not to take the task instances (if they were created with the task analysis)
    MERGE (et1:EventType {eventType: "START"}) 
    MERGE (et1) - [:OBSERVES] -> (ot)
    RETURN distinct et1, et2, ot.objectType as oType
    
    UNION
    
    // find all end events for a specific object type
    MATCH (end_e) - [e2o_instance] -> (object) - [:INSTANCE_OF] -> (ot:ObjectType)
    WHERE ot.objectType IN object_types
    AND NOT EXISTS ((end_e) - [:DF {objectType:ot.objectType, id:object.id}] -> (:Event))
    MATCH (end_e) - [:INSTANCE_OF] -> (et1:EventType  WHERE et1.agg is null) //ensure not to take the task instances (if they were created with the task analysis)
    MERGE (et2:EventType {eventType: "END"}) 
    MERGE (et2) - [:OBSERVES] -> (ot)
    RETURN DISTINCT et1, et2, ot.objectType as oType
    }
    
    MERGE (et1) - [:DF {objectType:oType}] -> (et2)
    // print results
    WITH DISTINCT oType, et1, et2
    RETURN oType as objectType, et1.eventType as from_event_type, et2.eventType as to_event_type order by objectType
'''

# The only parameter is the list of object types to add start and end places for.
start_end_parameters = dict(object_types=object_types)
df_add_start_and_end_places_query = Query(
    parameters=start_end_parameters,
    query_str=df_add_start_and_end_places,
)

results = db_connection.exec_query(df_add_start_and_end_places_query)
# One row per (object type, from event type, to event type) that was connected.
start_end_table = tabulate(results)
print(start_end_table)

-----------  ---------------------------  ---------------------------
Change       START                        Open Change Record
Change       START                        Planned Start
Change       START                        Start Change Implementation
Change       Requested End                END
Change       Planned End                  END
Change       Close Change Record          END
Change       End Change Implementation    END
Incident     START                        Open Incident
Incident     Close Incident               END
Interaction  Update from customer         END
Interaction  Operator Update              END
Interaction  External Vendor Assignment   END
Interaction  Assignment                   END
Interaction  Quality Indicator            END
Interaction  Vendor Reference             END
Interaction  Update                       END
Interaction  Close Interaction            END
Interaction  Closed                       END
Interaction  Caused By CI                 E