In [16]:
%matplotlib inline
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"

import pandas as pd

pd.set_option('display.width', 2000)

### Define the project (case study) that you want to analyze

In [17]:
# Name of the case study; it is used below as the directory that contains config.yaml.
case_study = 'bpic14'
# Sample switch; a variable of this name is handed to OcedPg (use_sample=...) in the load section.
use_sample = False

In [18]:
# Retrieve the raw YAML settings of the selected case study so the expected
# database credentials can be shown to the user.
conf_path = Path(case_study, 'config.yaml')

# Read through a context manager so the file handle is closed again.
# The parsed dict gets its own name (`raw_config`) so it is not confused with
# the PromG Configuration object that is created from the same file afterwards.
with open(conf_path) as conf_file:
    raw_config = yaml.safe_load(conf_file)

# NOTE(review): the password ends up in the saved cell output — clear the output
# before sharing the notebook if the database is not a throw-away local instance.
print("These are the credentials that I expect to be set for the database.")
print(f"db_name: {raw_config['db_name']}")
print(f"uri: {raw_config['uri']}")
print(f"password: {raw_config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


### Prepare so we can use PromG to load the data and execute queries

In [19]:
# Parse the case-study config file into a PromG Configuration; every object below is built from it.
config = Configuration.init_conf_with_config_file(conf_path)
# Connection used for all queries in this notebook (db_connection.exec_query, DBManagement, OcedPg).
db_connection = DatabaseConnection.set_up_connection(config=config)
# Performance tracker; its `pbar` attribute is the progress bar that reset_pbar works on.
perf = Performance.set_up_performance(config=config)
# Dataset descriptions, passed to OcedPg when loading the data.
dataset_descriptions = DatasetDescriptions(config=config)

In [20]:
def reset_pbar(pbar=None, total=None):
    """Reset a progress bar so it can be reused for the next processing step.

    Args:
        pbar: Progress bar to reset (presumably tqdm-like: it must offer
            ``reset()``, a writable ``total`` attribute and ``set_postfix_str()``).
            When omitted, the *current* ``perf.pbar`` is looked up at call time,
            so re-creating ``perf`` does not leave this function bound to a
            stale progress bar.
        total: New expected number of steps; ``None`` leaves the bar without a total.
    """
    if pbar is None:
        pbar = perf.pbar
    pbar.reset()
    # TODO update dragons in PromG, #update method to set total for pbar
    pbar.total = total
    # Called without arguments to clear the postfix text of the previous step.
    pbar.set_postfix_str()

#### Prepare the DB

In [21]:
# Read the semantic header --> this details how the data should be structured.
# It is created from the same `config` as the connection and is passed to OcedPg when loading the data.
semantic_header = SemanticHeader.create_semantic_header(config=config)

In [22]:
# Clear the DB (if use_sample = False, this should not take long on a loaded database)
# NOTE(review): semantic_header=None is passed although a semantic header was created in the previous cell —
# confirm that DBManagement does not need it for clear_db / set_constraints.
db_manager = DBManagement(db_connection=db_connection, semantic_header=None)
# The trailing ';' is presumably there to keep the call's return value out of the cell output.
db_manager.clear_db(replace=True);  # in the community version of neo4j, replace is not allowed. In that case, set replace=False

13it [05:07, 56.73s/it, clear_db: took 277.57 seconds]                                                

In [23]:
# Set constraints in DB (run after clearing, before any data is loaded).
# The trailing ';' is presumably there to keep the call's return value out of the cell output.
db_manager.set_constraints();

14it [05:07, 44.26s/it, set_constraints: took 0.19 seconds]

In [24]:
# Show the constraints now present in the database as the cell output
# (ignore_defaults=True presumably hides constraints that exist by default — TODO confirm).
db_manager.get_constraints(ignore_defaults=True)

['activity_index',
 'entity_sys_id_index',
 'record_id_range',
 'record_type_range',
 'unique_entity_ids']

# For BPIC17 (a large dataset), we first test with a sample whether the data can be imported

### Load the data

In [25]:
# Build the OCED-PG loader from the connection, the dataset descriptions and the semantic header.
# `use_sample` is intentionally NOT re-assigned here: it is defined once in the configuration cell
# at the top (next to `case_study`), so changing it there is guaranteed to take effect.
oced_pg = OcedPg(database_connection=db_connection,
                 dataset_descriptions=dataset_descriptions,
                 semantic_header=semantic_header,
                 use_sample=use_sample)


In [26]:
# first, we load all records
# (if use_sample = False, this should take less than 2 minutes)
# NOTE(review): 11 is presumably the number of progress steps load() reports for this dataset;
# the recorded output ends at 11/12, so the value may be off by one — TODO confirm.
reset_pbar(total=11)
oced_pg.load()

 92%|█████████▏| 11/12 [00:29<00:02,  2.80s/it, _filter_nodes for BPIC14Interaction: took 0.0 seconds]                      

In [27]:
# Check whether import and transformation was successful by checking whether all nodes were imported
check_nodes = '''
    MATCH (n)
    RETURN labels(n) as label, count(n) as count'''

check_relations = '''
    MATCH () - [r] -> ()
    RETURN type(r) as label, count(r) as count'''

In [28]:
# Check whether the import was successful by counting the nodes and relationships in the database.
# Each result row is read below through its 'label' and 'count' keys.
node_count = db_connection.exec_query(check_nodes)
rel_count = db_connection.exec_query(check_relations)

total_nodes = sum(node['count'] for node in node_count)
total_rels = sum(rel['count'] for rel in rel_count)
print(f"In total, there are {total_nodes} nodes and {total_rels} relations.")

# we report on nodes and relationships excluding the record layer as these are not part of our schema
record_layer_labels = {'Record', 'RecordType', 'Log'}
record_layer_rel_types = {'IS_OF_TYPE', 'CONTAINS', 'EXTRACTED_FROM'}

# labels(n) yields the full list of labels of a node, so a node belongs to the record layer
# as soon as ANY of its labels is a record-layer label. Comparing the whole list for equality
# would miss nodes that carry an additional label.
schema_nodes = sum(node['count'] for node in node_count
                   if record_layer_labels.isdisjoint(node['label']))
schema_rels = sum(rel['count'] for rel in rel_count
                  if rel['label'] not in record_layer_rel_types)
print(
    f"In total (excluding record layer), there are {schema_nodes} nodes "
    f"and {schema_rels} relations.")


In total, there are 690626 nodes and 690622 relations.
In total (excluding record layer), there are 0 nodes and 0 relations.


In [29]:
# Print node statistics
print("Nodes")
print(tabulate(node_count))

Nodes
----------  ------
['Log']          4
['Record']  690622
----------  ------


In [30]:
# Print relationship statistics
print("Relationships")
print(tabulate(rel_count))

Relationships
--------  ------
CONTAINS  690622
--------  ------
