In [2]:
import dask.dataframe as dd
from dask.dataframe import from_pandas
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask

import os
import time
from tqdm import tqdm
import pandas as pd
import re
import gc

#importers
from pm4py import convert_to_event_log, convert_to_dataframe, format_dataframe

# Miners
from pm4py import convert_to_petri_net, serialize, deserialize
from pm4py import discover_dfg as dfg_discovery

from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.correlation_mining import algorithm as correlation_miner
from pm4py.algo.discovery.temporal_profile import algorithm as temporal_profile_discovery


# Evaluators
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator

In [3]:
class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    The instance builds a connection URI of the form ``scheme://host:port``,
    opens a driver with basic ``(username, password)`` auth and offers helpers
    to run one query or a list of queries, returning the records as lists.

    The object can be used as a context manager so the driver is closed
    deterministically::

        with graph_driver(host="neo4j") as g:
            g.test_connection()

    NOTE(review): the default password is hard-coded in the signature; prefer
    passing credentials explicitly (e.g. read from the environment).
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        """Store the connection settings and open the driver.

        Args:
            uri_scheme: URI scheme placed before ``://`` in the connection URI.
            host: Host name of the Neo4j server.
            port: Port of the Neo4j server.
            username: User name used for authentication.
            password: Password used for authentication.
        """
        # Defined before anything can fail so that __del__/_close_driver stay
        # safe even when GraphDatabase.driver() below raises.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __enter__(self):
        """Return the instance itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never swallows exceptions."""
        self._close_driver()
        return False

    def __del__(self):
        """Best-effort cleanup when the object is garbage collected."""
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; later calls are no-ops."""
        # getattr: the attribute may be missing on a partially constructed object.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Drop the reference first so a second call cannot close it again.
            self.driver = None
            driver.close()

    def run_single_query(self, query):
        """Run ``query`` in a fresh session and return its records as a list."""
        with self.driver.session() as session:
            # The result is materialised inside the ``with`` block, while the
            # session is still open.
            return self.format_raw_res(session.run(query))

    def run_bulk_query(self, query_list):
        """Run every query of ``query_list`` in one session, with a progress bar.

        Returns:
            A list of ``{'query': <query>, 'result': <list of records>}`` dicts,
            in the order of ``query_list``.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                records = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': records})
        return results

    def reset_graph(self, db=None):
        """Delete every node (and its relationships) from the graph.

        Args:
            db: Currently ignored; kept for interface compatibility.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Run a node-count query; returns the records of that query."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise the iterable ``raw_res`` into a list."""
        return list(raw_res)

In [4]:
def useExecutionTime(func):
    """Decorator that measures how long each call to ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper with the same name/docstring as ``func``. Calling it returns
        ``{"result": <return value of func>, "execution_time": <seconds>}``.
        Exceptions raised by ``func`` propagate unchanged.
    """
    # Local import: functools is not imported at file level.
    from functools import wraps

    @wraps(func)
    def compute(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        begin = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - begin

        return {"result": result, "execution_time": elapsed}

    return compute

@useExecutionTime
def getComputeTime(*args, **kwargs):
    """Forward all arguments to ``dask.compute`` and hand back what it returns.

    Because of the ``useExecutionTime`` decorator, callers receive the
    decorator's wrapped value rather than the bare ``dask.compute`` output.
    """
    computed = dask.compute(*args, **kwargs)
    return computed

In [5]:
def transformToDFG(dfgResult):
    """Build a ``{(parent, child): frequency}`` mapping from an iterable of records.

    Each record is indexed by the keys ``"parent"``, ``"child"`` and
    ``"frequency"``. If the same (parent, child) pair occurs more than once,
    the last record wins.
    """
    return {
        (row["parent"], row["child"]): row["frequency"]
        for row in dfgResult
    }

def transformToStartEndActivity(activities):
    """Build a ``{name: frequency}`` mapping from an iterable of records.

    Each record is indexed by the keys ``'name'`` and ``"frequency"``. If a
    name occurs more than once, the last record wins.
    """
    return {entry['name']: entry["frequency"] for entry in activities}

In [18]:
# Create a Dask distributed client configured with 4 workers and
# 16 threads per worker (64 threads in total).
client = Client(n_workers=4, threads_per_worker=16)
# Bare expression: the notebook renders the client summary as the cell output.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 44545 instead


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:44545/status,

0,1
Dashboard: http://127.0.0.1:44545/status,Workers: 4
Total threads: 64,Total memory: 11.68 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:43601,Workers: 4
Dashboard: http://127.0.0.1:44545/status,Total threads: 64
Started: Just now,Total memory: 11.68 GiB

0,1
Comm: tcp://127.0.0.1:44061,Total threads: 16
Dashboard: http://127.0.0.1:38023/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:44387,
Local directory: /tmp/dask-worker-space/worker-ok42m2_r,Local directory: /tmp/dask-worker-space/worker-ok42m2_r

0,1
Comm: tcp://127.0.0.1:39515,Total threads: 16
Dashboard: http://127.0.0.1:35407/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:46747,
Local directory: /tmp/dask-worker-space/worker-siu2cto3,Local directory: /tmp/dask-worker-space/worker-siu2cto3

0,1
Comm: tcp://127.0.0.1:38007,Total threads: 16
Dashboard: http://127.0.0.1:42063/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:42845,
Local directory: /tmp/dask-worker-space/worker-uiagvop9,Local directory: /tmp/dask-worker-space/worker-uiagvop9

0,1
Comm: tcp://127.0.0.1:41181,Total threads: 16
Dashboard: http://127.0.0.1:34529/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:35933,
Local directory: /tmp/dask-worker-space/worker-p6ftqtcg,Local directory: /tmp/dask-worker-space/worker-p6ftqtcg


In [57]:
# Dtypes that are pinned explicitly when reading the CSV input.
columnTypes = {
    'case:IDofConceptCase': 'string',
    'case:Includes_subCases': 'string',
    'case:Responsible_actor': 'string',
    'case:caseProcedure': 'string',
    'case:concept:name': 'int64',
    'dueDate': 'string',
    'case:termName': 'string',
    'dateStop': 'string'
}

# presumably a glob pattern selecting BPIC15_1.csv and BPIC15_2.csv — TODO confirm
fileName = 'BPIC15_[1-2]'
csvPath = '{fileName}.csv'.format(fileName=fileName)
df = dd.read_csv(csvPath, dtype=columnTypes)

# Every column whose name matches the pattern is parsed as a UTC datetime.
dateColumnPattern = re.compile("[Dd]ate.*|time.*")
for column in df.columns:
    if dateColumnPattern.search(column) is None:
        continue
    df[column] = dd.to_datetime(df[column], utc=True)

FileNotFoundError: [Errno 2] No such file or directory: '/opt/notebooks/BPIC15_[1-2].csv'

In [47]:
def _escapeCypherString(value):
    """Escape ``value`` for use inside a single-quoted Cypher string literal."""
    # Backslashes first; otherwise the backslash added for a quote would itself
    # be doubled by the second replacement.
    return str(value).replace("\\", "\\\\").replace("'", "\\'")


def getDFGQueries(dfg, start_activities, end_activities):
    """Build the Cypher statements that store a directly-follows graph.

    Args:
        dfg: Mapping ``{(parent, child): frequency}``.
        start_activities: Mapping ``{activity: frequency}``.
        end_activities: Mapping ``{activity: frequency}``.

    Returns:
        A list of Cypher query strings: one per DFG edge, followed by one per
        start activity and one per end activity. Every statement uses MERGE
        with ``ON CREATE`` / ``ON MATCH`` so an existing frequency is
        incremented rather than overwritten.

    Activity names are escaped before being embedded in the query text, so
    names containing ``'`` or ``\\`` no longer break (or alter) the statement.
    """
    listOfQueries = []
    queryTemplate = """
        MERGE (p:Activity {{name: '{parent}'}})
        MERGE (c:Activity {{name: '{child}'}})
        MERGE (p)-[r:PRODUCES]->(c)
        ON CREATE SET r.frequency={frequency}
        ON MATCH SET r.frequency=r.frequency+{frequency}
    """
    for (parent, child), frequency in dfg.items():
        listOfQueries.append(queryTemplate.format(
            parent=_escapeCypherString(parent),
            child=_escapeCypherString(child),
            frequency=frequency,
        ))

    queryTemplateSA = """
        MERGE (p:StartActivity {{name: '{activity}'}})
        ON CREATE SET p.frequency={frequency}
        ON MATCH SET p.frequency=p.frequency+{frequency}
    """
    queryTemplateEA = """
        MERGE (p:EndActivity {{name: '{activity}'}})
        ON CREATE SET p.frequency={frequency}
        ON MATCH SET p.frequency=p.frequency+{frequency}
    """

    # Start and end activities share the same shape; only the template differs.
    for template, activities in ((queryTemplateSA, start_activities),
                                 (queryTemplateEA, end_activities)):
        for activity, frequency in activities.items():
            listOfQueries.append(template.format(
                activity=_escapeCypherString(activity),
                frequency=frequency,
            ))

    return listOfQueries

In [48]:
def saveDFG(dfg):
    """Discover the directly-follows graph of ``dfg`` and store it in Neo4j.

    Args:
        dfg: Input handed to ``dfg_discovery`` (pm4py's ``discover_dfg``).
            Despite its name this is the event data, not an already computed
            DFG — presumably one dataframe partition; verify against caller.

    Returns:
        ``{"dfg": ..., "start_activites": ..., "end_activites": ...}`` holding
        the three values returned by ``dfg_discovery``. The misspelled keys are
        kept as they are because callers may rely on them.
    """
    dfgFrequencies, start_activities, end_activities = dfg_discovery(dfg)
    dfgQueries = getDFGQueries(dfgFrequencies, start_activities, end_activities)

    neo4jConnection = graph_driver(uri_scheme="neo4j", host="neo4j", password="123456")
    try:
        neo4jConnection.run_bulk_query(dfgQueries)
    finally:
        # Close the driver explicitly instead of waiting for garbage
        # collection; otherwise every processed partition leaves a connection
        # open for an unknown amount of time.
        neo4jConnection._close_driver()

    return {"dfg": dfgFrequencies, "start_activites": start_activities, "end_activites": end_activities}

In [49]:
# Index the frame by the 'case:concept:name' column. drop=False keeps that
# column available as a regular column as well.
# NOTE(review): sorted=True declares the column as already sorted — confirm
# that the input files really are ordered by it.
indexed_df = df.set_index('case:concept:name', drop=False, sorted=True)

In [50]:
# Give the index its own name ('caseId'); presumably to avoid sharing a name
# with the retained 'case:concept:name' column — TODO confirm.
indexed_df.index = indexed_df.index.rename('caseId')
# Split the data into 4 partitions; saveDFG is later applied once per partition.
indexed_df = indexed_df.repartition(npartitions=4)

In [51]:
# Schedule saveDFG for every partition and expose the per-partition tasks as
# delayed objects; nothing is executed yet.
# NOTE(review): saveDFG returns a dict, so meta=indexed_df does not describe
# the real output — it looks like a placeholder; confirm.
lazyDFG = indexed_df.map_partitions(saveDFG, meta=indexed_df).to_delayed()

In [52]:
# Execute all delayed saveDFG tasks and report the result together with the
# elapsed time. Each task writes to Neo4j; the generated queries increment
# existing frequencies, so re-running this cell adds to the stored counts.
# NOTE(review): scheduler='processes' is requested although a distributed
# Client was created earlier, and meta=[] is forwarded to dask.compute —
# confirm both are intended.
getComputeTime(*lazyDFG, scheduler='processes', meta=[]) # save the dfg to neo4j

100%|██████████| 2123/2123 [00:08<00:00, 264.82it/s]
100%|██████████| 351/351 [00:01<00:00, 286.17it/s]
100%|██████████| 334/334 [00:01<00:00, 287.65it/s]
100%|██████████| 3307/3307 [00:11<00:00, 275.71it/s]


{'result': ({'dfg': {('01_BB_540', '01_BB_546'): 1,
    ('01_BB_540', '01_BB_550'): 1,
    ('01_BB_540', '01_BB_590'): 3,
    ('01_BB_540', '01_BB_765'): 16,
    ('01_BB_540', '01_BB_770'): 157,
    ('01_BB_540', '01_HOOFD_530'): 1,
    ('01_BB_540', '01_HOOFD_790'): 9,
    ('01_BB_540', '01_HOOFD_800'): 8,
    ('01_BB_540', '01_HOOFD_805'): 4,
    ('01_BB_540', '01_HOOFD_810'): 12,
    ('01_BB_540', '01_OLO_240'): 6,
    ('01_BB_545', '01_BB_770'): 1,
    ('01_BB_546', '01_BB_630'): 1,
    ('01_BB_550', '01_BB_560'): 3,
    ('01_BB_550_1', '01_BB_770'): 1,
    ('01_BB_550_2', '01_HOOFD_810'): 1,
    ('01_BB_560', '01_BB_590'): 1,
    ('01_BB_560', '01_BB_630'): 1,
    ('01_BB_560', '01_BB_770'): 1,
    ('01_BB_590', '01_BB_550'): 1,
    ('01_BB_590', '01_BB_630'): 3,
    ('01_BB_630', '01_BB_550'): 1,
    ('01_BB_630', '01_BB_635'): 1,
    ('01_BB_630', '01_BB_636'): 1,
    ('01_BB_630', '01_BB_640'): 1,
    ('01_BB_630', '01_BB_670'): 1,
    ('01_BB_630', '01_BB_730'): 49,
    ('01_B

In [53]:
# Close the Dask client created at the start of the notebook.
client.close()