In [1]:
import dask.dataframe as dd
from dask.dataframe import from_pandas
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask

import os
import time
from tqdm import tqdm
import pandas as pd
import re
import gc

#importers
from pm4py import convert_to_event_log, convert_to_dataframe, format_dataframe

# Miners
from pm4py import convert_to_petri_net, serialize, deserialize
from pm4py import discover_dfg as dfg_discovery

from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.correlation_mining import algorithm as correlation_miner
from pm4py.algo.discovery.temporal_profile import algorithm as temporal_profile_discovery


# Evaluators
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator

In [35]:
# Set the dask config key 'distributed.scheduler.active-memory-manager.start' to True.
# NOTE(review): this cell's execution count (35) is later than the cell that
# creates the Client (7) — confirm the setting is applied before the cluster starts
# on a fresh Restart & Run All.
dask.config.set({'distributed.scheduler.active-memory-manager.start': True})

<dask.config.set at 0xffff2c771f70>

In [2]:
import sys
# Raise the interpreter's recursion limit to 30000.
# NOTE(review): presumably required by a deeply recursive step further down
# (e.g. one of the miners) — confirm which one and whether the value is still needed.
sys.setrecursionlimit(30000)

In [3]:
import ctypes

def trim_memory() -> int:
    """Call ``malloc_trim(0)`` from the C library and return its integer result.

    The library is loaded by its Linux soname ``libc.so.6``, so this helper
    only works where that shared object exists.
    """
    return ctypes.CDLL("libc.so.6").malloc_trim(0)

In [4]:
class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    Builds the connection URI from its parts, opens a driver, and offers
    helpers that run Cypher queries and return their records as plain lists.

    Parameters
    ----------
    uri_scheme : str
        URI scheme placed before ``://`` (default ``'bolt'``).
    host : str
        Database host name.
    port : str
        Database port.
    username, password : str
        Credentials handed to the driver as a ``(username, password)`` tuple.
        NOTE(review): the defaults are hard-coded credentials; prefer passing
        values read from the environment.
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        # Defined first so that __del__/_close_driver stay safe even when
        # GraphDatabase.driver() below raises and the object is torn down
        # half-initialised.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __del__(self):
        # Best-effort cleanup when the wrapper is garbage collected.
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; later calls are no-ops."""
        # getattr: the attribute may be missing on instances whose __init__
        # never ran to completion.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Drop the reference before closing so a second call (for example
            # from __del__) does not close the same driver again.
            self.driver = None
            driver.close()

    def run_single_query(self, query):
        """Run one Cypher query in its own session and return its records as a list."""
        with self.driver.session() as session:
            # Records are materialised while the session is still open.
            return self.format_raw_res(session.run(query))

    def run_bulk_query(self, query_list):
        """Run each query of ``query_list`` in one shared session.

        A tqdm progress bar is shown while iterating. Returns a list of
        ``{'query': <query>, 'result': <list of records>}`` dicts in input order.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                records = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': records})
        return results

    def reset_graph(self, db=None):
        """Delete every node together with its relationships.

        ``db`` is accepted for backward compatibility but is not used.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Return the result of a node-count query (column ``nodes``)."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable of records into a list."""
        return list(raw_res)

In [5]:
def useExecutionTime(func):
    """Decorator that times a call and returns the result together with the duration.

    The decorated function returns ``{"result": <func's return value>,
    "execution_time": <elapsed seconds as float>}`` instead of the bare value.
    Exceptions raised by ``func`` propagate unchanged.
    """
    # Local import so the decorator does not depend on another cell's imports.
    import functools

    # wraps() keeps func's __name__/__doc__ on the wrapper, so decorated
    # functions are not all reported as "compute" (e.g. in task names).
    @functools.wraps(func)
    def compute(*args, **kwargs):
        # perf_counter is monotonic; wall-clock time.time() can jump when the
        # system clock is adjusted, which would distort the measured duration.
        begin = time.perf_counter()

        result = func(*args, **kwargs)

        end = time.perf_counter()

        return {"result": result, "execution_time": end - begin}

    return compute

@useExecutionTime
def getComputeTime(*args, **kwargs):
    """Timed ``dask.compute``.

    All positional and keyword arguments are forwarded untouched to
    ``dask.compute``; the ``useExecutionTime`` decorator wraps what it returns
    as ``{"result": ..., "execution_time": ...}``.
    """
    computed = dask.compute(*args, **kwargs)
    return computed

In [6]:
def transformToDFG(dfgResult):
    """Turn query records into a ``{(parent, child): frequency}`` mapping.

    Each record is indexed by the keys ``"parent"``, ``"child"`` and
    ``"frequency"``. When the same (parent, child) pair occurs more than once,
    the last record wins.
    """
    return {
        (row["parent"], row["child"]): row["frequency"]
        for row in dfgResult
    }

def transformToStartEndActivity(activities):
    """Turn query records into a ``{name: frequency}`` mapping.

    Each record is indexed by the keys ``'name'`` and ``"frequency"``; a
    repeated name keeps the frequency of its last record.
    """
    return {row['name']: row["frequency"] for row in activities}

In [7]:
# Start a dask.distributed Client asking for 4 workers with 16 threads each.
client = Client(n_workers=4, threads_per_worker=16)
# Bare expression: displays the client/cluster summary as the cell output.
client

2023-02-06 17:27:24,726 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-op2hb54d', purging
2023-02-06 17:27:24,726 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-nnzsgt3j', purging
2023-02-06 17:27:24,726 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-6w32np_8', purging
2023-02-06 17:27:24,727 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-9v5xi2dj', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 64,Total memory: 7.67 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:37983,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 64
Started: Just now,Total memory: 7.67 GiB

0,1
Comm: tcp://127.0.0.1:40393,Total threads: 16
Dashboard: http://127.0.0.1:42501/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:41711,
Local directory: /tmp/dask-worker-space/worker-73t7a4os,Local directory: /tmp/dask-worker-space/worker-73t7a4os

0,1
Comm: tcp://127.0.0.1:38649,Total threads: 16
Dashboard: http://127.0.0.1:35895/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:39577,
Local directory: /tmp/dask-worker-space/worker-t81g_3uw,Local directory: /tmp/dask-worker-space/worker-t81g_3uw

0,1
Comm: tcp://127.0.0.1:39755,Total threads: 16
Dashboard: http://127.0.0.1:35677/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:34477,
Local directory: /tmp/dask-worker-space/worker-zsgytf68,Local directory: /tmp/dask-worker-space/worker-zsgytf68

0,1
Comm: tcp://127.0.0.1:37411,Total threads: 16
Dashboard: http://127.0.0.1:39703/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:41465,
Local directory: /tmp/dask-worker-space/worker-vzcwnlgx,Local directory: /tmp/dask-worker-space/worker-vzcwnlgx


In [42]:
# Explicit dtypes for these columns instead of letting read_csv infer them.
columnTypes = {
    'case:IDofConceptCase': 'string',
    'case:Includes_subCases': 'string',
    'case:Responsible_actor': 'string',
    'case:caseProcedure': 'string',
    'case:concept:name': 'int64',
    'dateStop': 'string'
}
# Lazily read the event log; the path is relative to the working directory.
df = dd.read_csv('BPIC15_1.csv', dtype=columnTypes)
# Convert every column whose name contains "Date"/"date" or "time" (re.search
# matches anywhere in the name) to UTC datetimes. This includes 'dateStop',
# which was read as a string above.
for column in df.columns:
    if re.search("[Dd]ate.*|time.*", column):
        df[column] = dask.dataframe.to_datetime(df[column], utc=True)

In [10]:
def getDFGQueries(dfg, start_activities, end_activities):
    """Build the Cypher statements that merge a DFG into the graph.

    Parameters
    ----------
    dfg : mapping
        ``{(parent, child): frequency}`` directly-follows counts.
    start_activities, end_activities : mapping
        ``{activity name: frequency}``.

    Returns
    -------
    list of str
        One MERGE statement per DFG edge, followed by one per start activity
        and one per end activity. Each statement sets the frequency on create
        and adds to the stored frequency on match.

    Activity names are embedded in single-quoted Cypher string literals, so
    backslashes and single quotes inside a name are escaped first; names
    without those characters produce exactly the same text as before.
    """
    def _quote(value):
        # Backslash first, otherwise the backslash added for the quote would
        # itself be doubled.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    queryTemplate = """
        MERGE (p:Activity {{name: '{parent}'}})
        MERGE (c:Activity {{name: '{child}'}})
        MERGE (p)-[r:PRODUCES]->(c)
        ON CREATE SET r.frequency={frequency}
        ON MATCH SET r.frequency=r.frequency+{frequency}
    """
    queryTemplateSA = """
        MERGE (p:StartActivity {{name: '{activity}'}})
        ON CREATE SET p.frequency={frequency}
        ON MATCH SET p.frequency=p.frequency+{frequency}
    """
    queryTemplateEA = """
        MERGE (p:EndActivity {{name: '{activity}'}})
        ON CREATE SET p.frequency={frequency}
        ON MATCH SET p.frequency=p.frequency+{frequency}
    """

    listOfQueries = [
        queryTemplate.format(parent=_quote(parent), child=_quote(child), frequency=frequency)
        for (parent, child), frequency in dfg.items()
    ]
    listOfQueries.extend(
        queryTemplateSA.format(activity=_quote(activity), frequency=frequency)
        for activity, frequency in start_activities.items()
    )
    listOfQueries.extend(
        queryTemplateEA.format(activity=_quote(activity), frequency=frequency)
        for activity, frequency in end_activities.items()
    )

    return listOfQueries

In [11]:
def saveDFG(dfg):
    """Discover the DFG of one event-log chunk and merge it into Neo4j.

    Parameters
    ----------
    dfg
        Input handed to ``dfg_discovery``. Despite the name, in this notebook
        it is a DataFrame partition supplied by ``map_partitions``.

    Returns
    -------
    dict
        ``{"dfg": ..., "start_activites": ..., "end_activites": ...}`` with the
        three values returned by ``dfg_discovery``. The key spelling is kept
        as-is for compatibility.
    """
    # Separate local name: the parameter holds the log, this holds the graph.
    graph, start_activities, end_activities = dfg_discovery(dfg)
    dfgQuery = getDFGQueries(graph, start_activities, end_activities)
    # NOTE(review): connection settings and credentials are hard-coded here.
    neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
    try:
        neo4jConnection.run_bulk_query(dfgQuery)
    finally:
        # Release the driver deterministically, also when a query fails,
        # instead of relying on garbage collection.
        neo4jConnection._close_driver()
    return {"dfg": graph, "start_activites": start_activities, "end_activites": end_activities}

In [12]:
def getDFG():
    """Read the stored DFG and the start/end activity frequencies from Neo4j.

    Returns
    -------
    list
        ``[dfg, start_activities, end_activities]`` where ``dfg`` is the
        output of ``transformToDFG`` on the PRODUCES relationships, and the
        other two are the outputs of ``transformToStartEndActivity`` on the
        StartActivity and EndActivity nodes respectively.
    """
    queries = {
        "dfgQuery": """MATCH result=(p:Activity)-[r:PRODUCES]->(c:Activity) RETURN p.name as parent, c.name as child, r.frequency as frequency""",
        "startEndActivitiesQuery": ["MATCH (a:StartActivity) RETURN a.name as name , a.frequency as frequency", "MATCH (a:EndActivity) RETURN a.name as name , a.frequency as frequency"],
    }

    # NOTE(review): connection settings and credentials are hard-coded here.
    neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
    try:
        dfgResult = neo4jConnection.run_single_query(queries['dfgQuery'])
        # Bulk results come back in query order: index 0 = start, 1 = end.
        startEndActivitiesResult = neo4jConnection.run_bulk_query(queries['startEndActivitiesQuery'])
    finally:
        # Release the driver deterministically, also when a query fails.
        neo4jConnection._close_driver()

    return [
        transformToDFG(dfgResult),
        transformToStartEndActivity(startEndActivitiesResult[0]["result"]),
        transformToStartEndActivity(startEndActivitiesResult[1]["result"]),
    ]
    

In [13]:
# Index the log by case id while keeping the column itself (drop=False).
# NOTE(review): sorted=True declares the column as already sorted — confirm the
# CSV really is ordered by 'case:concept:name'.
indexed_df = df.set_index('case:concept:name', drop=False, sorted=True)

In [14]:
# Rename the index to 'caseId'; the 'case:concept:name' column kept by
# drop=False is unaffected by this rename.
indexed_df.index = indexed_df.index.rename('caseId')
# Rebind to a 4-partition version. NOTE(review): the name is reassigned in
# place, so re-running earlier cells changes what this cell starts from.
indexed_df = indexed_df.repartition(npartitions=4)

In [15]:
# Schedule saveDFG once per partition and expose the partitions as delayed objects.
# NOTE(review): saveDFG returns a dict, which does not match meta=indexed_df —
# confirm the meta is only a placeholder here.
lazyDFG = indexed_df.map_partitions(saveDFG, meta=indexed_df).to_delayed()

In [None]:
# Execute the delayed partitions (timed by getComputeTime).
# NOTE(review): scheduler='processes' and meta=[] are forwarded as keyword
# arguments to dask.compute — confirm both are intended, given that a
# distributed Client is also created in this notebook. The returned timing dict
# is not stored.
getComputeTime(*lazyDFG, scheduler='processes', meta=[]) # save the dfg to neo4j

In [43]:
# Read the merged DFG and the start/end activity frequencies back from Neo4j.
dfg, start, end = getDFG()

100% 2/2 [00:00<00:00, 142.49it/s]


In [17]:
@useExecutionTime
def getMinerResult(dfg, miner, threshold = 0.5):
    """Discover a Petri net from a DFG with the chosen miner and serialize it.

    Parameters
    ----------
    dfg
        Directly-follows graph handed to the miner's ``apply_dfg``.
    miner : str
        One of ``'heuristic_miner'``, ``'inductive_miner'``, ``'alpha_miner'``.
    threshold : float
        Dependency threshold; only used by ``'heuristic_miner'``.

    Returns
    -------
    dict
        ``{miner: serialize(net, im, fm)}``. The ``useExecutionTime`` decorator
        wraps this again as ``{"result": ..., "execution_time": ...}``.

    Raises
    ------
    ValueError
        If ``miner`` is not one of the supported names. Previously this case
        failed later with an unbound-variable error on ``net``.
    """
    if miner == 'heuristic_miner':
        net, im, fm = heuristics_miner.apply_dfg(dfg, parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: threshold})
    elif miner == 'inductive_miner':
        net, im, fm = inductive_miner.apply_dfg(dfg)
    elif miner == 'alpha_miner':
        net, im, fm = alpha_miner.apply_dfg(dfg)
    else:
        raise ValueError(
            "Unknown miner {!r}; expected 'heuristic_miner', 'inductive_miner' or 'alpha_miner'".format(miner)
        )

    return {miner: serialize(net, im, fm)}
    
def setLazyMiners(dfg, miners=None):
    """Create one delayed ``getMinerResult`` task per miner.

    Parameters
    ----------
    dfg
        Directly-follows graph passed to every task.
    miners : iterable of str, optional
        Miner names understood by ``getMinerResult`` (``'heuristic_miner'``,
        ``'inductive_miner'``, ``'alpha_miner'``). Defaults to
        ``['inductive_miner']``, the selection that used to be hard-coded.

    Returns
    -------
    list
        Delayed tasks in the order of ``miners``; nothing is computed here.
    """
    if miners is None:
        miners = ['inductive_miner']

    return [dask.delayed(getMinerResult)(dfg, miner) for miner in miners]

In [18]:
@useExecutionTime
def getMetrics(log, miner, metric, net, im, fm):
    """Evaluate one quality metric of a Petri net.

    ``metric`` selects the evaluator: ``'fitness'``, ``'simplicity'``,
    ``'precision'`` or ``'generalization'``. Simplicity only looks at ``net``;
    the other three also receive ``log``, ``im`` and ``fm``. Any other metric
    name yields the value ``0``.

    Returns ``{miner: {metric: value}}``; the ``useExecutionTime`` decorator
    wraps that as ``{"result": ..., "execution_time": ...}``.
    """
    if metric == 'fitness':
        score = replay_fitness_evaluator.apply(log, net, im, fm, variant=replay_fitness_evaluator.TOKEN_BASED)
    elif metric == 'simplicity':
        score = simplicity_evaluator.apply(net)
    elif metric == 'precision':
        score = precision_evaluator.apply(log, net, im, fm, variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
    elif metric == 'generalization':
        score = generalization_evaluator.apply(log, net, im, fm)
    else:
        # Unrecognised metric: keep the placeholder value.
        score = 0

    return {miner: {metric: score}}

def setLazyMetrics(log, algorithm, net, im, fm, metrics=None):
    """Create one delayed ``getMetrics`` task per metric.

    Parameters
    ----------
    log, net, im, fm
        Passed through unchanged to ``getMetrics``.
    algorithm : str
        Miner name used as the outer key of each metric result.
    metrics : iterable of str, optional
        Metric names understood by ``getMetrics`` (``'fitness'``,
        ``'simplicity'``, ``'precision'``, ``'generalization'``). Defaults to
        ``['precision']``, the selection that used to be hard-coded.

    Returns
    -------
    list
        Delayed tasks in the order of ``metrics``; nothing is computed here.
    """
    if metrics is None:
        metrics = ['precision']

    return [
        dask.delayed(getMetrics)(log, algorithm, metric, net, im, fm)
        for metric in metrics
    ]

In [44]:
# Build the delayed miner tasks from the DFG read back from Neo4j; nothing runs yet.
lazyMiners = setLazyMiners(dfg)

In [45]:
# Run the miner tasks; the result is a tuple with one timed result dict per task.
lazyMinersResults = dask.compute(*lazyMiners)

In [41]:
def run_gc(dask_worker,**kwargs):
    """Trigger a full garbage-collection pass and report success.

    ``dask_worker`` and any extra keyword arguments are accepted but unused.
    Always returns ``True``.
    """
    gc.collect()  # the number of collected objects is deliberately discarded
    return True

# Register run_gc with the workers under the name "my_gc_plugin".
# NOTE(review): run_gc is a plain function rather than an object exposing
# setup()/teardown() hooks — confirm the workers actually invoke it. If a
# one-off collection on each worker is the goal, `client.run(run_gc)` may be
# what was intended.
client.register_worker_plugin(run_gc, "my_gc_plugin")

{'tcp://127.0.0.1:33429': {'status': 'OK'},
 'tcp://127.0.0.1:35889': {'status': 'OK'},
 'tcp://127.0.0.1:41731': {'status': 'OK'},
 'tcp://127.0.0.1:41933': {'status': 'OK'}}

In [46]:
# Rebuild the net and its two markings from the first miner task's serialized output.
# Index 0 and the 'inductive_miner' key rely on setLazyMiners' default selection.
net, im, fm = deserialize(lazyMinersResults[0]['result']['inductive_miner'])

In [47]:
# Build the delayed metric tasks for the inductive-miner net; nothing runs yet.
# NOTE(review): indexed_df is a dask DataFrame and is passed as `log` — confirm
# the evaluators accept it, or whether it should be materialised/converted first.
lazyMetrics = setLazyMetrics(indexed_df, 'inductive_miner', net, im, fm)

In [None]:
# Run the metric tasks; the result is a tuple with one timed result dict per task.
lazyMetricsResults = dask.compute(*lazyMetrics)

























































































