In [1]:
import dask.dataframe as dd
from dask.dataframe import from_pandas
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask

import os
import time
from tqdm import tqdm
import pandas as pd
import re
import gc
import numpy as np
import dill

# Miners
from pm4py import serialize, deserialize
from pm4py import discover_dfg as dfg_discovery
from pm4py.discovery import DFG

from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py import discover_petri_net_inductive as inductive_miner


# Evaluators
from contribution import fitness_alignment, generalization, precision_alignment
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator #simplicity

In [2]:
# Set the 'distributed.scheduler.active-memory-manager.start' option to True
# before the cluster is created in a later cell.
dask.config.set({'distributed.scheduler.active-memory-manager.start': True})

<dask.config.set at 0xffff846013d0>

In [3]:
import sys

In [4]:
import ctypes

def trim_memory() -> int:
    """Call glibc's ``malloc_trim(0)`` in the current process.

    The shared library ``libc.so.6`` is loaded through ``ctypes`` on every
    call, so this only works where that library can be found.

    Returns:
        The integer status returned by ``malloc_trim``.
    """
    glibc = ctypes.CDLL("libc.so.6")
    trim_status = glibc.malloc_trim(0)
    return trim_status

In [5]:
class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    The instance builds a connection URI of the form
    ``{uri_scheme}://{host}:{port}``, opens a driver with basic
    ``(username, password)`` authentication and offers helpers that run
    Cypher queries and return the records as plain lists.

    NOTE(review): the default password is hard-coded in the signature; it is
    kept for backward compatibility, but callers should pass real credentials
    explicitly (e.g. read from the environment).
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        # Defined first so that __del__/_close_driver stay safe even when the
        # driver construction below raises (bad URI, unreachable host, ...).
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __del__(self):
        # Best-effort cleanup when the wrapper is garbage collected.
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; later calls are no-ops."""
        # getattr guards against instances whose __init__ never ran to the
        # point of assigning the attribute.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Forget the driver before closing so a second call cannot close it again.
            self.driver = None
            driver.close()

    def run_single_query(self, query):
        """Run one query in a fresh session and return its records as a list."""
        with self.driver.session() as session:
            # Records are materialised while the session is still open.
            return self.format_raw_res(session.run(query))

    def run_bulk_query(self, query_list):
        """Run several queries in one session, showing a tqdm progress bar.

        Returns:
            A list with one ``{'query': ..., 'result': [...]}`` dict per
            query, in the order of ``query_list``.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                res = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': res})
        return results

    def reset_graph(self, db=None):
        """Delete every node (and its relationships) from the graph.

        ``db`` is accepted for backward compatibility but is not used.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Run a node-count query; returns the resulting records as a list."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable of result records into a list."""
        return list(raw_res)

In [6]:
def useExecutionTime(func):
    """Decorator that reports how long each call of ``func`` takes.

    Args:
        func: Any callable.

    Returns:
        A wrapper with the same name/docstring as ``func`` that returns
        ``{"result": <return value of func>, "execution_time": <seconds>}``.
        Exceptions raised by ``func`` propagate unchanged.
    """
    # Local import keeps this cell self-contained (functools is not imported at the top).
    import functools

    # wraps() preserves __name__/__doc__, so decorated functions do not all
    # show up as "compute" in tracebacks and task names.
    @functools.wraps(func)
    def compute(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # skewed or turn negative when the system clock is adjusted.
        begin = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - begin

        return {"result": result, "execution_time": elapsed}

    return compute

@useExecutionTime
def getComputeTime(*args, **kwargs):
    """Forward all arguments to ``dask.compute`` and return its result.

    Because of the ``useExecutionTime`` decorator, callers receive the
    decorator's dict (result plus execution time) rather than the bare
    ``dask.compute`` output.
    """
    computed = dask.compute(*args, **kwargs)
    return computed

In [7]:
# Start a local cluster with a single one-threaded worker.
# memory_limit=None: no explicit per-worker memory limit is passed.
cluster = LocalCluster(n_workers=1, threads_per_worker=1, memory_limit=None)

2023-03-16 22:18:48,200 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-u8xb7f4w', purging


In [8]:
# Connect a client to the local cluster created above; the bare `client`
# expression makes the notebook display its summary.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 1,Total memory: 0 B
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:42171,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 1
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:33833,Total threads: 1
Dashboard: http://127.0.0.1:44923/status,Memory: 0 B
Nanny: tcp://127.0.0.1:39171,
Local directory: /tmp/dask-worker-space/worker-zwhl17kw,Local directory: /tmp/dask-worker-space/worker-zwhl17kw


In [9]:
def run_gc(dask_worker,**kwargs):
    """Run a full garbage collection in the calling process.

    Args:
        dask_worker: Accepted but not used.
        **kwargs: Accepted but not used.

    Returns:
        Always ``True``.
    """
    _ = gc.collect()  # the number of collected objects is discarded
    return True

# Register the GC and malloc-trim helpers with the workers under fixed names.
# NOTE(review): run_gc and trim_memory are plain functions, not objects with
# worker-plugin hooks such as setup() — presumably registering them only
# stores them under these names without ever calling them. Confirm against
# the distributed WorkerPlugin documentation (client.run(...) may be what
# was intended).
client.register_worker_plugin(run_gc, "my_gc_plugin")
client.register_worker_plugin(trim_memory, "my_trim_plugin")

{'tcp://127.0.0.1:33833': {'status': 'OK'}}

In [10]:
# Turn on adaptive scaling for the cluster, bounded to 1..8 workers.
cluster.adapt(minimum=1, maximum=8)

<distributed.deploy.adaptive.Adaptive at 0xffff6d3617c0>

In [11]:
# Explicit dtypes for the columns listed below, so dask does not have to
# infer them from a sample of the file.
columnTypes = {
    'case:IDofConceptCase': 'string',
    'case:Includes_subCases': 'string',
    'case:Responsible_actor': 'string',
    'case:caseProcedure': 'string',
    'case:concept:name': 'int64',
    'dueDate': 'string',
    'case:termName': 'string',
    'dateStop': 'string',
    'case:endDate': 'object',
    'case:endDatePlanned': 'object',
    'case:parts': 'object'
}

# list of file paths to be loaded
file_paths = ['BPIC15_1.csv']

# Read every file lazily with the same explicit dtypes so the frames can be
# stacked row-wise without dtype mismatches.
frames = [dd.read_csv(file_path, dtype=columnTypes) for file_path in file_paths]

# A single file needs no concatenation; several files are stacked in one call.
if len(frames) > 1:
    df = dd.concat(frames, interleave_partitions=True)
else:
    df = frames[0]

# Alternative dataset configuration (kept for reference):
# columnTypes = {
#     'OfferID': 'string'
# }

# fileName = 'BPI Challenge 2017'
# df = dd.read_csv('{fileName}.csv'.format(fileName=fileName), dtype=columnTypes)

# Convert every column whose name matches the date/time pattern to UTC datetimes.
for column in df.columns:
    if re.search("[Dd]ate.*|time.*", column):
        df[column] = dd.to_datetime(df[column], utc=True)

# df['case:concept:name'] = df['case:concept:name'].replace(to_replace="Application_", value='', regex=True)
# Make sure the case identifier is an integer column.
df['case:concept:name'] = df['case:concept:name'].astype('int64')

# Collapse everything into a single partition for the following cells.
df = df.repartition(npartitions=1)

In [12]:
def transformToDFG(dfgResult):
    """Turn directly-follows records into a ``{(parent, child): frequency}`` dict.

    Args:
        dfgResult: Iterable of mapping-like records exposing the keys
            ``"parent"``, ``"child"`` and ``"frequency"``.

    Returns:
        A dict keyed by ``(parent, child)`` tuples. When the same pair occurs
        more than once, the last record's frequency is kept.
    """
    return {
        (edge["parent"], edge["child"]): edge["frequency"]
        for edge in dfgResult
    }

def transformToStartEndActivity(activities):
    """Turn activity records into a ``{name: frequency}`` dict.

    Args:
        activities: Iterable of mapping-like records exposing the keys
            ``'name'`` and ``"frequency"``.

    Returns:
        A dict mapping each activity name to its frequency. When a name
        occurs more than once, the last record's frequency is kept.
    """
    return {activity['name']: activity["frequency"] for activity in activities}

In [13]:
def getDFG():
    """Fetch the directly-follows graph and start/end activities from Neo4j.

    Three queries are run: one for ``Activity`` pairs connected by a
    ``PRODUCES`` relationship, one for ``StartActivity`` nodes and one for
    ``EndActivity`` nodes.

    Returns:
        A three-element list ``[dfg, start_activities, end_activities]``:
        a ``{(parent, child): frequency}`` dict followed by two
        ``{name: frequency}`` dicts.
    """
    queries = {
        "dfgQuery": """MATCH result=(p:Activity)-[r:PRODUCES]->(c:Activity) RETURN p.name as parent, c.name as child, r.frequency as frequency""",
        "startEndActivitiesQuery": ["MATCH (a:StartActivity) RETURN a.name as name , a.frequency as frequency", "MATCH (a:EndActivity) RETURN a.name as name , a.frequency as frequency"],
    }

    # NOTE(review): host and password are hard-coded here; consider reading
    # them from configuration/environment instead.
    neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
    try:
        dfgResult = neo4jConnection.run_single_query(queries['dfgQuery'])
        startEndActivitiesResult = neo4jConnection.run_bulk_query(queries['startEndActivitiesQuery'])
    finally:
        # Release the connection right away instead of waiting for garbage
        # collection; clearing the attribute keeps the wrapper's __del__
        # from closing the same driver a second time.
        neo4jConnection._close_driver()
        neo4jConnection.driver = None

    # run_bulk_query keeps the order of the query list: start first, end second.
    startRecords = startEndActivitiesResult[0]["result"]
    endRecords = startEndActivitiesResult[1]["result"]
    return [transformToDFG(dfgResult), transformToStartEndActivity(startRecords), transformToStartEndActivity(endRecords)]
    

In [14]:
# Index the event log by case id; drop=False keeps the id available as a
# regular column too.
# NOTE(review): sorted=True is passed, so the data is presumably already
# ordered by 'case:concept:name' — confirm for every input file.
indexed_df = df.set_index('case:concept:name', drop=False, sorted=True)
# Cast the retained case-id column to the 'string' dtype.
indexed_df['case:concept:name'] = indexed_df['case:concept:name'].astype({'case:concept:name': 'string'})

In [15]:
# Rename the index to 'caseId' (the data column keeps the name
# 'case:concept:name').
indexed_df.index = indexed_df.index.rename('caseId')
# Split the indexed log into 4 partitions.
indexed_df = indexed_df.repartition(npartitions=4)

In [16]:
# Pull the directly-follows graph plus start/end activity frequencies from Neo4j...
dfg, start, end = getDFG()
# ...and wrap them in a pm4py DFG object (used by the inductive miner branch
# of getMinerResult).
dfgObj = DFG(dfg, start_activities=start, end_activities=end)

100% 2/2 [00:00<00:00, 240.12it/s]


In [17]:
@useExecutionTime
def getMinerResult(dfg, miner, threshold = 0.5):
    """Discover a Petri net with the requested miner and serialize it.

    Args:
        dfg: Dict with the keys ``'dfg'`` (used by the heuristic and alpha
            miners) and ``'dfgObj'`` (used by the inductive miner).
        miner: One of ``'heuristic_miner'``, ``'inductive_miner'`` or
            ``'alpha_miner'``.
        threshold: Dependency threshold; only used by the heuristic miner.

    Returns:
        ``{miner: serialize(net, im, fm)}``. Because of the
        ``useExecutionTime`` decorator, callers receive this wrapped in the
        decorator's result/execution-time dict.

    Raises:
        ValueError: If ``miner`` is not one of the supported names.
    """
    if miner == 'heuristic_miner':
        net, im, fm = heuristics_miner.apply_dfg(dfg['dfg'], parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: threshold})
    elif miner == 'inductive_miner':
        net, im, fm = inductive_miner(dfg['dfgObj'])
    elif miner == 'alpha_miner':
        net, im, fm = alpha_miner.apply_dfg(dfg['dfg'])
    else:
        # Fail with a clear message instead of an UnboundLocalError on `net` below.
        raise ValueError("Unknown miner: {miner!r}".format(miner=miner))

    return {miner: serialize(net, im, fm)}
    
def setLazyMiners(dfg):
    """Build one delayed ``getMinerResult`` task per enabled miner.

    Args:
        dfg: Passed unchanged as the first argument of ``getMinerResult``.

    Returns:
        A list of ``dask.delayed`` tasks, one per uncommented miner name below.
    """
    # Comment entries in or out to choose which miners get scheduled.
    enabledMiners = [
#         'heuristic_miner',
        'inductive_miner',
#         'alpha_miner'
    ]
    return [dask.delayed(getMinerResult)(dfg, minerName) for minerName in enabledMiners]

In [18]:
@useExecutionTime
def getMetrics(log, miner, metric, net, im, fm):
    """Evaluate one quality metric of a Petri net against a log.

    Args:
        log: Event data handed to the evaluators.
        miner: Name of the miner that produced the net; used as result key.
        metric: ``'fitness'``, ``'simplicity'``, ``'precision'`` or
            ``'generalization'``. Any other value yields ``0``.
        net, im, fm: Petri net with its initial and final markings.

    Returns:
        ``{miner: {metric: value}}``. If the evaluation raises, the value is
        ``{"error": <exception>}`` instead of the exception propagating.
        Because of the ``useExecutionTime`` decorator, callers receive this
        wrapped in the decorator's result/execution-time dict.
    """
    # Raise the recursion limit to at least 3000 without ever lowering a
    # limit that is already higher.
    sys.setrecursionlimit(max(sys.getrecursionlimit(), 3000))
    try:
        value = 0  # default for unrecognised metric names
        if metric == 'fitness':
            value = fitness_alignment.apply(log, net, im, fm)
        elif metric == 'simplicity':
            value = simplicity_evaluator.apply(net)
        elif metric == 'precision':
            value = precision_alignment.apply(log, net, im, fm)
        elif metric == 'generalization':
            value = generalization.apply(log, net, im, fm)

        return {miner: {metric: value}}
    except Exception as e:
        # Deliberate best-effort: one failing metric is reported in-band.
        return {miner: {metric: {"error": e}}}

def setLazyMetrics(log, miners):
    """Evaluate every metric for every miner result against ``log``.

    Despite the name, ``getMetrics`` is called directly here, so the
    evaluation happens when this function runs.

    Args:
        log: Event data passed through to ``getMetrics``.
        miners: Iterable of miner results, each a dict whose ``'result'``
            entry maps the miner name to its serialized Petri net.

    Returns:
        A list of ``getMetrics`` outputs ordered metric by metric and, within
        each metric, in the order of ``miners``.
    """
    metricNames = [
        'fitness',
        'simplicity',
        'precision',
        'generalization'
    ]

    def evaluate(metricName, minerOutput):
        # The first key of the 'result' dict is the miner name.
        serialized = minerOutput['result']
        algorithm = list(serialized.keys())[0]
        # A fresh net is deserialized for every single evaluation.
        net, im, fm = deserialize(serialized[algorithm])
        return getMetrics(log, algorithm, metricName, net, im, fm)

    return [
        evaluate(metricName, minerOutput)
        for metricName in metricNames
        for minerOutput in miners
    ]

In [19]:
# Build the delayed miner tasks; getMinerResult reads the "dfg" and "dfgObj" keys.
lazyMiners = setLazyMiners({"dfgObj": dfgObj, "dfg": dfg})

In [20]:
# Run the miner tasks. Each element is the useExecutionTime dict:
# {"result": {miner: serialized net}, "execution_time": seconds}.
lazyMinersResults = dask.compute(*lazyMiners)

In [21]:
# NOTE(review): setLazyMetrics calls getMetrics directly (no dask.delayed),
# so this line evaluates all metrics right here against the whole dask
# DataFrame, and `lazyMetrics` is not referenced again in the visible
# notebook — presumably a leftover from before map_partitions was used; confirm.
lazyMetrics = setLazyMetrics(indexed_df, lazyMinersResults)

In [22]:
@dask.delayed
def aggregate(partitions):
    """Merge the per-partition metric outputs into one result per miner/metric.

    Args:
        partitions: Iterable of partitions, each an iterable of
            ``getMetrics`` outputs of the form
            ``{"result": {miner: {metric: value}}, "execution_time": ...}``.

    Returns:
        ``{miner: {metric: merged_value}}``. Fitness, precision and
        generalization values are merged through the ``aggregate`` functions
        of the corresponding ``contribution`` modules; for any other metric
        (e.g. simplicity) the first value seen is kept.
    """
    result = {}
    for partition in partitions:
        for output in partition:
            miner = list(output['result'].keys())[0]
            metric = list(output['result'][miner].keys())[0]
            value = output['result'][miner][metric]

            minerResult = result.setdefault(miner, {})

            if minerResult.get(metric) is None:
                # First value for this miner/metric: store it as-is. It must
                # not also be merged with itself, which would count the first
                # partition twice.
                # NOTE(review): assumes the contribution `aggregate` helpers
                # return the same shape as a single partition's value — confirm.
                minerResult[metric] = value
                continue

            if metric == 'fitness':
                minerResult[metric] = fitness_alignment.aggregate(value, minerResult[metric])
            elif metric == 'precision':
                minerResult[metric] = precision_alignment.aggregate(value, minerResult[metric])
            elif metric == 'generalization':
                minerResult[metric] = generalization.aggregate([value, minerResult[metric]])

    return result

In [23]:
from pm4py.objects import log as log_lib
from pm4py.algo.evaluation.precision import utils as precision_utils
from pm4py.objects.petri_net.utils import align_utils as utils, check_soundness
from pm4py.statistics.start_activities.log.get import get_start_activities
from pm4py.objects.petri_net.utils.align_utils import get_visible_transitions_eventually_enabled_by_marking
from pm4py.util import exec_utils
from typing import Optional, Dict, Any, Union, Tuple
from pm4py.objects.log.obj import EventLog, EventStream
from pm4py.objects.petri_net.obj import PetriNet, Marking
import pandas as pd
from pm4py.algo.evaluation.precision.variants.align_etconformance import align_fake_log_stop_marking, transform_markings_from_sync_to_original_net
from enum import Enum
from pm4py.util import constants

In [24]:
# @dask.delayed
def compute_metrics(aggregatedMetrics, minersResults):
    """Submit the final computation of every aggregated metric to the cluster.

    Args:
        aggregatedMetrics: ``{miner: {metric: aggregated_value}}``. For
            precision and generalization the aggregated value is unpacked as
            keyword arguments, so it must be a mapping.
        minersResults: Iterable of miner results, each a dict whose
            ``'result'`` entry maps the miner name to its serialized net.

    Returns:
        ``{miner: {metric: <result of client.compute>}}``. Uses the
        module-level ``client``.

    Raises:
        IndexError: If ``minersResults`` holds no entry for a miner that
            appears in ``aggregatedMetrics``.
    """
    results = {}

    def getMinerResultByMiner(candidates, miner):
        # Search the argument that was passed in, not a notebook global.
        matches = [value for value in candidates if list(value['result'].keys())[0] == miner]
        if not matches:
            raise IndexError("no miner result found for {miner!r}".format(miner=miner))
        return matches[-1]

    # Iterate the parameter rather than the global `ar`, so the function
    # works on whatever the caller hands in.
    for miner, metrics in aggregatedMetrics.items():
        net, im, fm = deserialize(getMinerResultByMiner(minersResults, miner)['result'][miner])
        for metricKey, metricValue in metrics.items():
            minerTasks = results.setdefault(miner, {})
            # Unrecognised metric names keep a None placeholder.
            minerTasks.setdefault(metricKey, None)
            if metricKey == 'fitness':
                minerTasks[metricKey] = dask.delayed(fitness_alignment.compute)(metricValue)
            elif metricKey == 'precision':
                minerTasks[metricKey] = dask.delayed(precision_alignment.compute)(**metricValue, net=net, im=im, fm=fm)
            elif metricKey == 'generalization':
                minerTasks[metricKey] = dask.delayed(generalization.compute)(**metricValue, net=net)
            elif metricKey == 'simplicity':
                minerTasks[metricKey] = dask.delayed(simplicity_evaluator.apply)(net)

    # Hand every delayed task to the cluster.
    for miner, metrics in results.items():
        for metricKey in metrics:
            results[miner][metricKey] = client.compute(results[miner][metricKey])

    return results

In [25]:
# Apply setLazyMetrics to each partition of the indexed log, passing the
# computed miner results as the extra argument.
# NOTE(review): no `meta=` is supplied and setLazyMetrics returns a plain
# list rather than a DataFrame — confirm how dask infers the output metadata
# here (inference may itself invoke the function).
mapped_data = indexed_df.map_partitions(setLazyMetrics, lazyMinersResults)

In [26]:
# aggregate is wrapped in dask.delayed, so this only builds the task; nothing
# runs until .compute() is called.
aggregated_results = aggregate(mapped_data)

In [27]:
# aggregated_results.visualize()

In [28]:
# Execute the per-partition metric evaluation and the aggregation.
# The recorded run below ended with a KilledWorker error.
ar = aggregated_results.compute()

aligning log, completed variants :: 100%|██████████| 158/158 [01:00<00:00,  2.61it/s]
computing precision with alignments, completed variants :: 100%|██████████| 4895/4895 [02:18<00:00, 35.28it/s]
replaying log with TBR, completed variants :: 100%|██████████| 158/158 [00:01<00:00, 82.48it/s]
aligning log, completed variants :: 100%|██████████| 611/611 [03:58<00:00,  2.56it/s]
aligning log, completed variants :: 100%|██████████| 158/158 [01:07<00:00,  2.34it/s]1 [01:34<03:08, 78.52it/s]  
computing precision with alignments, completed variants :: 100%|██████████| 4895/4895 [02:35<00:00, 31.41it/s]
replaying log with TBR, completed variants :: 100%|██████████| 158/158 [00:01<00:00, 84.41it/s]
aligning log, completed variants :: 100%|██████████| 611/611 [03:55<00:00,  2.60it/s]


aligning log, completed variants :: 100%|██████████| 158/158 [01:08<00:00,  2.31it/s]1 [01:32<03:35, 68.51it/s]  
computing precision with alignments, completed variants :: 100%|██████████| 4895/4895 [33:10<00:00,  2.46it/s]
replaying log with TBR, completed variants :: 100%|██████████| 158/158 [00:01<00:00, 88.06it/s]
aligning log, completed variants :: 100%|██████████| 611/611 [3:53:50<00:00, 22.96s/it]     
aligning log, completed variants :: 100%|██████████| 158/158 [17:22<00:00,  6.60s/it]1 [35:28<03:53, 61.04it/s]  


computing precision with alignments, completed variants :: 100%|██████████| 4895/4895 [02:21<00:00, 34.66it/s]
replaying log with TBR, completed variants :: 100%|██████████| 158/158 [00:01<00:00, 84.35it/s]
aligning log, completed variants :: 100%|██████████| 221/221 [01:43<00:00,  2.13it/s]
aligning log, completed variants :: 100%|██████████| 158/158 [01:04<00:00,  2.44it/s] [01:56<03:43, 19.49it/s]
aligning log, completed variants :: 100%|██████████| 611/611 [04:07<00:00,  2.46it/s][00:24<01:41, 38.86it/s]
aligning log, completed variants :: 100%|██████████| 221/221 [02:00<00:00,  1.83it/s]1 [02:12<03:15, 76.49it/s]  
aligning log, completed variants :: 100%|██████████| 158/158 [01:11<00:00,  2.21it/s] [00:44<02:31, 45.55it/s]


computing precision with alignments, completed variants ::  20%|█▉        | 1624/8242 [01:03<03:18, 33.29it/s]

KilledWorker: ("('repartition-merge-1882b37b23a024fe030a9a1882768b16', 2)", <WorkerState 'tcp://127.0.0.1:40597', name: 1, status: closed, memory: 0, processing: 2>)

In [None]:
# net, im, fm = deserialize(lazyMinersResults[0]['result']['inductive_miner'])

In [None]:
# ar['inductive_miner']['precision'].keys()

In [None]:
# compute(**ar['inductive_miner']['precision'], net=net, im=im, fm=fm)

In [None]:
# r = compute_metrics(ar, lazyMinersResults)

In [None]:
# r

In [None]:
# import importlib
# importlib.reload(precision_alignment)