In [1]:
# Miners
from pm4py import serialize, deserialize
from pm4py import discover_dfg as dfg_discovery

from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner


# Evaluators
import pandas as pd
import re
import pm4py
import sys
import dask
from dask.distributed import Client, LocalCluster
from pm4py.discovery import DFG
from pm4py import discover_dfg_typed

from pm4py.algo.evaluation.simplicity import algorithm as simplicity #simplicity
from pm4py.algo.evaluation.replay_fitness import algorithm as fitness_alignment #fitness
from pm4py.algo.evaluation.generalization import algorithm as generalization #generalization

In [2]:
# Start a local Dask cluster with one single-threaded worker.
# NOTE(review): memory_limit=None presumably disables the per-worker memory
# cap (the client summary reports "Total memory: 0 B") — confirm.
cluster = LocalCluster(
    n_workers=1,
    threads_per_worker=1,
    memory_limit=None,
)

2023-04-04 14:31:02,652 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-w9guvi_g', purging
2023-04-04 14:31:02,653 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-kgwcq_u7', purging
2023-04-04 14:31:02,653 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-0ai8y9dn', purging
2023-04-04 14:31:02,653 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-8e99p644', purging
2023-04-04 14:31:02,653 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-77hc6cm8', purging


In [3]:
# Connect a distributed client to the cluster created above; the bare
# `client` expression renders its summary as the cell output.
client = Client(cluster)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 1
Total threads: 1,Total memory: 0 B
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:35045,Workers: 1
Dashboard: http://127.0.0.1:8787/status,Total threads: 1
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:43377,Total threads: 1
Dashboard: http://127.0.0.1:42995/status,Memory: 0 B
Nanny: tcp://127.0.0.1:34339,
Local directory: /tmp/dask-worker-space/worker-t5_7yaso,Local directory: /tmp/dask-worker-space/worker-t5_7yaso


In [4]:
# Switch the cluster to adaptive scaling, bounded by minimum=1 and maximum=8.
cluster.adapt(minimum=1, maximum=8)

<distributed.deploy.adaptive.Adaptive at 0xffff920a3700>

In [5]:
# Raise the interpreter's recursion limit for this kernel process.
# setMetrics() raises it again (to 30000) whenever it is called.
sys.setrecursionlimit(3000)

In [6]:
import timeit

def useExecutionTime(func):
    """Decorator that measures how long each call to ``func`` takes.

    Parameters
    ----------
    func : callable
        The function to time.

    Returns
    -------
    callable
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns ``{'result': <func's return value>,
        'execution_time': <elapsed time from timeit.default_timer()>}``.
        Exceptions raised by ``func`` propagate unchanged.
    """
    # Function-scope import so this cell does not depend on an import elsewhere.
    import functools

    # Preserve __name__/__doc__ of the wrapped function; without this every
    # decorated function would present itself as "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = timeit.default_timer()
        result = func(*args, **kwargs)
        execution_time = timeit.default_timer() - start_time
        return {'result': result, 'execution_time': execution_time}
    return wrapper

In [7]:
@useExecutionTime
def getMinerResult(dfg, miner, threshold = 0.5):
    """Discover a Petri net with the requested miner and return it serialized.

    Parameters
    ----------
    dfg : dict
        Must provide ``'dfg'`` (input to the heuristics and alpha miners) and
        ``'dfgObj'`` (input to the inductive miner).
    miner : str
        One of ``'heuristic_miner'``, ``'inductive_miner'`` or ``'alpha_miner'``.
    threshold : float, optional
        Dependency threshold; only used by ``'heuristic_miner'``.

    Returns
    -------
    dict
        ``{miner: serialize(net, im, fm)}``. Because of the
        ``@useExecutionTime`` decorator, callers actually receive
        ``{'result': <that dict>, 'execution_time': <elapsed time>}``.

    Raises
    ------
    ValueError
        If ``miner`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on ``net``).
    """
    if miner == 'heuristic_miner':
        net, im, fm = heuristics_miner.apply_dfg(
            dfg['dfg'],
            parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: threshold},
        )
    elif miner == 'inductive_miner':
        net, im, fm = pm4py.discover_petri_net_inductive(dfg['dfgObj'])
    elif miner == 'alpha_miner':
        net, im, fm = alpha_miner.apply_dfg(dfg['dfg'])
    else:
        # Fail with a clear message instead of falling through with `net` unbound.
        raise ValueError('Unknown miner: {!r}'.format(miner))

    # The net is returned in serialized form; setMetrics() deserializes it again.
    return {miner: serialize(net, im, fm)}
    
def setMiners(dfg):
    """Build one lazy (dask.delayed) discovery task per enabled miner.

    Parameters
    ----------
    dfg : dict
        Passed unchanged to ``getMinerResult`` for every miner.

    Returns
    -------
    list
        Delayed ``getMinerResult(dfg, miner)`` tasks, in the order the miners
        are listed below. Nothing is computed here.
    """
    enabled_miners = (
        'heuristic_miner',
        'inductive_miner',
        # 'alpha_miner'  (currently disabled)
    )
    return [dask.delayed(getMinerResult)(dfg, name) for name in enabled_miners]

In [8]:
@useExecutionTime
def getMetrics(log, miner, metric, net, im, fm):
    """Evaluate one quality metric of a discovered net.

    Parameters
    ----------
    log
        Event data handed to the pm4py evaluators.
    miner : str
        Name of the miner that produced the net; used as the outer result key.
    metric : str
        'fitness', 'simplicity', 'precision' or 'generalization'. Any other
        value yields 0 for that metric.
    net, im, fm
        The net with its two markings, as returned by ``deserialize``.

    Returns
    -------
    dict
        ``{miner: {metric: value}}``, or ``{miner: {metric: {"error": exc}}}``
        when the evaluation raises. The ``@useExecutionTime`` decorator wraps
        this as ``{'result': ..., 'execution_time': ...}``.
    """
    try:
        # Lazy dispatch table: only the selected evaluator is actually run.
        evaluators = {
            'fitness': lambda: fitness_alignment.apply(log, net, im, fm)['log_fitness'],
            'simplicity': lambda: simplicity.apply(net),
            'precision': lambda: pm4py.precision_alignments(log, net, im, fm),
            'generalization': lambda: generalization.apply(log, net, im, fm),
        }
        evaluate = evaluators.get(metric)
        value = evaluate() if evaluate is not None else 0
        return {miner: {metric: value}}
    except Exception as e:
        # Best effort: report the failure in place of the metric value so the
        # remaining tasks still produce results.
        return {miner: {metric: {"error": e}}}

def setMetrics(log, miners):
    """Build one lazy (dask.delayed) evaluation task per (metric, miner) pair.

    Parameters
    ----------
    log
        Event data forwarded to every ``getMetrics`` task.
    miners : iterable of dict
        Computed ``getMinerResult`` outputs, i.e. items shaped like
        ``{'result': {miner_name: serialized_net}, 'execution_time': ...}``.

    Returns
    -------
    list
        Delayed ``getMetrics`` tasks ordered metric-major (all miners for the
        first metric, then all miners for the next, ...).

    Notes
    -----
    Raises the recursion limit of the calling process to 30000 as a side
    effect. NOTE(review): presumably needed for (de)serializing the nets —
    confirm.
    """
    sys.setrecursionlimit(30000)
    enabled_metrics = (
        'fitness',
        'simplicity',
        # 'precision'  (currently disabled)
        'generalization',
    )

    def _delayed_metric(metric_name, miner_entry):
        # Each miner result holds a single {miner_name: serialized_net} pair.
        serialized = miner_entry['result']
        algorithm = list(serialized.keys())[0]
        net, im, fm = deserialize(serialized[algorithm])
        return dask.delayed(getMetrics)(log, algorithm, metric_name, net, im, fm)

    return [
        _delayed_metric(metric_name, miner_entry)
        for metric_name in enabled_metrics
        for miner_entry in miners
    ]

In [9]:
# Input CSV files; they are read one by one and stacked into a single table.
data_files = [
    'BPIC15_1.csv',
    'BPIC15_2.csv',
    'BPIC15_3.csv',
    'BPIC15_4.csv',
    'BPIC15_5.csv',
]
dataframe = pd.concat(map(pd.read_csv, data_files))
# Label for this run; it is embedded in the result file names written below.
fileName = '-'.join(data_files)

  dataframe = pd.concat((pd.read_csv(filename) for filename in data_files))
  dataframe = pd.concat((pd.read_csv(filename) for filename in data_files))
  dataframe = pd.concat((pd.read_csv(filename) for filename in data_files))
  dataframe = pd.concat((pd.read_csv(filename) for filename in data_files))
  dataframe = pd.concat((pd.read_csv(filename) for filename in data_files))


In [10]:
# fileName = 'BPIC15_5'

In [11]:
# dataframe = pd.read_csv('{}.csv'.format(fileName), dtype={'dateStop': 'string'})

In [12]:
# dataframe = dataframe.rename(columns={"Incident ID": "case:concept:name", "IncidentActivity_Type": "concept:name", "DateStamp": "time:timestamp"})

In [13]:
# dataframe = dataframe.rename(columns={"CustomerID": "case:concept:name", "SessionID": "concept:name", "TIMESTAMP": "time:timestamp"})

In [14]:
# BPI_2019
# dataframe = dataframe.rename(columns={"case concept:name": "case:concept:name", "event concept:name": "concept:name", "event time:timestamp": "time:timestamp"})

In [15]:
# Normalise the three columns used downstream: timestamps become UTC-aware
# datetimes, activity and case identifiers become plain strings.
dataframe['time:timestamp'] = pd.to_datetime(dataframe['time:timestamp'], utc=True)
for column in ('concept:name', 'case:concept:name'):
    dataframe[column] = dataframe[column].astype(str)

In [16]:
# Time the directly-follows-graph discovery on the prepared event table.
start_time = timeit.default_timer()
# dfg_discovery is pm4py.discover_dfg (aliased in the import cell).
dfg, start_activities, end_activities = dfg_discovery(dataframe)
end_time = timeit.default_timer()
# Elapsed time as measured by timeit.default_timer(); none of the result
# files written at the end of the notebook includes this value.
dfgDiscoveryExecutionTime = end_time - start_time

In [17]:
# Bundle the discovered graph with its start/end activities into a DFG
# object; this is the form getMinerResult() passes to the inductive miner.
dfgObj = DFG(
    dfg,
    start_activities=start_activities,
    end_activities=end_activities,
)

In [18]:
# Build the lazy discovery tasks (nothing runs yet). Both DFG forms are
# passed because the miners consume different ones.
minersResults = setMiners({"dfgObj": dfgObj, "dfg": dfg})

In [19]:
# Run the discovery tasks. Note: this rebinds minersResults from the list of
# lazy tasks to their computed results.
minersResults = dask.compute(*minersResults)

In [20]:
# Hand the event table to the cluster via the client; the returned handle is
# what gets passed to the metric tasks as their `log` argument.
scattered_data = client.scatter(dataframe)

In [21]:
# Build one lazy evaluation task per (metric, miner) pair (nothing runs yet).
metricsResults = setMetrics(scattered_data, minersResults)

In [22]:
# Run the evaluation tasks. Note: this rebinds metricsResults from the list of
# lazy tasks to their computed results. This is the long-running step of the
# notebook (see the progress bars in the cell output).
metricsResults = dask.compute(*metricsResults)

replaying log with TBR, completed variants :: 100%|██████████| 5539/5539 [01:32<00:00, 60.17it/s] 
aligning log, completed variants :: 100%|██████████| 5539/5539 [1:06:05<00:00,  1.40it/s] 1.24it/s]
replaying log with TBR, completed variants :: 100%|██████████| 5539/5539 [1:16:13<00:00,  1.21it/s]
replaying log with TBR, completed variants :: 100%|██████████| 5539/5539 [1:17:11<00:00,  1.20it/s]


In [23]:
def getStatisticalDataFrames(metricsResults, minersResults, data_set=None):
    """Collect metric values and timings into three pandas DataFrames.

    Parameters
    ----------
    metricsResults : iterable of dict
        Items shaped like ``{'result': {miner: {metric: value}},
        'execution_time': t}``. If the same (miner, metric) pair occurs more
        than once, the first non-None value/time is kept.
    minersResults : iterable of dict
        Items shaped like ``{'result': {miner: ...}, 'execution_time': t}``.
        If a miner occurs more than once, the last execution time is kept.
    data_set : str, optional
        Label stored in the ``data_set`` column of every frame. Defaults to
        the notebook-level ``fileName`` variable (the previous behaviour).

    Returns
    -------
    list of pandas.DataFrame
        ``[metric values per miner, metric execution times per miner,
        miner execution times]``. The first two are indexed by metric name;
        the third has the single row ``'execution_time'``.
    """
    if data_set is None:
        # Backward compatible: fall back to the notebook-global label.
        data_set = fileName

    resultsPerMiner = {}
    metricsExecutionTimePerMiner = {}
    minerExecutionTime = {}

    for entry in metricsResults:
        # Each entry carries exactly one miner with exactly one metric.
        miner = list(entry['result'].keys())[0]
        metricKey = list(entry['result'][miner].keys())[0]
        metricValue = entry['result'][miner][metricKey]

        minerValues = resultsPerMiner.setdefault(miner, {})
        minerTimes = metricsExecutionTimePerMiner.setdefault(miner, {})

        # Identity check (`is None`), not equality, so the test stays valid
        # for any value type.
        if minerValues.get(metricKey) is None:
            minerValues[metricKey] = metricValue
        if minerTimes.get(metricKey) is None:
            minerTimes[metricKey] = entry['execution_time']

    for entry in minersResults:
        miner = list(entry['result'].keys())[0]
        minerExecutionTime[miner] = entry['execution_time']

    # Tag every frame with the data set label.
    resultsPerMiner['data_set'] = data_set
    metricsExecutionTimePerMiner['data_set'] = data_set
    minerExecutionTime['data_set'] = data_set

    return [
        pd.DataFrame(resultsPerMiner),
        pd.DataFrame(metricsExecutionTimePerMiner),
        pd.DataFrame(minerExecutionTime, index=['execution_time'])
    ]

In [24]:
# Unpack the three frames: metric values, metric execution times and miner
# execution times (in that order).
results, execution_times, miner_execution_time = getStatisticalDataFrames(metricsResults, minersResults)

In [25]:
# results

In [26]:
# execution_times

In [27]:
# Persist the per-metric execution times.
# NOTE(review): assumes './results/2 - parallel setup/' already exists — confirm.
execution_times.to_csv('./results/2 - parallel setup/{}_metrics_execution_times.csv'.format(fileName))

In [28]:
# Persist the metric values per miner.
# NOTE(review): assumes './results/2 - parallel setup/' already exists — confirm.
results.to_csv('./results/2 - parallel setup/{}_results.csv'.format(fileName))

In [29]:
# Persist the discovery execution time per miner.
# NOTE(review): assumes './results/2 - parallel setup/' already exists — confirm.
miner_execution_time.to_csv('./results/2 - parallel setup/{}_miner_execution_time.csv'.format(fileName))

In [30]:
# Show the data set label that was embedded in the result file names above.
fileName

'BPIC15_1.csv-BPIC15_2.csv-BPIC15_3.csv-BPIC15_4.csv-BPIC15_5.csv'