In [None]:
import dask.dataframe as dd
from dask.dataframe import from_pandas
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask

import os
import time
from tqdm import tqdm
import pandas as pd
import re

#importers
from pm4py import read_xes, convert_to_event_log, convert_to_dataframe

# Miners
from pm4py import discover_dfg_typed as dfg_discovery, serialize, deserialize
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.algo.discovery.inductive import algorithm as inductive_miner
from pm4py.algo.discovery.correlation_mining import algorithm as correlation_miner
from pm4py.algo.discovery.temporal_profile import algorithm as temporal_profile_discovery


# Evaluators
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness_evaluator
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator

In [None]:
import sys

# Raise the interpreter's recursion ceiling well above the default.
# NOTE(review): presumably needed by deeply recursive process-mining
# routines used further down — confirm which call actually requires it.
sys.setrecursionlimit(30_000)

In [None]:
class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    Builds the connection URI from its parts, opens a driver in the
    constructor and exposes helpers that run Cypher text and return the
    records as plain lists.

    Parameters
    ----------
    uri_scheme : str
        URI scheme placed in front of ``host:port`` (default ``'bolt'``).
    host : str
        Host name of the Neo4j server.
    port : str or int
        Port of the Neo4j server.
    username, password : str
        Credentials passed to the driver as the ``auth`` tuple.
        NOTE(review): the default password is a literal in source; prefer
        passing credentials explicitly (e.g. from the environment).
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        # Assigned first so that __del__/_close_driver stay safe even when
        # driver creation below raises and the instance is only half-built.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __del__(self):
        # Best-effort cleanup when the wrapper is garbage collected.
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; later calls are no-ops."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Clear the reference before closing so a second call (for
            # example from __del__) does not close the driver again.
            self.driver = None
            driver.close()

    def run_single_query(self, query):
        """Run one Cypher statement and return its records as a list."""
        with self.driver.session() as session:
            # Records are materialised while the session is still open.
            return self.format_raw_res(session.run(query))

    def run_bulk_query(self, query_list):
        """Run every statement of ``query_list`` in a single session.

        Returns a list of ``{'query': ..., 'result': ...}`` dicts, one per
        statement and in input order. Progress is shown through ``tqdm``.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                records = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': records})
        return results

    def reset_graph(self, db=None):
        """Delete every node (and its relationships) from the graph.

        ``db`` is accepted for compatibility but is currently not used.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Return the node count; doubles as a connectivity check."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable of records into a list."""
        return list(raw_res)

In [None]:
# Close the Dask client left over from a previous run of this notebook, if any.
# On a fresh kernel `client` has not been created yet, so there is nothing to
# close and the NameError is expected.
try:
    client.close()
except NameError:
    pass

In [None]:
# Create the Dask distributed client, asking for 4 workers with 8 threads each.
client = Client(n_workers=4, threads_per_worker=8)
# Bare expression: shows the client summary as the cell output.
client

In [None]:
# Read the XES event log, convert it to a DataFrame and wrap it as a
# single-partition Dask DataFrame. The path is relative to the working directory.
df = from_pandas(convert_to_dataframe(read_xes('BPIC15_1.xes')), npartitions=1)

In [None]:
# Split into 4 partitions and index by case id; drop=False keeps the
# 'case:concept:name' column alongside the index.
# NOTE: `df` is rebound here, so this cell depends on the load cell having run.
df = df.repartition(npartitions=4).set_index('case:concept:name', drop=False)

In [None]:
# Rename the index to "caseId".
# NOTE(review): presumably avoids the index sharing its name with the retained
# 'case:concept:name' column — confirm.
df.index = df.index.rename("caseId")

In [None]:
def getDFGQueries(dfg):
    """Build one Cypher MERGE statement per directly-follows edge.

    Parameters
    ----------
    dfg : mapping
        Maps ``(parent, child)`` activity pairs to their frequency.

    Returns
    -------
    list of str
        One query per edge. Each query merges both ``Activity`` nodes and the
        ``PRODUCES`` relationship, setting the frequency on creation and
        adding to it when the relationship already exists.
    """
    def escapeLiteral(value):
        # Activity names are embedded in single-quoted Cypher literals, so a
        # quote or backslash in a name would otherwise break (or alter) the
        # statement. Backslashes go first so the quote escapes added
        # afterwards are not escaped a second time.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    queryTemplate = """
        MERGE (p:Activity {{name: '{parent}'}})
        MERGE (c:Activity {{name: '{child}'}})
        MERGE (p)-[r:PRODUCES]->(c)
        ON CREATE SET r.frequency={frequency}
        ON MATCH SET r.frequency=r.frequency+{frequency}
    """
    return [
        queryTemplate.format(
            parent=escapeLiteral(parent),
            child=escapeLiteral(child),
            frequency=frequency,
        )
        for (parent, child), frequency in dfg.items()
    ]

In [None]:
def saveDFG(dfg):
    """Discover the DFG of one event-log chunk and write it to Neo4j.

    Parameters
    ----------
    dfg
        The event data handed to ``dfg_discovery``; when called through
        ``map_partitions`` this is one partition of the event DataFrame
        (despite the parameter name).

    Returns
    -------
    The object returned by ``dfg_discovery``; its ``graph`` attribute is what
    gets turned into Cypher statements.

    The Neo4j password is read from the ``NEO4J_PASSWORD`` environment
    variable when set and falls back to the previous literal otherwise.
    """
    dfgResult = dfg_discovery(dfg)
    dfgQuery = getDFGQueries(dfgResult.graph)
    neo4jConnection = graph_driver(
        uri_scheme="neo4j",
        host="neo4j",
        password=os.environ.get("NEO4J_PASSWORD", "123456"),
    )
    try:
        neo4jConnection.run_bulk_query(dfgQuery)
    finally:
        # Release the connection right away (also on failure) instead of
        # waiting for the wrapper to be garbage collected.
        neo4jConnection._close_driver()
    return dfgResult

In [None]:
# Apply saveDFG to every partition of df; the result is computed in a later cell.
# NOTE(review): no `meta` is passed, so Dask may call saveDFG on sample data
# while building the graph (including its Neo4j writes) — confirm and consider
# supplying `meta` explicitly.
paritionedLazyDFG = df.map_partitions(saveDFG)

In [None]:
%%time
# Trigger the per-partition DFG discovery and Neo4j writes, timing the cell.
# dask.compute returns a tuple, so the computed value is test[0].
test = dask.compute(paritionedLazyDFG)

In [None]:
# FIXME(review): `lazyDFG` is not assigned in any visible cell, so this raises
# NameError on a fresh kernel. Define it before this cell (it must compute to an
# object with a `.graph` attribute, which the miner cell reads from dfg_output).
dfg_output = dask.compute(lazyDFG)[0] # hosted locally

In [None]:
def getMinerResult(dfg, miner, threshold = 0.5):
    """Discover a Petri net from a DFG with the chosen miner.

    Parameters
    ----------
    dfg
        Directly-follows graph passed to the miner's ``apply_dfg``.
    miner : str
        One of ``'heuristic_miner'``, ``'inductive_miner'`` or
        ``'alpha_miner'``.
    threshold : float
        Dependency threshold; only used by the heuristic miner. The inductive
        miner is called with a fixed ``noise_threshold`` of 0.9.

    Returns
    -------
    dict
        ``{miner: serialize(net, im, fm)}``.

    Raises
    ------
    ValueError
        If ``miner`` is not one of the supported names (previously this
        surfaced as an UnboundLocalError on ``net``).
    """
    if miner == 'heuristic_miner':
        net, im, fm = heuristics_miner.apply_dfg(dfg, parameters={heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH: threshold})
    elif miner == 'inductive_miner':
        net, im, fm = inductive_miner.apply_dfg(dfg, noise_threshold=0.9, multi_processing=True)
    elif miner == 'alpha_miner':
        net, im, fm = alpha_miner.apply_dfg(dfg)
    else:
        raise ValueError(
            "Unknown miner {!r}; expected 'heuristic_miner', 'inductive_miner' or 'alpha_miner'".format(miner)
        )

    return {miner: serialize(net, im, fm)}
    
def setLazyMiners(dfg):
    """Return one delayed ``getMinerResult`` task per enabled miner.

    Nothing is executed here; the returned list is meant to be handed to
    ``dask.compute``.
    """
    enabledMiners = (
        'heuristic_miner',
        # 'inductive_miner',
        # 'alpha_miner',
    )
    return [dask.delayed(getMinerResult)(dfg, minerName) for minerName in enabledMiners]

In [None]:
def getMetrics(metric, log, petrinet, im, fm):
    """Evaluate one quality metric of a Petri net.

    ``metric`` selects the evaluator: ``'fitness'``, ``'simplicity'``,
    ``'precision'`` or ``'generalization'``. Returns ``{metric: value}``,
    or an empty dict when the metric name is not recognised.
    """
    if metric == 'fitness':
        value = replay_fitness_evaluator.apply(log, petrinet, im, fm, variant=replay_fitness_evaluator.TOKEN_BASED)
    elif metric == 'simplicity':
        # Simplicity only looks at the net itself, not at the log.
        value = simplicity_evaluator.apply(petrinet)
    elif metric == 'precision':
        value = precision_evaluator.apply(log, petrinet, im, fm, variant=precision_evaluator.Variants.ETCONFORMANCE_TOKEN)
    elif metric == 'generalization':
        value = generalization_evaluator.apply(log, petrinet, im, fm)
    else:
        return {}

    return {metric: value}

def setLazyMetrics(dfg, petrinet, im, fm):
    """Return one delayed ``getMetrics`` task per enabled metric.

    ``dfg`` is forwarded as the ``log`` argument of ``getMetrics``; the
    remaining arguments are passed through unchanged.
    """
    enabledMetrics = (
        'fitness',
        # 'simplicity',
        # 'precision',
        # 'generalization',
    )
    return [
        dask.delayed(getMetrics)(metricName, dfg, petrinet, im, fm)
        for metricName in enabledMetrics
    ]

In [None]:
# Build the delayed miner tasks from the graph of the computed DFG.
lazyMiners = setLazyMiners(dfg_output.graph)

In [None]:
# Render the task graph of the first miner task as the cell output.
lazyMiners[0].visualize()

In [None]:
# Run all miner tasks; yields one {miner_name: serialized_net} dict per task.
lazyMinersResults = dask.compute(*lazyMiners)

In [None]:
# Restore the heuristic miner's result (first task) into net, im and fm.
net, im, fm = deserialize(lazyMinersResults[0]['heuristic_miner'])

In [None]:
# Build the delayed metric tasks; the Dask DataFrame `df` is passed as the log.
lazyMetrics = setLazyMetrics(df, net, im, fm)

In [None]:
# Run all metric tasks; yields one {metric_name: value} dict per task.
lazyMetricsResults = dask.compute(*lazyMetrics)