In [None]:
import dask.dataframe as dd
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask
import dill
import os
import time
from tqdm import tqdm
import pandas as pd
import re

from pm4py.discovery import discover_petri_net_inductive as inductive_miner
from pm4py.discovery import discover_petri_net_alpha as alpha_miner
from pm4py.discovery import discover_petri_net_heuristics as heuristics_miner
from pm4py.discovery import discover_dfg_typed
from pm4py.conformance import fitness_alignments, fitness_token_based_replay
from pm4py import serialize, deserialize

In [None]:
class graph_driver():
    """Small convenience wrapper around a neo4j driver.

    Builds the connection URI from its parts, opens a driver with
    (username, password) authentication and offers helpers that run Cypher
    inside a short-lived session and hand back the records as a plain list.
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        """Store the connection settings and open the driver.

        Args:
            uri_scheme: Scheme part of the connection URI.
            host: Host part of the connection URI.
            port: Port part of the connection URI.
            username: First element of the auth tuple.
            password: Second element of the auth tuple.
        """
        # Set before anything can fail so that _close_driver() / __del__ stay
        # safe even when GraphDatabase.driver() below raises.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __del__(self):
        """Close the driver when the wrapper is garbage collected."""
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; further calls are no-ops."""
        # getattr: the attribute is missing on instances whose __init__ never ran.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Drop the reference first so a failing close() is not retried
            # from __del__.
            self.driver = None
            driver.close()

    def run_single_query(self, query, parameters=None):
        """Run one Cypher statement and return its records as a list.

        Args:
            query: Cypher text.
            parameters: Optional mapping of query parameters, forwarded to
                ``session.run``. Prefer this over formatting values into
                ``query``.

        Returns:
            List of the records produced by the statement.
        """
        with self.driver.session() as session:
            # The records are materialised while the session is still open.
            return self.format_raw_res(session.run(query, parameters))

    def run_bulk_query(self, query_list):
        """Run every statement of ``query_list`` in one session.

        Progress is reported through tqdm.

        Returns:
            List of ``{'query': <text>, 'result': <list of records>}`` dicts,
            in the order of ``query_list``.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                records = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': records})
        return results

    def reset_graph(self, db=None):
        """Delete every node together with its relationships.

        ``db`` is accepted but currently ignored.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Run a node-count query and return its records."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable of records into a list."""
        return list(raw_res)

In [None]:
def setLinks(row):
    """Attach the neighbouring activities to every event of one group.

    Despite its name, ``row`` is the sub-frame that ``groupby(...).apply``
    passes in, not a single row: the function shifts its whole
    ``activityNameEN`` column.

    Args:
        row: Frame with an ``activityNameEN`` column, already in the order
            in which events should be linked.

    Returns:
        A new frame with ``predecessor`` (activity one position earlier) and
        ``successor`` (activity one position later). The first event has no
        predecessor and the last has no successor, so those cells are
        missing values.
    """
    activities = row['activityNameEN']
    # assign() builds a new frame instead of writing into the group object
    # handed in by groupby.apply, which pandas advises against mutating.
    return row.assign(
        predecessor=activities.shift(1),   # lag(1)
        successor=activities.shift(-1),    # lead(1)
    )

def convertToDFG(records):
    """Turn query records into a ``{(parent, child): frequency}`` mapping.

    Each record must offer ``data()`` returning a mapping with the keys
    ``"Parent"``, ``"Child"`` and ``"Frequency"``; the frequency is converted
    to ``float``. When an edge occurs more than once, the last record wins.
    """
    rows = (record.data() for record in records)
    return {
        (fields["Parent"], fields["Child"]): float(fields["Frequency"])
        for fields in rows
    }

In [None]:
# Columns read with an explicit 'string' dtype; every other column is left to
# read_csv's own inference.
textColumns = (
    'case:IDofConceptCase',
    'case:Includes_subCases',
    'case:Responsible_actor',
    'case:caseProcedure',
    'case:concept:name',
    'dateStop',
)
columnTypes = {name: 'string' for name in textColumns}
df = dd.read_csv('BPIC15_1.csv', dtype=columnTypes)

# Any column whose name matches the date/time pattern is parsed into a
# timezone-aware (UTC) datetime column.
temporalColumns = [name for name in df.columns if re.search("[Dd]ate.*|time.*", name)]
for column in temporalColumns:
    df[column] = dd.to_datetime(df[column], utc=True)

In [None]:
# Create a dask.distributed client with default arguments.
client = Client()
# Bare last expression: the notebook displays the client as the cell output.
client

In [None]:
# Placeholder columns: the frame then already has the column layout that
# setLinks returns, so the frame itself can be passed as `meta` below.
df['successor'] = ''
df['predecessor'] = ''

# Order the events by timestamp, then link neighbouring events per case.
eventsByTime = df.sort_values(by='time:timestamp')
eventsPerCase = eventsByTime.groupby('case:concept:name')
df = eventsPerCase.apply(setLinks, meta=df)

In [None]:
def getDFGQueries(dfg):
    """Build one Cypher MERGE statement per directly-follows edge.

    Args:
        dfg: Mapping whose keys are ``(parent, child)`` pairs and whose
            values are the frequency of that edge.

    Returns:
        List of Cypher query strings, one per edge, in the iteration order
        of ``dfg``. Each query merges both ``Activity`` nodes and a
        ``PRODUCES`` relationship carrying the frequency (stored as a quoted
        string, as before).
    """
    def escape(value):
        # Values are spliced into single-quoted Cypher literals, so quotes
        # and backslashes must be escaped. Backslash goes first, otherwise
        # the backslash added for a quote would itself be doubled.
        return str(value).replace('\\', '\\\\').replace("'", "\\'")

    queryTemplate = """
        MERGE (p:Activity {{name: '{parent}'}})
        MERGE (c:Activity {{name: '{child}'}})
        MERGE (p)-[r:PRODUCES {{frequency: '{frequency}'}}]->(c)
    """
    listOfQueries = []
    for parent, child in dfg:
        listOfQueries.append(queryTemplate.format(
            parent=escape(parent),
            child=escape(child),
            frequency=escape(dfg[(parent, child)]),
        ))
    return listOfQueries

In [None]:
def saveDFG(dfg):
    """Discover the typed DFG of the input and store its edges in Neo4j.

    Args:
        dfg: Input handed unchanged to ``discover_dfg_typed``. Despite the
            parameter name, the notebook passes the event dataframe here.

    Returns:
        The object returned by ``discover_dfg_typed``.
    """
    dfgResult = discover_dfg_typed(dfg)
    dfgQuery = getDFGQueries(dfgResult.graph)
    # NOTE(review): connection settings and password are hard-coded here.
    neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
    try:
        neo4jConnection.run_bulk_query(dfgQuery)
    finally:
        # Release the driver deterministically, also when a query fails,
        # instead of relying on garbage collection.
        neo4jConnection._close_driver()
    return dfgResult

In [None]:
# Wrap the saveDFG call in a delayed task; it runs only when the task is
# passed to dask.compute.
lazyDFG = dask.delayed(saveDFG)(df)

In [None]:
# Execute the delayed task. dask.compute returns a tuple, so the value
# returned by saveDFG is dfg_output[0].
dfg_output = dask.compute(lazyDFG) # hosted locally

In [None]:
# neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
# readQuery = """
#     MATCH result=(p:Activity)-[r:PRODUCES]->(c:Activity) 
#     RETURN p.name as Parent, c.name as Child, r.frequency as Frequency"""
# result = neo4jConnection.run_single_query(readQuery)

In [None]:
def getMinerResult(log, miner):
    """Discover a Petri net with the requested miner and return it serialised.

    Args:
        log: Event log handed to the pm4py discovery function.
        miner: One of ``'heuristic_miner'``, ``'inductive_miner'`` or
            ``'alpha_miner'``.

    Returns:
        ``{miner: serialize(net, im, fm)}``; the value can be passed to
        ``deserialize`` to get the ``net, im, fm`` triple back.

    Raises:
        ValueError: If ``miner`` is not one of the supported names.
    """
    if miner == 'heuristic_miner':
        net, im, fm = heuristics_miner(log)
    elif miner == 'inductive_miner':
        net, im, fm = inductive_miner(log, noise_threshold=0.9, multi_processing=True)
    elif miner == 'alpha_miner':
        net, im, fm = alpha_miner(log)
    else:
        raise ValueError(
            "Unknown miner {!r}; expected 'heuristic_miner', "
            "'inductive_miner' or 'alpha_miner'".format(miner)
        )

    # Serialise exactly once, here: every branch above yields the raw
    # (net, im, fm) triple.
    return {miner: serialize(net, im, fm)}
    
def setLazyMiners(log, miners=None):
    """Create one delayed ``getMinerResult`` task per miner.

    Args:
        log: Event log passed to every task.
        miners: Iterable of miner names understood by ``getMinerResult``
            ('heuristic_miner', 'inductive_miner', 'alpha_miner').
            Defaults to only the heuristics miner.

    Returns:
        List of delayed tasks, in the order of ``miners``. Nothing is
        executed until they are passed to ``dask.compute``.
    """
    if miners is None:
        miners = ('heuristic_miner',)
    return [dask.delayed(getMinerResult)(log, miner) for miner in miners]

In [3]:
def getMetrics(metric, log, petrinet, im, fm):
    """Evaluate one quality metric of a Petri net against a log.

    Only ``'fitness'`` is implemented (via ``fitness_token_based_replay``);
    any other metric name yields an empty dict.

    Returns:
        ``{metric: value}`` for a known metric, otherwise ``{}``.
    """
    if metric != 'fitness':
        return {}
    return {metric: fitness_token_based_replay(log, petrinet, im, fm)}

def setLazyMetrics(log, petrinet, im, fm, metrics=None):
    """Create one delayed ``getMetrics`` task per metric.

    Args:
        log: Event log passed to every task.
        petrinet: Petri net passed to every task.
        im: Passed through to ``getMetrics`` together with ``petrinet``.
        fm: Passed through to ``getMetrics`` together with ``petrinet``.
        metrics: Iterable of metric names understood by ``getMetrics``.
            Defaults to only ``'fitness'``.

    Returns:
        List of delayed tasks, in the order of ``metrics``. Nothing is
        executed until they are passed to ``dask.compute``.
    """
    if metrics is None:
        metrics = ('fitness',)
    return [
        dask.delayed(getMetrics)(metric, log, petrinet, im, fm)
        for metric in metrics
    ]

In [None]:
# Build the delayed mining tasks for the event dataframe (not executed yet).
lazyMiners = setLazyMiners(df)

In [None]:
# Run the mining tasks; the result is a tuple holding one
# {miner_name: serialized_net} dict per task.
lazyMinersResults = dask.compute(*lazyMiners)

In [None]:
# Restore the (net, im, fm) triple of the heuristics miner from the first task's result.
net, im, fm = deserialize(lazyMinersResults[0]['heuristic_miner'])

In [None]:
# Build the delayed metric tasks for the restored net (not executed yet).
lazyMetrics = setLazyMetrics(df, net, im, fm)

In [1]:
import sys
# sys.setrecursionlimit(30000)

In [None]:
# Run the metric tasks; the result is a tuple holding one
# {metric_name: value} dict per task.
lazyMetricsResults = dask.compute(*lazyMetrics)