In [69]:
import dask.dataframe as dd
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask
import dill
import os
import time
from tqdm import tqdm
import pandas as pd
import re

from pm4py.discovery import discover_petri_net_inductive as inductive_miner
from pm4py.discovery import discover_petri_net_alpha as alpha_miner
from pm4py.discovery import discover_petri_net_heuristics as heuristics_miner
from pm4py.discovery import discover_dfg_typed

In [2]:
class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    Builds a connection URI from its parts, opens a driver with basic
    (username, password) auth, and offers helpers that run Cypher inside a
    session and return the records as plain lists.

    NOTE(review): the default password is hard-coded in the signature; prefer
    passing credentials in from the environment or a secrets store.
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        """Store the connection settings and open the driver.

        Args:
            uri_scheme: URI scheme placed before ``://``.
            host: Server host name.
            port: Server port (formatted into the URI as-is).
            username: User name for the auth tuple.
            password: Password for the auth tuple.
        """
        # Bind the attribute first so __del__/_close_driver stay safe even if
        # the driver constructor below raises.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __del__(self):
        """Close the driver when the wrapper is garbage collected."""
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; later calls are no-ops."""
        # getattr covers instances whose __init__ never ran to completion.
        driver = getattr(self, 'driver', None)
        if driver is None:
            return
        # Drop the reference before closing so a repeated call (for example
        # an explicit close followed by __del__) does not close twice.
        self.driver = None
        driver.close()

    def run_single_query(self, query, parameters=None):
        """Run one query in its own session.

        Args:
            query: Cypher text.
            parameters: Optional mapping of query parameters, forwarded to
                ``session.run``. Defaults to None (no parameters).

        Returns:
            list: The records yielded by the query result.
        """
        with self.driver.session() as session:
            # Materialise inside the ``with`` block, while the session is open.
            return self.format_raw_res(session.run(query, parameters))

    def run_bulk_query(self, query_list):
        """Run every query of ``query_list`` sequentially in one session.

        A tqdm progress bar is shown while iterating.

        Returns:
            list[dict]: One ``{'query': ..., 'result': ...}`` entry per query,
            in input order.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                res = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': res})
        return results

    def reset_graph(self, db=None):
        """Delete every node together with its relationships.

        Args:
            db: Accepted for interface compatibility; not used by this method.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Run a node-count query; returns its records."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable query result into a list."""
        return list(raw_res)

In [3]:
def setLinks(row):
    """Add neighbouring-activity columns to a frame of events.

    Despite the parameter name, ``row`` is a DataFrame-like object with an
    ``activityNameEN`` column (it is used as the groupby-apply callback).
    ``predecessor`` gets the activity of the previous row and ``successor``
    the activity of the next row; the first/last positions are left missing.
    The frame is modified in place and also returned.
    """
    activity = row['activityNameEN']
    row['predecessor'] = activity.shift(1)   # previous row's activity (lag 1)
    row['successor'] = activity.shift(-1)    # next row's activity (lead 1)
    return row

def convertToDFG(records):
    """Turn query records into a directly-follows mapping.

    Each record must expose ``data()`` returning a mapping with the keys
    ``Parent``, ``Child`` and ``Frequency``.

    Returns:
        dict: ``{(parent, child): float(frequency)}``; when a pair occurs
        more than once the last record wins.
    """
    rows = (record.data() for record in records)
    return {(row["Parent"], row["Child"]): float(row["Frequency"]) for row in rows}

In [57]:
# Columns that are read as strings instead of letting the CSV reader infer a dtype.
columnTypes = dict.fromkeys(
    [
        'case:IDofConceptCase',
        'case:Includes_subCases',
        'case:Responsible_actor',
        'case:caseProcedure',
        'case:concept:name',
        'dateStop',
    ],
    'string',
)
df = dd.read_csv('BPIC15_1.csv', dtype=columnTypes)

# Convert every column whose name matches the date/time pattern to UTC datetimes.
for column in df.columns:
    if re.search("[Dd]ate.*|time.*", column) is None:
        continue
    df[column] = dd.to_datetime(df[column], utc=True)

In [5]:
# Start a Dask distributed client with default arguments; the summary rendered
# below shows it backed by a LocalCluster with 4 single-threaded workers.
client = Client()
# Bare expression so the notebook displays the client/cluster summary.
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 7.68 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:45729,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 7.68 GiB

0,1
Comm: tcp://127.0.0.1:39867,Total threads: 1
Dashboard: http://127.0.0.1:39303/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:38517,
Local directory: /tmp/dask-worker-space/worker-i4pvijlp,Local directory: /tmp/dask-worker-space/worker-i4pvijlp

0,1
Comm: tcp://127.0.0.1:45131,Total threads: 1
Dashboard: http://127.0.0.1:40463/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:41959,
Local directory: /tmp/dask-worker-space/worker-mmlk_rkz,Local directory: /tmp/dask-worker-space/worker-mmlk_rkz

0,1
Comm: tcp://127.0.0.1:36281,Total threads: 1
Dashboard: http://127.0.0.1:37157/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:46205,
Local directory: /tmp/dask-worker-space/worker-1xmo33oz,Local directory: /tmp/dask-worker-space/worker-1xmo33oz

0,1
Comm: tcp://127.0.0.1:44339,Total threads: 1
Dashboard: http://127.0.0.1:45369/status,Memory: 1.92 GiB
Nanny: tcp://127.0.0.1:45293,
Local directory: /tmp/dask-worker-space/worker-px2te_ru,Local directory: /tmp/dask-worker-space/worker-px2te_ru


In [6]:
# Pre-create the link columns so that `df`, which is passed as `meta` below,
# already has the columns that setLinks adds.
df['successor'] = ''
df['predecessor'] = ''
# Sort events by timestamp, then let setLinks fill predecessor/successor per case.
# NOTE(review): this relies on each group still being in timestamp order when
# setLinks receives it after the Dask groupby-apply -- confirm, or sort inside
# the applied function.
# NOTE: this cell rebinds `df`; re-running it operates on the already-linked frame.
df = df.sort_values(by='time:timestamp').groupby('case:concept:name').apply(setLinks, meta=df)

In [7]:
def getQueries(activities):
    """Build one Cypher MERGE statement per event row.

    Args:
        activities: DataFrame-like object supporting ``iterrows()`` whose rows
            have ``activityNameEN`` and ``successor`` entries.

    Returns:
        list[str]: One query per row, in row order. Each query merges the
        activity node, the successor node and a PRODUCES relationship whose
        ``cost`` property is the literal '1'.

    Values are converted with ``str()`` and escaped before being placed inside
    the single-quoted Cypher literals, so names containing an apostrophe or a
    backslash no longer produce broken (or injected) Cypher. A missing
    successor such as NaN is still rendered through ``str()`` (i.e. as 'nan').
    """
    def _cypherLiteral(value):
        # Escape backslashes first so the backslashes added for quotes are
        # not doubled afterwards.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    queryTemplate = """
        MERGE (a:Activity {{name: '{activity}'}})
        MERGE (s:Activity {{name: '{successor}'}})
        MERGE (a)-[r:PRODUCES {{cost: '{cost}'}}]->(s)
    """
    return [
        queryTemplate.format(
            activity=_cypherLiteral(record['activityNameEN']),
            successor=_cypherLiteral(record['successor']),
            cost=1,
        )
        for _, record in activities.iterrows()
    ]

In [8]:
def saveActivities(activities):
    """Write the activity/successor pairs of ``activities`` to Neo4j.

    Builds one MERGE query per row with ``getQueries``, runs them through a
    fresh ``graph_driver`` connection and prints the elapsed wall-clock time
    (query building included). Returns None.

    NOTE(review): host and password are hard-coded here; consider reading
    them from configuration or the environment.
    """
    read_queries_start_time = time.time()
    activitiesQueries = getQueries(activities)
    neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
    try:
        neo4jConnection.run_bulk_query(activitiesQueries)
    finally:
        # Release the driver deterministically, also when a query fails,
        # instead of relying on garbage collection.
        neo4jConnection._close_driver()
    read_queries_time = time.time()-read_queries_start_time
    print("----Finshed saving nodes: in {time}".format(time=str(read_queries_time)))

In [9]:
# Wrap the save step as a delayed task; nothing is executed until dask.compute is called.
lazyTasks = dask.delayed(saveActivities)(df)

In [10]:
# Execute the delayed save. saveActivities returns None, so `result` only holds
# the compute() tuple; the progress bar below shows 52217 queries in about 19.5 minutes.
result = dask.compute(lazyTasks) # hosted locally

100%|██████████| 52217/52217 [19:29<00:00, 44.65it/s]  


In [11]:
# Open a notebook-level connection and read back every PRODUCES edge as
# (Parent, Child, Frequency) records; Frequency is the stored `cost` property.
# NOTE(review): credentials are hard-coded here; consider loading them from the environment.
# NOTE: this rebinds the global `result` that the compute cell above also assigns.
neo4jConnection = graph_driver(uri_scheme="neo4j",host="neo4j", password="123456")
readQuery = """
    MATCH result=(p:Activity)-[r:PRODUCES]->(c:Activity) 
    RETURN p.name as Parent, c.name as Child, r.cost as Frequency"""
result = neo4jConnection.run_single_query(readQuery)

In [12]:
# Convert the Neo4j records into a {(parent, child): frequency} dictionary.
dfg = convertToDFG(result)

In [13]:
def getMinersResult(dfg, threshold=0.99):
    """Run the configured miners on a directly-follows graph.

    Args:
        dfg: Directly-follows graph passed to each miner's ``apply_dfg``.
        threshold: Value supplied for the heuristics miner's
            DEPENDENCY_THRESH parameter (also passed to the inductive miner).

    Returns:
        dict: ``{"result": {"alphaMiner": ..., "heuristicMiner": ...,
        "inductiveMiner": ...}}``. The alpha miner is currently disabled and
        contributes an empty dict.

    The top-level key is the string "result". Previously the bare name
    ``result`` was used, which resolved to whatever notebook-global ``result``
    held at call time (an unhashable list after the read-back query).

    NOTE(review): in the visible import cell ``heuristics_miner`` and
    ``inductive_miner`` are aliases of ``pm4py.discovery`` functions; confirm
    that ``.apply_dfg`` and ``.Variants`` are available on them for the
    installed pm4py version.
    """
    dependencyKey = heuristics_miner.Variants.CLASSIC.value.Parameters.DEPENDENCY_THRESH

    alphaMiner = {}  # placeholder: alpha_miner.apply_dfg(dfg) is disabled
    heuristicMiner = heuristics_miner.apply_dfg(dfg, parameters={dependencyKey: threshold})
    inductiveMiner = inductive_miner.apply_dfg(dfg, parameters={dependencyKey: threshold})
    return {
        "result": {
            "alphaMiner": alphaMiner,
            "heuristicMiner": heuristicMiner,
            "inductiveMiner": inductiveMiner
        }
    }

In [73]:
def logToDFG(activities):
    """Return the result of ``discover_dfg_typed`` for the given event log."""
    return discover_dfg_typed(activities)

In [74]:
# Defer the DFG discovery as a delayed task over the event frame.
lazyOutput = dask.delayed(logToDFG)(df)

In [85]:
# dask.compute returns a tuple with one entry per argument; take the single result.
dfg_output = dask.compute(lazyOutput)[0]

In [106]:
# Iterating the graph yields (source, target) key pairs; print each edge with
# the frequency stored under that pair.
for k, v in dfg_output.graph:
    print("{key} -> {value} --- {freq}".format(key=k, value=v, freq=dfg_output.graph[(k,v)]))

01_BB_540 -> 01_BB_545 --- 1
01_BB_540 -> 01_BB_546 --- 2
01_BB_540 -> 01_BB_590 --- 1
01_BB_540 -> 01_BB_765 --- 6
01_BB_540 -> 01_BB_770 --- 24
01_BB_540 -> 01_BB_775 --- 13
01_BB_540 -> 01_HOOFD_101b --- 1
01_BB_540 -> 01_HOOFD_195 --- 1
01_BB_540 -> 01_HOOFD_250 --- 1
01_BB_540 -> 01_HOOFD_370 --- 1
01_BB_540 -> 01_HOOFD_500 --- 1
01_BB_540 -> 01_HOOFD_510_1 --- 1
01_BB_540 -> 01_HOOFD_510_3 --- 1
01_BB_540 -> 01_HOOFD_530b --- 2
01_BB_540 -> 01_HOOFD_790 --- 1
01_BB_540 -> 01_HOOFD_800 --- 6
01_BB_540 -> 01_HOOFD_805 --- 1
01_BB_540 -> 01_HOOFD_809 --- 11
01_BB_540 -> 01_HOOFD_809c --- 14
01_BB_540 -> 01_HOOFD_810 --- 3
01_BB_540 -> 01_HOOFD_811 --- 9
01_BB_540 -> 01_HOOFD_814 --- 8
01_BB_540 -> 01_HOOFD_815 --- 9
01_BB_540 -> 01_HOOFD_820 --- 12
01_BB_545 -> 01_BB_546 --- 1
01_BB_545 -> 01_BB_550_1 --- 1
01_BB_545 -> 01_HOOFD_490_1a --- 1
01_BB_546 -> 01_BB_545 --- 1
01_BB_546 -> 01_BB_590 --- 1
01_BB_546 -> 01_BB_630 --- 1
01_BB_550 -> 01_BB_560 --- 1
01_BB_550_1 -> 01_BB_550_2 