In [1]:
import dask.dataframe as dd
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from neo4j.exceptions import ClientError
from dask.distributed import Client, LocalCluster, get_worker
import dask

import os
import time
from tqdm import tqdm
import pandas as pd
import re
import gc

# Miners
from pm4py import serialize, deserialize
from pm4py import discover_dfg_typed as dfg_discovery

In [2]:
class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    The constructor builds the connection URI from its parts and opens a
    driver via ``GraphDatabase.driver``. The instance can be used as a context
    manager so the driver is closed deterministically::

        with graph_driver(host="neo4j") as graph:
            graph.test_connection()

    Parameters
    ----------
    uri_scheme, host, port : str
        Combined into ``"{uri_scheme}://{host}:{port}"``.
    username, password : str
        Passed to the driver as the ``auth`` tuple.
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password='123456'):
        # Assigned first so that __del__/_close_driver stay safe even when the
        # driver creation at the end of this method raises.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __enter__(self):
        """Return the wrapper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never swallows exceptions."""
        self._close_driver()
        return False

    def __del__(self):
        # Best-effort fallback; prefer the context manager or _close_driver().
        self._close_driver()

    def _close_driver(self):
        """Close the underlying driver once; later calls are no-ops."""
        # getattr guards against instances whose __init__ never ran to the
        # point of assigning ``driver``.
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Clear the reference before closing so a second call cannot
            # close the same driver again.
            self.driver = None
            driver.close()

    def run_single_query(self, query, parameters=None):
        """Run one query in its own session and return its records as a list.

        Parameters
        ----------
        query : str
            The Cypher statement to run.
        parameters : dict, optional
            Query parameters forwarded to ``session.run``. Prefer these over
            formatting values into ``query``.
        """
        with self.driver.session() as session:
            # The result must be consumed while the session is still open.
            return self.format_raw_res(session.run(query, parameters))

    def run_bulk_query(self, query_list):
        """Run every query of ``query_list`` in one session, with a progress bar.

        Returns a list of ``{'query': <query>, 'result': <list of records>}``
        dicts, in the order the queries were given.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                records = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': records})
        return results

    def reset_graph(self, db=None):
        """Delete every node together with its relationships.

        ``db`` is accepted but currently not used by this method.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Run a node-count query and return its records."""
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable of records into a plain list."""
        return list(raw_res)

In [3]:
def useExecutionTime(func):
    """Decorator that reports how long each call to ``func`` takes.

    The decorated function returns a dict instead of the raw value:
    ``{"result": <value returned by func>, "execution_time": <seconds>}``.
    Exceptions raised by ``func`` propagate unchanged.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    from functools import wraps

    @wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def compute(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted (or go negative) by system clock adjustments.
        begin = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - begin

        return {"result": result, "execution_time": elapsed}

    return compute

@useExecutionTime
def getComputeTime(*args, **kwargs):
    """Forward all arguments to ``dask.compute``.

    Because of the ``useExecutionTime`` decorator, callers receive a dict with
    the computed value under ``"result"`` and the elapsed seconds under
    ``"execution_time"``.
    """
    computed = dask.compute(*args, **kwargs)
    return computed

In [4]:
def transformToDFG(dfgResult):
    """Turn edge records into a ``{(parent, child): frequency}`` mapping.

    Each record must support lookup by the keys ``"parent"``, ``"child"`` and
    ``"frequency"``. When the same (parent, child) pair occurs more than once,
    the last record wins.
    """
    return {
        (edge["parent"], edge["child"]): edge["frequency"]
        for edge in dfgResult
    }

def transformToStartEndActivity(activities):
    """Turn activity records into a ``{name: frequency}`` mapping.

    Each record must support lookup by the keys ``'name'`` and
    ``"frequency"``. When a name occurs more than once, the last record wins.
    """
    return {entry['name']: entry["frequency"] for entry in activities}

In [5]:
# Create a Dask distributed client with 4 workers, one thread each.
client = Client(n_workers=4, threads_per_worker=1)
# Bare expression: renders the client/cluster summary as the cell output.
client

2023-03-15 17:25:33,816 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-fp31bmas', purging
2023-03-15 17:25:33,816 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-31urcrsv', purging
2023-03-15 17:25:33,817 - distributed.diskutils - INFO - Found stale lock file and directory '/tmp/dask-worker-space/worker-1hx28it_', purging


0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 4,Total memory: 11.68 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33061,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 4
Started: Just now,Total memory: 11.68 GiB

0,1
Comm: tcp://127.0.0.1:42749,Total threads: 1
Dashboard: http://127.0.0.1:44403/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:43919,
Local directory: /tmp/dask-worker-space/worker-n34dwiwl,Local directory: /tmp/dask-worker-space/worker-n34dwiwl

0,1
Comm: tcp://127.0.0.1:43657,Total threads: 1
Dashboard: http://127.0.0.1:34375/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:38709,
Local directory: /tmp/dask-worker-space/worker-zttg_i7s,Local directory: /tmp/dask-worker-space/worker-zttg_i7s

0,1
Comm: tcp://127.0.0.1:45835,Total threads: 1
Dashboard: http://127.0.0.1:38337/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:35651,
Local directory: /tmp/dask-worker-space/worker-3d6ml2sn,Local directory: /tmp/dask-worker-space/worker-3d6ml2sn

0,1
Comm: tcp://127.0.0.1:38411,Total threads: 1
Dashboard: http://127.0.0.1:32805/status,Memory: 2.92 GiB
Nanny: tcp://127.0.0.1:46075,
Local directory: /tmp/dask-worker-space/worker-27vaya7k,Local directory: /tmp/dask-worker-space/worker-27vaya7k


In [6]:
# Explicit dtypes for the listed columns; handed to dd.read_csv below.
columnTypes = {
    'case:IDofConceptCase': 'string',
    'case:Includes_subCases': 'string',
    'case:Responsible_actor': 'string',
    'case:caseProcedure': 'string',
    'case:concept:name': 'int64',
    'dueDate': 'string',
    'case:termName': 'string',
    'dateStop': 'string',
    'case:endDate': 'object',
    'case:endDatePlanned': 'object',
    'case:parts': 'object'
}

# Disabled alternative dtype mapping, kept for reference:
# columnTypes = {
#     'OfferID': 'string'
# }

# The event log is read lazily from "<fileName>.csv" in the working directory.
fileName = 'BPIC15_1'
csvPath = '{fileName}.csv'.format(fileName=fileName)
df = dd.read_csv(csvPath, dtype=columnTypes)

# Convert every column whose name matches the date/time pattern to UTC datetimes.
for column in df.columns:
    if not re.search("[Dd]ate.*|time.*", column):
        continue
    df[column] = dd.to_datetime(df[column], utc=True)

# Disabled preprocessing steps, kept for reference:
# df['case:concept:name'] = df['case:concept:name'].replace(to_replace="Application_", value='', regex=True)
# df['case:concept:name'] = df['case:concept:name'].astype({'case:concept:name': 'int64'})
# df = df.repartition(npartitions=1)

In [7]:
def getDFGQueries(dfg):
    """Build the Cypher MERGE statements that store a DFG.

    Parameters
    ----------
    dfg : object
        Must expose ``graph`` (mapping ``(parent, child) -> frequency``),
        ``start_activities`` and ``end_activities`` (mappings
        ``activity -> frequency``).

    Returns
    -------
    list of str
        One query per edge, then one per start activity, then one per end
        activity. Every query adds its frequency to an existing value, so
        queries produced from several partial DFGs accumulate.

    Notes
    -----
    Activity names are escaped before being placed inside the single-quoted
    Cypher literal, so a name containing ``'`` or a backslash can neither
    break the statement nor inject Cypher.
    """
    def _quote(value):
        # Backslash first, otherwise the backslash added for the quote
        # would itself be escaped again.
        return str(value).replace("\\", "\\\\").replace("'", "\\'")

    edgeTemplate = """
        MERGE (p:Activity {{name: '{parent}'}})
        MERGE (c:Activity {{name: '{child}'}})
        MERGE (p)-[r:PRODUCES]->(c)
        ON CREATE SET r.frequency={frequency}
        ON MATCH SET r.frequency=r.frequency+{frequency}
    """
    startTemplate = """
        MERGE (p:StartActivity {{name: '{activity}'}})
        ON CREATE SET p.frequency={frequency}
        ON MATCH SET p.frequency=p.frequency+{frequency}
    """
    endTemplate = """
        MERGE (p:EndActivity {{name: '{activity}'}})
        ON CREATE SET p.frequency={frequency}
        ON MATCH SET p.frequency=p.frequency+{frequency}
    """

    listOfQueries = [
        edgeTemplate.format(parent=_quote(parent), child=_quote(child), frequency=frequency)
        for (parent, child), frequency in dfg.graph.items()
    ]
    listOfQueries.extend(
        startTemplate.format(activity=_quote(activity), frequency=frequency)
        for activity, frequency in dfg.start_activities.items()
    )
    listOfQueries.extend(
        endTemplate.format(activity=_quote(activity), frequency=frequency)
        for activity, frequency in dfg.end_activities.items()
    )

    return listOfQueries

In [8]:
def saveDFG(dfg):
    """Discover the typed DFG of the given data and merge it into Neo4j.

    Parameters
    ----------
    dfg : object
        The input handed to ``dfg_discovery``. Despite the name, this is the
        raw data on entry (the notebook applies this function per partition);
        the name is rebound to the discovered DFG inside the function.

    Returns
    -------
    dict
        ``{"dfg": <discovered DFG>}``.
    """
    dfg = dfg_discovery(dfg)
    dfgQuery = getDFGQueries(dfg)
    # NOTE(review): connection settings and credentials are hard-coded here;
    # consider reading them from configuration.
    neo4jConnection = graph_driver(uri_scheme="neo4j", host="neo4j", password="123456")
    try:
        neo4jConnection.run_bulk_query(dfgQuery)
    finally:
        # Close the driver deterministically instead of relying on __del__,
        # even when one of the queries fails.
        neo4jConnection._close_driver()
    return {"dfg": dfg}

In [9]:
# Index the frame by 'case:concept:name' while keeping it as a regular column (drop=False).
# sorted=True declares that the data is already ordered by that column.
indexed_df = df.set_index('case:concept:name', drop=False, sorted=True)

In [10]:
# Rename the index to 'caseId', then split the frame into 4 partitions.
indexed_df.index = indexed_df.index.rename('caseId')
indexed_df = indexed_df.repartition(npartitions=4)

In [11]:
# Lazily apply saveDFG to each partition and turn the result into delayed objects.
# NOTE(review): saveDFG returns a dict, so meta=indexed_df does not describe its real output — confirm this is intentional.
lazyDFG = indexed_df.map_partitions(saveDFG, meta=indexed_df).to_delayed()

In [12]:
# Run the delayed saveDFG tasks; dfgResult is the {"result", "execution_time"} dict built by useExecutionTime.
# NOTE(review): meta=[] is forwarded as-is to dask.compute by getComputeTime — confirm it is needed.
dfgResult = getComputeTime(*lazyDFG, meta=[]) # save the dfg to neo4j

100%|██████████| 1235/1235 [00:08<00:00, 141.60it/s]
100%|██████████| 2225/2225 [00:14<00:00, 156.68it/s]
100%|██████████| 1602/1602 [00:07<00:00, 207.97it/s]
100%|██████████| 1874/1874 [00:08<00:00, 212.63it/s]


In [13]:
# Close the Dask client now that the computation has finished.
client.close()