In [24]:
from tqdm import tqdm

from neo4j import GraphDatabase
from neo4j.exceptions import ClientError

class graph_driver():
    """Small convenience wrapper around a Neo4j driver.

    The constructor builds a ``{uri_scheme}://{host}:{port}`` connection URI,
    opens a driver with basic ``(username, password)`` auth and keeps it on
    ``self.driver``. Queries are executed in short-lived sessions and their
    records are returned as plain lists.

    The instance can be used as a context manager so the driver is closed
    deterministically instead of relying on garbage collection::

        with graph_driver(host=..., password=...) as g:
            g.test_connection()
    """

    def __init__(self, uri_scheme='bolt', host='localhost', port='7687', username='neo4j', password=''):
        # Bind the attribute first: if driver creation below raises, __del__
        # still runs on the half-built object and must find `self.driver`.
        self.driver = None

        self.uri_scheme = uri_scheme
        self.host = host
        self.port = port

        self.username = username
        self.password = password

        self.connection_uri = "{uri_scheme}://{host}:{port}".format(uri_scheme=self.uri_scheme, host=self.host, port=self.port)
        self.auth = (self.username, self.password)
        self.driver = GraphDatabase.driver(self.connection_uri, auth=self.auth)

    def __enter__(self):
        """Return the wrapper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving the ``with`` block; never swallows exceptions."""
        self.close()
        return False

    def __del__(self):
        # Best-effort cleanup when the object is garbage collected.
        self._close_driver()

    def close(self):
        """Close the underlying driver. Safe to call more than once."""
        self._close_driver()

    def _close_driver(self):
        """Close the driver if one is open and forget it.

        Uses ``getattr`` so this also works on an instance whose ``__init__``
        never got as far as setting ``self.driver``.
        """
        driver = getattr(self, 'driver', None)
        if driver is not None:
            # Drop the reference before closing so a second call is a no-op.
            self.driver = None
            driver.close()

    def run_single_query(self, query, parameters=None):
        """Run one query in its own session and return its records as a list.

        Args:
            query: Cypher query text.
            parameters: Optional dict of query parameters, forwarded to
                ``session.run``. Prefer this over formatting values into the
                query string.

        Returns:
            list of the records yielded by the query result.
        """
        with self.driver.session() as session:
            # Records are materialised inside the `with` block, while the
            # session is still open.
            return self.format_raw_res(session.run(query, parameters))

    def run_bulk_query(self, query_list):
        """Run every query of ``query_list`` sequentially in a single session.

        A tqdm progress bar is shown while iterating.

        Returns:
            list of ``{'query': <query>, 'result': <list of records>}`` dicts,
            in the same order as ``query_list``.
        """
        results = []
        with self.driver.session() as session:
            for query in tqdm(query_list):
                res = self.format_raw_res(session.run(query))
                results.append({'query': query, 'result': res})
        return results

    def reset_graph(self, db=None):
        """Delete every node (and its relationships) from the graph.

        ``db`` is accepted for interface compatibility but is currently not
        used by this method.
        """
        return self.run_single_query("MATCH (n) DETACH DELETE n")

    def test_connection(self):
        """Run a node/relationship count query and return its records.

        NOTE(review): ``size((n)-[]->())`` relies on a pattern expression
        inside ``size()``; confirm this is still accepted by the Neo4j server
        version in use.
        """
        return self.run_single_query("MATCH (n) RETURN COUNT(n) as nodes, sum(size((n)-[]->())) as relations")

    @staticmethod
    def format_raw_res(raw_res):
        """Materialise an iterable query result into a list of its records."""
        return list(raw_res)

        
# def main():
#     driver = graph_driver(host="44.200.240.194", password="liter-choke-sizing")
#     res = driver.test_connection()
#     print(res)
    
# if __name__ == "__main__":
#     main()

In [28]:
import dask.dataframe as dd
from dask.dataframe.utils import make_meta
from neo4j import GraphDatabase
from dask.distributed import Client, LocalCluster, get_worker
import dask
import dill
import os
import time

In [2]:
def addActivity(tx, activityName):
    """MERGE an ``Activity`` node named ``activityName`` through ``tx``.

    The name is passed as the ``$activityName`` query parameter rather than
    being formatted into the query text. Returns whatever ``tx.run`` returns.
    """
    merge_statement = "MERGE (p:Activity {name: $activityName})"
    outcome = tx.run(merge_statement, activityName=activityName)
    return outcome

def setLinks(row):
    """Fill ``predecessor``/``successor`` from neighbouring ``activityNameEN`` values.

    Despite its name, ``row`` is the frame of one group: this function is
    applied per group by a groupby-apply elsewhere in the notebook. The frame
    is modified in place and also returned. The first row's predecessor and
    the last row's successor are missing values.
    """
    activity = row['activityNameEN']
    row['predecessor'] = activity.shift(1)   # value from the previous row
    row['successor'] = activity.shift(-1)    # value from the next row
    return row

In [3]:
# Columns that are read with the 'string' dtype instead of an inferred one.
columnTypes = {
    column: 'string'
    for column in (
        'case:IDofConceptCase',
        'case:Includes_subCases',
        'case:Responsible_actor',
        'case:caseProcedure',
        'dateStop',
    )
}
# Open the CSV as a Dask DataFrame using the dtype overrides above.
df = dd.read_csv('BPIC15_1.csv', dtype=columnTypes)

In [4]:
# Start a Dask distributed client with default settings; the bare expression
# on the last line renders the client/cluster summary as the cell output.
client = Client()
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 5
Total threads: 5,Total memory: 7.67 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:46475,Workers: 5
Dashboard: http://127.0.0.1:8787/status,Total threads: 5
Started: Just now,Total memory: 7.67 GiB

0,1
Comm: tcp://127.0.0.1:39033,Total threads: 1
Dashboard: http://127.0.0.1:38897/status,Memory: 1.53 GiB
Nanny: tcp://127.0.0.1:46781,
Local directory: /tmp/dask-worker-space/worker-klzaw3ki,Local directory: /tmp/dask-worker-space/worker-klzaw3ki

0,1
Comm: tcp://127.0.0.1:38423,Total threads: 1
Dashboard: http://127.0.0.1:36893/status,Memory: 1.53 GiB
Nanny: tcp://127.0.0.1:41017,
Local directory: /tmp/dask-worker-space/worker-88ody7l2,Local directory: /tmp/dask-worker-space/worker-88ody7l2

0,1
Comm: tcp://127.0.0.1:36907,Total threads: 1
Dashboard: http://127.0.0.1:35001/status,Memory: 1.53 GiB
Nanny: tcp://127.0.0.1:39591,
Local directory: /tmp/dask-worker-space/worker-mfzj50j5,Local directory: /tmp/dask-worker-space/worker-mfzj50j5

0,1
Comm: tcp://127.0.0.1:41317,Total threads: 1
Dashboard: http://127.0.0.1:36643/status,Memory: 1.53 GiB
Nanny: tcp://127.0.0.1:34465,
Local directory: /tmp/dask-worker-space/worker-jz2p9eug,Local directory: /tmp/dask-worker-space/worker-jz2p9eug

0,1
Comm: tcp://127.0.0.1:45397,Total threads: 1
Dashboard: http://127.0.0.1:36459/status,Memory: 1.53 GiB
Nanny: tcp://127.0.0.1:32867,
Local directory: /tmp/dask-worker-space/worker-2065sltk,Local directory: /tmp/dask-worker-space/worker-2065sltk


In [5]:
# NOTE(review): getNeo4jCredentials is not defined or imported in any visible
# cell, so this presumably relies on leftover kernel state - confirm and
# restore the definition. `creds` is only referenced from commented-out cells.
creds = getNeo4jCredentials()

In [6]:
# Add empty placeholder columns first; presumably so that `df`, passed as
# `meta` below, already has the columns that setLinks fills in - TODO confirm.
df['successor'] = ''
df['predecessor'] = ''
# Sort by timestamp, then apply setLinks to each 'case:concept:name' group.
# NOTE(review): this assumes rows stay in timestamp order inside each group
# after the groupby - verify, or sort inside setLinks.
df = df.sort_values(by='time:timestamp').groupby('case:concept:name').apply(setLinks, meta=df)

In [7]:
# Distinct values of the 'activityNameEN' column (built from the Dask frame
# `df`; no .compute() is called in this cell).
result = df['activityNameEN'].unique()

In [8]:
# Scatter `result` through the client with broadcast=True.
# NOTE(review): `f_result` is not referenced by any other visible cell, and
# `result` has not been computed at this point - confirm this cell is needed.
f_result = client.scatter(result, broadcast=True)

In [20]:
def getQueries(activities):
    """Build one ``MERGE (p:Activity {name: '...'})`` query per activity name.

    Args:
        activities: Object exposing ``.items()`` that yields ``(key, name)``
            pairs, e.g. a pandas Series (or a dict). Only the values are used.

    Returns:
        list of Cypher query strings, in iteration order.

    Backslashes and single quotes in a name are escaped so that the name
    cannot terminate the quoted Cypher string literal early.
    """
    queryTemplate = "MERGE (p:Activity {{name: '{activityName}'}})"
    listOfQueries = []
    # `.items()` instead of `.iteritems()`: the latter was removed in pandas 2.0.
    for _, record in activities.items():
        # Escape the backslash first so the escapes added for quotes survive.
        safeName = str(record).replace("\\", "\\\\").replace("'", "\\'")
        listOfQueries.append(queryTemplate.format(activityName=safeName))
    return listOfQueries

In [29]:
def saveActivities(activities):
    """Write one ``Activity`` node per entry of ``activities`` to Neo4j.

    Builds the MERGE queries with ``getQueries``, runs them through a
    ``graph_driver`` connection and prints the elapsed wall-clock time.

    Args:
        activities: Collection of activity names accepted by ``getQueries``.

    Returns:
        None.
    """
    # `time` is the module here (imported with `import time`), so the clock
    # has to be read with time.time(); calling time() raises TypeError.
    read_queries_start_time = time.time()
    activitiesQueries = getQueries(activities)
    # NOTE(review): host and password were hard-coded. They can now be
    # overridden through NEO4J_HOST / NEO4J_PASSWORD; the literal fallbacks are
    # kept only so existing runs behave the same and should be removed (and
    # the password rotated) once the environment variables are in place.
    neo4jConnection = graph_driver(
        host=os.environ.get("NEO4J_HOST", "44.200.240.194"),
        password=os.environ.get("NEO4J_PASSWORD", "liter-choke-sizing"),
    )
    neo4jConnection.run_bulk_query(activitiesQueries)
    read_queries_time = time.time() - read_queries_start_time
    print("----Finshed saving nodes: in {time}".format(time=str(read_queries_time)))

In [26]:
# Wrap the saveActivities(result) call as a delayed task; it is only executed
# when the task is passed to dask.compute.
lazyTasks = dask.delayed(saveActivities)(result)

In [27]:
# Execute the delayed task.
# NOTE(review): this rebinds `result`, which until now held the unique
# activity names fed into the delayed call - re-running the delayed cell
# afterwards would pass this value instead. Consider a different name.
result = dask.compute(lazyTasks)

100%|██████████| 289/289 [01:16<00:00,  3.79it/s]


In [9]:
# def test(activityName):
#     driver = GraphDatabase.driver(creds.get('host'), auth=(creds.get('user'), creds.get('password')))
#     with driver.session() as session:
#         session.run("MERGE (p:Activity {name: $activityName})", activityName=activityName)
#         session.close()
#     return activityName

In [10]:
# driver = GraphDatabase.driver(creds.get('host'), auth=(creds.get('user'), creds.get('password')))
# x.apply(test, meta=('activityNameEN', 'object')).compute()
# # with driver.session() as session:
# # #     session.run("MERGE (p:Activity {name: $activityName})", activityName="Test_Activity2")
# #     x[0].apply(test, meta=('activityNameEN', 'object'), session=dill.dumps(session)).compute()
# # #     session.close()
# # driver.close()