In [1]:
import copy
import os
import random
import time

import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from neo4j import Query

## Create Dataset

In [2]:
# Allowed [new-child relationship type, existing-child relationship type] pairs.
# Index 0 is used when a node links to a brand-new child, index 1 when it links
# to an already existing node (see add_relationship).
child_patterns = [['Star', 'E'], ['Star', 'R'], ['A', 'E'], ['A', 'R'], ['M', 'O'], ['O', 'E'], ['O', 'R']]

# Pick root's pattern; its first entry is the relationship type of root -> Node1.
root_pattern = random.choice(child_patterns)
root_1 = root_pattern[0]

# Edge list that will become the CSV; it starts with the single edge root -> Node1.
output_initial = dict(parent=['root'], child=['Node1'], relationshiptype=[root_1])

# Per-node bookkeeping for nodes that may act as a parent.
# 'children' and 'ancestors' are lists of lists whose inner lists are shared
# between entries on purpose (no copies), so appending to one inner list is
# visible through every entry that holds it.
nodes_start_initial = {}
nodes_start_initial['root'] = dict(
    children=[['Node1']],
    ancestors=[[]],
    children_pattern=root_pattern,
)
nodes_start_initial['Node1'] = dict(
    children=[[]],
    ancestors=[*nodes_start_initial['root']['ancestors'], ['root']],
    children_pattern=random.choice(child_patterns),
)

# Nodes that may still receive another parent, with the parents they already have.
nodes_end_initial = {'Node1': {'parent': {root_1: 'root'}}}

# root additionally tracks Node1's (currently empty) child list.
nodes_start_initial['root']['children'] = [
    *nodes_start_initial['root']['children'],
    *nodes_start_initial['Node1']['children'],
]

# Design check (kept disabled): appending to nodes_start_initial['root']['ancestors'][0]
# should also show up in Node1's ancestors because the inner list is shared.
print(output_initial, nodes_start_initial, nodes_end_initial, sep='\n')

{'parent': ['root'], 'child': ['Node1'], 'relationshiptype': ['Star']}
{'root': {'children': [['Node1'], []], 'ancestors': [[]], 'children_pattern': ['Star', 'R']}, 'Node1': {'children': [[]], 'ancestors': [[], ['root']], 'children_pattern': ['Star', 'R']}}
{'Node1': {'parent': {'Star': 'root'}}}


In [3]:
# Define function to delete node in nodes_end
def delete_node_end(end_node: str, nodes_end: dict) -> None:
    """Remove ``end_node`` from ``nodes_end`` once it may not receive another parent.

    A node is removed when it already has two parent entries, or when one of
    its parent relationship types is ``'M'``. Otherwise ``nodes_end`` is left
    untouched.

    Args:
        end_node: Key of the node to check; must be present in ``nodes_end``.
        nodes_end: Mapping ``node -> {'parent': {relationship_type: parent}}``.
            Mutated in place.

    Raises:
        KeyError: If ``end_node`` is not a key of ``nodes_end``.
    """
    parents = nodes_end[end_node]['parent']
    # Evaluate both rules on the same lookup and delete at most once. Checking
    # them in two separate `if` statements re-indexed nodes_end after the first
    # deletion and raised KeyError for every node with two parents.
    if len(parents) == 2 or 'M' in parents:
        del nodes_end[end_node]

# Node1 has exactly one parent (root) at this point, so this only removes it
# from the pool of selectable end nodes when its relationship to root is 'M'.
delete_node_end('Node1', nodes_end_initial)

# Define function to get a node's nonancestors
def get_nonancestor(start_node: str, nodes_start: dict, nodes_end: dict):
    """Split candidate end nodes into ancestors and non-ancestors of ``start_node``.

    Args:
        start_node: Key into ``nodes_start`` for the prospective parent node.
        nodes_start: Mapping ``node -> {'children': [[...], ...], 'ancestors': [[...], ...], ...}``.
        nodes_end: Mapping whose keys are the nodes that may still receive a parent.

    Returns:
        ``[ancestors, nonancestors]`` where ``ancestors`` is the de-duplicated,
        first-seen-ordered flattening of ``nodes_start[start_node]['ancestors']``
        and ``nonancestors`` lists, in ``nodes_end`` order, every key of
        ``nodes_end`` that is not an ancestor, not ``start_node`` itself and not
        in the first child list of ``start_node``.
    """
    entry = nodes_start[start_node]

    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # does not depend on string hash ordering the way list(set(...)) does.
    ancestors = list(dict.fromkeys(name for group in entry['ancestors'] for name in group))

    # Build the exclusion set once instead of concatenating three lists for
    # every candidate in nodes_end.
    excluded = set(ancestors)
    excluded.add(start_node)
    excluded.update(entry['children'][0])

    nonancestors = [name for name in nodes_end if name not in excluded]
    return [ancestors, nonancestors]

# Define function to get a node's nonchildren
def get_nonchildren(start_node: str, nodes_start: dict, nodes_end: dict):
    """Split candidate end nodes into tracked children and non-children of ``start_node``.

    Args:
        start_node: Key into ``nodes_start`` for the node being inspected.
        nodes_start: Mapping ``node -> {'children': [[...], ...], ...}``.
        nodes_end: Mapping whose keys are the nodes that may still receive a parent.

    Returns:
        ``[children, nonchildren]`` where ``children`` is the de-duplicated,
        first-seen-ordered flattening of every child list of ``start_node`` and
        ``nonchildren`` lists, in ``nodes_end`` order, every key of ``nodes_end``
        that is neither one of those children nor ``start_node`` itself.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # does not depend on string hash ordering the way list(set(...)) does.
    children = list(
        dict.fromkeys(name for group in nodes_start[start_node]['children'] for name in group)
    )

    # Build the exclusion set once instead of concatenating lists per candidate.
    excluded = set(children)
    excluded.add(start_node)

    nonchildren = [name for name in nodes_end if name not in excluded]
    return [children, nonchildren]

# Define function to add relationship to the graph
def add_relationship(nodes_start: dict, nodes_end: dict, output: dict, node_count, child_patterns = child_patterns):
    """Add one randomly chosen parent -> child relationship to the graph.

    A start (parent) node is drawn uniformly from ``nodes_start``. With
    probability 0.5 the child is an existing node drawn from the start node's
    non-ancestors in ``nodes_end`` ('old'); otherwise, or when no such node
    exists, a new node named ``'Node<node_count>'`` is created ('new').

    The relationship type comes from the start node's ``'children_pattern'``:
    index 0 for a new child, index 1 for an existing child.

    Args:
        nodes_start: ``node -> {'children', 'ancestors', 'children_pattern'}``
            bookkeeping for every node. Mutated in place.
        nodes_end: ``node -> {'parent': {relationship_type: parent}}`` for
            nodes that may still receive a parent. Mutated in place.
        output: Edge list with the keys ``'parent'``, ``'child'`` and
            ``'relationshiptype'``; exactly one row is appended per call.
        node_count: Numeric suffix to use for the next new node.
        child_patterns: Patterns a newly created node may be assigned.

    Returns:
        ``[nodes_start, nodes_end, output, node_count]`` with ``node_count``
        incremented when a new node was created.
    """
    start_node = random.choice(list(nodes_start.keys()))  # parent of the new edge
    end_node_type = random.choice(['new', 'old'])  # 50/50 between new and existing child

    # Existing nodes that start_node may legally adopt; always bound, so the
    # fallback below does not depend on short-circuit evaluation.
    if len(nodes_end) != 0:
        nonancestor = get_nonancestor(start_node, nodes_start, nodes_end)[1]
    else:
        nonancestor = []

    if len(nonancestor) == 0:  # nothing to link to -> must create a new node
        end_node_type = 'new'

    if end_node_type == 'old':
        end_node = random.choice(nonancestor)
        node_pattern = nodes_start[start_node]['children_pattern'][1]

        # An existing node is retired from the candidate pool as soon as it
        # receives this additional parent, so its entry is dropped directly.
        del nodes_end[end_node]
        # NOTE(review): only end_node's own ancestor list is rebuilt when it is
        # linked; nodes created below end_node earlier do not pick up
        # start_node's ancestors. A later 'old' link from such a descendant back
        # to one of those ancestors is therefore not filtered out - confirm
        # whether cycles are acceptable for this dataset.
    else:
        # New nodes always use the first pattern entry, so a graph branch never
        # begins with the second-entry ('E'/'R'/'O') relationship type.
        node_pattern = nodes_start[start_node]['children_pattern'][0]
        end_node = 'Node' + str(node_count)
        node_count += 1

        # Register the new node as a possible future child, unless its first
        # relationship already rules that out (see delete_node_end).
        nodes_end[end_node] = {'parent': {node_pattern: start_node}}
        delete_node_end(end_node, nodes_end)

        # Register the new node as a possible future parent.
        nodes_start[end_node] = {
            'children': [[]],
            'ancestors': [[]],
            'children_pattern': random.choice(child_patterns),
        }

    _link_parent_child(nodes_start, output, start_node, end_node, node_pattern)
    return [nodes_start, nodes_end, output, node_count]


def _link_parent_child(nodes_start: dict, output: dict, start_node, end_node, relationship_type) -> None:
    """Record the edge start_node -> end_node in ``output`` and ``nodes_start``."""
    output['parent'].append(start_node)
    output['child'].append(end_node)
    output['relationshiptype'].append(relationship_type)

    parent_entry = nodes_start[start_node]
    child_entry = nodes_start[end_node]

    # The inner lists are shared between entries on purpose: appending to an
    # inner list is visible through every entry that already references it,
    # while `a + b` builds a new outer list that keeps those shared inner lists.
    parent_entry['children'][0].append(end_node)
    parent_entry['children'] = parent_entry['children'] + child_entry['children']
    child_entry['ancestors'][-1].append(start_node)
    child_entry['ancestors'] = parent_entry['ancestors'] + child_entry['ancestors']
        
        


In [289]:
# Single-step check: start from fresh deep copies of the initial state, add one
# relationship and show the resulting bookkeeping structures.
output = copy.deepcopy(output_initial)
nodes_start = copy.deepcopy(nodes_start_initial)
nodes_end = copy.deepcopy(nodes_end_initial)
node_count = len(nodes_start)
nodes_start, nodes_end, output, node_count = add_relationship(
    nodes_start, nodes_end, output, node_count, child_patterns
)
print(nodes_start, nodes_end, output, sep='\n')

{'root': {'children': [['Node1'], ['Node2']], 'ancestors': [[]], 'children_pattern': ['M', 'O']}, 'Node1': {'children': [['Node2'], []], 'ancestors': [[], ['root']], 'children_pattern': ['M', 'O']}, 'Node2': {'children': [[]], 'ancestors': [[], ['root'], ['Node1']], 'children_pattern': ['Star', 'E']}}
{}
{'parent': ['root', 'Node1'], 'child': ['Node1', 'Node2'], 'relationshiptype': ['M', 'M']}


In [292]:
# Add one more relationship to the current graph state and show it.
# Every execution of this cell appends another edge to the same graph.
nodes_start, nodes_end, output, node_count = add_relationship(
    nodes_start=nodes_start,
    nodes_end=nodes_end,
    output=output,
    node_count=node_count,
    child_patterns=child_patterns,
)
print(nodes_start, nodes_end, output, sep='\n')

{'root': {'children': [['Node1', 'Node4'], ['Node2', 'Node3'], []], 'ancestors': [[]], 'children_pattern': ['M', 'O']}, 'Node1': {'children': [['Node2', 'Node3'], [], ['Node5']], 'ancestors': [[], ['root']], 'children_pattern': ['M', 'O']}, 'Node2': {'children': [[]], 'ancestors': [[], ['root'], ['Node1']], 'children_pattern': ['Star', 'E']}, 'Node3': {'children': [['Node5'], []], 'ancestors': [[], ['root'], ['Node1']], 'children_pattern': ['Star', 'E']}, 'Node4': {'children': [[]], 'ancestors': [[], ['root']], 'children_pattern': ['O', 'E']}, 'Node5': {'children': [[]], 'ancestors': [[], ['root'], ['Node1'], ['Node3']], 'children_pattern': ['O', 'R']}}
{'Node5': {'parent': {'Star': 'Node3'}}}
{'parent': ['root', 'Node1', 'Node1', 'root', 'Node3'], 'child': ['Node1', 'Node2', 'Node3', 'Node4', 'Node5'], 'relationshiptype': ['M', 'M', 'M', 'M', 'Star']}


In [4]:
# Number of add_relationship calls per dataset size.
# The initial state already holds one edge (root -> Node1) and every call
# appends exactly one more, hence the "- 1".
num_rows1 = 500 - 1
num_rows2 = 1000 - 1
num_rows3 = 10000 - 1
num_rows4 = 20000 - 1

# Start from fresh deep copies so the *_initial structures stay untouched and
# each dataset can be generated independently.
output,nodes_start,nodes_end = copy.deepcopy(output_initial),copy.deepcopy(nodes_start_initial),copy.deepcopy(nodes_end_initial)
# root and Node1 already exist, so the next new node is named 'Node2'.
node_count = len(nodes_start)
# print(output,'\n',nodes_start,'\n',nodes_end,'\n',node_count) #code for check initial
# run and get output with size 500
for _ in range(num_rows1):
    nodes_start, nodes_end, output, node_count = add_relationship(nodes_start, nodes_end, output, node_count, child_patterns)
output_500 = output

# # run with size 1k
# output,nodes_start,nodes_end = copy.deepcopy(output_initial),copy.deepcopy(nodes_start_initial),copy.deepcopy(nodes_end_initial)
# node_count = len(nodes_start)
# # print(output,'\n',nodes_start,'\n',nodes_end,'\n',node_count) # code for check initial
# for _ in range(num_rows2):
#     nodes_start, nodes_end, output, node_count = add_relationship(nodes_start, nodes_end, output, node_count, child_patterns)
# output_1k = output

# # run with size 10k
# output,nodes_start,nodes_end = copy.deepcopy(output_initial),copy.deepcopy(nodes_start_initial),copy.deepcopy(nodes_end_initial)
# node_count = len(nodes_start)
# for _ in range(num_rows3):
#     nodes_start, nodes_end, output, node_count = add_relationship(nodes_start, nodes_end, output, node_count, child_patterns)
# output_10k = output

# # run with size 20k
# output,nodes_start,nodes_end = copy.deepcopy(output_initial),copy.deepcopy(nodes_start_initial),copy.deepcopy(nodes_end_initial)
# node_count = len(nodes_start)
# for _ in range(num_rows4):
#     nodes_start, nodes_end, output, node_count = add_relationship(nodes_start, nodes_end, output, node_count, child_patterns)
# output_20k = output

In [309]:
# Inspect the generated 500-row edge list: the parent of the second-to-last
# edge, the full edge list, and the nodes that can still receive a parent.
print(output_500['parent'][-2], output_500, nodes_end, sep='\n')

Node257
{'parent': ['root', 'root', 'root', 'Node2', 'root', 'Node3', 'root', 'Node1', 'Node3', 'Node3', 'Node3', 'Node5', 'Node5', 'Node7', 'Node5', 'Node9', 'Node9', 'Node2', 'Node11', 'Node5', 'root', 'Node3', 'Node7', 'Node7', 'Node13', 'Node1', 'Node1', 'Node11', 'Node10', 'Node5', 'Node2', 'Node20', 'Node20', 'Node22', 'Node19', 'Node8', 'Node8', 'Node11', 'Node4', 'root', 'Node5', 'Node14', 'Node6', 'Node14', 'Node10', 'Node17', 'Node20', 'Node10', 'Node23', 'root', 'Node23', 'Node26', 'Node10', 'Node19', 'Node10', 'root', 'Node6', 'Node18', 'Node5', 'Node2', 'Node33', 'Node32', 'Node13', 'Node14', 'Node36', 'Node14', 'Node14', 'Node25', 'Node37', 'Node3', 'Node27', 'Node13', 'Node6', 'Node12', 'Node37', 'Node1', 'Node14', 'Node34', 'Node11', 'Node1', 'Node40', 'Node23', 'Node34', 'Node32', 'Node4', 'Node4', 'Node15', 'Node12', 'Node1', 'Node24', 'Node32', 'Node43', 'Node10', 'Node16', 'Node33', 'Node47', 'Node53', 'Node37', 'Node9', 'Node32', 'Node20', 'Node26', 'Node49', 'Node

In [7]:
# Turn the generated edge list into a table (columns: parent, child,
# relationshiptype) and write it to CSV without the index column.
data_500 = pd.DataFrame(data=output_500)
# data_1k = pd.DataFrame(output_1k)
# data_10k = pd.DataFrame(output_10k)
# data_20k = pd.DataFrame(output_20k)
data_500.to_csv(path_or_buf="data_500.csv", index=False)
# data_1k.to_csv("data_1k.csv", index=False)
# data_10k.to_csv("data_10k.csv", index=False)
# data_20k.to_csv("data_20k.csv", index=False)

In [65]:
# # Illustration of the list aliasing (shallow-copy) behaviour that the children/ancestors bookkeeping in this file relies on.
# A = [[1,2,3]]
# B = [[4,5]]
# C = A + B
# # C.append(A[0])
# # C.append(B[0])
# print(A,B,C)
# # A[0] = 2
# A[0].append(6)
# # B.append(A)
# print(A,B,C)
# D = [[]]
# E = [[]]
# D = A + D
# E = D + E
# D[-1].append(7)
# A[0].append(8)
# print(A,B,C,D,E)

[[1, 2, 3]] [[4, 5]] [[1, 2, 3], [4, 5]]
[[1, 2, 3, 6]] [[4, 5]] [[1, 2, 3, 6], [4, 5]]
[[1, 2, 3, 6, 8]] [[4, 5]] [[1, 2, 3, 6, 8], [4, 5]] [[1, 2, 3, 6, 8], [7]] [[1, 2, 3, 6, 8], [7], []]


## Load data

In [8]:
# Establish a connection to the Neo4j database.
# Connection settings can be overridden with the NEO4J_URI / NEO4J_USER /
# NEO4J_PASSWORD environment variables; the fallbacks are the values this
# notebook used originally.
# NOTE(review): do not keep a real password as the fallback in a shared notebook.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "12345678"),
    ),
)
# The driver is left open here because cells outside this one may reuse it;
# call driver.close() once all queries are done.

# Use the connection to run a session against the target database
with driver.session(database='assign3data500') as session:

    #---------------------------------------------------------------------------------

    # Query that loads the generated CSV file: one Node per distinct child and
    # parent name, plus an isParentOf relationship carrying the relationship type.
    # A long Cypher query can be split over several lines with a triple-quoted string.
    # NOTE(review): Neo4j normally resolves file:/// URLs against its import
    # directory - confirm data_500.csv has been placed there.
    loadQuery = """
    LOAD CSV WITH HEADERS FROM 'file:///data_500.csv' AS line
    MERGE (n:Node {name: line.child})
    MERGE (m:Node {name: line.parent})
    MERGE (n)<-[r:isParentOf {rType: line.relationshiptype}]-(m)
    """
    # Record the start time before executing the query to load data
    loadStartTime = time.time()

    # Execute the query to load data
    LoadResult = session.run(loadQuery)
    # session.run() hands back a lazily consumed Result; consuming it makes the
    # measurement cover the complete load and surfaces server-side errors here.
    LoadResult.consume()

    # Record the end time after the query has fully completed
    LoadEndTime = time.time()

    # Calculate and print the time taken to execute the query for loading data
    print("\n\nThe execution time to load the CSV file to Neo4j is :", LoadEndTime - loadStartTime, "seconds\n")



The execution time to load the CSV file to Neo4j is : 2.2527241706848145 seconds

