In [53]:
import os
import re

import numpy as np
import pandas as pd
from py2neo import Node, Relationship, Graph, Subgraph
from tqdm import tqdm

# Connect to neo4j database

In [54]:
# with neo4j running
# Connection settings can be overridden through environment variables so that
# real credentials do not have to be committed with the notebook. The fallbacks
# are the original local-development values.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "igem2023")

graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# WARNING: destructive. Removes every node and relationship from the target
# database so that the import below starts from an empty graph.
graph.delete_all()

# Load data

In [55]:
def gradient_color_generate(time: str, now=None):
    """Map a date onto a hex colour taken from matplotlib's 'plasma' colormap.

    The colour encodes age: the number of days between ``time`` and ``now``,
    normalised by the number of days between 2004-01-01 and ``now``.

    Args:
        time: Date string in ``%Y-%m-%d`` format. A value that cannot be parsed
            (wrong format such as ``'nan'``, or not a string at all) is treated
            as 2004-01-01.
        now: Optional ``datetime`` used as the reference point. Defaults to
            ``datetime.now()``; pass a fixed value to get reproducible colours.

    Returns:
        The hex colour string produced by ``matplotlib.colors.rgb2hex``.
    """
    # Imported inside the function, as in the original cell, so the definition
    # does not depend on any other cell having been run.
    from datetime import datetime
    import matplotlib as mpl

    date_format = r'%Y-%m-%d'
    begin_time = datetime.strptime('2004-1-1', date_format)
    try:
        exact_time = datetime.strptime(time, date_format)
    except (ValueError, TypeError):
        # ValueError: text in the wrong format; TypeError: not a string.
        # Only these are swallowed so unrelated errors (and Ctrl-C) propagate.
        exact_time = begin_time

    if now is None:
        now = datetime.now()

    max_interval = now - begin_time
    interval = now - exact_time
    norm = mpl.colors.Normalize(vmin=0, vmax=max_interval.days)
    cmap = mpl.colormaps['plasma']
    return mpl.colors.rgb2hex(cmap(norm(interval.days)))

In [56]:
# Build one py2neo 'Part' node per CSV row, then derive 'refers to' and 'twins'
# relationships from the space-separated part-number lists stored on each node.
data = pd.read_csv(r'data/all_collections_filted.csv')
part_node_dict = {}
part_list = []
relationship_list = []

# Strings that mean "no linked parts" in the three list-like columns.
# 'nan' is accepted for every column: pandas parses empty cells as NaN (and, in
# pandas >= 2.0, the literal text "None" as well), and str(NaN) is 'nan'.
# Without it such rows would carry a bogus one-element list ['nan'], inflating
# the stored counts and prweight.
_NO_LINKS_ALWAYS = {'', 'nan'}
_NO_LINKS_USED = _NO_LINKS_ALWAYS | {'None', 'N o n e'}
_NO_LINKS_USING = _NO_LINKS_ALWAYS | {'self'}
_NO_LINKS_TWINS = _NO_LINKS_ALWAYS | {'None', 'N o n e'}


def _split_part_refs(raw, placeholders):
    """Split a space-separated part-number field; placeholder values give []."""
    return [] if raw in placeholders else raw.split(' ')


# Extract every needed column once instead of once per row and per field.
_COLUMN_NAMES = (
    'part_num', 'part_name', 'part_url', 'short_desc', 'part_type', 'team',
    'sequence', 'contents', 'released', 'sample', 'twins', 'assemble_std',
    'parts_used', 'using_parts', 'len', 'date', 'isfavorite', 'year', 'designer',
)
_columns = {name: data[name].values for name in _COLUMN_NAMES}

# Iterate by position: the values arrays are positional, so index labels must
# not be used to address them.
for i in tqdm(range(len(data))):
    # Every field is stored as text, exactly as before.
    row = {name: str(values[i]) for name, values in _columns.items()}
    part_num = row['part_num']
    part_contents = row['contents'].replace(' Sequence and Features', '')

    part_used_list = _split_part_refs(row['parts_used'], _NO_LINKS_USED)
    part_using_list = _split_part_refs(row['using_parts'], _NO_LINKS_USING)
    part_twins_list = _split_part_refs(row['twins'], _NO_LINKS_TWINS)

    part_node = Node(
        'Part',
        number=part_num,
        name=row['part_name'],
        url=row['part_url'],
        description=row['short_desc'],
        type=row['part_type'],
        team=row['team'],
        sequence=row['sequence'],
        contents=part_contents,
        released=row['released'],
        sample=row['sample'],
        assemble=row['assemble_std'],
        length=row['len'],
        date=row['date'],
        isfavorite=row['isfavorite'],
        twins=part_twins_list,
        twins_num=str(len(part_twins_list)),
        cited_by=part_used_list,
        year=row['year'],
        cites=str(len(part_used_list)),
        ref=part_using_list,
        citing=str(len(part_using_list)),
        designer=row['designer'],
        # Relationship weight source: at least 1, otherwise a weighted sum of
        # the three link counts.
        prweight=max(1, len(part_used_list) * 0.5 + len(part_using_list) + 0.75 * len(part_twins_list)),
        color=gradient_color_generate(row['date']),
    )
    part_list.append(part_node)
    # NOTE(review): a repeated part number overwrites the earlier dictionary
    # entry, while every row still contributes a node to part_list.
    part_node_dict[part_num] = part_node

twins_set_list = []
twins_node_list = []
# Set of frozensets for O(1) "pair already linked?" checks; twins_set_list is
# still filled so its contents stay the same as before.
_seen_twin_pairs = set()

for pNode in tqdm(part_node_dict.values()):
    # pNode -> each part it refers to.
    for ref_part in pNode['ref'] or []:
        target_node = part_node_dict.get(ref_part)
        if target_node is None:
            continue  # referenced part number is not in the dataset
        link = Relationship(pNode, 'refers to', target_node)
        link["weight"] = pNode['prweight']
        relationship_list.append(link)

    # One 'twins' relationship per unordered pair of distinct part numbers.
    for twin_part in pNode['twins'] or []:
        twin_node = part_node_dict.get(twin_part)
        if twin_node is None or twin_node['number'] == pNode['number']:
            continue
        pair = frozenset((twin_node['number'], pNode['number']))
        if pair in _seen_twin_pairs:
            continue
        _seen_twin_pairs.add(pair)
        twins_set_list.append(set(pair))
        relationship_list.append(Relationship(pNode, 'twins', twin_node))

    # Each part listed in cited_by -> pNode.
    for cite_part in pNode['cited_by'] or []:
        citing_node = part_node_dict.get(cite_part)
        if citing_node is None:
            continue  # citing part number is not in the dataset
        link = Relationship(citing_node, 'refers to', pNode)
        link["weight"] = citing_node['prweight']
        relationship_list.append(link)

# The same 'refers to' edge can be produced from both ends (ref and cited_by);
# collapse duplicates.
relationship_list = list(set(relationship_list))

100%|██████████| 54319/54319 [00:18<00:00, 3015.63it/s]
100%|██████████| 54070/54070 [00:04<00:00, 12385.37it/s]


# Upload to neo4j

In [57]:
# Bundle all nodes and relationships into a single subgraph and write it to
# the database inside one explicit transaction.
subgraph = Subgraph(part_list, relationship_list)
upload_tx = graph.begin()
upload_tx.create(subgraph)
graph.commit(upload_tx)

# Calculate PageRank

In [59]:
# create graph
# gds.graph.project fails when a graph named 'parthub' already exists in the
# catalog, which made this cell fail on a second run. Drop any leftover
# projection first; `false` (failIfMissing) turns the drop into a no-op when
# there is nothing to remove. .data() forces the call to complete.
graph.run("CALL gds.graph.drop('parthub', false) YIELD graphName").data()

# Project all Part nodes and 'refers to' relationships, keeping the
# relationship property 'weight' for the weighted algorithms below.
query = """
CALL gds.graph.project(
  'parthub',
  'Part',
  'refers to',
  {
    relationshipProperties: 'weight'
  }
)
"""
graph.run(query)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Part: {label: 'Part', properties: {}}}","{`refers to`: {orientation: 'NATURAL', indexInverse: false, aggregation: 'DEFAULT', type: 'refers to', properties: {weight: {defaultValue: null, property: 'weight', aggregation: 'DEFAULT'}}}}",parthub,54319,51463,294


In [60]:
# calculate PageRanks
# Runs gds.pageRank.write on the 'parthub' graph, using the 'weight'
# relationship property as the relationship weight, and stores the result on
# each node under the `pagerank` property. The query yields the number of node
# properties written and the number of iterations actually run.
query = '''
CALL gds.pageRank.write('parthub', {
  maxIterations: 20,
  dampingFactor: 0.85,
  writeProperty: 'pagerank',
  relationshipWeightProperty: 'weight'
})
YIELD nodePropertiesWritten, ranIterations
'''
graph.run(query)

nodePropertiesWritten,ranIterations
54319,13


In [61]:
# get max pagerank and min pagerank
# Returns the largest and smallest `pagerank` value over all Part nodes as
# max_val / min_val, for inspection in the cell output.
query = '''
MATCH (n:Part)
RETURN max(n.pagerank) AS max_val, min(n.pagerank) AS min_val
'''
graph.run(query)

max_val,min_val
46.2585416018457,0.15


In [62]:
# generate nodesize
# Linearly rescale `pagerank` onto the range [30, 120] and store it as
# `nodesize`. The min/max bounds are computed inside the query rather than
# pasted in as literals from an earlier run, so the scaling stays correct when
# the data (and therefore the PageRank range) changes. If every node has the
# same pagerank the range is zero; those nodes get the minimum size instead of
# a division by zero.
query = '''
MATCH (p:Part)
WITH min(p.pagerank) AS min_val, max(p.pagerank) AS max_val
MATCH (n:Part)
SET n.nodesize = CASE
  WHEN max_val = min_val THEN 30.0
  ELSE (n.pagerank - min_val) / (max_val - min_val) * 90 + 30
END
'''
graph.run(query)

# Louvain method

In [63]:
# run Louvain method
# Runs gds.louvain.write on the 'parthub' graph, using the 'weight'
# relationship property, and stores the result on each node under the
# `community` property. No YIELD clause is given, so the cell output shows
# every column the procedure returns.
query = '''
CALL gds.louvain.write('parthub', {
  writeProperty: 'community',
  relationshipWeightProperty: 'weight'
})
'''
graph.run(query)

writeMillis,nodePropertiesWritten,modularity,modularities,ranLevels,communityCount,communityDistribution,postProcessingMillis,preProcessingMillis,computeMillis,configuration
122,54319,0.8159140486219864,"[0.2929013510321137, 0.5554356463201578, 0.6585916830357512, 0.7179196711533793, 0.7513866300153957, 0.7744117586877015, 0.7898677387447763, 0.801128122197099, 0.8093501507185161, 0.8159140486219864]",10,23825,"{p99: 11, min: 1, max: 5666, mean: 2.279916054564533, p90: 1, p50: 1, p999: 179, p95: 2, p75: 1}",18,0,4734,"{maxIterations: 10, writeConcurrency: 4, seedProperty: null, consecutiveIds: false, maxLevels: 10, relationshipWeightProperty: 'weight', concurrency: 4, jobId: '697ff15c-a2e7-41bc-81a0-e277f20c5205', writeProperty: 'community', logProgress: true, includeIntermediateCommunities: false, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], tolerance: 0.0001}"


# Delete the temporary graph projection

In [64]:
# Drop the 'parthub' graph from the GDS graph catalog. The properties written
# by the .write procedures above are stored on the database nodes, not on this
# graph.
query = '''
CALL gds.graph.drop('parthub')
'''
graph.run(query)

graphName,database,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema,schemaWithOrientation
parthub,neo4j,,-1,54319,51463,{},1.744213175146908e-05,datetime('2023-09-05T14:02:08.109839010+00:00'),datetime('2023-09-05T14:02:08.430935081+00:00'),"{graphProperties: {}, relationships: {`refers to`: {weight: 'Float (DefaultValue(NaN), PERSISTENT, Aggregation.NONE)'}}, nodes: {Part: {}}}","{graphProperties: {}, relationships: {`refers to`: {properties: {weight: 'Float (DefaultValue(NaN), PERSISTENT, Aggregation.NONE)'}, direction: 'DIRECTED'}}, nodes: {Part: {}}}"
