In [8]:
import os
import re
from getpass import getpass

import numpy as np
import pandas as pd
from py2neo import Node, Relationship, Graph, Subgraph
from tqdm import tqdm

# Connect to the Neo4j database

In [9]:
# Requires a running Neo4j instance.
# Connection settings are read from the environment so that credentials are
# not stored in the notebook; the password is prompted for when
# NEO4J_PASSWORD is not set.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD") or getpass("Neo4j password: ")
graph = Graph(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
# WARNING: destructive -- delete_all() is called on the connected graph before
# the notebook rebuilds it from the CSV. Do not point this at a database whose
# contents you want to keep.
graph.delete_all()

# Load data

In [3]:
def gradient_color_generate(time: str):
    """Map a date to a hex colour taken from matplotlib's 'plasma' colormap.

    The colour encodes the age of the date in days (now - date), normalised
    against the number of days between 2004-01-01 and now.

    Args:
        time: Date string in ``%Y-%m-%d`` form. A value that cannot be parsed
            (wrong format, or a non-string such as NaN/None) is treated as
            2004-01-01.

    Returns:
        The colour as returned by ``matplotlib.colors.rgb2hex``.
    """
    from datetime import datetime
    import matplotlib as mpl

    begin_time = datetime.strptime('2004-1-1', r'%Y-%m-%d')
    try:
        exact_time = datetime.strptime(time, r'%Y-%m-%d')
    except (TypeError, ValueError):
        # TypeError: `time` is not a string (e.g. a missing CSV cell read as NaN).
        # ValueError: the string does not match the expected format.
        exact_time = begin_time
    now = datetime.now()
    max_interval = now - begin_time
    interval = now - exact_time
    norm = mpl.colors.Normalize(vmin=0, vmax=max_interval.days)
    cmap = mpl.colormaps.get_cmap('plasma')
    return mpl.colors.rgb2hex(cmap(norm(interval.days)))

In [10]:
# Values that mean "no entries" in the space-separated id columns.
# 'N o n e' is matched literally because the source data contains it.
NO_PARTS_MARKERS = ('None', '', 'N o n e')
NO_REFS_MARKERS = ('self', '')


def _split_part_ids(raw, empty_markers):
    """Turn one space-separated id cell into a list of part ids.

    A missing cell (NaN/None) or a value listed in ``empty_markers`` yields an
    empty list. Empty tokens produced by repeated or trailing spaces are dropped.
    """
    if pd.isna(raw):
        return []
    text = str(raw)
    if text in empty_markers:
        return []
    return [token for token in text.split(' ') if token]


def _refers_to(source, target):
    """Build a 'refers to' edge whose weight is the source node's prweight."""
    edge = Relationship(source, 'refers to', target)
    edge["weight"] = source['prweight']
    return edge


data = pd.read_csv(r'data/all_collections_filted.csv')
part_node_dict = {}      # part number (str) -> Node
part_list = []           # every Node created, in CSV order
relationship_list = []

# --- Pass 1: one 'Part' node per CSV row -----------------------------------
# Iterating over records avoids mixing index labels with positional access.
for row in tqdm(data.to_dict('records')):
    part_num = str(row['part_num'])
    part_date = row['date']

    # Each column is parsed independently, so a missing value in one column
    # can no longer wipe the lists parsed from the other two.
    part_used_list = _split_part_ids(row['parts_used'], NO_PARTS_MARKERS)
    part_using_list = _split_part_ids(row['using_parts'], NO_REFS_MARKERS)
    part_twins_list = _split_part_ids(row['twins'], NO_PARTS_MARKERS)

    # Node weight: 0.5 per entry in parts_used, 1 per entry in using_parts,
    # 0.75 per twin, with a floor of 1.
    prweight = max(
        1,
        len(part_used_list) * 0.5 + len(part_using_list) + 0.75 * len(part_twins_list),
    )

    part_node = Node(
        'Part',
        number=part_num,
        name=row['part_name'],
        url=row['part_url'],
        description=row['short_desc'],
        type=row['part_type'],
        team=row['team'],
        sequence=row['sequence'],
        contents=re.sub(' Sequence and Features', '', str(row['contents'])),
        released=row['released'],
        sample=row['sample'],
        assemble=row['assemble_std'],
        length=row['len'],
        date=part_date,
        isfavorite=str(row['isfavorite']),
        twins=part_twins_list,
        twins_num=str(len(part_twins_list)),
        cited_by=part_used_list,
        year=str(row['year']),
        cites=str(len(part_used_list)),
        ref=part_using_list,
        citing=str(len(part_using_list)),
        designer=row['designer'],
        prweight=prweight,
        color=gradient_color_generate(part_date),
    )
    part_list.append(part_node)
    # NOTE(review): if part_num repeats in the CSV, the later row replaces the
    # earlier one here while both nodes stay in part_list -- confirm whether
    # duplicates should be dropped before upload.
    part_node_dict[part_num] = part_node

# --- Pass 2: relationships between nodes that exist in the data set --------
# Ids that are not present in part_node_dict are skipped.
for part_node in tqdm(part_node_dict.values()):
    for ref_id in part_node['ref'] or []:
        ref_node = part_node_dict.get(ref_id)
        if ref_node is not None:
            relationship_list.append(_refers_to(part_node, ref_node))

    for twin_id in part_node['twins'] or []:
        twin_node = part_node_dict.get(twin_id)
        # A part listed as its own twin does not get a self-loop.
        if twin_node is not None and twin_node['number'] != part_node['number']:
            relationship_list.append(Relationship(part_node, 'twins', twin_node))

    for citing_id in part_node['cited_by'] or []:
        citing_node = part_node_dict.get(citing_id)
        if citing_node is not None:
            relationship_list.append(_refers_to(citing_node, part_node))

# The same edge can be produced from both ends ('ref' on one part and
# 'cited_by' on the other), so collapse duplicates.
relationship_list = list(set(relationship_list))

100%|██████████| 54319/54319 [00:17<00:00, 3083.84it/s]
100%|██████████| 54070/54070 [00:00<00:00, 96322.90it/s] 


# Upload to Neo4j

In [11]:
# Bundle every Part node and every relationship built above into one Subgraph.
subgraph = Subgraph(part_list, relationship_list)
# Open a transaction, create the subgraph in it, then commit.
tx = graph.begin()
tx.create(subgraph)
graph.commit(tx)

# Calculate PageRank

In [12]:
# Create the in-memory GDS projection 'parthub' from the Part nodes and their
# 'refers to' relationships, carrying the relationship 'weight' property.
#
# gds.graph.project fails if a graph with the same name is already in the
# catalog, which made this cell fail on every re-run of the notebook. Drop any
# earlier projection first; failIfMissing=false makes that a no-op on the first
# run. .data() consumes the result so the drop has finished before projecting.
graph.run("CALL gds.graph.drop('parthub', false)").data()

query = """
CALL gds.graph.project(
  'parthub',
  'Part',
  'refers to',
  {
    relationshipProperties: 'weight'
  }
)
"""
graph.run(query)

nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Part: {label: 'Part', properties: {}}}","{`refers to`: {orientation: 'NATURAL', indexInverse: false, aggregation: 'DEFAULT', type: 'refers to', properties: {weight: {defaultValue: null, property: 'weight', aggregation: 'DEFAULT'}}}}",parthub,54319,14307,55


In [13]:
# Run weighted PageRank on the 'parthub' projection and write each node's score
# to its 'pagerank' property. The call yields the number of node properties
# written and the number of iterations actually run.
# NOTE(review): every 'refers to' edge built earlier in this notebook carries
# its *source* node's prweight, so all outgoing edges of a node share the same
# weight. If GDS normalises weights per source node, this may give the same
# ranking as unweighted PageRank -- confirm against the GDS documentation.
query = '''
CALL gds.pageRank.write('parthub', {
  maxIterations: 20,
  dampingFactor: 0.85,
  writeProperty: 'pagerank',
  relationshipWeightProperty: 'weight'
})
YIELD nodePropertiesWritten, ranIterations
'''
graph.run(query)

nodePropertiesWritten,ranIterations
54319,5


# Louvain method

In [20]:
# Run Louvain community detection on the 'parthub' projection, using the
# relationship 'weight' property, and write each node's result to its
# 'community' property. No YIELD clause is given, so the procedure's full
# result row is returned and displayed as the cell output.
query = '''
CALL gds.louvain.write('parthub', {
  writeProperty: 'community',
  relationshipWeightProperty: 'weight'
})
'''
graph.run(query)

writeMillis,nodePropertiesWritten,modularity,modularities,ranLevels,communityCount,communityDistribution,postProcessingMillis,preProcessingMillis,computeMillis,configuration
40,54319,0.9042461555324456,"[0.5253737060994204, 0.7372567110719895, 0.8359484911842526, 0.8885564991582454, 0.8940020297177104, 0.8983109976663628, 0.9008532361090621, 0.9025451387296474, 0.9034142032756712, 0.9042461555324456]",10,43581,"{p99: 3, min: 1, max: 1703, mean: 1.246391776232762, p90: 1, p50: 1, p999: 26, p95: 1, p75: 1}",12,0,2296,"{maxIterations: 10, writeConcurrency: 4, seedProperty: null, consecutiveIds: false, maxLevels: 10, relationshipWeightProperty: 'weight', concurrency: 4, jobId: 'e921f470-e6c6-46e1-98ce-301484ca8c6f', writeProperty: 'community', logProgress: true, includeIntermediateCommunities: false, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], tolerance: 0.0001}"
