Creation of the neo4j database based on the node and edges files that we've already created.

Embeddings were created with ``SentenceTransformer('all-MiniLM-L6-v2')``.

Some field and node names could be fixed upstream, when the node and edge files are generated, which would simplify this code:
* Each node should already carry its type.
* Each edge should already carry the types of its source and destination nodes.

* Check the node order in the ``was admitted in`` relation: source and destination may be swapped in some edges, so the relation could end up being added twice!

In [None]:
import os
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Directory holding the pickled node/edge files. It is concatenated verbatim
# with each file name further down, so it must end with a path separator.
path_dir = './'

In [None]:
from langchain.graphs import Neo4jGraph

# Connection settings. Each value can be overridden through an environment
# variable of the same name so that credentials do not have to be written into
# the notebook; the fallbacks are the values that were previously hard-coded.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Single connection object used by every query in this notebook.
graph = Neo4jGraph(
    url=NEO4J_URI,
    username=NEO4J_USERNAME,
    password=NEO4J_PASSWORD,
)

In [None]:
# Load the pickled node table. The cells below iterate it with ``.items()``,
# treating it as a mapping of node id -> property dict.
# NOTE: unpickling executes arbitrary code; only load files you produced.
nodes = pd.read_pickle(f"{path_dir}neo4j_nodes.pickle")
len(nodes)

In [None]:
def _nodes_of_kind(kind):
    """Return the property dicts of all nodes whose 'is a' equals ``kind``.

    Each returned dict is a copy of the node's properties with the node's key
    added under 'id', so it can be sent to Neo4j as one row.
    """
    return [
        {**props, 'id': node_id}
        for node_id, props in nodes.items()
        if props['is a'] == kind
    ]


nodes_patient = _nodes_of_kind('patient')
nodes_admission = _nodes_of_kind('admission')
nodes_prescription = _nodes_of_kind('prescription')

# Quick sanity check of how many rows each kind produced.
len(nodes_patient), len(nodes_admission), len(nodes_prescription)

In [None]:
# ``defaultdict`` is imported here because the only other import of it in this
# notebook is in a later cell; without this line a top-to-bottom run fails
# with NameError.
from collections import defaultdict

# Node ids grouped by their 'is a' value (e.g. 'patient', 'admission').
# Used later to decide in which direction an edge has to be inserted.
nodes_types = defaultdict(set)
for node_id, props in nodes.items():
    nodes_types[props['is a']].add(node_id)

In [None]:
# The relationship-loading queries later in this notebook look Patient nodes
# up with ``WHERE a.id = ...``. Without an index on ``id`` each lookup has to
# scan every Patient node, so create the index before inserting.
# IF NOT EXISTS keeps the cell re-runnable.
graph.query(
    "CREATE INDEX patient_id_index IF NOT EXISTS FOR (p:Patient) ON (p.id)"
)

# Insert one Patient node per row; only the id is stored.
graph.query("""
UNWIND $data AS row
CREATE (c:Patient {id: row.id})
WITH c, row
RETURN distinct 'done'
""", {'data': nodes_patient})

In [None]:
# The relationship-loading queries later in this notebook look Admission nodes
# up with ``WHERE a.id = ...`` / ``b.id = ...``. Without an index on ``id``
# each lookup has to scan every Admission node, so create the index first.
# IF NOT EXISTS keeps the cell re-runnable.
graph.query(
    "CREATE INDEX admission_id_index IF NOT EXISTS FOR (a:Admission) ON (a.id)"
)

# Insert one Admission node per row. The source keys contain spaces
# ('has age', 'has gender'); they are stored under underscore names.
graph.query("""
UNWIND $data AS row
CREATE (c:Admission {id: row.id, has_age: row['has age'], has_gender: row['has gender']})
WITH c, row
RETURN distinct 'done'
""", {'data': nodes_admission})

In [None]:
# The relationship-loading queries later in this notebook look Prescription
# nodes up with ``WHERE ... .id = ...``. Without an index on ``id`` each
# lookup has to scan every Prescription node, so create the index first.
# IF NOT EXISTS keeps the cell re-runnable.
graph.query(
    "CREATE INDEX prescription_id_index IF NOT EXISTS "
    "FOR (p:Prescription) ON (p.id)"
)

# Prescriptions are sent in slices of ``n`` rows per query instead of one
# single request.
n = 150000
for i in tqdm(range(0, len(nodes_prescription), n)):
    # Source keys contain spaces; they are stored under underscore names.
    graph.query("""
    UNWIND $data AS row
    CREATE (c:Prescription {id: row.id,
                            has_a_duration_of: row['has a duration of'],
                            has_a_dose_of: row['has a dose of'],
                            has_a_drug_of: row['has a drug of']
                           })
    WITH c, row
    RETURN distinct 'done'
    """, {'data': nodes_prescription[i:i + n]})

In [None]:
# Drop the node tables that have already been written to Neo4j so their
# memory can be reclaimed before the next pickle is loaded.
del nodes_patient, nodes_admission, nodes_prescription, nodes

In [None]:
# Load the drug node table (iterated below as node id -> property dict) and
# turn it into a list of rows, each row carrying its own 'id'.
# NOTE: unpickling executes arbitrary code; only load files you produced.
nodes = pd.read_pickle(f"{path_dir}neo4j_nodes_drugs.pickle")
nodes_drugs = [{**props, 'id': drug_id} for drug_id, props in nodes.items()]
len(nodes)

In [None]:
# One cosine vector index per embedded Drug text field. For a field ``f`` the
# index is named ``f_index`` and covers the property ``f__embeddings``.
# 384 is the vector size passed to the procedure; it has to match the size of
# the embeddings stored on the nodes (all-MiniLM-L6-v2 per the header above).
for field in ('acts_as', 'is_known_as', 'is_indicated_for', 'can_be_described_as'):
    graph.query("""
    CALL db.index.vector.createNodeIndex(
      $index_name,  // index name
      'Drug',       // node label
      $property,    // node property
      384,          // vector size
      'cosine'      // similarity metric
    )
    """, {'index_name': f'{field}_index', 'property': f'{field}__embeddings'})

In [None]:
# The relationship-loading queries later in this notebook look Drug nodes up
# with ``WHERE ... .id = ...``. Without an index on ``id`` each lookup has to
# scan every Drug node, so create the index first.
# IF NOT EXISTS keeps the cell re-runnable.
graph.query("CREATE INDEX drug_id_index IF NOT EXISTS FOR (d:Drug) ON (d.id)")

# Insert one Drug node per row: the four text fields plus their embeddings.
# Source keys contain spaces; they are stored under underscore names.
graph.query("""
UNWIND $data AS row
CREATE (c:Drug {id : row['id'],

                is_known_as: row['is known as'],
                is_indicated_for: row['is indicated for'],
                acts_as: row['acts as'],
                can_be_described_as: row['can be described as'],

                is_known_as__embeddings: row['is known as__embeddings'],
                acts_as__embeddings: row['acts as__embeddings'],
                is_indicated_for__embeddings: row['is indicated for__embeddings'],
                can_be_described_as__embeddings: row['can be described as__embeddings']
                })
WITH c, row
RETURN distinct 'done'
""", {'data': nodes_drugs})

In [None]:
# Re-write every Drug embedding through db.create.setVectorProperty.
#
# * Every field is guarded with IS NOT NULL. Previously the first field
#   (is_known_as) was the only one without the guard, so a Drug without that
#   embedding was passed to the procedure with a null vector.
# * Only the number of updated nodes is returned, instead of sending every
#   node (with all of its embeddings) back to the client.
for field in ('is_known_as', 'acts_as', 'is_indicated_for', 'can_be_described_as'):
    updated = graph.query("""
    MATCH (c:Drug)
    WHERE c[$property] IS NOT NULL
    CALL db.create.setVectorProperty(c, $property, c[$property])
    YIELD node
    RETURN count(node) AS updated
    """, {'property': f'{field}__embeddings'})
    print(field, updated)

In [None]:
# Load the ICD node table (iterated below as node id -> property dict) and
# turn it into a list of rows, each row carrying its own 'id'.
# NOTE: unpickling executes arbitrary code; only load files you produced.
nodes = pd.read_pickle(f"{path_dir}neo4j_nodes_icds.pickle")
nodes_icds = [{**props, 'id': icd_id} for icd_id, props in nodes.items()]
len(nodes)

In [None]:
# Cosine vector index (size 384) over ICD.is_known_as__embeddings.
icd_vector_index_cypher = """
CALL db.index.vector.createNodeIndex(
  'is_known_as_icd_index', // index name
  'ICD',     // node label
  'is_known_as__embeddings', // node property
   384,       // vector size
   'cosine'    // similarity metric
)
"""
graph.query(icd_vector_index_cypher)

In [None]:
# The relationship-loading queries later in this notebook look ICD nodes up
# with ``WHERE a.id = ...`` / ``b.id = ...``. Without an index on ``id`` each
# lookup has to scan every ICD node, so create the index first.
# IF NOT EXISTS keeps the cell re-runnable.
graph.query("CREATE INDEX icd_id_index IF NOT EXISTS FOR (i:ICD) ON (i.id)")

# Insert one ICD node per row and, in the same statement, write its embedding
# through db.create.setVectorProperty.
graph.query("""
UNWIND $data AS row
CREATE (c:ICD {id: row.id,
               is_known_as: row['is known as'],
               is_known_as__embeddings: row['is known as__embeddings']
              })
WITH c, row
CALL db.create.setVectorProperty(c, 'is_known_as__embeddings', row['is known as__embeddings'])
YIELD node
RETURN distinct 'done'
""", {'data': nodes_icds})

In [None]:
# Load the pickled edge list. The next cell iterates it as a sequence of
# dicts with at least the keys 'rel_type' and 'source'.
# NOTE: unpickling executes arbitrary code; only load files you produced.
edges = pd.read_pickle(f"{path_dir}neo4j_edges.pickle")
len(edges)

In [None]:
from collections import defaultdict

# (source label, destination label) for every relation whose stored name is
# just the original name with spaces replaced by underscores.
# 'takes drug' is handled separately because its labels depend on the edge.
_LABELS_BY_RELATION = {
    'has prescription': ('Admission', 'Prescription'),
    'was admitted in': ('Patient', 'Admission'),
    'was diagnosed': ('Admission', 'ICD'),
    'interacts with': ('Drug', 'Drug'),
    'is subclass of': ('ICD', 'ICD'),
}

# Edges grouped by their final (underscore) relation name.
edges_types = defaultdict(list)

# Each edge dict is annotated IN PLACE with 'source_type' / 'dest_type' and
# its 'rel_type' is rewritten to the Neo4j relationship name.
for e in tqdm(edges):
    relation = e['rel_type']

    if relation == 'takes drug':
        # A source id starting with 'pres' marks a prescription; those edges
        # get their own relationship name.
        if e['source'].startswith('pres'):
            labels, new_relation = ('Prescription', 'Drug'), 'prescribed_drug'
        else:
            labels, new_relation = ('Admission', 'Drug'), relation.replace(' ', '_')
    elif relation in _LABELS_BY_RELATION:
        labels = _LABELS_BY_RELATION[relation]
        new_relation = relation.replace(' ', '_')
    else:
        # Unknown relation: show the offending edge and stop. Edges after it
        # are left unclassified.
        print(e)
        break

    e['source_type'], e['dest_type'] = labels
    e['rel_type'] = new_relation
    edges_types[new_relation].append(e)

In [None]:
# Number of edges collected for each relation name.
for relation_name, relation_edges in edges_types.items():
    print(relation_name, len(relation_edges))

In [None]:
# Batch size: number of edges sent per query by the was_admitted_in loader in
# the next cell.
n = 100

In [None]:
# Load the Patient -> Admission relationships in batches of ``n`` edges.
#
# Some edges may have been recorded with source and destination swapped, so
# each edge is checked against the known patient / admission ids in both
# orientations. The relationship itself is always written as
# (Patient)-[:was_admitted_in]->(Admission); MERGE keeps an edge present in
# both orientations from being stored twice. Edges matching neither
# orientation are skipped.
#
# The batches are plain lists: ``deque`` is never imported in this notebook
# (NameError), and a list is what the query parameter is expected to be.

_ADMITTED_DIRECT_CYPHER = """
UNWIND $data AS edge
MATCH (a:Patient), (b:Admission)
WHERE a.id = edge.source AND b.id = edge.dest
MERGE (a)-[r:was_admitted_in]->(b)
RETURN distinct 'done'
"""

_ADMITTED_REVERSED_CYPHER = """
UNWIND $data AS edge
MATCH (a:Patient), (b:Admission)
WHERE a.id = edge.dest AND b.id = edge.source
MERGE (a)-[r:was_admitted_in]->(b)
RETURN distinct 'done'
"""


def _flush_admitted(cypher, batch):
    """Send ``batch`` to Neo4j using ``cypher`` and empty it.

    Does nothing when ``batch`` is empty.
    """
    if batch:
        graph.query(cypher, {'data': list(batch)})
        batch.clear()


direct_ = []
reversed_ = []

for x in tqdm(edges_types['was_admitted_in']):
    # Id sets for the labels this edge claims, e.g. 'Patient' -> 'patient'.
    source_ids = nodes_types[x['source_type'].lower()]
    dest_ids = nodes_types[x['dest_type'].lower()]

    if x['source'] in source_ids and x['dest'] in dest_ids:
        direct_.append(x)
        if len(direct_) >= n:
            _flush_admitted(_ADMITTED_DIRECT_CYPHER, direct_)

    if x['dest'] in source_ids and x['source'] in dest_ids:
        reversed_.append(x)
        if len(reversed_) >= n:
            _flush_admitted(_ADMITTED_REVERSED_CYPHER, reversed_)

# Send whatever is left in the partially filled batches.
_flush_admitted(_ADMITTED_DIRECT_CYPHER, direct_)
_flush_admitted(_ADMITTED_REVERSED_CYPHER, reversed_)

In [None]:
# Batch size for the loaders in this cell. The following is_subclass_of cell
# does not set ``n`` itself and therefore also uses this value.
n = 25

# (Admission)-[:has_prescription]->(Prescription)
has_prescription_cypher = """
    UNWIND $data AS edge
    MATCH (a:Admission), (b:Prescription)
    WHERE a.id = edge.source AND b.id = edge.dest
    MERGE (a)-[r:has_prescription]->(b)
    RETURN distinct 'done'
    """
has_prescription_edges = edges_types['has_prescription']
for start in tqdm(range(0, len(has_prescription_edges), n)):  # reached 337
    batch = has_prescription_edges[start:start + n]
    graph.query(has_prescription_cypher, {'data': batch})

# (Prescription)-[:prescribed_drug]->(Drug)
prescribed_drug_cypher = """
    UNWIND $data AS edge
    MATCH (a:Prescription), (b:Drug)
    WHERE a.id = edge.source AND b.id = edge.dest
    MERGE (a)-[r:prescribed_drug]->(b)
    RETURN distinct 'done'
    """
prescribed_drug_edges = edges_types['prescribed_drug']
for start in tqdm(range(0, len(prescribed_drug_edges), n)):  # 3277
    batch = prescribed_drug_edges[start:start + n]
    graph.query(prescribed_drug_cypher, {'data': batch})

In [None]:
# (ICD)-[:is_subclass_of]->(ICD), sent in batches of ``n`` edges.
# ``n`` is not set in this cell; it keeps the value from the last cell run.
subclass_cypher = """
    UNWIND $data AS edge
    MATCH (a:ICD), (b:ICD)
    WHERE a.id = edge.source AND b.id = edge.dest
    MERGE (a)-[r:is_subclass_of]->(b)
    RETURN distinct 'done'
    """
subclass_edges = edges_types['is_subclass_of']
for start in tqdm(range(0, len(subclass_edges), n)):  # reached 337
    batch = subclass_edges[start:start + n]
    graph.query(subclass_cypher, {'data': batch})

In [None]:
# Batch size for the two loaders in this cell.
n = 25

# NOTE(review): both loaders below use CREATE, whereas the has_prescription,
# prescribed_drug and is_subclass_of loaders use MERGE. Re-running this cell
# therefore adds the relationships again -- confirm that this is intended.

# (Admission)-[:takes_drug]->(Drug)
takes_drug_cypher = """
    UNWIND $data AS edge
    MATCH (a:Admission), (b:Drug)
    WHERE a.id = edge.source AND b.id = edge.dest
    CREATE (a)-[r:takes_drug]->(b)
    RETURN distinct 'done'
    """
takes_drug_edges = edges_types['takes_drug']
for start in tqdm(range(0, len(takes_drug_edges), n)):
    batch = takes_drug_edges[start:start + n]
    graph.query(takes_drug_cypher, {'data': batch})

# (Drug)-[:interacts_with]->(Drug)
interacts_with_cypher = """
    UNWIND $data AS edge
    MATCH (a:Drug), (b:Drug)
    WHERE a.id = edge.source AND b.id = edge.dest
    CREATE (a)-[r:interacts_with]->(b)
    RETURN distinct 'done'
    """
interacts_with_edges = edges_types['interacts_with']
for start in tqdm(range(0, len(interacts_with_edges), n)):
    batch = interacts_with_edges[start:start + n]
    graph.query(interacts_with_cypher, {'data': batch})

In [None]:
# Batch size for the diagnosis loader.
n = 10

# (Admission)-[:was_diagnosed]->(ICD)
# NOTE(review): uses CREATE, not MERGE, so re-running this cell adds the
# relationships again -- confirm that this is intended.
was_diagnosed_cypher = """
    UNWIND $data AS edge
    MATCH (a:Admission), (b:ICD)
    WHERE a.id = edge.source AND b.id = edge.dest
    CREATE (a)-[r:was_diagnosed]->(b)
    RETURN distinct 'done'
    """
was_diagnosed_edges = edges_types['was_diagnosed']
for start in tqdm(range(0, len(was_diagnosed_edges), n)):
    batch = was_diagnosed_edges[start:start + n]
    graph.query(was_diagnosed_cypher, {'data': batch})

In [None]:
# Batch size for this loader.
n = 25

# (ICD)-[:is_subclass_of]->(ICD). The same relation is also loaded by an
# earlier cell of this notebook; because the query uses MERGE, relationships
# that already exist are not created a second time.
icd_subclass_cypher = """
    UNWIND $data AS edge
    MATCH (a:ICD), (b:ICD)
    WHERE a.id = edge.source AND b.id = edge.dest
    MERGE (a)-[r:is_subclass_of]->(b)
    RETURN distinct 'done'
    """
icd_subclass_edges = edges_types['is_subclass_of']
for start in tqdm(range(0, len(icd_subclass_edges), n)):
    batch = icd_subclass_edges[start:start + n]
    graph.query(icd_subclass_cypher, {'data': batch})