In [1]:
# imports
import pandas as pd
from graphdatascience import GraphDataScience
import torch
from torch_geometric.data import Data
from torch_geometric.transforms import RandomLinkSplit
import random 
import numpy as np
import torch.optim as optim
from torch_geometric.nn import GAE


In [3]:
# set seeds for consistent results
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

# override neo4j_uri and neo4j_auth - your setup
NEO4J_URI = "bolt://localhost:7687"
NEO4J_USER="neo4j"
NEO4J_PASSWORD="12345678"
# NEO4J_URI = "neo4j://localhost:7687"
NEO4J_AUTH = (NEO4J_USER, NEO4J_PASSWORD)
DATABASE = "bioasq300"
gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH, database=DATABASE)

In [4]:
G = gds.graph.get(graph_name="contexts")
print(G)
print(G.node_count())
print(G.relationship_count())
print(G.node_labels())

Graph(name=contexts, node_count=2487, relationship_count=4818)
2487
4818
['CONTEXT']


In [5]:
# use random walk with restarts sampling algorithm with default values
if gds.graph.exists("contexts_sample")['exists']:
    G_sample = gds.graph.get("contexts_sample")
else:
    G_sample, _ = gds.alpha.graph.sample.rwr("contexts_sample", G, random_seed=42)

print(f"total nodes in sampled graph: {G_sample.node_count()}")
print(f"total relationships in sampled graph: {G_sample.relationship_count()}")

total nodes in sampled graph: 373
total relationships in sampled graph: 1640


In [25]:
G_sample.node_properties

<bound method Graph.node_properties of Graph({'graphName': 'contexts_sample', 'nodeCount': 133, 'relationshipCount': 738, 'database': 'bioasq200', 'configuration': {'samplingRatio': 0.15, 'readConcurrency': 4, 'startNodes': [], 'nodeLabelStratification': False, 'undirectedRelationshipTypes': ['IS_SIMILAR_TO'], 'jobId': '18ee354e-256b-440c-9807-fe75a7e87c27', 'logProgress': True, 'query': "MATCH (source:CONTEXT)\r\n\t\tOPTIONAL MATCH (source:CONTEXT)-[r:IS_SIMILAR_TO]->(target:CONTEXT)\r\n\t\tRETURN gds.graph.project(\r\n\t\t  'contexts',\r\n\t\t  source,\r\n\t\t  target,\r\n\t\t  {\r\n\t\t    sourceNodeLabels: labels(source),\r\n\t\t    targetNodeLabels: labels(target),\r\n\t\t    sourceNodeProperties: source { .embedding },\r\n\t\t    targetNodeProperties: target { .embedding },\r\n\t\t    relationshipType: type(r),\r\n\t\t    relationshipProperties: r { .score }\r\n\t\t  },\r\n\t\t  { undirectedRelationshipTypes: ['IS_SIMILAR_TO'] }\r\n)", 'inverseIndexedRelationshipTypes': [], 'node

### Export sampled graph

In [6]:
sample_topology_df = gds.beta.graph.relationships.stream(G_sample)
display(sample_topology_df)

Unnamed: 0,sourceNodeId,targetNodeId,relationshipType
0,2812,2817,IS_SIMILAR_TO
1,2812,2822,IS_SIMILAR_TO
2,2812,2831,IS_SIMILAR_TO
3,2812,2833,IS_SIMILAR_TO
4,2812,2835,IS_SIMILAR_TO
...,...,...,...
1635,4972,4961,IS_SIMILAR_TO
1636,4969,4963,IS_SIMILAR_TO
1637,4969,4971,IS_SIMILAR_TO
1638,5128,5131,IS_SIMILAR_TO


In [7]:
# Fix the format to comply with PyG
# By using 'by_rel_type' we get the topology in a format that can be used as input to several GNN frameworks:
# {"rel_type": [ [source_nodes], [target_nodes] ]}
sample_topology = sample_topology_df.by_rel_type()
sample_topology

{'IS_SIMILAR_TO': [[2812,
   2812,
   2812,
   2812,
   2812,
   2812,
   2812,
   2812,
   2812,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2817,
   2822,
   2822,
   2822,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2831,
   2833,
   2833,
   2833,
   2833,
   2833,
   2833,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2835,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2838,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
   2839,
  

In [8]:
# should only have the "IS_SIMILAR_TO" keys since there's only one relationship type
print(f"sample topology keys: {list(sample_topology.keys())}")
print(len(sample_topology['IS_SIMILAR_TO']))

# how many source nodes do we have?
print(f"number of source nodes: {len(sample_topology['IS_SIMILAR_TO'][0])}")

sample topology keys: ['IS_SIMILAR_TO']
2
number of source nodes: 1640


In [9]:
# we need to export the node properties corresponding to our node labels and features, represented by the
# "subject" and "features"node properties in the graph
sample_node_properties = gds.graph.nodeProperties.stream(
    G_sample, 
    ["embedding"],
    separate_property_columns=True
)
sample_node_properties

Unnamed: 0,nodeId,embedding
0,2681,"[0.24675986170768738, -0.11024772375822067, 0...."
1,2715,"[-0.09246793389320374, -0.650370717048645, 1.0..."
2,2741,"[-0.17698262631893158, -0.33779197931289673, -..."
3,2749,"[0.11361522972583771, -0.7372385263442993, -0...."
4,2778,"[0.7956816554069519, 0.04456862434744835, 0.24..."
...,...,...
368,4972,"[-0.011746952310204506, -0.31106245517730713, ..."
369,5000,"[-0.011844214983284473, -0.2224242091178894, 0..."
370,5044,"[0.24937978386878967, -0.4107266366481781, -0...."
371,5128,"[-0.16294977068901062, -1.125403642654419, -0...."


### Constructing GCN input
construct the PyG `Data` object we will use as training input

In [10]:
# In order for the node ids used in the `topology` to be consecutive and starting from zero,
# we will need to remap them. This way they will also align with the row numbering of the
# `sample_node_properties` data frame
def normalize_topology_index(new_idx_to_old, topology):
    # Create a reverse mapping based on new idx -> old idx
    old_idx_to_new = dict((v, k) for k, v in new_idx_to_old.items())
    return [[old_idx_to_new[node_id] for node_id in nodes] for nodes in topology]


# We use the ordering of node ids in `sample_node_properties` as our remapping
# The result is: [[mapped_source_nodes], [mapped_target_nodes]]
normalized_topology = normalize_topology_index(dict(sample_node_properties["nodeId"]), sample_topology["IS_SIMILAR_TO"])

In [20]:
normalized_topology[0][0]

6

In [11]:
# We use the ordering of node ids in `sample_node_properties` as our remapping
edge_index = torch.tensor(normalized_topology, dtype=torch.long)

# We specify the node property "features" as the zero-layer node embeddings
x = torch.tensor(sample_node_properties["embedding"], dtype=torch.float)

data = Data(x=x, edge_index=edge_index)


print(data)

Data(x=[373, 768], edge_index=[2, 1640])


In [22]:
data.edge_index[1].shape


torch.Size([1640])

In [33]:
data.is_undirected()

True

In [34]:
# split edges: 10% val, 5% test, rest train
transform = RandomLinkSplit(
    num_val=0.10,
    num_test=0.05,
    is_undirected=True,
    split_labels=True,
)
train_data, val_data, test_data = transform(data)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

train_data = train_data.to(device)
val_data   = val_data.to(device)
test_data  = test_data.to(device)

Using device: cuda


In [35]:
train_data

Data(x=[133, 768], edge_index=[2, 630], pos_edge_label=[315], pos_edge_label_index=[2, 315], neg_edge_label=[315], neg_edge_label_index=[2, 315])

#### Define your encoder (GCN‐based)

In [29]:
from torch_geometric.nn import GCNConv


# A simple 2-layer GCN encoder that reduces your 768-d BERT vectors to e.g. 64-d structural embeddings:

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels: int, hidden_channels: int, out_channels: int):
        super().__init__()
        # first layer: semantic → hidden
        self.conv1 = GCNConv(in_channels, hidden_channels)
        # second layer: hidden → structural latent
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        # 1) aggregate neighbors
        x = self.conv1(x, edge_index)
        x = x.relu()
        # 2) final projection
        z = self.conv2(x, edge_index)
        return z

In [None]:

# instantiate encoder + GAE
in_dim   = data.num_node_features    # 768
hid_dim  = 128
out_dim  = 64

encoder = GCNEncoder(in_dim, hid_dim, out_dim)
model   = GAE(encoder)

In [None]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, train_data = model.to(device), train_data.to(device)

optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)

def train():
    model.train()
    optimizer.zero_grad()
    # forward: get embeddings for train connectivity
    z = model.encode(train_data.x, train_data.edge_index)
    # compute loss on pos + neg samples
    loss = model.recon_loss(
        z,
        pos_edge_index=train_data.pos_edge_label_index,
        neg_edge_index=train_data.neg_edge_label_index,
    )
    # optional: add latent regularization (KL for VGAE) or feature smoothness
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def test(split_data):
    model.eval()
    z = model.encode(split_data.x, split_data.edge_index)
    return model.test(
        z,
        pos_edge_index=split_data.pos_edge_label_index,
        neg_edge_index=split_data.neg_edge_label_index,
    )

for epoch in range(1, 301):
    loss = train()
    if epoch % 10 == 0:
        val_auc, val_ap = test(val_data)
        print(f'Epoch {epoch:03d}, Loss: {loss:.4f}, Val AUC: {val_auc:.4f}, AP: {val_ap:.4f}')

Epoch 010, Loss: 7.5227, Val AUC: 0.8191, AP: 0.7344
Epoch 020, Loss: 8.4428, Val AUC: 0.8191, AP: 0.7344
Epoch 030, Loss: 11.9042, Val AUC: 0.7660, AP: 0.6812
Epoch 040, Loss: 12.0153, Val AUC: 0.7553, AP: 0.6714
Epoch 050, Loss: 12.0248, Val AUC: 0.7660, AP: 0.6812
Epoch 060, Loss: 12.2805, Val AUC: 0.7660, AP: 0.6812
Epoch 070, Loss: 9.5515, Val AUC: 0.7872, AP: 0.7015
Epoch 080, Loss: 10.1782, Val AUC: 0.7872, AP: 0.7015
Epoch 090, Loss: 12.3657, Val AUC: 0.7872, AP: 0.7015
Epoch 100, Loss: 12.4510, Val AUC: 0.7872, AP: 0.7015
Epoch 110, Loss: 12.2805, Val AUC: 0.7979, AP: 0.7121
Epoch 120, Loss: 12.1099, Val AUC: 0.8085, AP: 0.7231
Epoch 130, Loss: 12.9627, Val AUC: 0.7979, AP: 0.7121
Epoch 140, Loss: 10.6601, Val AUC: 0.8085, AP: 0.7231
Epoch 150, Loss: 15.8623, Val AUC: 0.6809, AP: 0.6104
Epoch 160, Loss: 16.5640, Val AUC: 0.6702, AP: 0.6026
Epoch 170, Loss: 16.1181, Val AUC: 0.6702, AP: 0.6026
Epoch 180, Loss: 15.5211, Val AUC: 0.6809, AP: 0.6104
Epoch 190, Loss: 15.5211, Val A

In [None]:
# after training

model.eval()
with torch.no_grad():
    # full‐graph encoding
    z = model.encode(data.x.to(device), data.edge_index.to(device))
# z is of shape [N, out_channels], your new graph embeddings

new_embeddings = z.cpu().numpy()
print(new_embeddings.shape)
# >>> (133, 64)

# assuming you have a pandas DataFrame mapping row idx → original nodeId
df = sample_node_properties[['nodeId']].copy()
df['graph_embedding'] = list(new_embeddings.tolist())
# then use gds.graph.writeNodeProperties or the Bolt driver to write back
print(df)
# >>> nodeId	graph_embedding
# >>> 0	3559	[-0.6165163516998291, 0.5408837199211121, -1.0...
# >>> 1	3642	[-3.4880423545837402, -2.6706700325012207, -2....
