## HGT

In [None]:
# %pip install pandas dgl neo4j seaborn  # uncomment and run once; %pip installs into this kernel's environment

In [None]:
import json
import os

import pandas as pd
import seaborn as sns
from neo4j import GraphDatabase

sns.set()

### Connect

In [None]:
# Connection settings can be overridden with the NEO4J_URI, NEO4J_USER and
# NEO4J_PASSWORD environment variables so that real credentials never have to
# be written into the notebook.
# NOTE(review): the fallbacks are the original local-development values; do
# not reuse this password for anything but a throwaway local database.
driver = GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "12341234"),
    ),
)

In [None]:
# Check up front that the driver can actually reach the Neo4j server, so a bad
# URI or login surfaces here rather than inside the first query cell.
driver.verify_connectivity()

### Link Prediction (Data Preparation)

- I have the following node types in Neo4j: Compound, Modified_residue, Organism, PTM_Type, Pathway, Protein, Reference.
- The relationships are: (Protein, Pathway), (Protein, Modified_residue), (Modified_residue, PTM_Type), (Protein, Protein), (Modified_residue, Reference), (Compound, Pathway), (Protein, Organism). Help me with the following:

- (1) Extract a heterogeneous graph from Neo4j and save it as CSV files.
- (2) Use the CSV files to create a DGL heterogeneous graph.

Next, create a graph neural network on this heterogeneous graph.

![img](schema.png)

### Get Node Data

In [None]:
# Fetch node IDs (de-duplication is left to the caller)
def getnode(query):
    """Run a Cypher query and return the first column of every record.

    Parameters
    ----------
    query : str
        Cypher statement to execute. Only the first value of each returned
        record is kept.

    Returns
    -------
    pandas.DataFrame
        One row per record with a single ``source`` column. The column exists
        even when the query returns no records, so ``df["source"]`` is always
        safe to access.
    """
    with driver.session() as session:
        rows = [{"source": record.values()[0]} for record in session.run(query)]
    # Pin the schema: pd.DataFrame([]) on its own would have no columns at all.
    return pd.DataFrame(rows, columns=["source"])

In [None]:
def _build_node_index(query, index_path):
    """Fetch one node type and persist its ID -> integer-index mapping.

    Parameters
    ----------
    query : str
        Cypher query passed to ``getnode``; its first returned column is the
        node identifier.
    index_path : str
        Path of the JSON file the mapping is written to.

    Returns
    -------
    tuple
        ``(node_df, index)``: the DataFrame returned by ``getnode`` and a dict
        mapping each distinct identifier to its position in first-seen order.
    """
    node_df = getnode(query)
    # .tolist() turns NumPy scalars into plain Python values; json.dump
    # refuses NumPy integers as dictionary keys.
    unique_ids = node_df["source"].unique().tolist()
    index = {node_id: i for i, node_id in enumerate(unique_ids)}
    with open(index_path, "w") as f:
        json.dump(index, f)
    return node_df, index


# The output directory is not part of a fresh checkout; create it so the
# open(..., "w") calls below cannot fail with FileNotFoundError.
os.makedirs("hgt_data", exist_ok=True)

protein_query = "MATCH (p:Protein) RETURN p.UID AS source"
protein_df, p2index = _build_node_index(protein_query, "hgt_data/p2index.json")

pathway_query = "MATCH (p:Pathway) RETURN p.PWID AS source"
pathway_df, pw2index = _build_node_index(pathway_query, "hgt_data/pw2index.json")

modified_residue_query = "MATCH (p:Modified_residue) RETURN p.MRID AS source"
modified_residue_df, mr2index = _build_node_index(modified_residue_query, "hgt_data/mr2index.json")

PTM_query = "MATCH (p:PTM_Type) RETURN p.PTMID AS source"
PTM_df, ptm2index = _build_node_index(PTM_query, "hgt_data/ptm2index.json")

organism_query = "MATCH (p:Organism) RETURN p.name AS source"
organism_df, org2index = _build_node_index(organism_query, "hgt_data/org2index.json")

compound_query = "MATCH (p:Compound) RETURN p.CID AS source"
compound_df, cpd2index = _build_node_index(compound_query, "hgt_data/cpd2index.json")

Reference_query = "MATCH (p:Reference) RETURN p.PMID AS source"
Reference_df, ref2index = _build_node_index(Reference_query, "hgt_data/ref2index.json")

### Get Relationship Data

In [None]:
def getdata(query,relation):
    """Run a Cypher query returning (source, target) pairs and label them.

    Parameters
    ----------
    query : str
        Cypher statement to execute. The first two values of each returned
        record are used as the edge's source and target identifiers.
    relation : str
        Relationship label stored in every row of the result.

    Returns
    -------
    pandas.DataFrame
        One row per record with columns ``source``, ``target`` and
        ``relation``. The columns exist even when the query returns no
        records, so downstream column access does not raise ``KeyError``.
    """
    with driver.session() as session:
        rows = [
            {"source": values[0], "target": values[1], "relation": relation}
            for values in (record.values() for record in session.run(query))
        ]
    # Pin the schema: pd.DataFrame([]) on its own would have no columns at all.
    return pd.DataFrame(rows, columns=["source", "target", "relation"])


#### Protein to Pathway relationship

In [None]:
protein_pathway_query = "MATCH (p:Protein)-[r:PARTICIPATES_IN]->(pw:Pathway)\
         RETURN p.UID AS source, pw.PWID AS target"
protein_pathway_df = getdata(protein_pathway_query, "PARTICIPATES_IN")

# Translate the Neo4j identifiers of both endpoints into integer node indices.
protein_pathway_df = protein_pathway_df.assign(
    source_index=protein_pathway_df["source"].map(lambda uid: p2index[uid]),
    target_index=protein_pathway_df["target"].map(lambda pwid: pw2index[pwid]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
protein_pathway_df.to_csv("hgt_data/protein_pathway.csv", index=False)
protein_pathway_df.shape

In [None]:
# Preview the first three edges (IDs, relation label and integer indices).
protein_pathway_df.head(3)

#### Protein to Modified_residue relationship

In [None]:

protein_modified_residue_query = "MATCH (p:Protein)-[r:HAS_MODIFIED_RESIDUE]->(mr:Modified_residue)\
         RETURN p.UID AS source, mr.MRID AS target"
protein_modified_residue_df = getdata(protein_modified_residue_query, "HAS_MODIFIED_RESIDUE")

# Translate the Neo4j identifiers of both endpoints into integer node indices.
protein_modified_residue_df = protein_modified_residue_df.assign(
    source_index=protein_modified_residue_df["source"].map(lambda uid: p2index[uid]),
    target_index=protein_modified_residue_df["target"].map(lambda mrid: mr2index[mrid]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
protein_modified_residue_df.to_csv("hgt_data/protein_modified_residue.csv", index=False)
protein_modified_residue_df.shape

In [None]:
# Preview the first three edges (IDs, relation label and integer indices).
protein_modified_residue_df.head(3)

#### Modified_residue to PTM_Type relationship

In [None]:
modified_residue_ptm_type_query = "MATCH (mr:Modified_residue)-[r:HAS_PTM_TYPE]->(pt:PTM_Type)\
            RETURN mr.MRID AS source, pt.PTMID AS target"
modified_residue_ptm_type_df = getdata(modified_residue_ptm_type_query, "HAS_PTM_TYPE")

# Translate the Neo4j identifiers of both endpoints into integer node indices.
modified_residue_ptm_type_df = modified_residue_ptm_type_df.assign(
    source_index=modified_residue_ptm_type_df["source"].map(lambda mrid: mr2index[mrid]),
    target_index=modified_residue_ptm_type_df["target"].map(lambda ptmid: ptm2index[ptmid]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
modified_residue_ptm_type_df.to_csv("hgt_data/modified_residue_ptm_type.csv", index=False)
modified_residue_ptm_type_df.shape

In [None]:
# Preview the first three edges (IDs, relation label and integer indices).
modified_residue_ptm_type_df.head(3)

#### PPI_INTERACTION between two proteins

In [None]:
ppi_query = "MATCH (p1:Protein)-[r:HAS_PPI_INTERACTION]->(p2:Protein)\
            RETURN p1.UID AS source, p2.UID AS target"
ppi_df = getdata(ppi_query, "HAS_PPI_INTERACTION")

# Both endpoints are proteins, so the same ID -> index table serves each side.
ppi_df = ppi_df.assign(
    source_index=ppi_df["source"].map(lambda uid: p2index[uid]),
    target_index=ppi_df["target"].map(lambda uid: p2index[uid]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
ppi_df.to_csv("hgt_data/ppi.csv", index=False)
ppi_df.shape

In [None]:
# Preview the first three protein-protein edges.
ppi_df.head(3)

#### Modified_residue to Reference relationship

In [None]:
modified_residue_reference_query = "MATCH (mr:Modified_residue)-[r:HAS_REFERENCE]->(ref:Reference)\
            RETURN mr.MRID AS source, ref.PMID AS target"
modified_residue_reference_df = getdata(modified_residue_reference_query, "HAS_REFERENCE")

# Drop edges with a missing endpoint ID (they cannot be mapped to an index).
# The index columns are then added with .assign(), which builds a new frame;
# assigning columns directly onto the dropna() result can trigger pandas'
# SettingWithCopyWarning.
modified_residue_reference_df = modified_residue_reference_df.dropna().assign(
    source_index=lambda df: df["source"].map(lambda mrid: mr2index[mrid]),
    target_index=lambda df: df["target"].map(lambda pmid: ref2index[pmid]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
modified_residue_reference_df.to_csv("hgt_data/modified_residue_reference.csv", index=False)
modified_residue_reference_df.shape

In [None]:
# Preview the first three edges that survived the dropna() filter.
modified_residue_reference_df.head(3)

#### Compound to Pathway relationship

In [None]:
compound_pathway_query = "MATCH (c:Compound)-[r:PARTICIPATES_IN]->(pw:Pathway)\
            RETURN c.CID AS source, pw.PWID AS target"
compound_pathway_df = getdata(compound_pathway_query, "PARTICIPATES_IN")

# Translate the Neo4j identifiers of both endpoints into integer node indices.
compound_pathway_df = compound_pathway_df.assign(
    source_index=compound_pathway_df["source"].map(lambda cid: cpd2index[cid]),
    target_index=compound_pathway_df["target"].map(lambda pwid: pw2index[pwid]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
compound_pathway_df.to_csv("hgt_data/compound_pathway.csv", index=False)
compound_pathway_df.shape


In [None]:
# Preview the first three edges (IDs, relation label and integer indices).
compound_pathway_df.head(3)

#### Protein to Organism relationship

In [None]:
protein_organism_query = "MATCH (p:Protein)-[r:BELONGS_TO]->(o:Organism)\
            RETURN p.UID AS source, o.name AS target"
protein_organism_df = getdata(protein_organism_query, "BELONGS_TO")

# Translate the Neo4j identifiers of both endpoints into integer node indices.
protein_organism_df = protein_organism_df.assign(
    source_index=protein_organism_df["source"].map(lambda uid: p2index[uid]),
    target_index=protein_organism_df["target"].map(lambda name: org2index[name]),
)

# Persist the edge list, then show (n_edges, n_columns) as the cell output.
protein_organism_df.to_csv("hgt_data/protein_organism.csv", index=False)
protein_organism_df.shape

In [None]:
# Preview the first three edges (IDs, relation label and integer indices).
protein_organism_df.head(3)

## Heterogeneous Graph

In [None]:
import dgl
import pandas as pd
import torch

Here's a Python snippet to read CSV files and create a DGL heterogeneous graph:

In [None]:
# Load every exported edge list back from disk, one DataFrame per relation.
(
    protein_pathway,
    protein_modified_residue,
    modified_residue_ptm_type,
    ppi,
    modified_residue_reference,
    compound_pathway,
    protein_organism,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        'hgt_data/protein_pathway.csv',
        'hgt_data/protein_modified_residue.csv',
        'hgt_data/modified_residue_ptm_type.csv',
        'hgt_data/ppi.csv',
        'hgt_data/modified_residue_reference.csv',
        'hgt_data/compound_pathway.csv',
        'hgt_data/protein_organism.csv',
    )
]


In [None]:
# Creating DGL Heterogeneous graph
def _node_count(index_path):
    """Return the number of nodes recorded in a saved ID -> index JSON map.

    The count is taken as ``max(index) + 1`` rather than ``len(mapping)`` so
    it can never be smaller than an index that appears in an edge list.
    """
    with open(index_path) as f:
        return max(json.load(f).values(), default=-1) + 1


# Without num_nodes_dict DGL infers each node count from the largest ID that
# appears in an edge, which silently drops trailing nodes that have no edges
# and makes the graph disagree with the saved index maps.
num_nodes_dict = {
    'protein': _node_count('hgt_data/p2index.json'),
    'pathway': _node_count('hgt_data/pw2index.json'),
    'modified_residue': _node_count('hgt_data/mr2index.json'),
    'ptm_type': _node_count('hgt_data/ptm2index.json'),
    'organism': _node_count('hgt_data/org2index.json'),
    'compound': _node_count('hgt_data/cpd2index.json'),
    'reference': _node_count('hgt_data/ref2index.json'),
}

# One (source_ids, target_ids) pair per canonical (src_type, relation, dst_type).
g = dgl.heterograph({
    ('protein', 'PARTICIPATES_IN', 'pathway'): (protein_pathway['source_index'].values,
                                                protein_pathway['target_index'].values),
    ('protein', 'HAS_MODIFIED_RESIDUE', 'modified_residue'): (protein_modified_residue['source_index'].values,
                                                              protein_modified_residue['target_index'].values),
    ('modified_residue', 'HAS_PTM_TYPE', 'ptm_type'): (modified_residue_ptm_type['source_index'].values,
                                                       modified_residue_ptm_type['target_index'].values),
    ('protein', 'HAS_PPI_INTERACTION', 'protein'): (ppi['source_index'].values,
                                                    ppi['target_index'].values),
    ('modified_residue', 'HAS_REFERENCE', 'reference'): (modified_residue_reference['source_index'].values,
                                                         modified_residue_reference['target_index'].values),
    ('compound', 'PARTICIPATES_IN', 'pathway'): (compound_pathway['source_index'].values,
                                                 compound_pathway['target_index'].values),
    ('protein', 'BELONGS_TO', 'organism'): (protein_organism['source_index'].values,
                                            protein_organism['target_index'].values)
}, num_nodes_dict=num_nodes_dict)


In [None]:
# Bare expression: the notebook renders the graph's repr as the cell output.
g

In [None]:
# Input features: every node gets a random 10-dimensional vector, drawn one
# node type at a time in the order listed below.
h_dict = {
    ntype: torch.randn(g.number_of_nodes(ntype), 10)
    for ntype in (
        'protein',
        'pathway',
        'ptm_type',
        'modified_residue',
        'reference',
        'organism',
        'compound',
    )
}

## Graph Neural Network

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import dgl.nn as dglnn
import random

In [None]:
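# NOTE: disabled draft. It samples source-node IDs rather than (src, dst)
# edge pairs; the next cell builds the training edges that are actually used.
# # Get the set of nodes and existing edges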
# all_nodes = set(range(g.number_of_nodes('protein')))
# existing_edges = set(g.edges(etype='HAS_MODIFIED_RESIDUE')[0].cpu().numpy())

# # Calculate the set of nodes that do not have an edge
# non_existing_edges = all_nodes - existing_edges

# # Check if we have enough negative samples
# if len(non_existing_edges) < 50:
#     print(f"Not enough negative samples, only have {len(non_existing_edges)}")
# else:
#     neg_edges = torch.tensor(random.sample(list(non_existing_edges), 50))

#     # Your existing code for generating positive samples
#     pos_edges = torch.tensor(random.sample(list(existing_edges), 50))

#     # Concatenate positive and negative samples and create labels
#     train_edges = torch.cat([pos_edges, neg_edges], dim=0)
#     train_labels = torch.cat([torch.ones(len(pos_edges)), torch.zeros(len(neg_edges))], dim=0)


In [None]:
# Positive examples: the observed (protein, modified_residue) pairs, converted
# to plain int tuples so they can be hashed and compared cheaply.
pos_src, pos_dst = g.edges(etype='HAS_MODIFIED_RESIDUE')
positive_edges = list(zip(pos_src.tolist(), pos_dst.tolist()))
positive_set = set(positive_edges)

# HAS_MODIFIED_RESIDUE goes protein -> modified_residue, so a negative pair
# must draw its destination from the modified_residue ID range.
num_proteins = g.number_of_nodes('protein')
num_residues = g.number_of_nodes('modified_residue')

# Never ask for more negatives than there are unobserved pairs; otherwise the
# rejection-sampling loop below could not terminate.
num_negatives = min(len(positive_edges),
                    num_proteins * num_residues - len(positive_set))

# Dedicated, seeded generator: re-running the notebook yields the same
# negatives without touching the global `random` state.
sampler = random.Random(42)
negative_set = set()
negative_edges = []
while len(negative_edges) < num_negatives:
    candidate = (sampler.randrange(num_proteins), sampler.randrange(num_residues))
    if candidate not in positive_set and candidate not in negative_set:
        negative_set.add(candidate)
        negative_edges.append(candidate)

# One row per edge: column 0 is the protein ID, column 1 the modified_residue
# ID. reshape keeps the (N, 2) layout even when there are no edges at all.
train_edges = torch.tensor(positive_edges + negative_edges, dtype=torch.long).reshape(-1, 2)
train_labels = torch.tensor([1]*len(positive_edges) + [0]*len(negative_edges))

In [None]:
class SimpleGNN(nn.Module):
    """Two-layer heterogeneous GNN with one GraphConv per relation name.

    The relation names are read from the module-level graph ``g`` when the
    model is constructed. Per-relation outputs that target the same node type
    are combined with a mean.

    Parameters
    ----------
    in_dim : int
        Size of the input node features.
    hidden_dim : int
        Size of the features produced by the first layer.
    out_dim : int
        Size of the embeddings produced by the second layer.
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super(SimpleGNN, self).__init__()
        # HeteroGraphConv looks its modules up by relation name (the middle
        # element of each canonical edge type), so the dict must be keyed by
        # that name; keys such as 'protein_PARTICIPATES_IN_pathway' are never
        # found and make forward() raise KeyError. Canonical edge types that
        # share a name therefore share one module. sorted() keeps the
        # parameter registration order deterministic.
        rel_names = sorted(set(g.etypes))

        # allow_zero_in_degree=True: GraphConv otherwise raises when a
        # destination node has no incoming edge for a relation, which is
        # presumably common in these sparse relation subgraphs.
        self.conv1 = dglnn.HeteroGraphConv({
                        rel: dglnn.GraphConv(in_dim, hidden_dim,
                                             allow_zero_in_degree=True)
                        for rel in rel_names}, aggregate='mean')

        self.conv2 = dglnn.HeteroGraphConv({
                        rel: dglnn.GraphConv(hidden_dim, out_dim,
                                             allow_zero_in_degree=True)
                        for rel in rel_names}, aggregate='mean')

    def forward(self, g, h_dict):
        """Return the second-layer output for graph ``g``.

        ``h_dict`` maps node type to input features; the result is the dict
        produced by the second HeteroGraphConv layer, after a ReLU has been
        applied to every first-layer output.
        """
        h_dict = self.conv1(g, h_dict)
        h_dict = {k: F.relu(h) for k, h in h_dict.items()}
        h_dict = self.conv2(g, h_dict)
        return h_dict

In [None]:
# Initialize the model and optimizer
model = SimpleGNN(10, 20, 10)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# The positive rows of train_edges come from HAS_MODIFIED_RESIDUE
# (protein -> modified_residue): column 0 indexes protein embeddings and
# column 1 indexes modified_residue embeddings.
protein_ids = train_edges[:, 0]
residue_ids = train_edges[:, 1]

# Training loop
for epoch in range(10):
    model.train()
    # Keep the input features in h_dict untouched. Rebinding h_dict to the
    # model output would feed embeddings back in as features on the next
    # epoch and make loss.backward() fail, because those embeddings still
    # reference the previous epoch's already-freed autograd graph.
    out_dict = model(g, h_dict)

    protein_embeddings = out_dict['protein']
    residue_embeddings = out_dict['modified_residue']

    # Dot-product score between the two endpoint embeddings of every edge.
    scores = (protein_embeddings[protein_ids] * residue_embeddings[residue_ids]).sum(dim=1)

    # Compute loss
    loss = F.binary_cross_entropy_with_logits(scores, train_labels.float())

    # Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')