In [2]:
# Import kamping library before starting the tutorial
import kamping

%load_ext autoreload
%autoreload 2

In [3]:
gene_graphs = kamping.create_graphs('../data/kgml_hsa', type='mixed', verbose=True, ignore_file=['hsa01100.xml'])


            Visit https://www.kegg.jp/kegg-bin/show_pathway?hsa00190 for pathway details.

            There are likely no edges in which to parse...
INFO:KeggGraph:Now parsing: path:hsa00220...
INFO:KeggGraph:Graph path:hsa00220 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00230...
INFO:KeggGraph:Graph path:hsa00230 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00232...
INFO:KeggGraph:Graph path:hsa00232 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00240...
INFO:KeggGraph:Graph path:hsa00240 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00250...
INFO:KeggGraph:Graph path:hsa00250 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00260...
INFO:KeggGraph:Graph path:hsa00260 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00270...
INFO:KeggGraph:Graph path:hsa00270 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00280...
INFO:KeggGraph:Graph path:hsa00280 parsed successfully!
INFO:KeggGraph:Now parsing: path:hsa00290

In [4]:
gene_graph_00010 = [graph for graph in gene_graphs if graph.name == 'path:hsa00010'][0]
gene_graph_00010

KEGG Pathway: 
            [Title]: Glycolysis / Gluconeogenesis
            [Name]: path:hsa00010
            [Org]: hsa
            [Link]: https://www.kegg.jp/kegg-bin/show_pathway?hsa00010
            [Image]: https://www.kegg.jp/kegg/pathway/hsa/hsa00010.png
            [Link]: https://www.kegg.jp/kegg-bin/show_pathway?hsa00010
            Graph type: mixed 
            Number of Genes: 67
            Number of Compounds: 26
            Gene ID type : kegg
            Compound ID type : kegg
            Number of Nodes: 93
            Number of Edges: 279

In [5]:
converter = kamping.Converter('hsa', gene_target='uniprot', verbose=True)

In [6]:
for graph in gene_graphs:
    converter.convert(graph)

INFO:kamping.parser.convert:Conversion of path:hsa00010 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00020 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00030 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00040 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00051 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00052 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00053 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00061 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00062 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00071 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00100 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00120 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00130 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00140 complete!
INFO:kamping.parser.convert:Conversion of path:hsa00220 complete!
INFO:kampi

In [7]:
import pandas as pd

# uncommented code below if run the first time
# save the mols to a file
# mols.to_pickle('data/mols.pkl')
# retrieve mol from file
mols = pd.read_pickle('data/mols.pkl')
mol_embeddings = kamping.get_mol_embeddings_from_dataframe(mols, transformer='morgan')

'
                    total 231 Invalid rows with "None" in the ROMol column


In [8]:
mol_embeddings

{'cpd:C00038': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C01180': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C20683': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C02593': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C00286': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C03564': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C05452': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C00603': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C05443': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C06157': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C00055': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C04487': array([0., 1., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C05294': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C01674': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 'cpd:C16549': array([0., 1., 0., ..., 0., 0., 0

In [9]:
protein_embeddings = kamping.get_uniprot_protein_embeddings(gene_graphs, '../data/embedding/protein_embedding.h5') 
protein_embeddings

{'up:O43900': array([-0.00197182,  0.03852098,  0.00117973, ...,  0.02317672,
        -0.00850552,  0.03396352], dtype=float32),
 'up:Q9C0C4': array([-0.0193558 ,  0.03834414,  0.02251088, ..., -0.01327153,
         0.03147129,  0.03128346], dtype=float32),
 'up:Q14814': array([ 0.06546359,  0.07149354,  0.05479294, ..., -0.02488449,
        -0.0332847 , -0.00036658], dtype=float32),
 'up:Q9H0I9': array([ 0.05965632,  0.05845138,  0.00416512, ..., -0.04447282,
         0.0489706 ,  0.03225573], dtype=float32),
 'up:P24864': array([-0.00073646,  0.01091676,  0.0028319 , ...,  0.01535806,
        -0.06121586,  0.01390162], dtype=float32),
 'up:Q8NGC1': array([-0.04732009,  0.06667076,  0.01079357, ..., -0.0313393 ,
         0.02712163,  0.01280636], dtype=float32),
 'up:U5ZC31': array([ 0.0004784 ,  0.06464517, -0.01709263, ...,  0.05403793,
        -0.05003293,  0.03100288], dtype=float32),
 'up:Q01362': array([-0.03575023, -0.00711749,  0.04216428, ..., -0.04142374,
         0.02649667

In [10]:
# combine protein embeddings and metabolite embeddings into one dictionary
embeddings = {**protein_embeddings, **mol_embeddings}
len(embeddings)

8837

In [11]:
pyg_graph = kamping.convert_to_single_pyg(gene_graphs, embeddings=embeddings)
data= pyg_graph
data

  hetero_data_dict[group][key] = torch.tensor(value)
  hetero_data_dict[group][key] = torch.tensor(value)


HeteroData(
  name='combined',
  type='mixed',
  compound={ x=[1432, 1024] },
  gene={ x=[7405, 1024] },
  (compound, to, compound)={ edge_index=[2, 362] },
  (compound, to, gene)={ edge_index=[2, 8789] },
  (gene, to, compound)={ edge_index=[2, 6955] },
  (gene, to, gene)={ edge_index=[2, 77375] }
)

In [12]:
# # del data['compound']
# # del data[('gene', 'to', 'gene')]
# # del data[('gene', 'to', 'compound')]
# # del data[('compound', 'to', 'gene')]
# # del data[('compound', 'to', 'compound')]
# data

In [13]:
# to undirected graph
from torch_geometric.transforms.to_undirected import ToUndirected
# transform = ToUndirected()
# data = transform(data)

In [14]:
import torch_geometric.transforms as T
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    is_undirected=True,
    # disjoint_train_ratio=0.3, # TODO
    neg_sampling_ratio=1.0, # TODO
    add_negative_train_samples=False,
    edge_types=("gene", "to", "gene")
)
train_data, val_data, test_data = transform(data)

In [26]:
val_data[('gene', 'to', 'gene')]

{'edge_index': tensor([[1717, 3076, 3077,  ..., 6011, 7215, 3731],
        [5493, 7239, 7171,  ..., 2196, 3077, 2067]]), 'edge_label': tensor([1., 1., 1.,  ..., 0., 0., 0.]), 'edge_label_index': tensor([[2834, 1785, 4523,  ...,  875, 1047, 4735],
        [2956, 1837, 4527,  ...,  203, 1000, 4142]])}

In [15]:
import torch

In [16]:
from torch.utils.data import random_split
import torch.nn as nn
from torch_geometric.nn import GCNConv, GAE, GATConv, Linear, to_hetero, SAGEConv
from torch_geometric.loader import DataLoader
import torch
from tqdm import tqdm
import torch_geometric.utils as utils
import torch.nn.functional as F

class GCNEncoder(torch.nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv(in_channels, 2 * out_channels)
        self.conv2 = SAGEConv(2 * out_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        return self.conv2(x, edge_index)


In [18]:
from sklearn.metrics import roc_auc_score

In [19]:
def train(model, optimizer, data):

    z_dict = model(data.x_dict, data.edge_index_dict)

    pos_edge_label_index = data[('gene', 'to', 'gene')].edge_label_index
    pos_edge_label = torch.ones(pos_edge_label_index.size(1))

    neg_edge_label_index = utils.negative_sampling(edge_index=pos_edge_label_index, #positive edges
                                                    num_nodes=data['gene'].x.size(0), # number of nodes
                                                    num_neg_samples=pos_edge_label_index.size(1))
    neg_edge_label = torch.zeros(neg_edge_label_index.size(1))

    edge_label_index = torch.cat([pos_edge_label_index, neg_edge_label_index], dim=1)
    edge_label = torch.cat([pos_edge_label, neg_edge_label], dim=0)

    z_src = z_dict['gene'][edge_label_index[0]]
    z_dst = z_dict['gene'][edge_label_index[1]]

    recon = (z_src * z_dst).sum(dim=-1)
    loss = F.binary_cross_entropy_with_logits(recon, edge_label)
    # calculate AUC
    auc = roc_auc_score(edge_label.cpu().detach().numpy(), recon.cpu().detach().numpy())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item(), auc


In [45]:
@torch.no_grad()
def validate(model, data):
    z_dict = model(data.x_dict, data.edge_index_dict)
    edge_label_index = data[('gene', 'to', 'gene')].edge_label_index
    edge_label = data[('gene', 'to', 'gene')].edge_label
    z_src = z_dict['gene'][edge_label_index[0]]
    z_dst = z_dict['gene'][edge_label_index[1]]

    recon = (z_src * z_dst).sum(dim=-1)
    loss = F.binary_cross_entropy_with_logits(recon, edge_label)
    # calculate AUC
    auc = roc_auc_score(edge_label.cpu().detach().numpy(), recon.cpu().detach().numpy())

    return loss.item(), auc


In [46]:
@torch.no_grad()
def test(model, data):
    z_dict = model(data.x_dict, data.edge_index_dict)
    edge_label_index = data[('gene', 'to', 'gene')].edge_label_index
    edge_label = data[('gene', 'to', 'gene')].edge_label
    z_src = z_dict['gene'][edge_label_index[0]]
    z_dst = z_dict['gene'][edge_label_index[1]]

    recon = (z_src * z_dst).sum(dim=-1)
    loss = F.binary_cross_entropy_with_logits(recon, edge_label)
    # calculate AUC
    auc = roc_auc_score(edge_label.cpu().detach().numpy(), recon.cpu().detach().numpy())

    return loss.item(), auc


In [47]:
def main():
    num_epochs = 100
    batch_size = 10
    in_channels, out_channels = 1024, 10
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = GCNEncoder(in_channels, out_channels)
    model = to_hetero(model, data.metadata(), aggr='sum')
    model = model.to(device)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

    for epoch  in range(num_epochs):
        train_loss, train_auc = train(model, optimizer, train_data)
        val_loss, val_auc = validate(model, val_data)
        test_loss, test_auc = test(model, test_data)
        print(f'epoch: {epoch}, Train Loss: {train_loss:.4f}, Val AUC: {val_auc:.4f}, Test AUC: {test_auc:.4f}')



In [48]:
data

HeteroData(
  name='combined',
  type='mixed',
  compound={ x=[1432, 1024] },
  gene={ x=[7405, 1024] },
  (compound, to, compound)={ edge_index=[2, 362] },
  (compound, to, gene)={ edge_index=[2, 8789] },
  (gene, to, compound)={ edge_index=[2, 6955] },
  (gene, to, gene)={ edge_index=[2, 77375] }
)

In [49]:
main()

epoch: 0, Train Loss: 0.7056, Val AUC: 0.7989, Test AUC: 0.8051
epoch: 1, Train Loss: 0.6615, Val AUC: 0.8778, Test AUC: 0.8750
epoch: 2, Train Loss: 0.7065, Val AUC: 0.8499, Test AUC: 0.8489
epoch: 3, Train Loss: 0.6160, Val AUC: 0.8058, Test AUC: 0.8122
epoch: 4, Train Loss: 0.6183, Val AUC: 0.8426, Test AUC: 0.8495
epoch: 5, Train Loss: 0.5745, Val AUC: 0.8846, Test AUC: 0.8866
epoch: 6, Train Loss: 0.5338, Val AUC: 0.9060, Test AUC: 0.9053
epoch: 7, Train Loss: 0.5413, Val AUC: 0.9139, Test AUC: 0.9131
epoch: 8, Train Loss: 0.5394, Val AUC: 0.9166, Test AUC: 0.9168
epoch: 9, Train Loss: 0.5134, Val AUC: 0.9197, Test AUC: 0.9212
epoch: 10, Train Loss: 0.4956, Val AUC: 0.9222, Test AUC: 0.9251
epoch: 11, Train Loss: 0.4901, Val AUC: 0.9231, Test AUC: 0.9264
epoch: 12, Train Loss: 0.4879, Val AUC: 0.9277, Test AUC: 0.9301
epoch: 13, Train Loss: 0.4800, Val AUC: 0.9365, Test AUC: 0.9384
epoch: 14, Train Loss: 0.4710, Val AUC: 0.9430, Test AUC: 0.9446
epoch: 15, Train Loss: 0.4690, Val 