In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import dgl
from dgl.nn import Set2Set
from data import read_omics, read_clin, build_graph, read_pathways
from util import evaluate, check_files, clin_process_tsi

In [2]:
clin_process_tsi(in_file='./data/GBM/Clinical.tsi', out_file='./data/GBM/clinincal.csv')

from data import read_omics, read_clin
omics_files = ['./data/GBM/GBM.cnv.csv.gz', './data/GBM/GBM.expression.csv.gz', './data/GBM/GBM.met.csv.gz']
omics = read_omics(omics_files=omics_files, clin_file= './data/GBM/clinincal.csv')

from data import build_graph

graphs, labels, clin_features, id_mapping = build_graph(omics=omics, clinical_file='./data/GBM/clinincal.csv')


[INFO] The overlaping genes number between omics and ppi dataset is: 8245


In [3]:
from sklearn.model_selection import train_test_split

graphs_train, graphs_test, lables_train, labels_test, clin_features_train, clin_features_test = train_test_split(graphs, labels, clin_features, test_size=0.2, random_state=42)

len(graphs_train), len(graphs_test), len(lables_train), len(labels_test), len(clin_features_train), len(clin_features_test)

(176, 44, 176, 44, 176, 44)

In [5]:
graphs_train, graphs_test, lables_train, labels_test, clin_features_train, clin_features_test = train_test_split(graphs, labels, clin_features, test_size=0.2, random_state=42)
clin_features_train[0]

tensor([[-0.0108],
        [ 0.4866],
        [-0.0674],
        [-0.2978],
        [ 0.3068],
        [-0.8224],
        [ 0.8224],
        [-0.3572],
        [-2.4635],
        [-0.1671],
        [-0.2177],
        [ 0.3068],
        [-0.1358],
        [ 0.4631]], dtype=torch.float64)

In [9]:
from main import batch_idx
for idx in batch_idx(graphs=graphs_train, minibatch=16):
    clin_features_train[idx]
    print(idx)
    print(clin_features_train[idx])
    break

[103, 20, 11, 86, 39, 72, 96, 9, 132, 82, 119, 107, 129, 21, 117, 57]
tensor([[[ 0.8690],
         [ 0.2318],
         [-0.0674],
         [-0.2978],
         [ 0.3068],
         [ 1.2105],
         [-1.2105],
         [-0.3572],
         [ 0.4041],
         [-0.1671],
         [-0.2177],
         [ 0.3068],
         [-0.1358],
         [ 0.4631]],

        [[ 0.2599],
         [ 0.2123],
         [-0.0674],
         [-0.2978],
         [ 0.3068],
         [-0.8224],
         [ 0.8224],
         [-0.3572],
         [ 0.4041],
         [-0.1671],
         [-0.2177],
         [ 0.3068],
         [-0.1358],
         [ 0.4631]],

        [[-0.2138],
         [-1.2158],
         [-0.0674],
         [-0.2978],
         [ 0.3068],
         [-0.8224],
         [ 0.8224],
         [-0.3572],
         [ 0.4041],
         [-0.1671],
         [-0.2177],
         [ 0.3068],
         [-0.1358],
         [-2.1494]],

        [[ 0.2599],
         [ 0.7736],
         [-0.0674],
         [-0.2978],
    

In [3]:
pathways = read_pathways(id_mapping=id_mapping, file="./Pathway/pathway_genes.gmt")

In [101]:
import sys, os, argparse, warnings
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.nn.pytorch as dglnn
from dgl.nn import SumPooling
from data import read_omics, read_clin, build_graph, read_pathways
from util import evaluate, check_files

class DeepMOI(nn.Module):
    def __init__(self, in_dim, pathway, clinical_feature_num=None):
        """
        in_dim: == omics' number
        hidden_dim: == 
        """
        super(DeepMOI, self).__init__()
        # GNN
        self.gin_lin1 = torch.nn.Linear(in_dim, in_dim*2)
        self.conv1 = dglnn.GINConv(self.gin_lin1, 'sum')

        self.gin_lin2 = torch.nn.Linear(in_dim*2, in_dim)
        self.conv2 = dglnn.GINConv(self.gin_lin2)
        
        # MLP
        self.lin1 = nn.Linear(len(pathway)*in_dim*2, len(pathway))
        if clinical_feature_num == None:
            self.lin2 = nn.Linear(len(pathway), 1)  # not including clinical features 
        else:
            self.lin2 = nn.Linear(len(pathway) + clinical_feature_num, 1)  # including clinical features
        self.pathway = pathway

    def forward(self, g, h, c=None):
        # subnetwork1: GRL layers
        h = F.relu(self.conv1(g, h))
        h = F.relu(self.conv2(g, h))

        # subnetwork2: patyway layers
        with g.local_scope():
            g.ndata['h'] = h

            # global pooling with Set2Set: output dim = 2*node_dim
            subgraphs = [dgl.node_subgraph(g, n) for n in self.pathway.values()]
            graphs_ = dgl.batch(subgraphs)
            readout1 =  Set2Set(3, 2, 1)(graphs_, graphs_.ndata['h'])
            readout1 = readout1.reshape(1,-1).squeeze(0)
            
            # linear-1
            x = nn.ReLU()(self.lin1(readout1))
            
            if c != None:
                x = torch.cat([x, c], dim=0)
            
            # linear-2
            logit = nn.Sigmoid()(self.lin2(x))

            return logit

        
### test
idx = 0
model = DeepMOI(in_dim=3, pathway=pathways, clinical_feature_num=14)
g, h = graphs[idx], graphs[idx].ndata['h']
clinical_feature = clin_features[idx].reshape(1,-1).squeeze(0).to(torch.float32)
model(g, h, c=clinical_feature)

tensor([0.], grad_fn=<SigmoidBackward>)

array([1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [106]:
model.eval()
logits = []
for i in range(len(graphs_test)):
    g = graphs_test[i]
    c = clin_features_test[i].reshape(1,-1).squeeze(0).to(torch.float32)
    logit = model(g,g.ndata['h'], c)
    logits.append(logit.detach().numpy())
logits = np.concatenate(logits)
evaluate(logits, labels_test.detach().numpy())

array([0.72712266, 0.7647083 , 0.7730331 , 0.58971685, 0.72855693,
       0.7221746 , 0.8073716 , 0.67126787, 0.6978372 , 0.7623402 ,
       0.6673488 , 0.77350944, 0.80925167, 0.7111525 , 0.7221822 ,
       0.7229939 , 0.82042336, 0.7831623 , 0.7677239 , 0.74911755,
       0.751095  , 0.81540525, 0.78435135, 0.76604235, 0.7700748 ,
       0.77543247, 0.70801973, 0.76717377, 0.80196357, 0.7227293 ,
       0.71504444, 0.8019791 , 0.66941315, 0.7413178 , 0.82555   ,
       0.7465379 , 0.71073383, 0.74815136, 0.7835484 , 0.76205635,
       0.753549  , 0.74834424, 0.71913517, 0.8510656 ], dtype=float32)

In [109]:
from util import evaluate


(0.8409090909090909, 0.7181467181467182, 0.9135802469135803, 1.0, 0.0)

In [161]:
logits_epoch, labels_epoch, loss_epoch = [], [], [] # for training dataset evaluation

opt = torch.optim.Adam(model.parameters())
opt.zero_grad()
for i in range(8):
    logit  = model(g, h)
    loss = nn.BCELoss()(logit, label.unsqueeze(0).float())
    loss = loss/3
    loss.backward()
    logits_epoch.append(logit.detach().numpy())
    loss_epoch.append(loss.item())
opt.step()
opt.zero_grad()

---

In [32]:
clinical_feature = clin_features[0].reshape(1,-1).squeeze(0)
clinical_feature.to(torch.float32)

tensor([58.0000,  0.7653,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000,  0.0000,
         1.0000,  0.0000,  0.0000,  1.0000,  0.0000,  1.0000])

In [28]:
torch.cat([clinical_feature, clinical_feature], dim=1)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)