In [3]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, GATv2Conv
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.logging import log
from torch_geometric.utils import to_edge_index
from sklearn.metrics import accuracy_score, f1_score
import random

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
SEED = 42

def setup_seed(seed=SEED):
    """
    setup seed to make the experiments deterministic

    Parameters:
        seed(int) -- the random seed

    @source https://github.com/zhangxiaoyu11/OmiEmbed
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True


setup_seed(42)

def get_edge_index(X, threshold=0.005):
    if threshold:
        X[X < threshold] = 0

    return to_edge_index((torch.tensor(X, dtype=torch.float).to_sparse()))

In [4]:
df_latent = pd.read_csv("./data/MoGCN_results/latent_data.csv")
x = df_latent.iloc[:, 1:].values

df_results = pd.read_csv("./data/TCGA/sample_classes.csv")
y = df_results["class"].to_list()

In [5]:
counts = df_results[["PAM50Call_RNAseq","class"]].value_counts()
counts = counts.reset_index().sort_values(["class"])
labels = counts["PAM50Call_RNAseq"].to_list()
labels

['LumA', 'LumB', 'Basal', 'Her2']

In [6]:
df_snf = pd.read_csv("./data/MoGCN_results/SNF_fused_matrix.csv")
adj = df_snf.iloc[:,1:].values
np.fill_diagonal(adj, 0)
edge_index, edge_attr = get_edge_index(adj)

In [48]:
edge_attr.shape

torch.Size([866])

In [7]:
list_test = pd.read_csv("./data/TCGA/test_sample.csv")
test_mask = np.array(df_latent["Sample"].isin(list_test["Sample_ID"]))

list_train = pd.read_csv("./data/TCGA/train_sample.csv")
train_mask = np.array(df_latent["Sample"].isin(list_train["Sample_ID"]))

In [43]:
dataset = Data(
    x=torch.tensor(x, dtype=torch.float32),
    # edge_index = torch.tensor([[], []], dtype=torch.long),
    edge_index=edge_index,
    edge_attr=edge_attr,
    y=torch.tensor(y, dtype=torch.long),
)

dataset.train_mask = train_mask
dataset.test_mask = test_mask

In [52]:
class GNN(nn.Module):
    def __init__(self) -> None:
        super().__init__()

    def train_loop(self, data, optimizer, epochs=100):
        for epoch in range(1, epochs + 1):
            self.train()
            optimizer.zero_grad()

            pred = self.forward(data.x, data.edge_index, data.edge_attr)
            loss = F.cross_entropy(pred[train_mask], data.y[train_mask])
            loss.backward()
            optimizer.step()

            train_acc, train_f1 = self.validate(data, data.train_mask)
            val_acc, val_f1 = self.validate(data, data.test_mask)

            log(
                Epoch=epoch,
                Loss=loss,
                Train_Acc=train_acc,
                Val_Acc=val_acc,
                Train_f1=train_f1,
                Val_f1=val_f1,
            )

    @torch.no_grad()
    def validate(self, data, mask):
        self.eval()
        pred = self.forward(data.x, data.edge_index, data.edge_attr)
        pred = pred.argmax(dim=-1)

        f1 = f1_score(
            data.y[mask].cpu().numpy(),
            pred[mask].cpu().numpy(),
            average="macro",
        )
        acc = accuracy_score(data.y[mask].cpu().numpy(), pred[mask].cpu().numpy())

        return acc, f1


class GCN(GNN):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        # self.fc = GCNConv(hidden_channels, out_channels)
        self.fc = nn.Linear(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index, edge_weight)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc(x)
        # x = self.fc(x, edge_index, edge_weight)

        return x


class GAT(GNN):
    def __init__(self, in_channels, hidden_channels, out_channels, dropout=0.5, n_heads=8):
        super().__init__()
        self.conv1 = GATv2Conv(in_channels, hidden_channels)
        self.conv2 = GATv2Conv(hidden_channels, hidden_channels)
        self.fc = GATv2Conv(hidden_channels, out_channels)
        self.dropout = dropout

    def forward(self, x, edge_index, edge_weight=None):
        x = self.conv1(x, edge_index, edge_weight)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        # x = self.fc(x)
        x = self.fc(x, edge_index, edge_weight)

        return x


In [53]:
input_dim = dataset.x.shape[1]
hidden_dim = 64
n_classes = len(labels)
dropout = 0.5
lr = 0.001
wd = 0.01

model = GCN(input_dim, hidden_dim, n_classes, dropout)
optimizer = torch.optim.Adam(model.parameters(),lr=lr, weight_decay=wd)

model.to(DEVICE)
data = dataset.to(DEVICE)

In [54]:
model.train_loop(data, optimizer, 100)

Epoch: 001, Loss: 1.4276748895645142, Train_Acc: 0.4831, Val_Acc: 0.5147, Train_f1: 0.1695, Val_f1: 0.1699
Epoch: 002, Loss: 1.3867160081863403, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 003, Loss: 1.3300784826278687, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 004, Loss: 1.2930636405944824, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 005, Loss: 1.2810801267623901, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 006, Loss: 1.247971773147583, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 007, Loss: 1.2436822652816772, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 008, Loss: 1.1955045461654663, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 009, Loss: 1.1963462829589844, Train_Acc: 0.4808, Val_Acc: 0.5147, Train_f1: 0.1623, Val_f1: 0.1699
Epoch: 010, Loss: 1.1959319114685059, 

# MLP

In [None]:
class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_classes, dropout=0.2):
        super().__init__()

        self.fc1 = nn.Sequential(nn.Linear(input_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.Sigmoid())

        self.fc2 = nn.Sequential(nn.Linear(hidden_dim, hidden_dim), nn.BatchNorm1d(hidden_dim), nn.Sigmoid())
        
        self.fc_cls = nn.Linear(hidden_dim, n_classes)

        self.dropout = dropout

    def forward(self, x):
        x = self.fc1(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc2(x)
        x = F.elu(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc_cls(x)

        return x
    
    def train_loop(self, dataloader, optimizer, epochs=100):
        for epoch in range(1, epochs + 1):
            self.train()

            for batch_idx, (x, y) in enumerate(dataloader):
                optimizer.zero_grad()
                x = x.to(DEVICE)
                pred = self.forward(x)
                
                loss = F.cross_entropy(pred, y)
                loss.backward()
                optimizer.step()

                # train_acc, train_f1 = self.validate(data, data.train_mask)
                # val_acc, val_f1 = self.validate(data, data.test_mask)
                print(loss)
                # log(
                #     Epoch=epoch,
                #     Loss=loss,
                #     Train_Acc=train_acc,
                #     Val_Acc=val_acc,
                #     Train_f1=train_f1,
                #     Val_f1=val_f1,
                # )

    @torch.no_grad()
    def validate(self, data, mask):
        self.eval()
        pred = self.forward(data.x, data.edge_index, data.edge_attr)
        pred = pred.argmax(dim=-1)

        f1 = f1_score(
            data.y[mask].cpu().numpy(),
            pred[mask].cpu().numpy(),
            average="macro",
        )
        acc = accuracy_score(data.y[mask].cpu().numpy(), pred[mask].cpu().numpy())

        return acc, f1

In [213]:
latent = torch.tensor(x, dtype=torch.float32)
pam50 = torch.tensor(y, dtype=torch.long)

input_dim = dataset.x.shape[1]
hidden_dim = 64
n_classes = len(labels)
dropout = 0.5
lr = 0.001
wd = 0.01

model = MLP(input_dim, hidden_dim, n_classes, dropout)
optimizer = torch.optim.Adam(model.parameters(),lr=lr, weight_decay=wd)

model.to(DEVICE)
data = dataset.to(DEVICE)

In [214]:
model.train_loop(zip(latent,pam50), optimizer, 100)

ValueError: expected 2D or 3D input (got 1D input)