In [3]:
import torch
from torch.utils.data import Dataset
from utils.data import get_data
from utils.train_val_test import setup_seed
import pandas as pd
# Call the project seeding helper once at import time and keep its return value.
# NOTE(review): presumably this seeds the RNGs used below and returns the seed —
# confirm against utils.train_val_test.setup_seed.
SEED = setup_seed()

class Omics(Dataset):
    """Paired RNA / CNA samples, with PAM50 labels, read from a single CSV file.

    Indexing the dataset yields ``[rna, cna]`` for the requested sample. The
    labels are stored on ``self.pam50`` and are not part of the returned item.
    """

    def __init__(self, metabric_path):
        """Load ``metabric_path`` and cache the pre-processed arrays as tensors.

        Args:
            metabric_path: Path of the CSV file; it must contain a
                ``Pam50Subtype`` column.
        """
        raw_frame = pd.read_csv(
            metabric_path, index_col=None, header=0, low_memory=False
        )

        # Discard samples whose subtype is unknown (marked with "?").
        unknown_rows = raw_frame.index[raw_frame["Pam50Subtype"] == "?"]
        labelled_frame = raw_frame.drop(unknown_rows)

        # Project helper; the same frame is supplied for both of its arguments.
        processed = get_data(labelled_frame, labelled_frame)

        # attribute name -> (key in the helper's result, tensor dtype)
        tensor_specs = (
            ("rna", "rnanp", torch.float),
            ("cna", "cnanp", torch.float),
            ("pam50", "pam50np", torch.int),
        )
        for attribute, key, dtype in tensor_specs:
            setattr(self, attribute, torch.tensor(processed[key], dtype=dtype))

    def __len__(self):
        """Number of samples, taken from the RNA tensor."""
        n_samples = len(self.rna)
        return n_samples

    def __getitem__(self, idx):
        """Return ``[rna, cna]`` for position ``idx``."""
        rna_sample = self.rna[idx]
        cna_sample = self.cna[idx]
        return [rna_sample, cna_sample]

In [4]:
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import os
from networks.encoder_decoders import VAE, H_VAE
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

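"""
params (dict) -- keys read directly in this cell:
    lr          learning rate for every Adam optimizer
    batch_size  batch size for every DataLoader
    epochs      number of training epochs (also passed as `tolerance`)
    dense_dim   width of the dense layer (the VAEs also use dense_dim // 2)
    latent_dim  width of the H-VAE latent layer

    The input layer width is fixed at 1000 in Classifier.__init__.
    The whole dict is also forwarded to VAE / H_VAE as `params`; other keys
    such as `beta` are presumably consumed there.

data -- attributes expected on the train/test datasets:
    rna
    cna
    pam50
"""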

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Classifier:
    """Pipeline: per-modality VAEs -> H_VAE -> Gaussian Naive Bayes.

    An RNA VAE and a CNA VAE are either trained on the training set or restored
    from saved weights. Both are handed to an ``H_VAE``, whose latent space is
    used to fit a ``GaussianNB`` classifier on the ``pam50`` labels.

    Args:
        train_data: Dataset exposing ``rna``, ``cna`` and ``pam50`` attributes
            and usable directly with ``DataLoader``.
        test_data: Dataset with the same layout, used only by ``evaluate``.
        params: Dict of hyper-parameters. This class reads ``dense_dim``,
            ``latent_dim``, ``epochs``, ``batch_size`` and ``lr``; the whole
            dict is also forwarded to ``VAE`` / ``H_VAE``.
        rna_VAE_path: Optional path to saved RNA VAE weights. When given, the
            RNA VAE is loaded instead of trained.
        cna_VAE_path: Optional path to saved CNA VAE weights, same semantics.
    """

    def __init__(
        self, train_data, test_data, params, rna_VAE_path=None, cna_VAE_path=None
    ):
        super().__init__()

        self.train_data = train_data
        self.test_data = test_data
        self.params = params

        # NOTE(review): the input width is hard-coded to 1000 — presumably the
        # number of RNA / CNA features per sample; confirm against the data.
        self.rna_VAE = VAE(
            [1000, self.params["dense_dim"], self.params["dense_dim"] // 2],
            loss_fn=nn.MSELoss(reduction="mean"),
            params=self.params,
            tolerance=self.params["epochs"],
        )
        self.rna_VAE_path = rna_VAE_path

        self.cna_VAE = VAE(
            [1000, self.params["dense_dim"], self.params["dense_dim"] // 2],
            loss_fn=nn.BCEWithLogitsLoss(reduction="mean"),
            params=self.params,
            tolerance=self.params["epochs"],
        )
        self.cna_VAE_path = cna_VAE_path

        # Built in training(); evaluate() refuses to run while this is None.
        self.h_VAE = None

        self.classifier = GaussianNB()

    def _fit_or_load_vae(self, vae, modality, weights_path, output_path):
        """Train ``vae`` on one modality, or restore it from ``weights_path``.

        Args:
            vae: The VAE instance to train or to load weights into.
            modality: ``"rna"`` or ``"cna"``; names the attribute of
                ``self.train_data`` used for training and the saved file.
            weights_path: Saved state dict to load, or ``None`` to train.
            output_path: Directory for the trained weights, or a falsy value to
                skip saving. Only used when training.

        Returns:
            The VAE, moved to ``DEVICE``.
        """
        label = modality.upper()

        if weights_path is None:
            print(f"Training {label} VAE")
            # The modality tensor is looked up only here, so a pre-trained run
            # never touches it.
            loader = DataLoader(
                getattr(self.train_data, modality),
                batch_size=self.params["batch_size"],
                shuffle=False,
            )
            vae = vae.to(DEVICE)
            optimizer = torch.optim.Adam(vae.parameters(), lr=self.params["lr"])
            vae.train_loop(loader, optimizer, self.params["epochs"])

            if output_path:
                # Create the directory first so a long training run is not lost
                # to a missing folder at save time.
                os.makedirs(output_path, exist_ok=True)
                torch.save(
                    vae.state_dict(),
                    os.path.join(output_path, f"{modality}_VAE.pth"),
                )
            return vae

        print(f"Loading {label} VAE")
        # map_location lets weights saved on a GPU be restored on a CPU-only host.
        vae.load_state_dict(torch.load(weights_path, map_location=DEVICE))
        # Keep the loaded model on the same device as a freshly trained one.
        return vae.to(DEVICE)

    def training(self, output_path=None):
        """Fit the whole pipeline on ``self.train_data``.

        Args:
            output_path: Optional directory in which newly trained RNA / CNA VAE
                weights are saved as ``rna_VAE.pth`` / ``cna_VAE.pth``. The
                directory is created if it does not exist.
        """
        self.rna_VAE = self._fit_or_load_vae(
            self.rna_VAE, "rna", self.rna_VAE_path, output_path
        )
        self.cna_VAE = self._fit_or_load_vae(
            self.cna_VAE, "cna", self.cna_VAE_path, output_path
        )

        print("Training H-VAE")
        # shuffle=False keeps the latent rows in dataset order, so they line up
        # with self.train_data.pam50 when the classifier is fitted below.
        h_loader = DataLoader(
            self.train_data, batch_size=self.params["batch_size"], shuffle=False
        )
        self.h_VAE = H_VAE(
            self.rna_VAE,
            self.cna_VAE,
            [self.params["dense_dim"], self.params["latent_dim"]],
            loss_fn=nn.MSELoss(reduction="mean"),
            params=self.params,
            tolerance=self.params["epochs"],
        ).to(DEVICE)
        h_optimizer = torch.optim.Adam(self.h_VAE.parameters(), lr=self.params["lr"])
        self.h_VAE.train_loop(h_loader, h_optimizer, self.params["epochs"])

        print("Training Naive Bayes")
        # NOTE(review): assumes get_latent_space returns something scikit-learn
        # accepts (e.g. a CPU array) — confirm in networks.encoder_decoders.
        latent_train = self.h_VAE.get_latent_space(h_loader)
        self.classifier.fit(latent_train, self.train_data.pam50)

    def evaluate(self):
        """Score the fitted pipeline on the training and test sets.

        Returns:
            ``[train_accuracy, test_accuracy]``.

        Raises:
            RuntimeError: If called before ``training()``.
        """
        if self.h_VAE is None:
            raise RuntimeError(
                "evaluate() called before training(): the H-VAE has not been built"
            )

        acc_list = []
        for data in (self.train_data, self.test_data):
            # Unshuffled so predictions stay aligned with data.pam50.
            h_loader = DataLoader(
                data, batch_size=self.params["batch_size"], shuffle=False
            )
            latent = self.h_VAE.get_latent_space(h_loader)

            pred = self.classifier.predict(latent)
            acc_list.append(accuracy_score(data.pam50, pred))

        return acc_list

In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB


# --- Configuration -----------------------------------------------------------
N_FOLDS = 5
fold_dir = "data/5-fold_pam50stratified/"
file_name = "MBdata_33CLINwMiss_1KfGE_1KfCNA"

output_path = "output/models/h_vae"
# These checkpoint paths are not passed to Classifier below, so every fold
# trains its RNA / CNA VAEs from scratch.
rna_VAE_path = "output/models/h_vae/rna_VAE.pth"
cna_VAE_path = "output/models/h_vae/cna_VAE.pth"

# Not used in this cell (Classifier builds its own GaussianNB); kept in case a
# later cell refers to it.
nb = GaussianNB()
accTrain_list = []
accTest_list = []

# `nn` comes from the `import torch.nn as nn` in the Classifier cell above.
params = {
    "dropout": 0.2,
    "lr": 0.001,
    "epochs": 150,
    "batch_size": 64,
    "activation_fn": nn.ELU(),
    "dense_dim": 256,
    "latent_dim": 64,
    "beta": 50,
    "regularisation": "mmd",
}

# Create the checkpoint directory before any training starts; otherwise the
# first fold would train to completion and only then fail when saving.
os.makedirs(output_path, exist_ok=True)

# --- Cross-validation --------------------------------------------------------
for i in range(1, N_FOLDS + 1):
    train_data_path = os.path.join(fold_dir, f"fold{i}", file_name + "_train.csv")
    test_data_path = os.path.join(fold_dir, f"fold{i}", file_name + "_test.csv")

    train_omics = Omics(train_data_path)
    test_omics = Omics(test_data_path)

    classifier = Classifier(train_omics, test_omics, params)
    # Every fold saves into the same directory, so rna_VAE.pth / cna_VAE.pth
    # hold the weights of the most recently trained fold.
    classifier.training(output_path)
    accTrain, accTest = classifier.evaluate()

    print(f"Fold {i} - Train Acc: {accTrain}, Test Acc: {accTest}")

    accTrain_list.append(accTrain)
    accTest_list.append(accTest)

# --- Summary -----------------------------------------------------------------
meanAccTrain = np.array(accTrain_list).mean()
meanAccTest = np.array(accTest_list).mean()
print(f"Overall - Mean Train Acc: {meanAccTrain}, Mean Test Acc: {meanAccTest}")

Training RNA VAE
Epoch: 000, Train: 0.1908, Val: 0.2275
Epoch: 020, Train: 0.0163, Val: 0.0135
Epoch: 040, Train: 0.0163, Val: 0.0160
Epoch: 060, Train: 0.0101, Val: 0.0091
Epoch: 080, Train: 0.0170, Val: 0.0130
Epoch: 100, Train: 0.0099, Val: 0.0086
Epoch: 120, Train: 0.0113, Val: 0.0124
Epoch: 140, Train: 0.0076, Val: 0.0078
Training CNA VAE
Epoch: 000, Train: 0.6928, Val: 0.7038
Epoch: 020, Train: 0.6297, Val: 0.6432
Epoch: 040, Train: 0.6260, Val: 0.6442
Epoch: 060, Train: 0.6237, Val: 0.6341
Epoch: 080, Train: 0.6219, Val: 0.6467
Epoch: 100, Train: 0.6208, Val: 0.6306
Epoch: 120, Train: 0.6205, Val: 0.6406
Epoch: 140, Train: 0.6194, Val: 0.6331
Training H-VAE
Epoch: 000, Train: 3.8202, Val: 1.0891
Epoch: 020, Train: 1.8410, Val: 0.3566
Epoch: 040, Train: 1.7408, Val: 0.3355
Epoch: 060, Train: 1.7294, Val: 0.3226
Epoch: 080, Train: 1.6848, Val: 0.3171
Epoch: 100, Train: 1.6855, Val: 0.3136
Epoch: 120, Train: 1.6792, Val: 0.3119
Epoch: 140, Train: 1.6795, Val: 0.3097
Training Naive 

KeyboardInterrupt: 

Overall - Mean Train Acc: 0.6298119303196223, Mean Test Acc: 0.6215793870076463