In [1]:
from pytorch_lightning import Trainer, seed_everything

seed_everything(42)

from pytorch_lightning import LightningModule, LightningDataModule
import torch

Global seed set to 42


In [2]:
from pytorch_lightning import trainer, LightningModule
from torch.nn import functional as F
import torch
import torchmetrics
import timm
from cadFace.loss import FocalLoss
from torch import nn
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

In [3]:
import pandas as pd

train_imputed = pd.read_pickle("1_train_imputed.pkl")
test_imputed = pd.read_pickle("1_test_imputed.pkl")

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from torch.utils.data import (
    DataLoader,
    SequentialSampler,
    RandomSampler,
    WeightedRandomSampler,
    Dataset,
)
import numpy as np


class TableDataset(Dataset):
    def __init__(self, df, features: list, label: list, num_classes=2, y_type="bt"):
        super(Dataset, self).__init__()
        assert isinstance(df, pd.DataFrame)
        assert isinstance(features, list)
        assert isinstance(label, list)

        for feature in features + label:
            assert feature in df.columns

        self.df = df.dropna(subset=features + label)
        assert len(self.df) > 0
        self.features = features
        self.label = label
        self.num_classes = num_classes
        self.y_type = y_type
        self._init_dataset()

    def _init_dataset(self):
        X = torch.tensor(self.df[self.features].values).float()

        y = torch.tensor(self.df[self.label].values)
        if (self.num_classes != len(self.label)) and self.y_type == "bt":
            y = F.one_hot(
                torch.tensor(y).long(), num_classes=self.num_classes
            ).squeeze()

        self.X = X
        self.y = y

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class DatasetModule(LightningDataModule):
    def __init__(
        self,
        train,
        test,
        batch_size=32,
        features: list = None,
        label: list = None,
        num_classes=2,
        y_type="bt",
        num_workers=4,
    ):
        super().__init__()

        self.batch_size = batch_size
        self.features = features
        self.label = label
        self.num_classes = num_classes
        self.y_type = y_type
        self.num_workers = num_workers

        self._init_dataset(train, test)

    def _init_dataset(self, train, test):
        train, val = train_test_split(train, test_size=0.2)
        print(
            f"Train : {train[self.label].value_counts()}\nval : {val[self.label].value_counts()}\nTest : {test[self.label].value_counts()}"
        )
        if self.y_type == "bt" and len(self.label) == 1:

            class_weights = dict(
                enumerate(
                    class_weight.compute_class_weight(
                        "balanced",
                        classes=np.arange(self.num_classes),
                        y=train[self.label[0]],
                    )
                )
            )
            self.class_weights = class_weights

        self.train = TableDataset(
            train, self.features, self.label, self.num_classes, self.y_type
        )
        self.validation = TableDataset(
            val, self.features, self.label, self.num_classes, self.y_type
        )
        self.test = TableDataset(
            test, self.features, self.label, self.num_classes, self.y_type
        )

    def train_dataloader(self):

        if self.y_type == "bt":
            train_class_weights = [
                self.class_weights[torch.argmax(i).item()] for i in self.train.y
            ]
            sampler = WeightedRandomSampler(
                train_class_weights, len(train_class_weights), replacement=True
            )
        else:
            sampler = RandomSampler(self.train)

        return DataLoader(
            self.train,
            batch_size=self.batch_size,
            sampler=sampler,
            drop_last=True,
            persistent_workers=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.validation,
            batch_size=self.batch_size,
            persistent_workers=True,
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.validation),
        )

    def test_dataloader(self):
        return DataLoader(
            self.test,
            batch_size=self.batch_size,
            persistent_workers=True,
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.test),
        )

In [5]:
import json

combination_json = json.load(open("1_X_combination_dict.json"))
proteins = combination_json["all_protein"]
proteins

['C3',
 'KLK7',
 'GCHFR',
 'NHLRC3',
 'APOD',
 'GAPDH',
 'TP53I3',
 'CPA4',
 'ANXA2',
 'GRSF1',
 'IL25',
 'HMMR',
 'MRPL52',
 'PAIP2B',
 'THAP12',
 'FOS',
 'FGF9',
 'PITHD1',
 'THSD1',
 'PTGES2',
 'DEFB103A_DEFB103B',
 'ATP1B4',
 'CYB5A',
 'UNC79',
 'SLC34A3',
 'TAGLN3',
 'SLIRP',
 'CLASP1',
 'PSMC3',
 'KIR3DL2',
 'BEX3',
 'PFDN4',
 'BCL7A',
 'SMC3',
 'SLC28A1',
 'CDC123',
 'GJA8',
 'NMRK2',
 'GATA3',
 'CPLX2',
 'RASGRF1',
 'FGF7',
 'ANKRA2',
 'RBM25',
 'LYZL2',
 'CDK1',
 'CREB3',
 'CREBZF',
 'IGLON5',
 'SHC1',
 'ZP4',
 'TMOD4',
 'CEP152',
 'MYH7B',
 'CEP350',
 'CDC25A',
 'TRIM26',
 'MANEAL',
 'MUCL3',
 'GIMAP8',
 'CYTH3',
 'PDXDC1',
 'CLINT1',
 'MAPRE3',
 'EVI2B',
 'STAU1',
 'PCNA',
 'DNAJA1',
 'JMJD1C',
 'GAGE2A',
 'GAD1',
 'IZUMO1',
 'PDCL2',
 'PDE1C',
 'STOML2',
 'BSND',
 'MAPK13',
 'PDIA2',
 'BTLA',
 'MLLT1',
 'TPRKB',
 'ARHGAP5',
 'BTNL10',
 'PHLDB2',
 'PDIA5',
 'ATF4',
 'PRAME',
 'TOP1MT',
 'KHDC3L',
 'DCUN1D2',
 'IL3',
 'DCLRE1C',
 'ERCC1',
 'DCDC2C',
 'VCPKMT',
 'SPRING1',
 'M

In [6]:
test_imputed_dataset = TableDataset(test_imputed, proteins, ["incident_cad"])

test_imputed_dataset[1]

  torch.tensor(y).long(), num_classes=self.num_classes


(tensor([-0.2857,  0.0198,  0.7424,  ...,  0.2433,  0.1278,  0.0634]),
 tensor([1, 0]))

In [7]:
test_imputed_dataset.y.shape

torch.Size([10195, 2])

In [8]:
dataset = DatasetModule(
    train=train_imputed,
    test=test_imputed,
    features=proteins,
    label=["incident_cad"],
    num_classes=2,
    batch_size=256,
)

Train : incident_cad
0.0             30841
1.0              1803
dtype: int64
val : incident_cad
0.0             7703
1.0              459
dtype: int64
Test : incident_cad
0.0             9651
1.0              544
dtype: int64


  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes


In [9]:
for x, y in dataset.train_dataloader():
    print(x.shape, y.shape)
    break

torch.Size([256, 2911]) torch.Size([256, 2])


In [10]:
class TableModel(LightningModule):
    def __init__(self, input_size, num_classes, lr=1e-1):
        super().__init__()

        self.model = nn.Sequential(
            nn.Linear(input_size, 2048),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(256, num_classes),
        )
        self.lr = lr
        self.loss_fn = torch.nn.CrossEntropyLoss(weight=torch.Tensor([0.5, 0.5]))
        # self.loss_fn = FocalLoss(alpha=torch.Tensor([1, 0.1]), gamma=2)

        self.auc = torchmetrics.AUROC(num_classes=2, task="binary")
        self.val_auc = torchmetrics.AUROC(num_classes=2, task="binary")

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):

        x, y = batch
        # x = self.norm(x)
        y_hat = self(x)

        loss = self.loss_fn(y_hat, y.float())
        self.auc.update(torch.softmax(y_hat, dim=-1), y)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("train_auc", self.auc.compute(), on_epoch=True, prog_bar=True)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        # x = self.norm(x)
        y = y.squeeze(-1).long()
        y_hat = self(x)
        loss = F.cross_entropy(y_hat, y.float())
        self.val_auc.update(torch.softmax(y_hat, dim=-1), y)
        self.log("val_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.log("val_auc", self.val_auc.compute(), on_epoch=True, prog_bar=True)

        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.lr)

In [11]:
model = TableModel(input_size=len(proteins), num_classes=2, lr=1e-1)
dataset = DatasetModule(
    train=train_imputed,
    test=test_imputed,
    features=proteins,
    label=["incident_cad"],
    num_classes=2,
    batch_size=64,
)

Trainer = trainer.Trainer(
    max_epochs=100,
)
Trainer.fit(model, dataset)

Train : incident_cad
0.0             30859
1.0              1785
dtype: int64
val : incident_cad
0.0             7685
1.0              477
dtype: int64
Test : incident_cad
0.0             9651
1.0              544
dtype: int64


  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: /home/xutingfeng/ukb/project/ppp_prediction/lightning_logs
2024-03-29 16:19:12.032465: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]



Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [24]:
for x, y in dataset.train_dataloader():
    print(x.shape, y.shape)
    break

torch.Size([1024, 2911]) torch.Size([1024, 2])


In [25]:
torchmetrics.AUROC(num_classes=2, task="binary")(
    torch.softmax(model(x.to("cuda:0")), dim=-1).cpu(), y.cpu()
)

tensor(0.5570)

tensor([[0.4842, 0.5158],
        [0.4597, 0.5403],
        [0.4153, 0.5847],
        [0.4828, 0.5172],
        [0.4757, 0.5243],
        [0.4731, 0.5269],
        [0.4666, 0.5334],
        [0.4522, 0.5478],
        [0.4830, 0.5170],
        [0.4884, 0.5116],
        [0.4331, 0.5669],
        [0.4602, 0.5398],
        [0.4956, 0.5044],
        [0.5095, 0.4905],
        [0.4809, 0.5191],
        [0.4773, 0.5227],
        [0.4667, 0.5333],
        [0.4735, 0.5265],
        [0.4474, 0.5526],
        [0.4495, 0.5505],
        [0.4348, 0.5652],
        [0.4460, 0.5540],
        [0.4601, 0.5399],
        [0.4656, 0.5344],
        [0.4899, 0.5101],
        [0.4695, 0.5305],
        [0.4483, 0.5517],
        [0.4574, 0.5426],
        [0.4746, 0.5254],
        [0.4680, 0.5320],
        [0.4962, 0.5038],
        [0.4409, 0.5591]], grad_fn=<SoftmaxBackward0>)