In [2]:
from pytorch_lightning import Trainer, seed_everything

seed_everything(42)

from pytorch_lightning import LightningModule, LightningDataModule
import torch
from pytorch_lightning import trainer, LightningModule
from torch.nn import functional as F
import torch
import torchmetrics
import timm

from torch import nn
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

Seed set to 42


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from torch.utils.data import (
    DataLoader,
    SequentialSampler,
    RandomSampler,
    WeightedRandomSampler,
    Dataset,
)
import numpy as np


class TableDataset(Dataset):
    def __init__(
        self,
        df,
        features: list,
        label: list,
        covariates: list = None,
        num_classes=2,
        y_type="bt",
    ):
        super(Dataset, self).__init__()
        assert isinstance(df, pd.DataFrame)
        assert isinstance(features, list)
        assert isinstance(label, list)

        for feature in features + label:
            assert feature in df.columns
        if covariates:
            for cov in covariates:

                assert cov in df.columns

        if not covariates:

            self.df = df.dropna(subset=features + label)
        else:
            self.df = df.dropna(subset=features + label + covariates)
        assert len(self.df) > 0
        self.features = features
        self.covariates = covariates
        self.label = label
        self.num_classes = num_classes
        self.y_type = y_type
        self._init_dataset()

    def _init_dataset(self):
        X = torch.tensor(self.df[self.features].values).float()
        if self.covariates:
            X_cov = torch.tensor(self.df[self.covariates].values).float()

        y = torch.tensor(self.df[self.label].values)
        if (self.num_classes != len(self.label)) and self.y_type == "bt":
            y = F.one_hot(
                torch.tensor(y).long(), num_classes=self.num_classes
            ).squeeze()

        self.X = X
        if self.covariates:
            self.X_cov = X_cov
        self.y = y

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        if self.covariates:
            return (self.X[idx], self.X_cov[idx]), self.y[idx]
        return self.X[idx], self.y[idx]


class TableDatasetModule(LightningDataModule):
    def __init__(
        self,
        train,
        test,
        batch_size=32,
        features: list = None,
        covariates: list = None,
        label: list = None,
        num_classes=2,
        y_type="bt",
        num_workers=4,
    ):
        super().__init__()

        self.batch_size = batch_size
        self.features = features
        self.covariates = covariates
        self.label = label
        self.num_classes = num_classes
        self.y_type = y_type
        self.num_workers = num_workers

        self._init_dataset(train, test)

    def _init_dataset(self, train, test):
        train, val = train_test_split(train, test_size=0.2)
        print(
            f"Train : {train[self.label].value_counts()}\nval : {val[self.label].value_counts()}\nTest : {test[self.label].value_counts()}"
        )
        if self.y_type == "bt" and len(self.label) == 1:

            class_weights = dict(
                enumerate(
                    class_weight.compute_class_weight(
                        "balanced",
                        classes=np.arange(self.num_classes),
                        y=train[self.label[0]],
                    )
                )
            )
            self.class_weights = class_weights

        self.train = TableDataset(
            df=train,
            features=self.features,
            label=self.label,
            covariates=self.covariates,
            num_classes=self.num_classes,
            y_type=self.y_type,
        )
        self.validation = TableDataset(
            df=val,
            features=self.features,
            label=self.label,
            covariates=self.covariates,
            num_classes=self.num_classes,
            y_type=self.y_type,
        )
        self.test = TableDataset(
            df=test,
            features=self.features,
            label=self.label,
            covariates=self.covariates,
            num_classes=self.num_classes,
            y_type=self.y_type,
        )

    def train_dataloader(self):

        if self.y_type == "bt":
            train_class_weights = [
                self.class_weights[torch.argmax(i).item()] for i in self.train.y
            ]
            sampler = WeightedRandomSampler(
                train_class_weights, len(train_class_weights), replacement=True
            )
        else:
            sampler = RandomSampler(self.train)

        return DataLoader(
            self.train,
            batch_size=self.batch_size,
            sampler=sampler,
            drop_last=True,
            persistent_workers=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.validation,
            batch_size=self.batch_size,
            persistent_workers=True,
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.validation),
        )

    def test_dataloader(self):
        return DataLoader(
            self.test,
            batch_size=self.batch_size,
            persistent_workers=True,
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.test),
        )

In [4]:
train_imputed = pd.read_pickle("result/part1/train_imputed.pkl")
test_imputed = pd.read_pickle("result/part1/test_imputed.pkl")

In [5]:
proteomics = test_imputed.columns[test_imputed.columns.tolist().index("C3") :].tolist()
risk_factors = [
    "age",
    "sex",
    "ldl_a",
    "hdl_a",
    "tc_a",
    "tg_a",
    "sbp_a",
    "BMI",
    "smoking",
    "prevalent_diabetes",
]

PRS = ["PRS"]
proteomics

['C3',
 'KLK7',
 'GCHFR',
 'NHLRC3',
 'APOD',
 'GAPDH',
 'TP53I3',
 'CPA4',
 'ANXA2',
 'GRSF1',
 'IL25',
 'HMMR',
 'MRPL52',
 'PAIP2B',
 'THAP12',
 'FOS',
 'FGF9',
 'PITHD1',
 'THSD1',
 'PTGES2',
 'DEFB103A_DEFB103B',
 'ATP1B4',
 'CYB5A',
 'UNC79',
 'SLC34A3',
 'TAGLN3',
 'SLIRP',
 'CLASP1',
 'PSMC3',
 'KIR3DL2',
 'BEX3',
 'PFDN4',
 'BCL7A',
 'SMC3',
 'SLC28A1',
 'CDC123',
 'GJA8',
 'NMRK2',
 'GATA3',
 'CPLX2',
 'RASGRF1',
 'FGF7',
 'ANKRA2',
 'RBM25',
 'LYZL2',
 'CDK1',
 'CREB3',
 'CREBZF',
 'IGLON5',
 'SHC1',
 'ZP4',
 'TMOD4',
 'CEP152',
 'MYH7B',
 'CEP350',
 'CDC25A',
 'TRIM26',
 'MANEAL',
 'MUCL3',
 'GIMAP8',
 'CYTH3',
 'PDXDC1',
 'CLINT1',
 'MAPRE3',
 'EVI2B',
 'STAU1',
 'PCNA',
 'DNAJA1',
 'JMJD1C',
 'GAGE2A',
 'GAD1',
 'IZUMO1',
 'PDCL2',
 'PDE1C',
 'STOML2',
 'BSND',
 'MAPK13',
 'PDIA2',
 'BTLA',
 'MLLT1',
 'TPRKB',
 'ARHGAP5',
 'BTNL10',
 'PHLDB2',
 'PDIA5',
 'ATF4',
 'PRAME',
 'TOP1MT',
 'KHDC3L',
 'DCUN1D2',
 'IL3',
 'DCLRE1C',
 'ERCC1',
 'DCDC2C',
 'VCPKMT',
 'SPRING1',
 'M

In [6]:
dataset = DatasetModule(
    train=train_imputed,
    test=test_imputed,
    features=proteomics,
    covariates=risk_factors,
    label=["incident_cad"],
    num_classes=2,
    batch_size=256,
)

NameError: name 'DatasetModule' is not defined

In [7]:
for x, y in dataset.train_dataloader():
    # print(x.shape, y.shape)
    print(torch.argmax(y, dim=1).sum())
    break

NameError: name 'dataset' is not defined

In [7]:
# class FullyResNetWork(pl.LightningModule):
#     def __init__(
#         self,
#         hidden_size,
#         features,
#         output_size,
#         # num_resblocks=3,
#         lr=1e-3,
#         weight_decay=1e-2,
#         weight=[1, 1],
#         **kwargs,
#     ):
#         super(FullyResNetWork, self).__init__()

#         input_size = len(features)
#         self.features = features
#         self.norm = nn.BatchNorm1d(input_size)

#         self.sharedNetWork = nn.Sequential(
#             nn.Linear(input_size, hidden_size),
#             nn.SiLU(),
#             nn.Dropout(0.3),
#             nn.Linear(hidden_size, hidden_size),
#             nn.SiLU(),
#             nn.Dropout(0.3),
#             nn.Linear(hidden_size, hidden_size * 2),
#             nn.SiLU(),
#         )
#         self.ResidualHeadNetwork_1 = nn.Sequential(
#             nn.Linear(hidden_size * 2, 256),
#             nn.SiLU(),
#             nn.Dropout(0.3),
#             nn.Linear(256, 128),
#             nn.SiLU(),
#             nn.Dropout(0.3),
#             nn.Linear(128, 32),
#             nn.SiLU(),
#         )
#         self.ResidualHeadNetwork_2 = nn.Sequential(
#             nn.Linear(input_size, 256),
#             nn.SiLU(),
#             nn.Dropout(0.6),
#             nn.Linear(256, 128),
#             nn.SiLU(),
#             nn.Dropout(0.6),
#             nn.Linear(128, 32),
#             nn.SiLU(),
#         )
#         self.ResidualHeadNetwork_3 = nn.Sequential(
#             nn.Linear(32, 128),
#             nn.SiLU(),
#             nn.Dropout(0.6),
#             nn.Linear(128, 128),
#             nn.SiLU(),
#             nn.Dropout(0.6),
#             nn.Linear(128, output_size),
#         )

#         self.lr = lr
#         self.weight_decay = weight_decay

#         self.mertic = {
#             "train_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
#             "val_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
#         }
#         self.history = defaultdict(dict)
#         self.loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(weight).float())

#     def forward(self, x):

#         x = self.norm(x)

#         out = self.sharedNetWork(x)

#         out = self.ResidualHeadNetwork_1(out) + self.ResidualHeadNetwork_2(x)
#         out = self.ResidualHeadNetwork_3(out)
#         return out

#     def training_step(self, train_batch, batch_idx):
#         x, y = train_batch
#         outputs = self.forward(x)
#         loss = self.loss_fn(outputs, y.squeeze(-1).float())

#         self.mertic["train_auc"].update(
#             torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
#         )

#         self.log("ptl/train_loss", loss, on_epoch=True, prog_bar=True, on_step=False)
#         return loss

#     def validation_step(self, val_batch, batch_idx):
#         x, y = val_batch
#         outputs = self.forward(x)
#         loss = self.loss_fn(outputs, y.squeeze(-1).float())

#         self.mertic["val_auc"].update(
#             torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
#         )

#         self.log("ptl/val_loss", loss, on_epoch=True, prog_bar=True)

#     def on_train_epoch_end(self):

#         auc = self.mertic["train_auc"].compute()
#         self.log("ptl/train_auc", auc, prog_bar=True)

#     def on_validation_epoch_end(self):
#         auc = self.mertic["val_auc"].compute()
#         self.log("ptl/val_auc", auc, prog_bar=True)

#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(
#             self.parameters(), lr=self.lr, weight_decay=self.weight_decay
#         )
#         return optimizer

#     def predict_df(self, df, batch_size=256):

#         for feature in self.features:
#             assert feature in df.columns
#         print(f"input df have NA: {df[self.features].isna().sum(axis=1).sum()}")
#         df = df.copy().dropna(subset=self.features)

#         predict_dataloader = DataLoader(
#             torch.tensor(df[self.features].values).float(),
#             batch_size=batch_size,
#             persistent_workers=True,
#             num_workers=4,
#         )

#         self.eval()
#         pred = []
#         with torch.no_grad():
#             for x in predict_dataloader:
#                 y_hat = self.forward(x).cpu().detach()
#                 y_hat = torch.softmax(y_hat, dim=-1)[:, 1]

#                 pred.append(y_hat)
#         pred = torch.cat(pred).numpy()
#         df["pred"] = pred
#         return df


# class FullyConnectedNet(pl.LightningModule):
#     def __init__(
#         self,
#         hidden_size,
#         features,
#         output_size,
#         num_resblocks=3,
#         lr=1e-3,
#         weight_decay=1e-2,
#         weight=[1, 1],
#         **kwargs,
#     ):
#         super(FullyConnectedNet, self).__init__()
#         input_size = len(features)
#         self.features = features
#         self.norm = nn.BatchNorm1d(input_size)
#         self.fc1 = nn.Linear(input_size, hidden_size)
#         self.resblocks = nn.Sequential(
#             *[LinearResBlock(hidden_size, hidden_size) for _ in range(num_resblocks)]
#         )
#         self.fc2 = nn.Linear(hidden_size, output_size)

#         self.lr = lr
#         self.weight_decay = weight_decay

#         self.mertic = {
#             "train_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
#             "val_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
#         }
#         self.history = defaultdict(dict)
#         self.loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(weight).float())

#     def forward(self, x):
#         x = self.norm(x)
#         out = torch.relu(self.fc1(x))
#         out = self.resblocks(out)
#         out = self.fc2(out)
#         return out

#     def training_step(self, train_batch, batch_idx):
#         x, y = train_batch
#         outputs = self.forward(x)
#         loss = self.loss_fn(outputs, y.squeeze(-1).float())

#         self.mertic["train_auc"].update(
#             torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
#         )

#         self.log("ptl/train_loss", loss, on_epoch=True, prog_bar=True, on_step=False)
#         return loss

#     def validation_step(self, val_batch, batch_idx):
#         x, y = val_batch
#         outputs = self.forward(x)
#         loss = self.loss_fn(outputs, y.squeeze(-1).float())

#         self.mertic["val_auc"].update(
#             torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
#         )

#         self.log("ptl/val_loss", loss, on_epoch=True, prog_bar=True)

#     def on_train_epoch_end(self):

#         auc = self.mertic["train_auc"].compute()
#         self.log("ptl/train_auc", auc, prog_bar=True)

#     def on_validation_epoch_end(self):
#         auc = self.mertic["val_auc"].compute()
#         self.log("ptl/val_auc", auc, prog_bar=True)

#     def configure_optimizers(self):
#         optimizer = torch.optim.Adam(
#             self.parameters(), lr=self.lr, weight_decay=self.weight_decay
#         )
#         return optimizer

#     def predict_df(self, df, batch_size=256):

#         for feature in self.features:
#             assert feature in df.columns
#         print(f"input df have NA: {df[self.features].isna().sum(axis=1).sum()}")
#         df = df.copy().dropna(subset=self.features)

#         predict_dataloader = DataLoader(
#             torch.tensor(df[self.features].values).float(),
#             batch_size=batch_size,
#             persistent_workers=True,
#             num_workers=4,
#         )

#         self.eval()
#         pred = []
#         with torch.no_grad():
#             for x in predict_dataloader:
#                 y_hat = self.forward(x).cpu().detach()
#                 y_hat = torch.softmax(y_hat, dim=-1)[:, 1]

#                 pred.append(y_hat)
#         pred = torch.cat(pred).numpy()
#         df["pred"] = pred
#         return df

In [8]:
import os
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

import torch
import pytorch_lightning as pl

import torch.nn as nn
import torch.optim as optim
from collections import defaultdict


class LinearResBlock(nn.Module):
    def __init__(self, d_model, d_ff=None, dropout=0.1):
        super(LinearResBlock, self).__init__()
        if d_ff is None:
            d_ff = d_model * 2

        self.fc1 = nn.Linear(d_model, d_model)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
        )
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # d_model => d_model (default is d_model)
        x = self.norm1(x)
        x = x + self.dropout1(self.fc1(x))
        x = self.norm2(x)
        x = x + self.dropout2(self.ff(x))
        return x


class LinearTransformerEncoder(nn.Module):
    def __init__(self, d_model, d_ff=None, num_classes=0, num_layers=3, dropout=0.1):
        super(LinearTransformerEncoder, self).__init__()
        self.layers = nn.Sequential(
            *[
                LinearResBlock(d_model, d_ff=d_ff, dropout=dropout)
                for _ in range(num_layers)
            ]
        )
        self.fc_norm = (
            nn.LayerNorm(d_model, eps=1e-6) if num_classes > 0 else nn.Identity()
        )
        self.head_drop = nn.Dropout(dropout)
        self.head = (
            nn.Linear(d_model, num_classes) if num_classes > 0 else nn.Identity()
        )

        self.num_classes = num_classes
        self.d_model = d_model
        self.num_layers = num_layers
        self.dropout = dropout

    def forward(self, x):
        x = self.layers(x)

        x = self.fc_norm(x)
        x = self.head_drop(x)
        x = self.head(x)
        return x


class LinearActivationNormDropOut(nn.Module):
    def __init__(self, d_in, d_out, activation=nn.SiLU(), dropout=0.1):
        super(LinearActivationNormDropOut, self).__init__()
        self.fc = nn.Linear(d_in, d_out)
        self.norm = nn.LayerNorm(d_out)
        self.activation = activation
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.fc(x)
        x = self.norm(x)
        x = self.activation(x)
        x = self.dropout(x)
        return x


class LinearFeatureExtractor(nn.Module):
    def __init__(self, d_model, d_ff=None, d_out=None, dropout=0.1):
        super(LinearFeatureExtractor, self).__init__()
        if d_ff is None:
            d_ff = d_model * 2
        if d_out is None:
            d_out = d_model

        self.extractor = nn.Sequential(
            LinearActivationNormDropOut(
                d_model, d_ff, activation=nn.SiLU(), dropout=dropout
            ),
            LinearActivationNormDropOut(
                d_ff, d_ff, activation=nn.SiLU(), dropout=dropout
            ),
            LinearActivationNormDropOut(
                d_ff, d_out, activation=nn.SiLU(), dropout=dropout
            ),
        )

    def forward(self, x):
        return self.extractor(x)


class LinearFeatureFusionBlock(nn.Module):
    def __init__(
        self,
        d_model_list,
        d_ff=128,  # median layer dim. recommanded not to higher than any of d_model passed
        d_out=128,
        dropout=0.1,
        fusion_method="add",
    ):
        """
        fusion_method: add
        """
        super(LinearFeatureFusionBlock, self).__init__()
        self.EachPartModuleList = nn.ModuleList(
            [
                LinearFeatureExtractor(
                    d_model, d_ff=d_model * 2, d_out=d_ff, dropout=dropout
                )
                for d_model in d_model_list
            ]
        )
        self.d_ff = d_ff
        self.fusionDecoder = LinearFeatureExtractor(
            d_ff, d_ff=d_ff * 2, d_out=d_out, dropout=dropout
        )

        self.d_model_list = d_model_list

        self.d_out = d_out
        self.dropout = dropout
        self.fusion_method = fusion_method

    def intermidiate_forward(self, *x_list):
        return [module(x) for module, x in zip(self.EachPartModuleList, x_list)]

    def forward(self, *x_list):
        x_list = self.intermidiate_forward(*x_list)

        if self.fusion_method == "add":
            x = torch.stack(x_list, dim=-1).sum(dim=-1)
        # elif self.fusion_method == "concat":
        #     x = torch.cat(x_list, dim=-1)
        # elif self.fusion_method == "minus":
        else:
            raise NotImplementedError("Not implemented")

        x = self.fusionDecoder(x)
        return x


class LinearTransformer(nn.Module):
    def __init__(
        self,
        features_dict,
        covariates_dict=None,
        d_ff=128,
        num_classes=2,
        num_layers=3,
        dropout=0.1,
    ):
        super(LinearTransformer, self).__init__()
        self.features_dict = features_dict
        self.covariates_dict = covariates_dict if covariates_dict else None
        self.features_name = list(features_dict.keys())[0]
        self.covariates_name = (
            list(covariates_dict.keys())[0] if covariates_dict else None
        )
        self.features = features_dict[self.features_name]
        self.covariates = (
            covariates_dict[self.covariates_name] if covariates_dict else None
        )

        self.d_featurs = len(self.features)
        self.d_covariates = len(self.covariates) if covariates_dict else None
        self.d_ff = d_ff if d_ff else self.d_featurs

        self.encoder = LinearTransformerEncoder(
            self.d_featurs,
            d_ff=self.d_ff,
            num_classes=d_ff,
            num_layers=num_layers,
            dropout=dropout,
        )  # d_features => d_ff

        d_model_list = (
            [self.d_ff, self.d_covariates]
            if self.d_covariates is not None
            else [self.d_ff]
        )
        self.decoder = LinearFeatureFusionBlock(
            d_model_list=d_model_list, d_out=d_ff, dropout=dropout
        )

        self.fc_norm = (
            nn.LayerNorm(d_ff, eps=1e-6) if num_classes > 0 else nn.Identity()
        )
        self.head_drop = nn.Dropout(dropout)
        self.head = nn.Linear(d_ff, num_classes) if num_classes > 0 else nn.Identity()

    def run_encoder(self, x):
        return self.encoder(x)

    def forward(self, x, cov=None):

        x = self.run_encoder(x)

        if cov is not None:
            out = self.decoder(x, cov)
        else:
            out = self.decoder(x)

        out = self.fc_norm(out)
        out = self.head_drop(out)
        out = self.head(out)

        return out

In [7]:
def modelParametersNum(model):
    totalNum = sum([i.numel() for i in model.parameters()])
    print(f"模型总参数个数：{totalNum}\t占用的总显存为{totalNum*4/1024/1024:.2f}MB")
    return totalNum

In [9]:
lt = LinearTransformer(
    features_dict={"proteomics": proteomics},
    covariates_dict={"risk_factors": risk_factors},
    d_ff=512,
    num_classes=2,
    num_layers=2,
    dropout=0.1,
)
print(modelParametersNum(lt))
print(lt(torch.randn(32, len(proteomics)), torch.randn(32, len(risk_factors))).shape)
lt

NameError: name 'modelParametersNum' is not defined

In [122]:
lt = LinearTransformer(
    features_dict={"proteomics": proteomics},
    # covariates_dict={"risk_factors": risk_factors},
    d_ff=512,
    num_classes=2,
    num_layers=2,
    dropout=0.1,
)
print(modelParametersNum(lt))
# print(lt(torch.randn(32, len(proteomics)), torch.randn(32, len(risk_factors))).shape)
lt

模型总参数个数：26387254	占用的总显存为100.66MB
26387254


LinearTransformer(
  (encoder): LinearTransformerEncoder(
    (layers): Sequential(
      (0): LinearResBlock(
        (fc1): Linear(in_features=2911, out_features=2911, bias=True)
        (norm1): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
        (ff): Sequential(
          (0): Linear(in_features=2911, out_features=512, bias=True)
          (1): SiLU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=512, out_features=2911, bias=True)
        )
        (norm2): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): LinearResBlock(
        (fc1): Linear(in_features=2911, out_features=2911, bias=True)
        (norm1): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
        (ff): Sequential(
          (0): Linear(in_features=2911, out_features=512, bias=True)
          (1): SiLU()
          (2): Dropout(p=0.1, inplace=Fa

In [123]:
lff = LinearFeatureFusionBlock([128, 10], d_ff=256, d_out=128, dropout=0.1)
print(lff(torch.randn(32, 128), torch.randn(32, 10)).shape)
lff

torch.Size([32, 128])


LinearFeatureFusionBlock(
  (EachPartModuleList): ModuleList(
    (0): LinearFeatureExtractor(
      (extractor): Sequential(
        (0): LinearActivationNormDropOut(
          (fc): Linear(in_features=128, out_features=256, bias=True)
          (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation): SiLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (1): LinearActivationNormDropOut(
          (fc): Linear(in_features=256, out_features=256, bias=True)
          (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation): SiLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (2): LinearActivationNormDropOut(
          (fc): Linear(in_features=256, out_features=256, bias=True)
          (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (activation): SiLU()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (1): LinearFeatureExtractor(
  

In [98]:
lte = LinearTransformerEncoder(d_model=30, d_ff=256, num_classes=128, num_layers=3)
print(lte(torch.randn(32, 30)).shape)
lte

torch.Size([32, 128])


LinearTransformerEncoder(
  (layers): Sequential(
    (0): LinearResBlock(
      (fc1): Linear(in_features=30, out_features=30, bias=True)
      (norm1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=30, out_features=256, bias=True)
        (1): SiLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=256, out_features=30, bias=True)
      )
      (norm2): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
    (1): LinearResBlock(
      (fc1): Linear(in_features=30, out_features=30, bias=True)
      (norm1): LayerNorm((30,), eps=1e-05, elementwise_affine=True)
      (ff): Sequential(
        (0): Linear(in_features=30, out_features=256, bias=True)
        (1): SiLU()
        (2): Dropout(p=0.1, inplace=False)
        (3): Linear(in_features=256, out_features=30, bias=True)
      )
      (norm2): L

## train

In [18]:
class LinearTransformerPL(pl.LightningModule):
    def __init__(
        self,
        features_dict,
        covariates_dict=None,
        d_ff=512,
        num_classes=2,
        num_layers=2,
        dropout=0.1,
        lr=1e-3,
        weight_decay=1e-2,
        weight=[1, 1],
        **kwargs,
    ):

        super(LinearTransformerPL, self).__init__()

        self.lr = lr
        self.weight_decay = weight_decay

        self.mertic = {
            "train_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
            "val_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
        }
        self.history = defaultdict(dict)
        self.loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(weight).float())
        self.model = LinearTransformer(
            features_dict=features_dict,
            covariates_dict=covariates_dict,
            d_ff=d_ff,
            num_classes=num_classes,
            num_layers=num_layers,
            dropout=dropout,
        )
        self.features = self.model.features

    def forward(self, x):

        return self.model(*x) if isinstance(x, (list, tuple)) else self.model(x)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        outputs = self.forward(x)
        loss = self.loss_fn(outputs, y.squeeze(-1).float())

        self.mertic["train_auc"].update(
            torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
        )

        self.log("ptl/train_loss", loss, on_epoch=True, prog_bar=True, on_step=False)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        outputs = self.forward(x)
        loss = self.loss_fn(outputs, y.squeeze(-1).float())

        self.mertic["val_auc"].update(
            torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
        )

        self.log("ptl/val_loss", loss, on_epoch=True, prog_bar=True)

    def on_train_epoch_end(self):

        auc = self.mertic["train_auc"].compute()
        self.log("ptl/train_auc", auc, prog_bar=True)

    def on_validation_epoch_end(self):
        auc = self.mertic["val_auc"].compute()
        self.log("ptl/val_auc", auc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        return optimizer

    def predict_df(self, df, batch_size=256):

        for feature in self.features:
            assert feature in df.columns
        print(f"input df have NA: {df[self.features].isna().sum(axis=1).sum()}")
        df = df.copy().dropna(subset=self.features)

        predict_dataloader = DataLoader(
            torch.tensor(df[self.features].values).float(),
            batch_size=batch_size,
            persistent_workers=True,
            num_workers=4,
        )

        self.eval()
        pred = []
        with torch.no_grad():
            for x in predict_dataloader:
                y_hat = self.forward(x).cpu().detach()
                y_hat = torch.softmax(y_hat, dim=-1)[:, 1]

                pred.append(y_hat)
        pred = torch.cat(pred).numpy()
        df["pred"] = pred
        return df

In [9]:
# # trainer
# used_fatures = proteomics
# # + risk_factors + PRS
# model = LinearTransformerPL(
#     features_dict={"proteomics": proteomics},
#     covariates_dict={"risk_factors": risk_factors},
#     d_ff=512,
#     num_classes=2,
#     num_layers=2,
#     dropout=0.1,
#     lr=1e-3,
#     weight_decay=1e-5,
#     weight=[1, 1],
# )

# dataset = DatasetModule(
#     train=train_imputed,
#     test=test_imputed,
#     features=used_fatures,
#     covariates=risk_factors,
#     label=["incident_cad"],
#     num_classes=2,
#     batch_size=256,
# )

In [126]:
for x, y in dataset.train_dataloader():
    # print(x.shape, y.shape)
    print(torch.argmax(y, dim=1).sum())
    break

tensor(126)


In [144]:
# trainer


model = LinearTransformerPL(
    features_dict={"proteomics": proteomics},
    # covariates_dict={"risk_factors": risk_factors},
    d_ff=64,
    num_classes=2,
    num_layers=1,
    dropout=0.3,
    lr=1e-4,
    weight_decay=1e-3,
    weight=[1, 1],
)

dataset = TableDatasetModule(
    train=train_imputed,
    test=test_imputed,
    features=proteomics,
    covariates=risk_factors,
    label=["incident_cad"],
    num_classes=2,
    batch_size=256,
)

trainer = Trainer(
    max_epochs=10,
    gradient_clip_val=1,
)
trainer.fit(model, dataset)

Train : incident_cad
0.0             27199
1.0              1606
dtype: int64
val : incident_cad
0.0             6809
1.0              393
dtype: int64
Test : incident_cad
0.0             14599
1.0               833
dtype: int64


  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params
----------------------------------------------
0 | loss_fn | CrossEntropyLoss  | 0     
1 | model   | LinearTransformer | 9.2 M 
----------------------------------------------
9.2 M     Trainable params
0         Non-trainable params
9.2 M     Total params
36.860    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [139]:
# main_embedd = model.model.run_encoder(x[0].to(model.device))
# main_embedd.shape
# o = model.model.decoder.intermidiate_forward(main_embedd, x[1].to(model.device))
# o

torch.Size([256, 256])

In [14]:
used_fatures = proteomics
# + risk_factors + PRS
used_fatures

['C3',
 'KLK7',
 'GCHFR',
 'NHLRC3',
 'APOD',
 'GAPDH',
 'TP53I3',
 'CPA4',
 'ANXA2',
 'GRSF1',
 'IL25',
 'HMMR',
 'MRPL52',
 'PAIP2B',
 'THAP12',
 'FOS',
 'FGF9',
 'PITHD1',
 'THSD1',
 'PTGES2',
 'DEFB103A_DEFB103B',
 'ATP1B4',
 'CYB5A',
 'UNC79',
 'SLC34A3',
 'TAGLN3',
 'SLIRP',
 'CLASP1',
 'PSMC3',
 'KIR3DL2',
 'BEX3',
 'PFDN4',
 'BCL7A',
 'SMC3',
 'SLC28A1',
 'CDC123',
 'GJA8',
 'NMRK2',
 'GATA3',
 'CPLX2',
 'RASGRF1',
 'FGF7',
 'ANKRA2',
 'RBM25',
 'LYZL2',
 'CDK1',
 'CREB3',
 'CREBZF',
 'IGLON5',
 'SHC1',
 'ZP4',
 'TMOD4',
 'CEP152',
 'MYH7B',
 'CEP350',
 'CDC25A',
 'TRIM26',
 'MANEAL',
 'MUCL3',
 'GIMAP8',
 'CYTH3',
 'PDXDC1',
 'CLINT1',
 'MAPRE3',
 'EVI2B',
 'STAU1',
 'PCNA',
 'DNAJA1',
 'JMJD1C',
 'GAGE2A',
 'GAD1',
 'IZUMO1',
 'PDCL2',
 'PDE1C',
 'STOML2',
 'BSND',
 'MAPK13',
 'PDIA2',
 'BTLA',
 'MLLT1',
 'TPRKB',
 'ARHGAP5',
 'BTNL10',
 'PHLDB2',
 'PDIA5',
 'ATF4',
 'PRAME',
 'TOP1MT',
 'KHDC3L',
 'DCUN1D2',
 'IL3',
 'DCLRE1C',
 'ERCC1',
 'DCDC2C',
 'VCPKMT',
 'SPRING1',
 'M

In [12]:
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
from ray.train.torch import TorchTrainer


def train_func(config):
    features_key = list(config["features_dict"].keys())[0]
    covariates_key = (
        list(config["covariates_dict"].keys())[0] if config["covariates_dict"] else None
    )

    dataset = TableDatasetModule(
        train=train_imputed,
        test=test_imputed,
        features=config["features_dict"][features_key],
        covariates=(
            config["covariates_dict"][covariates_key]
            if config["covariates_dict"]
            else None
        ),
        label=["incident_cad"],
        num_classes=2,
        batch_size=config["batch_size"],
    )
    # model = FullyConnectedNet(
    #     input_size=len(proteomics),
    #     hidden_size=config["hidden_size"],
    #     output_size=2,
    #     lr=config["lr"],
    #     weight_decay=config["weight_decay"],
    #     weight=config["weight"],
    #     num_resblocks=config["num_resblocks"],
    # )
    # model = FullyConnectedNet(**config)
    model = LinearTransformerPL(**config)
    trainer = Trainer(
        devices="auto",
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=True,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, dataset)


LinearTransformerPL_search_space = {
    "features_dict": {"proteomics": proteomics},
    "covariates_dict": tune.choice([{"risk_factors": risk_factors}, None]),
    "d_ff": tune.choice([64, 128, 256, 512]),
    "num_classes": 2,
    "num_layers": tune.choice([1, 2, 3, 4, 5]),
    "dropout": tune.uniform(0.1, 0.5),
    "lr": tune.loguniform(1e-4, 1e-1),
    "weight_decay": tune.loguniform(1e-4, 1e-2),
    "weight": tune.choice([[1, 1], [0.1, 1], [0.1, 10], [0.1, 100]]),
    "batch_size": tune.choice([64, 256]),
}


search_space = LinearTransformerPL_search_space


# The maximum training epochs
num_epochs = 20

# Number of sampls from parameter space
num_samples = 2
scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)


scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 3, "GPU": 0.99}
)

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="ptl/val_auc",
        checkpoint_score_order="max",
    ),
)

# Define a TorchTrainer without hyper-parameters for Tuner
ray_trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)


def tune_asha(num_samples=10):
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric="ptl/val_auc",
            mode="max",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    return tuner.fit()


results = tune_asha(num_samples=num_samples)


# results.get_best_result("ptl/val_auc")

best_result = results.get_best_result("ptl/val_auc")
best_params = best_result.config
best_result_epoch_dir = (
    best_result.get_best_checkpoint("ptl/val_auc", "max").path + "/checkpoint.ckpt"
)
best_model_state = torch.load(best_result_epoch_dir)
best_model = LinearTransformerPL(**best_params["train_loop_config"])
best_model.load_state_dict(best_model_state["state_dict"])
best_model

0,1
Current time:,2024-04-18 14:35:59
Running for:,00:05:29.08
Memory:,18.4/50.1 GiB

Trial name,status,loc,train_loop_config/ba tch_size,train_loop_config/co variates_dict,train_loop_config/d_ ff,train_loop_config/dr opout,train_loop_config/lr,train_loop_config/nu m_layers,train_loop_config/we ight,train_loop_config/we ight_decay,iter,total time (s),ptl/val_loss,ptl/val_auc,ptl/train_loss
TorchTrainer_26cec_00000,TERMINATED,172.22.18.47:13883,64,,128,0.173362,0.00081795,4,"[0.1, 100]",0.00112076,20,303.029,0.654624,0.632138,0.364551
TorchTrainer_26cec_00001,TERMINATED,172.22.18.47:1450,64,{'risk_factors'_13c0,256,0.118666,0.0834193,2,"[0.1, 1]",0.000292107,1,17.5355,0.236837,0.501361,0.256112


2024-04-18 14:35:59,132	INFO tune.py:1016 -- Wrote the latest version of all result files and experiment state to '/home/xutingfeng/ray_results/TorchTrainer_2024-04-18_14-30-30' in 0.0115s.
2024-04-18 14:35:59,143	INFO tune.py:1048 -- Total run time: 329.10 seconds (329.06 seconds for the tuning loop).


LinearTransformerPL(
  (loss_fn): CrossEntropyLoss()
  (model): LinearTransformer(
    (encoder): LinearTransformerEncoder(
      (layers): Sequential(
        (0): LinearResBlock(
          (fc1): Linear(in_features=2911, out_features=2911, bias=True)
          (norm1): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
          (ff): Sequential(
            (0): Linear(in_features=2911, out_features=128, bias=True)
            (1): SiLU()
            (2): Dropout(p=0.17336180394137352, inplace=False)
            (3): Linear(in_features=128, out_features=2911, bias=True)
          )
          (norm2): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.17336180394137352, inplace=False)
          (dropout2): Dropout(p=0.17336180394137352, inplace=False)
        )
        (1): LinearResBlock(
          (fc1): Linear(in_features=2911, out_features=2911, bias=True)
          (norm1): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
         

In [13]:
results.get_dataframe().sort_values("ptl/val_auc")

Unnamed: 0,ptl/val_loss,ptl/val_auc,ptl/train_loss,epoch,step,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,...,config/train_loop_config/num_classes,config/train_loop_config/num_layers,config/train_loop_config/dropout,config/train_loop_config/lr,config/train_loop_config/weight_decay,config/train_loop_config/weight,config/train_loop_config/batch_size,ptl/train_auc,logdir,config/train_loop_config/covariates_dict/risk_factors
1,0.236837,0.501361,0.256112,0,450,1713422159,checkpoint_000000,True,True,1,...,2,2,0.118666,0.083419,0.000292,"[0.1, 1]",64,,26cec_00001,"[age, sex, ldl_a, hdl_a, tc_a, tg_a, sbp_a, BM..."
0,0.654624,0.632138,0.364551,19,9000,1713422136,checkpoint_000019,True,True,20,...,2,4,0.173362,0.000818,0.001121,"[0.1, 100]",64,0.649011,26cec_00000,


In [19]:
# results.get_best_result("ptl/val_auc")

best_result = results.get_best_result("ptl/val_auc")
best_params = best_result.config
best_result_epoch_dir = (
    best_result.get_best_checkpoint("ptl/val_auc", "max").path + "/checkpoint.ckpt"
)
best_model_state = torch.load(best_result_epoch_dir)
best_model = LinearTransformerPL(**best_params["train_loop_config"])
best_model.load_state_dict(best_model_state["state_dict"])
best_model

LinearTransformerPL(
  (loss_fn): CrossEntropyLoss()
  (model): LinearTransformer(
    (encoder): LinearTransformerEncoder(
      (layers): Sequential(
        (0): LinearResBlock(
          (fc1): Linear(in_features=2911, out_features=2911, bias=True)
          (norm1): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
          (ff): Sequential(
            (0): Linear(in_features=2911, out_features=128, bias=True)
            (1): SiLU()
            (2): Dropout(p=0.17336180394137352, inplace=False)
            (3): Linear(in_features=128, out_features=2911, bias=True)
          )
          (norm2): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.17336180394137352, inplace=False)
          (dropout2): Dropout(p=0.17336180394137352, inplace=False)
        )
        (1): LinearResBlock(
          (fc1): Linear(in_features=2911, out_features=2911, bias=True)
          (norm1): LayerNorm((2911,), eps=1e-05, elementwise_affine=True)
         

In [18]:
best_params["train_loop_config"]

{'features': ['C3',
  'KLK7',
  'GCHFR',
  'NHLRC3',
  'APOD',
  'GAPDH',
  'TP53I3',
  'CPA4',
  'ANXA2',
  'GRSF1',
  'IL25',
  'HMMR',
  'MRPL52',
  'PAIP2B',
  'THAP12',
  'FOS',
  'FGF9',
  'PITHD1',
  'THSD1',
  'PTGES2',
  'DEFB103A_DEFB103B',
  'ATP1B4',
  'CYB5A',
  'UNC79',
  'SLC34A3',
  'TAGLN3',
  'SLIRP',
  'CLASP1',
  'PSMC3',
  'KIR3DL2',
  'BEX3',
  'PFDN4',
  'BCL7A',
  'SMC3',
  'SLC28A1',
  'CDC123',
  'GJA8',
  'NMRK2',
  'GATA3',
  'CPLX2',
  'RASGRF1',
  'FGF7',
  'ANKRA2',
  'RBM25',
  'LYZL2',
  'CDK1',
  'CREB3',
  'CREBZF',
  'IGLON5',
  'SHC1',
  'ZP4',
  'TMOD4',
  'CEP152',
  'MYH7B',
  'CEP350',
  'CDC25A',
  'TRIM26',
  'MANEAL',
  'MUCL3',
  'GIMAP8',
  'CYTH3',
  'PDXDC1',
  'CLINT1',
  'MAPRE3',
  'EVI2B',
  'STAU1',
  'PCNA',
  'DNAJA1',
  'JMJD1C',
  'GAGE2A',
  'GAD1',
  'IZUMO1',
  'PDCL2',
  'PDE1C',
  'STOML2',
  'BSND',
  'MAPK13',
  'PDIA2',
  'BTLA',
  'MLLT1',
  'TPRKB',
  'ARHGAP5',
  'BTNL10',
  'PHLDB2',
  'PDIA5',
  'ATF4',
  'PRAME',
  

In [20]:
test_imputed = best_model.predict_df(test_imputed)
test_imputed

input df have NA: 0


Unnamed: 0,eid,PRS,sex,height,weight,BSA,genotype_array,age,PC1,PC2,...,TGFBR3,CRTAC1,IGFBP7,SELE,VWF,NOTCH3,CNTN1,ENG,ICAM2,pred
19409,2883530.0,1.030583,1.0,171.0,64.2,1.746282,2,44.0,71.3002,-100.66700,...,-0.0087,-0.029539,0.022568,-0.027118,0.008048,0.004249,0.000619,0.001707,-0.026825,0.999796
19272,2867444.0,2.192278,0.0,165.0,55.8,1.599219,2,53.0,-12.4815,3.16181,...,0.1859,0.291950,0.147400,-0.120500,0.597300,0.115700,0.243300,0.127800,0.063400,0.991804
49865,5869793.0,0.653794,1.0,171.0,77.3,1.916181,2,62.0,-11.4721,2.20519,...,0.0516,0.369750,-0.155300,0.035500,-0.276700,-0.043900,0.195500,-0.111000,-0.990800,0.999830
39664,4880838.0,0.664819,0.0,163.0,84.4,1.954852,2,62.0,-11.1640,3.66252,...,-0.0127,0.393200,0.174500,0.035700,0.873200,0.236600,0.114200,0.134400,0.008700,0.999749
30555,3987428.0,0.826465,0.0,164.0,73.1,1.824859,1,66.0,-11.4666,2.77498,...,-0.5216,0.005050,-0.160200,0.181900,1.026700,-0.062150,-0.094500,-0.032700,0.213200,0.999845
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43327,5241912.0,1.085083,1.0,176.0,116.0,2.381409,2,45.0,-10.8083,4.46241,...,0.2348,-0.919950,0.803300,0.131600,0.481500,0.279800,-0.226200,0.262600,0.239400,0.999857
29129,3851862.0,1.294348,0.0,169.0,72.9,1.849932,2,40.0,-12.6549,3.40064,...,-0.3290,-0.251250,-0.787400,-0.919000,0.212700,-0.617800,0.123900,-0.124100,-0.940500,0.998148
1550,1144512.0,0.722791,1.0,191.0,96.6,2.263883,1,59.0,-12.7237,1.46547,...,0.1043,-0.284750,0.350300,1.608600,-0.341300,0.134100,-0.012000,0.226700,0.135200,0.999782
1888,1177099.0,1.335307,1.0,175.0,75.1,1.910679,2,63.0,-15.1573,7.36690,...,0.2172,0.172250,0.431300,0.121750,-0.754900,0.530700,0.244000,-0.018900,-0.053400,0.999728


In [20]:
cal_binary_metrics(test_imputed["incident_cad"], test_imputed["pred"])

NameError: name 'cal_binary_metrics' is not defined

In [17]:
import pandas as pd
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    roc_curve,
    precision_recall_curve,
    auc,
)


from tqdm.rich import tqdm
import numpy as np

from statsmodels.stats.multitest import multipletests


import statsmodels.api as sm

from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
)
from scipy.stats import pearsonr, spearmanr


def generate_multipletests_result(df, pvalue_col="pvalue", alpha=0.05, method="fdr_bh"):
    df = df.copy()
    pvalue_series = df[pvalue_col]
    reject, pvals_corrected, _, _ = multipletests(
        pvalue_series, alpha=alpha, method="fdr_bh"
    )
    df["pval_corrected"] = pvals_corrected
    df["reject"] = reject
    return df


def find_best_cutoff(fpr, tpr, thresholds):
    diff = tpr - fpr
    Youden_index = np.argmax(diff)
    optimal_threshold = thresholds[Youden_index]
    optimal_FPR, optimal_TPR = fpr[Youden_index], tpr[Youden_index]
    return optimal_threshold, optimal_FPR, optimal_TPR


def cal_binary_metrics(y, y_pred):
    fpr, tpr, thresholds = roc_curve(y, y_pred)
    AUC = roc_auc_score(y, y_pred)
    # by best youden

    optim_threshold, optim_fpr, optim_tpr = find_best_cutoff(fpr, tpr, thresholds)
    y_pred_binary = (y_pred > optim_threshold).astype(int)
    ACC = accuracy_score(y, y_pred_binary)
    macro_f1 = f1_score(y, y_pred_binary, average="macro")
    sensitivity = optim_tpr
    specificity = 1 - optim_fpr
    precision, recall, _ = precision_recall_curve(y, y_pred)
    APR = auc(recall, precision)

    return {
        "AUC": AUC,
        "ACC": ACC,
        "Macro_F1": macro_f1,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "APR": APR,
    }


import torch.nn as nn
import torch.optim as optim
from tqdm.rich import tqdm

# 定义神经网络模型


class LinearResBlock(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearResBlock, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        self.batch_norm = nn.LayerNorm(output_size)

        torch.nn.init.kaiming_normal_(self.fc1.weight, nonlinearity="relu")  # <6>

        torch.nn.init.constant_(self.batch_norm.weight, 0.5)  # <7>
        torch.nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        out = self.fc1(x)

        out = self.batch_norm(out)
        out = torch.relu(out)
        return out + x


class FullyConnectedNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_resblocks=3):
        super(FullyConnectedNet, self).__init__()
        self.norm = nn.LayerNorm(input_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.resblocks = nn.Sequential(
            *[LinearResBlock(hidden_size, hidden_size) for _ in range(num_resblocks)]
        )
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.norm(x)
        out = torch.relu(self.fc1(x))
        out = self.resblocks(out)
        out = self.fc2(out)
        return out


# 定义训练函数
def train(model, dataset, criterion, optimizer, num_epochs):
    train_loader = dataset.train_dataloader()
    val_loader = dataset.test_dataloader()
    for epoch in range(num_epochs):
        running_loss = 0.0
        auroc = torchmetrics.AUROC(num_classes=2, task="multiclass")
        for inputs, labels in tqdm(
            train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", total=len(train_loader)
        ):
            inputs, labels = inputs.to("cuda:0"), labels.to("cuda:0")
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.squeeze(-1).float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            auroc.update(torch.softmax(outputs, dim=-1), torch.argmax(labels, dim=1))
        auc = auroc.compute()
        print(
            f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}, AUC: {auc}"
        )
        if epoch % 1 == 0:
            test_auc = torchmetrics.AUROC(num_classes=2, task="multiclass")
            for inputs, labels in val_loader:
                inputs, labels = inputs.to("cuda:0"), labels.to("cuda:0")
                outputs = model(inputs)
                test_auc.update(
                    torch.softmax(outputs, dim=-1), torch.argmax(labels, dim=1)
                )
            print(f"Epoch {epoch+1}/{num_epochs}, Test AUC: {test_auc.compute()}")
    # test_auc = torchmetrics.AUROC(num_classes=2, task="multiclass")
    # for inputs, labels in dataset.test_dataloader():
    #     inputs, labels = inputs.to("cuda:0"), labels.to("cuda:0")
    #     outputs = model(inputs)
    #     test_auc.update(torch.softmax(outputs, dim=-1), torch.argmax(labels, dim=1))
    # print(f"Test AUC: {test_auc.compute()}")

In [None]:
from pytorch_lightning import Trainer, seed_everything

seed_everything(42)

In [None]:
# 设置训练参数
input_size = len(proteomics)  # 输入特征维度
hidden_size = 512  # 隐藏层维度
output_size = 2  # 输出类别数
learning_rate = 5e-4
batch_size = 256
num_epochs = 5


# 创建模型实例
best_model = FullyConnectedNet(
    input_size=len(proteomics), hidden_size=256, output_size=2, num_resblocks=6
)
best_model.to("cuda:0")
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(weight=torch.Tensor([0.1, 100]).to("cuda:0"))
optimizer = optim.NAdam(best_model.parameters(), lr=learning_rate, weight_decay=5e-3)


# 开始训练
train(best_model, dataset, criterion, optimizer, num_epochs)

In [None]:
y_pred_list = []
y_list = []
AUC = torchmetrics.AUROC(num_classes=2, task="multiclass")

best_model.eval()
for x, y in dataset.test_dataloader():
    y_pred = best_model(x.to("cuda:0")).cpu().detach()
    y_pred_list.append(y_pred)
    y_list.append(y)
    AUC.update(torch.softmax(y_pred, dim=-1), torch.argmax(y, dim=1))

AUC_values = AUC.compute()
print(f"AUC: {AUC_values}")

In [None]:
y_pred = torch.softmax(torch.cat(y_pred_list), dim=-1)[:, 1].numpy()
y_true = torch.argmax(torch.cat(y_list), dim=1).numpy()

test_df = pd.DataFrame(
    {
        "y_pred": y_pred,
        "y_true": y_true,
    }
)

cal_binary_metrics(y_true, y_pred)

In [None]:
y_true

In [None]:
cal_binary_metrics(test_df["y_true"], test_df["y_pred"])