In [74]:
# --- 导入与设置 ---
import numpy as np
import pandas as pd
import gc
import os

# PyTorch and Dataloader
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
# Scikit-learn
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# AnnData for loading
import anndata as ad
from scipy.sparse import issparse

# --- 全局配置 ---
name = 'multi' # 本次处理的数据集
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

def neg_correlation_score(y_pred: torch.Tensor, y_true: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """
    Row-wise Pearson correlation (y_true vs y_pred), then return negative mean.
    Shapes: [batch, dim] (e.g., dim = #targets per sample)
    """
    # center per row
    x = y_true - y_true.mean(dim=1, keepdim=True)
    y = y_pred - y_pred.mean(dim=1, keepdim=True)

    # numerator & denominator
    num = (x * y).sum(dim=1)                               # [batch]
    den = torch.sqrt((x.square().sum(dim=1) + eps) *
                     (y.square().sum(dim=1) + eps))        # [batch]

    r = num / den                                          # row-wise corr
    return -r.mean()                                       # negative average correlation

def zscore_col(x):
    """Applies z-score normalization to each COLUMN of a dense numpy array."""
    mean = np.mean(x, axis=0, keepdims=True)
    std = np.std(x, axis=0, keepdims=True)
    std[std == 0] = 1 # Avoid division by zero
    return (x - mean) / std

Using device: cuda


In [75]:
# --- Cell2: Data Reduction ---
# --- Cell2: Data Reduction (No SVD) ---
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import issparse
import gc
# TruncatedSVD is no longer needed
from sklearn.preprocessing import normalize

# --- Helper function for sparse matrix row-wise scaling ---
def scal(X):
    """Scales a sparse matrix by the median of non-zero values in each row."""
    X_scaled = X.copy().astype(np.float32)
    for i in range(X_scaled.shape[0]):
        row_data = X_scaled[i].data
        if len(row_data) > 0:
            median = np.median(row_data)
            if median > 0:
                X_scaled[i].data /= median
    return X_scaled

# --- a) Load sparse data from .h5ad files ---
print("Loading sparse data from .h5ad files...")

PATH_TRAIN_INP = f"train_{name}_inputs.h5ad"
PATH_TRAIN_TGT = f"train_{name}_targets.h5ad"
PATH_TEST_INP  = f"test_{name}_inputs.h5ad"

adata_train_inp = ad.read_h5ad(PATH_TRAIN_INP)
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)

train_X_sparse = adata_train_inp.X
test_X_sparse = adata_test_inp.X
train_y_sparse = adata_train_tgt.X

print(f"Initial sparse train_X shape: {train_X_sparse.shape}")
print(f"Initial sparse train_y shape: {train_y_sparse.shape}")

# --- Input Matrix Preprocessing (train_X, test_X) ---
print("\n--- Preprocessing Input Matrices (X)... ---")
# 1. Divide by row-wise non-zero median
print("Step 1: Scaling train_X and test_X by row-wise non-zero median...")
train_X_scaled = scal(train_X_sparse)
test_X_scaled = scal(test_X_sparse)
train_X = train_X_scaled.toarray()
test_X = test_X_scaled.toarray()

# 2. Convert target matrix to dense and scale by row-wise non-zero median
print("Step 2: Converting target to dense and scaling by row-wise non-zero median...")
train_y = train_y_sparse.toarray() if issparse(train_y_sparse) else train_y_sparse
for i in range(train_y.shape[0]):
    row = train_y[i, :]
    nonzero_values = row[row > 0]
    if len(nonzero_values) > 0:
        nonzero_median = np.median(nonzero_values)
        if nonzero_median > 0:
            train_y[i, :] /= nonzero_median

# 3. log1p transform for target
print("Step 3: Applying log1p transformation...")
train_y = np.maximum(train_y, 0)
train_y = np.log1p(train_y)

# 4. Row-wise L2 normalization for target
print("Step 4: Applying row-wise L2 normalization...")
train_y = normalize(train_y, norm='l2', axis=1)

# 5. Clean up unused variables from memory
print("\nStep 9: Cleaning up memory...")
del adata_train_inp, adata_train_tgt, adata_test_inp
del train_X_sparse, test_X_sparse, train_y_sparse
del train_X_scaled, test_X_scaled
# del zero_mask, y_reconstructed # These variables are no longer created
gc.collect()

# --- Final Check ---
print("\nPreprocessing complete. Final shapes:")
print(f"train_X: {train_X.shape}")
print(f"train_y: {train_y.shape}") # The target is now the full-dimensional train_y
print(f"test_X: {test_X.shape}")


Loading sparse data from .h5ad files...
Initial sparse train_X shape: (100646, 160259)
Initial sparse train_y shape: (100646, 23418)

--- Preprocessing Input Matrices (X)... ---
Step 1: Scaling train_X and test_X by row-wise non-zero median...


MemoryError: Unable to allocate 33.4 GiB for an array with shape (55935, 160259) and data type float32

In [None]:
# --- Cell 3: Dataset Class ---

class Dataset(Dataset):
    def __init__(self, features, targets=None):
        self.features = features
        self.target = targets
        self.is_train = targets is not None

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        feature = self.features[idx]
        if issparse(feature):
            feature = feature.toarray().squeeze()
        feature = torch.tensor(feature, dtype=torch.float32)
        
        if self.is_train:
            target = torch.tensor(self.target[idx], dtype=torch.float32)
            return feature, target
        else:
            return feature

In [None]:
# --- Cell 4 : Model Architecture ---

import torch
import torch.nn as nn
import torch.nn.functional as F

class VAETransformer(nn.Module):
    """
    一个集成了 VAE 和 Transformer 的模型。
    工作流程:
    1. VAE Encoder: (batch, num_features) -> mu, log_var (batch, latent_dim)
    2. Reparameterization Trick: 从 mu, log_var 采样 -> z (batch, latent_dim)
    3. Transformer: 将 z 视为长度为1的序列进行处理 -> (batch, latent_dim)
    4. VAE Decoder: (batch, latent_dim) -> (batch, target_dim)
    """
    def __init__(self, num_features, target_dim, latent_dim=256, hidden_dim=1024,
                 nhead=8, num_encoder_layers=3, dim_feedforward=1024, dropout=0.1):
        """
        初始化 VAETransformer 模型。

        参数:
            num_features (int): 输入特征维度。
            target_dim (int): 输出目标维度
            latent_dim (int): VAE 的隐空间维度，同时也是 Transformer 的 d_model。
            hidden_dim (int): Encoder 和 Decoder 中间隐藏层的维度。
            nhead (int): Transformer 的头数。
            num_encoder_layers (int): Transformer Encoder 的层数。
            dim_feedforward (int): Transformer 前馈网络的维度。
            dropout (float): Dropout 比例。
        """
        super().__init__()
        self.latent_dim = latent_dim

        # --- 1. VAE Encoder ---
        self.encoder = nn.Sequential(
            nn.Linear(num_features, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        self.fc_mu = nn.Linear(hidden_dim // 2, latent_dim)
        self.fc_log_var = nn.Linear(hidden_dim // 2, latent_dim)

        # --- 2. Transformer Encoder ---
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=latent_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=num_encoder_layers
        )

        # --- 3. VAE Decoder ---
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, target_dim)
        )

    def reparameterize(self, mu, log_var):
        """
        重参数化技巧，使得采样过程可导。
        在评估模式下，直接返回 mu 以获得确定性输出。
        """
        if not self.training:
            return mu
        std = torch.exp(0.5 * log_var)
        eps = torch.randn_like(std)
        return mu + eps * std

    def forward(self, x):
        """
        前向传播。
        x 的形状: (batch_size, num_features)
        """
        # 编码
        h = self.encoder(x)
        mu = self.fc_mu(h)
        log_var = self.fc_log_var(h)

        # 采样隐向量 z
        z = self.reparameterize(mu, log_var)

        # Transformer 处理
        # 增加序列维度: (batch, latent_dim) -> (batch, 1, latent_dim)
        z_seq = z.unsqueeze(1)
        transformer_out = self.transformer_encoder(z_seq)
        # 去除序列维度: (batch, 1, latent_dim) -> (batch, latent_dim)
        processed_z = transformer_out.squeeze(1)

        # 解码
        reconstruction = self.decoder(processed_z)

        # 返回重构结果和分布参数（用于计算损失）
        return reconstruction, mu, log_var

In [None]:
# --- Cell 5: Loss function ---

class CombinedLoss(nn.Module):
    """
    组合损失函数
    """
    def __init__(self):
        super().__init__()
        self.mse_loss = nn.MSELoss()
        self.corr_loss = neg_correlation_score

    def forward(self, pred, true, mu, log_var):

        loss1 = self.mse_loss(pred, true)
        loss2 = self.corr_loss(pred, true)
        loss3 = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
        loss = loss1 + loss2 + loss3
        return loss

In [None]:
#--- Cell 6: Training function ---
def train_one_fold(
    train_loader,
    valid_loader,
    model,
    optimizer,
    loss_fn,
    train_params,
    n_fold,
    name
):
    """
    Trains the dual-model setup for a single cross-validation fold.
    """
    # Initialize variables to track the best model
    best_val_loss = float('inf')
    model_path = f"model_{name}_fold_{n_fold+1}.pth"

    # Main training loop for the specified number of epochs
    for epoch in range(train_params['epochs']):
        # --- Training Phase ---
        model.train()
        for features, targets in train_loader:
            # Move data to the GPU
            features, targets = features.to(DEVICE), targets.to(DEVICE)
            
            # Reset gradients
            optimizer.zero_grad()
            
            # Forward pass through the first model
            preds, mu, log_var = model(features)
            
            # Calculate the combined loss
            loss = loss_fn(preds, targets, mu, log_var)
            
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # --- Validation Phase ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in valid_loader:
                # Move data to the GPU
                features, targets = features.to(DEVICE), targets.to(DEVICE)
                
                # Forward pass
                preds, mu, log_var = model(features)

                # Calculate loss for the validation batch
                loss = loss_fn(preds, targets, mu, log_var)
                val_loss += loss.item() * len(features)
        
        val_loss /= len(valid_loader.dataset)
        
        # Check if the model has improved and save the best version
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Save the state dictionaries of both models
            torch.save(model.state_dict(), model_path)
            
        # Print progress periodically
        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{train_params['epochs']}, Val Loss: {val_loss:.4f}")
    
    print(f"Best validation loss for fold {n_fold+1}: {best_val_loss:.4f}")
    # Return the path to the best saved model for this fold
    return model_path

In [None]:
#--- Cell 7: Train and Validation
MODEL_PARAMS = { # Parameters for the GRUBackbone
    'num_features': train_X.shape[1], 
    'target_dim': train_y.shape[1],
    'latent_dim': 1024,
    'hidden_dim': 1024,
    'nhead': 16,
    'num_encoder_layers': 5,
    'dim_feedforward': 1024,
    'dropout': 0.1,
}
TRAIN_PARAMS = {'batch_size': 512, 'epochs': 50, 'lr': 5e-3, 'loss_alpha': 0.5}
folds = KFold(n_splits=5, shuffle=True, random_state=1337)

# --- Initialize Prediction Arrays ---
oof_preds = np.zeros_like(train_y, dtype=np.float32)
sub_preds = np.zeros((test_X.shape[0], train_y.shape[1]), dtype=np.float32)
test_loader = DataLoader(Dataset(test_X, np.zeros((test_X.shape[0], 128))), 
                         batch_size=TRAIN_PARAMS['batch_size'] * 2)


# --- Main CV Loop ---
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_X)):
    print(f"\n===== Processing Fold {n_fold+1}/{folds.get_n_splits()} =====")
    
    # 1. Slice data for the current fold
    X_train_fold, X_valid_fold = train_X[train_idx], train_X[valid_idx]
    y_full_train, y_full_valid = train_y[train_idx], train_y[valid_idx]
    
    # 2. Prepare DataLoaders, Models, Optimizer, and Loss for the fold
    train_dataset = Dataset(X_train_fold, y_full_train)
    valid_dataset = Dataset(X_valid_fold, y_full_valid)
    train_loader = DataLoader(train_dataset, batch_size=TRAIN_PARAMS['batch_size'], shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=TRAIN_PARAMS['batch_size'] * 2)
    
    model = VAETransformer(**MODEL_PARAMS).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=TRAIN_PARAMS['lr'])
    loss_fn = CombinedLoss().to(DEVICE)

    # 3. Call the training function for this fold
    best_model_path = train_one_fold(
        train_loader, valid_loader, model, optimizer, loss_fn,
        TRAIN_PARAMS, n_fold, f"{name}_multitask"
    )
    
    # 4. Load the best model and perform predictions
    best_model = torch.load(best_model_path)
    model.load_state_dict(best_model)
    model.eval()
    
    # 4a. OOF predictions
    val_preds_list = []
    with torch.no_grad():
        for features, _ in valid_loader:
            preds, _, _ = model(features.to(DEVICE))
            val_preds_list.append(preds.cpu().numpy())
    oof_preds[valid_idx] = np.concatenate(val_preds_list)

        # 4b. Test set predictions (accumulated)
    start_idx = 0
    with torch.no_grad():
        for features, _ in test_loader:
            preds, _, _ = model(features.to(DEVICE))
            preds = preds.detach().cpu().numpy()
            end_idx = start_idx + preds.shape[0]
            sub_preds[start_idx:end_idx, :] += preds / folds.get_n_splits()
            start_idx = end_idx
            
    # 5. Clean up memory for the next fold
    del model, X_train_fold, y_full_train, X_valid_fold, y_full_valid
    gc.collect()
    torch.cuda.empty_cache()


# --- save results ---
np.save(f'sub_preds_{name}.npy', sub_preds)
print('Prediction on test dataset had been saved, shape:', sub_preds.shape)
    



===== Processing Fold 1/5 =====
Epoch 5/50, Val Loss: 0.1611
Epoch 10/50, Val Loss: -0.5138
Epoch 15/50, Val Loss: -0.6513
Epoch 20/50, Val Loss: -0.7103
Epoch 25/50, Val Loss: -0.7284
Epoch 30/50, Val Loss: -0.7472
Epoch 35/50, Val Loss: -0.7452
Epoch 40/50, Val Loss: -0.7683
Epoch 45/50, Val Loss: 1.3538
Epoch 50/50, Val Loss: -0.6205
Best validation loss for fold 1: -0.7752

===== Processing Fold 2/5 =====
Epoch 5/50, Val Loss: 0.1947
Epoch 10/50, Val Loss: -0.5014
Epoch 15/50, Val Loss: -0.6128
Epoch 20/50, Val Loss: -0.6861
Epoch 25/50, Val Loss: -0.7151
Epoch 30/50, Val Loss: -0.7368
Epoch 35/50, Val Loss: -0.5325
Epoch 40/50, Val Loss: -0.7587
Epoch 45/50, Val Loss: -0.6991
Epoch 50/50, Val Loss: -0.7508
Best validation loss for fold 2: -0.7673

===== Processing Fold 3/5 =====
Epoch 5/50, Val Loss: 0.1571
Epoch 10/50, Val Loss: -0.4739
Epoch 15/50, Val Loss: -0.6272
Epoch 20/50, Val Loss: -0.6710
Epoch 25/50, Val Loss: -0.7131
Epoch 30/50, Val Loss: -0.7361
Epoch 35/50, Val Los