In [1]:
# --- 导入与设置 ---
import numpy as np
import pandas as pd
import gc
import os

# PyTorch and Dataloader
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
# Scikit-learn
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# AnnData for loading
import anndata as ad
from scipy.sparse import issparse

# --- Global configuration ---
# Prefer the GPU whenever PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

def neg_correlation_score(y_pred: torch.Tensor, y_true: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """
    Negative mean of the per-row Pearson correlation between y_true and y_pred.

    Both inputs are reduced along dim=1, so they are expected to be 2-D
    ([batch, dim]). `eps` is added to each row's sum of squares before the
    square root so that a constant row gives a correlation of ~0 instead of
    a division by zero. Returns a 0-dim tensor that is minimised when the
    rows are perfectly correlated.
    """
    # Remove each row's own mean.
    true_centered = y_true - y_true.mean(dim=1, keepdim=True)
    pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)

    # Per-row covariance term and the (eps-stabilised) energy of each side.
    covariance = torch.sum(true_centered * pred_centered, dim=1)
    true_energy = torch.sum(torch.square(true_centered), dim=1) + eps
    pred_energy = torch.sum(torch.square(pred_centered), dim=1) + eps

    row_corr = covariance / torch.sqrt(true_energy * pred_energy)
    # Flip the sign so that an optimiser minimising this maximises correlation.
    return torch.neg(row_corr.mean())

def zscore_col(x):
    """Standardise every COLUMN of a dense array to zero mean and unit variance.

    Columns whose standard deviation is exactly zero are only centred (their
    divisor is replaced by 1), so constant columns come out as all zeros.
    """
    centered = x - np.mean(x, axis=0, keepdims=True)
    scale = np.std(x, axis=0, keepdims=True)
    # Guard against dividing by zero for constant columns.
    scale[scale == 0] = 1
    return centered / scale

Using device: cuda


In [None]:
# --- Cell2a: Input Preprocessing (Features) ---
import joblib
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import numpy as np
import gc

# --- Parameters from the diagram ---
N_COMP_SVD = 128  # number of SVD components kept for each feature path

# We start with train_X_sparse and test_X_sparse from the data loading cell.
# They are used as-is: the dangling `train_X_sparse =` assignment that used to
# sit here was a syntax error and would have shadowed the loaded matrix.
print(f"Original sparse train_X shape: {train_X_sparse.shape}")
print(f"Original sparse test_X shape: {test_X_sparse.shape}")

# --- Path 1: TF-IDF Scaling + SVD ---
print("\n--- Processing Path 1: TF-IDF Scaling + SVD ---")
# 1a. Divide by row-wise non-zero median
# This is a memory-efficient way to perform this operation on a sparse matrix
def scale_by_row_nonzero_median(X_sparse):
    """Divide each row of a sparse matrix by the median of its non-zero entries.

    Parameters
    ----------
    X_sparse : scipy.sparse matrix
        Any sparse format that provides ``tocsr``. The input is never modified.

    Returns
    -------
    scipy.sparse CSR matrix
        A floating-point copy of the input in CSR format. Floating dtypes are
        preserved; every other dtype is promoted to float64 so that the
        division is possible. Rows with no non-zero entries, or whose
        non-zero median is not strictly positive, are returned unscaled.
    """
    # tocsr(copy=True) always yields a private CSR copy, so the in-place edits
    # below cannot leak into the caller's matrix.
    X_scaled = X_sparse.tocsr(copy=True)
    # Integer (e.g. raw count) data cannot be divided in place; promote first.
    if not np.issubdtype(X_scaled.dtype, np.floating):
        X_scaled = X_scaled.astype(np.float64)

    data = X_scaled.data
    indptr = X_scaled.indptr
    for i in range(X_scaled.shape[0]):
        # Basic slicing gives a view, so dividing it rewrites `data` directly
        # without the per-row getrow()/__setitem__ round trip.
        row_values = data[indptr[i]:indptr[i + 1]]
        # Explicitly stored zeros must not take part in a "non-zero" median.
        nonzero_values = row_values[row_values != 0]
        if nonzero_values.size == 0:
            # Empty row: nothing to scale (and np.median([]) would warn/NaN).
            continue
        nonzero_median = np.median(nonzero_values)
        if nonzero_median > 0:
            row_values /= nonzero_median
    return X_scaled

# Apply the row-wise median scaling to both splits with the same helper.
train_X_scaled, test_X_scaled = (
    scale_by_row_nonzero_median(split) for split in (train_X_sparse, test_X_sparse)
)
print("Row-wise scaling complete.")

# 1b. Truncated SVD on the scaled data: fit on train only, then project test
svd1 = TruncatedSVD(n_components=N_COMP_SVD, random_state=42)
train_X1 = svd1.fit_transform(train_X_scaled)
test_X1 = svd1.transform(test_X_scaled)
print(f"SVD for Path 1 complete. New shape: {train_X1.shape}")

# The scaled copies are no longer needed once they have been projected.
del train_X_scaled, test_X_scaled
gc.collect()

# --- Path 2: Binarization + SVD ---
print("\n--- Processing Path 2: Binarization + SVD ---")
# 2a. Replace every stored value by 1 (presence/absence encoding)
train_X_binary = train_X_sparse.copy()
train_X_binary.data.fill(1)
test_X_binary = test_X_sparse.copy()
test_X_binary.data.fill(1)
print("Binarization complete.")

# 2b. Truncated SVD on the binary data: fit on train only, then project test
svd2 = TruncatedSVD(n_components=N_COMP_SVD, random_state=42)
train_X2 = svd2.fit_transform(train_X_binary)
test_X2 = svd2.transform(test_X_binary)
print(f"SVD for Path 2 complete. New shape: {train_X2.shape}")

# The binary copies are no longer needed once they have been projected.
del train_X_binary, test_X_binary
gc.collect()

# --- Combination ---
# Place the two component blocks side by side to form the final feature matrix
print("\n--- Combining results from both paths ---")
train_X = np.hstack((train_X1, train_X2))
test_X = np.hstack((test_X1, test_X2))

print(f"Final train_X shape: {train_X.shape}")
print(f"Final test_X shape: {test_X.shape}")

# Final dimensionality reduction with SVD
# NOTE(review): this block reduces the TARGETS (it rebinds `train_y`) although
# it sits at the end of the feature-preprocessing cell, and it repeats Step 6
# of the target-preprocessing cell below. `N_COMP_FINAL` is only assigned in
# that later cell, and `train_y_processed` and `name` are not assigned anywhere
# in the visible notebook, so on a fresh kernel this block presumably raises
# NameError — TODO confirm where these names come from, or move this block
# (including the joblib save) into the target-preprocessing cell.
print("Step 6: Performing final dimensionality reduction with SVD...")
svd_final = TruncatedSVD(n_components=N_COMP_FINAL, random_state=42)
train_y = svd_final.fit_transform(train_y_processed) # Note: variable name was train_y_processed

# Save the fitted SVD model so that it can be reused during post-processing
SVD_PATH = f'svd_final_{name}.joblib'
joblib.dump(svd_final, SVD_PATH)
print(f"Saved final SVD model for post-processing to '{SVD_PATH}'")


print(f"\nFinal train_y shape after all preprocessing: {train_y.shape}")


Loading sparse data from .h5ad files...
Initial sparse train_X shape: (64074, 15435)

Selecting top 12000 most variable features...
Selected sparse train_X shape: (64074, 12000)
Starting dimension reduction
Reduced shapes: train (64074, 8000), test (48663, 8000)
Explained variance ratio (sum): 0.8963
Final dense train_X shape: (64074, 8000)


In [None]:
# --- New Cell 2: Target Preprocessing (Labels) ---
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
import numpy as np

# --- Parameters from the diagram ---
N_COMP_IMPUTE = 128 # SVD components for imputation
N_COMP_FINAL = 128  # Final SVD components

# We start with the dense train_y from the data loading cell.
# NOTE(review): every step below overwrites or rebinds `train_y`, so this cell
# is not idempotent — re-running it without reloading the data feeds already
# processed targets back into Step 1.
print(f"Original train_y shape: {train_y.shape}")

# --- Target Preprocessing Steps ---
# Step 1: Divide by row-wise non-zero median
# "Non-zero" is implemented as strictly positive (row > 0). Rows without any
# positive value are left untouched.
print("\nStep 1: Scaling by row-wise non-zero median...")
for i in range(train_y.shape[0]):
    row = train_y[i, :]
    nonzero_values = row[row > 0]
    if len(nonzero_values) > 0:
        nonzero_median = np.median(nonzero_values)
        if nonzero_median > 0:
            # In-place division: assumes train_y is a floating-point ndarray
            # (an integer array would reject this) — TODO confirm dtype.
            train_y[i, :] /= nonzero_median

# Step 2: Log1p transform
# log1p(0) == 0, so exact zeros stay zeros and can still be detected in Step 3.
print("Step 2: Applying log1p transformation...")
train_y = np.log1p(train_y)

# Step 3: Impute zeros using SVD-based method
# The mask is taken before the reconstruction so that only entries that were
# zero in the transformed data are replaced.
print("Step 3: Imputing zeros with SVD...")
zero_mask = train_y == 0
svd_impute = TruncatedSVD(n_components=N_COMP_IMPUTE, random_state=42)
# Project to N_COMP_IMPUTE components and map back: a low-rank reconstruction
# with the same shape as train_y.
y_reconstructed = svd_impute.fit_transform(train_y)
y_reconstructed = svd_impute.inverse_transform(y_reconstructed)
# Copy the imputed values only where the original data was zero
train_y[zero_mask] = y_reconstructed[zero_mask]

# Step 4: Row-wise normalization
# axis=1 scales each row (sample) to unit L2 norm.
print("Step 4: Applying row-wise L2 normalization...")
train_y = normalize(train_y, norm='l2', axis=1)

# Step 5: Subtract column-wise medians
print("Step 5: Subtracting column-wise medians...")
col_medians = np.median(train_y, axis=0)
train_y = train_y - col_medians
# IMPORTANT: save the column medians so they can be added back during post-processing
MEDIANS_PATH = 'target_col_medians.npy'
np.save(MEDIANS_PATH, col_medians)
print(f"Saved column medians for post-processing to '{MEDIANS_PATH}'")

# Step 6: Final dimensionality reduction with SVD
# After this step train_y holds N_COMP_FINAL components per row instead of the
# original target columns.
# NOTE(review): unlike the column medians, the fitted `svd_final` is not written
# to disk in this cell — presumably post-processing needs it to map predictions
# back; verify.
print("Step 6: Performing final dimensionality reduction with SVD...")
svd_final = TruncatedSVD(n_components=N_COMP_FINAL, random_state=42)
train_y = svd_final.fit_transform(train_y)

print(f"\nFinal train_y shape after all preprocessing: {train_y.shape}")

In [3]:
# --- Cell 3: PyTorch Dataset Class ---
class SingleCellDataset(Dataset):
    """Map-style dataset that serves one float32 row per index.

    `features` (and `targets`, when given) only need to support ``.shape[0]``
    and integer indexing; rows that come back as SciPy sparse matrices are
    densified on access. With targets, ``ds[i]`` is a ``(feature, target)``
    pair of tensors; without targets it is the feature tensor alone.
    """

    def __init__(self, features, targets=None):
        self.features = features
        self.targets = targets
        # No targets means inference mode: __getitem__ yields features only.
        self.is_train = targets is not None

    def __len__(self):
        n_rows = self.features.shape[0]
        return n_rows

    @staticmethod
    def _row_to_tensor(row):
        """Densify a sparse row if needed and wrap it in a float32 tensor."""
        dense = row.toarray().squeeze() if issparse(row) else row
        return torch.tensor(dense, dtype=torch.float32)

    def __getitem__(self, idx):
        feature = self._row_to_tensor(self.features[idx])
        if not self.is_train:
            return feature
        return feature, self._row_to_tensor(self.targets[idx])

In [4]:
# --- Cell 4 : ResNet Model Architecture ---
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """
    Pre-normalised residual block operating on vectors of width `size`.
    Layout: BatchNorm -> Linear -> Tanh -> Dropout -> BatchNorm -> Linear,
    after which the block input is added back (output = F(x) + x).
    """
    def __init__(self, size, dropout=0.1):
        super().__init__()
        # Attribute names and creation order are kept stable because they
        # determine the state_dict keys and the weight-initialisation order.
        self.norm1 = nn.BatchNorm1d(num_features=size)
        self.lin1 = nn.Linear(in_features=size, out_features=size)
        self.norm2 = nn.BatchNorm1d(num_features=size)
        self.lin2 = nn.Linear(in_features=size, out_features=size)
        self.activation = nn.Tanh()
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        # First half: normalise, project, squash, regularise.
        hidden = self.lin1(self.norm1(x))
        hidden = self.dropout(self.activation(hidden))
        # Second half: normalise and project, then add the skip connection.
        branch = self.lin2(self.norm2(hidden))
        return branch + x

class ResNetModel(nn.Module):
    """
    Deep MLP for tabular data built from a stack of residual blocks.

    Pipeline: input projection (Linear -> BatchNorm -> GELU) to `hidden_size`,
    then `num_blocks` ResidualBlock modules, then a linear head producing
    `num_targets` outputs per sample.
    """
    def __init__(self, num_features, num_targets, hidden_size=1500, num_blocks=3, dropout=0.1):
        super().__init__()

        # 1. Input layer: project the raw features to the hidden width.
        stem = [
            nn.Linear(num_features, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.GELU(),
        ]
        self.input_layer = nn.Sequential(*stem)

        # 2. Backbone: identical residual blocks applied one after another.
        blocks = []
        for _ in range(num_blocks):
            blocks.append(ResidualBlock(hidden_size, dropout))
        self.residual_blocks = nn.Sequential(*blocks)

        # 3. Prediction head: map the final representation to the targets.
        self.head = nn.Linear(hidden_size, num_targets)

    def forward(self, x):
        # x shape: (batch_size, num_features)
        return self.head(self.residual_blocks(self.input_layer(x)))

In [5]:
def train_one_fold(
    X_train, y_train, X_valid, y_valid,
    model_class, model_params, train_params, loss_fn,
    n_fold, name
):
    """Train one model on a single CV fold and checkpoint its best epoch.

    Parameters
    ----------
    X_train, y_train, X_valid, y_valid
        Feature/target containers for the fold; they are wrapped in
        SingleCellDataset, so they must support row indexing and ``.shape[0]``.
    model_class, model_params
        The model is built as ``model_class(**model_params)`` and moved to DEVICE.
    train_params : dict
        Required keys: 'batch_size', 'epochs', 'lr'. Optional key
        'print_every' (default 1; values below 1 are treated as 1) sets how
        many epochs are averaged per progress line.
    loss_fn : callable
        Called as ``loss_fn(predictions, targets)`` and must return a scalar
        tensor that is averaged over the batch.
    n_fold : int
        Zero-based fold index, used in the checkpoint name and log lines.
    name : str
        Tag used in the checkpoint file name.

    Returns
    -------
    str
        Path of the checkpoint (state_dict) written during this call. It holds
        the weights of the epoch with the lowest validation loss, or the final
        weights if no epoch produced a comparable (finite, improving) loss.
    """
    train_dataset = SingleCellDataset(X_train, y_train)
    valid_dataset = SingleCellDataset(X_valid, y_valid)
    train_loader = DataLoader(train_dataset, batch_size=train_params['batch_size'], shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=train_params['batch_size'] * 2, shuffle=False)

    model = model_class(**model_params).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=train_params['lr'])

    best_val_loss = float('inf')
    checkpoint_saved = False
    model_path = f"model_{name}_fold_{n_fold+1}.pth"

    # A missing key or a non-positive interval must not crash the run.
    print_every = max(1, int(train_params.get('print_every', 1)))
    window_val_sum = 0.0
    window_count = 0

    for epoch in range(train_params['epochs']):
        model.train()
        for features, targets in train_loader:
            features, targets = features.to(DEVICE), targets.to(DEVICE)
            optimizer.zero_grad()
            predictions = model(features)
            # (prediction, target) order: matches the (y_pred, y_true)
            # signature of the loss in this file and the PyTorch convention,
            # so asymmetric losses behave correctly too.
            loss = loss_fn(predictions, targets)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in valid_loader:
                features, targets = features.to(DEVICE), targets.to(DEVICE)
                predictions = model(features)
                loss = loss_fn(predictions, targets)
                # Weight by batch size so the final value is a per-sample mean
                # even when the last batch is smaller.
                val_loss += loss.item() * len(targets)
        val_loss /= len(valid_dataset)

        # Keep the weights of the best epoch so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True

        # Accumulate into the reporting window.
        window_val_sum += val_loss
        window_count += 1

        # Reporting interval reached: print the window mean and reset it.
        if (epoch + 1) % print_every == 0:
            avg_val = window_val_sum / window_count
            start_ep = epoch + 1 - window_count + 1
            end_ep = epoch + 1
            print(f"Epochs {start_ep}-{end_ep}/{train_params['epochs']}, "
                  f"Mean Val Loss (last {window_count}): {avg_val:.4f}")
            window_val_sum = 0.0
            window_count = 0

    # Report any trailing epochs that did not fill a complete window.
    if window_count > 0:
        avg_val = window_val_sum / window_count
        end_ep = train_params['epochs']
        start_ep = end_ep - window_count + 1
        print(f"Epochs {start_ep}-{end_ep}/{train_params['epochs']}, "
              f"Mean Val Loss (last {window_count}): {avg_val:.4f}")

    # If no epoch ever improved on +inf (zero epochs, or a NaN validation
    # loss), nothing was written above. Save the final weights so the returned
    # path never points at a missing file or a stale checkpoint of an earlier run.
    if not checkpoint_saved:
        print(f"Warning: no improving epoch for fold {n_fold+1}; saving final weights instead.")
        torch.save(model.state_dict(), model_path)

    print(f"Best validation loss for fold {n_fold+1}: {best_val_loss:.4f}")
    return model_path


In [6]:

# Column-standardise the target matrix before it is handed to the model
train_y_zscored = zscore_col(train_y)

MODEL_PARAMS = dict(
    num_features=train_X.shape[1],
    num_targets=train_y.shape[1],
    hidden_size=4096,  # hidden width of the ResNet
    num_blocks=4,      # number of residual blocks
    dropout=0.15,
)
TRAIN_PARAMS = dict(batch_size=512, epochs=50, lr=1e-2, print_every=10)
folds = KFold(n_splits=5, shuffle=True, random_state=1337)

# Out-of-fold predictions (one row per training sample) and the fold-averaged
# test predictions, both accumulated as float32.
oof_preds = np.zeros_like(train_y_zscored, dtype=np.float32)
sub_preds = np.zeros((test_X.shape[0], train_y.shape[1]), dtype=np.float32)
test_loader = DataLoader(SingleCellDataset(test_X), batch_size=TRAIN_PARAMS['batch_size'] * 2, shuffle=False)

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_X)):
    print(f"\n===== Processing Fold {n_fold+1}/{folds.get_n_splits()} for Model =====")
    X_train_fold = train_X[train_idx]
    y_train_fold = train_y_zscored[train_idx]
    X_valid_fold = train_X[valid_idx]
    y_valid_fold = train_y_zscored[valid_idx]

    best_model_path = train_one_fold(
        X_train_fold, y_train_fold, X_valid_fold, y_valid_fold,
        ResNetModel,
        MODEL_PARAMS,
        TRAIN_PARAMS,
        neg_correlation_score,
        n_fold,
        f"{name}_resnet_mse"
    )

    # Rebuild the network and restore the best checkpoint of this fold.
    model = ResNetModel(**MODEL_PARAMS).to(DEVICE)
    model.load_state_dict(torch.load(best_model_path))
    model.eval()

    # OOF predictions: only the features of each validation batch are used.
    with torch.no_grad():
        val_preds_list = [
            model(features.to(DEVICE)).cpu().numpy()
            for features, _ in DataLoader(
                SingleCellDataset(X_valid_fold, y_valid_fold),
                batch_size=TRAIN_PARAMS['batch_size']*2,
            )
        ]
    oof_preds[valid_idx] = np.concatenate(val_preds_list)

    # Test set predictions: each fold contributes 1/n_splits of the average.
    offset = 0
    with torch.no_grad():
        for features in test_loader:
            fold_pred = model(features.to(DEVICE)).cpu().numpy()
            stop = offset + fold_pred.shape[0]
            sub_preds[offset:stop, :] += fold_pred / folds.get_n_splits()
            offset = stop

    # Release the per-fold objects before the next model is built.
    del model, X_train_fold, y_train_fold, X_valid_fold, y_valid_fold
    gc.collect()
    torch.cuda.empty_cache()


np.save(f'sub_preds_{name}.npy', sub_preds)
print("\nModel training complete and predictions saved.")


===== Processing Fold 1/5 for Model =====
Epochs 1-10/50, Mean Val Loss (last 10): -0.3013
Epochs 11-20/50, Mean Val Loss (last 10): -0.2736
Epochs 21-30/50, Mean Val Loss (last 10): -0.2213
Epochs 31-40/50, Mean Val Loss (last 10): -0.2164
Epochs 41-50/50, Mean Val Loss (last 10): -0.2167
Best validation loss for fold 1: -0.3535

===== Processing Fold 2/5 for Model =====
Epochs 1-10/50, Mean Val Loss (last 10): -0.3042
Epochs 11-20/50, Mean Val Loss (last 10): -0.2800
Epochs 21-30/50, Mean Val Loss (last 10): -0.2233
Epochs 31-40/50, Mean Val Loss (last 10): -0.2188
Epochs 41-50/50, Mean Val Loss (last 10): -0.2187
Best validation loss for fold 2: -0.3559

===== Processing Fold 3/5 for Model =====
Epochs 1-10/50, Mean Val Loss (last 10): -0.3033
Epochs 11-20/50, Mean Val Loss (last 10): -0.2934
Epochs 21-30/50, Mean Val Loss (last 10): -0.2269
Epochs 31-40/50, Mean Val Loss (last 10): -0.2191
Epochs 41-50/50, Mean Val Loss (last 10): -0.2193
Best validation loss for fold 3: -0.3559

