In [1]:
# -*- coding: utf-8 -*-
# 核心库
import numpy as np
import pandas as pd
import gc
import os

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Scikit-learn
from sklearn.model_selection import KFold

# 进度条
from tqdm.notebook import tqdm

# --- Global configuration ---
# Pick the compute device once: CUDA when it is available, otherwise the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
print(f"Using device: {DEVICE}")


def set_seed(seed=42):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Value passed to ``np.random.seed`` and ``torch.manual_seed``
            and, when CUDA is available, to ``torch.cuda.manual_seed_all``.
    """
    cuda_present = torch.cuda.is_available()
    torch.manual_seed(seed)
    np.random.seed(seed)
    if not cuda_present:
        return
    torch.cuda.manual_seed_all(seed)


# Seed once at import time so the cells below start from a known state.
set_seed(42)

Using device: cuda


In [None]:
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import issparse

# --- Data paths ---
# Files produced by the earlier data-cleaning step.
PATH_TRAIN_INP = "train_multi_inputs.h5ad"
PATH_TRAIN_TGT = "train_multi_targets.h5ad"
PATH_TEST_INP  = "test_multi_inputs.h5ad"  # assumes the test set was pre-processed the same way
PATH_META      = "metadata.csv"

# Load the pre-processed training and test data.
print("Loading pre-processed .h5ad files...")
adata_train_inp = ad.read_h5ad(PATH_TRAIN_INP)
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)

# --- Key step: data alignment ---
# Inputs and targets must list the cells in exactly the same order.
# Index.equals returns False on a length mismatch, whereas an element-wise
# `==` raises before the assertion message can be shown.
assert adata_train_inp.obs_names.equals(adata_train_tgt.obs_names), "Error: Train inputs and targets have different cell orders!"

# The same model is applied to train and test inputs, so their feature counts
# must agree; fail here rather than after hours of training.
assert adata_train_inp.X.shape[1] == adata_test_inp.X.shape[1], "Error: Train and test inputs have different numbers of features!"

# Load the metadata and reorder it to follow the training cells.
meta_df = pd.read_csv(PATH_META).set_index("cell_id")
meta_df = meta_df.loc[adata_train_inp.obs_names]

# Keep the inputs sparse to avoid a MemoryError, but make sure they are CSR:
# the dataset and the K-fold split index them by row, which CSR supports
# efficiently. tocsr() is a no-op when the matrix is already CSR.
train_multi_X = adata_train_inp.X.tocsr() if issparse(adata_train_inp.X) else adata_train_inp.X
train_multi_y = adata_train_tgt.X.toarray() if issparse(adata_train_tgt.X) else adata_train_tgt.X

# Test-set inputs and an index-only frame that remembers the test cell ids.
test_multi_X = adata_test_inp.X.tocsr() if issparse(adata_test_inp.X) else adata_test_inp.X
test_df = pd.DataFrame(index=adata_test_inp.obs_names)

# Print the matrix shapes as a sanity check.
print("\nData shapes for model training:")
print(f"train_multi_X: {train_multi_X.shape}")
print(f"train_multi_y: {train_multi_y.shape}")
print(f"test_multi_X: {test_multi_X.shape} (placeholder data)")

# Free the AnnData containers; the matrices and ids needed later were
# extracted above (the submission cell re-reads the test file when needed).
del adata_train_inp, adata_train_tgt, adata_test_inp
gc.collect()

Loading pre-processed .h5ad files...

Data shapes for model training:
train_multi_X: (100646, 160259)
train_multi_y: (100646, 23418)
test_multi_X: (55935, 160259) (placeholder data)


0

In [2]:
# --- 评估与数据处理工具 ---

def correlation_score(y_true, y_pred):
    """Return the mean row-wise Pearson correlation between two 2-D arrays.

    Args:
        y_true: Reference values, indexable by row, shape (n_rows, n_cols).
        y_pred: Predictions with exactly the same shape as ``y_true``.

    Returns:
        The average over rows of ``np.corrcoef(y_true[i], y_pred[i])[1, 0]``.
        A row for which ``np.corrcoef`` yields NaN (e.g. a constant row)
        makes the returned average NaN as well.

    Raises:
        ValueError: If the two inputs differ in shape or contain no rows.
    """
    true_shape, pred_shape = np.shape(y_true), np.shape(y_pred)
    if true_shape != pred_shape:
        # Without this check extra rows of y_pred would be silently ignored.
        raise ValueError(
            f"Shape mismatch: y_true has shape {true_shape}, y_pred has shape {pred_shape}."
        )

    n_rows = len(y_true)
    if n_rows == 0:
        raise ValueError("correlation_score needs at least one row.")

    corrsum = 0
    for i in range(n_rows):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / n_rows

def zscore(x):
    """Standardize every row of a 2-D array to zero mean and unit variance.

    Vectorized over rows, so no per-row Python loop and no intermediate list
    of row arrays is built before the result is assembled.

    Args:
        x: 2-D array-like of shape (n_rows, n_cols). It is not modified.

    Returns:
        A new array of the same shape holding ``(row - mean) / std`` for each
        row. Rows whose standard deviation is exactly 0 are only centred
        (no division), which avoids dividing by zero.
    """
    x = np.asarray(x)
    row_mean = x.mean(axis=1, keepdims=True)
    row_std = x.std(axis=1, keepdims=True)
    # Dividing by 1 leaves constant rows at their centred value.
    row_std[row_std == 0] = 1

    standardized = x - row_mean
    standardized /= row_std  # in place: avoids a second full-size temporary
    return standardized

def cosine_similarity_loss(y_true, y_pred):
    """Negative mean row-wise correlation loss built from PyTorch ops.

    Each row of both inputs is mean-centred and scaled to unit L2 norm; the
    cosine similarity of the resulting rows is averaged over the batch and
    negated, so a lower loss means a higher correlation.

    Args:
        y_true: Tensor of shape (batch, n_targets) with reference values.
        y_pred: Tensor of the same shape with predictions.

    Returns:
        A scalar tensor: minus the batch mean of the per-row similarity.
    """
    F = torch.nn.functional

    def _centered_unit_rows(batch):
        # Subtract the per-row mean, then L2-normalize along the target axis.
        centered = batch - batch.mean(dim=1, keepdim=True)
        return F.normalize(centered, p=2, dim=1)

    per_row_similarity = F.cosine_similarity(
        _centered_unit_rows(y_true), _centered_unit_rows(y_pred), dim=1
    )
    # Aggregate over the batch and flip the sign to turn similarity into a loss.
    return -per_row_similarity.mean()

In [None]:
from scipy.sparse import issparse # Make sure issparse is imported

class SingleCellDataset(Dataset):
    """Map-style dataset over a row-indexable feature matrix and optional targets.

    Features (and targets) may be dense arrays or SciPy sparse matrices; a
    sparse row is densified only when that sample is requested.

    Args:
        features: Matrix with one sample per row; must expose ``.shape`` and
            support ``features[idx]``.
        targets: Optional matrix aligned row-by-row with ``features``. When it
            is ``None`` the dataset yields features only.
    """

    def __init__(self, features, targets=None):
        self.features = features
        self.targets = targets
        self.is_train = targets is not None

    def __len__(self):
        # .shape[0] works for sparse matrices as well as dense arrays.
        return self.features.shape[0]

    @staticmethod
    def _to_dense_row(row):
        """Densify a sparse 1 x n row into a 1-D array; pass other values through."""
        if issparse(row):
            # ravel() instead of squeeze(): squeeze would collapse a 1 x 1 row
            # to a 0-d array and drop the feature axis.
            return row.toarray().ravel()
        return row

    def __getitem__(self, idx):
        """Return ``(feature, target)`` float32 tensors, or just ``feature`` without targets."""
        feature = torch.tensor(self._to_dense_row(self.features[idx]), dtype=torch.float32)

        if self.is_train:
            target = torch.tensor(self._to_dense_row(self.targets[idx]), dtype=torch.float32)
            return feature, target
        return feature

In [None]:
class TabularTransformer(nn.Module):
    """Transformer encoder applied to a learned "sequence" view of a flat feature vector.

    The input vector is linearly projected to ``seq_len * d_model`` values,
    reshaped into ``seq_len`` tokens of width ``d_model``, prefixed with a
    learnable CLS token, shifted by learnable positional embeddings and
    encoded. The encoded CLS token is mapped to ``num_targets`` outputs by a
    small MLP head.
    """

    def __init__(self, num_features, num_targets, seq_len=16, d_model=256, nhead=8, num_layers=3, dim_feedforward=512, dropout=0.1):
        """
        Args:
            num_features: Width of the raw input vector.
            num_targets: Number of values predicted per sample.
            seq_len: Number of tokens the projected input is reshaped into.
            d_model: Token width inside the transformer (must be divisible by nhead).
            nhead: Number of attention heads per encoder layer.
            num_layers: Number of stacked encoder layers.
            dim_feedforward: Hidden width of each encoder layer's feed-forward block.
            dropout: Dropout rate used in the encoder layers and the MLP head.
        """
        super().__init__()

        self.seq_len = seq_len
        self.d_model = d_model

        # Projection from the raw feature space to a flat buffer that forward()
        # reshapes into seq_len tokens.
        self.projector = nn.Linear(num_features, seq_len * d_model)

        # Learnable CLS token (BERT-style) used to aggregate the whole sequence.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))

        # Learnable positional embeddings: one per token plus one for CLS.
        self.pos_encoder = nn.Parameter(torch.randn(1, seq_len + 1, d_model))

        # Standard transformer encoder stack operating on (batch, seq, dim) tensors.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

        # Prediction head: maps the encoded CLS token to num_targets outputs.
        hidden_width = d_model // 2
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, hidden_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_width, num_targets),
        )

    def forward(self, x):
        """Map a (batch_size, num_features) tensor to (batch_size, num_targets)."""
        # Project, then view the flat projection as a token sequence:
        # (batch, seq_len * d_model) -> (batch, seq_len, d_model).
        tokens = self.projector(x).reshape(-1, self.seq_len, self.d_model)

        # Prepend one CLS token per sample -> (batch, seq_len + 1, d_model).
        cls_tokens = self.cls_token.expand(tokens.size(0), -1, -1)
        sequence = torch.cat((cls_tokens, tokens), dim=1)

        # Add the positional embeddings (broadcast over the batch axis).
        sequence = sequence + self.pos_encoder

        encoded = self.transformer_encoder(sequence)

        # Only the CLS position feeds the prediction head.
        return self.mlp_head(encoded[:, 0])

In [None]:
from tqdm.notebook import tqdm # 引入tqdm来显示进度条

def train_and_evaluate(
    model_class,
    train_X,
    train_y,
    test_X,
    folds,
    model_params,
    train_params,
    loss_fn
):
    """Run K-fold training and return out-of-fold and fold-averaged test predictions.

    For every split from ``folds.split(train_X)`` a fresh model is trained with
    AdamW and ReduceLROnPlateau. The weights with the lowest validation loss
    are checkpointed to ``transformer_model_fold_<k>.pth`` and reloaded to
    predict the validation rows and the test rows.

    Args:
        model_class: Callable that builds the model from ``**model_params``.
        train_X: Training features; must expose ``.shape`` and support row
            selection with an index array.
        train_y: Array of training targets with one row per training sample.
        test_X: Test features with the same number of columns as ``train_X``.
        folds: Splitter exposing ``split(X)`` and ``get_n_splits()``.
        model_params: Keyword arguments forwarded to ``model_class``.
        train_params: Dict with the keys ``'batch_size'``, ``'epochs'`` and ``'lr'``.
        loss_fn: Callable invoked as ``loss_fn(targets, predictions)``.

    Returns:
        ``(oof_preds, sub_preds)``: float32 arrays holding the out-of-fold
        predictions (shaped like ``train_y``) and the test predictions
        averaged over all folds.
    """
    oof_preds = np.zeros_like(train_y, dtype=np.float32)
    sub_preds = np.zeros((test_X.shape[0], train_y.shape[1]), dtype=np.float32)
    n_splits = folds.get_n_splits()

    test_dataset = SingleCellDataset(test_X)
    test_loader = DataLoader(test_dataset, batch_size=train_params['batch_size'] * 2, shuffle=False)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_X)):
        print(f"\n===== Fold {n_fold+1} =====")

        # Split the data for this fold.
        X_train, y_train = train_X[train_idx], train_y[train_idx]
        X_valid, y_valid = train_X[valid_idx], train_y[valid_idx]

        train_dataset = SingleCellDataset(X_train, y_train)
        valid_dataset = SingleCellDataset(X_valid, y_valid)

        train_loader = DataLoader(train_dataset, batch_size=train_params['batch_size'], shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=train_params['batch_size'] * 2, shuffle=False)

        # Fresh model, optimizer and scheduler for every fold.
        model = model_class(**model_params).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=train_params['lr'])
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=6)

        checkpoint_path = f"transformer_model_fold_{n_fold+1}.pth"
        best_val_loss = float('inf')
        # Tracks whether THIS run wrote the checkpoint, so a file left over
        # from an earlier run is never loaded by mistake.
        checkpoint_saved = False

        for epoch in range(train_params['epochs']):
            model.train()
            for features, targets in train_loader:
                features, targets = features.to(DEVICE), targets.to(DEVICE)

                optimizer.zero_grad()
                predictions = model(features)
                loss = loss_fn(targets, predictions)
                loss.backward()
                optimizer.step()

            model.eval()
            val_loss = 0
            with torch.no_grad():
                for features, targets in valid_loader:
                    features, targets = features.to(DEVICE), targets.to(DEVICE)
                    predictions = model(features)
                    loss = loss_fn(targets, predictions)
                    # Weight by batch size so the last, smaller batch is not over-counted.
                    val_loss += loss.item() * len(targets)

            val_loss /= len(valid_dataset)
            scheduler.step(val_loss)

            # A NaN loss compares False here, so it never overwrites the best checkpoint.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_saved = True

            # Log every 10th epoch and always the final one, so short runs
            # (fewer than 10 epochs) still report a validation loss.
            if (epoch + 1) % 10 == 0 or epoch + 1 == train_params['epochs']:
                print(f"Epoch {epoch+1}/{train_params['epochs']}, Val Loss: {val_loss:.4f}")

        # Reload the best weights for prediction, if any were saved this fold.
        if checkpoint_saved:
            model.load_state_dict(torch.load(checkpoint_path))
        else:
            print(f"Warning: no checkpoint was saved for fold {n_fold+1}; predicting with the final weights.")
        model.eval()

        # Out-of-fold predictions for the validation rows.
        val_preds_list = []
        with torch.no_grad():
            for features, _ in valid_loader:
                features = features.to(DEVICE)
                predictions = model(features)
                val_preds_list.append(predictions.cpu().numpy())
        oof_preds[valid_idx] = np.concatenate(val_preds_list)

        # Memory-efficient test prediction: each batch is added straight into
        # its slice of sub_preds instead of collecting all batches in a list.
        print("Predicting on test set (memory-efficiently)...")
        start_idx = 0
        with torch.no_grad():
            for features in tqdm(test_loader, desc=f"Test Prediction Fold {n_fold+1}"):
                features = features.to(DEVICE)
                predictions_np = model(features).cpu().numpy()

                batch_size = predictions_np.shape[0]
                end_idx = start_idx + batch_size

                # Dividing by the number of folds makes the running sum an average.
                sub_preds[start_idx:end_idx, :] += predictions_np / n_splits

                start_idx = end_idx

        del model, X_train, y_train, X_valid, y_valid
        gc.collect()
        torch.cuda.empty_cache()

    cv_score = correlation_score(train_y, oof_preds)
    print(f"\nOverall CV Pearson Score: {cv_score:.4f}")

    return oof_preds, sub_preds

In [None]:
# --- Model 1: correlation-loss model ---
print("\n--- Training Model 1 (Cosine Similarity Loss) ---")

set_seed(1024)

# Architecture settings; input/output widths are taken from the loaded data.
MODEL1_PARAMS = dict(
    num_features=train_multi_X.shape[1],
    num_targets=train_multi_y.shape[1],
    seq_len=16,            # 32 is another value worth trying
    d_model=256,           # token width
    nhead=16,              # attention heads
    num_layers=3,          # encoder depth
    dim_feedforward=512,   # feed-forward width
    dropout=0.15,          # slightly raised to go with the larger model
)

# Optimisation settings.
TRAIN1_PARAMS = dict(batch_size=512, epochs=3, lr=1e-3)

folds = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds_cos, sub_preds_cos = train_and_evaluate(
    model_class=TabularTransformer,
    train_X=train_multi_X,
    train_y=train_multi_y,
    test_X=test_multi_X,
    folds=folds,
    model_params=MODEL1_PARAMS,
    train_params=TRAIN1_PARAMS,
    loss_fn=cosine_similarity_loss,
)

# Persist both prediction arrays so the ensembling cell can reload them.
OUTPUT_PATH1, OUTPUT_PATH2 = 'oof_preds_cos_multi.npy', 'sub_preds_cos_multi.npy'
print("\n--- Saving predictions from Model 1 to disk ---")
np.save(OUTPUT_PATH1, oof_preds_cos)
np.save(OUTPUT_PATH2, sub_preds_cos)
print(f"Saved {OUTPUT_PATH1} and {OUTPUT_PATH2} successfully.")


--- Training Model 1 (Cosine Similarity Loss) ---

===== Fold 1 =====
Epoch 1/3, Val Loss: -0.6428
Epoch 2/3, Val Loss: -0.6430
Epoch 3/3, Val Loss: -0.6492

===== Fold 2 =====
Epoch 1/3, Val Loss: -0.6421
Epoch 2/3, Val Loss: -0.6423
Epoch 3/3, Val Loss: -0.6525

===== Fold 3 =====
Epoch 1/3, Val Loss: -0.6423
Epoch 2/3, Val Loss: -0.6425
Epoch 3/3, Val Loss: -0.6510

===== Fold 4 =====
Epoch 1/3, Val Loss: -0.6422
Epoch 2/3, Val Loss: -0.6424
Epoch 3/3, Val Loss: -0.6518

===== Fold 5 =====
Epoch 1/3, Val Loss: -0.6427
Epoch 2/3, Val Loss: -0.6429
Epoch 3/3, Val Loss: -0.6530

Overall CV Pearson Score: 0.6515

--- Saving predictions from Model 1 to disk ---
Saved 'oof_preds_cos.npy' and 'sub_preds_cos.npy' successfully.


In [None]:
# --- Model 2: MSE-loss model (targets z-scored per row) ---
print("\n--- Training Model 2 (MSE Loss with Z-scored Targets) ---")

# Key step: standardize every target row before fitting with MSE.
train_multi_y_zscored = zscore(train_multi_y)

set_seed(2048)
MODEL2_PARAMS = {
    'num_features': train_multi_X.shape[1],
    # Derived from the data (as for Model 1) instead of a hard-coded 23418.
    'num_targets': train_multi_y_zscored.shape[1],
    'seq_len': 20, # other sequence lengths are worth trying
    'd_model': 240, # d_model must be divisible by nhead
    'nhead': 8,
    'num_layers': 4,
    'dim_feedforward': 600,
    'dropout': 0.15,
}

TRAIN2_PARAMS = {
    'batch_size': 512,
    'epochs': 10,
    'lr': 8e-4,
}

folds = KFold(n_splits=5, shuffle=True, random_state=1337)

oof_preds_mse, sub_preds_mse = train_and_evaluate(
    TabularTransformer,
    train_multi_X,
    train_multi_y_zscored,
    test_multi_X,
    folds,
    MODEL2_PARAMS,
    TRAIN2_PARAMS,
    nn.MSELoss() # plain mean-squared-error loss
)

# Persist both prediction arrays so the ensembling cell can reload them.
OUTPUT_PATH3 = 'oof_preds_mse_multi.npy'
OUTPUT_PATH4 = 'sub_preds_mse_multi.npy'
print("\n--- Saving predictions from Model 2 to disk ---")
np.save(OUTPUT_PATH3, oof_preds_mse)
np.save(OUTPUT_PATH4, sub_preds_mse)
print(f"Saved {OUTPUT_PATH3} and {OUTPUT_PATH4} successfully.")


--- Training Model 2 (MSE Loss with Z-scored Targets) ---

===== Fold 1 =====
Epoch 1/10, Val Loss: 0.5880
Epoch 2/10, Val Loss: 0.5863
Epoch 3/10, Val Loss: 0.5725
Epoch 4/10, Val Loss: 0.5661
Epoch 5/10, Val Loss: 0.5638
Epoch 6/10, Val Loss: 0.5618
Epoch 7/10, Val Loss: 0.5603
Epoch 8/10, Val Loss: 0.5593
Epoch 9/10, Val Loss: 0.5582
Epoch 10/10, Val Loss: 0.5578


MemoryError: Unable to allocate 4.88 GiB for an array with shape (55935, 23418) and data type float32

In [None]:
# --- Final section: ensembling, blending and submission ---
import anndata as ad

# Prediction files written by the two training cells.
OUTPUT_PATH1, OUTPUT_PATH2 = 'oof_preds_cos_multi.npy', 'sub_preds_cos_multi.npy'
OUTPUT_PATH3, OUTPUT_PATH4 = 'oof_preds_mse_multi.npy', 'sub_preds_mse_multi.npy'

# --- 1. Ensemble the models on a z-score scale ---
print("--- Step 1: Ensembling predictions using z-score ---")
oof_preds_cos, sub_preds_cos, oof_preds_mse, sub_preds_mse = map(
    np.load, (OUTPUT_PATH1, OUTPUT_PATH2, OUTPUT_PATH3, OUTPUT_PATH4)
)

# a) z-score the out-of-fold predictions of both models to put them on one scale
oof_preds_cos_z, oof_preds_mse_z = zscore(oof_preds_cos), zscore(oof_preds_mse)

# b) z-score the test-set predictions of both models the same way
sub_preds_cos_z, sub_preds_mse_z = zscore(sub_preds_cos), zscore(sub_preds_mse)

--- Step 1: Ensembling predictions using z-score ---


NameError: name 'issparse' is not defined

In [None]:
from scipy.sparse import issparse
# c) Weighted average of the two z-scored models.
# Reload the training targets to score the blended out-of-fold predictions.
PATH_TRAIN_TGT = "train_multi_targets.h5ad"
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
train_multi_y = adata_train_tgt.X.toarray() if issparse(adata_train_tgt.X) else adata_train_tgt.X

# Blend weights live in one place so the OOF blend and the test blend cannot
# drift apart when the weights are tuned.
WEIGHT_COS = 0.55
WEIGHT_MSE = 0.45

# Guard against silent broadcasting or misaligned rows before blending.
assert oof_preds_cos_z.shape == oof_preds_mse_z.shape == train_multi_y.shape, "OOF predictions and training targets must have identical shapes."
oof_preds_ensembled = oof_preds_cos_z * WEIGHT_COS + oof_preds_mse_z * WEIGHT_MSE
cv_score = correlation_score(train_multi_y, oof_preds_ensembled)
print(f"Blended OOF CV Score: {cv_score:.4f}")

assert sub_preds_cos_z.shape == sub_preds_mse_z.shape, "Test predictions of the two models must have identical shapes."
sub_preds_ensembled = sub_preds_cos_z * WEIGHT_COS + sub_preds_mse_z * WEIGHT_MSE

# --- 2. Build the final submission file (safe-merge variant) ---
print("\n--- Step 2: Generating final submission file (safe merge method) ---")

# a) Column labels for the prediction table.
# NOTE(review): this reads the whole HDF table only to get its column labels;
# `adata_train_tgt.var_names` may hold the same ids - confirm before switching.
protein_ids = pd.read_hdf('train_multi_targets.h5').columns
assert len(protein_ids) == sub_preds_ensembled.shape[1], "Number of target ids does not match the number of predicted columns."


Blended OOF CV Score: 0.6515

--- Step 2: Generating final submission file (safe merge method) ---


In [None]:
# Build the wide prediction table only when it is not already in the notebook
# namespace. A name lookup is used because truth-testing a DataFrame raises
# ValueError, and `continue` is only legal inside a loop.
if 'pred_df' not in globals():
    PATH_TEST_INP  = "test_multi_inputs.h5ad"
    adata_test_inp = ad.read_h5ad(PATH_TEST_INP)
    test_df = pd.DataFrame(index=adata_test_inp.obs_names)
    pred_df = pd.DataFrame(sub_preds_ensembled, index=test_df.index, columns=protein_ids)

# b) Load the official id table (columns used below: row_id, cell_id, gene_id).
evaluation_ids = pd.read_csv('evaluation_ids.csv')

# c) Look each requested (cell_id, gene_id) pair up directly in the wide table.
# Melting the table into long format would create one row per matrix entry
# (cells x genes), which does not fit in memory for a matrix of this size.
# get_indexer returns -1 for ids missing from the table; it requires the
# table's index and columns to be unique.
cell_pos = pred_df.index.get_indexer(evaluation_ids['cell_id'])
gene_pos = pred_df.columns.get_indexer(evaluation_ids['gene_id'])
matched = (cell_pos >= 0) & (gene_pos >= 0)

pred_values = pred_df.to_numpy()
# Unmatched rows stay NaN (predictions are floating point, so NaN is representable).
target = np.full(len(evaluation_ids), np.nan, dtype=pred_values.dtype)
target[matched] = pred_values[cell_pos[matched], gene_pos[matched]]
print(f"Matched {int(matched.sum())} of {len(evaluation_ids)} evaluation rows to predictions.")

# d) Assemble the submission in the row order of the official id table.
final_submission = pd.DataFrame({'row_id': evaluation_ids['row_id'], 'target': target})

# Fill rows without a prediction (e.g. ids that belong to another modality).
if final_submission['target'].isnull().any():
    print("Warning: Some rows could not be matched. Filling NaN with 0.")
    final_submission['target'] = final_submission['target'].fillna(0)

final_submission.to_csv('submission_multi.csv', index=False)
print("\nSubmission file 'submission_multi.csv' created successfully.")
print(final_submission.head())
print(final_submission.head())