In [None]:
# Core libraries
import numpy as np
import pandas as pd
import gc
import os

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# Scikit-learn
from sklearn.model_selection import KFold

# Progress bar
from tqdm.notebook import tqdm
# --- Global Configuration ---
# Dataset tag; it is interpolated into the checkpoint file names written during training.
name = 'cite'

# Device configuration: run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# Set random seed for reproducibility
def set_seed(seed=42):
    """Seed the NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to NumPy's global generator, to PyTorch,
            and (only when CUDA is available) to every CUDA device.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

Using device: cuda


In [None]:
import anndata as ad
import pandas as pd
import numpy as np
from scipy.sparse import issparse

# --- Data Paths ---
# Path to the files generated from our previous data cleaning steps
PATH_TRAIN_INP = "train_cite_inputs.h5ad"
PATH_TRAIN_TGT = "train_cite_targets.h5ad"
PATH_TEST_INP  = "test_cite_inputs.h5ad"
PATH_META      = "metadata.csv"

# Load the cleaned training data
print("Loading pre-processed .h5ad files...")
adata_train_inp = ad.read_h5ad(PATH_TRAIN_INP)
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)

# Ensure the cell order of inputs and targets is exactly the same.
# Index.equals() is used rather than an element-wise `==`: comparing two
# indexes of different lengths with `==` raises an unrelated ValueError,
# which would hide the message below.
assert adata_train_inp.obs_names.equals(adata_train_tgt.obs_names), \
    "Error: Train inputs and targets have different cell orders!"

# Load metadata and align it to the order of the training data.
# .loc raises KeyError if any training cell_id is missing from the metadata.
meta_df = pd.read_csv(PATH_META).set_index("cell_id")
meta_df = meta_df.loc[adata_train_inp.obs_names]

# --- Prepare final model input variables ---
# Extract feature and target matrices as NumPy arrays
# If the matrix is sparse (issparse), convert it to dense using .toarray()
print("Extracting data into NumPy arrays for training...")
train_cite_X = adata_train_inp.X.toarray() if issparse(adata_train_inp.X) else adata_train_inp.X
train_cite_y = adata_train_tgt.X.toarray() if issparse(adata_train_tgt.X) else adata_train_tgt.X

# Load test set data.
# The test matrix is kept in whatever format AnnData returned (it is not
# densified here); SingleCellDataset densifies sparse rows on access.
test_cite_X = adata_test_inp.X
test_df = pd.DataFrame(index=adata_test_inp.obs_names)

# Print data dimensions to confirm
print("\nData shapes for model training:")
print(f"train_cite_X: {train_cite_X.shape}")
print(f"train_cite_y: {train_cite_y.shape}")
print(f"test_cite_X: {test_cite_X.shape} (placeholder data)")

# Clean up memory: the training AnnData wrappers are no longer needed once
# the arrays above have been extracted.
del adata_train_inp, adata_train_tgt
gc.collect()

Loading pre-processed .h5ad files...
Extracting data into NumPy arrays for training...

Data shapes for model training:
train_cite_X: (64074, 15435)
train_cite_y: (64074, 140)
test_cite_X: (48663, 15435) (placeholder data)


0

In [None]:
# --- Evaluation and Data Processing Tools ---

def correlation_score(y_true, y_pred):
    """Return the mean of the row-wise Pearson correlation coefficients.

    Args:
        y_true: 2-D array-like of shape (n_rows, n_cols) with reference values.
        y_pred: 2-D array-like with exactly the same shape as ``y_true``.

    Returns:
        The average over rows of the Pearson correlation between
        ``y_true[i]`` and ``y_pred[i]``. A row with zero variance in either
        input has an undefined correlation (NaN), which makes the result NaN.

    Raises:
        ValueError: If the inputs are not 2-D, are empty, or differ in shape.
    """
    # Compute in float64, as np.corrcoef does, regardless of the input dtype.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)

    # Without this check a longer y_pred would be scored on a prefix only.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"Shapes are different: y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    if y_true.ndim != 2 or y_true.shape[0] == 0:
        raise ValueError("correlation_score expects non-empty 2-D inputs")

    # Vectorised Pearson: centre each row, then divide the row-wise dot
    # product by the product of the row norms.
    true_centered = y_true - y_true.mean(axis=1, keepdims=True)
    pred_centered = y_pred - y_pred.mean(axis=1, keepdims=True)
    numerator = (true_centered * pred_centered).sum(axis=1)
    denominator = np.sqrt(
        (true_centered ** 2).sum(axis=1) * (pred_centered ** 2).sum(axis=1)
    )
    per_row = numerator / denominator

    # Clip away floating-point overshoot so each coefficient stays in [-1, 1].
    return np.clip(per_row, -1.0, 1.0).mean()

def zscore(x):
    """Standardise every row of ``x`` to zero mean and unit standard deviation.

    Rows whose standard deviation is exactly zero are only mean-centred, which
    avoids a division by zero. The rows are returned stacked in a new array.
    """
    def _standardise(row):
        centered = row - np.mean(row)
        spread = np.std(row)
        # Constant row: skip the division and return the centred values.
        return centered if spread == 0 else centered / spread

    return np.array([_standardise(x[row_idx]) for row_idx in range(x.shape[0])])

def cosine_similarity_loss(y_true, y_pred):
    """Negative mean row-wise Pearson correlation, implemented in PyTorch.

    Each row of both tensors is mean-centred and L2-normalised; the cosine
    similarity of the resulting rows is averaged over the batch and negated
    so that a higher correlation gives a lower loss.
    """
    functional = torch.nn.functional

    def _unit_centered(batch):
        # Remove the per-row mean, then scale each row to unit L2 norm.
        centered = batch - batch.mean(dim=1, keepdim=True)
        return functional.normalize(centered, p=2, dim=1)

    per_sample = functional.cosine_similarity(
        _unit_centered(y_true), _unit_centered(y_pred), dim=1, eps=1e-8
    )
    # Average over the batch and flip the sign to obtain a loss.
    return -per_sample.mean()

In [None]:
from scipy.sparse import issparse # Make sure issparse is imported

class SingleCellDataset(Dataset):
    """Map-style dataset over a feature matrix and an optional target matrix.

    Both matrices may be dense arrays or SciPy sparse matrices; a sparse
    selection is densified at access time, so the full matrix never has to be
    held in dense form.

    Args:
        features: Row-indexable matrix exposing ``.shape``; one sample per row.
        targets: Optional row-indexable matrix aligned with ``features``.
            When omitted, items consist of the feature tensor only.
    """

    def __init__(self, features, targets=None):
        self.features = features
        self.targets = targets
        self.is_train = targets is not None

    def __len__(self):
        # .shape[0] works for dense arrays and sparse matrices alike
        # (len() is not defined for SciPy sparse matrices).
        return self.features.shape[0]

    @staticmethod
    def _row_to_tensor(row):
        """Convert one dense or sparse row selection into a float32 tensor."""
        if issparse(row):
            # Indexing a sparse matrix keeps it 2-D; densify and drop the
            # singleton axis.
            row = row.toarray().squeeze()
        return torch.tensor(row, dtype=torch.float32)

    def __getitem__(self, idx):
        """Return ``(feature, target)`` when targets exist, else ``feature``."""
        feature = self._row_to_tensor(self.features[idx])
        if self.is_train:
            # Targets use the same conversion so a sparse target matrix works too.
            return feature, self._row_to_tensor(self.targets[idx])
        return feature

In [None]:
class TabularTransformer(nn.Module):
    """Transformer-encoder regressor for flat feature vectors.

    The input vector is linearly projected and reshaped into ``seq_len``
    tokens of width ``d_model``. A learnable CLS token is prepended, learnable
    positional embeddings are added, and the encoder output at the CLS
    position is fed to an MLP head that produces ``num_targets`` values.
    """

    def __init__(self, num_features, num_targets, seq_len=16, d_model=256, nhead=8, num_layers=3, dim_feedforward=512, dropout=0.1):
        """
        num_features: Width of the raw input feature vector
        num_targets: Number of values predicted per sample
        seq_len: Number of tokens the projected vector is reshaped into
        d_model: Token width inside the Transformer (must be divisible by nhead)
        nhead: Number of attention heads per encoder layer
        num_layers: Number of stacked encoder layers
        dim_feedforward: Hidden width of each encoder layer's feed-forward part
        dropout: Dropout probability used in the encoder and in the head
        """
        super().__init__()

        # Projection whose output is later reshaped to (seq_len, d_model) tokens.
        self.projector = nn.Linear(num_features, seq_len * d_model)

        # Learnable CLS token (zero-initialised) used to summarise the sequence.
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))

        # Learnable positional embeddings: one per token plus one for CLS.
        self.pos_encoder = nn.Parameter(torch.randn(1, seq_len + 1, d_model))

        # Stack of standard encoder layers operating on batch-first tensors.
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model, nhead, dim_feedforward, dropout, batch_first=True
            ),
            num_layers=num_layers,
        )

        # MLP head applied to the CLS representation.
        head_width = d_model // 2
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, head_width),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(head_width, num_targets),
        )

        self.d_model = d_model
        self.seq_len = seq_len

    def forward(self, x):
        """Map ``x`` of shape (batch, num_features) to (batch, num_targets)."""
        # (batch, num_features) -> (batch, seq_len, d_model)
        tokens = self.projector(x).reshape(-1, self.seq_len, self.d_model)

        # Prepend one CLS token per sample -> (batch, seq_len + 1, d_model)
        cls = self.cls_token.expand(tokens.shape[0], -1, -1)
        sequence = torch.cat((cls, tokens), dim=1)

        # Positional embeddings broadcast over the batch dimension.
        sequence = sequence + self.pos_encoder

        encoded = self.transformer_encoder(sequence)

        # The CLS position (index 0) carries the representation used for prediction.
        return self.mlp_head(encoded[:, 0])

In [None]:
def _predict_in_batches(model, loader, has_targets):
    """Run ``model`` over ``loader`` without gradients and stack the predictions."""
    model.eval()
    batches = []
    with torch.no_grad():
        for batch in loader:
            # Loaders built from a dataset with targets yield (features, targets).
            features = batch[0] if has_targets else batch
            batches.append(model(features.to(DEVICE)).cpu().numpy())
    return np.concatenate(batches)


def train_and_evaluate(
    model_class,
    train_X,
    train_y,
    test_X,
    folds,
    model_params,
    train_params,
    loss_fn
):
    """Train one model per CV fold and collect out-of-fold and test predictions.

    For every fold a fresh model is trained with AdamW and a
    ReduceLROnPlateau scheduler. The weights with the lowest validation loss
    are checkpointed to ``transformer_{name}_fold_{k}.pth`` and reloaded
    before predicting on the validation split and on the test set.

    Args:
        model_class: Callable building the model from ``**model_params``.
        train_X: Training features, indexable by arrays of row indices.
        train_y: Training targets; ``train_y.shape[1]`` sets the output width.
        test_X: Test features; only ``.shape[0]`` and row access are used.
        folds: Splitter exposing ``split(train_X)`` and ``get_n_splits()``.
        model_params: Keyword arguments for ``model_class``.
        train_params: Dict with the keys ``'batch_size'``, ``'epochs'``, ``'lr'``.
        loss_fn: Callable invoked as ``loss_fn(targets, predictions)``.

    Returns:
        Tuple ``(oof_preds, sub_preds)``: float32 out-of-fold predictions
        shaped like ``train_y`` and test predictions averaged over the folds.

    Raises:
        RuntimeError: If a fold never writes a checkpoint (no epoch produced
            a validation loss below infinity, e.g. zero epochs or NaN losses).
    """
    oof_preds = np.zeros_like(train_y, dtype=np.float32)
    sub_preds = np.zeros((test_X.shape[0], train_y.shape[1]), dtype=np.float32)

    test_dataset = SingleCellDataset(test_X)
    test_loader = DataLoader(test_dataset, batch_size=train_params['batch_size'] * 2, shuffle=False)

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_X)):
        print(f"\n===== Fold {n_fold+1} =====")

        # Built once per fold so the save and the load always use the same file.
        checkpoint_path = f"transformer_{name}_fold_{n_fold+1}.pth"

        # Split data
        X_train, y_train = train_X[train_idx], train_y[train_idx]
        X_valid, y_valid = train_X[valid_idx], train_y[valid_idx]

        train_dataset = SingleCellDataset(X_train, y_train)
        valid_dataset = SingleCellDataset(X_valid, y_valid)

        train_loader = DataLoader(train_dataset, batch_size=train_params['batch_size'], shuffle=True)
        valid_loader = DataLoader(valid_dataset, batch_size=train_params['batch_size'] * 2, shuffle=False)

        # Initialize model
        model = model_class(**model_params).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=train_params['lr'])
        # NOTE(review): `verbose` is reported as deprecated in recent PyTorch
        # releases - confirm against the installed version.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=6, verbose=True)

        best_val_loss = float('inf')

        for epoch in range(train_params['epochs']):
            model.train()
            for features, targets in train_loader:
                features, targets = features.to(DEVICE), targets.to(DEVICE)

                optimizer.zero_grad()
                predictions = model(features)
                loss = loss_fn(targets, predictions)
                loss.backward()
                optimizer.step()

            # Validation: only the loss is needed per epoch; predictions are
            # produced once, after the best checkpoint has been reloaded.
            model.eval()
            val_loss = 0
            with torch.no_grad():
                for features, targets in valid_loader:
                    features, targets = features.to(DEVICE), targets.to(DEVICE)
                    loss = loss_fn(targets, model(features))
                    # Weight by batch size so the final value is a per-sample mean.
                    val_loss += loss.item() * len(targets)

            val_loss /= len(valid_dataset)
            scheduler.step(val_loss)

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save(model.state_dict(), checkpoint_path)

            if (epoch + 1) % 10 == 0:
                print(f"Epoch {epoch+1}/{train_params['epochs']}, Val Loss: {val_loss:.4f}")

        # Refuse to continue if this fold never saved a checkpoint; otherwise a
        # stale file from an earlier run could be loaded silently.
        if not np.isfinite(best_val_loss):
            raise RuntimeError(
                f"Fold {n_fold+1}: no checkpoint was saved (validation loss never improved)"
            )

        # Load best model for prediction
        model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))

        # OOF predictions
        oof_preds[valid_idx] = _predict_in_batches(model, valid_loader, has_targets=True)

        # Test set predictions, averaged over the folds
        sub_preds += _predict_in_batches(model, test_loader, has_targets=False) / folds.get_n_splits()

        # The optimizer and scheduler hold references to the parameters, so
        # drop them together with the model before emptying the CUDA cache.
        del model, optimizer, scheduler
        gc.collect()
        torch.cuda.empty_cache()

    cv_score = correlation_score(train_y, oof_preds)
    print(f"\nOverall CV Pearson Score: {cv_score:.4f}")

    return oof_preds, sub_preds

In [None]:
# --- Model 1: Correlation Loss Model ---
print("\n--- Training Model 1 (Cosine Similarity Loss) ---")

set_seed(1024)

# Architecture settings forwarded to TabularTransformer.
MODEL1_PARAMS = dict(
    num_features=train_cite_X.shape[1],
    num_targets=140,
    seq_len=16,
    d_model=256,
    nhead=16,
    num_layers=3,
    dim_feedforward=512,
    dropout=0.1,
)

# Optimisation settings read by train_and_evaluate.
TRAIN1_PARAMS = dict(batch_size=512, epochs=30, lr=1e-3)

folds = KFold(n_splits=5, shuffle=True, random_state=42)

oof_preds_cos, sub_preds_cos = train_and_evaluate(
    model_class=TabularTransformer,
    train_X=train_cite_X,
    train_y=train_cite_y,
    test_X=test_cite_X,
    folds=folds,
    model_params=MODEL1_PARAMS,
    train_params=TRAIN1_PARAMS,
    loss_fn=cosine_similarity_loss,
)

# Persist the out-of-fold and test predictions of Model 1.
print("\n--- Saving predictions from Model 1 to disk ---")
np.save('oof_preds_cos_cite.npy', oof_preds_cos)
np.save('sub_preds_cos_cite.npy', sub_preds_cos)
print("Saved 'oof_preds_cos_cite.npy' and 'sub_preds_cos_cite.npy' successfully.")


--- Training Model 1 (Cosine Similarity Loss) ---

===== Fold 1 =====
Epoch 10/30, Val Loss: -0.8092
Epoch 20/30, Val Loss: -0.8619
Epoch 30/30, Val Loss: -0.8882

===== Fold 2 =====
Epoch 10/30, Val Loss: -0.8088
Epoch 20/30, Val Loss: -0.8088
Epoch 30/30, Val Loss: -0.8088

===== Fold 3 =====
Epoch 00009: reducing learning rate of group 0 to 1.0000e-04.
Epoch 10/30, Val Loss: -0.8092
Epoch 00016: reducing learning rate of group 0 to 1.0000e-05.
Epoch 20/30, Val Loss: -0.8092
Epoch 00023: reducing learning rate of group 0 to 1.0000e-06.
Epoch 00030: reducing learning rate of group 0 to 1.0000e-07.
Epoch 30/30, Val Loss: -0.8092

===== Fold 4 =====
Epoch 10/30, Val Loss: -0.8080
Epoch 00014: reducing learning rate of group 0 to 1.0000e-04.
Epoch 20/30, Val Loss: -0.8080
Epoch 30/30, Val Loss: -0.8080

===== Fold 5 =====
Epoch 10/30, Val Loss: -0.8837
Epoch 20/30, Val Loss: -0.8945
Epoch 30/30, Val Loss: -0.8005

Overall CV Pearson Score: 0.8430

--- Saving predictions from Model 1 to 

In [None]:
# --- Model 2: MSE Loss Model (Target Z-score Normalization) ---
print("\n--- Training Model 2 (MSE Loss with Z-scored Targets) ---")

# Key step: Z-score normalize the target y (row-wise), so the MSE model is
# trained on standardised targets.
train_cite_y_zscored = zscore(train_cite_y)

set_seed(2048)
MODEL2_PARAMS = {
    'num_features': train_cite_X.shape[1],
    'num_targets': 140,
    'seq_len': 20, # Try different sequence lengths
    'd_model': 240, # d_model must be divisible by nhead
    'nhead': 16,
    'num_layers': 4,
    'dim_feedforward': 600,
    'dropout': 0.15,
}

TRAIN2_PARAMS = {
    'batch_size': 512,
    'epochs': 30,
    'lr': 8e-4,
}

folds = KFold(n_splits=5, shuffle=True, random_state=1337)

oof_preds_mse, sub_preds_mse = train_and_evaluate(
    TabularTransformer,
    train_cite_X,
    train_cite_y_zscored,
    test_cite_X,
    folds,
    MODEL2_PARAMS,
    TRAIN2_PARAMS,
    nn.MSELoss() # Use standard MSE loss
)

# Save the Model 2 (MSE) predictions under their own file names. Previously
# this section re-saved the Model 1 arrays, so the Model 2 predictions were
# never written to disk.
print("\n--- Saving predictions from Model 2 to disk ---")
np.save('oof_preds_mse_cite.npy', oof_preds_mse)
np.save('sub_preds_mse_cite.npy', sub_preds_mse)
print("Saved 'oof_preds_mse_cite.npy' and 'sub_preds_mse_cite.npy' successfully.")


--- Training Model 2 (MSE Loss with Z-scored Targets) ---

===== Fold 1 =====
Epoch 10/30, Val Loss: 0.3461
Epoch 00017: reducing learning rate of group 0 to 8.0000e-05.
Epoch 20/30, Val Loss: 0.3460
Epoch 00025: reducing learning rate of group 0 to 8.0000e-06.
Epoch 30/30, Val Loss: 0.3460

===== Fold 2 =====
Epoch 10/30, Val Loss: 0.3470
Epoch 00016: reducing learning rate of group 0 to 8.0000e-05.
Epoch 20/30, Val Loss: 0.3468
Epoch 00024: reducing learning rate of group 0 to 8.0000e-06.
Epoch 30/30, Val Loss: 0.3467

===== Fold 3 =====
Epoch 10/30, Val Loss: 0.3475
Epoch 00020: reducing learning rate of group 0 to 8.0000e-05.
Epoch 20/30, Val Loss: 0.3476
Epoch 00027: reducing learning rate of group 0 to 8.0000e-06.
Epoch 30/30, Val Loss: 0.3474

===== Fold 4 =====
Epoch 00010: reducing learning rate of group 0 to 8.0000e-05.
Epoch 10/30, Val Loss: 0.3433
Epoch 00017: reducing learning rate of group 0 to 8.0000e-06.
Epoch 20/30, Val Loss: 0.3431
Epoch 00024: reducing learning rate