In [1]:
# --- Imports and Setup ---
import numpy as np
import pandas as pd
import gc
import os
import joblib

# PyTorch and Dataloader
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

# Scikit-learn
from sklearn.model_selection import KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize

# AnnData for loading
import anndata as ad
from scipy.sparse import issparse

# --- Global Configuration ---
name = 'cite'
# Single seed for the notebook; same value as the random_state passed to
# TruncatedSVD and KFold further down.
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed the global NumPy and PyTorch generators so that weight initialisation,
# dropout and DataLoader shuffling are repeatable after a kernel restart.
np.random.seed(SEED)
torch.manual_seed(SEED)
print(f"Using device: {DEVICE}")

def neg_correlation_score(y_pred: torch.Tensor, y_true: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """
    Negative mean of the per-row Pearson correlation between y_pred and y_true.

    Both tensors are reduced along dim=1, so they are expected to be 2-D
    ([batch, dim]). `eps` is added to each row's sum of squares before the
    square root so that constant rows do not divide by zero.
    Returns a 0-dim tensor; minimising it maximises the average correlation.
    """
    # Remove each row's own mean
    centered_true = y_true - y_true.mean(dim=1, keepdim=True)
    centered_pred = y_pred - y_pred.mean(dim=1, keepdim=True)

    # Un-normalised covariance and the two (eps-stabilised) sums of squares, all [batch]
    covariance = torch.sum(centered_true * centered_pred, dim=1)
    energy_true = torch.sum(centered_true ** 2, dim=1) + eps
    energy_pred = torch.sum(centered_pred ** 2, dim=1) + eps

    # Pearson r for every row, then flip the sign of the batch average
    row_corr = covariance / torch.sqrt(energy_true * energy_pred)
    return -torch.mean(row_corr)

Using device: cpu


In [2]:
# --- Cell 2: Data Loading and Preprocessing ---

# Number of SVD components kept for both the input and the target matrices.
N_SVD_COMPONENTS = 128

# --- a) Load sparse data from .h5ad files ---
print("Loading sparse data from .h5ad files...")
PATH_TRAIN_INP = f"train_{name}_inputs.h5ad"
PATH_TRAIN_TGT = f"train_{name}_targets.h5ad"
PATH_TEST_INP  = f"test_{name}_inputs.h5ad"

adata_train_inp = ad.read_h5ad(PATH_TRAIN_INP)
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)

train_X_sparse = adata_train_inp.X
test_X_sparse = adata_test_inp.X
train_y_sparse = adata_train_tgt.X
# Remember the shape of the un-reduced target matrix; the prediction arrays
# in the training cell are allocated with it.
train_y_full_shape = train_y_sparse.shape

print(f"Initial sparse train_X shape: {train_X_sparse.shape}")
print(f"Initial sparse train_y shape: {train_y_sparse.shape}")

# --- Input Matrix Preprocessing (train_X, test_X) ---
print("\n--- Preprocessing Input Matrices (X)... ---")
# 1. Row-wise L2 normalization
print("Step 1: Applying L2 normalization to X...")
train_X_normalized = normalize(train_X_sparse, norm='l2', axis=1)
test_X_normalized = normalize(test_X_sparse, norm='l2', axis=1)

# 2. TruncatedSVD decomposition (fit on train only, then applied to test)
print(f"Step 2: Performing TruncatedSVD on X to {N_SVD_COMPONENTS} components...")
svd_x = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=42)
train_X_svd = svd_x.fit_transform(train_X_normalized)
test_X_svd = svd_x.transform(test_X_normalized)
print(f"Finished X preprocessing. Final train_X_svd shape: {train_X_svd.shape}")

# --- Target Matrix Preprocessing (train_y) ---
print("\n--- Preprocessing Target Matrix (y)... ---")
# 1. Convert to dense and apply Row-wise L2 normalization
print("Step 1: Converting y to dense and applying L2 normalization...")
# AnnData's `.X` can be a dense ndarray as well as a scipy sparse matrix;
# only the latter has `.toarray()`, so guard the conversion.
if issparse(train_y_sparse):
    train_y_dense = train_y_sparse.toarray()
else:
    train_y_dense = np.asarray(train_y_sparse)
train_y_normalized = normalize(train_y_dense, norm='l2', axis=1)

# 2. TruncatedSVD decomposition of the normalized targets
print(f"Step 2: Performing TruncatedSVD on y to {N_SVD_COMPONENTS} components...")
svd_y = TruncatedSVD(n_components=N_SVD_COMPONENTS, random_state=42)
train_y_svd = svd_y.fit_transform(train_y_normalized)
print(f"Finished y preprocessing. Final train_y_svd shape: {train_y_svd.shape}")

# Save the svd_y object for inverse transform during prediction
joblib.dump(svd_y, 'svd_y_model.joblib')
print("Saved svd_y model to 'svd_y_model.joblib'")


# --- Clean up unused variables from memory ---
# Only the SVD-reduced arrays (and train_y_full_shape) are needed from here on.
print("\nCleaning up memory...")
del adata_train_inp, adata_train_tgt, adata_test_inp
del train_X_sparse, test_X_sparse, train_y_sparse
del train_X_normalized, test_X_normalized, train_y_normalized, train_y_dense
gc.collect()

# --- Final Check ---
print("\nPreprocessing complete. Final shapes for training:")
print(f"train_X_svd: {train_X_svd.shape}")
print(f"train_y_svd: {train_y_svd.shape}")
print(f"test_X_svd: {test_X_svd.shape}")

Loading sparse data from .h5ad files...
Initial sparse train_X shape: (64074, 15435)
Initial sparse train_y shape: (64074, 140)

--- Preprocessing Input Matrices (X)... ---
Step 1: Applying L2 normalization to X...
Step 2: Performing TruncatedSVD on X to 128 components...
Finished X preprocessing. Final train_X_svd shape: (64074, 128)

--- Preprocessing Target Matrix (y)... ---
Step 1: Converting y to dense and applying L2 normalization...
Step 2: Performing TruncatedSVD on y to 128 components...
Finished y preprocessing. Final train_y_svd shape: (64074, 128)
Saved svd_y model to 'svd_y_model.joblib'

Cleaning up memory...

Preprocessing complete. Final shapes for training:
train_X_svd: (64074, 128)
train_y_svd: (64074, 128)
test_X_svd: (48663, 128)


In [3]:
# --- Cell 3: Dataset Class ---
class SVDDataset(Dataset):
    """
    Map-style dataset over an array of feature rows and, optionally, an
    array of target rows indexed the same way.

    With targets, items are (feature, target) pairs of float32 tensors;
    without targets, items are the float32 feature tensor alone.
    """

    def __init__(self, features, targets=None):
        self.features = features
        self.targets = targets
        # Flag checked in __getitem__ to decide whether a target is returned
        self.is_train = targets is not None

    @staticmethod
    def _as_float32(row):
        """Copy one indexed row into a new float32 tensor."""
        return torch.tensor(row, dtype=torch.float32)

    def __len__(self):
        # Number of rows along the first axis
        return self.features.shape[0]

    def __getitem__(self, idx):
        sample = self._as_float32(self.features[idx])
        if not self.is_train:
            return sample
        label = self._as_float32(self.targets[idx])
        return sample, label

In [8]:
# --- Cell 4 : Model Architecture (FNN) ---
class FNNModel(nn.Module):
    """
    Feed-forward network with two hidden blocks and a linear output head.

    Block 1: Linear(num_features -> hidden_size), BatchNorm1d, GELU, Dropout
    Block 2: Linear(hidden_size -> hidden_size // 2), BatchNorm1d, Tanh, Dropout
    Head:    Linear(hidden_size // 2 -> target_dim)
    """

    def __init__(self, num_features, target_dim, hidden_size=1024, dropout=0.25):
        super().__init__()
        narrow_size = hidden_size // 2

        # Layers are listed in execution order; their positions become the
        # numeric keys of the Sequential (net.0, net.1, ...).
        stack = [
            nn.Linear(num_features, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, narrow_size),
            nn.BatchNorm1d(narrow_size),
            nn.Tanh(),
            nn.Dropout(dropout),
            nn.Linear(narrow_size, target_dim),
        ]
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        # Single pass through the sequential stack
        output = self.net(x)
        return output

In [9]:
# --- Cell 5: Loss function ---
class CombinedLoss(nn.Module):
    """
    Weighted combination of MSE loss and negative Pearson correlation loss.

    The loss is ``2 * (alpha * mse + (1 - alpha) * neg_corr)``. The factor of
    two normalises the weights so that the default ``alpha=0.5`` yields exactly
    ``mse + neg_corr``; ``alpha=1`` uses only the MSE term and ``alpha=0`` only
    the correlation term.

    Args:
        alpha: Relative weight of the MSE term, in [0, 1].

    Raises:
        ValueError: If ``alpha`` is outside [0, 1].
    """
    def __init__(self, alpha=0.5):
        super().__init__()
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f"alpha must be in [0, 1], got {alpha}")
        self.alpha = alpha
        self.mse_loss = nn.MSELoss()
        self.corr_loss = neg_correlation_score

    def forward(self, pred, true):
        """Return the alpha-weighted sum of the two losses for (pred, true)."""
        # Calculate individual losses
        loss_mse = self.mse_loss(pred, true)
        loss_corr = self.corr_loss(pred, true)

        # Read alpha at call time so it can be adjusted after construction.
        # At alpha == 0.5 both weights are exactly 1.0.
        weight_mse = 2.0 * self.alpha
        weight_corr = 2.0 * (1.0 - self.alpha)
        return weight_mse * loss_mse + weight_corr * loss_corr

In [10]:
#--- Cell 6: Training function ---
def train_one_fold(
    train_loader,
    valid_loader,
    model,
    optimizer,
    loss_fn,
    train_params,
    n_fold,
    model_name
):
    """
    Train `model` for one cross-validation fold, checkpointing the best epoch.

    After every epoch the sample-weighted mean of `loss_fn` over
    `valid_loader` is computed; whenever it is lower than the best value seen
    so far, the model's state_dict is saved to
    ``model_{model_name}_fold_{n_fold+1}.pth`` in the working directory.

    Args:
        train_loader: Iterable yielding (features, targets) batches for training.
        valid_loader: Iterable yielding (features, targets) batches for
            validation; must expose `.dataset` with a length.
        model: Module to train; expected to already live on DEVICE.
        optimizer: Optimizer bound to `model`'s parameters.
        loss_fn: Callable ``loss_fn(preds, targets)`` returning a scalar tensor.
        train_params: Mapping providing the key 'epochs'.
        n_fold: Zero-based fold index (printed and used in the file name as n_fold+1).
        model_name: Prefix used in the checkpoint file name.

    Returns:
        Path of the checkpoint holding the best-epoch weights.

    Raises:
        RuntimeError: If no checkpoint was written (zero epochs, or the
            validation loss never improved, e.g. because it was NaN).
            Without this check the returned path could be missing or could
            point at a stale file left over from an earlier run.
    """
    best_val_loss = float('inf')
    model_path = f"model_{model_name}_fold_{n_fold+1}.pth"
    checkpoint_written = False

    for epoch in range(train_params['epochs']):
        # --- Training Phase ---
        model.train()
        for features, targets in train_loader:
            features, targets = features.to(DEVICE), targets.to(DEVICE)

            optimizer.zero_grad()
            preds = model(features)
            loss = loss_fn(preds, targets)
            loss.backward()
            optimizer.step()

        # --- Validation Phase ---
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for features, targets in valid_loader:
                features, targets = features.to(DEVICE), targets.to(DEVICE)
                preds = model(features)
                loss = loss_fn(preds, targets)
                # Weight by batch size so a smaller last batch is not over-counted
                val_loss += loss.item() * len(features)

        val_loss /= len(valid_loader.dataset)

        # A NaN val_loss compares False here and is therefore never checkpointed
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), model_path)
            checkpoint_written = True

        if (epoch + 1) % 10 == 0:
            print(f"  Epoch {epoch+1}/{train_params['epochs']}, Val Loss: {val_loss:.4f}")

    if not checkpoint_written:
        raise RuntimeError(
            f"No checkpoint was written for fold {n_fold+1}: validation loss "
            f"never improved over {train_params['epochs']} epoch(s)."
        )

    print(f"Best validation loss for fold {n_fold+1}: {best_val_loss:.4f}")
    return model_path

In [None]:
#--- Cell 7: Train and Validation ---
MODEL_PARAMS = {
    'num_features': train_X_svd.shape[1], 
    'target_dim': train_y_svd.shape[1],
    'hidden_size': 1024,
    'dropout': 0.25,
}
TRAIN_PARAMS = {'batch_size': 512, 'epochs': 50, 'lr': 1e-3}
folds = KFold(n_splits=5, shuffle=True, random_state=42)

# --- Initialize Prediction Arrays ---
# These must have the ORIGINAL full shape of the targets.
oof_preds = np.zeros(train_y_full_shape, dtype=np.float32)
sub_preds = np.zeros((test_X_svd.shape[0], train_y_full_shape[1]), dtype=np.float32)

# Load the SVD model for y for inverse transform
svd_y_loaded = joblib.load('svd_y_model.joblib')

# The test set is the same for every fold, so build its loader once.
test_dataset = SVDDataset(test_X_svd)
test_loader = DataLoader(test_dataset, batch_size=TRAIN_PARAMS['batch_size'] * 2)

# --- Main CV Loop ---
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_X_svd)):
    print(f"\n===== Processing Fold {n_fold+1}/{folds.get_n_splits()} =====")
    
    # 1. Slice SVD data for the current fold
    X_train_fold, X_valid_fold = train_X_svd[train_idx], train_X_svd[valid_idx]
    y_train_fold, y_valid_fold = train_y_svd[train_idx], train_y_svd[valid_idx]
    
    # 2. Prepare DataLoaders, Model, Optimizer, and Loss
    train_dataset = SVDDataset(X_train_fold, y_train_fold)
    valid_dataset = SVDDataset(X_valid_fold, y_valid_fold)
    train_loader = DataLoader(train_dataset, batch_size=TRAIN_PARAMS['batch_size'], shuffle=True)
    # Not shuffled: the OOF write-back below relies on batches arriving in valid_idx order
    valid_loader = DataLoader(valid_dataset, batch_size=TRAIN_PARAMS['batch_size'] * 2)
    
    model = FNNModel(**MODEL_PARAMS).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=TRAIN_PARAMS['lr'])
    loss_fn = CombinedLoss().to(DEVICE)

    # 3. Call the training function
    best_model_path = train_one_fold(
        train_loader, valid_loader, model, optimizer, loss_fn,
        TRAIN_PARAMS, n_fold, f"{name}_fnn"
    )
    
    # 4. Load best model and perform predictions
    # map_location keeps the load working regardless of which device saved the file
    model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
    model.eval()
    
    # 4a. OOF (Out-of-Fold) predictions
    val_preds_list_svd = []
    with torch.no_grad():
        for features, _ in valid_loader:
            preds_svd = model(features.to(DEVICE))
            val_preds_list_svd.append(preds_svd.cpu().numpy())
    
    oof_preds_svd = np.concatenate(val_preds_list_svd)
    # Inverse transform to get predictions in original space
    oof_preds[valid_idx] = svd_y_loaded.inverse_transform(oof_preds_svd)

    # 4b. Test set predictions (accumulated)
    test_preds_list_svd = []
    with torch.no_grad():
        for features in test_loader:
            preds_svd = model(features.to(DEVICE))
            test_preds_list_svd.append(preds_svd.cpu().numpy())
            
    test_preds_svd = np.concatenate(test_preds_list_svd)
    # Inverse transform and accumulate; dividing by the fold count makes
    # sub_preds the mean over folds once the loop finishes
    sub_preds += svd_y_loaded.inverse_transform(test_preds_svd) / folds.get_n_splits()
            
    # Clean up
    del model, optimizer, loss_fn
    gc.collect()
    torch.cuda.empty_cache()

# --- Save final results ---
# Persist the OOF predictions as well; previously they were computed for every
# fold and then discarded.
np.save(f'oof_preds_{name}.npy', oof_preds)
print(f'OOF predictions saved, shape: {oof_preds.shape}')
np.save(f'sub_preds_{name}.npy', sub_preds)
print(f'Test predictions saved, shape: {sub_preds.shape}')


===== Processing Fold 1/5 =====
  Epoch 10/50, Val Loss: -0.9180
  Epoch 20/50, Val Loss: -0.9204
  Epoch 30/50, Val Loss: -0.9211
  Epoch 40/50, Val Loss: -0.9214
  Epoch 50/50, Val Loss: -0.9216
Best validation loss for fold 1: -0.9216

===== Processing Fold 2/5 =====
  Epoch 10/50, Val Loss: -0.9177
  Epoch 20/50, Val Loss: -0.9206
  Epoch 30/50, Val Loss: -0.9204
  Epoch 40/50, Val Loss: -0.9210
  Epoch 50/50, Val Loss: -0.9216
Best validation loss for fold 2: -0.9219

===== Processing Fold 3/5 =====
  Epoch 10/50, Val Loss: -0.9162
  Epoch 20/50, Val Loss: -0.9200
  Epoch 30/50, Val Loss: -0.9205
  Epoch 40/50, Val Loss: -0.9199
  Epoch 50/50, Val Loss: -0.9210
Best validation loss for fold 3: -0.9212

===== Processing Fold 4/5 =====
  Epoch 10/50, Val Loss: -0.9173
  Epoch 20/50, Val Loss: -0.9198
  Epoch 30/50, Val Loss: -0.9203
  Epoch 40/50, Val Loss: -0.9208
  Epoch 50/50, Val Loss: -0.9216
Best validation loss for fold 4: -0.9216

===== Processing Fold 5/5 =====
  Epoch 10/

In [12]:
# --- Cell 8: Cleanup ---

def _remove_and_report(path):
    """Delete `path` and print the outcome; an OSError is reported, not raised."""
    try:
        os.remove(path)
        print(f"Removed: {path}")
    except OSError as e:
        print(f"Error removing {path}: {e}")

# Drop the per-fold checkpoint files (one per CV split)
print("\nCleaning up saved model files...")
for n_fold in range(folds.get_n_splits()):
    model_path = f"model_{name}_fnn_fold_{n_fold+1}.pth"
    _remove_and_report(model_path)

# The serialized target-SVD object is no longer needed either
_remove_and_report('svd_y_model.joblib')

print("\nCleanup complete. Only prediction files remain.")


Cleaning up saved model files...
Removed: model_cite_fnn_fold_1.pth
Removed: model_cite_fnn_fold_2.pth
Removed: model_cite_fnn_fold_3.pth
Removed: model_cite_fnn_fold_4.pth
Removed: model_cite_fnn_fold_5.pth
Removed: svd_y_model.joblib

Cleanup complete. Only prediction files remain.
