In [1]:
# --- Imports and Setup ---
import numpy as np
import pandas as pd
import gc
import os
import joblib

# PyTorch and Dataloader
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

# Scikit-learn and UMAP
from sklearn.model_selection import KFold
from sklearn.preprocessing import normalize
import umap

# AnnData for loading
import anndata as ad
from scipy.sparse import issparse




# --- Global Configuration ---
# Dataset tag; it is interpolated into the data and output file names.
name = 'multi'

# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# --- Helper Functions ---

def neg_correlation_score_torch(y_pred: torch.Tensor, y_true: torch.Tensor, eps: float = 1e-8) -> torch.Tensor:
    """
    Negated mean of the row-wise Pearson correlation between two tensors.

    Each tensor is centred along dim=1 and every row of `y_pred` is correlated
    with the matching row of `y_true`. `eps` is added to each row's sum of
    squares before the square root, so a constant row yields a correlation of
    0 instead of a division by zero. The scalar returned is minus the mean of
    the per-row correlations, which makes it usable as a loss to minimise.
    """
    true_centered = y_true - y_true.mean(dim=1, keepdim=True)
    pred_centered = y_pred - y_pred.mean(dim=1, keepdim=True)

    # Numerator: per-row sum of products of the centred values.
    cross_term = (true_centered * pred_centered).sum(dim=1)

    # Denominator: geometric mean of the two (eps-stabilised) sums of squares.
    true_energy = true_centered.square().sum(dim=1) + eps
    pred_energy = pred_centered.square().sum(dim=1) + eps

    per_row_r = cross_term / torch.sqrt(true_energy * pred_energy)
    return -per_row_r.mean()

def pearson_corr_numpy(y_pred: np.ndarray, y_true: np.ndarray) -> float:
    """
    Mean row-wise Pearson correlation between two equally shaped 2-D arrays.

    Row i of `y_pred` is correlated with row i of `y_true` via `np.corrcoef`.
    Rows whose correlation is undefined (e.g. a constant row, which gives NaN)
    are left out of the mean.

    Args:
        y_pred: predictions, one sample per row.
        y_true: ground truth, same shape as `y_pred`.

    Returns:
        The mean of the defined per-row correlations as a Python float, or
        NaN when there are no rows or no row has a defined correlation.

    Raises:
        ValueError: if the two arrays do not have the same shape. Without this
            check, `zip` would silently drop the surplus rows of the longer
            array and report a score for a truncated comparison.
    """
    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    if y_pred.shape != y_true.shape:
        raise ValueError(
            f"Shape mismatch: y_pred has shape {y_pred.shape}, y_true has shape {y_true.shape}"
        )
    if y_pred.size == 0:
        return float("nan")

    # A constant row makes corrcoef divide 0 by 0; the resulting NaN is handled
    # below, so the floating-point warnings would only be noise.
    with np.errstate(divide="ignore", invalid="ignore"):
        corrs = np.array([np.corrcoef(p, t)[0, 1] for p, t in zip(y_pred, y_true)])

    # np.nanmean warns on an all-NaN input; return NaN explicitly instead.
    if np.isnan(corrs).all():
        return float("nan")
    return float(np.nanmean(corrs))

from scipy import sparse
from scipy.sparse import issparse
import warnings

def sparse_clr_transform(X: sparse.csr_matrix):
    """
    Applies a sparse-friendly Centered Log-Ratio (CLR) transformation.

    Only the stored (non-zero) entries are touched, so the sparsity pattern of
    the input is preserved:
    1. log1p is applied to every stored entry.
    2. For each row, the mean of these transformed values is computed.
    3. That mean is subtracted from each stored entry of the row.
    Rows without stored entries are left empty.

    Args:
        X: a scipy sparse matrix of any format (converted to CSR), or a dense
           2-D array-like, which is converted to CSR first so that only its
           non-zero entries take part in the transformation.

    Returns:
        A CSR matrix with the same shape and sparsity pattern as the CSR form
        of `X`. It is constructed from that matrix's `indices` / `indptr`
        arrays directly, so it may share them with the input.
    """
    print("  Applying sparse-friendly CLR transformation...")

    # CSR stores each row as one contiguous slice of `.data`, which makes the
    # per-row mean cheap. Dense input has no `.tocsr()`, so convert it here.
    X = X.tocsr() if issparse(X) else sparse.csr_matrix(X)

    log1p_data = np.log1p(X.data)
    clr_data = np.zeros_like(log1p_data)

    for i in range(X.shape[0]):
        # indptr[i]:indptr[i+1] delimits the stored entries of row i.
        start, end = X.indptr[i], X.indptr[i+1]

        # Skip empty rows: they have nothing to centre, and a mean over an
        # empty slice would be NaN.
        if end > start:
            row_mean = np.mean(log1p_data[start:end])
            clr_data[start:end] = log1p_data[start:end] - row_mean

    X_clr_sparse = sparse.csr_matrix((clr_data, X.indices, X.indptr), shape=X.shape)

    return X_clr_sparse

Using device: cuda


In [None]:
# --- Cell 2: Data Loading and Preprocessing ---
# Purpose: load the train/test inputs and train targets, reduce the inputs with
# CLR -> TruncatedSVD -> UMAP and the targets with CLR -> TruncatedSVD, and
# leave `train_X_umap`, `test_X_umap`, `train_y_svd` and `train_y_clr` in the
# notebook namespace for the training cells.
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
# --- Dimensional Hyperparameters ---
SVD_PRE_COMPONENTS_X = 128  # SVD components for input X pre-reduction
UMAP_COMPONENTS_X = 64      # Final UMAP dimension for input X
SVD_COMPONENTS_Y = 128      # Final SVD dimension for target y
UMAP_N_NEIGHBORS = 15       # passed to umap.UMAP as n_neighbors
UMAP_MIN_DIST = 0.1         # passed to umap.UMAP as min_dist

# --- a) Load sparse data ---
# File names are derived from the global `name` tag and resolved relative to
# the current working directory.
print("Loading sparse data from .h5ad files...")
PATH_TRAIN_INP = f"train_{name}_inputs.h5ad"
PATH_TRAIN_TGT = f"train_{name}_targets.h5ad"
PATH_TEST_INP  = f"test_{name}_inputs.h5ad"

adata_train_inp = ad.read_h5ad(PATH_TRAIN_INP)
adata_train_tgt = ad.read_h5ad(PATH_TRAIN_TGT)
adata_test_inp = ad.read_h5ad(PATH_TEST_INP)

# NOTE(review): the code below assumes `.X` is a scipy sparse matrix
# (sparse_clr_transform calls `.tocsr()` on it) — confirm for these files.
train_X_sparse = adata_train_inp.X
test_X_sparse = adata_test_inp.X
train_y_sparse = adata_train_tgt.X

print(f"Initial sparse train_X shape: {train_X_sparse.shape}")
print(f"Initial sparse train_y shape: {train_y_sparse.shape}")

# --- Input Matrix Preprocessing (train_X, test_X) ---
print("\n--- Preprocessing Input Matrices (X) with Sparse CLR -> SVD -> UMAP... ---")
# Step 1: Sparse CLR transformation
# Train and test are transformed by separate calls to the same function.
train_X_clr_sparse = sparse_clr_transform(train_X_sparse)
test_X_clr_sparse = sparse_clr_transform(test_X_sparse)

# Step 2: SVD Pre-reduction
# The SVD is fitted on the training inputs only; the test inputs are projected
# with the already-fitted model.
print(f"Step 2: Performing TruncatedSVD on sparse X to {SVD_PRE_COMPONENTS_X} components...")
svd_x = TruncatedSVD(n_components=SVD_PRE_COMPONENTS_X, random_state=42)
train_X_svd = svd_x.fit_transform(train_X_clr_sparse)
test_X_svd = svd_x.transform(test_X_clr_sparse)

# Step 3: UMAP decomposition
# Same pattern as the SVD: fit on train, then transform test with the fitted mapper.
print(f"Step 3: Performing UMAP on SVD-reduced X to {UMAP_COMPONENTS_X} components...")
mapper_x = umap.UMAP(n_components=UMAP_COMPONENTS_X, n_neighbors=UMAP_N_NEIGHBORS,
                   min_dist=UMAP_MIN_DIST, random_state=42)
train_X_umap = mapper_x.fit_transform(train_X_svd)
test_X_umap = mapper_x.transform(test_X_svd)
print(f"Finished X preprocessing. Final train_X_umap shape: {train_X_umap.shape}")


# --- Target Matrix Preprocessing (train_y) ---
print("\n--- Preprocessing Target Matrix (y) with Sparse CLR -> SVD... ---")
# Step 1: Sparse CLR transformation
train_y_clr_sparse = sparse_clr_transform(train_y_sparse)
# Convert to dense for validation metric calculation
# NOTE(review): this materialises the whole target matrix densely; its memory
# footprint is rows x columns of train_y, regardless of sparsity.
train_y_clr = train_y_clr_sparse.toarray()

# Step 2: SVD-only reduction for a robust inverse transform
# The network is trained on these SVD coordinates.
print(f"Step 2: Performing TruncatedSVD on y to {SVD_COMPONENTS_Y} components...")
svd_y = TruncatedSVD(n_components=SVD_COMPONENTS_Y, random_state=42)
train_y_svd = svd_y.fit_transform(train_y_clr_sparse)
print(f"Finished y preprocessing. Final train_y_svd shape: {train_y_svd.shape}")

# Persist the fitted target SVD to disk under a fixed file name.
joblib.dump(svd_y, 'svd_y_model.joblib')
print("Saved svd_y model.")

# --- Clean up unused variables ---
# Drop the AnnData objects and every intermediate matrix; only the reduced
# arrays and the dense CLR targets stay referenced after this point.
print("\nCleaning up memory...")
del adata_train_inp, adata_train_tgt, adata_test_inp, train_X_sparse, test_X_sparse, train_y_sparse
del train_X_clr_sparse, test_X_clr_sparse, train_y_clr_sparse
del train_X_svd, test_X_svd
# keep train_y_clr for validation
gc.collect()

# --- Final Check ---
print("\nPreprocessing complete. Final shapes for training:")
print(f"train_X_umap: {train_X_umap.shape}")
print(f"train_y_svd: {train_y_svd.shape}")
print(f"test_X_umap: {test_X_umap.shape}")

Loading sparse data from .h5ad files...


In [None]:
# --- Cell 3: Dataset Class (Restored for Validation Metric) ---
class MultiTargetDataset(Dataset):
    """
    Map-style dataset that yields, for each sample, a triple of float32 tensors:
    the input features, the low-dimensional target (used for the loss) and the
    full-dimensional target (used for the validation metric).
    """

    def __init__(self, features, targets_low_dim, targets_full_dim):
        # The three containers are stored as given; rows are only converted to
        # tensors when a sample is requested.
        self.features = features
        self.targets_low_dim = targets_low_dim
        self.targets_full_dim = targets_full_dim

    def __len__(self):
        # The sample count is the size of the feature container's first axis.
        n_samples = self.features.shape[0]
        return n_samples

    def __getitem__(self, idx):
        # Same order as the constructor arguments: features, low-dim, full-dim.
        sources = (self.features, self.targets_low_dim, self.targets_full_dim)
        return tuple(torch.tensor(src[idx], dtype=torch.float32) for src in sources)

In [None]:
# --- Cell 4 : Model Architecture (FNN) ---
class FNNModel(nn.Module):
    """
    A simple Feedforward Neural Network (FNN/MLP).

    Two hidden stages of Linear -> BatchNorm1d -> GELU -> Dropout, the second
    half as wide as the first, followed by a linear projection to `target_dim`.
    """

    def __init__(self, num_features, target_dim, hidden_size=1024, dropout=0.25):
        super().__init__()
        narrow_size = hidden_size // 2

        # Build the hidden stages in order so the layer indices inside
        # `self.net` (and hence the state-dict keys) stay 0..8.
        layers = []
        for fan_in, fan_out in ((num_features, hidden_size), (hidden_size, narrow_size)):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.GELU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(narrow_size, target_dim))

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # The whole model is the sequential stack; no skip connections.
        out = self.net(x)
        return out

In [None]:
# --- Cell 5: Loss function ---
class CombinedLoss(nn.Module):
    """
    Latent-space training objective: the sum of the MSE and the negative
    Pearson correlation loss, both evaluated on the low-dimensional
    predictions and targets with equal weight.
    """

    def __init__(self):
        super().__init__()
        self.mse_loss = nn.MSELoss()
        # Plain function, not a module: it has no parameters to register.
        self.corr_loss = neg_correlation_score_torch

    def forward(self, pred_low_dim, true_low_dim):
        # Both terms are scalars computed on the same pair of tensors.
        mse_term = self.mse_loss(pred_low_dim, true_low_dim)
        corr_term = self.corr_loss(pred_low_dim, true_low_dim)
        return mse_term + corr_term

In [None]:
#--- Cell 6: Training function ---
def train_one_fold(
    train_loader,
    valid_loader,
    model,
    optimizer,
    loss_fn,
    svd_y,
    train_params,
    n_fold,
    model_name
):
    """
    Train `model` on one cross-validation fold and checkpoint its best epoch.

    The loss is computed in the latent (SVD) space. The model-selection metric
    is the mean row-wise Pearson correlation between the SVD-reconstructed
    predictions and the full-dimensional (CLR-transformed) targets.

    Args:
        train_loader: yields (features, targets_low, targets_full) batches; the
            full-dimensional targets are ignored during training.
        valid_loader: yields the same triples; all three parts are used.
        model: network mapping features to latent targets; must already be on DEVICE.
        optimizer: optimizer bound to `model`'s parameters.
        loss_fn: callable `(preds_low, targets_low) -> scalar tensor`.
        svd_y: fitted decomposition exposing `inverse_transform`, used to map
            latent predictions back to the full target space.
        train_params: dict; only the 'epochs' key is read here.
        n_fold: zero-based fold index (used for the file name and messages).
        model_name: tag embedded in the checkpoint file name.

    Returns:
        Path of the checkpoint file. The file is guaranteed to exist on return:
        it holds the weights of the best epoch or, when no epoch ever produced
        a finite correlation above -1.0, the weights of the first epoch (the
        current weights if no epoch was run at all).

    Raises:
        ValueError: if the validation dataset is empty.
    """
    n_valid = len(valid_loader.dataset)
    if n_valid == 0:
        # Fail before training rather than with a ZeroDivisionError afterwards.
        raise ValueError("valid_loader contains no samples; cannot compute the validation metric.")

    best_val_corr = -1.0
    model_path = f"model_{model_name}_fold_{n_fold+1}.pth"
    # Tracks whether THIS call has written model_path; a file left behind by an
    # earlier run must never be mistaken for this fold's checkpoint.
    checkpoint_saved = False

    for epoch in range(train_params['epochs']):
        # --- Training Phase ---
        model.train()
        for features, targets_low, _ in train_loader:
            features, targets_low = features.to(DEVICE), targets_low.to(DEVICE)

            optimizer.zero_grad()
            preds_low = model(features)
            loss = loss_fn(preds_low, targets_low)
            loss.backward()
            optimizer.step()

        # --- Validation Phase ---
        model.eval()
        val_loss_latent = 0.0
        all_recon_preds = []
        all_full_targets = []

        with torch.no_grad():
            for features, targets_low, targets_full in valid_loader:
                features, targets_low = features.to(DEVICE), targets_low.to(DEVICE)

                preds_low = model(features)

                # Weight each batch by its size so the epoch value is a true
                # per-sample average even when the last batch is smaller.
                loss = loss_fn(preds_low, targets_low)
                val_loss_latent += loss.item() * len(features)

                # Map latent predictions back to the full target space.
                recon_preds = svd_y.inverse_transform(preds_low.cpu().numpy())
                all_recon_preds.append(recon_preds)
                all_full_targets.append(targets_full.numpy())

        # Calculate and report metrics for the epoch
        val_loss_latent /= n_valid
        val_preds_full = np.concatenate(all_recon_preds)
        val_targets_full = np.concatenate(all_full_targets)
        val_corr = pearson_corr_numpy(val_preds_full, val_targets_full)

        # A NaN correlation compares False against everything, so it must be
        # excluded explicitly from the "improved" test.
        improved = bool(np.isfinite(val_corr)) and val_corr > best_val_corr
        if improved:
            best_val_corr = val_corr

        # Save on improvement, and unconditionally on the first epoch so that
        # the returned path always points at a checkpoint from this call.
        if improved or not checkpoint_saved:
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True

        if (epoch + 1) % 10 == 0:
            print(f"  Epoch {epoch+1}/{train_params['epochs']}, "
                  f"Latent Loss: {val_loss_latent:.4f}, "
                  f"Validation Pearson Corr: {val_corr:.4f}")

    if not checkpoint_saved:
        # Zero epochs requested: still honour the "file exists" contract.
        torch.save(model.state_dict(), model_path)

    print(f"Best validation Pearson correlation for fold {n_fold+1}: {best_val_corr:.4f}")
    return model_path

  from scipy.spatial.qhull import QhullError


: 

In [None]:
#--- Cell 7: Train and Validation ---
# Runs K-fold cross-validation: trains one FNN per fold, reloads each fold's
# best checkpoint, predicts the test set and averages the reconstructed
# (full-dimensional) predictions over the folds.
MODEL_PARAMS = {
    'num_features': train_X_umap.shape[1],
    'target_dim': train_y_svd.shape[1],
    'hidden_size': 1024,
    'dropout': 0.25,
}
TRAIN_PARAMS = {'batch_size': 512, 'epochs': 50, 'lr': 1e-3}

# One seed drives both the fold split and (offset by the fold index) the torch
# RNG, so weight init, dropout and batch shuffling are repeatable across runs.
CV_SEED = 42
folds = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)
n_splits = folds.get_n_splits()

sub_preds = np.zeros((test_X_umap.shape[0], train_y_clr.shape[1]), dtype=np.float32)

# joblib.load unpickles the file; this is only safe because the file is the
# one written by this notebook's own preprocessing cell.
svd_y_loaded = joblib.load('svd_y_model.joblib')

# The test loader does not depend on the fold, so build it once. The dataset
# class requires three arrays; the two target slots are filled with the
# features and ignored at prediction time.
test_dataset = MultiTargetDataset(test_X_umap, test_X_umap, test_X_umap)
test_loader = DataLoader(test_dataset, batch_size=TRAIN_PARAMS['batch_size'] * 2)

# --- Main CV Loop ---
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_X_umap)):
    print(f"\n===== Processing Fold {n_fold+1}/{n_splits} =====")
    torch.manual_seed(CV_SEED + n_fold)

    # 1. Slice all necessary data arrays for the current fold
    X_train_fold, X_valid_fold = train_X_umap[train_idx], train_X_umap[valid_idx]
    y_svd_train_fold, y_svd_valid_fold = train_y_svd[train_idx], train_y_svd[valid_idx]
    y_clr_train_fold, y_clr_valid_fold = train_y_clr[train_idx], train_y_clr[valid_idx]

    # 2. Prepare DataLoaders, Model, Optimizer, and Loss
    train_dataset = MultiTargetDataset(X_train_fold, y_svd_train_fold, y_clr_train_fold)
    valid_dataset = MultiTargetDataset(X_valid_fold, y_svd_valid_fold, y_clr_valid_fold)

    # BatchNorm1d cannot normalise a single-sample batch in training mode, so
    # drop the trailing batch in the one case where it would hold one sample.
    batch_size = TRAIN_PARAMS['batch_size']
    n_train = len(train_dataset)
    drop_last = n_train > batch_size and n_train % batch_size == 1

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              num_workers=0, drop_last=drop_last)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size * 2, num_workers=0)

    model = FNNModel(**MODEL_PARAMS).to(DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=TRAIN_PARAMS['lr'])
    loss_fn = CombinedLoss().to(DEVICE)

    # 3. Call the training function
    best_model_path = train_one_fold(
        train_loader, valid_loader, model, optimizer, loss_fn,
        svd_y_loaded,
        TRAIN_PARAMS, n_fold, f"{name}_final_fnn"
    )

    # 4. Load best model and perform predictions on the test set
    # map_location keeps the load valid even if the checkpoint's device differs.
    model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
    model.eval()

    test_preds_list_low_dim = []
    with torch.no_grad():
        for features, _, _ in test_loader:
            preds_low_dim = model(features.to(DEVICE))
            test_preds_list_low_dim.append(preds_low_dim.cpu().numpy())

    test_preds_low_dim = np.concatenate(test_preds_list_low_dim)

    # Back-project the latent predictions to the full target space.
    test_preds_high_dim = svd_y_loaded.inverse_transform(test_preds_low_dim)

    # Accumulate predictions for each fold (running mean over the folds)
    sub_preds += test_preds_high_dim / n_splits

    # Clean up memory for the next fold
    del model, optimizer, loss_fn, train_dataset, valid_dataset, train_loader, valid_loader
    gc.collect()
    torch.cuda.empty_cache()

# --- Save final results ---
np.save(f'sub_preds_{name}_final_fnn.npy', sub_preds)
print(f'\nTest predictions saved, shape: {sub_preds.shape}')


===== Processing Fold 1/5 =====


  warn(


In [None]:
# --- Cell 8: Cleanup (Restored) ---

def _remove_and_report(path):
    """Delete `path` and print the outcome; an OSError is reported, not raised."""
    try:
        os.remove(path)
        print(f"Removed: {path}")
    except OSError as e:
        print(f"Error removing {path}: {e}")

# Final step: delete the per-fold checkpoints written during training
print("\nCleaning up saved model files...")
for n_fold in range(folds.get_n_splits()):
    model_path = f"model_{name}_final_fnn_fold_{n_fold+1}.pth"
    _remove_and_report(model_path)

# The serialized target-SVD model file is removed as well
_remove_and_report('svd_y_model.joblib')

print("\nCleanup complete. Only prediction files remain.")


--- Training Model 2 (MSE Loss with Z-scored Targets) ---

===== Fold 1 =====
Epoch 1/10, Val Loss: 0.5881
Epoch 2/10, Val Loss: 0.5904
Epoch 3/10, Val Loss: 0.5882
Epoch 4/10, Val Loss: 0.5877
Epoch 5/10, Val Loss: 0.5880
Epoch 6/10, Val Loss: 0.5877
Epoch 7/10, Val Loss: 0.5876
Epoch 8/10, Val Loss: 0.5877
Epoch 9/10, Val Loss: 0.5878
Epoch 10/10, Val Loss: 0.5876
Predicting on test set (memory-efficiently)...


Test Prediction Fold 1:   0%|          | 0/55 [00:00<?, ?it/s]


===== Fold 2 =====
Epoch 1/10, Val Loss: 0.5866
Epoch 2/10, Val Loss: 0.5867
Epoch 3/10, Val Loss: 0.5882
Epoch 4/10, Val Loss: 0.5865
Epoch 5/10, Val Loss: 0.5865
Epoch 6/10, Val Loss: 0.5868
Epoch 7/10, Val Loss: 0.5877
Epoch 8/10, Val Loss: 0.5863
Epoch 9/10, Val Loss: 0.5863
Epoch 10/10, Val Loss: 0.5863
Predicting on test set (memory-efficiently)...


Test Prediction Fold 2:   0%|          | 0/55 [00:00<?, ?it/s]


===== Fold 3 =====
Epoch 1/10, Val Loss: 0.5883
Epoch 2/10, Val Loss: 0.5894
Epoch 3/10, Val Loss: 0.5886
Epoch 4/10, Val Loss: 0.5884
Epoch 5/10, Val Loss: 0.5885
Epoch 6/10, Val Loss: 0.5883
Epoch 7/10, Val Loss: 0.5882
Epoch 8/10, Val Loss: 0.5880
Epoch 9/10, Val Loss: 0.5881
Epoch 10/10, Val Loss: 0.5880
Predicting on test set (memory-efficiently)...


Test Prediction Fold 3:   0%|          | 0/55 [00:00<?, ?it/s]


===== Fold 4 =====
Epoch 1/10, Val Loss: 0.5872
Epoch 2/10, Val Loss: 0.5872
Epoch 3/10, Val Loss: 0.5872
Epoch 4/10, Val Loss: 0.5872
Epoch 5/10, Val Loss: 0.5871
Epoch 6/10, Val Loss: 0.5872
Epoch 7/10, Val Loss: 0.5873
Epoch 8/10, Val Loss: 0.5874
Epoch 9/10, Val Loss: 0.5870
Epoch 10/10, Val Loss: 0.5876
Predicting on test set (memory-efficiently)...


Test Prediction Fold 4:   0%|          | 0/55 [00:00<?, ?it/s]


===== Fold 5 =====
Epoch 1/10, Val Loss: 0.5881
Epoch 2/10, Val Loss: 0.5878
Epoch 3/10, Val Loss: 0.5876
Epoch 4/10, Val Loss: 0.5875
Epoch 5/10, Val Loss: 0.5875
Epoch 6/10, Val Loss: 0.5876
Epoch 7/10, Val Loss: 0.5876
Epoch 8/10, Val Loss: 0.5877
Epoch 9/10, Val Loss: 0.5875
Epoch 10/10, Val Loss: 0.5875
Predicting on test set (memory-efficiently)...


Test Prediction Fold 5:   0%|          | 0/55 [00:00<?, ?it/s]


Overall CV Pearson Score: 0.6425

--- Saving predictions from Model 1 to disk ---
Saved oof_preds_mse_multi.npy and sub_preds_mse_multi.npy successfully.
