cPCA components = directions along which the TARGET data varies strongly while the BACKGROUND data varies little (variation enriched in the target relative to the background).

In [None]:
import scanpy as sc
from contrastive import CPCA
import numpy as np

# ===== Load your data =====
# Assumes adata.obs has a 'condition' column with 'healthy' and 'disease'
adata = sc.read_h5ad("your_data.h5ad")

# ===== Preprocess =====
# Normalise every cell to 10,000 total counts, then log1p-transform.
# Highly variable genes are selected on the combined data (both conditions),
# so the disease and healthy matrices below share one gene set.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=2000)

# ===== Split into healthy and disease =====
adata_disease = adata[adata.obs['condition'] == 'disease'].copy()
adata_healthy = adata[adata.obs['condition'] == 'healthy'].copy()

# Fail fast when a label is missing or misspelled: an empty subset would
# otherwise only surface later as an opaque error inside cPCA.
for _label, _subset in (('disease', adata_disease), ('healthy', adata_healthy)):
    if _subset.n_obs == 0:
        raise ValueError(
            f"No cells with condition == '{_label}'; found values: "
            f"{sorted(map(str, adata.obs['condition'].unique()))}"
        )

# ===== Get HVGs =====
hvgs = adata.var_names[adata.var['highly_variable']]

# ===== Extract matrices =====
X_disease = adata_disease[:, hvgs].X
X_healthy = adata_healthy[:, hvgs].X

# Convert to dense if sparse. Each matrix is checked on its own so the
# conversion never relies on both having the same storage type.
if hasattr(X_disease, 'toarray'):
    X_disease = X_disease.toarray()
if hasattr(X_healthy, 'toarray'):
    X_healthy = X_healthy.toarray()

# ===== Run cPCA: Find variation unique to disease =====
# NOTE(review): confirm against the installed `contrastive` release that the
# constructor accepts `alpha` and that transform() returns a single 2-D array;
# some releases appear to choose alpha at transform time instead
# (alpha_selection / alpha_value) — verify before relying on this cell.
cpca = CPCA(n_components=50, alpha=1.0)
cpca.fit(X_disease, X_healthy)  # Target=disease, Background=healthy

# Transform disease cells to contrastive space
X_cpca = cpca.transform(X_disease)
adata_disease.obsm['X_cpca'] = X_cpca

# Project every cell (both conditions) with the same fitted model.
# adata.X may be sparse or dense depending on how the file was written, so
# densify only when needed instead of calling .toarray() unconditionally
# (which raises AttributeError on a dense array).
X_all = adata[:, hvgs].X
if hasattr(X_all, 'toarray'):
    X_all = X_all.toarray()
adata.obsm['X_cpca'] = cpca.transform(X_all)


# ===== Cluster using disease-specific variation =====
# Build the neighbour graph on the cPCA embedding stored in obsm['X_cpca'].
# NOTE(review): with use_rep set, n_pcs=30 presumably restricts the graph to
# the first 30 columns of the 50-component embedding — confirm this is intended.
sc.pp.neighbors(adata_disease, use_rep='X_cpca', n_pcs=30)
sc.tl.umap(adata_disease)
sc.tl.leiden(adata_disease, resolution=0.5)

# ===== Plot =====
# 'leiden' always exists after the clustering step above. 'donor_id' is
# optional metadata, so colour by it only when the column is present; this
# avoids a KeyError after the expensive steps have already run.
umap_colors = ['leiden']
if 'donor_id' in adata_disease.obs.columns:
    umap_colors.append('donor_id')
sc.pl.umap(adata_disease, color=umap_colors, ncols=2)

# ===== Find genes driving disease-specific clusters =====
# Wilcoxon rank-sum test per Leiden cluster; the top 20 genes are plotted.
sc.tl.rank_genes_groups(adata_disease, groupby='leiden', method='wilcoxon')
sc.pl.rank_genes_groups(adata_disease, n_genes=20)

print(f"Disease cells clustered into {adata_disease.obs['leiden'].nunique()} groups")