# SCGEN: Perturbation Prediction

In [1]:
import sys
#if branch is stable, will install via pypi, else will install from source
branch = "stable"
IN_COLAB = "google.colab" in sys.modules

if IN_COLAB and branch == "stable":
    !pip install --quiet scgen[tutorials]
elif IN_COLAB and branch != "stable":
    !pip install --quiet --upgrade jsonschema
    !pip install --quiet git+https://github.com/theislab/scgen@$branch#egg=scgen[tutorials]

In [2]:
import logging
import scanpy as sc
import scgen
import numpy as np
import torch
from gears import PertData

In [3]:
# Print the version of each core library, one "name: version" line per library.
for label, module in (("torch:", torch), ("scgen:", scgen), ("numpy:", np)):
    print(label, module.__version__)

torch: 2.9.0+cu128
scgen: 2.1.1
numpy: 2.3.4


### Loading Data

In [4]:
# Build the GEARS PertData container, passing "data/" (presumably the local
# data directory — confirm against the GEARS API), then load the dataset
# registered under the name "norman".
pert_data = PertData("data/")
pert_data.load(data_name="norman")

Found local copy...
Found local copy...
Found local copy...
These perturbations are not in the GO graph and their perturbation can thus not be predicted
['RHOXF2BB+ctrl' 'LYL1+IER5L' 'ctrl+IER5L' 'KIAA1804+ctrl' 'IER5L+ctrl'
 'RHOXF2BB+ZBTB25' 'RHOXF2BB+SET']
Local copy of pyg dataset is detected. Loading...
Done!


In [5]:
pert_data.prepare_split(split = 'simulation', seed = 1) # split the data into train/test
pert_adata = pert_data.adata # the full dataset

Creating new splits....
Saving new splits at data/norman/splits/norman_simulation_1_0.75.pkl
Simulation split test composition:
combo_seen0:9
combo_seen1:43
combo_seen2:19
unseen_single:36
Done!


In [6]:
# Positional indices of the cells labelled 'train' and 'test' in obs['split'].
split_column = pert_adata.obs['split']
train_idx = np.flatnonzero(split_column == 'train')
test_idx = np.flatnonzero(split_column == 'test')

# Write both index arrays to .npy files in the working directory.
for out_path, indices in (
    ("norman_simulation_seed1_train_idx.npy", train_idx),
    ("norman_simulation_seed1_test_idx.npy", test_idx),
):
    np.save(out_path, indices)

In [7]:
# Label constants for this notebook: a perturbation condition, the control
# condition, and a cell-type name.
# NOTE(review): presumably these match values in obs['condition'] and
# obs['cell_type'] — verify against the loaded data.
stimulated, control, cell_type = "SAMD1+ZBTB1", "ctrl", "A549"

### Preprocessing Data

In [8]:
# Slice the full dataset into its train / val / test partitions according to
# the label stored in obs['split'].
split_labels = pert_adata.obs['split']
train_adata, val_adata, test_adata = (
    pert_adata[split_labels == split_name]
    for split_name in ('train', 'val', 'test')
)

In [9]:
train_new = train_adata.concatenate(val_adata, index_unique=None) # merge the train and val partitions

In [10]:
# Pick one cell per test-set perturbation and append it to the training data.
# A dedicated, seeded generator makes the selection identical on every
# Restart & Run All; the seed value mirrors the one passed to prepare_split.
rng = np.random.default_rng(1)

idx_list = []
# observed=True iterates only over conditions that actually occur in the test
# partition, so an unused categorical level can never produce an empty group
# (drawing from an empty index would raise).
for cond, df in test_adata.obs.groupby('condition', observed=True):
    # Draw exactly one obs name for each perturbation in the test set.
    chosen_idx = rng.choice(df.index, size=1, replace=False)[0]
    idx_list.append(chosen_idx)
subset_adata = test_adata[idx_list].copy()

# Add the sampled cells to the training set.
# NOTE(review): these cells come from the test partition, so any later
# evaluation on test_adata includes cells the model was trained on — confirm
# this is intended.
train_new = train_new.concatenate(subset_adata, index_unique=None)

In [11]:
# Rebind train_new to a copy of itself.
# NOTE(review): presumably done to guarantee a standalone (non-view) object
# before it is registered with the model — confirm whether it is still needed.
train_new = train_new.copy()

In [12]:
# Register train_new with scGen, naming obs column "condition" as the batch
# key and obs column "cell_type" as the labels key.
scgen.SCGEN.setup_anndata(train_new, batch_key="condition", labels_key="cell_type")

## Creating and Saving the Model

In [13]:
# Instantiate the scGen model on the registered training data.
model = scgen.SCGEN(train_new)
# Save the model before any training has run; the training cell below saves
# to this same path again with overwrite=True, replacing this copy.
model.save("./saved_models/scgen_norman_prediction.pt", overwrite=True)

## Training the Model

In [14]:
# Training settings gathered in one mapping so they can be read and tuned at a
# glance: up to 100 epochs, mini-batches of 32, and early stopping with a
# patience of 25.
train_options = dict(
    max_epochs=100,
    batch_size=32,
    early_stopping=True,
    early_stopping_patience=25,
)
model.train(**train_options)

# Save the model after training, replacing any earlier save at this path.
model.save("./saved_models/scgen_norman_prediction.pt", overwrite=True)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
You are using a CUDA device ('NVIDIA A30 MIG 2g.12gb') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [MIG-GPU-d8c14bb1-941e-4adf-09f8-3cddd1e34633/1/0;]


Training:   0%|          | 0/100 [00:00<?, ?it/s]

Monitored metric elbo_validation did not improve in the last 25 records. Best score: 834.844. Signaling Trainer to stop.


### Latent Space

In [15]:
# Fetch the model's latent representation and wrap it in a new AnnData that
# carries a copy of train_new's obs table.
# NOTE(review): assumes the latent rows follow the same cell order as
# train_new.obs — confirm.
latent_X = model.get_latent_representation()
latent_obs = train_new.obs.copy()
latent_adata = sc.AnnData(X=latent_X, obs=latent_obs)

In [16]:
#sc.pp.neighbors(latent_adata)
#sc.tl.umap(latent_adata)
#sc.pl.umap(latent_adata, color=['condition'], wspace=0.4, frameon=False,
#           save='latentspace_batch32_klw000005_z100__100e.pdf')