In [1]:
%load_ext autoreload
%autoreload 2

import os
import pickle

import anndata as ad
import numpy as np
import pandas as pd
import yaml
import sys
import scanpy as sc
import scipy.sparse as sps
import scipy.io as sio

import scglue
import seaborn as sns

from os.path import join

import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the data directory
root_dir = '/home/yanxh/gitrepo/multi-omics-matching/neurips2021_multimodal_topmethods-main'
data_dir = "/home/sda1/yanxh/data/Seurat_demo_data/pbmc_multiome"

par = {}
par['output_pretrain'] = os.path.join(
    root_dir, 
    'output/pretrain/clue/pbmc_mult.clue_train.output_pretrain/')

In [3]:
# print('Reading `mtx` files...')
rna_count_mat = sps.csr_matrix(sio.mmread(join(data_dir, 'rna_mat_count.mtx')).T)
atac_count_mat = sps.csr_matrix(sio.mmread(join(data_dir, 'atac_mat_count.mtx')).T)

rna_names = pd.read_csv(join(data_dir, 'gene_names.csv'))['x'].to_numpy()
atac_names = pd.read_csv(join(data_dir, 'atac_names.csv'))['x'].to_numpy()

cell_names = pd.read_csv(join(data_dir, 'cell_names.csv'))['x'].to_numpy()
meta_data = pd.read_csv(join(data_dir, 'metadata.csv'), index_col=0)

train_idx = pd.read_csv(join(data_dir, 'train_idx.csv'))['0'].to_numpy()
test_idx  = pd.read_csv(join(data_dir, 'test_idx.csv'))['0'].to_numpy()

rna_count_mat.shape, atac_count_mat.shape, train_idx.size, test_idx.size

((10411, 36601), (10411, 106086), 6940, 3471)

In [4]:
sys.path.append(os.path.join(root_dir, 'src/match_modality/methods/clue/resources'))
import utils

In [5]:
print('Reading `h5ad` files...')
input_train_mod1 = sc.AnnData(sps.csr_matrix(rna_count_mat[train_idx]), obs=meta_data.loc[cell_names[train_idx]])
input_train_mod2 = sc.AnnData(sps.csr_matrix(atac_count_mat[train_idx]), obs=meta_data.loc[cell_names[train_idx]])
input_train_mod1.var_names = rna_names
input_train_mod2.var_names = atac_names

input_train_mod1.obs['batch'] = 'batch1'
input_train_mod2.obs['batch'] = 'batch1'
input_train_mod1.layers["counts"] = input_train_mod1.X.astype(np.float32)
input_train_mod2.layers["counts"] = input_train_mod2.X.astype(np.float32)

mod1_feature_type = 'GEX'
mod2_feature_type = 'ATAC'

Reading `h5ad` files...


  input_train_mod1 = sc.AnnData(sps.csr_matrix(rna_count_mat[train_idx]), obs=meta_data.loc[cell_names[train_idx]])
  input_train_mod2 = sc.AnnData(sps.csr_matrix(atac_count_mat[train_idx]), obs=meta_data.loc[cell_names[train_idx]])


In [6]:
input_train_mod1, input_train_mod2

(AnnData object with n_obs × n_vars = 6940 × 36601
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seurat_annotations', 'batch'
     layers: 'counts',
 AnnData object with n_obs × n_vars = 6940 × 106086
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seurat_annotations', 'batch'
     layers: 'counts')

In [8]:
if {mod1_feature_type, mod2_feature_type} == {"GEX", "ATAC"}:
    omics = "multiome"
elif {mod1_feature_type, mod2_feature_type} == {"GEX", "ADT"}:
    omics = "cite"
else:
    raise RuntimeError("Unrecognized modality!")

In [9]:
if omics in ["cite", 'multiome']:  # perform better with this group
    n_genes = 5000
    latent_dim = 20
    x2u_h_depth = 2
    x2u_h_dim = 512
    u2x_h_depth = 1
    u2x_h_dim = 128
    du_h_depth = 2
    du_h_dim = 128
    dropout = 0.2
    lam_data = 1.0
    lam_kl = 1.0
    lam_align = 2.0
    lam_cross = 2.0
    lam_cos = 1.0
    normalize_u = True
    random_seed = 5
# elif omics == "multiome":  # not very well
#     n_genes = 10000
#     latent_dim = 50
#     x2u_h_depth = 2
#     x2u_h_dim = 512
#     u2x_h_depth = 1
#     u2x_h_dim = 256
#     du_h_depth = 1
#     du_h_dim = 256
#     dropout = 0.2
#     lam_data = 1.0
#     lam_kl = 0.3
#     lam_align = 0.02
#     lam_cross = 1.0
#     lam_cos = 0.02
#     normalize_u = True
#     random_seed = 2

In [10]:
os.makedirs(par['output_pretrain'], exist_ok=True)
with open(os.path.join(par['output_pretrain'], "hyperparams.yaml"), "w") as f:
    yaml.dump({
        "n_genes": n_genes,
        "latent_dim": latent_dim,
        "x2u_h_depth": x2u_h_depth,
        "x2u_h_dim": x2u_h_dim,
        "u2x_h_depth": u2x_h_depth,
        "u2x_h_dim": u2x_h_dim,
        "du_h_depth": du_h_depth,
        "du_h_dim": du_h_dim,
        "dropout": dropout,
        "lam_data": lam_data,
        "lam_kl": lam_kl,
        "lam_align": lam_align,
        "lam_cross": lam_cross,
        "lam_cos": lam_cos,
        "normalize_u": normalize_u,
        "random_seed": random_seed
    }, f)

In [11]:
input_train_mod1.obs["uid"] = [f"train-{i}" for i in range(input_train_mod1.shape[0])]
input_train_mod2.obs["uid"] = [f"train-{i}" for i in range(input_train_mod2.shape[0])]

In [12]:
if mod1_feature_type == "GEX":
    gex = input_train_mod1
    other = input_train_mod2
else:
    gex = input_train_mod2
    other = input_train_mod1

In [13]:
print('Preprocessing GEX...')
gex_prep = utils.GEXPreprocessing(n_comps=100, n_genes=n_genes, merge_adt=omics == "cite")
gex_prep.fit_transform(gex)

if omics == "cite":
    print('Preprocessing ADT...')
    other_prep = utils.ADTPreprocessing(n_comps=100)
elif omics == "multiome":
    print('Preprocessing ATAC...')
    other_prep = utils.ATACPreprocessing(n_comps=100)
    
other_prep.fit_transform(other)

Preprocessing GEX...
Preprocessing ATAC...


In [14]:
with open(os.path.join(par['output_pretrain'], "prep.pickle"), "wb") as f:
    pickle.dump({
        "gex_prep": gex_prep,
        "other_prep": other_prep
    }, f)

In [15]:
scglue.models.configure_dataset(
    gex, "NB", use_highly_variable=True,
    use_layer="counts", use_rep="X_pca",
    use_batch="batch", use_uid="uid"
)
scglue.models.configure_dataset(
    other, "NB", use_highly_variable=True,
    use_layer="counts", use_rep="X_lsi",
    use_batch="batch", use_uid="uid"
)

In [16]:
print('Building model...')
model = scglue.models.SCCLUEModel(
    {"gex": gex, "other": other},
    latent_dim=latent_dim,
    x2u_h_depth=x2u_h_depth,
    x2u_h_dim=x2u_h_dim,
    u2x_h_depth=u2x_h_depth,
    u2x_h_dim=u2x_h_dim,
    du_h_depth=du_h_depth,
    du_h_dim=du_h_dim,
    dropout=dropout,
    shared_batches=True,
    random_seed=random_seed
)

training = True

Building model...
[INFO] autodevice: Using GPU 1 as computation device.


In [17]:
# loading pretrained weight
# model = scglue.models.load_model(os.path.join(par['output_pretrain'], "pretrain.dill"))
# training = False

In [18]:
print('Compiling model...')
model.compile(
    lam_data=lam_data, lam_kl=lam_kl, lam_align=lam_align,
    lam_cross=lam_cross, lam_cos=lam_cos, normalize_u=normalize_u,
    domain_weight={"gex": 1, "other": 1}
)

Compiling model...


In [19]:
if training:
    print('Training model...')
    model.fit(
        {"gex": gex, "other": other}
    )
    model.save(os.path.join(par['output_pretrain'], "pretrain.dill"))

Training model...
[INFO] SCCLUEModel: Setting `align_burnin` = 123
[INFO] SCCLUEModel: Setting `max_epochs` = 738
[INFO] SCCLUEModel: Setting `patience` = 93
[INFO] SCCLUEModel: Setting `reduce_lr_patience` = 31
[INFO] SCCLUETrainer: Using training directory: "/tmp/GLUETMPrf7qc4nj"
[INFO] SCCLUETrainer: [Epoch 10] train={'dsc_loss': 0.693, 'gen_loss': 4.33, 'cross_loss': 1.887, 'cos_loss': 0.025, 'x_gex_nll': 0.426, 'x_gex_kl': 0.025, 'x_gex_elbo': 0.451, 'x_other_nll': 1.461, 'x_other_kl': 0.005, 'x_other_elbo': 1.467}, val={'dsc_loss': 0.694, 'gen_loss': 4.224, 'cross_loss': 1.854, 'cos_loss': 0.02, 'x_gex_nll': 0.426, 'x_gex_kl': 0.024, 'x_gex_elbo': 0.45, 'x_other_nll': 1.429, 'x_other_kl': 0.005, 'x_other_elbo': 1.434}, 2.3s elapsed
[INFO] SCCLUETrainer: [Epoch 20] train={'dsc_loss': 0.693, 'gen_loss': 2.773, 'cross_loss': 1.373, 'cos_loss': 0.016, 'x_gex_nll': 0.333, 'x_gex_kl': 0.021, 'x_gex_elbo': 0.354, 'x_other_nll': 1.04, 'x_other_kl': 0.005, 'x_other_elbo': 1.045}, val={'ds

2023-08-27 18:35:07,586 ignite.handlers.early_stopping.EarlyStopping INFO: EarlyStopping: Stop training


[INFO] EarlyStopping: Restoring checkpoint "213"...
[INFO] EarlyStopping: Restoring checkpoint "213"...
