In [1]:
import anndata as ad
import networkx as nx
import scanpy as sc
import scglue
import os
import pandas as pd
import numpy as np
from os.path import join
from matplotlib import rcParams
from itertools import chain
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Plot styling: apply scglue's publication defaults first, then override the
# default figure size to (4, 4).
scglue.plot.set_publication_params()
rcParams.update({"figure.figsize": (4, 4)})

In [3]:
# Experiment identifier; it also names the cache sub-directory below.
exp_id = 'PBMC'

# Cache directory for intermediate artifacts (LSI embedding, training output, saved model).
PATH = f'./cache/{exp_id}'
os.makedirs(PATH, exist_ok=True)

# Input directory. Overridable through the PBMC_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the default is the original location.
DATA_DIR = os.environ.get("PBMC_DATA_DIR", "/home/yanxh/data/pbmc_10x")

rna = ad.read_h5ad(join(DATA_DIR, "adata_rna_raw.h5ad"))
atac = ad.read_h5ad(join(DATA_DIR, "adata_atac_raw.h5ad"))

In [4]:
# Display both AnnData summaries to confirm the two inputs loaded.
rna, atac

(AnnData object with n_obs × n_vars = 10412 × 36601
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seurat_annotations', 'cell_type',
 AnnData object with n_obs × n_vars = 10412 × 108377
     obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'seurat_annotations', 'cell_type')

In [5]:
# Keep an untouched copy of rna.X in the "counts" layer before X is normalized
# and scaled further down; configure_dataset later reads this layer.
rna.layers["counts"] = rna.X.copy()

In [6]:
# Flag the top 2000 highly variable genes with the "seurat_v3" flavor.
# This runs before the normalization cell that follows.
sc.pp.highly_variable_genes(rna, n_top_genes=2000, flavor="seurat_v3")

In [7]:
# RNA preprocessing applied to rna.X, in order: total-count normalization,
# log1p transform, scaling, then a 100-component PCA.
# The PCA result is consumed later via use_rep="X_pca".
sc.pp.normalize_total(rna)
sc.pp.log1p(rna)
sc.pp.scale(rna)
sc.tl.pca(rna, n_comps=100, svd_solver="auto")

In [8]:
# LSI embedding of the ATAC data (100 components, 15 iterations), read back from
# atac.obsm["X_lsi"] and saved to the cache directory.
# random_state is pinned for reproducibility: extra keyword arguments are forwarded
# to scikit-learn's randomized SVD, whose default seed differs between versions.
scglue.data.lsi(atac, n_components=100, n_iter=15, random_state=0)
np.save(f'{PATH}/x_lsi.npy', atac.obsm["X_lsi"])

# To reuse the saved embedding instead of recomputing it:
# x_lsi = np.load(f'{PATH}/x_lsi.npy')
# atac.obsm['X_lsi'] = x_lsi

In [9]:
# GTF location. Overridable through the GENCODE_GTF environment variable so the
# notebook is not tied to a single machine; the default is the original path.
GTF_FILE = os.environ.get(
    "GENCODE_GTF",
    "/home/yanxh/data/HumanFetal/gencode.v42.chr_patch_hapl_scaff.annotation.gtf",
)

# Annotate rna.var from the GTF, keyed by the GTF "gene_name" field.
scglue.data.get_gene_annotation(rna, gtf=GTF_FILE, gtf_by="gene_name")
# exclude NaN rows (genes that received no chromStart)
rna = rna[:, pd.notna(rna.var["chromStart"])].copy()

rna.shape

(10412, 23517)

In [10]:
# Peak names have the form "chr1-10109-10357"; split on ':' or '-' and store the
# three fields as chrom / chromStart / chromEnd columns on atac.var.
split = atac.var_names.str.split(r"[:-]")
atac.var["chrom"] = split.str[0]
atac.var["chromStart"] = split.str[1].astype(int)
atac.var["chromEnd"] = split.str[2].astype(int)
atac.var.head()

Unnamed: 0,chrom,chromStart,chromEnd
chr1-10109-10357,chr1,10109,10357
chr1-180730-181630,chr1,180730,181630
chr1-191491-191736,chr1,191491,191736
chr1-267816-268196,chr1,267816,268196
chr1-586028-586373,chr1,586028,586373


In [11]:
# Build the RNA-anchored guidance graph over the rna and atac features,
# then echo the object to show its type.
guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
guidance

window_graph: 100%|████████████████████████████████████████████████████████████████████████████| 23517/23517 [00:01<00:00, 15362.98it/s]


<networkx.classes.multidigraph.MultiDiGraph at 0x7f17951d9e20>

In [12]:
# Validate the guidance graph against both datasets; the log reports checks for
# variable coverage, edge attributes, self-loops and graph symmetry.
scglue.graph.check_graph(guidance, [rna, atac])

[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


In [13]:
# rna.write("./cache/rna-pp.h5ad", compression="gzip")
# atac.write("./cache/atac-pp.h5ad", compression="gzip")
# nx.write_graphml(guidance, "./cache/guidance.graphml.gz")

# Training

In [14]:
# Register the RNA dataset for model training: "NB" as the distribution argument
# (presumably negative binomial), highly variable features only, values from the
# "counts" layer, and "X_pca" as the input representation.
scglue.models.configure_dataset(
    rna, "NB", use_highly_variable=True,
    use_layer="counts", use_rep="X_pca",
#     use_cell_type='cell_type'
)

In [15]:
# Register the ATAC dataset for model training: "NB" as the distribution argument,
# highly variable features only, and "X_lsi" as the input representation.
# NOTE(review): atac.var["highly_variable"] is never set explicitly in this notebook;
# presumably rna_anchored_guidance_graph populates it. Confirm.
scglue.models.configure_dataset(
    atac, "NB", use_highly_variable=True,
    use_rep="X_lsi"
)

In [16]:
# Restrict the guidance graph to features flagged as highly variable in either
# modality; .copy() turns the subgraph view into an independent graph.
hvf_nodes = [
    *rna.var.query("highly_variable").index,
    *atac.var.query("highly_variable").index,
]
guidance_hvf = guidance.subgraph(hvf_nodes).copy()

In [17]:
# Train the SCGLUE model on both modalities using the HVF-restricted guidance graph.
# Training output is written under PATH (the log shows "pretrain" and "fine-tune"
# sub-directories).
glue = scglue.models.fit_SCGLUE(
    {"rna": rna, "atac": atac}, guidance_hvf,
    fit_kws={"directory": PATH}
)

[INFO] fit_SCGLUE: Pretraining SCGLUE model...
[INFO] autodevice: Using GPU 0 as computation device.
[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] SCGLUEModel: Setting `graph_batch_size` = 12925
[INFO] SCGLUEModel: Setting `max_epochs` = 164
[INFO] SCGLUEModel: Setting `patience` = 14
[INFO] SCGLUEModel: Setting `reduce_lr_patience` = 7
[INFO] SCGLUETrainer: Using training directory: "cache/PBMC/pretrain"
[INFO] SCGLUETrainer: [Epoch 10] train={'g_nll': 0.494, 'g_kl': 0.015, 'g_elbo': 0.509, 'x_rna_nll': 0.341, 'x_rna_kl': 0.01, 'x_rna_elbo': 0.351, 'x_atac_nll': 0.244, 'x_atac_kl': 0.002, 'x_atac_elbo': 0.246, 'dsc_loss': 0.686, 'vae_loss': 0.618, 'gen_loss': 0.583}, val={'g_nll': 0.49, 'g_kl': 0.015, 'g_elbo': 0.505, 'x_rna_nll': 0.35, 'x_rna_kl': 0.009, 'x_rna_elbo': 0.36, 'x_atac_nll': 0.236, 'x_atac_kl': 0.002, 'x_atac_elbo': 0.237, '

2023-02-09 09:27:57,161 ignite.handlers.early_stopping.EarlyStopping INFO: EarlyStopping: Stop training


[INFO] EarlyStopping: Restoring checkpoint "41"...
[INFO] EarlyStopping: Restoring checkpoint "41"...
[INFO] fit_SCGLUE: Estimating balancing weight...
[INFO] estimate_balancing_weight: Clustering cells...


OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


[INFO] estimate_balancing_weight: Matching clusters...
[INFO] estimate_balancing_weight: Matching array shape = (20, 27)...
[INFO] estimate_balancing_weight: Estimating balancing weight...
[INFO] fit_SCGLUE: Fine-tuning SCGLUE model...
[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] SCGLUEModel: Setting `graph_batch_size` = 12925
[INFO] SCGLUEModel: Setting `align_burnin` = 28
[INFO] SCGLUEModel: Setting `max_epochs` = 164
[INFO] SCGLUEModel: Setting `patience` = 14
[INFO] SCGLUEModel: Setting `reduce_lr_patience` = 7
[INFO] SCGLUETrainer: Using training directory: "cache/PBMC/fine-tune"
[INFO] SCGLUETrainer: [Epoch 10] train={'g_nll': 0.458, 'g_kl': 0.014, 'g_elbo': 0.473, 'x_rna_nll': 0.335, 'x_rna_kl': 0.01, 'x_rna_elbo': 0.345, 'x_atac_nll': 0.24, 'x_atac_kl': 0.002, 'x_atac_elbo': 0.241, 'dsc_loss': 0.689, 'vae_loss': 0.605, 'gen_loss':

2023-02-09 09:41:49,140 ignite.handlers.early_stopping.EarlyStopping INFO: EarlyStopping: Stop training


[INFO] EarlyStopping: Restoring checkpoint "71"...
[INFO] EarlyStopping: Restoring checkpoint "71"...


In [18]:
# Persist the trained model in the cache directory.
glue.save(f"{PATH}/glue.dill")
# To reload it later instead of retraining (same path as the save above):
# glue = scglue.models.load_model(f"{PATH}/glue.dill")

In [19]:
# dx = scglue.models.integration_consistency(
#     glue, {"rna": rna, "atac": atac}, guidance_hvf
# )
# dx

# _ = sns.lineplot(x="n_meta", y="consistency", data=dx).axhline(y=0.05, c="darkred", ls="--")

# Embedding

In [20]:
# Encode each modality with the trained model and store the result as obsm["X_glue"].
rna.obsm["X_glue"] = glue.encode_data("rna", rna)
atac.obsm["X_glue"] = glue.encode_data("atac", atac)

In [23]:
# Keep the original ATAC labels under 'cell_type_bkp' and drop 'cell_type', so the
# label transfer that follows writes a fresh 'cell_type' column to compare against.
# NOTE(review): execution counts jump from 20 to 23. Confirm that no deleted cell
# is still needed for a clean Restart & Run All.
atac.obs['cell_type_bkp'] = atac.obs['cell_type'].values
atac.obs = atac.obs.drop(columns=['cell_type'])

In [24]:
# Transfer the "cell_type" labels from rna to atac using the X_glue representation.
scglue.data.transfer_labels(rna, atac, "cell_type", use_rep="X_glue")

In [25]:
# Fraction of ATAC cells whose transferred label equals the backed-up original label.
(atac.obs.cell_type == atac.obs.cell_type_bkp).mean()

0.8877257011140991