In [1]:
import functools
import gc
import itertools
import os
from itertools import chain
import datetime

import anndata as ad
import faiss
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from os.path import join
from matplotlib import rcParams
from sklearn.preprocessing import Normalizer
from sklearn.utils.extmath import randomized_svd

import scglue

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Restrict CUDA to physical GPU 1. The variable must be spelled
# `CUDA_VISIBLE_DEVICES` (plural); the singular spelling is silently ignored.
# It only takes effect if set before CUDA is initialised in this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
# Output directory for this run (also passed to the trainer and used for the
# saved model below); created if it does not already exist.
PATH = "./cache/HF-400K-ATLAS"
os.makedirs(PATH, exist_ok=True)

In [4]:
# Load the ATAC and RNA AnnData objects from their `*_raw.h5ad` files.
# NOTE(review): absolute, user-specific paths — consider a configurable data root.
atac_all = ad.read_h5ad("/home/yanxh/data/HumanFetal/ATAC/adata_atac_raw.h5ad")
rna_all = ad.read_h5ad("/home/yanxh/data/HumanFetal/RNA/adata_rna_raw.h5ad")

In [5]:
# Root of the HumanFetal_400k data. `atac0` / `rna0` are only used below for
# their cell names and annotations, then deleted.
data_root = '/home/yanxh/data/HumanFetal_400k'

atac0 = sc.read_h5ad(join(data_root, 'ATAC/adata_atac.h5ad'))
rna0 = sc.read_h5ad(join(data_root, 'RNA/adata_rna_sampled.h5ad'))

In [6]:
# Display the summary (shape, obs/var columns) of the loaded RNA object.
rna_all

AnnData object with n_obs × n_vars = 433695 × 63561
    obs: 'All_reads', 'Assay', 'Batch', 'Development_day', 'Exon_reads', 'Experiment_batch', 'Fetus_id', 'Intron_reads', 'Main_cluster_name', 'Main_cluster_umap_1', 'Main_cluster_umap_2', 'Organ', 'Organ_cell_lineage', 'RT_group', 'Sex', 'Size_Factor', 'batch', 'obs_names', 'sample'
    var: 'exon_intron', 'gene_id', 'gene_short_name', 'gene_type', 'index', 'var_names'

In [7]:

# Restrict the full ATAC object to the sampled cells (in the sampled order) and
# carry over the sampled object's annotations positionally.
atac = atac_all[atac0.obs_names].copy()
atac.obs['cell_type'] = atac0.obs.cell_type.values
atac.obs['domain'] = 'atac'
# Rename 'brain' (case-insensitive) to 'cerebrum'; all other labels are kept as-is.
atac.obs['tissue'] = [
    'cerebrum' if tissue.lower() == 'brain' else tissue
    for tissue in atac0.obs.tissue.values
]

# Same for RNA; its cell-type label comes from `Main_cluster_name`.
rna = rna_all[rna0.obs_names].copy()
rna.obs['cell_type'] = rna0.obs.Main_cluster_name.values
rna.obs['domain'] = 'rna'

# Use gene short names as variable names and keep only the first occurrence of
# each name.
rna.var_names = rna.var.gene_short_name.values.astype('str')
# Materialise the subset so that later cells modify a real AnnData object
# instead of a view (which would be copied implicitly on first modification).
rna = rna[:, ~rna.var_names.duplicated()].copy()

# The sampled objects are no longer needed.
del atac0, rna0
gc.collect()

563

### Process feature names

In [8]:
# Annotate RNA genes from the GENCODE GTF, matching records by gene name.
scglue.data.get_gene_annotation(
    rna,
    gtf="/home/yanxh/data/HumanFetal/gencode.v42.chr_patch_hapl_scaff.annotation.gtf",
    gtf_by="gene_name",
)
# Drop genes whose `chromStart` is NaN after annotation.
has_coords = pd.notna(rna.var["chromStart"])
rna = rna[:, has_coords].copy()

# ATAC peak names are split on ':' or '-' into chrom / start / end fields.
split = atac.var_names.str.split(r"[:-]")
atac.var["chrom"] = split.str[0]
atac.var["chromStart"] = split.str[1].astype(int)
atac.var["chromEnd"] = split.str[2].astype(int)
atac.var.head()

Unnamed: 0,chrom,chromStart,chromEnd
chr1-9992-10688,chr1,9992,10688
chr1-14831-15063,chr1,14831,15063
chr1-17351-17617,chr1,17351,17617
chr1-29200-29505,chr1,29200,29505
chr1-115525-115871,chr1,115525,115871


### Organ balancing

In [9]:
def _organ_fractions(organ_labels):
    """Return, per lower-cased organ label, its count divided by the number of cells."""
    return organ_labels.str.lower().value_counts() / organ_labels.shape[0]


rna_organ_fracs = _organ_fractions(rna.obs["Organ"])
atac_organ_fracs = _organ_fractions(atac.obs["tissue"])
# Side-by-side table of organ fractions in the two modalities.
cmp_organ_fracs = pd.DataFrame({"rna": rna_organ_fracs, "atac": atac_organ_fracs})

# For each organ, the smaller of its RNA and ATAC fractions.
organ_min_fracs = cmp_organ_fracs.min(axis=1)

In [10]:
# Draw an organ-balanced subsample from each modality. For every organ, the
# number of cells sampled (without replacement) is the smaller of the two
# modality fractions times that modality's cell count, with a floor of one
# cell. The selection is stored as a boolean `mask` column in `.obs`.
rs = np.random.RandomState(0)  # fixed seed so the subsample is reproducible
rna_subidx, atac_subidx = [], []
# `Series.items()` replaces `iteritems()`, which is deprecated and was removed
# in pandas 2.0.
for organ, min_frac in organ_min_fracs.items():
    print(f"Dealing with {organ}...")
    rna_idx = np.where(rna.obs["Organ"].str.lower() == organ)[0]
    rna_subidx.append(rs.choice(rna_idx, max(1, round(min_frac * rna.shape[0])), replace=False))
    atac_idx = np.where(atac.obs["tissue"].str.lower() == organ)[0]
    atac_subidx.append(rs.choice(atac_idx, max(1, round(min_frac * atac.shape[0])), replace=False))

# Turn the sampled positional indices into boolean masks over all cells.
rna_subidx = np.concatenate(rna_subidx)
rna_mask = np.zeros(rna.shape[0], dtype=bool)
rna_mask[rna_subidx] = True
rna.obs["mask"] = rna_mask
atac_subidx = np.concatenate(atac_subidx)
atac_mask = np.zeros(atac.shape[0], dtype=bool)
atac_mask[atac_subidx] = True
atac.obs["mask"] = atac_mask

  for organ, min_frac in organ_min_fracs.iteritems():


Dealing with adrenal...
Dealing with cerebellum...
Dealing with cerebrum...
Dealing with eye...
Dealing with heart...
Dealing with intestine...
Dealing with kidney...
Dealing with liver...
Dealing with lung...
Dealing with muscle...
Dealing with pancreas...
Dealing with placenta...
Dealing with spleen...
Dealing with stomach...
Dealing with thymus...


In [11]:
# Per-organ weights: the square root of the other modality's fraction divided
# by this modality's fraction.
rna_frac, atac_frac = cmp_organ_fracs["rna"], cmp_organ_fracs["atac"]
rna_organ_balancing = np.sqrt(atac_frac / rna_frac)
atac_organ_balancing = np.sqrt(rna_frac / atac_frac)

In [12]:
# Look up each cell's weight by its lower-cased organ label and store it as a
# plain array (no index alignment) in `.obs["organ_balancing"]`.
rna_organs = rna.obs["Organ"].str.lower()
rna.obs["organ_balancing"] = rna_organ_balancing.loc[rna_organs].to_numpy()

atac_organs = atac.obs["tissue"].str.lower()
atac.obs["organ_balancing"] = atac_organ_balancing.loc[atac_organs].to_numpy()

### Gene selection and dimensionality reduction

In [13]:
# Select the top 4000 highly variable genes on the masked (organ-balanced)
# cells only, then copy the resulting columns onto the full object's `.var`.
hvg_df = sc.pp.highly_variable_genes(
    rna[rna.obs["mask"], :],
    n_top_genes=4000,
    flavor="seurat_v3",
    inplace=False,
)
rna.var = rna.var.assign(**{column: hvg_df[column] for column in hvg_df.columns})

In [14]:
# Keep a copy of the current X in a layer before normalisation overwrites it.
rna.layers["raw_counts"] = rna.X.copy()
sc.pp.normalize_total(rna)
sc.pp.log1p(rna)
# Keep only the genes flagged `highly_variable` in `rna.var`.
rna = rna[:, rna.var.highly_variable].copy()
gc.collect()

1272

In [15]:
# Standardise every gene using the mean and standard deviation of the masked
# cells only, then apply that scaling to all cells.
# X is used through `.power` / `.toarray`, i.e. it is treated as a sparse matrix here.
X = rna.X
X_masked = X[rna.obs["mask"]]
mean = X_masked.mean(axis=0).A1
# Standard deviation computed as sqrt(E[x^2] - E[x]^2).
# NOTE(review): a gene that is constant across the masked cells would give
# std == 0 and a division by zero below — confirm this cannot occur.
std = np.sqrt(X_masked.power(2).mean(axis=0).A1 - mean ** 2)
X = X.toarray()
X -= mean
X /= std
# Clip the standardised values to [-10, 10].
X = X.clip(-10, 10)
# Re-take the masked rows, now from the standardised dense matrix.
X_masked = X[rna.obs["mask"]]

In [16]:
# Take the top 100 components from a randomized SVD of the gene-by-gene product
# of the masked, standardised matrix, then project all cells onto them.
u, s, vh = randomized_svd(
    X_masked.T @ X_masked,
    n_components=100,
    n_iter=15,
    random_state=0,
)
rna.obsm["X_pca"] = X @ vh.T

In [17]:
# Put the saved raw counts back into X, then drop the layer and the
# temporaries created for standardisation and PCA.
rna.X = rna.layers["raw_counts"]
del rna.layers["raw_counts"], X, X_masked, mean, std, u, s, vh
gc.collect()

0

In [18]:
# LSI embedding for ATAC: TF-IDF, L1 row normalisation, log1p(1e4 * x), then a
# 100-component randomized SVD.
X = scglue.num.tfidf(atac.X)
X = Normalizer(norm="l1").fit_transform(X)
X = np.log1p(X * 1e4)
# The SVD is fitted on the masked (organ-balanced) cells only ...
X_masked = X[atac.obs["mask"]]
u, s, vh = randomized_svd(X_masked, 100, n_iter=15, random_state=0)
# ... and all cells are projected onto its components, divided by the singular values.
X_lsi = X @ vh.T / s
# Standardise each cell's embedding row-wise (zero mean, unit sample std with ddof=1).
X_lsi -= X_lsi.mean(axis=1, keepdims=True)
X_lsi /= X_lsi.std(axis=1, ddof=1, keepdims=True)
atac.obsm["X_lsi"] = X_lsi.astype(np.float32)

del X, X_masked, X_lsi, u, s, vh
gc.collect()

# Optional caching of the LSI embedding (disabled):
# np.save(f'{PATH}/x_lsi.npy', atac.obsm["X_lsi"])
# X_lsi = np.load(f'{PATH}/x_lsi.npy')
# atac.obsm["X_lsi"] = X_lsi.astype(np.float32)

### Create graph

Build the guidance graph; this step also generates the highly variable feature (HVF) flags for ATAC.

In [19]:
# Start of the timed section that is later reported as `train_time`
# (it covers guidance-graph construction as well as model fitting).
start_time = datetime.datetime.now()

In [20]:
# Build the RNA-anchored guidance graph between RNA and ATAC features, then
# validate it against both datasets.
guidance = scglue.genomics.rna_anchored_guidance_graph(rna, atac)
scglue.graph.check_graph(guidance, [rna, atac])

window_graph: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4000/4000 [00:08<00:00, 459.03it/s]


[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] check_graph: All checks passed!


# 2. scGLUE Pretrain

In [21]:
# Configure the RNA dataset for scGLUE: "NB" distribution, highly variable
# features only, PCA embedding as the representation, per-cell organ weights.
scglue.models.configure_dataset(
    rna, "NB", use_highly_variable=True,
    use_rep="X_pca",        # unlike the tutorial, the default layer is used here; X was replaced with the raw counts earlier
    use_dsc_weight="organ_balancing"
)

In [22]:
# Configure the ATAC dataset for scGLUE: "NB" distribution, highly variable
# features only, LSI embedding as the representation, per-cell organ weights.
scglue.models.configure_dataset(
    atac, "NB", use_highly_variable=True,
    use_rep="X_lsi",
    use_dsc_weight="organ_balancing"
)

In [23]:
# Restrict the guidance graph to the features flagged `highly_variable` in
# either modality, and copy it so it is independent of the full graph.
hvf_nodes = [
    *rna.var.query("highly_variable").index,
    *atac.var.query("highly_variable").index,
]
guidance_hvf = guidance.subgraph(hvf_nodes).copy()

# Optional persistence of the graphs (disabled):
# nx.write_graphml(guidance, f"{PATH}/full.graphml.gz")
# nx.write_graphml(guidance_hvf, f"{PATH}/sub.graphml.gz")

# graph = nx.read_graphml(f"{PATH}/full.graphml.gz")
# subgraph = nx.read_graphml(f"{PATH}/sub.graphml.gz")

In [None]:
# Fit the scGLUE model on both modalities with the HVF guidance subgraph;
# PATH is passed as the training directory. This is the long-running step.
glue = scglue.models.fit_SCGLUE(
    {"rna": rna, "atac": atac}, guidance_hvf,
    fit_kws={"directory": PATH}
)

[INFO] fit_SCGLUE: Pretraining SCGLUE model...
[INFO] autodevice: Using GPU 1 as computation device.
[INFO] check_graph: Checking variable coverage...
[INFO] check_graph: Checking edge attributes...
[INFO] check_graph: Checking self-loops...
[INFO] check_graph: Checking graph symmetry...
[INFO] SCGLUEModel: Setting `graph_batch_size` = 185042
[INFO] SCGLUEModel: Setting `max_epochs` = 48
[INFO] SCGLUEModel: Setting `patience` = 4
[INFO] SCGLUEModel: Setting `reduce_lr_patience` = 2
[INFO] SCGLUETrainer: Using training directory: "cache/HF-400K-ATLAS/pretrain"
Epoch 00009: reducing learning rate of group 0 to 2.0000e-04.
Epoch 00009: reducing learning rate of group 0 to 2.0000e-04.
[INFO] LRScheduler: Learning rate reduction: step 1
[INFO] SCGLUETrainer: [Epoch 10] train={'g_nll': 0.389, 'g_kl': 0.001, 'g_elbo': 0.39, 'x_rna_nll': 0.094, 'x_rna_kl': 0.005, 'x_rna_elbo': 0.099, 'x_atac_nll': 0.019, 'x_atac_kl': 0.0, 'x_atac_elbo': 0.019, 'dsc_loss': 0.693, 'vae_loss': 0.134, 'gen_loss': 

In [None]:
# Wall-clock seconds elapsed since `start_time` was set before graph construction.
end_time = datetime.datetime.now()
train_time = (end_time - start_time).total_seconds()
print('train time, ', train_time)

In [32]:
# Display the measured training time in seconds.
train_time

26881.664582

In [23]:
# Save the fitted model into the run directory.
glue.save(f"{PATH}/glue.dill")
# To reload it later from the same location:
# glue = scglue.models.load_model(f"{PATH}/glue.dill")

# Label transfer

In [33]:
# Restart the timer for the encoding + label-transfer section (`test_time`).
start_time = datetime.datetime.now()

In [34]:
# Encode both modalities with the fitted model and store the embeddings
# under `obsm["X_glue"]`.
rna.obsm["X_glue"] = glue.encode_data("rna", rna)
atac.obsm["X_glue"] = glue.encode_data("atac", atac)

In [35]:
# Keep the original ATAC annotation as `cell_type_bkp` and remove `cell_type`
# from `.obs` before labels are transferred into that column.
atac.obs = (
    atac.obs
    .assign(cell_type_bkp=atac.obs['cell_type'].values)
    .drop(columns=['cell_type'])
)

In [36]:
# Transfer `cell_type` from RNA (reference) to ATAC (query) using the shared
# `X_glue` embedding.
scglue.data.transfer_labels(rna, atac, "cell_type", use_rep="X_glue")

In [37]:
# Wall-clock seconds for encoding plus label transfer.
end_time = datetime.datetime.now()
test_time = (end_time - start_time).total_seconds()
print('test time, ', test_time)

test time,  90.374238


In [38]:
# Total wall-clock time in seconds (training + label transfer).
train_time + test_time

26972.03882

In [27]:
# Label-transfer accuracy: the fraction of ATAC cells whose transferred
# `cell_type` equals the original annotation kept in `cell_type_bkp`.
# Plain arrays are compared because comparing two categorical Series directly
# raises a TypeError when their category sets differ.
(atac.obs["cell_type"].to_numpy() == atac.obs["cell_type_bkp"].to_numpy()).mean()

0.5795458333333333