In [1]:
from scipy.io import mmread
from scipy.sparse import csr_matrix
import anndata as ad
import pandas as pd
import scvi
import numpy as np
import scanpy as sc
scvi.settings.seed = 420
import os, sys, time

  from .autonotebook import tqdm as notebook_tqdm
  doc = func(self, args[0].__doc__, *args[1:], **kwargs)
Seed set to 420


In [10]:
rna = ad.read_h5ad("../data/GSE126074/GSE126074-RNA.h5ad")
atac = ad.read_h5ad("../data/GSE126074/GSE126074-ATAC.h5ad")
rna.obs_names = rna.obs_names.str.replace(r"_RNA$", "", regex=True)
atac.obs_names = atac.obs_names.str.replace(r"_ATAC$", "", regex=True)

In [11]:
%%time
adata_paired = ad.concat([rna, atac], merge = "same",axis=1) 
adata_paired.var['modality']=['Gene Expression']*rna.shape[1]+['Peaks']*atac.shape[1]

CPU times: user 299 ms, sys: 414 ms, total: 712 ms
Wall time: 708 ms


In [12]:
del rna, atac

In [13]:
adata_paired

AnnData object with n_obs × n_vars = 9190 × 270687
    obs: 'protocol', 'cell_type'
    var: 'chrom', 'chromStart', 'chromEnd', 'genome', 'n_counts', 'modality'

In [14]:
# We can now use the organizing method from scvi to concatenate these anndata
adata_mvi = scvi.data.organize_multiome_anndatas(adata_paired)
del adata_paired

  self._set_dim_df(value, "var")


In [15]:
adata_mvi

AnnData object with n_obs × n_vars = 9190 × 270687
    obs: 'protocol', 'cell_type', 'modality'
    var: 'chrom', 'chromStart', 'chromEnd', 'genome', 'n_counts', 'modality'

In [16]:
adata_mvi = adata_mvi[:, adata_mvi.var["modality"].argsort()].copy()
adata_mvi.var

Unnamed: 0,chrom,chromStart,chromEnd,genome,n_counts,modality
0610005C13Rik,chr7,45567793,45575327,mm10,13,Gene Expression
Timm22,chr11,76406951,76416292,mm10,170,Gene Expression
Timm21,chr18,84946190,84951524,mm10,261,Gene Expression
Timm17b,chrX,7899356,7908351,mm10,49,Gene Expression
Timm17a,chr1,135295212,135313778,mm10,426,Gene Expression
...,...,...,...,...,...,...
chr15:41375862-41376253,chr15,41375862,41376253,mm10,70,Peaks
chr15:41379625-41380245,chr15,41379625,41380245,mm10,366,Peaks
chr15:41393974-41394207,chr15,41393974,41394207,mm10,35,Peaks
chr15:41311888-41312502,chr15,41311888,41312502,mm10,102,Peaks


In [17]:
scvi.model.MULTIVI.setup_anndata(adata_mvi, batch_key="modality")

In [18]:
model = scvi.model.MULTIVI(
    adata_mvi,
    n_genes=(adata_mvi.var["modality"] == "Gene Expression").sum(),
    n_regions=(adata_mvi.var["modality"] == "Peaks").sum(),
)
model.view_anndata_setup()



In [19]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [20]:
model.train()

  model.train()
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA RTX A2000 12GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 107/500:  21%|██▏       | 107/500 [39:40<2:25:42, 22.25s/it, v_num=1, train_loss_step=1.32e+4, train_loss_epoch=1.34e+4]
Monitored metric reconstruction_loss_validation did not improve in the last 50 records. Best score: 14358.027. Signaling Trainer to stop.


## Save and Load MultiVI models

In [21]:
latent = model.get_latent_representation()

In [22]:
np.savetxt("../result/GSE126074/MultiVI.csv", latent, delimiter=',')

In [23]:
model_dir = os.path.join('../result/GSE126074/MultiVI_model/', 'multivi_GSE126074')
model.save(model_dir, overwrite=True)