In [36]:
import os
import scanpy as sc
import muon as mu 
from anndata import AnnData
from typing import Callable, List, Mapping, Optional
import scglue

In [37]:
# Location of the multiome datasets and the file to process.
data_dir = "../datasets/multiome/"
dataset = "DOGMAseq_human_pbmcs.h5mu"
# Alternative inputs (uncomment one to switch dataset):
# dataset = "PBMC10k.h5mu"
# dataset = "multiome_BMMC.h5mu"

# Read the selected .h5mu file and display the loaded object.
h5mu_path = os.path.join(data_dir, dataset)
mdata = mu.read_h5mu(h5mu_path)
mdata

In [38]:
# Attach genomic coordinates (chrom / chromStart / chromEnd) to every gene of the
# RNA modality, keep only genes located on a chromosome, and order the genes by
# genomic position.
rna = mdata.mod['rna'].copy()
rna.var.index.name = "genes"

if 'interval' not in rna.var.columns:
    # No coordinates shipped with the data: look them up in the GENCODE GTF by
    # gene name. The lookup runs on a scratch copy so that only the three
    # coordinate columns are carried over into `rna.var`.
    gene_location = mdata.mod['rna'].copy()
    scglue.data.get_gene_annotation(
        gene_location, gtf="../datasets/GTFFiles/gencode.v47.basic.annotation.gtf.gz",
        gtf_by="gene_name"
    )
    for coord_col in ('chrom', 'chromStart', 'chromEnd'):
        rna.var[coord_col] = gene_location.var[coord_col]
    # Drop genes that received no coordinates from the GTF lookup.
    rna = rna[:, rna.var['chromStart'].notnull()]
    # Keep chr1-chr22, chrX and chrY. The previous pattern used `2[0-1]`, which
    # silently discarded every gene on chr22.
    rna = rna[:, rna.var['chrom'].str.match(r'^chr([1-9]|1[0-9]|2[0-2]|X|Y)$')].copy()
else:
    # Coordinates are stored in the `interval` column, split here on ':' and '-'
    # (presumably formatted as "chrom:start-end" -- TODO confirm for new datasets).
    interval_parts = rna.var['interval'].str.split(r"[:-]", expand=True)
    rna.var['chrom'] = interval_parts[0]
    rna.var['chromStart'] = interval_parts[1]
    rna.var['chromEnd'] = interval_parts[2]
    # na=False drops genes whose interval is missing instead of failing on a
    # NaN mask; .copy() turns the view into a real object before it is modified.
    rna = rna[:, rna.var['chrom'].str.contains('chr', na=False)].copy()

rna.var['chromStart'] = rna.var['chromStart'].astype(int)
rna.var['chromEnd'] = rna.var['chromEnd'].astype(int)

# Reorder the whole AnnData, not just the `var` table. Sorting `rna.var` in place
# would move the gene labels while the columns of X stay where they were, leaving
# every gene name attached to the wrong expression column. Positional indices are
# used so duplicated gene names cannot make the reordering ambiguous.
# Note: sort_values compares `chrom` as text when it is a string column, so chr10
# sorts before chr2.
gene_order = (
    rna.var.reset_index(drop=True)
    .sort_values(by=['chrom', 'chromStart', 'chromEnd'])
    .index.to_numpy()
)
rna = rna[:, gene_order].copy()
rna

AnnData object with n_obs × n_vars = 13383 × 22560
    obs: 'batch', 'cell_type'
    var: 'chrom', 'chromStart', 'chromEnd'

In [39]:
# Derive genomic coordinates for every ATAC peak from its name, keep only peaks
# whose chromosome field contains "chr", and order the peaks by genomic position.
atac = mdata.mod['atac'].copy()
atac.var.index.name = "peaks"

# Peak names are split on ':' and '-' into [chrom, start, end]
# (presumably "chrom:start-end" or "chrom-start-end" -- TODO confirm for new datasets).
peak_fields = atac.var_names.str.split(r"[:-]")
atac.var["chrom"] = peak_fields.map(lambda fields: fields[0])
atac.var["chromStart"] = peak_fields.map(lambda fields: fields[1]).astype(int)
atac.var["chromEnd"] = peak_fields.map(lambda fields: fields[2]).astype(int)

atac = atac[:, atac.var['chrom'].str.contains('chr')].copy()

# Reorder the whole AnnData, not just the `var` table. Sorting `atac.var` in place
# would move the peak labels while the columns of X stay where they were, leaving
# every peak name attached to the wrong accessibility column. Positional indices
# are used so duplicated peak names cannot make the reordering ambiguous.
# Note: sort_values compares `chrom` as text when it is a string column, so chr10
# sorts before chr2.
peak_order = (
    atac.var.reset_index(drop=True)
    .sort_values(by=['chrom', 'chromStart', 'chromEnd'])
    .index.to_numpy()
)
atac = atac[:, peak_order].copy()
atac

AnnData object with n_obs × n_vars = 13383 × 68963
    obs: 'batch', 'cell_type'
    var: 'chrom', 'chromStart', 'chromEnd'

In [40]:
# Assemble a new MuData from the processed `rna` and `atac` objects; this
# replaces the `mdata` that was loaded from disk.
modalities = {'rna': rna, 'atac': atac}
mdata = mu.MuData(modalities)
mdata

In [41]:
# Copy cell-level annotations from the RNA modality onto the top-level obs table.
rna_obs = mdata.mod['rna'].obs
mdata.obs['cell_type'] = rna_obs['cell_type']

# 'batch' is only copied when the RNA modality actually carries it.
if 'batch' in rna_obs.columns:
    mdata.obs['batch'] = rna_obs['batch']

In [42]:
# Save next to the input file as "<name>_sortby_chrom.h5mu". os.path.splitext strips
# only the final extension, so a dataset name that itself contains dots is kept
# intact (splitting on the first "." would truncate it).
dataset_stem = os.path.splitext(dataset)[0]
mdata.write_h5mu(os.path.join(data_dir, dataset_stem + "_sortby_chrom.h5mu"))