In [1]:
import numpy as np
import pandas as pd
from scipy.io import mmread
import os
import glob
import anndata
import tqdm
import zarr
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

from dredFISH.Utils import basicu
from dredFISH.Utils import powerplots

In [2]:
%%time
pths = [
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P28_glut_dr.h5ad",
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P28_gaba_dr.h5ad",
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P28_non_dr.h5ad",
]
meta = []
for pth in pths:
    meta_ = anndata.read(pth, backed='r').obs
    meta.append(meta_)
    print(meta_.shape)
meta = pd.concat(meta, join='inner')
newnames = ['-'.join(i.split('-')[:3])+'-P28' for i in meta.index.values]
meta.index = newnames
print(meta.shape)
meta = meta[~meta.index.duplicated(keep='first')]
print(meta.shape)
meta

(16379, 12)
(2175, 10)
(6263, 11)
(24817, 10)
(24816, 10)
CPU times: user 490 ms, sys: 177 ms, total: 667 ms
Wall time: 2.35 s


Unnamed: 0,n_genes,percent_mito,n_counts,Doublet,Doublet Score,batch,leiden,sample,Type,Subclass
AAACGAACATCACAGT-1-P28_dr_1a-P28,3720,0.001237,12124.0,False,0.027677,0,L4_1,P28_dr_1a,L4_1,L4
AAACGAATCAATCCGA-1-P28_dr_1a-P28,3798,0.001707,10543.0,False,0.071549,0,11,P28_dr_1a,L4_2,L4
AAAGAACCATGCGTGC-1-P28_dr_1a-P28,6513,0.002578,25592.0,False,0.097239,0,L4_1,P28_dr_1a,L4_1,L4
AAAGAACTCTACGCAA-1-P28_dr_1a-P28,4570,0.002076,13487.0,False,0.081595,0,L4_1,P28_dr_1a,L4_1,L4
AAAGGATAGTGCTCGC-1-P28_dr_1a-P28,3478,0.000669,10466.0,False,0.122757,0,1,P28_dr_1a,L2/3_3,L2/3
...,...,...,...,...,...,...,...,...,...,...
GCTGGGTAGCAAACAT-1-P28_dr_3b-P28,1491,0.002014,2977.0,False,0.011527,P28,1,P28_dr_3b,OD_B,OD
TCAATTCGTTATAGAG-1-P28_dr_3b-P28,784,0.002935,1021.0,False,0.026968,P28,5,P28_dr_3b,Endo,Endo
AGAACCTGTGATACAA-1-P28_dr_3b-P28,1092,0.001289,1552.0,False,0.031783,P28,0,P28_dr_3b,Micro,Micro
GATTCGAGTGATACCT-1-P28_dr_3b-P28,1113,0.001309,1528.0,False,0.016036,P28,0,P28_dr_3b,Micro,Micro


In [3]:
np.unique(newnames).shape

(24816,)

In [4]:
suffixes = ['-'.join(i.split('-')[2:]) for i in meta.index.values]
np.unique(suffixes)

array(['P28_dr_1a-P28', 'P28_dr_1b-P28', 'P28_dr_3a-P28', 'P28_dr_3b-P28'],
      dtype='<U13')

In [5]:
meta['sample'].unique()

['P28_dr_1a', 'P28_dr_1b', 'P28_dr_3a', 'P28_dr_3b']
Categories (4, object): ['P28_dr_1a', 'P28_dr_1b', 'P28_dr_3a', 'P28_dr_3b']

In [6]:
ddir = '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/'
pths = [
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_dr_1_a/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_dr_1_b/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_dr_3_a/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_dr_3_b/filtered_feature_bc_matrix/counts.h5ad"),
]
renames = [
    "-P28_dr_1a-P28",
    "-P28_dr_1b-P28",
    "-P28_dr_3a-P28",
    "-P28_dr_3b-P28",
]

adata = []
for f, rn in zip(pths, renames):
    adata_ = anndata.read(f)
    adata_.obs.index = np.char.add(adata_.obs.index.values.astype(str), rn)
    print(adata_.shape)
    adata.append(adata_)
    
adata = anndata.concat(adata, merge='same')
print(adata.shape)
adata

(12057, 53801)
(12670, 53801)
(9278, 53801)
(9229, 53801)
(43234, 53801)


AnnData object with n_obs × n_vars = 43234 × 53801
    var: 'id', 'name'

In [7]:
np.unique(adata.obs.index).shape

(43234,)

In [8]:
adata_.obs

AAACCCAAGATTACCC-1-P28_dr_3b-P28
AAACCCAAGGACCCAA-1-P28_dr_3b-P28
AAACCCAAGGCGCTCT-1-P28_dr_3b-P28
AAACCCAAGTCGTTAC-1-P28_dr_3b-P28
AAACCCACAAGGCCTC-1-P28_dr_3b-P28
...
TTTGTTGCAACGGCCT-1-P28_dr_3b-P28
TTTGTTGCATCCGAGC-1-P28_dr_3b-P28
TTTGTTGGTAAGATCA-1-P28_dr_3b-P28
TTTGTTGGTCACTACA-1-P28_dr_3b-P28
TTTGTTGGTGTCATTG-1-P28_dr_3b-P28


In [9]:
cov = np.asarray(adata.X.sum(axis=1)).reshape(-1,)
medcov = np.median(cov)
adata.obs['cov'] = cov
adata.obs['covfactor'] = cov/medcov

In [10]:
tmp = adata[meta.index].copy()
unq, inv, cnt = np.unique(tmp.var['name'].values, return_counts=True, return_inverse=True)
tmp = tmp[:, cnt[inv] == 1]
tmp.var = tmp.var.set_index('name')
tmp.obs = tmp.obs.join(meta)
# tmp

AnnData expects .var.index to contain strings, but got values like:
    ['4933401J01Rik', 'Gm26206', 'Xkr4', 'Gm18956', 'Gm37180']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [11]:
tmp.var

Unnamed: 0_level_0,id
name,Unnamed: 1_level_1
4933401J01Rik,ENSMUSG00000102693
Gm26206,ENSMUSG00000064842
Xkr4,ENSMUSG00000051951
Gm18956,ENSMUSG00000102851
Gm37180,ENSMUSG00000103377
...,...
CAAA01205117.1,ENSMUSG00000094431
CAAA01098150.1,ENSMUSG00000094621
CAAA01064564.1,ENSMUSG00000098647
Vmn2r122,ENSMUSG00000096730


In [12]:
tmp.obs

Unnamed: 0,cov,covfactor,n_genes,percent_mito,n_counts,Doublet,Doublet Score,batch,leiden,sample,Type,Subclass
AAACGAACATCACAGT-1-P28_dr_1a-P28,12132.0,2.069954,3720,0.001237,12124.0,False,0.027677,0,L4_1,P28_dr_1a,L4_1,L4
AAACGAATCAATCCGA-1-P28_dr_1a-P28,10548.0,1.799693,3798,0.001707,10543.0,False,0.071549,0,11,P28_dr_1a,L4_2,L4
AAAGAACCATGCGTGC-1-P28_dr_1a-P28,25610.0,4.369562,6513,0.002578,25592.0,False,0.097239,0,L4_1,P28_dr_1a,L4_1,L4
AAAGAACTCTACGCAA-1-P28_dr_1a-P28,13491.0,2.301826,4570,0.002076,13487.0,False,0.081595,0,L4_1,P28_dr_1a,L4_1,L4
AAAGGATAGTGCTCGC-1-P28_dr_1a-P28,10468.0,1.786043,3478,0.000669,10466.0,False,0.122757,0,1,P28_dr_1a,L2/3_3,L2/3
...,...,...,...,...,...,...,...,...,...,...,...,...
GCTGGGTAGCAAACAT-1-P28_dr_3b-P28,2979.0,0.508275,1491,0.002014,2977.0,False,0.011527,P28,1,P28_dr_3b,OD_B,OD
TCAATTCGTTATAGAG-1-P28_dr_3b-P28,1022.0,0.174373,784,0.002935,1021.0,False,0.026968,P28,5,P28_dr_3b,Endo,Endo
AGAACCTGTGATACAA-1-P28_dr_3b-P28,1552.0,0.264801,1092,0.001289,1552.0,False,0.031783,P28,0,P28_dr_3b,Micro,Micro
GATTCGAGTGATACCT-1-P28_dr_3b-P28,1528.0,0.260706,1113,0.001309,1528.0,False,0.016036,P28,0,P28_dr_3b,Micro,Micro


In [13]:
pth_res = os.path.join(ddir, "P28_dr_allcells_Oct25.h5ad")
tmp.write(pth_res)

In [14]:
b = anndata.read(pth_res, backed='r')
b



AnnData object with n_obs × n_vars = 24816 × 53547 backed at '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/P28_dr_allcells_Oct25.h5ad'
    obs: 'cov', 'covfactor', 'n_genes', 'percent_mito', 'n_counts', 'Doublet', 'Doublet Score', 'batch', 'leiden', 'sample', 'Type', 'Subclass'
    var: 'id'