In [1]:
import numpy as np
import pandas as pd
from scipy.io import mmread
import os
import glob
import anndata
import tqdm
import zarr
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

from dredFISH.Utils import basicu
from dredFISH.Utils import powerplots

In [10]:
%%time
pths = [
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P28_glut.h5ad",
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P28_gaba.h5ad",
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P28_non.h5ad",
]
meta = []
for pth in pths:
    meta_ = anndata.read(pth, backed='r').obs
    meta.append(meta_)
    print(meta_.shape)
meta = pd.concat(meta, join='inner')
meta.index = [i[:-len('-P28-4')] if i.endswith('-P28-4') else i for i in meta.index.values] # fix gaba bug
print(meta.shape)
meta

(15315, 16)
(2106, 12)
(6509, 11)
(23930, 11)
CPU times: user 536 ms, sys: 517 ms, total: 1.05 s
Wall time: 1.05 s


Unnamed: 0,batch,n_genes,percent_mito,n_counts,leiden,Doublet,Doublet Score,Class_broad,sample,Type,Subclass
AAACCCACAAATGATG-1-P28_1a-P28,P28,3450,0.000951,10513.0,3,False,0.023377,Excitatory,P28_1a,L6CT_B,L6CT
AAACCCACACCCTAGG-1-P28_1a-P28,P28,4622,0.002900,15174.0,2,False,0.055276,Excitatory,P28_1a,L4_A,L4
AAACCCAGTTCTCTCG-1-P28_1a-P28,P28,4895,0.002141,16814.0,6,False,0.040816,Excitatory,P28_1a,L2/3_C,L2/3
AAACCCATCACTTGTT-1-P28_1a-P28,P28,3751,0.000471,12745.0,3,False,0.012605,Excitatory,P28_1a,L6CT_B,L6CT
AAACCCATCCTAGCTC-1-P28_1a-P28,P28,4327,0.001903,13661.0,1,False,0.028505,Excitatory,P28_1a,L4_B,L4
...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGGTTTCCAAG-1-P28_2b-P28,P28,1819,0.002328,3865.0,3,False,0.005745,Oligodendrocytes,P28_2b,OD_B,OD
TTTGGTTCAACACTAC-1-P28_2b-P28,P28,2298,0.000845,5918.0,6,False,0.008497,Oligodendrocytes,P28_2b,OD_B,OD
TTTGGTTGTCCACTTC-1-P28_2b-P28,P28,2419,0.001318,5313.0,2,False,0.009633,OPCs,P28_2b,OPC_A,OPC
TTTGTTGCATAGGTAA-1-P28_2b-P28,P28,1682,0.002540,3541.0,0,False,0.012542,Astrocytes,P28_2b,Astro_A,Astro


In [11]:
meta.loc[[i for i in meta.index if i.endswith('-P28-P28')]]

Unnamed: 0,batch,n_genes,percent_mito,n_counts,leiden,Doublet,Doublet Score,Class_broad,sample,Type,Subclass


In [12]:
meta['sample'].unique()

['P28_1a', 'P28_1b', 'P28_2a', 'P28_2b']
Categories (4, object): ['P28_1a', 'P28_1b', 'P28_2a', 'P28_2b']

In [13]:
ddir = '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/'
pths = [
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_nr_1_a/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_nr_1_b/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_nr_2_a/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P28_nr_2_b/filtered_feature_bc_matrix/counts.h5ad"),
]
renames = [
    "-P28_1a-P28",
    "-P28_1b-P28",
    "-P28_2a-P28",
    "-P28_2b-P28",
]

adata = []
for f, rn in zip(pths, renames):
    adata_ = anndata.read(f)
    adata_.obs.index = np.char.add(adata_.obs.index.values.astype(str), rn)
    print(adata_.shape)
    adata.append(adata_)
    
adata = anndata.concat(adata, merge='same')
print(adata.shape)
adata

(6666, 53801)
(6558, 53801)
(7268, 53801)
(7937, 53801)
(28429, 53801)


AnnData object with n_obs × n_vars = 28429 × 53801
    var: 'id', 'name'

In [14]:
adata_.obs

AAACCCAAGGCTCACC-1-P28_2b-P28
AAACCCACACAAGCCC-1-P28_2b-P28
AAACCCAGTGCCCTTT-1-P28_2b-P28
AAACGAAAGAGGGTCT-1-P28_2b-P28
AAACGAAAGCAGCCTC-1-P28_2b-P28
...
TTTGTTGCACGTAGTT-1-P28_2b-P28
TTTGTTGCATAGGTAA-1-P28_2b-P28
TTTGTTGGTGTCTAAC-1-P28_2b-P28
TTTGTTGTCCTAAGTG-1-P28_2b-P28
TTTGTTGTCTTAGCTT-1-P28_2b-P28


In [15]:
cov = np.asarray(adata.X.sum(axis=1)).reshape(-1,)
medcov = np.median(cov)
adata.obs['cov'] = cov
adata.obs['covfactor'] = cov/medcov

In [16]:
tmp = adata[meta.index].copy()
unq, inv, cnt = np.unique(tmp.var['name'].values, return_counts=True, return_inverse=True)
tmp = tmp[:, cnt[inv] == 1]
tmp.var = tmp.var.set_index('name')
tmp.obs = tmp.obs.join(meta)
tmp

AnnData expects .var.index to contain strings, but got values like:
    ['4933401J01Rik', 'Gm26206', 'Xkr4', 'Gm18956', 'Gm37180']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


AnnData object with n_obs × n_vars = 23930 × 53547
    obs: 'cov', 'covfactor', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'leiden', 'Doublet', 'Doublet Score', 'Class_broad', 'sample', 'Type', 'Subclass'
    var: 'id'

In [17]:
tmp.var

Unnamed: 0_level_0,id
name,Unnamed: 1_level_1
4933401J01Rik,ENSMUSG00000102693
Gm26206,ENSMUSG00000064842
Xkr4,ENSMUSG00000051951
Gm18956,ENSMUSG00000102851
Gm37180,ENSMUSG00000103377
...,...
CAAA01205117.1,ENSMUSG00000094431
CAAA01098150.1,ENSMUSG00000094621
CAAA01064564.1,ENSMUSG00000098647
Vmn2r122,ENSMUSG00000096730


In [18]:
tmp.obs

Unnamed: 0,cov,covfactor,batch,n_genes,percent_mito,n_counts,leiden,Doublet,Doublet Score,Class_broad,sample,Type,Subclass
AAACCCACAAATGATG-1-P28_1a-P28,10513.0,1.043578,P28,3450,0.000951,10513.0,3,False,0.023377,Excitatory,P28_1a,L6CT_B,L6CT
AAACCCACACCCTAGG-1-P28_1a-P28,15174.0,1.506254,P28,4622,0.002900,15174.0,2,False,0.055276,Excitatory,P28_1a,L4_A,L4
AAACCCAGTTCTCTCG-1-P28_1a-P28,16816.0,1.669248,P28,4895,0.002141,16814.0,6,False,0.040816,Excitatory,P28_1a,L2/3_C,L2/3
AAACCCATCACTTGTT-1-P28_1a-P28,12745.0,1.265138,P28,3751,0.000471,12745.0,3,False,0.012605,Excitatory,P28_1a,L6CT_B,L6CT
AAACCCATCCTAGCTC-1-P28_1a-P28,13661.0,1.356065,P28,4327,0.001903,13661.0,1,False,0.028505,Excitatory,P28_1a,L4_B,L4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGGTTTCCAAG-1-P28_2b-P28,3866.0,0.383760,P28,1819,0.002328,3865.0,3,False,0.005745,Oligodendrocytes,P28_2b,OD_B,OD
TTTGGTTCAACACTAC-1-P28_2b-P28,5919.0,0.587552,P28,2298,0.000845,5918.0,6,False,0.008497,Oligodendrocytes,P28_2b,OD_B,OD
TTTGGTTGTCCACTTC-1-P28_2b-P28,5313.0,0.527397,P28,2419,0.001318,5313.0,2,False,0.009633,OPCs,P28_2b,OPC_A,OPC
TTTGTTGCATAGGTAA-1-P28_2b-P28,3543.0,0.351697,P28,1682,0.002540,3541.0,0,False,0.012542,Astrocytes,P28_2b,Astro_A,Astro


In [19]:
pth_res = os.path.join(ddir, "P28_nr_allcells_Oct24.h5ad")
tmp.write(pth_res)

In [20]:
b = anndata.read(pth_res, backed='r')
b



AnnData object with n_obs × n_vars = 23930 × 53547 backed at '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/P28_nr_allcells_Oct24.h5ad'
    obs: 'cov', 'covfactor', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'leiden', 'Doublet', 'Doublet Score', 'Class_broad', 'sample', 'Type', 'Subclass'
    var: 'id'