In [1]:
import numpy as np
import pandas as pd
from scipy.io import mmread
import os
import glob
import anndata
import tqdm
import zarr
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns

from dredFISH.Utils import basicu
from dredFISH.Utils import powerplots

In [2]:
%%time
pths = [
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P14_glut.h5ad",
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P14_gaba.h5ad",
    "../data_dump/counts/scRNAseq_Cheng2022_Cell_v1ctx/processed/P14_non.h5ad",
]
meta = []
for pth in pths:
    meta_ = anndata.read(pth, backed='r').obs
    meta.append(meta_)
    print(meta_.shape)
meta = pd.concat(meta, join='inner')
newnames = ['-'.join(i.split('-')[:4]) for i in meta.index.values]
meta.index = newnames
print(meta.shape)
meta = meta[~meta.index.duplicated(keep='first')]
print(meta.shape)
meta

(21220, 16)
(2844, 12)
(7708, 11)
(31772, 11)
(31772, 11)
CPU times: user 605 ms, sys: 81 ms, total: 686 ms
Wall time: 2.2 s


Unnamed: 0,batch,n_genes,percent_mito,n_counts,leiden,Doublet,Doublet Score,Class_broad,sample,Type,Subclass
AAACCCACACGCACCA-1-P14_1a-P14,0,3578,0.000250,11986.0,2,False,0.014984,Excitatory,P14_1a,L6CT_B,L6CT
AAACCCAGTCCTGAAT-1-P14_1a-P14,0,3520,0.000258,11632.0,3,False,0.016756,Excitatory,P14_1a,L4_A,L4
AAACCCATCGCTCTCA-1-P14_1a-P14,0,2581,0.000122,8164.0,2,False,0.012550,Excitatory,P14_1a,L6CT_B,L6CT
AAACGAACATGCAGGA-1-P14_1a-P14,0,2542,0.000158,6338.0,3,False,0.028730,Excitatory,P14_1a,L4_A,L4
AAACGAAGTCCTCAGG-1-P14_1a-P14,0,3203,0.000000,9317.0,2,False,0.012168,Excitatory,P14_1a,L6CT_B,L6CT
...,...,...,...,...,...,...,...,...,...,...,...
TTTGATCGTAACCAGG-1-P14_2b-P14,P14,2579,0.000178,5614.0,0,False,0.011671,Astrocytes,P14_2b,Astro_A,Astro
TTTGATCGTTTGAAAG-1-P14_2b-P14,P14,1199,0.000496,2015.0,1,False,0.006068,OPCs,P14_2b,OPC_A,OPC
TTTGGAGGTATGACAA-1-P14_2b-P14,P14,966,0.002183,1374.0,2,False,0.006733,Microglia,P14_2b,Micro,Micro
TTTGGTTGTAGAATGT-1-P14_2b-P14,P14,1935,0.000409,4889.0,4,False,0.020958,Oligodendrocytes,P14_2b,OD_A,OD


In [3]:
suffixes = ['-'.join(i.split('-')[2:]) for i in meta.index.values]
np.unique(suffixes)

array(['P14_1a-P14', 'P14_1b-P14', 'P14_2a-P14', 'P14_2b-P14'],
      dtype='<U10')

In [4]:
meta['sample'].unique()

['P14_1a', 'P14_1b', 'P14_2a', 'P14_2b']
Categories (4, object): ['P14_1a', 'P14_1b', 'P14_2a', 'P14_2b']

In [5]:
ddir = '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/'
pths = [
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P14_nr_1_a/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P14_nr_1_b/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P14_nr_2_a/filtered_feature_bc_matrix/counts.h5ad"),
    os.path.join(ddir, "scRNAseq_Cheng2022_Cell_v1ctx/P14_nr_2_b/filtered_feature_bc_matrix/counts.h5ad"),
]
renames = [
    "-P14_1a-P14",
    "-P14_1b-P14",
    "-P14_2a-P14",
    "-P14_2b-P14",
]

adata = []
for f, rn in zip(pths, renames):
    adata_ = anndata.read(f)
    adata_.obs.index = np.char.add(adata_.obs.index.values.astype(str), rn)
    print(adata_.shape)
    adata.append(adata_)
    
adata = anndata.concat(adata, merge='same')
print(adata.shape)
adata

(12600, 53801)
(12813, 53801)
(8140, 53801)
(7965, 53801)
(41518, 53801)


AnnData object with n_obs × n_vars = 41518 × 53801
    var: 'id', 'name'

In [6]:
np.unique(adata.obs.index).shape

(41518,)

In [7]:
adata_.obs

AAACCCAAGAGCACTG-1-P14_2b-P14
AAACCCAAGCGAGAAA-1-P14_2b-P14
AAACCCAAGGCGTTAG-1-P14_2b-P14
AAACCCAAGTCTCTGA-1-P14_2b-P14
AAACCCAAGTTGGCTT-1-P14_2b-P14
...
TTTGTTGCATAAGCAA-1-P14_2b-P14
TTTGTTGGTCGCAACC-1-P14_2b-P14
TTTGTTGTCCGCGGAT-1-P14_2b-P14
TTTGTTGTCTCTATAC-1-P14_2b-P14
TTTGTTGTCTGTCCCA-1-P14_2b-P14


In [8]:
cov = np.asarray(adata.X.sum(axis=1)).reshape(-1,)
medcov = np.median(cov)
adata.obs['cov'] = cov
adata.obs['covfactor'] = cov/medcov

In [9]:
tmp = adata[meta.index].copy()
unq, inv, cnt = np.unique(tmp.var['name'].values, return_counts=True, return_inverse=True)
tmp = tmp[:, cnt[inv] == 1]
tmp.var = tmp.var.set_index('name')
tmp.obs = tmp.obs.join(meta)
# tmp

AnnData expects .var.index to contain strings, but got values like:
    ['4933401J01Rik', 'Gm26206', 'Xkr4', 'Gm18956', 'Gm37180']

    Inferred to be: categorical

  value_idx = self._prep_dim_index(value.index, attr)


In [10]:
tmp.var

Unnamed: 0_level_0,id
name,Unnamed: 1_level_1
4933401J01Rik,ENSMUSG00000102693
Gm26206,ENSMUSG00000064842
Xkr4,ENSMUSG00000051951
Gm18956,ENSMUSG00000102851
Gm37180,ENSMUSG00000103377
...,...
CAAA01205117.1,ENSMUSG00000094431
CAAA01098150.1,ENSMUSG00000094621
CAAA01064564.1,ENSMUSG00000098647
Vmn2r122,ENSMUSG00000096730


In [11]:
tmp.obs

Unnamed: 0,cov,covfactor,batch,n_genes,percent_mito,n_counts,leiden,Doublet,Doublet Score,Class_broad,sample,Type,Subclass
AAACCCACACGCACCA-1-P14_1a-P14,11986.0,1.360345,0,3578,0.000250,11986.0,2,False,0.014984,Excitatory,P14_1a,L6CT_B,L6CT
AAACCCAGTCCTGAAT-1-P14_1a-P14,11632.0,1.320168,0,3520,0.000258,11632.0,3,False,0.016756,Excitatory,P14_1a,L4_A,L4
AAACCCATCGCTCTCA-1-P14_1a-P14,8164.0,0.926569,0,2581,0.000122,8164.0,2,False,0.012550,Excitatory,P14_1a,L6CT_B,L6CT
AAACGAACATGCAGGA-1-P14_1a-P14,6338.0,0.719328,0,2542,0.000158,6338.0,3,False,0.028730,Excitatory,P14_1a,L4_A,L4
AAACGAAGTCCTCAGG-1-P14_1a-P14,9317.0,1.057428,0,3203,0.000000,9317.0,2,False,0.012168,Excitatory,P14_1a,L6CT_B,L6CT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGATCGTAACCAGG-1-P14_2b-P14,5614.0,0.637158,P14,2579,0.000178,5614.0,0,False,0.011671,Astrocytes,P14_2b,Astro_A,Astro
TTTGATCGTTTGAAAG-1-P14_2b-P14,2015.0,0.228691,P14,1199,0.000496,2015.0,1,False,0.006068,OPCs,P14_2b,OPC_A,OPC
TTTGGAGGTATGACAA-1-P14_2b-P14,1374.0,0.155941,P14,966,0.002183,1374.0,2,False,0.006733,Microglia,P14_2b,Micro,Micro
TTTGGTTGTAGAATGT-1-P14_2b-P14,4889.0,0.554875,P14,1935,0.000409,4889.0,4,False,0.020958,Oligodendrocytes,P14_2b,OD_A,OD


In [12]:
pth_res = os.path.join(ddir, "P14_nr_allcells_Oct26.h5ad")
tmp.write(pth_res)

In [13]:
b = anndata.read(pth_res, backed='r')
b



AnnData object with n_obs × n_vars = 31772 × 53547 backed at '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/P14_nr_allcells_Oct26.h5ad'
    obs: 'cov', 'covfactor', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'leiden', 'Doublet', 'Doublet Score', 'Class_broad', 'sample', 'Type', 'Subclass'
    var: 'id'

# subselect L234 only

In [14]:
tmp.shape, meta.shape

((31772, 53547), (31772, 11))

In [15]:
np.unique(meta.Type)

array(['Astro_A', 'Astro_B', 'Endo', 'Frem1', 'L2/3_AB', 'L2/3_BC',
       'L4_A', 'L4_B', 'L4_C', 'L5IT', 'L5NP', 'L5PT_A', 'L5PT_B',
       'L6CT_A', 'L6CT_B', 'L6CT_C', 'L6IT_A', 'L6IT_B', 'L6b', 'Lamp5',
       'Micro', 'OD_A', 'OD_B', 'OPC_A', 'OPC_B', 'Pvalb_A', 'Pvalb_B',
       'Pvalb_C', 'Pvalb_D', 'Sst_A', 'Sst_B', 'Sst_C', 'Sst_D', 'Sst_E',
       'Stac', 'VLMC_A', 'VLMC_B', 'Vip_A', 'Vip_B', 'Vip_C'],
      dtype=object)

In [16]:
selected_types = ['L2/3_AB', 'L2/3_BC', 'L4_A', 'L4_B', 'L4_C']

In [17]:
tmp_sub = tmp[meta[meta['Type'].isin(selected_types)].index,:]
tmp_sub

View of AnnData object with n_obs × n_vars = 12157 × 53547
    obs: 'cov', 'covfactor', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'leiden', 'Doublet', 'Doublet Score', 'Class_broad', 'sample', 'Type', 'Subclass'
    var: 'id'

In [18]:
pth_res2 = os.path.join(ddir, "P14_nr_L234_Oct26.h5ad")
tmp_sub.write(pth_res2)

In [19]:
b = anndata.read(pth_res2, backed='r')
b



AnnData object with n_obs × n_vars = 12157 × 53547 backed at '/bigstore/GeneralStorage/fangming/projects/visctx/data_dump/counts/P14_nr_L234_Oct26.h5ad'
    obs: 'cov', 'covfactor', 'batch', 'n_genes', 'percent_mito', 'n_counts', 'leiden', 'Doublet', 'Doublet Score', 'Class_broad', 'sample', 'Type', 'Subclass'
    var: 'id'