In [1]:
import numpy as np
import pandas as pd
import os
import glob
import itertools
import scanpy as sc
import natsort

from scroutines import basicu

In [2]:
# Gabr* gene symbols to extract, described as (family prefix, member suffixes).
# An empty suffix stands for a gene that has no numbered members.
_gabr_families = [
    ("Gabra", range(1, 7)),   # Gabra1 .. Gabra6
    ("Gabrg", range(1, 4)),   # Gabrg1 .. Gabrg3
    ("Gabrb", range(1, 4)),   # Gabrb1 .. Gabrb3
    ("Gabrd", [""]),
    ("Gabre", [""]),
    ("Gabrp", [""]),
    ("Gabrq", [""]),
    ("Gabrr", range(1, 4)),   # Gabrr1 .. Gabrr3
]

# Expand the families in order; this order becomes the column order of the CSV tables
# written later in the notebook.
selected_genes = [
    f"{prefix}{suffix}"
    for prefix, suffixes in _gabr_families
    for suffix in suffixes
]
len(selected_genes)

19

In [3]:
# Input directory: every *.h5ad file found here is processed by the loop below.
ddir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/organized'
# Output directory for the per-sample CSV tables.
outdir = '/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs'
# Pure-Python equivalent of `mkdir -p`: creates missing parents, is a no-op when the
# directory already exists, and does not depend on a shell or on quoting of the path.
os.makedirs(outdir, exist_ok=True)

In [5]:
### For every .h5ad file: sum raw counts per (biosample, Type), convert the sums to
### CPM, and write the selected genes as one CSV per biosample (rows = types).

for f in natsort.natsorted(glob.glob(ddir+'/*.h5ad')):
    print(f)

    adata = sc.read(f)
    # biosample label = 'sample' label with its last character dropped
    adata.obs['biosample'] = adata.obs['sample'].apply(lambda x: x[:-1])

    # Positions of the selected genes along the gene axis of adata.
    # NOTE(review): assumes basicu.get_index_from_array marks a gene it cannot find
    # with -1 (as implied by the original check against -1) -- confirm in scroutines.
    selected_genes_idx = np.asarray(
        basicu.get_index_from_array(adata.var.index.values, selected_genes)
    )
    # EVERY gene must be found. A -1 left in the index would not fail later: it would
    # silently pick the last gene column and export it under the missing gene's name.
    missing_genes = [g for g, idx in zip(selected_genes, selected_genes_idx) if idx == -1]
    assert not missing_genes, f'selected genes not found in {f}: {missing_genes}'

    ### sum over counts
    # by sample
    sample_lbls = adata.obs['biosample'].values
    unq_samples = np.unique(sample_lbls)

    # by type
    type_lbls = adata.obs['Type'].values
    unq_types = np.unique(type_lbls)

    nr, nc, ng = len(unq_samples), len(unq_types), len(adata.var)  # biosample, type, gene
    tensor = np.zeros((nr, nc, ng))
    print(tensor.shape)

    counts = adata.X
    for j, this_samp in enumerate(unq_samples):
        in_sample = (sample_lbls == this_samp)  # loop-invariant for the inner loop
        for k, this_type in enumerate(unq_types):
            selector = np.asarray(in_sample & (type_lbls == this_type))
            # Sum the selected rows directly on the stored matrix instead of
            # densifying the whole (cells x genes) subset first; np.asarray(...).ravel()
            # flattens the (1, n_genes) result a sparse matrix returns.
            tensor[j, k] = np.asarray(counts[selector].sum(axis=0)).ravel()  # raw reads sum over all cells

    ### CPM - per sample and type
    # Scale each (biosample, type) profile to sum to 1e6. A group with no counts
    # yields 0/0 = NaN; the NaN is kept on purpose (CPM is undefined there) and only
    # the accompanying RuntimeWarning is silenced.
    totals = np.sum(tensor, axis=-1, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        tensor = (tensor/totals)*1e6
    # tensor = np.log10(1+tensor)

    ### select GABAARs
    subtensor = tensor[:, :, selected_genes_idx]
    print(subtensor.shape)

    ### save as csv (one per sample)
    # The file name prefix is kept verbatim so existing downstream paths stay valid.
    for i, this_samp in enumerate(unq_samples):
        output = os.path.join(outdir, f'GABARRs_cheng22_type_CPM_{this_samp}.csv')
        print(this_samp, output)

        submat = subtensor[i]
        subtensor_df = pd.DataFrame(submat, index=unq_types, columns=selected_genes)
        subtensor_df.to_csv(output)
    

/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/organized/P8NR.h5ad
(2, 38, 53801)
(2, 38, 19)
P8_1 /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs/GABARRs_cheng22_type_CPM_P8_1.csv
P8_2 /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs/GABARRs_cheng22_type_CPM_P8_2.csv
/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/organized/P14NR.h5ad
(2, 40, 53801)
(2, 40, 19)
P14_1 /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs/GABARRs_cheng22_type_CPM_P14_1.csv
P14_2 /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs/GABARRs_cheng22_type_CPM_P14_2.csv
/u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/data/cheng21_cell_scrna/organized/P17NR.h5ad
(2, 41, 53801)
(2, 41, 19)
P17_1 /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs/GABARRs_cheng22_type_CPM_P17_1.csv
P17_2 /u/home/f/f7xiesnm/project-zipursky/v1-bb/v1/results/GABAARs/GABARRs_cheng22_type_CPM_P17_2.csv
/u/home/f/f7xiesnm/project-zipursky/v1-bb/

# L2/3 using new types - A, AB, B, BC, C

# Multiome data