In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import anndata

import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
import seaborn as sns

from dredFISH.Utils import basicu
from dredFISH.Utils import powerplots

In [2]:
def get_normed_bulks(mat, genes, types, genes_sel_idx=None):
    """
    Build log10(CPM+1)-normalized pseudo-bulk profiles from a cell-by-gene matrix.

    Steps
    - select genes (total-count threshold, or the explicit `genes_sel_idx`)
    - convert the selected columns to a dense array (sparse input only)
    - merge cells into groups with `basicu.group_sum`
    - log10(CPM+1) norm the merged (bulk) samples

    Parameters
    ----------
    mat : cell-by-gene count matrix. Either an object exposing `.todense()`
        (e.g. a scipy sparse matrix) or a dense array-like.
    genes : gene names, one per column of `mat` (array-like).
    types : per-cell group labels; handed to `basicu.group_sum` unchanged.
    genes_sel_idx : optional column selector (integer indices or boolean mask).
        When None, genes whose total count over all cells exceeds 1% of the
        number of cells are kept.

    Returns
    -------
    pandas.DataFrame with genes as rows and groups as columns.

    Notes
    -----
    - The per-group library size is computed from ALL genes in `mat`, not only
      from the selected ones.
    - A group with zero total counts results in a division by zero (NaN/inf);
      this is not handled here.
    - NOTE(review): `basicu.group_sum` presumably sums rows per label and
      returns (sums, labels) -- confirm against its implementation.
    """
    # Only objects with `.todense()` need densifying; plain arrays (a common
    # form of `adata.X`) used to fail here with AttributeError.
    is_sparse = hasattr(mat, "todense")
    if not is_sparse:
        mat = np.asarray(mat)
    genes = np.asarray(genes)

    ncell = mat.shape[0]
    cellcov = np.asarray(mat.sum(axis=1)).reshape(-1,)
    genecov = np.asarray(mat.sum(axis=0)).reshape(-1,)

    if genes_sel_idx is None:
        # total counts of a gene must exceed 1% of the number of cells
        # (a threshold on summed counts, not on the number of expressing cells)
        genes_sel_idx = genecov > ncell*0.01

    matsub = mat[:, genes_sel_idx]
    if is_sparse:
        matsub = matsub.todense()
    matsub = np.asarray(matsub)
    genes_sel = genes[genes_sel_idx]

    # pseudo-bulk samples -- pull counts from cells
    Xk, xclsts = basicu.group_sum(matsub, types)
    ck, xclsts = basicu.group_sum(cellcov.reshape(-1,1), types)
    Xk = np.log10(1+(np.array(Xk)/np.array(ck))*1e6) # log10(1+CPM)
    df = pd.DataFrame(Xk, index=xclsts, columns=genes_sel)
    return df.T # gene by types

def get_normed_bulks_for_adata_by_types(adata, genes_cndd=None):
    """
    Run `get_normed_bulks` on an AnnData-like object, grouping cells by `adata.obs['Type']`.

    - `adata.X` (copied), `adata.var.index.values` and `adata.obs['Type']` are
      passed on as matrix, gene names and cell labels.
    - `genes_cndd`: optional candidate genes. They are looked up in the gene
      names with `basicu.get_index_from_array`; entries reported as -1 are
      dropped after printing a notice. With None, gene selection is left to
      `get_normed_bulks`.
    """
    gene_names = adata.var.index.values

    genes_sel_idx = None
    if genes_cndd is not None:
        genes_sel_idx = basicu.get_index_from_array(gene_names, genes_cndd)
        not_found = genes_sel_idx == -1  # -1 flags a candidate that was not located
        if np.sum(not_found) > 0:
            print("some genes are not there")
            genes_sel_idx = genes_sel_idx[genes_sel_idx != -1]

    return get_normed_bulks(
        adata.X.copy(),
        gene_names,
        adata.obs['Type'],
        genes_sel_idx=genes_sel_idx,
    )

In [3]:
# switch seaborn to its 'talk' plotting context for the rest of the notebook
sns.set_context('talk')

In [4]:
# Chen22 / Zador gene list: the file has no header row, gene names are read from the first column
f = "/greendata/GeneralStorage/fangming/projects/visctx/data_dump/MERFISH_gene_panel_Chen22_Zador_All_n107.csv"
df_zador = pd.read_csv(f, header=None)

# sorted, de-duplicated gene names
g1 = np.unique(df_zador[0].values)
g1, g1.shape

(array(['Actb', 'Alcam', 'Brinp3', 'C1ql3', 'Calb1', 'Camk4', 'Car10',
        'Car3', 'Cbln2', 'Cdh13', 'Cdh18', 'Cdh9', 'Col11a1', 'Col19a1',
        'Coro6', 'Cplx3', 'Cpne4', 'Ctgf', 'Cux2', 'Dab1', 'Dcc', 'Dgkb',
        'Efna5', 'Enpp2', 'Etv1', 'Fam19a1', 'Fam19a2', 'Fat3', 'Fbxl7',
        'Foxp2', 'Gad1', 'Galnt14', 'Galntl6', 'Gfra1', 'Gnb4', 'Gpc5',
        'Gria1', 'Grik1', 'Grin3a', 'Hcn1', 'Hs3st2', 'Hs3st4', 'Hs6st3',
        'Htr2a', 'Igsf21', 'Il1rapl2', 'Inpp4b', 'Kcnip1', 'Kcnn2',
        'Kctd1', 'Lamp5', 'Lmo4', 'Lpp', 'Lrrtm4', 'Lypd1', 'Marcksl1',
        'Ncald', 'Ncam2', 'Nell1', 'Nnat', 'Nr4a2', 'Nrg1', 'Nrp1',
        'Nrsn1', 'Nxph1', 'Olfm3', 'Oprk1', 'Otof', 'Pam', 'Pcp4', 'Prkca',
        'Ptprk', 'Rab3c', 'Rasgrf2', 'Rasl10a', 'Rcan2', 'Reln', 'Rgs4',
        'Rora', 'Rorb', 'Satb1', 'Scnn1a', 'Sdk1', 'Sgcd', 'Slc17a7',
        'Slc24a2', 'Slc24a3', 'Slc30a3', 'Sorcs3', 'Spock3', 'Sv2c',
        'Svil', 'Synpr', 'Syt17', 'Syt2', 'Tenm3', 'Timp2', 'Tle4',

In [5]:
# full MERFISH gene panel table
f = "/greendata/GeneralStorage/fangming/projects/visctx/data_dump/MERFISH_gene_panel_Unique_Feb21.csv"
df_merfish = pd.read_csv(f)

# duplicate check on gene names: the last printed item lists names that occur more than once
cnddts = df_merfish['gene_name_data'].values
unq, cnts = np.unique(cnddts, return_counts=True)
repeated = unq[cnts > 1]
print(len(cnddts), unq.shape, repeated)

df_merfish

684 (684,) []


Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3,Unnamed: 7
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
...,...,...,...,...,...,...,...,...
679,Slc39a2,Slc39a2,NRvsDR_DEG,our analysis,Endo,DR_dn,,
680,Zfp366,Zfp366,NRvsDR_DEG,our analysis,Endo,DR_dn,,
681,Slc35a2,Slc35a2,NRvsDR_DEG,our analysis,VLMC_A,DR_up,,
682,Sephs1,Sephs1,NRvsDR_DEG,our analysis,VLMC_B,DR_up,,


In [6]:
# genes of the Zador list (g1) that do not appear among the panel's gene names
absent_from_panel = [gene for gene in g1 if gene not in unq]
print(absent_from_panel)

# size of the overlap between the panel's gene names and g1
shared_genes = np.intersect1d(df_merfish['gene_name_data'].values, g1)
print(shared_genes.shape)

[]
(107,)


In [7]:
# df_merfish.groupby(['why included', 'source']).size().to_frame('number')

In [8]:
# work on a copy so df_merfish keeps the table as loaded
dfout = df_merfish.copy()
print(dfout.shape)

# rows whose gene belongs to the Zador list (g1)
in_g1 = dfout['gene_name_data'].isin(g1)

# prepend the Zador source tag where it is not already part of 'source'
cond = (~dfout['source'].str.contains("Chen22_biorxiv_Zador")) & in_g1
dfout.loc[cond, 'source'] = "Chen22_biorxiv_Zador;" + dfout.loc[cond, 'source']

# same for the inclusion reason: prepend "All cell types" where it is missing
cond = (~dfout['why included'].str.contains("All cell types")) & in_g1
dfout.loc[cond, 'why included'] = "All cell types;" + dfout.loc[cond, 'why included']

(684, 8)


In [9]:
# number of panel genes for every ('why included', 'source') combination in dfout
dfout.groupby(['why included', 'source']).size().to_frame('number')

Unnamed: 0_level_0,Unnamed: 1_level_0,number
why included,source,Unnamed: 2_level_1
All cell types,Chen22_biorxiv_Zador,52
All cell types,Chen22_biorxiv_Zador;Cheng22_Cell,14
All cell types,Chen22_biorxiv_Zador;our analysis,7
All cell types,Cheng22_Cell,54
All cell types,our analysis,43
All cell types;IEG,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1
All cell types;L2/3 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,17
All cell types;L2/3/4 subtypes at P14,Chen22_biorxiv_Zador;our analysis,1
All cell types;L4 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,13
All cell types;L4/5 subtypes,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1


In [10]:
# sanity check of the tags: rows carrying each tag, and rows carrying both
cond1 = dfout['why included'].str.contains("All cell types")
cond2 = dfout['source'].str.contains("Chen22_biorxiv_Zador")
for tag_mask in (cond1, cond2, cond1 & cond2):
    print(dfout[tag_mask].shape)

(205, 8)
(107, 8)
(107, 8)


In [11]:
# write the re-tagged panel next to the notebook's data dump; column names are
# kept as the header line (pandas default) and the row index is left out
fout = '../data_dump/MERFISH_gene_panel_Unique_org_Feb21.csv'
dfout.to_csv(fout, index=False)

In [12]:
# shell: show the first lines of the file that fout points to
!head $fout

gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3,Unnamed: 7
Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
Adamts2,Adamts2,L2/3 subtypes,Cheng22_Cell,A>B=C,***,,
Cdh13,Cdh13,All cell types;L2/3 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,A>B=C,CSM,,
6530403H02Rik,6530403H02Rik,L2/3 subtypes,Cheng22_Cell,A>B=C,,"top L2/3 A marker, single cell",
Rhbdl3,Rhbdl3,L2/3 subtypes,Cheng22_Cell,A>B=C,**,top single cell marker,


In [13]:
# shell: count the lines of the file that fout points to
!wc -l $fout

685 ../data_dump/MERFISH_gene_panel_Unique_org_Feb21.csv
