In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import anndata

import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
import seaborn as sns

from dredFISH.Utils import basicu
from dredFISH.Utils import powerplots

In [2]:
def _as_dense_array(block):
    """Return `block` as a plain ndarray, densifying sparse input if needed."""
    if hasattr(block, "toarray"):  # scipy.sparse containers expose .toarray()
        return block.toarray()
    return np.asarray(block)


def get_normed_bulks(mat, genes, types, genes_sel_idx=None):
    """
    Build log10(CPM+1)-normalized pseudo-bulk profiles, one per cell type.

    Steps:
    - select genes (explicit index, or a coverage filter by default)
    - convert the selected columns to a dense array
    - merge cells into clusters by summing counts
    - log10(CPM+1) normalize each pseudo-bulk sample

    Parameters
    ----------
    mat : 2-D count matrix, cell by gene. May be a scipy sparse matrix or a
        dense array (previously only sparse input worked because
        `.todense()` was called unconditionally).
    genes : array-like of gene names, one per column of `mat`.
    types : per-cell group labels, passed as-is to `basicu.group_sum`.
    genes_sel_idx : optional column selector (integer positions or boolean
        mask). When None, genes whose total count summed over all cells
        exceeds 1% of the number of cells are kept.

    Returns
    -------
    pandas.DataFrame
        Gene (rows) by type (columns) table of log10(1 + CPM) values.
    """
    ncell = mat.shape[0]
    genes = np.asarray(genes)

    # Per-cell library size over ALL genes (computed before gene selection),
    # so CPM is relative to the full library rather than the selected subset.
    cellcov = np.asarray(mat.sum(axis=1)).reshape(-1,)

    if genes_sel_idx is None:
        # NOTE(review): this thresholds the total count per gene, which is
        # not the same as "detected in at least 1% of cells" (a few
        # high-count cells can pass a gene). Behavior kept as-is; confirm
        # the intended criterion.
        genecov = np.asarray(mat.sum(axis=0)).reshape(-1,)
        genes_sel_idx = genecov > ncell*0.01

    matsub = _as_dense_array(mat[:, genes_sel_idx])
    genes_sel = genes[genes_sel_idx]

    # pseudo-bulk samples -- pool counts from cells of the same type
    # (group_sum presumably returns the per-group sums and the group labels)
    Xk, xclsts = basicu.group_sum(matsub, types)
    ck, xclsts = basicu.group_sum(cellcov.reshape(-1, 1), types)
    Xk = np.log10(1+(np.array(Xk)/np.array(ck))*1e6) # log10(1+CPM)
    df = pd.DataFrame(Xk, index=xclsts, columns=genes_sel)
    return df.T # gene by types

def get_normed_bulks_for_adata_by_types(adata, genes_cndd=None):
    """
    Pseudo-bulk an AnnData-like object by its `obs['Type']` labels.

    Thin wrapper around `get_normed_bulks`: it passes a copy of `adata.X`,
    the gene names from `adata.var.index`, and the `Type` column of
    `adata.obs`.

    genes_cndd: optional candidate gene names. They are mapped to column
        positions with `basicu.get_index_from_array`; entries mapped to -1
        (treated here as "not found") are dropped after printing a notice.
        When None, gene selection is left to `get_normed_bulks`.
    """
    gene_names = adata.var.index.values

    genes_sel_idx = None
    if genes_cndd is not None:
        genes_sel_idx = basicu.get_index_from_array(gene_names, genes_cndd)
        not_found = (genes_sel_idx == -1)
        if np.sum(not_found) > 0:
            print("some genes are not there")
            genes_sel_idx = genes_sel_idx[~not_found]

    return get_normed_bulks(
        adata.X.copy(),
        gene_names,
        adata.obs['Type'],
        genes_sel_idx=genes_sel_idx,
    )

In [3]:
# Use seaborn's 'talk' plotting context for all figures in this notebook.
sns.set_context(context='talk')

In [4]:
# Load the MERFISH gene panel table from the shared data dump.
# NOTE(review): absolute, machine-specific path -- the notebook only runs
# where this mount exists.
f = "/greendata/GeneralStorage/fangming/projects/visctx/data_dump/MERFISH_gene_panel_Current_Feb21.csv"
df = pd.read_csv(f)

# Duplicate check on the 'gene_name_data' column: prints the number of
# entries, the shape of the unique-name array, and any name seen more than once.
cnddts = df['gene_name_data'].values
unq, cnts = np.unique(cnddts, return_counts=True)
print(len(cnddts), unq.shape, unq[cnts>1])

# Display the loaded table.
df

684 (684,) []


Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
679,Slc39a2,Slc39a2,NRvsDR_DEG,our analysis,Endo,DR_dn,
680,Zfp366,Zfp366,NRvsDR_DEG,our analysis,Endo,DR_dn,
681,Slc35a2,Slc35a2,NRvsDR_DEG,our analysis,VLMC_A,DR_up,
682,Sephs1,Sephs1,NRvsDR_DEG,our analysis,VLMC_B,DR_up,


In [5]:
# Number of panel genes for each (inclusion reason, source) combination.
(
    df
    .groupby(['why included', 'source'])
    .size()
    .to_frame(name='number')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,number
why included,source,Unnamed: 2_level_1
All cell types,Chen22_biorxiv_Zador,52
All cell types,Chen22_biorxiv_Zador;Cheng22_Cell,14
All cell types,Chen22_biorxiv_Zador;our analysis,7
All cell types,Cheng22_Cell,54
All cell types,our analysis,43
All cell types;IEG,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1
All cell types;L2/3 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,17
All cell types;L2/3/4 subtypes at P14,Chen22_biorxiv_Zador;our analysis,1
All cell types;L4 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,13
All cell types;L4/5 subtypes,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1


In [7]:
# Number of panel genes per inclusion reason.
(
    df
    .groupby(['why included'])
    .size()
    .to_frame(name='number')
)

Unnamed: 0_level_0,number
why included,Unnamed: 1_level_1
All cell types,170
All cell types;IEG,1
All cell types;L2/3 subtypes,17
All cell types;L2/3/4 subtypes at P14,1
All cell types;L4 subtypes,13
All cell types;L4/5 subtypes,1
All cell types;L5 IT subtypes,1
All cell types;V1_HVA_Spatial_Gradient,1
Astrocyte_NRvsDR_DEG,7
Astrocytes,44


In [8]:
# Number of panel genes per literature/analysis source.
(
    df
    .groupby(['source'])
    .size()
    .to_frame(name='number')
)

Unnamed: 0_level_0,number
source,Unnamed: 1_level_1
Bayraktar20_NatNeuro,45
Berg21_Nature,3
Buchanan22_PNAS_Allen,21
Chen22_biorxiv_Zador,52
Chen22_biorxiv_Zador;Cheng22_Cell,44
Chen22_biorxiv_Zador;Chou13_Science,1
Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,2
Chen22_biorxiv_Zador;our analysis,8
Cheng22_Cell,230
Cheng22_Cell;Hrvatin17_NatNeuro,1


In [6]:
# Row masks: genes included for "All cell types", and genes whose source
# mentions the Chen22 (Zador) preprint.
cond1 = df['why included'].str.contains("All cell types")
cond2 = df['source'].str.contains("Chen22_biorxiv_Zador")

# Shapes of each subset and of their intersection, one per line.
print(
    df[cond1].shape,
    df[cond2].shape,
    df[cond1 & cond2].shape,
    sep="\n",
)

(205, 7)
(107, 7)
(107, 7)


In [None]:
# Path of the exported panel CSV. It must be defined because the shell cells
# that follow (`!head $fout`, `!wc -l $fout`) expand it from the kernel namespace.
fout = '../data_dump/MERFISH_gene_panel_Unique_org_Feb21.csv'
# The export itself stays disabled: `dfout` is not defined in the visible cells.
# dfout.to_csv(fout, header=True, index=False)

In [None]:
# Preview the first lines of the CSV at `fout`; IPython substitutes $fout
# from the kernel namespace, so `fout` must already be defined.
!head $fout

gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3,Unnamed: 7
Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
Adamts2,Adamts2,L2/3 subtypes,Cheng22_Cell,A>B=C,***,,
Cdh13,Cdh13,All cell types;L2/3 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,A>B=C,CSM,,
6530403H02Rik,6530403H02Rik,L2/3 subtypes,Cheng22_Cell,A>B=C,,"top L2/3 A marker, single cell",
Rhbdl3,Rhbdl3,L2/3 subtypes,Cheng22_Cell,A>B=C,**,top single cell marker,


In [None]:
# Count the lines (header included) of the CSV at `fout`; requires `fout`
# to be defined in the kernel namespace.
!wc -l $fout

685 ../data_dump/MERFISH_gene_panel_Unique_org_Feb21.csv
