In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import anndata

import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
import seaborn as sns

from dredFISH.Utils import basicu
from dredFISH.Utils import powerplots

In [2]:
def get_normed_bulks(mat, genes, types, genes_sel_idx=None, min_mean_count=0.01):
    """
    Build log10(CPM+1) pseudo-bulk profiles by pooling cells that share a label.

    Assume cell by gene
    - select genes (abundance filter, or `genes_sel_idx` if given)
    - sparse to dense (an already-dense `mat` is accepted as well)
    - merge cells to clusters
    - log10(CPM+1) norm bulk samples

    Parameters
    ----------
    mat : cell-by-gene count matrix
        Either an object exposing `.todense()` (e.g. a scipy sparse matrix)
        or a dense array.
    genes : numpy array
        Gene names aligned with the columns of `mat`.
    types : array-like
        One label per cell; passed to `basicu.group_sum` as the grouping.
    genes_sel_idx : index array or boolean mask, optional
        Columns of `mat` to keep. When None, a gene is kept if its total
        count over all cells exceeds `ncell * min_mean_count`.
    min_mean_count : float, default 0.01
        Abundance threshold, used only when `genes_sel_idx` is None.

    Returns
    -------
    pandas.DataFrame
        Gene-by-type table of log10(1 + CPM) values.
    """
    ncell = mat.shape[0]
    # Total counts per cell / per gene. np.asarray + reshape flattens the
    # 2-D result that sparse `.sum(axis=...)` can return.
    cellcov = np.asarray(mat.sum(axis=1)).reshape(-1,)
    genecov = np.asarray(mat.sum(axis=0)).reshape(-1,)

    if genes_sel_idx is None:
        # NOTE: this thresholds the gene's *total count* (i.e. mean count per
        # cell > min_mean_count); it is not the number of cells in which the
        # gene is detected.
        genes_sel_idx = genecov > ncell*min_mean_count

    matsub = mat[:, genes_sel_idx]
    if hasattr(matsub, "todense"):
        # sparse input -> dense; dense input has no `.todense()` and is used as is
        matsub = matsub.todense()
    matsub = np.asarray(matsub)
    genes_sel = genes[genes_sel_idx]

    # pseudo-bulk samples -- pool counts from cells
    # (group_sum is assumed to return per-group sums plus the group labels)
    Xk, xclsts = basicu.group_sum(matsub, types)
    # Library size per group is taken over ALL genes, not just the selected ones.
    ck, xclsts = basicu.group_sum(cellcov.reshape(-1,1), types)
    Xk = np.log10(1+(np.array(Xk)/np.array(ck))*1e6) # log10(1+CPM)
    df = pd.DataFrame(Xk, index=xclsts, columns=genes_sel)
    return df.T # gene by types

def get_normed_bulks_for_adata_by_types(adata, genes_cndd=None):
    """
    Pseudo-bulk an AnnData-like object by its `obs['Type']` labels.

    Thin wrapper around `get_normed_bulks`: it reads the count matrix from
    `adata.X`, gene names from `adata.var.index`, and cell labels from
    `adata.obs['Type']`.

    Parameters
    ----------
    adata : object exposing `.X`, `.var.index` and `.obs['Type']`
    genes_cndd : array-like of gene names, optional
        Candidate genes to restrict the output to. They are looked up with
        `basicu.get_index_from_array`; entries that come back as -1 are
        treated as missing, a notice is printed, and they are dropped.
        When None, gene selection is left to `get_normed_bulks`.

    Returns
    -------
    Whatever `get_normed_bulks` returns (a gene-by-type table).
    """
    genes_all = adata.var.index.values

    genes_sel_idx = None
    if genes_cndd is not None:
        genes_sel_idx = basicu.get_index_from_array(genes_all, genes_cndd)
        missing = genes_sel_idx == -1
        if np.any(missing):
            print("some genes are not there")
            genes_sel_idx = genes_sel_idx[~missing]

    # `get_normed_bulks` only reads the matrix (shape, sums, column slicing),
    # so the matrix is passed directly instead of duplicating it in memory.
    return get_normed_bulks(adata.X, genes_all, adata.obs['Type'],
                            genes_sel_idx=genes_sel_idx)

In [3]:
# Use seaborn's 'talk' plotting context for figures drawn after this cell.
sns.set_context(context='talk')

In [4]:
# Chen/Zador gene list: header-less CSV with the gene names in column 0.
f = "/greendata/GeneralStorage/fangming/projects/visctx/data_dump/Chen_Zador.csv"
df_zador = pd.read_csv(f, header=None)
g1 = np.unique(df_zador[0].values)  # sorted, de-duplicated names
g1, g1.shape

(array(['Actb', 'Alcam', 'Brinp3', 'C1ql3', 'Calb1', 'Camk4', 'Car10',
        'Car3', 'Cbln2', 'Cdh13', 'Cdh18', 'Cdh9', 'Col11a1', 'Col19a1',
        'Coro6', 'Cplx3', 'Cpne4', 'Ctgf', 'Cux2', 'Dab1', 'Dcc', 'Dgkb',
        'Efna5', 'Enpp2', 'Etv1', 'Fam19a1', 'Fam19a2', 'Fat3', 'Fbxl7',
        'Foxp2', 'Gad1', 'Galnt14', 'Galntl6', 'Gfra1', 'Gnb4', 'Gpc5',
        'Gria1', 'Grik1', 'Grin3a', 'Hcn1', 'Hs3st2', 'Hs3st4', 'Hs6st3',
        'Htr2a', 'Igsf21', 'Il1rapl2', 'Inpp4b', 'Kcnip1', 'Kcnn2',
        'Kctd1', 'Lamp5', 'Lmo4', 'Lpp', 'Lrrtm4', 'Lypd1', 'Marcksl1',
        'Ncald', 'Ncam2', 'Nell1', 'Nnat', 'Nr4a2', 'Nrg1', 'Nrp1',
        'Nrsn1', 'Nxph1', 'Olfm3', 'Oprk1', 'Otof', 'Pam', 'Pcp4', 'Prkca',
        'Ptprk', 'Rab3c', 'Rasgrf2', 'Rasl10a', 'Rcan2', 'Reln', 'Rgs4',
        'Rora', 'Rorb', 'Satb1', 'Scnn1a', 'Sdk1', 'Sgcd', 'Slc17a7',
        'Slc24a2', 'Slc24a3', 'Slc30a3', 'Sorcs3', 'Spock3', 'Sv2c',
        'Svil', 'Synpr', 'Syt17', 'Syt2', 'Tenm3', 'Timp2', 'Tle4',

In [5]:
# Chen/Zador probe sheet. Rows with any missing field are dropped, then the
# gene name is taken as the first space-separated token of the `notes` column.
f = "/greendata/GeneralStorage/fangming/projects/visctx/data_dump/Chen_Zador_probes.csv"
df_zador2 = pd.read_csv(f).dropna()
df_zador2['gene'] = df_zador2['notes'].map(lambda note: note.split(' ')[0])
g2 = df_zador2['gene'].unique()  # distinct genes, in order of first appearance
df_zador2

Unnamed: 0,Oligo name,sequence,IDT scale,additional processing (IDT),notes,gene
0,XCA2201,/5phos/tgggcttctgcctctgaagcaGATCGTCGGACTGTAGAA...,4nmU,STD,"Calb1 padlock, GII11, use with XC1701",Calb1
1,XCA2202,/5phos/agctgtaccgaacagaccttgcGATCGTCGGACTGTAGA...,4nmU,STD,"Calb1 padlock, GII11, use with XC1702",Calb1
2,XCA2203,/5phos/tggccaggttactaccagtgcGATCGTCGGACTGTAGAA...,4nmU,STD,"Calb1 padlock, GII11, use with XC1703",Calb1
3,XCA2204,/5phos/ggctggattggagctatcaccGATCGTCGGACTGTAGAA...,4nmU,STD,"Calb1 padlock, GII11, use with XC1704",Calb1
4,XCA2205,/5phos/cctggaaggaaaggagctgcaGATCGTCGGACTGTAGAA...,4nmU,STD,"Calb1 padlock, GII11, use with XC1705",Calb1
...,...,...,...,...,...,...
1144,XCA4200,/5phos/ggatgatggtggcttcaacgtGATCGTCGGACTGTAGAA...,4nmU,STD,"Scnn1a padlock, GII300, use with XC4800",Scnn1a
1145,XCA4201,/5phos/gtggatgccgtgagagaatggGATCGTCGGACTGTAGAA...,4nmU,STD,"Scnn1a padlock, GII300, use with XC4801",Scnn1a
1146,XCA4202,/5phos/ccgtcactgtgtgcacccttaGATCGTCGGACTGTAGAA...,4nmU,STD,"Scnn1a padlock, GII300, use with XC4802",Scnn1a
1147,XCA4203,/5phos/tcctctgcccatgcaaggactGATCGTCGGACTGTAGAA...,4nmU,STD,"Scnn1a padlock, GII300, use with XC4803",Scnn1a


In [6]:
# Genes present in both Chen/Zador lists, shown next to the size of each list.
g12 = np.intersect1d(g1, g2)
tuple(arr.shape for arr in (g1, g2, g12))

((107,), (101,), (96,))

In [7]:
# Genes in the gene list (g1) that have no entry in the probe sheet (g2).
[gene for gene in g1 if gene not in g2]

['Actb',
 'Car10',
 'Ctgf',
 'Fam19a1',
 'Fam19a2',
 'Gad1',
 'Marcksl1',
 'Nrg1',
 'Rgs4',
 'Slc17a7',
 'Slc30a3']

In [8]:
# Genes in the probe sheet (g2) that are absent from the gene list (g1).
[gene for gene in g2 if gene not in g1]

['Tafa1', 'Tafa2', 'Ccn2', 'Hgf', 'Fezf2']

In [9]:
# Distinct-gene count of the raw list, plus occurrences per gene so that
# repeated entries show up at the top.
(
    df_zador[0].unique().shape,
    df_zador[0].value_counts(),
)

((107,),
 Fam19a1    2
 Calb1      1
 Brinp3     1
 Ptprk      1
 Kctd1      1
           ..
 Galnt14    1
 Dgkb       1
 Cdh9       1
 C1ql3      1
 Slc30a3    1
 Name: 0, Length: 107, dtype: int64)

In [10]:
# MERFISH design table. Print: number of rows, number of distinct
# `gene_name_data` values, and the values that occur more than once.
f = "/greendata/GeneralStorage/fangming/projects/visctx/data_dump/MERFISH_design_Feb17.csv"
df_merfish = pd.read_csv(f)

cnddts = df_merfish['gene_name_data'].values
unq, cnts = np.unique(cnddts, return_counts=True)
print(cnddts.size, unq.shape, unq[cnts > 1])

df_merfish

685 (683,) ['Hsd11b1' 'Whrn']


Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3,Unnamed: 7
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,
...,...,...,...,...,...,...,...,...
680,Tox2,Tox2,L5 IT subtypes,Tasic18_Nature,,,,
681,Batf3,Batf3,L5 IT subtypes,Tasic18_Nature,,,,
682,Col6a1,Col6a1,L5 IT subtypes,Tasic18_Nature,,,,
683,Fezf2,Fezf2,L5 IT subtypes,Tasic18_Nature,,,,


In [11]:
# Number of distinct genes shared by the Chen/Zador list and the MERFISH design.
np.intersect1d(df_zador[0], df_merfish['gene_name_data'].values).shape

(106,)

In [12]:
# Chen/Zador genes that do not appear among the MERFISH `gene_name_data` values.
[gene for gene in df_zador[0] if gene not in unq]

['Slc30a3']

In [13]:
# NOTE(review): this repeats the overlap check from cell In [11] verbatim;
# one of the two cells can likely be removed.
np.intersect1d(df_zador[0], df_merfish['gene_name_data'].values).shape

(106,)

In [14]:
# Row count of the design table for each ('why included', 'source') pair.
(
    df_merfish
    .groupby(['why included', 'source'])
    .size()
    .to_frame(name='number')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,number
why included,source,Unnamed: 2_level_1
All cell types in V1,Cheng22_Cell,69
All cell types in V1,PROPOSE,50
Astrocyte NRvsDR DEGs,Cheng22_Cell,7
Astrocyte related,Bayraktar20_NatNeuro,44
Cortical excitatory types,Chen22_biorxiv_Zador,51
DEG_NRvsDR,our analysis,88
Early on marker,Cheng22_Cell,4
IEG,Hrvatin17_NatNeuro,22
L2/3 subtypes,Cheng22_Cell,122
L2/3 subtypes in SSp,Condylis22_Science,10
