In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import anndata

import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
import seaborn as sns

from scroutines import basicu
# from scroutines import powerplots

In [2]:
sns.set_context('talk')

# base line

In [3]:
f = "../data/cheng21_cell_scrna/organized/P28NR.h5ad"
adata = anndata.read(f, backed='r')
genes = adata.var.index.values
genes

array(['4933401J01Rik', 'Gm26206', 'Xkr4', ..., 'CAAA01064564.1',
       'Vmn2r122', 'CAAA01147332.1'], dtype=object)

In [4]:
f = "../results/MERFISH_gene_panel_Version1_March9.csv"
df = pd.read_csv(f)

cnddts = df['gene_name_data'].values
unq, cnts = np.unique(cnddts, return_counts=True)
print(len(cnddts), unq.shape, unq[cnts>1], [g for g in cnddts if g not in genes])

df

503 (503,) [] []


Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot0,Annot1,Annot2,Annot3,Annot4,ispicked,P17on,P14,DR,Vizgen targets < 50,Target Regions,Abundance
0,Matn2,Matn2,L2/3 types,Cheng22_Cell,,A>C>B,screened,,L2/3 types,True,A,BC,1,False,112.0,2.465576
1,Egfem1,Egfem1,L2/3 types;NRvsDR_DEG,Cheng22_Cell;our analysis,,A>C>B,screened,,L2/3 types,True,A,unsure,unsure,False,56.0,0.973929
2,Grb14,Grb14,L2/3 types,Cheng22_Cell,,A>C>B,*,,L2/3 types,True,A,unsure,1,False,57.0,16.487443
3,Adamts17,Adamts17,L2/3 types,Cheng22_Cell,,A>C>B,*,,L2/3 types,True,A,unsure,unsure,True,46.0,0.493933
4,Ldb2,Ldb2,L2/3 types,Cheng22_Cell,,A>C>B,*,,L2/3 types,True,A,unsure,unsure,False,69.0,7.382394
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,Sh3pxd2b,Sh3pxd2b,L2/3 types,Cheng22_Cell,,,,,L2/3 types,False,B,unsure,unsure,False,226.0,1.757504
499,Sertm1,Sertm1,L2/3 types,Cheng22_Cell,,,,,L2/3 types,False,B,unsure,unsure,False,93.0,4.621483
500,Mas1,Mas1,L2/3 types,Cheng22_Cell,,,,,L2/3 types,False,B,unsure,3,False,59.0,2.104614
501,Scg3,Scg3,L2/3 types,Cheng22_Cell,,,,,L2/3 types,False,B,unsure,3,False,74.0,55.862967


In [5]:
df['source'].str.contains('Zador').sum()

103

In [6]:
df['ispicked'].sum()

122

In [7]:
(df['Annot4']=='L2/3 types').sum()

170

In [8]:
df['P17on'].value_counts()

C    71
A    64
B    35
Name: P17on, dtype: int64

# summarize

In [9]:
unq_reasons = np.unique(np.hstack(df['why included'].str.split(';')))
unq_reasons

array(['All cell types', 'Astrocyte_NRvsDR_DEG', 'Astrocytes',
       'Early on marker', 'IEG', 'L2/3 SSp', 'L2/3 V1', 'L2/3 dynamic',
       'L2/3 types', 'L2/3/4 types at P14', 'L4 types', 'L4/5 types',
       'L5 IT types', 'Microglia', 'NRvsDR_DEG', 'OPC',
       'V1_HVA_Spatial_Gradient'], dtype='<U23')

In [10]:
breakdown = []
for reason in unq_reasons:
    num = df['why included'].str.contains(reason).sum()
    breakdown.append({'reason': reason, 'num': num})
breakdown = pd.DataFrame(breakdown)
breakdown.sort_values('num', ascending=False)

Unnamed: 0,reason,num
8,L2/3 types,170
0,All cell types,166
10,L4 types,70
13,Microglia,56
7,L2/3 dynamic,49
14,NRvsDR_DEG,45
2,Astrocytes,32
4,IEG,22
5,L2/3 SSp,10
1,Astrocyte_NRvsDR_DEG,8


In [11]:
df.groupby(['why included']).size().to_frame('number').sort_values('number', ascending=False)

Unnamed: 0_level_0,number
why included,Unnamed: 1_level_1
All cell types,104
L2/3 types,94
Microglia,53
L4 types,52
Astrocytes,27
L2/3 types;L2/3 dynamic,19
L2/3 types;All cell types,15
All cell types;L2/3 types,11
All cell types;L4 types,11
IEG,11


In [12]:
pd.set_option('display.max_rows', 100)
df.groupby(['why included', 'source']).size().to_frame('number')

Unnamed: 0_level_0,Unnamed: 1_level_0,number
why included,source,Unnamed: 2_level_1
All cell types,Chen22_biorxiv_Zador,40
All cell types,Chen22_biorxiv_Zador;Cheng22_Cell,11
All cell types,Chen22_biorxiv_Zador;our analysis,5
All cell types,Cheng22_Cell,48
All cell types;IEG;NRvsDR_DEG,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro;our analysis,1
All cell types;L2/3 dynamic,Chen22_biorxiv_Zador;Cheng22_Cell;our analysis,2
All cell types;L2/3 dynamic,Cheng22_Cell;our analysis,1
All cell types;L2/3 dynamic;NRvsDR_DEG,Chen22_biorxiv_Zador;our analysis,1
All cell types;L2/3 dynamic;NRvsDR_DEG,Cheng22_Cell;our analysis,1
All cell types;L2/3 types,Chen22_biorxiv_Zador;Cheng22_Cell,11


In [13]:
df.groupby(['why included']).size().to_frame('number')

Unnamed: 0_level_0,number
why included,Unnamed: 1_level_1
All cell types,104
All cell types;IEG;NRvsDR_DEG,1
All cell types;L2/3 dynamic,3
All cell types;L2/3 dynamic;NRvsDR_DEG,2
All cell types;L2/3 types,11
All cell types;L2/3 types;L2/3 dynamic,5
All cell types;L2/3 types;NRvsDR_DEG,1
All cell types;L2/3/4 types at P14,1
All cell types;L4 types,11
All cell types;L4 types;L2/3 dynamic,2


In [14]:
df.groupby(['why included']).size().sort_values(ascending=False).to_frame('number')

Unnamed: 0_level_0,number
why included,Unnamed: 1_level_1
All cell types,104
L2/3 types,94
Microglia,53
L4 types,52
Astrocytes,27
L2/3 types;L2/3 dynamic,19
L2/3 types;All cell types,15
All cell types;L2/3 types,11
All cell types;L4 types,11
IEG,11


In [15]:
df.groupby(['source']).size().to_frame('number')

Unnamed: 0_level_0,number
source,Unnamed: 1_level_1
Bayraktar20_NatNeuro,27
Bayraktar20_NatNeuro;our analysis,4
Berg21_Nature,3
Buchanan22_PNAS_Allen;our analysis,1
Chen22_biorxiv_Zador,40
Chen22_biorxiv_Zador;Cheng22_Cell,33
Chen22_biorxiv_Zador;Cheng22_Cell;our analysis,11
Chen22_biorxiv_Zador;Chou13_Science,1
Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1
Chen22_biorxiv_Zador;Hrvatin17_NatNeuro;our analysis,1


In [16]:
cond1 = df['why included'].str.contains("All cell types")
cond2 = df['source'].str.contains("Chen22_biorxiv_Zador")
print(df[cond1].shape)
print(df[cond2].shape)
print(df[cond1 & cond2].shape)

(166, 16)
(103, 16)
(103, 16)


In [17]:
"March3" in df['gene_name_data'].values

False