In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import anndata

import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
import seaborn as sns

from scroutines import basicu
# from scroutines import powerplots

In [2]:
# Apply seaborn's "talk" plotting context as the notebook-wide default.
sns.set_context(context='talk')

# Baseline gene panel

In [3]:
# Reference scRNA-seq dataset; its gene names (the `var` index) serve as the
# reference gene list for the checks below.
f = "../data/cheng21_cell_scrna/organized/P28NR.h5ad"

# `anndata.read` is a deprecated alias of `anndata.read_h5ad`; call the
# supported reader directly. backed='r' opens the file read-only in backed mode.
adata = anndata.read_h5ad(f, backed='r')
genes = adata.var.index.values
genes


array(['4933401J01Rik', 'Gm26206', 'Xkr4', ..., 'CAAA01064564.1',
       'Vmn2r122', 'CAAA01147332.1'], dtype=object)

In [4]:
# Load the working MERFISH gene panel.
f = "../results/MERFISH_gene_panel_working_Feb28.csv"
df = pd.read_csv(f)

# Sanity checks on the candidate gene names:
#   - number of candidates vs. number of unique names,
#   - names that occur more than once,
#   - names that are absent from the reference gene list `genes`.
cnddts = df['gene_name_data'].values
unq, cnts = np.unique(cnddts, return_counts=True)

# Build a set once so each membership test is a hash lookup instead of a
# full scan of the reference gene array for every candidate.
known_genes = set(genes)
missing = [g for g in cnddts if g not in known_genes]
print(len(cnddts), unq.shape, unq[cnts>1], missing)

df

595 (595,) [] []


Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
590,Abca1,Abca1,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
591,Lrp1,Lrp1,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
592,Dock2,Dock2,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
593,Abr,Abr,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,


In [5]:
# Number of panel genes for each (inclusion reason, source) combination.
df.groupby(['why included', 'source']).size().rename('number').to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,number
why included,source,Unnamed: 2_level_1
All cell types,Chen22_biorxiv_Zador,52
All cell types,Chen22_biorxiv_Zador;Cheng22_Cell,14
All cell types,Chen22_biorxiv_Zador;our analysis,7
All cell types,Cheng22_Cell,54
All cell types,our analysis,43
All cell types;IEG,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1
All cell types;L2/3 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,17
All cell types;L2/3/4 subtypes at P14,Chen22_biorxiv_Zador;our analysis,1
All cell types;L4 subtypes,Chen22_biorxiv_Zador;Cheng22_Cell,13
All cell types;L4/5 subtypes,Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,1


In [6]:
# Number of panel genes for each inclusion reason.
df.groupby(['why included']).size().rename('number').to_frame()

Unnamed: 0_level_0,number
why included,Unnamed: 1_level_1
All cell types,170
All cell types;IEG,1
All cell types;L2/3 subtypes,17
All cell types;L2/3/4 subtypes at P14,1
All cell types;L4 subtypes,13
All cell types;L4/5 subtypes,1
All cell types;L5 IT subtypes,1
All cell types;V1_HVA_Spatial_Gradient,1
Astrocyte_NRvsDR_DEG,7
Astrocytes,44


In [7]:
# Number of panel genes for each source.
df.groupby(['source']).size().rename('number').to_frame()

Unnamed: 0_level_0,number
source,Unnamed: 1_level_1
Bayraktar20_NatNeuro,44
Berg21_Nature,3
Buchanan22_PNAS_Allen,21
Chen22_biorxiv_Zador,52
Chen22_biorxiv_Zador;Cheng22_Cell,44
Chen22_biorxiv_Zador;Chou13_Science,1
Chen22_biorxiv_Zador;Hrvatin17_NatNeuro,2
Chen22_biorxiv_Zador;our analysis,8
Cheng22_Cell,230
Cheng22_Cell;Hrvatin17_NatNeuro,1


In [8]:
# Row masks: genes whose inclusion reason mentions "All cell types", and genes
# whose source mentions "Chen22_biorxiv_Zador".
cond1 = df['why included'].str.contains("All cell types")
cond2 = df['source'].str.contains("Chen22_biorxiv_Zador")

# Shape of each selection, then of their intersection.
for mask in (cond1, cond2, cond1 & cond2):
    print(df[mask].shape)

(205, 7)
(107, 7)
(107, 7)


# Genes from our analysis

In [9]:
# Dynamic-gene table from our analysis (columns: gene, subclass, reason).
f = '../results/dynamic_genes_230228.csv'
dfu1 = pd.read_csv(filepath_or_buffer=f)
dfu1

Unnamed: 0,gene,subclass,reason
0,Rgs20,L2/3,dynamic
1,Slco5a1,L2/3,dynamic
2,Col19a1,L2/3,dynamic
3,Pard3b,L2/3,dynamic
4,Fn1,L2/3,dynamic
...,...,...,...
661,Vcan,L6b,dynamic
662,Ankrd33b,L6b,dynamic
663,Pde10a,L6b,dynamic
664,Rab26,L6b,dynamic


In [10]:
# P28 NR-vs-DR differentially expressed genes (columns: gene, subclass, reason).
f = '../results/P28NRvsDR_DEGs_230228.csv'
dfu2 = pd.read_csv(filepath_or_buffer=f)
dfu2

Unnamed: 0,gene,subclass,reason
0,Coq10b,Astro,DRup
1,Coq10b,L2/3,DRup
2,Pard3b,Sst,DRup
3,Zdbf2,L2/3,DRup
4,Zdbf2,L4,DRup
...,...,...,...
271,Prodh,Astro,DRdn
272,Fkbp5,Astro,DRdn
273,Ptchd4,L2/3,DRdn
274,Ptchd4,L5IT,DRdn


# Merge our-analysis genes into the baseline panel

In [11]:
# Keep only the L2/3 rows of the dynamic-gene table; every gene must appear
# exactly once among them.
dfu1r = dfu1.loc[dfu1['subclass'].eq('L2/3')].copy()
assert dfu1r['gene'].is_unique

# Add the columns the panel table uses so these rows can be appended to it.
dfu1r = dfu1r.assign(**{
    'gene_name_data': dfu1r['gene'],
    'gene_name_vizgen': dfu1r['gene'],
    'why included': "L2/3 dynamic",
    'source': 'our analysis',
})
dfu1r

Unnamed: 0,gene,subclass,reason,gene_name_data,gene_name_vizgen,why included,source
0,Rgs20,L2/3,dynamic,Rgs20,Rgs20,L2/3 dynamic,our analysis
1,Slco5a1,L2/3,dynamic,Slco5a1,Slco5a1,L2/3 dynamic,our analysis
2,Col19a1,L2/3,dynamic,Col19a1,Col19a1,L2/3 dynamic,our analysis
3,Pard3b,L2/3,dynamic,Pard3b,Pard3b,L2/3 dynamic,our analysis
4,Fn1,L2/3,dynamic,Fn1,Fn1,L2/3 dynamic,our analysis
...,...,...,...,...,...,...,...
144,Pcsk5,L2/3,dynamic,Pcsk5,Pcsk5,L2/3 dynamic,our analysis
145,Pip5k1b,L2/3,dynamic,Pip5k1b,Pip5k1b,L2/3 dynamic,our analysis
146,Glis3,L2/3,dynamic,Glis3,Glis3,L2/3 dynamic,our analysis
147,Sorcs3,L2/3,dynamic,Sorcs3,Sorcs3,L2/3 dynamic,our analysis


In [12]:
# Collapse the DEG table to one row per gene; the gene's subclasses and reasons
# are each comma-joined in row order, so position i of both strings refers to
# the same original row.
dfu2r = (
    dfu2
    .groupby('gene', as_index=False)
    .agg({'subclass': ",".join, 'reason': ",".join})
)

# Add the panel-table columns; the joined subclass/reason strings become the
# Annot1/Annot2 annotations.
dfu2r = dfu2r.assign(**{
    'gene_name_data': dfu2r['gene'],
    'gene_name_vizgen': dfu2r['gene'],
    'why included': 'NRvsDR_DEG',
    'source': 'our analysis',
    'Annot1': dfu2r['subclass'],
    'Annot2': dfu2r['reason'],
})
dfu2r

Unnamed: 0,gene,subclass,reason,gene_name_data,gene_name_vizgen,why included,source,Annot1,Annot2
0,1600020E01Rik,L4,DRup,1600020E01Rik,1600020E01Rik,NRvsDR_DEG,our analysis,L4,DRup
1,1700016P03Rik,"Endo,L2/3,L4,L5IT,L5NP,L5PT,L6CT,L6IT,Sst","DRup,DRup,DRup,DRup,DRup,DRup,DRup,DRup,DRup",1700016P03Rik,1700016P03Rik,NRvsDR_DEG,our analysis,"Endo,L2/3,L4,L5IT,L5NP,L5PT,L6CT,L6IT,Sst","DRup,DRup,DRup,DRup,DRup,DRup,DRup,DRup,DRup"
2,4930578C19Rik,Endo,DRdn,4930578C19Rik,4930578C19Rik,NRvsDR_DEG,our analysis,Endo,DRdn
3,AC110091.1,OD,DRdn,AC110091.1,AC110091.1,NRvsDR_DEG,our analysis,OD,DRdn
4,Acsl4,"L2/3,L6IT","DRup,DRup",Acsl4,Acsl4,NRvsDR_DEG,our analysis,"L2/3,L6IT","DRup,DRup"
...,...,...,...,...,...,...,...,...,...
169,Vcl,Endo,DRup,Vcl,Vcl,NRvsDR_DEG,our analysis,Endo,DRup
170,Zbtb38,Astro,DRup,Zbtb38,Zbtb38,NRvsDR_DEG,our analysis,Astro,DRup
171,Zdbf2,"L2/3,L4,L5IT,L6CT,L6IT","DRup,DRup,DRup,DRup,DRup",Zdbf2,Zdbf2,NRvsDR_DEG,our analysis,"L2/3,L4,L5IT,L6CT,L6IT","DRup,DRup,DRup,DRup,DRup"
172,Zfp366,Endo,DRdn,Zfp366,Zfp366,NRvsDR_DEG,our analysis,Endo,DRdn


In [13]:
# Show the panel table (`df`) again for reference before merging.
df

Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
590,Abca1,Abca1,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
591,Lrp1,Lrp1,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
592,Dock2,Dock2,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
593,Abr,Abr,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,


In [14]:
# Independent (deep) copy of the panel table.
dfnew = df.copy(deep=True)
dfnew

Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
590,Abca1,Abca1,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
591,Lrp1,Lrp1,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
592,Dock2,Dock2,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,
593,Abr,Abr,OPC,Buchanan22_PNAS_Allen,phagocytosis enriched genes,,


In [15]:
# Index labels of the L2/3 dynamic genes that are not yet in the panel.
# A vectorized `isin` replaces the row-by-row loop, which rebuilt and scanned
# the panel's gene array once for every candidate row.
is_new_dynamic = ~dfu1r['gene'].isin(df['gene_name_data'])
newidx1 = dfu1r.index[is_new_dynamic].tolist()
len(newidx1)

104

In [16]:
# Index labels of the NR-vs-DR DEGs that are neither in the panel nor among the
# L2/3 dynamic genes. Those in the dynamic list are excluded here so they are
# not appended twice when the tables are merged.
# Vectorized `isin` replaces the row-by-row loop and its repeated array scans.
in_panel = dfu2r['gene'].isin(df['gene_name_data'])
in_dynamic = dfu2r['gene'].isin(dfu1r['gene_name_data'])
newidx2 = dfu2r.index[~in_panel & ~in_dynamic].tolist()
len(newidx2)

133

In [17]:
# Columns every appended block provides; the DEG rows also carry Annot1/Annot2.
shared_cols = ['gene_name_vizgen', 'gene_name_data', 'why included', 'source']
new_dynamic_rows = dfu1r.loc[newidx1, shared_cols]
new_deg_rows = dfu2r.loc[newidx2, shared_cols + ['Annot1', 'Annot2']]

# Stack the panel with the new rows. Columns a block does not provide are left
# as NaN, and the index is renumbered from 0.
df_merged = pd.concat([df, new_dynamic_rows, new_deg_rows], ignore_index=True)
df_merged

Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
827,Vcl,Vcl,NRvsDR_DEG,our analysis,Endo,DRup,
828,Zbtb38,Zbtb38,NRvsDR_DEG,our analysis,Astro,DRup,
829,Zdbf2,Zdbf2,NRvsDR_DEG,our analysis,"L2/3,L4,L5IT,L6CT,L6IT","DRup,DRup,DRup,DRup,DRup",
830,Zfp366,Zfp366,NRvsDR_DEG,our analysis,Endo,DRdn,


In [18]:
# True when no gene name occurs more than once in the merged panel.
df_merged['gene_name_data'].is_unique

True

In [25]:
# add those gene annotations back
def _append_label(frame, column, label, gene_list, sep=";"):
    """Append `label` to `frame[column]` for rows whose gene is in `gene_list`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table with a 'gene_name_data' column; modified in place.
    column : str
        Column holding `sep`-separated labels (e.g. 'why included', 'source').
    label : str
        Label to add.
    gene_list : array-like
        Gene names that should carry `label`.
    sep : str
        Separator between labels inside one cell.

    Rows that already list `label` as one of their `sep`-separated entries are
    left untouched, so re-running never adds the same label twice.
    """
    listed = frame['gene_name_data'].isin(gene_list)
    current = frame[column]

    # Compare whole entries rather than substrings: a substring test would
    # treat e.g. "Astrocyte_NRvsDR_DEG" as already containing "NRvsDR_DEG" and
    # silently skip the row. Non-string cells (NaN) count as "label absent"
    # instead of raising on the negation.
    has_label = current.map(
        lambda v: isinstance(v, str) and label in [t.strip() for t in v.split(sep)]
    ).astype(bool)
    has_text = current.map(
        lambda v: isinstance(v, str) and bool(v.strip())
    ).astype(bool)

    needs_label = listed & ~has_label
    extend = needs_label & has_text
    fill = needs_label & ~has_text
    frame.loc[extend, column] = current[extend] + sep + label
    # Empty / missing cells get the bare label (no leading separator).
    frame.loc[fill, column] = label


dfout = df_merged.copy()
print(dfout.shape)

g1 = dfu1r['gene'].values
g2 = dfu2r['gene'].values

# Tag genes that were already present in the table (and were therefore not
# appended as new rows) with the reason and source contributed by our analysis.
# NOTE(review): rows that receive "NRvsDR_DEG" here keep their existing
# Annot1/Annot2 values; the DEG subclass/direction strings from dfu2r are not
# copied over for them -- confirm that this is intended.
for _gene_list, _reason in ((g1, "L2/3 dynamic"), (g2, "NRvsDR_DEG")):
    _append_label(dfout, 'why included', _reason, _gene_list)
    _append_label(dfout, 'source', "our analysis", _gene_list)

(832, 7)


In [26]:
dfout

Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes;NRvsDR_DEG,Cheng22_Cell;our analysis,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
827,Vcl,Vcl,NRvsDR_DEG,our analysis,Endo,DRup,
828,Zbtb38,Zbtb38,NRvsDR_DEG,our analysis,Astro,DRup,
829,Zdbf2,Zdbf2,NRvsDR_DEG,our analysis,"L2/3,L4,L5IT,L6CT,L6IT","DRup,DRup,DRup,DRup,DRup",
830,Zfp366,Zfp366,NRvsDR_DEG,our analysis,Endo,DRdn,


# Save

In [27]:
# Write the merged panel to CSV with a header row and without the row index.
fout = '../results/MERFISH_gene_panel_merged_Feb28.csv'
dfout.to_csv(fout, index=False)

In [28]:
!head $fout

gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
Egfem1,Egfem1,L2/3 subtypes;NRvsDR_DEG,Cheng22_Cell;our analysis,A>C>B,screened,
Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
Adamts2,Adamts2,L2/3 subtypes;NRvsDR_DEG,Cheng22_Cell;our analysis,A>B=C,***,
Cdh13,Cdh13,All cell types;L2/3 subtypes;L2/3 dynamic,Chen22_biorxiv_Zador;Cheng22_Cell;our analysis,A>B=C,CSM,
6530403H02Rik,6530403H02Rik,L2/3 subtypes,Cheng22_Cell,A>B=C,,"top L2/3 A marker, single cell"
Rhbdl3,Rhbdl3,L2/3 subtypes;L2/3 dynamic,Cheng22_Cell;our analysis,A>B=C,**,top single cell marker


In [29]:
!wc -l $fout

833 ../results/MERFISH_gene_panel_merged_Feb28.csv
