In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import anndata

import matplotlib.pyplot as plt
from statsmodels.stats.multitest import multipletests
import seaborn as sns

from scroutines import basicu
# from scroutines import powerplots

In [2]:
# Select seaborn's 'talk' plotting context.
sns.set_context('talk')

# Baseline

In [3]:
# Open the P28NR h5ad file in backed, read-only mode and keep the index of
# `adata.var` (the gene names) for later membership checks.
# `anndata.read` is a deprecated alias; `read_h5ad` is the supported reader.
f = "../data/cheng21_cell_scrna/organized/P28NR.h5ad"
adata = anndata.read_h5ad(f, backed='r')
genes = adata.var.index.values
genes

array(['4933401J01Rik', 'Gm26206', 'Xkr4', ..., 'CAAA01064564.1',
       'Vmn2r122', 'CAAA01147332.1'], dtype=object)

In [4]:
# Load the current MERFISH gene panel.
f = "../results/MERFISH_gene_panel_Current_Mar7.csv"
df = pd.read_csv(f)

# remove Vincent genes: keep only rows whose gene also appears in the VX-edited panel
f = "../results/MERFISH_gene_panel_VX_edit_Mar7.csv"
df2 = pd.read_csv(f)
df = df[df['gene_name_data'].isin(df2['gene_name_data'])].copy()

# Sanity checks on the candidate genes: total count, number of unique names,
# any duplicated names, and any names absent from the dataset's gene list.
cnddts = df['gene_name_data'].values
unq, cnts = np.unique(cnddts, return_counts=True)
# A set gives constant-time lookups; `g not in genes` on the numpy array
# rescans the whole array once per candidate gene.
known_genes = set(genes)
print(len(cnddts), unq.shape, unq[cnts>1], [g for g in cnddts if g not in known_genes])

df

792 (792,) [] []


Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,
1,Egfem1,Egfem1,L2/3 subtypes;NRvsDR_DEG,Cheng22_Cell;our analysis,A>C>B,screened,
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,
...,...,...,...,...,...,...,...
827,Vcl,Vcl,NRvsDR_DEG,our analysis,Endo,DRup,
828,Zbtb38,Zbtb38,NRvsDR_DEG,our analysis,Astro,DRup,
829,Zdbf2,Zdbf2,NRvsDR_DEG,our analysis,"L2/3,L4,L5IT,L6CT,L6IT","DRup,DRup,DRup,DRup,DRup",
830,Zfp366,Zfp366,NRvsDR_DEG,our analysis,Endo,DRdn,


# Annotate and add L2/3 genes

In [5]:
# Read the annotated L2/3 gene table and tag every row with a constant
# 'Annot4' label.
f = "../data/cheng21_cell_scrna/res/L23-ABC-genes-n288-n286unq-annot.csv"
df3 = pd.read_csv(f).assign(Annot4='L2/3 types')
df3

Unnamed: 0,gene,group,ispicked,P17on,P14,DR,Annot4
0,Matn2,A2,True,A,BC,1,L2/3 types
1,Fam126a,A2,False,A,unsure,1,L2/3 types
2,Egfem1,A2,True,A,unsure,unsure,L2/3 types
3,Gm19410,A2,False,A,unsure,1,L2/3 types
4,Grk3,A2,False,A,unsure,1,L2/3 types
...,...,...,...,...,...,...,...
281,Brinp3,C3,True,C,BC,unsure,L2/3 types
282,Ell2,C3,True,C,unsure,3,L2/3 types
283,Gm21949,C3,False,C,unsure,unsure,L2/3 types
284,Jdp2,C3,True,C,BC,3,L2/3 types


In [6]:
# Count the annotated genes already among the panel candidates vs. those missing from it.
in_panel = df3['gene'].isin(cnddts)
in_panel.sum(), (~in_panel).sum()

(162, 124)

In [7]:
# Split the annotated genes by their 'ispicked' flag.
picked = df3['ispicked']
g3_p = df3.loc[picked, 'gene']
g3_up = df3.loc[~picked, 'gene']

# Picked genes are not added; the print reports whether all of them are
# already among the candidates.
g0 = g3_p.values
print(np.all(g3_p.isin(cnddts)))

# Unpicked genes, split by candidate membership:
#   g1 - already a candidate  -> only receives extra annotation
#   g2 - not a candidate yet  -> unique genes to add
unpicked_in_panel = g3_up.isin(cnddts)
g1 = g3_up[unpicked_in_panel].values
g2 = g3_up[~unpicked_in_panel].values

# Report the size of each group and the overall total.
for group in (g0, g1, g2):
    print(group.shape)
print(len(g0)+len(g1)+len(g2))

True
(122,)
(40,)
(124,)
286


In [8]:
# Work on a copy of the panel. Rows whose gene is in g1 get the L2/3 reason
# and its source prepended to their existing ';'-separated entries.
dfout = df.copy()
cond = dfout['gene_name_data'].isin(g1)
for col, prefix in (('why included', "L2/3 subtypes;"), ('source', "Cheng22_Cell;")):
    dfout.loc[cond, col] = prefix + dfout.loc[cond, col]

In [9]:
# Count the panel rows whose gene is in g2.
cond = dfout['gene_name_data'].isin(g2)
cond.sum()

0

In [10]:
# Append the g2 genes to the panel as new rows, labelled with the L2/3 reason
# and source. Only genes not already present in dfout are appended, so
# re-running this cell does not stack duplicate rows onto dfout.
genes_to_add = g2[~pd.Index(g2).isin(dfout['gene_name_data'])]
dfnew = pd.DataFrame({
    'gene_name_data': genes_to_add,
    'gene_name_vizgen': genes_to_add,
    'why included': 'L2/3 subtypes',
    'source': 'Cheng22_Cell',
})
dfout = pd.concat([dfout, dfnew], ignore_index=True)


In [11]:
# Attach the L2/3 annotation columns to the panel rows by gene name.
# Left join: genes without a match in df3 get missing values in the new columns.
# validate='many_to_one' makes pandas raise MergeError if a gene appears more
# than once in df3, instead of silently duplicating panel rows.
annot_cols = ['gene', 'Annot4', 'ispicked', 'P17on', 'P14', 'DR']
dfout = pd.merge(dfout, df3[annot_cols].rename(columns={'gene': 'gene_name_data'}),
                 how='left', on='gene_name_data', validate='many_to_one')
dfout

Unnamed: 0,gene_name_vizgen,gene_name_data,why included,source,Annot1,Annot2,Annot3,Annot4,ispicked,P17on,P14,DR
0,Matn2,Matn2,L2/3 subtypes,Cheng22_Cell,A>C>B,screened,,L2/3 types,True,A,BC,1
1,Egfem1,Egfem1,L2/3 subtypes;NRvsDR_DEG,Cheng22_Cell;our analysis,A>C>B,screened,,L2/3 types,True,A,unsure,unsure
2,Grb14,Grb14,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,L2/3 types,True,A,unsure,1
3,Adamts17,Adamts17,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,L2/3 types,True,A,unsure,unsure
4,Ldb2,Ldb2,L2/3 subtypes,Cheng22_Cell,A>C>B,*,,L2/3 types,True,A,unsure,unsure
...,...,...,...,...,...,...,...,...,...,...,...,...
911,AI593442,AI593442,L2/3 subtypes,Cheng22_Cell,,,,L2/3 types,False,C,unsure,unsure
912,Cpne9,Cpne9,L2/3 subtypes,Cheng22_Cell,,,,L2/3 types,False,C,BC,unsure
913,Dennd4c,Dennd4c,L2/3 subtypes,Cheng22_Cell,,,,L2/3 types,False,C,unsure,unsure
914,Gm21949,Gm21949,L2/3 subtypes,Cheng22_Cell,,,,L2/3 types,False,C,unsure,unsure


In [12]:
# Write the updated panel to CSV: header row included (the default), index omitted.
fout = "../results/MERFISH_gene_panel_Current_Mar7-v2.csv"
dfout.to_csv(fout, index=False)

In [13]:
# Sum of the 'ispicked' flags, i.e. the number of rows flagged as picked.
dfout['ispicked'].sum()

122

In [14]:
# Number of rows carrying the 'L2/3 types' annotation.
(dfout['Annot4']=='L2/3 types').sum()

286

# Summarize

In [None]:
# Collect every distinct reason; each 'why included' entry is a
# ';'-separated list of reasons.
# NOTE(review): this reads `df` (the filtered input panel), not `dfout` — confirm that is intended.
reason_lists = df['why included'].str.split(';')
unq_reasons = np.unique(np.hstack(reason_lists))
unq_reasons

In [None]:
# Count, for each reason, how many panel rows list it.
# Reasons are matched as exact ';'-separated tokens: `str.contains(reason)`
# would interpret the reason as a regular expression and would also count
# rows where it merely occurs as a substring of a longer reason.
reason_lists = [
    tokens for tokens in df['why included'].str.split(';')
    if isinstance(tokens, list)  # skip missing entries
]
breakdown = pd.DataFrame(
    [
        {'reason': reason, 'num': sum(reason in tokens for tokens in reason_lists)}
        for reason in unq_reasons
    ],
    columns=['reason', 'num'],  # keeps the sort below valid even when there are no reasons
)
breakdown.sort_values('num', ascending=False)

In [None]:
# Raise the pandas row-display limit to 100 (applies to all later cells too).
pd.set_option('display.max_rows', 100)
# Number of panel rows for each ('why included', 'source') combination.
df.groupby(['why included', 'source']).size().rename('number').to_frame()

In [None]:
# Number of panel rows for each distinct 'why included' entry.
df.groupby('why included').size().rename('number').to_frame()

In [None]:
# Same per-entry counts, largest first.
why_counts = df.groupby('why included').size()
why_counts.sort_values(ascending=False).rename('number').to_frame()

In [None]:
# Number of panel rows for each distinct 'source' entry.
df.groupby('source').size().rename('number').to_frame()

In [None]:
# Rows whose reason mentions "All cell types", rows whose source mentions
# "Chen22_biorxiv_Zador", and rows satisfying both; print each selection's shape.
cond1 = df['why included'].str.contains("All cell types")
cond2 = df['source'].str.contains("Chen22_biorxiv_Zador")
for selection in (cond1, cond2, cond1 & cond2):
    print(df[selection].shape)

In [None]:
# Check whether a gene named "March3" is present in the panel.
# NOTE(review): presumably a guard against the symbol being mangled upstream — confirm.
"March3" in df['gene_name_data'].values

# Save

In [None]:
# fout = '../results/MERFISH_gene_panel_merged_Feb28.csv' 
# dfout.to_csv(fout, header=True, index=False)

In [None]:
# !head $fout

In [None]:
# !wc -l $fout