# Purpose
- To re-create mfishtools in python (along with Hannah's code)
- Validate with copying hannah's inhibitory gene panel selection: no subsampling
- We want to use python for ease of use and to incorporate merFISH data in the future

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys

In [69]:
# options for cluster grouping
gene_panel_selection_ops = {
    'panel_name': 'inhibitory',
    'full_panel_size': 28,
    'starting_genes': ["Gad2","Slc17a7","Pvalb","Sst","Vip","Cck","Tac1","Npy","Crh","Necab1","Ptprt","Kirrel3","Penk","Hpse","Calb2","Chodl"],
    'layer_1234_filter': True,
    'use_supertypes': False,
    'blend_supertypes': False,
    'remove_bad_genes': False,
    'other_as_subclass': True,
}

# gene_panel_selection_ops = {
#     'panel_name': 'pan_neuronal',
#     'full_panel_size': 30,
#     'starting_genes': ["Gad2","Slc17a7","Tac2","Tac1","Calb1","Npy","Cck","Vip","Crh","Calb2","Penk","Oprm1","Pvalb","Ptprt","Kirrel3","Sst","Ndnf","Nos1","Baz1a","Sncg","Mybpc1","Lamp5","Hpse","Etv1","Rorb","Agmat","Chat","Adamts2"],
#     'layer_1234_filter': True,
#     'use_supertypes': False,
#     'blend_supertypes': True,
#     'remove_bad_genes': True,
#     'other_as_subclass': True,
# }

In [3]:
# paths to the data
data_folder = Path('//allen/programs/mindscope/workgroups/omfish/hannahs/mfish_project/gene_panels/L23_inh_panel/Mm_VISp_14236_20180912')

In [75]:
# read annotation
annotation = pd.read_feather(data_folder / 'anno.feather')
print(annotation.shape)
annotation.head()

(14236, 152)


Unnamed: 0,sample_id,cl,genes_label,confusion_label,cl_cor_label,core_int_label,primary_cluster_label,secondary_cluster_label,donor_label,genotype_label,...,lrcluster_id,lrcluster_label,lrcluster_color,lf_cluster_id,lf_cluster_label,lf_cluster_color,dendcluster_label,dendcluster_id,dendcluster_color,cluster_id
0,LS-14690_S02_E1-50,57.0,8537.0,0.342841,0.851554,core,Pvalb Tpbg,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,...,50.0,Pvalb Tpbg,#AF3F64,59.0,59 Pvalb Tpbg,#AF3F64,Pvalb Tpbg,114,#AF3F64,59
1,LS-14690_S03_E1-50,69.0,8106.0,0.030707,0.844934,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,...,59.0,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64
2,LS-14690_S05_E1-50,69.0,8779.0,0.109306,0.841516,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,...,59.0,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64
3,LS-14690_S06_E1-50,69.0,8494.0,0.033539,0.864382,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,...,59.0,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64
4,LS-14690_S07_E1-50,69.0,7562.0,0.036388,0.795118,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,...,59.0,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64


In [5]:
# read data (tasic 2018 v1)
data = pd.read_feather(data_folder / 'data_t.feather')
print(data.shape)
data.head()
# Takes about 1 minute to run

(45768, 14237)


Unnamed: 0,gene,LS-14690_S02_E1-50,LS-14690_S03_E1-50,LS-14690_S05_E1-50,LS-14690_S06_E1-50,LS-14690_S07_E1-50,LS-14690_S08_E1-50,LS-14690_S09_E1-50,LS-14690_S10_E1-50,LS-14690_S11_E1-50,...,SQ-80004_S37_E1-50,SQ-80004_S38_E1-50,SQ-80004_S39_E1-50,SQ-80004_S40_E1-50,SQ-80004_S41_E1-50,SQ-80004_S42_E1-50,SQ-80004_S43_E1-50,SQ-80004_S44_E1-50,SQ-80004_S47_E1-50,SQ-80004_S48_E1-50
0,0610005C13Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0610006L08Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0610007P14Rik,43.299797,11.16731,31.402268,145.99169,154.261257,0.0,0.0,83.984133,65.468468,...,205.562609,182.700625,0.0,63.1248,103.694716,0.0,340.619657,99.623263,0.0,132.725762
3,0610009B22Rik,126.692,54.905939,19.626418,16.753145,102.840838,47.063575,60.956081,113.517016,20.144144,...,9.042343,0.0,77.88796,75.843976,71.851457,0.439562,0.0,24.77119,0.0,145.95824
4,0610009E02Rik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.602823,0.0,0.0,6.124048,4.898963,0.0,0.0,0.0,0.0,4.009842


In [6]:
print(f'Data memory {sys.getsizeof(data) / 1e9:.2f} GB')

Data memory 5.22 GB


In [7]:
print(data.shape[1] - annotation.shape[0])

1


In [8]:
# preprocessing
if 'X' in data.columns:
    print('Dropping "X" column from data')
    data = data.drop(columns=['X'])
if 'X' in annotation.columns:
    print('Dropping "X" column fro annotation')
    annotation = annotation.drop(columns=['X'])
data.set_index('gene', inplace=True, drop=True)
# change the row order of annotation to match the order of columns in data
annotation.set_index('sample_id', inplace=True, drop=True)
annotation = annotation.loc[data.columns]
assert np.all(annotation.index.values == data.columns.values)

In [9]:
# read supertype information
supertype_folder = Path('//allen/programs/mindscope/workgroups/omfish/hannahs/mfish_project/gene_panels')
supertype_fn = supertype_folder / 'tasic2018_supertypes_manual_v2.xlsx'
sheet_name = 'all_supertypes_v2'
supertype = pd.read_excel(supertype_fn, sheet_name=sheet_name)
supertype.rename(columns={'Cell Type': 'cell_type', 'Supertype': 'supertype'}, inplace=True)
supertype.cell_type = supertype.cell_type.str.replace('\xa0', ' ')
supertype.supertype = supertype.supertype.str.replace('\xa0', ' ')
assert np.all([ct in annotation['cluster_label'].values for ct in supertype.cell_type.values])
supertype.set_index('cell_type', inplace=True, drop=True)

In [10]:
annotation['supertype_label'] = annotation.cluster_label.map(supertype.supertype)

  annotation['supertype_label'] = annotation.cluster_label.map(supertype.supertype)


In [97]:
# checking the mapping
anno_i = 500
cluster_label = annotation.iloc[anno_i].cluster_label
print(cluster_label)
supertype_label = annotation.iloc[anno_i].supertype_label
print(supertype_label)
supertype.loc[cluster_label]

L6 IT VISp Col23a1 Adamts2
L6 IT VISp Penk


ID                         5
supertype    L6 IT VISp Penk
Name: L6 IT VISp Col23a1 Adamts2, dtype: object

In [79]:
keep_class = ['GABAergic']
gabaergic_layer_threshold = 0.15
L6_layer_threshold = 0.75
L1234_labels = ['L1', 'L1-L2/3', 'L1-L4', 'L2/3', 'L2/3-L4', 'L4']
L6_labels = ['L5-L6', 'L6', 'L6b']

annotation = pd.read_feather(data_folder / 'anno.feather')

keep_types = []
if gene_panel_selection_ops['layer_1234_filter']:
    if 'Glutamatergic' in keep_class:
        L234_exc_subclasses = ['L2/3 IT','L4']
        L5_exc_subclasses = ['L5 IT','L5 PT','NP']
        L234_exc_types = annotation[annotation['subclass_label'].isin(L234_exc_subclasses)].cluster_label.unique()
        L5_exc_types = annotation[annotation['cluster_label'].isin(L5_exc_subclasses)].cluster_label.unique()
        keep_types.extend(L234_exc_types)
    if 'GABAergic' in keep_class:
        layer_df = annotation.query('class_label=="GABAergic"')[['layer_label', 'cluster_label']].copy()
        layer_table = layer_df.groupby(['layer_label', 'cluster_label']).size().unstack(fill_value=0)
        prop_table = layer_table.div(layer_table.sum(axis=0), axis=1)
        L1234_prop_sum = prop_table.loc[L1234_labels].sum(axis=0)
        L1234_inh_types = L14_prop_sum[L1234_prop_sum > gabaergic_layer_threshold].index.values
        not_L1234_inh_types = L14_prop_sum[L1234_prop_sum <= gabaergic_layer_threshold].index.values
        L6_prop_sum = prop_table.loc[L6_labels].sum(axis=0)
        L6_inh_types = L6_prop_sum[L6_prop_sum >= L6_layer_threshold].index.values
        L5_inh_types = np.setdiff1d(not_L1234_inh_types, L6_inh_types)
        keep_types.extend(L1234_inh_types)
    
    # Check these codes later
    if gene_panel_selection_ops['other_as_subclass']:
        L5_inh_cluster_labels = []
        for cluster_label in L5_inh_types:
            temp_subclass = cluster_label.split(' ')[0]
            indices = annotation.query('cluster_label==@cluster_label').index
            annotation.loc[indices, 'cluster_label'] = f'L5 {temp_subclass}' # need to change this code later. Don't reuse the same column name!
            L5_inh_cluster_labels.append(f'L5 {temp_subclass}')
        L5_inh_cluster_labels = np.unique(L5_inh_cluster_labels)
        keep_types.extend(L5_inh_cluster_labels)
    else:
        keep_types.extend(L5_inh_types)
    
    # Check these codes later
    if gene_panel_selection_ops['use_supertypes']:
        keep_clusts = annotation.query('cluster_label in @keep_types').supertype_label.unique()
        L5_inh_types = annotation.query('cluster_label in @L5_inh_types').L5_inh_types.unique()
        annotation['cluster_label_original'] = annotation['cluster_label']
        annotation['cluster_label'] = annotation['supertype_label']
        annotation.query('cluster_label_original in @L5_inh_cluster_labels')['cluster_label'] = \
            annotation.query('clusgter_label_original in @L5_inh_cluster_labels')['clusgter_label_original']
    else:
        keep_clusts = annotation.query('cluster_label in @keep_types').cluster_label.unique()
    


In [94]:
# remove starting genes that are not in the data
st_in_data = [st not in data.index.values for st in gene_panel_selection_ops['starting_genes']]
if np.any(st_in_data):
    st_not_in_data = [st for i, st in enumerate(gene_panel_selection_ops['starting_genes']) if st_in_data[i]]
    print(f'{st_not_in_data} are not in the data')
    gene_panel_selection_ops['starting_genes'] = [st for st in gene_panel_selection_ops['starting_genes'] if st not in st_not_in_data]

In [95]:
# Convert rpkm(? not TPM?) to log2
data_log2 = np.log2(data + 1)
# takes about 9 s to run

In [118]:
# calculate proportions and medians per cluster
cluster_names = annotation.cluster_label.unique()
expre_thresh = 1
# make data_log2 to have another level of columns with matching cluster names per cell ID
data_log2_cluster = data_log2.copy()
assert np.all(data_log2.columns == annotation.sample_id.values)
# groupby cluster and calculate median and proportion
data_log2_cluster.columns = pd.MultiIndex.from_arrays([annotation.cluster_label, data_log2.columns])
median_per_cluster = data_log2_cluster.T.groupby(level=0).median().T
prop_expr = data_log2_cluster.T.groupby(level=0).apply(lambda x: (x > expre_thresh).mean(axis=0)).T
assert np.all(prop_expr.index.values == median_per_cluster.index.values)
assert np.all(prop_expr.index.values == data_log2.index.values)



In [172]:
from importlib import reload
reload(mfishtools)

<module 'mfishtools' from 'c:\\Users\\jinho.kim\\Github\\lamf_analysis_lims\\gene_panel_selection\\mfishtoolspy\\mfishtools.py'>

In [168]:
import sys
sys.path.append(r'C:\Users\jinho.kim\Github\lamf_analysis_lims\gene_panel_selection\mfishtoolspy')
import mfishtools

In [176]:
median_per_cluster.index.isnull().any()

np.False_

In [204]:
reload(mfishtools)
run_genes = mfishtools.filter_panel_genes(median_per_cluster**2 - 1, 
prop_expr=prop_expr,
on_clusters=list(keep_clusts),
off_clusters=list(annotation.query('class_label=="Non-Neuronal"').cluster_label.unique()),
starting_genes=gene_panel_selection_ops['starting_genes'],
num_binary_genes=300,
min_on=10,
max_on=300,
max_off=10,
min_length=1400,
fraction_on_clusters=0.5,
exclude_families=["LOC","Fam","RIK","RPS","RPL","\\-","Gm","Rnf","BC0"])

1363 total genes pass constraints prior to binary score calculation.


In [205]:
len(run_genes)

315

In [184]:
list(annotation.query('class_label=="Non-Neuronal"').cluster_label.unique())

['Oligo Rassf10',
 'Oligo Synpr',
 'Oligo Serpinb1a',
 'Astro Aqp4',
 'Endo Ctla2a',
 'Peri Kcnj8',
 'Microglia Siglech',
 'PVM Mrc1',
 'VLMC Osr1 Mc5r',
 'VLMC Spp1 Col15a1',
 'OPC Pdgfra Ccnb1',
 'Endo Cytl1',
 'VLMC Spp1 Hs3st6',
 'SMC Acta2',
 'VLMC Osr1 Cd74',
 'OPC Pdgfra Grm5']

In [195]:
isinstance(on_clusters, list) and all(isinstance(x, str) for x in on_clusters)

False

In [197]:
summary_expr = median_per_cluster**2 - 1
prop_expr=prop_expr
on_clusters=list(keep_clusts)
off_clusters=list(annotation.query('class_label=="Non-Neuronal"').cluster_label.unique())
starting_genes=gene_panel_selection_ops['starting_genes'],
num_binary_genes=300,
min_on=10,
max_on=300,
max_off=10,
min_length=1400,
fraction_on_clusters=0.5,
exclude_families=["LOC","Fam","RIK","RPS","RPL","\\-","Gm","Rnf","BC0"]

In [134]:
temp_median_data = median_per_cluster.iloc[:100, :50].copy()
cor_dist = lambda x: 1 - np.corrcoef(x)
cluster_genes = temp_median_data.index
cluster_genes = list(set(cluster_genes).intersection(set(temp_median_data.index)))
cluster_distance = pd.DataFrame(cor_dist(temp_median_data.loc[cluster_genes, :].T),
                                index=temp_median_data.columns, columns=temp_median_data.columns)

In [127]:
a= ['1','2']
b = None
np.isin(a,b)

array([False, False])

In [53]:
layer_df = annotation.query('class_label=="GABAergic"')[['layer_label', 'cluster_label']].copy()
layer_table = layer_df.groupby(['layer_label', 'cluster_label']).size().unstack(fill_value=0)
prop_table = layer_table.div(layer_table.sum(axis=0), axis=1)

In [62]:
L14_labels = ['L1', 'L1-L2/3', 'L1-L4', 'L2/3', 'L2/3-L4', 'L4']
gabaergic_layer_threshold = 0.15
L14_prop_sum = prop_table.loc[L14_labels].sum(axis=0)
L14_prop_sum[L14_prop_sum > gabaergic_layer_threshold].index.values

array(['Lamp5 Fam19a1 Pax6', 'Lamp5 Fam19a1 Tmem182', 'Lamp5 Krt73',
       'Lamp5 Lhx6', 'Lamp5 Lsp1', 'Lamp5 Ntn1 Npy2r',
       'Lamp5 Plch2 Dock5', 'Pvalb Reln Itm2a', 'Pvalb Tpbg',
       'Pvalb Vipr2', 'Serpinf1 Aqp5 Vip', 'Sncg Gpr50', 'Sncg Slc17a8',
       'Sncg Vip Itih5', 'Sncg Vip Nptx2', 'Sst Calb2 Necab1',
       'Sst Calb2 Pdlim5', 'Sst Chodl', 'Sst Hpse Cbln4',
       'Sst Hpse Sema3c', 'Sst Mme Fam114a1', 'Sst Tac1 Htr1d',
       'Sst Tac1 Tacr3', 'Vip Arhgap36 Hmcn1', 'Vip Chat Htr1f',
       'Vip Col15a1 Pde1a', 'Vip Crispld2 Htr2c', 'Vip Crispld2 Kcne4',
       'Vip Igfbp4 Mab21l1', 'Vip Igfbp6 Car10', 'Vip Igfbp6 Pltp',
       'Vip Lect1 Oxtr', 'Vip Lmo1 Myl1', 'Vip Ptprt Pkp2',
       'Vip Pygm C1ql1', 'Vip Rspo1 Itga4', 'Vip Rspo4 Rxfp1 Chat'],
      dtype=object)

# Test

## Layer distribution

In [20]:
annotation.layer_label.unique()

array(['L2/3', 'L4', 'L5', 'L1', 'L6', 'L5-L6', 'L1-L4', 'L1-L6', 'L4-L6',
       'L1-L2/3', 'L4-L5', 'L2/3-L4', 'L6b'], dtype=object)

In [41]:
layer_df = annotation.query('class_label=="GABAergic"')[['layer_label', 'cluster_label']].copy()
# make a table counting the number of cells in each layer and cluster_label
layer_table = layer_df.groupby(['layer_label', 'cluster_label']).size().unstack(fill_value=0)

In [50]:
prop_table = layer_table.div(layer_table.sum(axis=0), axis=1).T
prop_table[prop_table['L5-L6']>0.3]

layer_label,L1,L1-L2/3,L1-L4,L1-L6,L2/3,L2/3-L4,L4,L5,L5-L6,L6,L6b
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Meis2 Adamts19,0.044444,0.0,0.0,0.088889,0.0,0.0,0.0,0.022222,0.6,0.244444,0.0
Pvalb Akr1c18 Ntf3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362069,0.637931,0.0
Pvalb Calb1 Sst,0.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.28169,0.380282,0.309859,0.0
Pvalb Th Sst,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032258,0.306452,0.66129,0.0
Sncg Slc17a8,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.083333,0.416667,0.083333,0.083333
Sst Chodl,0.035294,0.047059,0.094118,0.0,0.011765,0.0,0.011765,0.117647,0.458824,0.2,0.023529
Sst Nr2f2 Necab1,0.0,0.0,0.0,0.0,0.02439,0.0,0.0,0.585366,0.365854,0.02439,0.0
Sst Rxfp1 Eya1,0.0,0.0,0.0,0.018182,0.0,0.0,0.0,0.090909,0.509091,0.381818,0.0
Sst Tac2 Tacstd2,0.0,0.0,0.017857,0.0,0.0,0.0,0.0,0.339286,0.303571,0.339286,0.0


In [46]:
prop_table = layer_table.div(layer_table.sum(axis=0), axis=1).T
prop_table[prop_table['L1-L6']>0.05]

layer_label,L1,L1-L2/3,L1-L4,L1-L6,L2/3,L2/3-L4,L4,L5,L5-L6,L6,L6b
cluster_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Lamp5 Lhx6,0.065217,0.0,0.217391,0.108696,0.043478,0.0,0.0,0.086957,0.217391,0.26087,0.0
Meis2 Adamts19,0.044444,0.0,0.0,0.088889,0.0,0.0,0.0,0.022222,0.6,0.244444,0.0
Pvalb Vipr2,0.086957,0.0,0.456522,0.108696,0.23913,0.0,0.021739,0.021739,0.065217,0.0,0.0
Sst Calb2 Necab1,0.0,0.37037,0.185185,0.074074,0.111111,0.0,0.037037,0.185185,0.037037,0.0,0.0


In [28]:
layer_df = annotation.query('class_label=="GABAergic"')[['layer_label', 'cluster_label']].copy()
layer_df.groupby(['layer_label', 'cluster_label']).size().unstack(fill_value=0)

cluster_label,Lamp5 Fam19a1 Pax6,Lamp5 Fam19a1 Tmem182,Lamp5 Krt73,Lamp5 Lhx6,Lamp5 Lsp1,Lamp5 Ntn1 Npy2r,Lamp5 Plch2 Dock5,Meis2 Adamts19,Pvalb Akr1c18 Ntf3,Pvalb Calb1 Sst,...,Vip Igfbp4 Mab21l1,Vip Igfbp6 Car10,Vip Igfbp6 Pltp,Vip Lect1 Oxtr,Vip Lmo1 Fam159b,Vip Lmo1 Myl1,Vip Ptprt Pkp2,Vip Pygm C1ql1,Vip Rspo1 Itga4,Vip Rspo4 Rxfp1 Chat
layer_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
L1,33,32,21,3,104,124,208,2,0,0,...,3,25,12,12,0,1,8,7,11,11
L1-L2/3,0,0,0,0,1,0,4,0,0,0,...,0,3,0,16,0,2,13,16,6,13
L1-L4,3,2,10,10,49,26,59,0,0,0,...,9,7,11,17,1,10,18,8,4,3
L1-L6,0,0,0,5,4,2,1,4,0,0,...,1,0,0,1,1,2,3,2,0,0
L2/3,7,12,10,2,152,30,34,0,0,0,...,13,24,42,60,1,9,80,55,26,33
L2/3-L4,0,0,0,0,3,0,0,0,0,0,...,0,0,3,0,0,0,0,0,1,0
L4,0,0,0,0,24,2,4,0,0,2,...,17,4,34,2,5,34,57,36,2,4
L5,0,0,0,4,50,0,2,1,0,20,...,12,2,9,4,27,10,25,6,0,1
L5-L6,0,0,0,10,21,0,0,27,21,27,...,2,0,3,0,2,0,2,0,0,0
L6,0,0,0,12,42,0,0,11,37,22,...,1,6,9,3,24,0,4,2,0,2


In [36]:
annotation.slice_label.unique()

array(['CR003', 'CR002', 'CR004', 'CR006', 'CR005', 'CR007', 'CR008',
       'CR001', 'CRVISp'], dtype=object)

In [39]:
annotation.head()

Unnamed: 0,cl,genes_label,confusion_label,cl_cor_label,core_int_label,primary_cluster_label,secondary_cluster_label,donor_label,genotype_label,reporter_label,...,lrcluster_label,lrcluster_color,lf_cluster_id,lf_cluster_label,lf_cluster_color,dendcluster_label,dendcluster_id,dendcluster_color,cluster_id,supertype_label
LS-14690_S02_E1-50,57.0,8537.0,0.342841,0.851554,core,Pvalb Tpbg,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,Ai14(RCL-tdT),...,Pvalb Tpbg,#AF3F64,59.0,59 Pvalb Tpbg,#AF3F64,Pvalb Tpbg,114,#AF3F64,59,Pvalb Reln
LS-14690_S03_E1-50,69.0,8106.0,0.030707,0.844934,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,Ai14(RCL-tdT),...,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64,L4 IT VISp
LS-14690_S05_E1-50,69.0,8779.0,0.109306,0.841516,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,Ai14(RCL-tdT),...,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64,L4 IT VISp
LS-14690_S06_E1-50,69.0,8494.0,0.033539,0.864382,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,Ai14(RCL-tdT),...,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64,L4 IT VISp
LS-14690_S07_E1-50,69.0,7562.0,0.036388,0.795118,core,L4 IT VISp Rspo1,Low Quality Sst Chodl,222454,Snap25-IRES2-Cre/wt;Ai14(RCL-tdT)/wt,Ai14(RCL-tdT),...,L4 IT VISp Rspo1,#00979D,70.0,70 L4 IT VISp Rspo1,#00979D,L4 IT VISp Rspo1,7,#00979D,64,L4 IT VISp


In [40]:
annotation.reset_index().query('class_label=="GABAergic"')[['layer_label', 'slice_label', 'index']].groupby(['layer_label', 'slice_label']).count().unstack(fill_value=0)

Unnamed: 0_level_0,index,index,index,index,index,index,index,index,index
slice_label,CR001,CR002,CR003,CR004,CR005,CR006,CR007,CR008,CRVISp
layer_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
L1,0,13,44,269,160,182,59,0,0
L1-L2/3,0,0,9,203,26,0,0,0,0
L1-L4,0,65,213,227,10,0,0,0,0
L1-L6,6,1,17,22,0,4,0,0,0
L2/3,0,0,37,324,449,263,35,65,0
L2/3-L4,0,0,0,45,0,0,0,0,0
L4,0,0,27,193,156,63,2,26,0
L5,0,44,192,627,367,269,24,15,0
L5-L6,0,77,150,302,16,0,0,0,28
L6,32,2,108,287,207,143,16,0,0


In [34]:
annotation.query('class_label=="GABAergic"').groupby('layer_label').size()

layer_label
L1          727
L1-L2/3     238
L1-L4       515
L1-L6        50
L2/3       1173
L2/3-L4      45
L4          467
L5         1538
L5-L6       573
L6          795
L6b           4
dtype: int64

In [17]:
annotation[annotation['subclass_label'].isin(['L2/3 IT','L4'])].cluster_label.unique()

array(['L4 IT VISp Rspo1', 'L2/3 IT VISp Agmat', 'L2/3 IT VISp Adamts2',
       'L2/3 IT VISp Rrad'], dtype=object)

In [18]:
annotation.class_label.unique()


array(['GABAergic', 'Glutamatergic', 'Non-Neuronal'], dtype=object)

In [14]:
cname_list = list(annotation.columns)
[cn for cn in cname_list if 'class' in cn]

['subclass_id',
 'subclass_label',
 'subclass_color',
 'class_id',
 'class_label',
 'class_color']