# Purpose:
- address random seed issue in mfishtools
- Develop mfishtoolspy to iterate multiple times for one gene addition
- Make code more stable for addressing 'other' genes

In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from importlib import reload

import sys
sys.path.append(r'C:\Users\jinho.kim\Github\lamf_analysis_lims\gene_panel_selection\mfishtoolspy')
import mfishtools


In [151]:
# Options controlling panel selection and how clusters are grouped/mapped
ops = {
    # 'GABAergic', 'Glutamatergic' or 'Pan-neuronal'
    'panel_name': 'Pan-neuronal',
    'full_panel_size': 28,
    'starting_genes': [
        "Gad2", "Slc17a7", "Pvalb", "Sst", "Vip", "Cck", "Tac1", "Npy",
        "Crh", "Necab1", "Ptprt", "Kirrel3", "Penk", "Hpse", "Calb2", "Chodl",
    ],
    'layer_1234_filter': True,
    # Every *_level option below is one of: class, subclass, supertype, cluster
    'GABAergic_group_level': 'cluster',
    'GABAergic_mapping_level': 'cluster',
    'Glutamatergic_group_level': 'subclass',
    'Glutamatergic_mapping_level': 'subclass',
    'GABAergic_other_group_level': 'class',
    'GABAergic_other_mapping_level': 'cluster',
    'Glutamatergic_other_group_level': 'class',
    'Glutamatergic_other_mapping_level': 'subclass',
    # Don't know if I'm going to keep this
    'blend_supertypes': False,
    # Whether to remove redundant genes from the starting_genes list
    'remove_redundant_genes': False,
    # Threshold on normalized accuracy used when removing redundant genes
    'remove_redundant_genes_threshold': 0.95,
    'L1234_layer_threshold': 0.15,
    'L6_layer_threshold': 0.7,
    'L1234_labels': ['L1', 'L1-L2/3', 'L1-L4', 'L2/3', 'L2/3-L4', 'L4'],
    'L6_labels': ['L6', 'L6b'],
}

# Derive the cell classes to keep from the panel name.
# The match is a substring test and the first matching entry wins.
_keep_class_by_panel = (
    ('GABAergic', ['GABAergic']),
    ('Glutamatergic', ['Glutamatergic']),
    ('Pan-neuronal', ['GABAergic', 'Glutamatergic']),
)
for _panel_key, _classes in _keep_class_by_panel:
    if _panel_key in ops['panel_name']:
        ops['keep_class'] = _classes
        break
else:
    raise ValueError('panel_name must be GABAergic, Glutamatergic, or Pan-neuronal')

# A group level may never be finer than the level samples are mapped to.
level_hierarchy = {'class': 0, 'subclass': 1, 'supertype': 2, 'cluster': 3}
for _prefix in ('GABAergic', 'GABAergic_other', 'Glutamatergic', 'Glutamatergic_other'):
    assert level_hierarchy[ops[f'{_prefix}_group_level']] <= level_hierarchy[ops[f'{_prefix}_mapping_level']]




In [17]:
# Distinct layer labels present in the annotation table.
# NOTE: `annotation` is only loaded by a cell that appears further down this
# notebook, so on a fresh kernel this cell must be run after that load.
annotation['layer_label'].unique()

array(['L2/3', 'L4', 'L5', 'L1', 'L6', 'L5-L6', 'L1-L4', 'L1-L6', 'L4-L6',
       'L1-L2/3', 'L4-L5', 'L2/3-L4', 'L6b'], dtype=object)

In [146]:
# paths to the data
# Both folders are hard-coded absolute network-share locations; data_folder is
# written as a Windows UNC raw string, output_folder with forward slashes.
# data_folder: directory the annotation / expression feather files are read from.
data_folder = Path(r'\\allen\programs\mindscope\workgroups\learning\jinho\gene_panel_selection\data\mouse_VISp_gene_expression_matrices_2018-06-14')
# output_folder: not used in the cells visible here -- presumably where results are written; confirm.
output_folder = Path('//allen/programs/mindscope/workgroups/learning/jinho/gene_panel_selection/inhibitory')

In [147]:
# Load the per-sample annotation table
annotation = pd.read_feather(data_folder.joinpath('anno.feather'))
# Load the expression matrix
# TODO: check where this data is coming from. Values are similar to cpm but not exactly the same
# data = pd.read_feather(data_folder / 'data_t.feather')
data = pd.read_feather(data_folder.joinpath('exon_tpm.feather'))

In [150]:
# Index the annotation table by sample_id (the column is dropped from the frame).
# Guarded so that re-running this cell is a no-op: once 'sample_id' is the index
# it is no longer a column, and an unguarded set_index would raise KeyError.
if annotation.index.name != 'sample_id':
    annotation = annotation.set_index('sample_id', drop=True)
# data.set_index('gene', inplace=True, drop=True) # only necessary with data_t.feather

In [10]:
# preprocessing
# Drop a stray 'X' column if present (carried over from Hannah's code; it is
# unclear when this column appears, so the check is kept just in case).
if 'X' in data.columns:
    print('Dropping "X" column from data')
    data = data.drop(columns=['X'])
if 'X' in annotation.columns:
    print('Dropping "X" column from annotation')
    annotation = annotation.drop(columns=['X'])

# The rows of annotation must line up one-to-one, in the same order, with the
# columns of data. Reordering is not needed for the current files:
# annotation = annotation.loc[data.columns]
# np.array_equal also returns False (instead of relying on a broadcasted `==`)
# when the two axes have different lengths.
assert np.array_equal(annotation.index.values, data.columns.values), (
    'annotation index does not match the columns of data (same samples, same order expected)'
)

In [12]:
# Load the manual cluster -> supertype table and attach it to annotation
# TODO: re-define supertype (will be addressed in another notebook)
supertype_folder = Path('//allen/programs/mindscope/workgroups/omfish/hannahs/mfish_project/gene_panels')
supertype_fn = supertype_folder / 'tasic2018_supertypes_manual_v2.xlsx'
sheet_name = 'all_supertypes_v2'
supertype = (
    pd.read_excel(supertype_fn, sheet_name=sheet_name)
    .rename(columns={'Cell Type': 'cell_type', 'Supertype': 'supertype'})
)
# Replace non-breaking spaces in the spreadsheet text with plain spaces
for text_column in ('cell_type', 'supertype'):
    supertype[text_column] = supertype[text_column].str.replace('\xa0', ' ')
# Every cell type listed in the spreadsheet must exist as a cluster label
cluster_label_values = annotation['cluster_label'].values
assert np.all([cell_type in cluster_label_values for cell_type in supertype['cell_type'].values])
supertype = supertype.set_index('cell_type', drop=True)

# Look up each sample's supertype through its cluster label
annotation['supertype_label'] = annotation['cluster_label'].map(supertype['supertype'])

  annotation['supertype_label'] = annotation.cluster_label.map(supertype.supertype)


In [34]:
# Distinct supertype labels among Glutamatergic samples
annotation.loc[annotation['class_label'] == "Glutamatergic", 'supertype_label'].unique()

array(['L4 IT VISp', 'L2/3 IT VISp', 'L5 NP VISp Trhr ',
       'L5 IT VISp Batf3', 'L6 CT VISp Ctxn3', 'L6 IT VISp Penk',
       'L6 IT VISp Car3', 'L5 IT VISp Col6a1', 'L5 PT VISp C1ql2',
       'L6b P2ry12', 'L6 CT VISp Gpr139', 'L6b VISp Col8a1 ',
       'L5 PT VISp Chrna6', 'CR Lhx5', 'L6b Hsd17b2'], dtype=object)

In [27]:
# Distinct subclass labels among GABAergic samples
annotation.loc[annotation['class_label'] == "GABAergic", 'subclass_label'].unique()

array(['Pvalb', 'Vip', 'Lamp5', 'Sst', 'Sncg', 'Serpinf1', 'Meis2'],
      dtype=object)

In [None]:
# Scratch lookups for the Glutamatergic class: the annotation column names for
# the configured group / mapping levels, and the Glutamatergic subset of rows.
# These are kernel-global names, so any later cell that reads `group_label`,
# `mapping_label` or `temp_annotation` sees the values set here.
group_label = ops["Glutamatergic_group_level"] + '_label'
mapping_label = ops["Glutamatergic_mapping_level"] + '_label'
temp_annotation = annotation[annotation['class_label'] == "Glutamatergic"]

In [152]:
# Filtering and assigning group_label / mapping_label
# - "group" is the target clustering that we eventually want to classify.
# - "mapping" is the level each sample is matched to first, before the
#   classification accuracy is calculated at the group level.
#   E.g., match individual samples at cluster level but then classify them at supertype level.
#   E.g., classification can be mixed-level: some at cluster level, others at subclass level.
#
# "other" groups/mappings are relevant only when filtering by layers. They are the
# remaining groups/mappings that can be within the imaging regime but that we want
# to exclude from further analysis. When the "other" level differs from the regular
# level, the affected rows are relabelled '<class> L5 <label at the "other" level>'.
#
# Every label is prefixed with its class name to disambiguate the two classes.
#
# Filtering based on layer abundance
# TODO: test thresholds.
# TODO: Better to do this from merFISH data

# Second token of a Glutamatergic label (the first token is the class name)
# that marks it as kept / as "other" when layer filtering is on.
GLUTAMATERGIC_KEEP_LAYER_TOKENS = ['L2/3', 'L4']
GLUTAMATERGIC_OTHER_LAYER_TOKENS = ['L5', 'NP']


def _assign_class_labels(anno_df, class_name, level, column):
    """Write '<class_name> <label at level>' into `column` for all rows of one class.

    Parameters
    ----------
    anno_df : pd.DataFrame
        Annotation table; modified in place. Must contain 'class_label' and
        f'{level}_label'.
    class_name : str
        Value of 'class_label' selecting the rows to label.
    level : str
        Hierarchy level whose f'{level}_label' column provides the labels.
    column : str
        Destination column ('group_label' or 'mapping_label').
    """
    in_class = anno_df['class_label'] == class_name
    anno_df.loc[in_class, column] = [
        f'{class_name} {label}' for label in anno_df.loc[in_class, f'{level}_label']
    ]


def _collect_other_labels(anno_df, class_name, column, candidate_labels, level, other_level):
    """Return the "other" labels for `column`, relabelling rows when the levels differ.

    If `other_level` equals `level` the candidate labels are returned unchanged.
    Otherwise every row whose `column` value is one of `candidate_labels` is
    rewritten in place to '<class_name> L5 <label at other_level>', and the
    sorted unique new labels are returned.

    NOTE(review): rows are selected by label, not by cluster. When `level` is
    coarser than cluster, a label shared by kept and "other" clusters is
    relabelled for all of its rows -- confirm this is intended.

    Parameters
    ----------
    anno_df : pd.DataFrame
        Annotation table; may be modified in place.
    class_name : str
        Class name used as the label prefix.
    column : str
        'group_label' or 'mapping_label'.
    candidate_labels : list of str
        Current labels in `column` that were classified as "other".
    level, other_level : str
        Regular and "other" hierarchy levels for `column`.

    Returns
    -------
    list of str
    """
    # Nothing to rewrite when levels agree; the empty case also avoids an
    # assignment of an empty list to zero rows.
    if other_level == level or len(candidate_labels) == 0:
        return list(candidate_labels)
    is_other = anno_df[column].isin(candidate_labels)
    relabelled = [
        f'{class_name} L5 {label}'
        for label in anno_df.loc[is_other, f'{other_level}_label'].values
    ]
    anno_df.loc[is_other, column] = relabelled
    return np.unique(relabelled).tolist()


keep_groups = []
keep_mappings = []
other_groups = []
other_mappings = []

# Start from empty label columns so that re-running this cell with a different
# ops['keep_class'] does not leave labels from a previous run behind.
for label_column in ('group_label', 'mapping_label'):
    annotation[label_column] = np.full(len(annotation), np.nan, dtype=object)

if 'Glutamatergic' in ops['keep_class']:
    class_name = 'Glutamatergic'
    for kind, keep_list, other_list in (
        ('group', keep_groups, other_groups),
        ('mapping', keep_mappings, other_mappings),
    ):
        label_column = f'{kind}_label'
        level = ops[f'{class_name}_{kind}_level']
        # The level column is always derived from ops for this `kind`; it no
        # longer depends on a `mapping_label` variable defined in another cell.
        _assign_class_labels(annotation, class_name, level, label_column)
        temp_annotation = annotation.query('class_label=="Glutamatergic"')
        class_labels = temp_annotation[label_column].unique().tolist()

        # At class level, or without layer filtering, every label of the class is kept.
        # The labels are taken from the column itself (i.e. '<class> <class>' at class
        # level) so that they match what is stored in annotation.
        if level == 'class' or not ops['layer_1234_filter']:
            keep_list.extend(class_labels)
            continue

        # Layer filtering (L1234 only for now): decided by the token after the class name
        keep_list.extend(
            [label for label in class_labels
             if label.split(' ')[1] in GLUTAMATERGIC_KEEP_LAYER_TOKENS]
        )
        other_candidates = [
            label for label in class_labels
            if label.split(' ')[1] in GLUTAMATERGIC_OTHER_LAYER_TOKENS
        ]
        other_list.extend(
            _collect_other_labels(
                annotation, class_name, label_column, other_candidates,
                level, ops[f'{class_name}_other_{kind}_level'],
            )
        )

# Same for GABAergic
# Except for filtering, now we are using scRNAseq layer-enriched data with thresholds
if 'GABAergic' in ops['keep_class']:
    class_name = 'GABAergic'
    for kind in ('group', 'mapping'):
        _assign_class_labels(annotation, class_name, ops[f'{class_name}_{kind}_level'], f'{kind}_label')
    # Snapshot of the class rows taken before any "other" relabelling
    temp_annotation = annotation.query('class_label=="GABAergic"')

    # The layer split is needed by the group and the mapping branch alike, so it is
    # computed once up front whenever at least one of them is finer than class level.
    needs_layer_split = ops['layer_1234_filter'] and any(
        ops[f'{class_name}_{kind}_level'] != 'class' for kind in ('group', 'mapping')
    )
    if needs_layer_split:
        # Filtering process based on the layer abundance of each cluster
        layer_df = temp_annotation[['layer_label', 'cluster_label']].copy()
        layer_table = layer_df.groupby(['layer_label', 'cluster_label']).size().unstack(fill_value=0)
        # Per-cluster proportion of samples in each layer label (columns sum to 1)
        prop_table = layer_table.div(layer_table.sum(axis=0), axis=1)
        L1234_prop_sum = prop_table.loc[ops['L1234_labels']].sum(axis=0)
        L1234_inh_types = set(L1234_prop_sum[L1234_prop_sum >= ops['L1234_layer_threshold']].index.values)
        not_L1234_inh_types = set(layer_df.cluster_label).difference(L1234_inh_types)
        L6_prop_sum = prop_table.loc[ops['L6_labels']].sum(axis=0)
        L6_inh_types = set(L6_prop_sum[L6_prop_sum >= ops['L6_layer_threshold']].index.values)
        L5_inh_types = not_L1234_inh_types.difference(L6_inh_types)
        # L1234_inh_types are going to be kept
        # L5_inh_types are going to be "other"
        # Ignore L6_inh_types (assume they won't be imaged)
        keep_annotation = temp_annotation[temp_annotation['cluster_label'].isin(L1234_inh_types)]
        other_annotation = temp_annotation[temp_annotation['cluster_label'].isin(L5_inh_types)]

    for kind, keep_list, other_list in (
        ('group', keep_groups, other_groups),
        ('mapping', keep_mappings, other_mappings),
    ):
        label_column = f'{kind}_label'
        level = ops[f'{class_name}_{kind}_level']

        # At class level, or without layer filtering, every label of the class is kept
        if level == 'class' or not ops['layer_1234_filter']:
            keep_list.extend(temp_annotation[label_column].unique().tolist())
            continue

        keep_list.extend(keep_annotation[label_column].unique().tolist())
        other_candidates = other_annotation[label_column].unique().tolist()
        other_list.extend(
            _collect_other_labels(
                annotation, class_name, label_column, other_candidates,
                level, ops[f'{class_name}_other_{kind}_level'],
            )
        )


  annotation.loc[annotation.class_label == 'Glutamatergic', 'group_label'] = group_label_list
  annotation.loc[annotation.class_label == 'Glutamatergic', 'mapping_label'] = mapping_label_list


In [141]:
# GABAergic clusters whose summed sample proportion over ops['L6_labels'] is at least
# ops['L6_layer_threshold'] (set by the filtering cell); they end up in neither the
# kept set nor the "other" set.
L6_inh_types

{'Sst Crh 4930553C11Rik ', 'Sst Nts'}

## Validation
- Done with 'Pan-neuronal' setting

In [73]:
# Glutamatergic samples whose cluster label starts with 'L2/3'
is_glutamatergic = annotation['class_label'] == "Glutamatergic"
starts_with_L23 = annotation['cluster_label'].str.startswith('L2/3')
L23_exc = annotation[is_glutamatergic & starts_with_L23]
L23_exc.loc[:, ['class_label', 'cluster_label', 'group_label', 'mapping_label']]

Unnamed: 0_level_0,class_label,cluster_label,group_label,mapping_label
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LS-14690_S11_E1-50,Glutamatergic,L2/3 IT VISp Agmat,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Agmat
LS-14692_S01_E1-50,Glutamatergic,L2/3 IT VISp Agmat,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Agmat
LS-14696_S33_E1-50,Glutamatergic,L2/3 IT VISp Adamts2,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Adamts2
LS-14696_S34_E1-50,Glutamatergic,L2/3 IT VISp Agmat,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Agmat
LS-14696_S35_E1-50,Glutamatergic,L2/3 IT VISp Agmat,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Agmat
...,...,...,...,...
SM-GE8ZO_S094_E1-50,Glutamatergic,L2/3 IT VISp Agmat,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Agmat
SM-GE8ZO_S095_E1-50,Glutamatergic,L2/3 IT VISp Adamts2,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Adamts2
SM-GE8ZO_S096_E1-50,Glutamatergic,L2/3 IT VISp Adamts2,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Adamts2
SM-GE91F_S187_E1-50,Glutamatergic,L2/3 IT VISp Agmat,Glutamatergic L2/3 IT,Glutamatergic L2/3 IT VISp Agmat


In [74]:
# Distinct group labels, then distinct mapping labels, of the L2/3 subset
for label_column in ('group_label', 'mapping_label'):
    print(L23_exc[label_column].unique())

['Glutamatergic L2/3 IT']
['Glutamatergic L2/3 IT VISp Agmat' 'Glutamatergic L2/3 IT VISp Adamts2'
 'Glutamatergic L2/3 IT VISp Rrad']


In [75]:
# Glutamatergic samples whose cluster label starts with 'L4'
is_glutamatergic = annotation['class_label'] == "Glutamatergic"
starts_with_L4 = annotation['cluster_label'].str.startswith('L4')
L4_exc = annotation[is_glutamatergic & starts_with_L4]
L4_exc.loc[:, ['class_label', 'cluster_label', 'group_label', 'mapping_label']]

Unnamed: 0_level_0,class_label,cluster_label,group_label,mapping_label
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LS-14690_S03_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
LS-14690_S05_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
LS-14690_S06_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
LS-14690_S07_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
LS-14690_S08_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
...,...,...,...,...
SM-GE91H_S343_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
SM-GE91H_S344_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
SM-GE91H_S362_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1
SQ-80001_S12_E1-50,Glutamatergic,L4 IT VISp Rspo1,Glutamatergic L4,Glutamatergic L4 IT VISp Rspo1


In [78]:
# Distinct group labels, then distinct mapping labels, of the L4 subset
for label_column in ('group_label', 'mapping_label'):
    print(L4_exc[label_column].unique())

['Glutamatergic L4']
['Glutamatergic L4 IT VISp Rspo1']


In [76]:
# Glutamatergic samples whose cluster label starts with 'L5' or 'NP'
is_glutamatergic = annotation['class_label'] == "Glutamatergic"
starts_with_L5 = annotation['cluster_label'].str.startswith('L5')
starts_with_NP = annotation['cluster_label'].str.startswith('NP')
L5_exc = annotation[is_glutamatergic & (starts_with_L5 | starts_with_NP)]
L5_exc.loc[:, ['class_label', 'cluster_label', 'group_label', 'mapping_label']]

Unnamed: 0_level_0,class_label,cluster_label,group_label,mapping_label
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LS-14690_S17_E1-50,Glutamatergic,L5 NP VISp Trhr Cpne7,Glutamatergic Glutamatergic,Glutamatergic L5 NP VISp Trhr Cpne7
LS-14690_S20_E1-50,Glutamatergic,L5 IT VISp Hsd11b1 Endou,Glutamatergic Glutamatergic,Glutamatergic L5 IT VISp Hsd11b1 Endou
LS-14690_S26_E1-50,Glutamatergic,L5 NP VISp Trhr Met,Glutamatergic Glutamatergic,Glutamatergic L5 NP VISp Trhr Met
LS-14690_S30_E1-50,Glutamatergic,L5 NP VISp Trhr Cpne7,Glutamatergic Glutamatergic,Glutamatergic L5 NP VISp Trhr Cpne7
LS-14690_S31_E1-50,Glutamatergic,L5 IT VISp Batf3,Glutamatergic Glutamatergic,Glutamatergic L5 IT VISp Batf3
...,...,...,...,...
SM-GE91H_S364_E1-50,Glutamatergic,L5 IT VISp Col6a1 Fezf2,Glutamatergic Glutamatergic,Glutamatergic L5 IT VISp Col6a1 Fezf2
SM-GE91H_S365_E1-50,Glutamatergic,L5 IT VISp Batf3,Glutamatergic Glutamatergic,Glutamatergic L5 IT VISp Batf3
SM-GE91H_S366_E1-50,Glutamatergic,L5 IT VISp Batf3,Glutamatergic Glutamatergic,Glutamatergic L5 IT VISp Batf3
SM-GE91H_S367_E1-50,Glutamatergic,L5 IT VISp Batf3,Glutamatergic Glutamatergic,Glutamatergic L5 IT VISp Batf3


In [77]:
# Distinct group labels, then distinct mapping labels, of the L5/NP subset
for label_column in ('group_label', 'mapping_label'):
    print(L5_exc[label_column].unique())

['Glutamatergic Glutamatergic']
['Glutamatergic L5 NP VISp Trhr Cpne7'
 'Glutamatergic L5 IT VISp Hsd11b1 Endou'
 'Glutamatergic L5 NP VISp Trhr Met' 'Glutamatergic L5 IT VISp Batf3'
 'Glutamatergic L5 IT VISp Col27a1' 'Glutamatergic L5 PT VISp Krt80'
 'Glutamatergic L5 IT VISp Whrn Tox2'
 'Glutamatergic L5 PT VISp C1ql2 Ptgfr' 'Glutamatergic L5 PT VISp Chrna6'
 'Glutamatergic L5 IT VISp Col6a1 Fezf2'
 'Glutamatergic L5 PT VISp C1ql2 Cdh13' 'Glutamatergic L5 PT VISp Lgr5']


In [79]:
# Glutamatergic samples whose cluster label starts with 'L6' (this also matches 'L6b')
is_glutamatergic = annotation['class_label'] == "Glutamatergic"
starts_with_L6 = annotation['cluster_label'].str.startswith('L6')
L6_exc = annotation[is_glutamatergic & starts_with_L6]
L6_exc.loc[:, ['class_label', 'cluster_label', 'group_label', 'mapping_label']]

Unnamed: 0_level_0,class_label,cluster_label,group_label,mapping_label
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LS-14690_S34_E1-50,Glutamatergic,L6 CT VISp Ctxn3 Brinp3,Glutamatergic L6 CT,Glutamatergic L6 CT VISp Ctxn3 Brinp3
LS-14690_S35_E1-50,Glutamatergic,L6 IT VISp Penk Col27a1,Glutamatergic L6 IT,Glutamatergic L6 IT VISp Penk Col27a1
LS-14690_S36_E1-50,Glutamatergic,L6 CT VISp Ctxn3 Brinp3,Glutamatergic L6 CT,Glutamatergic L6 CT VISp Ctxn3 Brinp3
LS-14690_S38_E1-50,Glutamatergic,L6 IT VISp Col23a1 Adamts2,Glutamatergic L6 IT,Glutamatergic L6 IT VISp Col23a1 Adamts2
LS-14690_S40_E1-50,Glutamatergic,L6 IT VISp Penk Col27a1,Glutamatergic L6 IT,Glutamatergic L6 IT VISp Penk Col27a1
...,...,...,...,...
SM-GE8ZL_S112_E1-50,Glutamatergic,L6 CT VISp Nxph2 Wls,Glutamatergic L6 CT,Glutamatergic L6 CT VISp Nxph2 Wls
SM-GE8ZL_S176_E1-50,Glutamatergic,L6 IT VISp Penk Col27a1,Glutamatergic L6 IT,Glutamatergic L6 IT VISp Penk Col27a1
SM-GE8ZO_S035_E1-50,Glutamatergic,L6 CT VISp Ctxn3 Brinp3,Glutamatergic L6 CT,Glutamatergic L6 CT VISp Ctxn3 Brinp3
SM-GE91H_S368_E1-50,Glutamatergic,L6 IT VISp Penk Col27a1,Glutamatergic L6 IT,Glutamatergic L6 IT VISp Penk Col27a1


In [80]:
# Distinct group labels, then distinct mapping labels, of the L6 subset
for label_column in ('group_label', 'mapping_label'):
    print(L6_exc[label_column].unique())

['Glutamatergic L6 CT' 'Glutamatergic L6 IT' 'Glutamatergic L6b']
['Glutamatergic L6 CT VISp Ctxn3 Brinp3'
 'Glutamatergic L6 IT VISp Penk Col27a1'
 'Glutamatergic L6 IT VISp Col23a1 Adamts2'
 'Glutamatergic L6 IT VISp Col18a1' 'Glutamatergic L6 IT VISp Penk Fst'
 'Glutamatergic L6 IT VISp Car3' 'Glutamatergic L6 CT VISp Ctxn3 Sla'
 'Glutamatergic L6 CT VISp Krt80 Sla' 'Glutamatergic L6b P2ry12'
 'Glutamatergic L6 CT VISp Nxph2 Wls' 'Glutamatergic L6 CT VISp Gpr139'
 'Glutamatergic L6b VISp Mup5' 'Glutamatergic L6b Col8a1 Rprm'
 'Glutamatergic L6 CT Nxph2 Sla' 'Glutamatergic L6b VISp Col8a1 Rxfp1'
 'Glutamatergic L6b VISp Crh' 'Glutamatergic L6b Hsd17b2']


In [81]:
# Distinct group labels assigned to GABAergic samples
annotation.loc[annotation['class_label'] == "GABAergic", 'group_label'].unique()

array(['GABAergic Pvalb Tpbg', 'GABAergic Vip Chat Htr1f',
       'GABAergic Lamp5 Lsp1', 'GABAergic Sst Hpse Sema3c',
       'GABAergic Vip Pygm C1ql1', 'GABAergic Vip Ptprt Pkp2',
       'GABAergic Vip Igfbp4 Mab21l1', 'GABAergic Sst Chodl',
       'GABAergic Lamp5 Ntn1 Npy2r', 'GABAergic Vip Igfbp6 Car10',
       'GABAergic Vip Crispld2 Kcne4', 'GABAergic Vip Igfbp6 Pltp',
       'GABAergic L5 GABAergic', 'GABAergic Pvalb Reln Itm2a',
       'GABAergic Vip Lmo1 Myl1', 'GABAergic Vip Crispld2 Htr2c',
       'GABAergic Lamp5 Plch2 Dock5', 'GABAergic Vip Col15a1 Pde1a',
       'GABAergic Vip Rspo4 Rxfp1 Chat',
       'GABAergic Lamp5 Fam19a1 Tmem182', 'GABAergic Lamp5 Fam19a1 Pax6',
       'GABAergic Vip Arhgap36 Hmcn1', 'GABAergic Sncg Vip Itih5',
       'GABAergic Lamp5 Lhx6', 'GABAergic Sncg Slc17a8',
       'GABAergic Sncg Vip Nptx2', 'GABAergic Pvalb Vipr2',
       'GABAergic Lamp5 Krt73', 'GABAergic Serpinf1 Aqp5 Vip',
       'GABAergic Sst Calb2 Pdlim5', 'GABAergic Sst Mme Fam11

In [82]:
# Distinct mapping labels assigned to GABAergic samples
annotation.loc[annotation['class_label'] == "GABAergic", 'mapping_label'].unique()

array(['GABAergic Pvalb Tpbg', 'GABAergic Vip Chat Htr1f',
       'GABAergic Lamp5 Lsp1', 'GABAergic Sst Hpse Sema3c',
       'GABAergic Vip Pygm C1ql1', 'GABAergic Vip Ptprt Pkp2',
       'GABAergic Vip Igfbp4 Mab21l1', 'GABAergic Sst Chodl',
       'GABAergic Lamp5 Ntn1 Npy2r', 'GABAergic Vip Igfbp6 Car10',
       'GABAergic Vip Crispld2 Kcne4', 'GABAergic Vip Igfbp6 Pltp',
       'GABAergic L5 Vip Gpc3 Slc18a3', 'GABAergic Pvalb Reln Itm2a',
       'GABAergic Vip Lmo1 Myl1', 'GABAergic Vip Crispld2 Htr2c',
       'GABAergic Lamp5 Plch2 Dock5', 'GABAergic Vip Col15a1 Pde1a',
       'GABAergic L5 Pvalb Gabrg1', 'GABAergic Vip Rspo4 Rxfp1 Chat',
       'GABAergic Lamp5 Fam19a1 Tmem182', 'GABAergic Lamp5 Fam19a1 Pax6',
       'GABAergic L5 Sst Chrna2 Ptgdr', 'GABAergic L5 Pvalb Reln Tac1',
       'GABAergic Vip Arhgap36 Hmcn1', 'GABAergic L5 Pvalb Calb1 Sst',
       'GABAergic L5 Sst Rxfp1 Prdm8', 'GABAergic L5 Pvalb Th Sst',
       'GABAergic L5 Sst Nr2f2 Necab1', 'GABAergic L5 Sst Myh

In [142]:
# Group labels collected as "keep" by the filtering cell (Glutamatergic entries
# are appended first, then GABAergic ones).
keep_groups

['Glutamatergic L4',
 'Glutamatergic L2/3 IT',
 'GABAergic Pvalb Tpbg',
 'GABAergic Vip Chat Htr1f',
 'GABAergic Lamp5 Lsp1',
 'GABAergic Sst Hpse Sema3c',
 'GABAergic Vip Pygm C1ql1',
 'GABAergic Vip Ptprt Pkp2',
 'GABAergic Vip Igfbp4 Mab21l1',
 'GABAergic Sst Chodl',
 'GABAergic Lamp5 Ntn1 Npy2r',
 'GABAergic Vip Igfbp6 Car10',
 'GABAergic Vip Crispld2 Kcne4',
 'GABAergic Vip Igfbp6 Pltp',
 'GABAergic Pvalb Reln Itm2a',
 'GABAergic Vip Lmo1 Myl1',
 'GABAergic Vip Crispld2 Htr2c',
 'GABAergic Lamp5 Plch2 Dock5',
 'GABAergic Vip Col15a1 Pde1a',
 'GABAergic Vip Rspo4 Rxfp1 Chat',
 'GABAergic Lamp5 Fam19a1 Tmem182',
 'GABAergic Lamp5 Fam19a1 Pax6',
 'GABAergic Vip Arhgap36 Hmcn1',
 'GABAergic Sncg Vip Itih5',
 'GABAergic Lamp5 Lhx6',
 'GABAergic Sncg Slc17a8',
 'GABAergic Sncg Vip Nptx2',
 'GABAergic Pvalb Vipr2',
 'GABAergic Lamp5 Krt73',
 'GABAergic Serpinf1 Aqp5 Vip',
 'GABAergic Sst Calb2 Pdlim5',
 'GABAergic Sst Mme Fam114a1',
 'GABAergic Sst Hpse Cbln4',
 'GABAergic Vip Lect1 Oxtr

In [134]:
# Mapping labels collected as "keep" by the filtering cell (Glutamatergic entries
# are appended first, then GABAergic ones).
keep_mappings

['Glutamatergic L4 IT VISp Rspo1',
 'Glutamatergic L2/3 IT VISp Agmat',
 'Glutamatergic L2/3 IT VISp Adamts2',
 'Glutamatergic L2/3 IT VISp Rrad',
 'GABAergic Pvalb Tpbg',
 'GABAergic Vip Chat Htr1f',
 'GABAergic Lamp5 Lsp1',
 'GABAergic Sst Hpse Sema3c',
 'GABAergic Vip Pygm C1ql1',
 'GABAergic Vip Ptprt Pkp2',
 'GABAergic Vip Igfbp4 Mab21l1',
 'GABAergic Sst Chodl',
 'GABAergic Lamp5 Ntn1 Npy2r',
 'GABAergic Vip Igfbp6 Car10',
 'GABAergic Vip Crispld2 Kcne4',
 'GABAergic Vip Igfbp6 Pltp',
 'GABAergic Pvalb Reln Itm2a',
 'GABAergic Vip Lmo1 Myl1',
 'GABAergic Vip Crispld2 Htr2c',
 'GABAergic Lamp5 Plch2 Dock5',
 'GABAergic Vip Col15a1 Pde1a',
 'GABAergic Vip Rspo4 Rxfp1 Chat',
 'GABAergic Lamp5 Fam19a1 Tmem182',
 'GABAergic Lamp5 Fam19a1 Pax6',
 'GABAergic Vip Arhgap36 Hmcn1',
 'GABAergic Sncg Vip Itih5',
 'GABAergic Lamp5 Lhx6',
 'GABAergic Sncg Slc17a8',
 'GABAergic Sncg Vip Nptx2',
 'GABAergic Pvalb Vipr2',
 'GABAergic Lamp5 Krt73',
 'GABAergic Serpinf1 Aqp5 Vip',
 'GABAergic Sst C

In [143]:
# Group labels collected as "other" by the filtering cell; only populated when
# ops['layer_1234_filter'] is on and the class's group level is not 'class'.
other_groups

['Glutamatergic L5 Glutamatergic', 'GABAergic L5 GABAergic']

In [144]:
# Mapping labels collected as "other" by the filtering cell; only populated when
# ops['layer_1234_filter'] is on and the class's mapping level is not 'class'.
other_mappings

['Glutamatergic L5 NP VISp Trhr Cpne7',
 'Glutamatergic L5 IT VISp Hsd11b1 Endou',
 'Glutamatergic L5 NP VISp Trhr Met',
 'Glutamatergic L5 IT VISp Batf3',
 'Glutamatergic L5 IT VISp Col27a1',
 'Glutamatergic L5 PT VISp Krt80',
 'Glutamatergic L5 IT VISp Whrn Tox2',
 'Glutamatergic L5 PT VISp C1ql2 Ptgfr',
 'Glutamatergic L5 PT VISp Chrna6',
 'Glutamatergic L5 IT VISp Col6a1 Fezf2',
 'Glutamatergic L5 PT VISp C1ql2 Cdh13',
 'Glutamatergic L5 PT VISp Lgr5',
 'GABAergic Vip Gpc3 Slc18a3',
 'GABAergic Pvalb Gabrg1',
 'GABAergic Sst Chrna2 Ptgdr',
 'GABAergic Pvalb Reln Tac1',
 'GABAergic Pvalb Calb1 Sst',
 'GABAergic Sst Rxfp1 Prdm8',
 'GABAergic Pvalb Th Sst',
 'GABAergic Sst Nr2f2 Necab1',
 'GABAergic Sst Myh8 Etv1 ',
 'GABAergic Sst Rxfp1 Eya1',
 'GABAergic Sst Chrna2 Glra3',
 'GABAergic Sst Tac2 Tacstd2',
 'GABAergic Sst Esm1',
 'GABAergic Sst Myh8 Fibin',
 'GABAergic Serpinf1 Clrn1',
 'GABAergic Vip Lmo1 Fam159b',
 'GABAergic Sst Crhr2 Efemp1',
 'GABAergic Sst Tac2 Myh4',
 'GABAergic

In [153]:
# Every label collected by the filtering step must actually occur in annotation
group_labels_present = annotation['group_label'].unique()
mapping_labels_present = annotation['mapping_label'].unique()
for expected_labels, present_labels in (
    (other_groups, group_labels_present),
    (other_mappings, mapping_labels_present),
    (keep_groups, group_labels_present),
    (keep_mappings, mapping_labels_present),
):
    assert all(label in present_labels for label in expected_labels)

In [138]:
# Group labels present in annotation that are neither kept nor "other"
set(annotation['group_label'].unique()) - set(other_groups + keep_groups)

{'GABAergic Sst Crh 4930553C11Rik ',
 'GABAergic Sst Nts',
 'Glutamatergic CR',
 'Glutamatergic L6 CT',
 'Glutamatergic L6 IT',
 'Glutamatergic L6b',
 nan}

In [139]:
# Mapping labels present in annotation that are neither kept nor "other"
set(annotation['mapping_label'].unique()) - set(other_mappings + keep_mappings)

{'GABAergic Sst Crh 4930553C11Rik ',
 'GABAergic Sst Nts',
 'Glutamatergic CR Lhx5',
 'Glutamatergic L6 CT Nxph2 Sla',
 'Glutamatergic L6 CT VISp Ctxn3 Brinp3',
 'Glutamatergic L6 CT VISp Ctxn3 Sla',
 'Glutamatergic L6 CT VISp Gpr139',
 'Glutamatergic L6 CT VISp Krt80 Sla',
 'Glutamatergic L6 CT VISp Nxph2 Wls',
 'Glutamatergic L6 IT VISp Car3',
 'Glutamatergic L6 IT VISp Col18a1',
 'Glutamatergic L6 IT VISp Col23a1 Adamts2',
 'Glutamatergic L6 IT VISp Penk Col27a1',
 'Glutamatergic L6 IT VISp Penk Fst',
 'Glutamatergic L6b Col8a1 Rprm',
 'Glutamatergic L6b Hsd17b2',
 'Glutamatergic L6b P2ry12',
 'Glutamatergic L6b VISp Col8a1 Rxfp1',
 'Glutamatergic L6b VISp Crh',
 'Glutamatergic L6b VISp Mup5',
 nan}

# Filtering setting done
- Now address random seed issue
- Divide on_cluster_mapping and on_cluster_group
- Use on_cluster_mapping for initial mapping, and use on_cluster_group for classification performance calculation
- Iterate multiple times (100) for one gene addition
    - Try multiprocessing