# Compile thalamic nuclei marker genes from literature sources

In [1]:
import os
import pandas as pd
import anndata as ad

In [2]:
# root of the working directory; every resource file in this notebook is read from
# (or written to) the 'resources' folder below it
root_dir = "/root/capsule"

## Wang *et al.* 2020 CCFv3 marker genes

In [3]:
# raw supplementary table from Wang et al. 2020 (judging by the file name: Table S3,
# restricted to TH and ZI)
ccf_table_path = os.path.join(
    root_dir,
    'resources/Wang_2020_CCFv3_Table_S3_data_supporting_delineation_TH_ZI_only.csv',
)
ccf_csv = pd.read_csv(ccf_table_path)

In [4]:
# keep only the rows whose 'Dataset type' is 'Tg lines' or 'ISH (ABA)'
dataset_types_to_keep = ['Tg lines', 'ISH (ABA)']
is_kept_dataset = ccf_csv['Dataset type'].isin(dataset_types_to_keep)
ccf_df = ccf_csv[is_kept_dataset]

In [5]:
# Only a handful of the supplementary-table columns are needed from here on.
col_to_keep = [
    'structure abbreviation',
    'full structure name',
    'Major Brain Division',
    'Mouse line',
    'Dataset type',
    # NOTE: this signal grading is not actually that accurate for determining marker
    # genes (what is +/++/+++ relative to??); it is kept around just in case it
    # proves useful for something.
    'Transgenic Signal in CCF structure',
    'ABA gene name',
]

# Take an explicit copy: ccf_df is modified in place further down (inserted column,
# .loc assignments, new flag columns). Doing that on a selection of another frame
# is exactly what triggers pandas' SettingWithCopyWarning.
ccf_df = ccf_df[col_to_keep].copy()

In [6]:
# display the filtered, column-trimmed table (the notebook renders a truncated preview)
ccf_df

Unnamed: 0,structure abbreviation,full structure name,Major Brain Division,Mouse line,Dataset type,Transgenic Signal in CCF structure,ABA gene name
0,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Pvalb-IRES-Cre;Ai14,Tg lines,x,
1,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Grp-Cre_KH288;Ai14,Tg lines,x,
2,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Syt6-Cre_KI148;Ai14,Tg lines,xx,
3,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Dlg3-Cre_KG118;Ai14,Tg lines,xx,
4,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Fezf2-CreER;Ai14,Tg lines,xx,
...,...,...,...,...,...,...,...
809,ZI,Zona incerta,Hypothalamus,C57BL/6J,ISH (ABA),,Gfra1
810,ZI,Zona incerta,Hypothalamus,C57BL/6J,ISH (ABA),,Glra1
811,ZI,Zona incerta,Hypothalamus,C57BL/6J,ISH (ABA),,Kcnab3
812,ZI,Zona incerta,Hypothalamus,C57BL/6J,ISH (ABA),,Sphkap


In [7]:
# Record the literature source of these rows in a 'Reference' column, inserted at
# position 3 (i.e. right after 'Major Brain Division').
# The guard makes the cell safe to re-run: DataFrame.insert raises ValueError when
# the column already exists.
if 'Reference' not in ccf_df.columns:
    ccf_df.insert(3, 'Reference', 'Wang et al. 2020')

In [8]:
# For the 'Tg lines' rows, reduce the mouse line name to its driver gene, i.e. the
# text before the first '-' (e.g. 'Pvalb-IRES-Cre;Ai14' -> 'Pvalb'), and store it
# in the 'gene symbol' column.
mask_tg_lines = ccf_df['Dataset type'] == 'Tg lines'
tg_driver_genes = ccf_df.loc[mask_tg_lines, 'Mouse line'].str.split('-').str[0]
ccf_df.loc[mask_tg_lines, ['gene symbol']] = tg_driver_genes

In [9]:
# For the 'ISH (ABA)' rows the gene is already given in 'ABA gene name'; copy it into
# 'gene symbol' so that both dataset types share a single gene column.
mask_aba = ccf_df['Dataset type'] == 'ISH (ABA)'
aba_gene_names = ccf_df.loc[mask_aba, 'ABA gene name']
ccf_df.loc[mask_aba, ['gene symbol']] = aba_gene_names

In [10]:
# 'gene symbol' now holds the gene for both dataset types, so the two source
# columns it was built from can go
superseded_cols = ['Mouse line', 'ABA gene name']
ccf_df.drop(columns=superseded_cols, inplace=True)

In [11]:
# spot-check the cleaned-up table on a single structure (AD)
is_ad = ccf_df['structure abbreviation'].eq('AD')
ccf_df.loc[is_ad]

Unnamed: 0,structure abbreviation,full structure name,Major Brain Division,Reference,Dataset type,Transgenic Signal in CCF structure,gene symbol
423,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,x,Rorb
424,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,xx,Chrna2
425,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,xx,Scnn1a
426,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,xx,Gal
427,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,xxx,Kcng4
432,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,Tesc
433,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,C1ql2
434,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,Kcnc2
435,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,Igfbp5
436,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,Kcnab3


### Cross-reference with brain3 & brain1 gene panels

In [12]:
# load the brain1 and brain3 gene panels; in each file the gene list sits in a
# column named after the file itself
brain1_panel_path = os.path.join(root_dir, 'resources/gene_panel_VZ147_mouse_609882_brain1.csv')
brain3_panel_path = os.path.join(root_dir, 'resources/gene_panel_VZ142_mouse_638850_brain3.csv')
brain1_genes_csv = pd.read_csv(brain1_panel_path)
brain3_genes_csv = pd.read_csv(brain3_panel_path)

# plain Python lists of the gene symbols on each panel
genes_brain1 = brain1_genes_csv['gene_panel_VZ147_mouse_609882_brain1'].tolist()
genes_brain3 = brain3_genes_csv['gene_panel_VZ142_mouse_638850_brain3'].tolist()

In [13]:
# add one boolean column per gene panel: True where the row's gene is on that panel
panel_flag_columns = {
    'is_in_brain1_gene_panel': genes_brain1,
    'is_in_brain3_gene_panel': genes_brain3,
}
gene_symbols = ccf_df['gene symbol']
for flag_col, panel_genes in panel_flag_columns.items():
    ccf_df[flag_col] = gene_symbols.isin(panel_genes)
ccf_df.head(5)

Unnamed: 0,structure abbreviation,full structure name,Major Brain Division,Reference,Dataset type,Transgenic Signal in CCF structure,gene symbol,is_in_brain1_gene_panel,is_in_brain3_gene_panel
0,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,x,Pvalb,True,True
1,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,x,Grp,True,True
2,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,xx,Syt6,True,True
3,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,xx,Dlg3,False,False
4,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,xx,Fezf2,True,False


In [14]:
# example: combine the panel flags with a structure filter to pull out the genes
# worth a closer look (here: AD genes that are on at least one of the two panels)
is_ad = ccf_df['structure abbreviation'] == 'AD'
on_any_panel = ccf_df['is_in_brain1_gene_panel'] | ccf_df['is_in_brain3_gene_panel']
ccf_df[is_ad & on_any_panel]

Unnamed: 0,structure abbreviation,full structure name,Major Brain Division,Reference,Dataset type,Transgenic Signal in CCF structure,gene symbol,is_in_brain1_gene_panel,is_in_brain3_gene_panel
423,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,x,Rorb,True,True
426,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,Tg lines,xx,Gal,True,True
433,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,C1ql2,True,False
434,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,Kcnc2,True,False
436,AD,Anterodorsal nucleus,Thalamus,Wang et al. 2020,ISH (ABA),,Kcnab3,False,True


## Add flag column for Nagalski *et al.* 2016 marker genes

In [15]:
# gene table from Nagalski et al. 2016 (judging by the file name: Supplementary Data 2)
nagalski_table_path = os.path.join(
    root_dir,
    'resources/Nagalski_2016_Supplementary_Data_2_mouse_nuclei_specific_gene_table.csv',
)
nagalski_csv = pd.read_csv(nagalski_table_path)

In [16]:
# index the table by gene symbol so that a row can be looked up by gene name
nagalski_csv = nagalski_csv.set_index('Gene symbol')

In [17]:
# display the Nagalski table, now indexed by gene symbol
nagalski_csv

Unnamed: 0_level_0,Profile No.,Protein name,PG,ZI,RT,LHb,MHb,PF,SPF,PIL,...,VPPC,MGN,Sub,LD,VA/VL,DLG,VPL/VPM,AM,AV,AD
Gene symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ecel1,1,Endothelin converting enzyme-like 1,1,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
Esrrg,1,Estrogen-related receptor gamma,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Kcng4,1,"Potassium voltage-gated channel, subfamily G, ...",1,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Isl1,2,"ISL1 transcription factor, LIM/homeodomain",0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Meis2,2,Meis homeobox 2,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pde7b,24,Phosphodiesterase 7B,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,0,0,1
Wnt4,25,Wingless-related MMTV integration site 4,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
Ier3,25,Immediate early response 3,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
Cd59a,25,CD59a antigen,0,0,1,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0


### Display abbreviation / full-name pairs for CCFv3 structures

In [18]:
# one row per CCFv3 structure (abbreviation + full name), sorted by abbreviation
abbrv_col = 'structure abbreviation'
ccf_names = ccf_df.loc[:, [abbrv_col, 'full structure name']]
df_no_duplicates = ccf_names.drop_duplicates(subset=abbrv_col)
df_sorted = df_no_duplicates.sort_values(abbrv_col)
df_sorted

Unnamed: 0,structure abbreviation,full structure name
423,AD,Anterodorsal nucleus
382,AMd,"Anteromedial nucleus, dorsal part"
405,AMv,"Anteromedial nucleus, ventral part"
351,AV,Anteroventral nucleus of thalamus
675,CL,Central lateral nucleus of the thalamus
649,CM,Central medial nucleus of the thalamus
348,Eth,Ethmoid nucleus of the thalamus
460,IAD,Interanterodorsal nucleus of the thalamus
443,IAM,Interanteromedial nucleus of the thalamus
732,IGL,Intergeniculate leaflet of the lateral genicul...


### Manual mapping dictionaries between CCFv3 and Nagalski structure abbreviations

In [19]:
# print both abbreviation lists, sorted, so they can be compared by eye
# (the Nagalski column list also contains the non-structure columns
# 'Profile No.' and 'Protein name')
nagalski_column_names = sorted(nagalski_csv.columns)
ccf_structure_abbrvs = sorted(ccf_df['structure abbreviation'].unique())
print(nagalski_column_names)
print(ccf_structure_abbrvs)

['AD', 'AM', 'AV', 'CL', 'CM', 'DLG', 'IAD', 'IGL', 'IMD', 'LD', 'LHb', 'LP', 'MD', 'MGN', 'MHb', 'PC', 'PF', 'PG', 'PIL', 'PP', 'PT', 'PVA', 'PVP', 'Po', 'Profile No.', 'Protein name', 'RT', 'Re', 'ReA', 'Rh', 'SG', 'SPF', 'Sub', 'VA/VL', 'VM', 'VPL/VPM', 'VPPC', 'ZI']
['AD', 'AMd', 'AMv', 'AV', 'CL', 'CM', 'Eth', 'IAD', 'IAM', 'IGL', 'IMD', 'IntG', 'LD', 'LGd-co', 'LGd-ip', 'LGd-sh', 'LGv', 'LH', 'LP', 'MD', 'MGd', 'MGm', 'MGv', 'MH', 'PCN', 'PF', 'PIL', 'PO', 'POL', 'PP', 'PR', 'PT', 'PVT', 'PoT', 'RE', 'RH', 'RT', 'SGN', 'SMT', 'SPA', 'SPFm', 'SPFp', 'SubG', 'VAL', 'VM', 'VPL', 'VPLpc', 'VPM', 'VPMpc', 'Xi', 'ZI']


In [20]:
# Manual mapping from Nagalski to CCFv3 thalamic nuclei abbreviations
# (by MT, 2023-10-26).
#
# Each key is a Nagalski abbreviation. The value is the matching CCFv3
# abbreviation, or a list of CCFv3 abbreviations when one Nagalski structure
# corresponds to several CCFv3 structures.
nagalski_to_ccfv3_structure_abbrv_mapping = {
    'AD': 'AD',
    'AM': ['AMd', 'AMv'],
    'AV': 'AV',
    'CL': 'CL',
    'CM': 'CM',
    'DLG': ['LGd-co', 'LGd-ip', 'LGd-sh'],
    'IAD': 'IAD',
    'IGL': 'IGL',
    'IMD': 'IMD',
    'LD': 'LD',
    'LHb': 'LH',
    'LP': 'LP',
    'MD': 'MD',
    'MGN': ['MGd', 'MGm', 'MGv'],
    'MHb': 'MH',
    'PC': 'PCN',
    'PF': 'PF',
    # Nagalski2016 pg 2497: "PG (former ventral lateral geniculate nucleus)"
    'PG': 'LGv',
    'PIL': 'PIL',
    'PP': 'PP',
    'PT': 'PT',
    'PVA': 'PVT',
    'PVP': 'PVT',
    'Po': 'PO',
    'RT': 'RT',
    'Re': 'RE',
    'ReA': 'RE',
    'Rh': 'RH',
    'SG': 'SGN',
    'SPF': ['SPA', 'SPFm', 'SPFp'],
    # Nagalski2016 pg 2494: "Sub: Submedius thalamic nucleus"
    'Sub': 'SMT',
    'VA/VL': 'VAL',
    'VM': 'VM',
    'VPL/VPM': ['VPL', 'VPM'],
    'VPPC': ['VPLpc', 'VPMpc'],
    'ZI': 'ZI',
}

In [21]:
# Manual mapping from CCFv3 to Nagalski thalamic nuclei abbreviations
# (by MT, 2023-10-26).
#
# Each key is a CCFv3 abbreviation. The value is one of:
#   - the matching Nagalski abbreviation (a column name of the Nagalski table),
#   - a list of Nagalski abbreviations when several of them map onto this
#     CCFv3 structure,
#   - None when no Nagalski counterpart was assigned.
ccfv3_to_nagalski_structure_abbrv_mapping = {
    'AD': 'AD',
    'AMd': 'AM',
    'AMv': 'AM',
    'AV': 'AV',
    'CL': 'CL',
    'CM': 'CM',
    'Eth': None,
    'IAD': 'IAD',
    'IAM': None,
    'IGL': 'IGL',
    'IMD': 'IMD',
    'IntG': None,
    'LD': 'LD',
    'LGd-co': 'DLG',
    'LGd-ip': 'DLG',
    'LGd-sh': 'DLG',
    # Nagalski2016 pg 2497: "PG (former ventral lateral geniculate nucleus)"
    'LGv': 'PG',
    'LH': 'LHb',
    'LP': 'LP',
    'MD': 'MD',
    'MGd': 'MGN',
    'MGm': 'MGN',
    'MGv': 'MGN',
    'MH': 'MHb',
    'PCN': 'PC',
    'PF': 'PF',
    'PIL': 'PIL',
    'PO': 'Po',
    'POL': None,
    'PP': 'PP',
    'PR': None,
    'PT': 'PT',
    'PVT': ['PVA', 'PVP'],
    'PoT': None,
    'RE': ['Re', 'ReA'],
    'RH': 'Rh',
    'RT': 'RT',
    'SGN': 'SG',
    # Nagalski2016 pg 2494: "Sub: Submedius thalamic nucleus"
    'SMT': 'Sub',
    'SPA': 'SPF',
    'SPFm': 'SPF',
    'SPFp': 'SPF',
    'SubG': None,
    'VAL': 'VA/VL',
    'VM': 'VM',
    'VPL': 'VPL/VPM',
    'VPLpc': 'VPPC',
    'VPM': 'VPL/VPM',
    'VPMpc': 'VPPC',
    'Xi': None,
    'ZI': 'ZI',
}

### Fill in column to flag Nagalski marker genes

In [22]:
# work on a separate (deep) copy so that ccf_df itself is left without the
# Nagalski flag column
ccf_df_with_nagalski = ccf_df.copy(deep=True)

In [23]:
# Check whether each Wang et al. 2020 marker gene was also flagged by
# Nagalski et al. 2016 as expressed in the corresponding thalamic nucleus.
#
# This stays a plain Python loop: every CCFv3 abbreviation has to be translated
# through the manual mapping (None, one, or several Nagalski columns) before it
# can be used to index into the Nagalski table.
#
# The flags are collected in a list and assigned as one column at the end. That
# yields a proper boolean column (also for an empty table), avoids enlarging the
# frame one cell at a time, and does not depend on the row index being unique.
nagalski_flags = []
for ccf_abbrv, gene in zip(ccf_df_with_nagalski['structure abbreviation'],
                           ccf_df_with_nagalski['gene symbol']):
    expr_flag = False

    if gene in nagalski_csv.index:
        # an abbreviation missing from the mapping raises KeyError here, rather
        # than being silently treated as "not expressed"
        nagalski_abbrv = ccfv3_to_nagalski_structure_abbrv_mapping[ccf_abbrv]

        if nagalski_abbrv is not None:
            # normalise to a list of column names (one CCFv3 structure can map
            # to several Nagalski columns)
            if isinstance(nagalski_abbrv, str):
                nagalski_cols = [nagalski_abbrv]
            else:
                nagalski_cols = list(nagalski_abbrv)

            # selecting with lists always gives a DataFrame, so the reduction
            # below is a single scalar even if a gene symbol were to appear more
            # than once in the Nagalski index
            expr_flag = bool(nagalski_csv.loc[[gene], nagalski_cols].any(axis=None))

    nagalski_flags.append(expr_flag)

ccf_df_with_nagalski['is_expressed_Nagalski2016'] = nagalski_flags

In [24]:
# display the final table, including the new 'is_expressed_Nagalski2016' column
ccf_df_with_nagalski

Unnamed: 0,structure abbreviation,full structure name,Major Brain Division,Reference,Dataset type,Transgenic Signal in CCF structure,gene symbol,is_in_brain1_gene_panel,is_in_brain3_gene_panel,is_expressed_Nagalski2016
0,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,x,Pvalb,True,True,True
1,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,x,Grp,True,True,False
2,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,xx,Syt6,True,True,False
3,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,xx,Dlg3,False,False,False
4,VAL,Ventral anterior-lateral complex of the thalamus,Thalamus,Wang et al. 2020,Tg lines,xx,Fezf2,True,False,False
...,...,...,...,...,...,...,...,...,...,...
809,ZI,Zona incerta,Hypothalamus,Wang et al. 2020,ISH (ABA),,Gfra1,False,True,True
810,ZI,Zona incerta,Hypothalamus,Wang et al. 2020,ISH (ABA),,Glra1,False,False,False
811,ZI,Zona incerta,Hypothalamus,Wang et al. 2020,ISH (ABA),,Kcnab3,False,True,True
812,ZI,Zona incerta,Hypothalamus,Wang et al. 2020,ISH (ABA),,Sphkap,False,False,False


## Save out as csv resource file

In [25]:
# write the compiled table next to the other resource files; index=False leaves the
# DataFrame's row index out of the CSV
output_csv_path = os.path.join(root_dir, 'resources/thalamic_nuclei_marker_genes_from_Wang2020.csv')
ccf_df_with_nagalski.to_csv(output_csv_path, index=False)