# scFAIR_Sillet_WMB_2_KG: Jupyter Notebook Version

This notebook replicates the logic of `scFAIR_Sillet_WMB_2_KG.py` for interactive exploration and reporting. After each major code block, a concise report of the resulting data is shown. Only the final joined DataFrame is saved as a file.

In [6]:
# Import Required Libraries
import re
from pathlib import Path

import pandas as pd

In [7]:
# Load and Preprocess Data
info_path = Path('info_celltype_complete.tsv')
matrix_path = Path('sm_cluster.mapping_table.tsv')

# Minimum score a (row, column) pair of the mapping table must reach to be kept.
MIN_SCORE = 0.1

# Load info_celltype_complete.tsv
df_info = pd.read_csv(info_path, sep='\t')
print('info_celltype_complete.tsv:')
print('Shape:', df_info.shape)
print('Columns:', df_info.columns.tolist())
display(df_info.head())

# Collect every `cell_type` sharing a `cellTypeName_` into one list, then
# prepend 'mm_' to the name. Only the two columns actually used are selected.
# Note: groupby drops rows whose `cellTypeName_` is missing (pandas default).
df_info_agg = (
    df_info[['cellTypeName_', 'cell_type']]
    .groupby('cellTypeName_', as_index=False)
    .agg({'cell_type': list})
)
df_info_agg['cellTypeName_'] = 'mm_' + df_info_agg['cellTypeName_'].astype(str)
print('Aggregated info_cell_type_complete.tsv:')
print('Shape:', df_info_agg.shape)
display(df_info_agg.head())

# Load mapping table. The file is read without a header so that the first row
# (column labels) and first column (row labels) can be peeled off explicitly
# before the remaining cells are converted to floats.
raw = pd.read_csv(matrix_path, sep='\t', header=None)
header = raw.iloc[0, 1:].tolist()
rows = raw.iloc[1:, 0].tolist()
matrix = raw.iloc[1:, 1:]
matrix.columns = header
matrix.index = rows
matrix = matrix.astype(float)

# Melt to long format: one row per (row label 'r', column label 'c', score).
long_df = (
    matrix.reset_index()
    .melt(id_vars='index', var_name='c', value_name='score')
    .rename(columns={'index': 'r'})
)
# Take an explicit copy of the filtered rows so the rounding below writes to an
# independent frame rather than to a slice (avoids SettingWithCopyWarning).
long_df = long_df.loc[long_df['score'] >= MIN_SCORE].copy()
long_df['score'] = long_df['score'].round(2)
long_df = long_df.sort_values(by='score', ascending=False)
print('Processed mapping table (long format):')
print('Shape:', long_df.shape)
display(long_df.head())

info_celltype_complete.tsv:
Shape: (338, 8)
Columns: ['cell_type', 'notes_cell', 'NTR_details', 'cellTypeId_', 'cellTypeName_', 'cellTypeAnnotationStatus_', 'cell_type_new', 'comment_cell']


Unnamed: 0,cell_type,notes_cell,NTR_details,cellTypeId_,cellTypeName_,cellTypeAnnotationStatus_,cell_type_new,comment_cell
0,Astro-Epen: Astro-CB NN,CB: cerebellum,,CL:0002603,astrocyte of the cerebellum,perfect match,,
1,Astro-Epen: Astro-NT NN,NT: non-telencephalon,,CL:0000127,astrocyte,other,,
2,Astro-Epen: Astro-OLF NN,OLF: olfactory,,CL:0012000,astrocyte of the forebrain,other,,
3,Astro-Epen: Astro-TE NN,TE: telencephalon,,CL:0002605,astrocyte of the cerebral cortex,perfect match,,
4,Astro-Epen: Astroependymal NN,,,CL:0000683,ependymoglial cell,perfect match,,


Aggregated info_cell_type_complete.tsv:
Shape: (75, 2)


Unnamed: 0,cellTypeName_,cell_type
0,mm_Bergmann glial cell,[Astro-Epen: Bergmann NN]
1,mm_Cajal-Retzius cell,[OB-CR Glut: HPF CR Glut]
2,mm_D1/D2-hybrid medium spiny neuron,[CNU-LGE GABA: STR D1 Sema5a Gaba]
3,mm_GABAergic neuron,"[CNU-HYa GABA: BST Tac2 Gaba, CNU-HYa GABA: BS..."
4,mm_Island of Calleja granule cell,[CNU-LGE GABA: OT D3 Folh1 Gaba]


Processed mapping table (long format):
Shape: (882, 3)


Unnamed: 0,r,c,score
80936,mm_lymphocyte,hs_2,1.0
21656,hs_2,mm_lymphocyte,1.0
34322,mm_lymphocyte,hs_0,0.99
21554,hs_0,mm_lymphocyte,0.99
21555,hs_1,mm_lymphocyte,0.99


In [13]:
# Sanity check: aggregated cell-type names that contain the 'mm_' marker.
[name for name in df_info_agg['cellTypeName_'].tolist() if 'mm_' in name]


['mm_Bergmann glial cell',
 'mm_Cajal-Retzius cell',
 'mm_D1/D2-hybrid medium spiny neuron',
 'mm_GABAergic neuron',
 'mm_Island of Calleja granule cell',
 'mm_L2 intratelencephalic projecting glutamatergic neuron',
 'mm_L2/3 intratelencephalic projecting glutamatergic neuron',
 'mm_L4/5 intratelencephalic projecting glutamatergic neuron',
 'mm_L5 extratelencephalic projecting glutamatergic cortical neuron',
 'mm_L5 intratelencephalic projecting glutamatergic neuron',
 'mm_L6 corticothalamic-projecting glutamatergic cortical neuron',
 'mm_L6 intratelencephalic projecting glutamatergic neuron',
 'mm_L6b glutamatergic cortical neuron',
 'mm_Lamp5 Lhx6 neuron',
 'mm_Purkinje cell',
 'mm_amygdala excitatory neuron',
 'mm_arachnoid barrier cell',
 'mm_astrocyte',
 'mm_astrocyte of the cerebellum',
 'mm_astrocyte of the cerebral cortex',
 'mm_astrocyte of the forebrain',
 'mm_border associated macrophage',
 'mm_brain pericyte',
 'mm_cartwheel cell',
 'mm_caudal ganglionic eminence derived in

In [14]:
# Sanity check: column labels of the long mapping table that contain the 'mm_' marker.
[label for label in long_df['c'].tolist() if 'mm_' in label]


['mm_lymphocyte',
 'mm_lymphocyte',
 'mm_lymphocyte',
 'mm_Purkinje cell',
 'mm_unipolar brush cell',
 'mm_thalamic excitatory neuron',
 'mm_glutamatergic neuron',
 'mm_thalamic excitatory neuron',
 'mm_dendritic cell',
 'mm_sst chodl GABAergic cortical interneuron',
 'mm_border associated macrophage',
 'mm_L2/3 intratelencephalic projecting glutamatergic neuron',
 'mm_hippocampal CA1-3 neuron',
 'mm_indirect pathway medium spiny neuron',
 'mm_glutamatergic neuron',
 'mm_microglial cell',
 'mm_dentate gyrus of hippocampal formation granule cell',
 'mm_thalamic excitatory neuron',
 'mm_direct pathway medium spiny neuron',
 'mm_indirect pathway medium spiny neuron',
 'mm_L6 corticothalamic-projecting glutamatergic cortical neuron',
 'mm_indirect pathway medium spiny neuron',
 'mm_indirect pathway medium spiny neuron',
 'mm_sst GABAergic cortical interneuron',
 'mm_olfactory granule cell',
 'mm_lamp5 GABAergic cortical interneuron',
 'mm_cerebellar granule cell',
 'mm_L6 corticothalamic-p

In [15]:
# Inner join: keep only the mapping rows whose column label 'c' has a matching
# aggregated cell-type name in df_info_agg.
joined_df = long_df.merge(
    df_info_agg,
    how='inner',
    left_on='c',
    right_on='cellTypeName_',
)
print('Joined DataFrame:')
print('Shape:', joined_df.shape)
display(joined_df.head())

# Persist the joined table as a tab-separated file (row index omitted).
joined_df.to_csv('sm_cluster.mappings_long_joined.tsv', index=False, sep='\t')
print('Final joined DataFrame saved to sm_cluster.mappings_long_joined.tsv')

Joined DataFrame:
Shape: (432, 5)


Unnamed: 0,r,c,score,cellTypeName_,cell_type
0,hs_2,mm_lymphocyte,1.0,mm_lymphocyte,[Immune: Lymphoid NN]
1,hs_0,mm_lymphocyte,0.99,mm_lymphocyte,[Immune: Lymphoid NN]
2,hs_1,mm_lymphocyte,0.99,mm_lymphocyte,[Immune: Lymphoid NN]
3,hs_314,mm_Purkinje cell,0.97,mm_Purkinje cell,[CB GABA: CBX Purkinje Gaba]
4,hs_308,mm_unipolar brush cell,0.96,mm_unipolar brush cell,[CB Glut: DCO UBC Glut]


Final joined DataFrame saved to sm_cluster.mappings_long_joined.tsv


In [16]:
# Show the joined frame as this cell's output; the rendering is truncated for long frames.
joined_df

Unnamed: 0,r,c,score,cellTypeName_,cell_type
0,hs_2,mm_lymphocyte,1.00,mm_lymphocyte,[Immune: Lymphoid NN]
1,hs_0,mm_lymphocyte,0.99,mm_lymphocyte,[Immune: Lymphoid NN]
2,hs_1,mm_lymphocyte,0.99,mm_lymphocyte,[Immune: Lymphoid NN]
3,hs_314,mm_Purkinje cell,0.97,mm_Purkinje cell,[CB GABA: CBX Purkinje Gaba]
4,hs_308,mm_unipolar brush cell,0.96,mm_unipolar brush cell,[CB Glut: DCO UBC Glut]
...,...,...,...,...,...
427,hs_339,mm_neuron,0.10,mm_neuron,"[CNU-HYa Glut: GPi Tbr1 Cngb3 Gaba-Glut, HY GA..."
428,hs_409,mm_olfactory granule cell,0.10,mm_olfactory granule cell,"[OB-IMN GABA: OB Trdn Gaba, OB-IMN GABA: OB-ST..."
429,hs_146,mm_intratelencephalic-projecting glutamatergic...,0.10,mm_intratelencephalic-projecting glutamatergic...,"[IT-ET Glut: IT AON-TT-DP Glut, IT-ET Glut: IT..."
430,hs_237,mm_sst GABAergic cortical interneuron,0.10,mm_sst GABAergic cortical interneuron,[CTX-MGE GABA: Sst Gaba]


In [18]:
# Attach each human cluster's cell_set_accession via an inner join on the
# cluster id ('r' in joined_df, 'human_cluster' in the lookup table).
human_clusters = pd.read_csv(
    'human_clusters_with_top_mouse_pred_and_score.tsv',
    sep='\t',
    usecols=['human_cluster', 'cell_set_accession'],
)
joined_df_with_accession = (
    joined_df
    .merge(human_clusters, how='inner', left_on='r', right_on='human_cluster')
    # 'human_cluster' duplicates 'r' after the join, so it is dropped again.
    .drop(columns=['human_cluster'])
)
display(joined_df_with_accession.head())

Unnamed: 0,r,c,score,cellTypeName_,cell_type,cell_set_accession
0,hs_2,mm_lymphocyte,1.0,mm_lymphocyte,[Immune: Lymphoid NN],CS202210140_3
1,hs_0,mm_lymphocyte,0.99,mm_lymphocyte,[Immune: Lymphoid NN],CS202210140_1
2,hs_1,mm_lymphocyte,0.99,mm_lymphocyte,[Immune: Lymphoid NN],CS202210140_2
3,hs_314,mm_Purkinje cell,0.97,mm_Purkinje cell,[CB GABA: CBX Purkinje Gaba],CS202210140_315
4,hs_308,mm_unipolar brush cell,0.96,mm_unipolar brush cell,[CB Glut: DCO UBC Glut],CS202210140_309


In [19]:
# Give the columns descriptive names and remove cellTypeName_, which carries
# the same value as the mouse label column after the inner join.
joined_df_with_accession = (
    joined_df_with_accession
    .rename(columns={
        'r': 'human_cluster',
        'c': 'mouse_CL_cell_set',
        'cell_type': 'Mouse_subclasses',
        'cell_set_accession': 'Human_cell_set_accession',
    })
    # errors='ignore' makes the drop a no-op when the column is already absent.
    .drop(columns=['cellTypeName_'], errors='ignore')
)
display(joined_df_with_accession.head())

Unnamed: 0,human_cluster,mouse_CL_cell_set,score,Mouse_subclasses,Human_cell_set_accession
0,hs_2,mm_lymphocyte,1.0,[Immune: Lymphoid NN],CS202210140_3
1,hs_0,mm_lymphocyte,0.99,[Immune: Lymphoid NN],CS202210140_1
2,hs_1,mm_lymphocyte,0.99,[Immune: Lymphoid NN],CS202210140_2
3,hs_314,mm_Purkinje cell,0.97,[CB GABA: CBX Purkinje Gaba],CS202210140_315
4,hs_308,mm_unipolar brush cell,0.96,[CB Glut: DCO UBC Glut],CS202210140_309


In [22]:
# Clean Mouse_subclasses: reduce each list of labels to a single subclass name
# (the text after the first ': '), or '' when the list is empty or ambiguous.
def clean_mouse_subclass(lst):
    """Collapse a list of mouse subclass labels into one cleaned label.

    Parameters
    ----------
    lst : list of str, or str
        Labels such as 'Immune: Lymphoid NN'. A plain string is treated as an
        already-cleaned value and returned unchanged, so re-running this cell
        on an already-cleaned column does not blank it out.

    Returns
    -------
    str
        For a list with exactly one entry: that entry with everything up to
        and including the first ': ' removed (unchanged if it has no ': ').
        For an empty list or a list with several entries: ''.
    """
    if isinstance(lst, str):
        # Without this guard the string would be iterated character by
        # character and, having more than one "entry", collapse to ''.
        return lst
    cleaned = [x.split(': ', 1)[-1] for x in lst]
    return cleaned[0] if len(cleaned) == 1 else ''


if 'Mouse_subclasses' in joined_df_with_accession.columns:
    joined_df_with_accession['Mouse_subclasses'] = (
        joined_df_with_accession['Mouse_subclasses'].apply(clean_mouse_subclass)
    )
display(joined_df_with_accession.head())


Unnamed: 0,human_cluster,mouse_CL_cell_set,score,Mouse_subclasses,Human_cell_set_accession
0,hs_2,mm_lymphocyte,1.0,Lymphoid NN,CS202210140_3
1,hs_0,mm_lymphocyte,0.99,Lymphoid NN,CS202210140_1
2,hs_1,mm_lymphocyte,0.99,Lymphoid NN,CS202210140_2
3,hs_314,mm_Purkinje cell,0.97,CBX Purkinje Gaba,CS202210140_315
4,hs_308,mm_unipolar brush cell,0.96,DCO UBC Glut,CS202210140_309


In [27]:
# Map Mouse_subclasses to mouse accession using cell_set_map.tsv
cell_set_map_path = Path('../maps/cell_set_map.tsv')
cell_set_map = pd.read_csv(cell_set_map_path, sep='\t')

# Keep only the rows of the 'Whole Mouse Brain Taxonomy' dataset whose labelset
# is 'subclass'; copy so that columns can be added without touching cell_set_map.
is_wmb_dataset = cell_set_map['dataset'] == 'Whole Mouse Brain Taxonomy'
is_subclass_level = cell_set_map['labelset'] == 'subclass'
mouse_subclass_map = cell_set_map[is_wmb_dataset & is_subclass_level].copy()

# Remove leading numbers and space from label for matching.
# `re` is imported in the notebook's import cell rather than mid-notebook.
def clean_label(label):
    """Return `label` as a string without a leading '<digits><space>' prefix.

    The value is first converted with `str()`, so non-string inputs are
    accepted. Only a run of digits at the very start that is followed by a
    single space is removed; digits elsewhere are left alone.
    """
    return re.sub(r'^\d+ ', '', str(label))

# Normalised label column used as the lookup key for the accession mapping.
mouse_subclass_map['clean_label'] = mouse_subclass_map['label'].map(clean_label)

# Compute short_form from iri (text after last '/')
def iri_to_short_form(iri):
    """Return the part of `iri` after its last '/', or '' for a null value.

    The value is converted with `str()` first; an IRI without any '/' is
    returned whole.
    """
    if pd.notnull(iri):
        _, _, tail = str(iri).rpartition('/')
        return tail
    return ''

mouse_subclass_map['short_form'] = mouse_subclass_map['iri'].map(iri_to_short_form)

# Build mapping: cleaned label -> short_form
# (if a cleaned label occurs more than once, the last occurrence wins)
label_to_accession = (
    mouse_subclass_map
    .set_index('clean_label')['short_form']
    .to_dict()
)

# Map Mouse_subclasses to accession
def map_mouse_accession(subclass):
    """Look up `subclass` in `label_to_accession`; return '' when it is not a key."""
    if subclass in label_to_accession:
        return label_to_accession[subclass]
    return ''

# New column: accession short form for each row's subclass ('' when unmapped).
joined_df_with_accession['Mouse_accession'] = (
    joined_df_with_accession['Mouse_subclasses'].map(map_mouse_accession)
)

display(joined_df_with_accession.head())

Unnamed: 0,human_cluster,mouse_CL_cell_set,score,Mouse_subclasses,Human_cell_set_accession,Mouse_accession
0,hs_2,mm_lymphocyte,1.0,Lymphoid NN,CS202210140_3,CS20230722_SUBC_338
1,hs_0,mm_lymphocyte,0.99,Lymphoid NN,CS202210140_1,CS20230722_SUBC_338
2,hs_1,mm_lymphocyte,0.99,Lymphoid NN,CS202210140_2,CS20230722_SUBC_338
3,hs_314,mm_Purkinje cell,0.97,CBX Purkinje Gaba,CS202210140_315,CS20230722_SUBC_313
4,hs_308,mm_unipolar brush cell,0.96,DCO UBC Glut,CS202210140_309,CS20230722_SUBC_315
