## load and clean cluster mapping table

In [14]:
import pandas as pd
# Step 1: Load and reshape mapping table
mapping_wide = pd.read_csv("sm_cluster.mapping_table.tsv", sep="\t", index_col=0)
mapping_long = mapping_wide.reset_index().melt(id_vars='index', 
                                               var_name='cluster_2', 
                                               value_name='score')

In [15]:
# Step 2: Rename columns for clarity
mapping_long = mapping_long.rename(columns={'index': 'cluster_1'})

In [16]:
# Step 3: Identify species based on prefix
def get_species(label):
    if label.startswith("mm_"):
        return "mouse"
    elif label.startswith("hs_"):
        return "human"
    else:
        return "unknown"

mapping_long['species_1'] = mapping_long['cluster_1'].apply(get_species)
mapping_long['species_2'] = mapping_long['cluster_2'].apply(get_species)

In [17]:
mapping_long = mapping_long[
    ((mapping_long['species_1'] == 'mouse') & (mapping_long['species_2'] == 'human')) |
    ((mapping_long['species_1'] == 'human') & (mapping_long['species_2'] == 'mouse'))
].copy()

In [19]:
# Step 5: Standardize columns: always make mouse → human direction
# If cluster_1 is mouse and cluster_2 is human, keep as is
# If it's reversed, swap them
def reorder_clusters(row):
    if row['species_1'] == 'mouse':
        return pd.Series({'mouse_cluster': row['cluster_1'], 'human_cluster': row['cluster_2'], 'score': row['score']})
    else:
        return pd.Series({'mouse_cluster': row['cluster_2'], 'human_cluster': row['cluster_1'], 'score': row['score']})

mapping_clean = mapping_long.apply(reorder_clusters, axis=1)

## load and clean human_labels

In [26]:
# 1. Load human cluster annotations
human_labels = pd.read_csv("1_final_celltype.tsv", sep="\t")
human_labels = human_labels.rename(columns={'Cluster ID': 'human_cluster_id', 'cell_ontology_term': 'human_label'})

In [27]:
human_labels.columns

Index(['cell_set_accession', 'cell_label', 'cell_fullname',
       'parent_cell_set_accession', 'parent_cell_set_name', 'labelset',
       'cell_ontology_term_id', 'human_label', 'rationale', 'rationale_dois',
       'marker_gene_evidence', 'synonyms', 'human_cluster_id',
       'Class auto_annotation', 'sc_notes', 'Neurotransmitter auto_annotation',
       'Subtype auto_annotation', 'Transferred MTG Label',
       'Neuropeptide auto_annotation', 'Top three regions',
       'Top three dissections', 'Top Enriched Genes', 'Number of cells',
       'DoubletFinder score', 'Total UMI', 'Fraction unspliced',
       'Fraction mitochondrial', 'H19.30.002', 'H19.30.001', 'H18.30.002',
       'H18.30.001', 'Fraction cells from top donor', 'Number of donors',
       'subcluster_id'],
      dtype='object')

In [28]:
# 2. Clean human cluster column: convert 'hs_32' → 32
mapping_clean['human_cluster_id'] = mapping_clean['human_cluster'].str.extract(r'hs_(\d+)').astype(int)

In [29]:
# 3. Find top-scoring mouse cluster for each human cluster
top_mouse_per_human = (
    mapping_clean.sort_values('score', ascending=False)
    .groupby('human_cluster_id')
    .first()
    .reset_index()
)

In [30]:
# 4. Join with human label
top_mouse_per_human = top_mouse_per_human.merge(human_labels, on='human_cluster_id', how='left')

In [31]:
# 5. Clean mouse label (remove 'mm_' prefix for comparison)
top_mouse_per_human['mouse_label'] = top_mouse_per_human['mouse_cluster'].str.replace('mm_', '', regex=False)

In [32]:
# 6. Evaluate agreement (simple string match)
top_mouse_per_human['label_match'] = (
    top_mouse_per_human['mouse_label'].str.lower() == top_mouse_per_human['human_label'].str.lower()
)

In [33]:
# 7. Calculate accuracy
match_rate = top_mouse_per_human['label_match'].mean()

### exact match rate = 37%

In [34]:
# get mismatches
mismatches = top_mouse_per_human[~top_mouse_per_human['label_match']]

In [35]:
# Show most common mismatched pairs
mismatch_counts = (
    mismatches.groupby(['human_label', 'mouse_label'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

In [36]:
print("\nTop mismatches:")
print(mismatch_counts)


Top mismatches:
                                          human_label  \
53                                             neuron   
48                                             neuron   
43                 midbrain-derived inhibitory neuron   
5           L5/6 near-projecting glutamatergic neuron   
15                                               cell   
..                                                ...   
33                           hippocampal CA1-3 neuron   
34                           hippocampal CA1-3 neuron   
35                           hippocampal CA1-3 neuron   
38  intratelencephalic-projecting glutamatergic co...   
64                                 von Economo neuron   

                                          mouse_label  count  
53                               glutamatergic neuron     51  
48                                   GABAergic neuron     28  
43                                   GABAergic neuron     12  
5       near-projecting glutamatergic cortical

In [37]:
mismatch_counts.to_csv("mismatch_counts_all.tsv", sep="\t", index=False)

In [38]:
top_mouse_per_human.to_csv("human_remapping.tsv", sep="\t", index=False)

## mismatches again but removing broad human labels
broad is where cell_ontology_term = neuron or cell - to look at these separately

## also to look at mismatches where score is over some threshold - let's say 70 to start

## look at additional insight given by mouse data for Miscellaneous and Splatter subsets

In [39]:
broad_labels = ['CL:0000540', 'CL:0000000']

In [40]:
broad_human_clusters = top_mouse_per_human[top_mouse_per_human['cell_ontology_term_id'].isin(broad_labels)]

In [41]:
mismatches_broad = broad_human_clusters[~broad_human_clusters['label_match']]

In [42]:
broad_mismatch_counts = (
    mismatches_broad.groupby(['human_label', 'mouse_label'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)