# load and clean data

In [6]:
from pathlib import Path

import pandas as pd
# Step 1: Load the cluster-by-cluster score table; the first TSV column becomes the row index
mapping_wide = pd.read_csv("sm_cluster.mapping_table.tsv", sep="\t", index_col=0)
# Step 2: Reshape wide → long, one row per (cluster_1, cluster_2, score).
# The index is named explicitly before reset_index(), so the id column is always
# 'cluster_1' whether or not the TSV header gives its first column a name
# (relying on the default 'index' column name breaks when the header names it).
mapping_long = (
    mapping_wide
    .rename_axis(index='cluster_1')
    .reset_index()
    .melt(id_vars='cluster_1', var_name='cluster_2', value_name='score')
)
# Step 3: Identify species based on prefix
def get_species(label):
    """Classify a cluster label by its species prefix.

    Parameters
    ----------
    label : str
        Cluster label, e.g. one starting with "mm_" or "hs_".

    Returns
    -------
    str
        "mouse" for an "mm_" prefix, "human" for an "hs_" prefix and
        "unknown" for anything else, including non-string values such as
        NaN or None.
    """
    # A non-string label has no prefix to inspect; treat it as unrecognised
    # rather than letting .startswith raise AttributeError.
    if not isinstance(label, str):
        return "unknown"
    if label.startswith("mm_"):
        return "mouse"
    if label.startswith("hs_"):
        return "human"
    return "unknown"
# Tag both sides of every pair with the species implied by its label prefix
mapping_long['species_1'] = mapping_long['cluster_1'].map(get_species)
mapping_long['species_2'] = mapping_long['cluster_2'].map(get_species)
# Step 4: Keep cross-species pairs only (mouse/human in either order);
# same-species pairs and pairs with an 'unknown' side are dropped
mouse_then_human = mapping_long['species_1'].eq('mouse') & mapping_long['species_2'].eq('human')
human_then_mouse = mapping_long['species_1'].eq('human') & mapping_long['species_2'].eq('mouse')
mapping_long = mapping_long.loc[mouse_then_human | human_then_mouse].copy()
# Step 5: Standardize columns: always make mouse → human direction
# If cluster_1 is mouse and cluster_2 is human, keep as is
# If it's reversed, swap them
def reorder_clusters(row):
    """Return one mapping row in mouse → human orientation.

    Parameters
    ----------
    row : pandas.Series
        Must provide 'cluster_1', 'cluster_2', 'score' and 'species_1'.

    Returns
    -------
    pandas.Series
        Entries 'mouse_cluster', 'human_cluster' and 'score'. When
        'species_1' is 'mouse' the clusters keep their order; otherwise
        they are swapped.
    """
    if row['species_1'] == 'mouse':
        mouse_key, human_key = 'cluster_1', 'cluster_2'
    else:
        mouse_key, human_key = 'cluster_2', 'cluster_1'
    return pd.Series({
        'mouse_cluster': row[mouse_key],
        'human_cluster': row[human_key],
        'score': row['score'],
    })

# Vectorised equivalent of applying reorder_clusters to every row: the mouse label
# comes from whichever side is mouse and the human label from the other side.
# Building the frame column-wise avoids a Python call per row and always yields
# exactly these three columns, independent of how DataFrame.apply treats an
# empty frame. The row index of mapping_long is preserved.
cluster_1_is_mouse = mapping_long['species_1'] == 'mouse'
mapping_clean = pd.DataFrame({
    'mouse_cluster': mapping_long['cluster_1'].where(cluster_1_is_mouse, mapping_long['cluster_2']),
    'human_cluster': mapping_long['cluster_2'].where(cluster_1_is_mouse, mapping_long['cluster_1']),
    'score': mapping_long['score'],
})


In [8]:
# 1. Read the human cluster annotations and rename the two columns used below
human_labels = (
    pd.read_csv("1_final_celltype.tsv", sep="\t")
    .rename(columns={'Cluster ID': 'human_cluster_id', 'cell_ontology_term': 'human_label'})
)
# 2. Pull the numeric id out of each human cluster name ('hs_32' → 32) so it can
#    be joined against human_labels
mapping_clean['human_cluster_id'] = (
    mapping_clean['human_cluster']
    .str.extract(r'hs_(\d+)', expand=False)
    .astype(int)
)
# 3. Best mouse hit per human cluster: rank every pair by descending score, then
#    take the first entry of each column within each human_cluster_id group
pairs_by_score = mapping_clean.sort_values(by='score', ascending=False)
top_mouse_per_human = pairs_by_score.groupby('human_cluster_id').first().reset_index()
# 4. Attach the human annotation columns; the left join keeps clusters that have
#    no row in human_labels
top_mouse_per_human = pd.merge(top_mouse_per_human, human_labels, on='human_cluster_id', how='left')
# 5. Clean mouse label: strip only the leading 'mm_' species prefix so the label
#    can be compared with the human one. The pattern is anchored at the start, so
#    an 'mm_' that occurs later inside a label is left intact (a plain substring
#    replace would delete every occurrence).
top_mouse_per_human['mouse_label'] = top_mouse_per_human['mouse_cluster'].str.replace(r'^mm_', '', regex=True)
# 6. Evaluate agreement: exact string match only, ignoring case
top_mouse_per_human['label_match'] = (
    top_mouse_per_human['mouse_label'].str.lower() == top_mouse_per_human['human_label'].str.lower()
)


# calculate accuracy
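Exact-match rate (share of human clusters whose top-scoring mouse cluster has the same label, ignoring case) = 37%, which is surprisingly high.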

In [9]:
# Fraction of rows in top_mouse_per_human flagged as an exact label match
match_rate = top_mouse_per_human.loc[:, 'label_match'].mean()

# breakdown mismatches (all data)

In [10]:
# Human clusters whose top-scoring mouse cluster carries a different label
mismatches = top_mouse_per_human[~top_mouse_per_human['label_match']]
# Count each (human_label, mouse_label) combination, most frequent first.
# NOTE(review): groupby drops rows whose key is NaN by default, so clusters
# without a human_label would not appear in these counts — confirm that is intended.
mismatch_counts = (
    mismatches.groupby(['human_label', 'mouse_label'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
# Create the output directory if it is missing so the write also succeeds on a
# fresh checkout (to_csv does not create parent directories itself)
Path("outputs").mkdir(parents=True, exist_ok=True)
mismatch_counts.to_csv("outputs/mismatch_counts_all.tsv", sep="\t", index=False)

# breakdown mismatches again but removing broad human labels
Broad labels are those where `cell_ontology_term` is `neuron` or `cell` (term IDs `CL:0000540` and `CL:0000000`); these clusters are looked at separately.

In [11]:
# Ontology term ids treated as "broad" and excluded from this breakdown
broad_labels = ['CL:0000540', 'CL:0000000']
# Keep only human clusters annotated with something other than a broad term
specific_human_clusters = top_mouse_per_human.loc[
    ~top_mouse_per_human['cell_ontology_term_id'].isin(broad_labels)
]
# Of those, the ones where mouse and human labels disagree
mismatches_specific = specific_human_clusters.loc[~specific_human_clusters['label_match']]
# Tally each (human_label, mouse_label) pair, largest count first
specific_pair_sizes = mismatches_specific.groupby(['human_label', 'mouse_label']).size()
specific_mismatch_counts = (
    specific_pair_sizes
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)

In [13]:
# Fraction of exact label matches among the specific (non-broad) human clusters
match_rate_specific = specific_human_clusters.loc[:, 'label_match'].mean()

# accuracy after removing broad human terms
The exact-match rate is now 49%.

# look into these mismatches and score them as complete mismatch or fuzzy mismatch - manual, see tsv file
These could be broken into three categories: close match, not similar but not mutually exclusive, and complete discordance.

In [12]:
# Save the specific-label mismatch tally as a tab-separated file (no index column)
specific_mismatch_counts.to_csv(
    path_or_buf="outputs/mismatch_counts_specific.tsv",
    sep="\t",
    index=False,
)

# look at additional insight given by mouse data for Miscellaneous and Splatter subsets

In [14]:
# Ontology term ids treated as "broad"; this cell looks only at those clusters
broad_labels = ['CL:0000540', 'CL:0000000']
broad_human_clusters = top_mouse_per_human.loc[
    top_mouse_per_human['cell_ontology_term_id'].isin(broad_labels)
]
# Broad clusters whose top mouse label differs from the human label
mismatches_broad = broad_human_clusters.loc[~broad_human_clusters['label_match']]
# Tally each (human_label, mouse_label) pair, largest count first
broad_pair_sizes = mismatches_broad.groupby(['human_label', 'mouse_label']).size()
broad_mismatch_counts = (
    broad_pair_sizes
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
# Save the tally as a tab-separated file without the index column
broad_mismatch_counts.to_csv(path_or_buf="outputs/mismatch_counts_broad.tsv", sep="\t", index=False)

# look at Neurotransmitter auto_annotation to confirm some mouse predictions
Maybe David has some insight on what to do for the other predictions.

# maybe implement a score threshold and redo some of the previous breakdowns