# Gene Ontology Similarity Comparisons
In this notebook we will load in cell type annotations and an cell ontology term converter to analyse the similarity between author annotations and celltype annotation. Thi swill be done via obonet and networkx in order to base simlarity on biological similarities.

In [3]:
# Import
import pandas as pd
import obonet
import networkx as nx
import os

In [54]:
# Data for Author vs LLMs
# Load the CSV
file_path = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/author_vs_Llama4.csv")
df = pd.read_csv(file_path)

# Load Label-to-ID Mappings from CSV
mapping_df = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv"))
label_to_id = dict(zip(mapping_df['label'], mapping_df['ontology_id']))
label_to_id_ci = {k.lower(): v for k, v in label_to_id.items()}

In [146]:
# Data for Author vs CellTypist
# Load the CSV
file_path = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Heimlich_et_al/author_vs_celltypist_Heimlich_Immune_All_High.csv")
df = pd.read_csv(file_path)

# Load Label-to-ID Mappings from CSV for CellTypist
mapping_df = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/CellTypist_to_Cell_Ontology.csv"))
#label_to_id = dict(zip(mapping_df['Low-hierarchy cell types'], mapping_df['Cell Ontology ID']))
label_to_id = (mapping_df
    .groupby('High-hierarchy cell types')['Cell Ontology ID']
    .apply(set)
    .to_dict()
)
label_to_id_ci = {k.lower(): v for k, v in label_to_id.items()}

mapping_df_2 = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv"))
label_to_id_2 = dict(zip(mapping_df_2['label'], mapping_df_2['ontology_id']))
label_to_id_ci_2 = {k.lower(): v for k, v in label_to_id_2.items()}

In [11]:
# Load Cell Ontology
url = 'http://purl.obolibrary.org/obo/cl.obo'
print('Loading Cell Ontology...')
G = obonet.read_obo(url)
print('Ontology loaded.')

Loading Cell Ontology...
Ontology loaded.


In [55]:
# Compute Similarity for Author vs LLM
def compute_similarity(id1, id2, graph):
    if id1 is None or id2 is None:
        return 0
    try:
        path_length = nx.shortest_path_length(graph, id1, id2)
        max_depth = 10  # adjust as needed for normalization
        similarity = 1 - (path_length / max_depth)
        return max(similarity, 0)
    except nx.NetworkXNoPath:
        return 0

similarity_scores = []
for _, row in df.iterrows():
    author_label = row['cell_type'].lower()
    llama_label = row['0'].lower()
    
    id1 = label_to_id_ci.get(author_label)
    id2 = label_to_id_ci.get(llama_label)
    
    sim = compute_similarity(id1, id2, G)
    similarity_scores.append(sim)

# Add to dataframe
df['similarity_score'] = similarity_scores

In [144]:
# Compute Similarity for Author vs CellTypist (High Granularity)
def compute_similarity(id1, id2, graph):
    if id1 is None or id2 is None:
        return 0
    try:
        path_length = nx.shortest_path_length(graph, id1, id2)
        max_depth = 10  # adjust as needed for normalization
        similarity = 1 - (path_length / max_depth)
        return max(similarity, 0)
    except nx.NetworkXNoPath:
        return 0

similarity_scores = []
for _, row in df.iterrows():
    author_label = row['cell_type'].lower()
    cellTypist_label = row['cluster_cell_type'].lower()
    
    id1 = label_to_id_ci_2.get(author_label)
    id2 = label_to_id_ci.get(cellTypist_label)
    
    sim = compute_similarity(id1, id2, G)
    similarity_scores.append(sim)

# Add to dataframe
df['similarity_score'] = similarity_scores

In [147]:
# Compute Similarity for Author vs CellTypist (Low Granularity)
similarity_scores = []

for _, row in df.iterrows():
    author_label = row['cell_type'].lower()
    cellTypist_label = row['cluster_cell_type'].lower()
    
    ids1 = label_to_id_ci_2.get(author_label, set())
    ids2 = label_to_id_ci.get(cellTypist_label, set())
    
    # Ensure they are always sets
    if isinstance(ids1, str):
        ids1 = {ids1}
    if isinstance(ids2, str):
        ids2 = {ids2}
    
    max_sim = 0
    for id1 in ids1:
        for id2 in ids2:
            if id1 not in G or id2 not in G:
                continue
            sim = compute_similarity(id1, id2, G)
            print(f"Comparing {id1} vs {id2}: sim={sim}")
            if sim > max_sim:
                max_sim = sim
    
    print(f"Row: {author_label} vs {cellTypist_label} → max_sim={max_sim}")
    similarity_scores.append(max_sim)

df['similarity_score'] = similarity_scores


Comparing CL:0000233 vs CL:0000556: sim=0.9
Row: platelet vs megakaryocytes/platelets → max_sim=0.9
Comparing CL:0000236 vs CL:0000843: sim=0
Comparing CL:0000236 vs CL:0000787: sim=0
Comparing CL:0000236 vs CL:0000818: sim=0
Comparing CL:0000236 vs CL:0000236: sim=1.0
Comparing CL:0000236 vs CL:0000788: sim=0
Comparing CL:0000236 vs CL:0000844: sim=0
Row: b cell vs b cells → max_sim=1.0
Comparing CL:0001056 vs CL:0001058: sim=0
Row: dendritic cell vs pdc → max_sim=0
Comparing CL:0000623 vs CL:0001077: sim=0
Comparing CL:0000623 vs CL:0001081: sim=0
Comparing CL:0000623 vs CL:0001078: sim=0
Comparing CL:0000623 vs CL:0001065: sim=0.8
Comparing CL:0000623 vs CL:0000823: sim=0
Comparing CL:0000623 vs CL:0000623: sim=1.0
Row: natural killer cell vs ilc → max_sim=1.0
Comparing CL:0000624 vs CL:0000934: sim=0
Comparing CL:0000624 vs CL:0002038: sim=0
Comparing CL:0000624 vs CL:0000798: sim=0
Comparing CL:0000624 vs CL:0000814: sim=0
Comparing CL:0000624 vs CL:0000907: sim=0
Comparing CL:000

In [34]:
# Save Results as CSV
output_file = '~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/author_vs_llama4_similarity.csv'
df.to_csv(output_file, index=False)