# Cell Ontology Similarity Comparisons

In [1]:
# Import
import pandas as pd
import obonet
import networkx as nx
import os

In [13]:
# === Step 1: Load the CSV ===
# Comparison table; judging by the file name it pairs author annotations
# with Llama 4 predictions.
file_path = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/author_vs_Llama4.csv")
df = pd.read_csv(file_path)

# Load Label-to-ID Mappings from CSV
mapping_df = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv"))

# Skip incomplete mapping rows: a missing label is read as NaN (a float with
# no .lower()), and a missing ontology_id would become a NaN "ID" that is not
# a node of the ontology graph. mapping_df itself is left untouched.
mapping_complete = mapping_df.dropna(subset=['label', 'ontology_id'])
label_to_id = dict(zip(mapping_complete['label'], mapping_complete['ontology_id']))

# Case-insensitive lookup table. str() guards against labels that pandas
# parsed as a non-string type. If two labels differ only by case, the later
# row wins.
label_to_id_ci = {str(k).lower(): v for k, v in label_to_id.items()}

In [10]:
# Download the Cell Ontology OBO file and parse it into G, the graph that
# the similarity computation searches for paths between term IDs.
print('Loading Cell Ontology...')
url = 'http://purl.obolibrary.org/obo/cl.obo'
G = obonet.read_obo(url)
print('Ontology loaded.')

Loading Cell Ontology...
Ontology loaded.


In [31]:
# === Step 4: Compute Similarity ===
def compute_similarity(id1, id2, graph, max_depth=10):
    """Score how close two ontology terms are in ``graph``.

    The score is ``1 - path_length / max_depth`` clamped at 0, where
    ``path_length`` is ``nx.shortest_path_length(graph, id1, id2)``. Identical
    terms therefore score 1, and terms ``max_depth`` or more edges apart
    score 0.

    Args:
        id1: Source node ID, or None if the label could not be mapped.
        id2: Target node ID, or None if the label could not be mapped.
        graph: networkx graph whose nodes are ontology term IDs.
        max_depth: Path length at which the score reaches 0. Defaults to 10;
            adjust as needed for normalization.

    Returns:
        0 when either ID is None, when either ID is not a node of ``graph``,
        or when no path from ``id1`` to ``id2`` exists; otherwise a float in
        [0, 1].

    NOTE(review): on a directed graph the search follows edge direction, so
    the score is not symmetric and sibling terms may have no path at all.
    Confirm this is intended for the ontology graph passed in.
    """
    if id1 is None or id2 is None:
        return 0
    try:
        path_length = nx.shortest_path_length(graph, id1, id2)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # NetworkXNoPath: both nodes exist but are not connected.
        # NodeNotFound: an ID is not a node of the graph at all; treat it as
        # unmapped instead of letting one bad ID abort the caller's loop.
        return 0
    return max(1 - (path_length / max_depth), 0)

def _ontology_id_for(label):
    """Return the ontology ID mapped to ``label`` (case-insensitive), or None.

    Non-string labels (e.g. NaN read from an empty CSV cell) have no
    ``.lower()``; they are treated as unmapped instead of raising
    AttributeError.
    """
    if not isinstance(label, str):
        return None
    return label_to_id_ci.get(label.lower())


# Score every row: 'cell_type' holds the author label and '0' the Llama label.
# Iterating the two columns directly avoids building a Series per row, which
# is what iterrows() does.
similarity_scores = [
    compute_similarity(_ontology_id_for(author_label), _ontology_id_for(llama_label), G)
    for author_label, llama_label in zip(df['cell_type'], df['0'])
]

# Add to dataframe
df['similarity_score'] = similarity_scores

In [34]:
# Save Results as CSV
# Expand "~" explicitly, the same way the input paths are built, rather than
# relying on pandas to expand it implicitly.
output_file = os.path.expanduser('~/Göran_Karlsson_Lab/benchLLM/Simone_et_al/author_vs_llama4_similarity.csv')
df.to_csv(output_file, index=False)