# Cell Ontology Similarity from CSV
A notebook that assesses cell-type similarity from a CSV containing two cell-type columns, using Owlready2 to query the Cell Ontology.

In [2]:
import pandas as pd
import numpy as np
import os
import glob
import re
from rapidfuzz import process
from owlready2 import get_ontology

In [8]:
## Load Datasets and Mapping Reference (Text to Cell Ontology ID)
# Directory that holds the comparison CSVs, and the glob pattern selecting them
base_dir = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Granja_et_al")
input_pattern = os.path.join(base_dir, "author_vs_*.csv")

# Mapping reference: free-text label -> ontology ID
mapping_df = pd.read_csv(
    os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv")
)
label_to_id = {
    label: ontology_id
    for label, ontology_id in zip(mapping_df['label'], mapping_df['ontology_id'])
}
# Same mapping keyed by lower-cased label, so lookups are case-insensitive
label_to_id_ci = {label.lower(): ontology_id for label, ontology_id in label_to_id.items()}

In [4]:
# Load Cell Ontology Map (OWL)
# The ontology is addressed by its OBO PURL, so this cell presumably needs
# network access on first load — NOTE(review): consider pointing at a local copy
# of cl.owl to keep Restart & Run All fast and reproducible.
print("Loading Cell Ontology (OWL)...")
# G is the loaded ontology; it is later passed as `onto` to compute_similarity.
G = get_ontology("http://purl.obolibrary.org/obo/cl.owl").load()
print("Ontology loaded.")

Loading Cell Ontology (OWL)...
Ontology loaded.


In [9]:
def get_ancestors(cls):
    """Return the ancestors of an ontology class as a set.

    Parameters
    ----------
    cls : object or None
        Any object exposing an ``ancestors()`` method. A falsy value
        (e.g. ``None``) yields an empty set.

    Returns
    -------
    set
        ``set(cls.ancestors())``, or an empty set when ``cls`` is falsy.
    """
    if not cls:
        return set()
    return set(cls.ancestors())


def _clean_term_id(term_id):
    """Return ``term_id`` as a stripped, non-empty string, or ``None`` if unusable.

    Non-string values (e.g. a NaN read from an empty CSV field) and blank
    strings are treated as missing.
    """
    if not isinstance(term_id, str):
        return None
    term_id = term_id.strip()
    return term_id or None


def compute_similarity(id1, id2, onto):
    """Jaccard similarity between the ancestor sets of two ontology terms.

    Parameters
    ----------
    id1, id2 : str or None
        Compact term IDs such as ``"CL:0000037"``. ``None``, non-string and
        blank values are treated as missing.
    onto : object
        Ontology (or any object) providing ``search_one(iri=...)``.

    Returns
    -------
    int or float
        ``0`` when either ID is missing or cannot be resolved in ``onto``;
        otherwise ``len(A1 & A2) / len(A1 | A2)`` for the two ancestor sets.
    """
    id1 = _clean_term_id(id1)
    id2 = _clean_term_id(id2)
    # A blank ID must never reach search_one: the pattern would collapse to a
    # bare "*" and match an arbitrary entity, producing a meaningless score.
    if id1 is None or id2 is None:
        return 0

    # IRIs use "_" where the compact ID uses ":"; the leading "*" is a
    # wildcard standing in for the IRI prefix.
    class1 = onto.search_one(iri="*" + id1.replace(":", "_"))
    class2 = onto.search_one(iri="*" + id2.replace(":", "_"))

    if not class1 or not class2:
        return 0

    ancestors1 = get_ancestors(class1)
    ancestors2 = get_ancestors(class2)

    union = ancestors1 | ancestors2
    if not union:
        return 0

    return len(ancestors1 & ancestors2) / len(union)  # Jaccard similarity

def normalize_label(label):
    """Canonicalize a raw cell-type label for dictionary lookup.

    The value is converted with ``str()``, lower-cased and stripped, every
    parenthesised fragment (with any whitespace directly before it) is
    removed, and a single trailing ``s`` is dropped when the result is longer
    than one character.

    Parameters
    ----------
    label : object
        Raw label; non-strings are stringified (``None`` -> ``"none"``,
        NaN -> ``"nan"``).

    Returns
    -------
    str
        The normalized label.
    """
    text = str(label).lower().strip()
    text = re.sub(r'\s*\(.*?\)', '', text)
    drop_trailing_s = len(text) > 1 and text.endswith('s')
    return text[:-1] if drop_trailing_s else text

def fuzzy_match_label(label, choices, threshold=80):
    """Return the best fuzzy match for ``label`` among ``choices``, if good enough.

    Parameters
    ----------
    label : str or None
        Query string; ``None`` short-circuits to ``None``.
    choices : iterable
        Candidates handed to ``process.extractOne``.
    threshold : int or float, default 80
        Minimum score (inclusive) for a match to be accepted.

    Returns
    -------
    object or None
        The matched choice when its score is at least ``threshold``;
        ``None`` when there is no result, the result tuple has an unexpected
        length, or the score is too low.
    """
    if label is None:
        return None

    best = process.extractOne(label, choices)
    # Accepted shapes: (match, score) or (match, score, index)
    if best is None or len(best) not in (2, 3):
        return None

    candidate, score = best[0], best[1]
    return candidate if score >= threshold else None


def _resolve_label(raw_label):
    """Map one raw CSV cell to ``(label, ontology_id, fuzzy_used)``.

    Missing values (anything ``pd.isna`` flags, or text normalizing to
    ``"nan"``/``"none"``) give ``(None, None, False)``. Otherwise the
    normalized label is looked up in ``label_to_id_ci``; if there is no exact
    hit, the closest key found by ``fuzzy_match_label`` is used instead.
    """
    # Test the raw value: once stringified, pd.isna() can never be true and
    # values such as pd.NA would slip through as ordinary labels.
    if pd.isna(raw_label):
        return None, None, False

    label = normalize_label(raw_label)
    if label in ("nan", "none"):
        return None, None, False

    term_id = label_to_id_ci.get(label) if label else None
    if term_id is not None:
        return label, term_id, False

    matched_label = fuzzy_match_label(label, label_to_id_ci.keys())
    # .get(None) returns None, so an unmatched label ends up with no ID.
    return label, label_to_id_ci.get(matched_label), bool(matched_label)


# Score every "author_vs_*.csv" file: the first column is taken as the author
# label and the second as the LLM/tool label. Sorted so that runs are
# reproducible (glob returns paths in arbitrary order).
for file_path in sorted(glob.glob(input_pattern)):
    file_name = os.path.basename(file_path)
    # Skip outputs of a previous run.
    if '_similarity' in file_name:
        continue

    print(f"\nProcessing: {file_name}")
    df = pd.read_csv(file_path)

    colnames = df.columns.tolist()
    if len(colnames) < 2:
        print(f"  Skipping {file_path} — not enough columns.")
        continue

    author_col, llm_col = colnames[0], colnames[1]

    similarity_scores = []
    # Iterate column-wise so each value keeps its own column dtype.
    for author_raw, llm_raw in zip(df[author_col], df[llm_col]):
        author_label, id1, author_fuzzy = _resolve_label(author_raw)
        llm_label, id2, llm_fuzzy = _resolve_label(llm_raw)
        fuzzy_used = author_fuzzy or llm_fuzzy

        sim = compute_similarity(id1, id2, G)
        print(f"Comparing {author_label} ({id1}) vs {llm_label} ({id2}) (fuzzy={fuzzy_used}): sim={sim}")
        similarity_scores.append(sim)

    df['similarity_score'] = similarity_scores

    # NOTE(review): saving is currently disabled, so the scores only appear in
    # the printed log and in the last `df` left in the kernel.
    #out_path = file_path.replace(".csv", "_similarity.csv")
    #df.to_csv(out_path, index=False)
    #print(f"  Saved: {os.path.basename(out_path)}")


Processing: author_vs_celltypist_Roy_Immune_All_High_new.csv
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000037) (fuzzy=True): sim=1.0
Comparing 02_early.eryth (None) vs memp (CL:0000050) (fuzzy=True): sim=0
Comparing 03_late.eryth (None) vs mid erythroid (CL:0000038) (fuzzy=True): sim=0
Comparing 04_early.baso (None) vs mast cell (CL:0000097) (fuzzy=False): sim=0
Comparing 05_cmp.lmpp (CL:0001059) vs hsc/mpp (CL:0000037) (fuzzy=True): sim=0.6842105263157895
Comparing 06_clp.1 (CL:0000051) vs pro-b cell (CL:0000826) (fuzzy=True): sim=0.7222222222222222
Comparing 07_gmp (CL:0000557) vs classical monocyte (CL:0000860) (fuzzy=True): sim=0.3939393939393939
Comparing 08_gmp.neut (CL:0000557) vs neutrophil-myeloid progenitor (CL:0000775) (fuzzy=True): sim=0.39285714285714285
Comparing 09_pdc (CL:0000784) vs pdc (CL:0000784) (fuzzy=True): sim=1.0
Comparing 10_cdc (CL:0000990) vs dc2 (CL:0000784) (fuzzy=True): sim=0.9130434782608695
Comparing 11_cd14.mono.1 (None) vs classical monocyte (CL: