# Cell Ontology Similarity from CSV
A notebook that assesses cell similarities from a .csv containing two congruent columns of cell types. Jaccard similarity is calculated using Owlready. 
<br> <b/> N.B. </b> For cell ontology labelling from general sources utilise the upper cell and for cell ontology labelling from CellTypist labelled data use the lower cell. Automatic choice based on file name are the bottom cells.
<br> <b/> N.B. </b> CellTypist granularity (low and high immune reference) can be changed affceting similarity.

In [3]:
import pandas as pd
import numpy as np
import os
import glob
import re
from rapidfuzz import process
from owlready2 import get_ontology

In mapping refernece choose which 'label_to_id_2' depending on cell typist granularity

In [8]:
## Load Datasets and Mapping Reference (Text to Cell Ontology ID)
# Base Directory of Dataset
base_dir = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Granja_et_al")
input_pattern = os.path.join(base_dir, "author_vs_*.csv")
input_pattern_cellTypist = os.path.join(base_dir, "author_vs_celltypist_*.csv")

## Mapping Reference 
# General 
mapping_df = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv"))
label_to_id = dict(zip(mapping_df['label'], mapping_df['ontology_id']))
label_to_id_ci = {k.lower(): v for k, v in label_to_id.items()} # Remove Case Sentivity

# CellTypist (Switch 'label_to_id_2' Depending on Granularity of CellTypist Annotation)
mapping_df_2 = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/CellTypist_to_Cell_Ontology.csv"))
#label_to_id_2 = dict(zip(mapping_df_2['Low-hierarchy cell types'], mapping_df_2['Cell Ontology ID'])) # High Granularity
label_to_id_2 = (mapping_df_2
    .groupby('High-hierarchy cell types')['Cell Ontology ID']
    .apply(set)
    .to_dict()
) # Low Granularity
label_to_id_ci_2 = {k.lower(): v for k, v in label_to_id_2.items()}

In [3]:
# Load Cell Ontology Map (OWL)
print("Loading Cell Ontology (OWL)...")
G = get_ontology("http://purl.obolibrary.org/obo/cl.owl").load()
print("Ontology loaded.")

Loading Cell Ontology (OWL)...
Ontology loaded.


General Purpose Cell Ontology Based Similarity

In [131]:
def get_ancestors(cls):
    return set(cls.ancestors()) if cls else set()

def compute_similarity(id1, id2, onto):
    if id1 is None or id2 is None:
        return 0

    class1 = onto.search_one(iri="*" + id1.replace(":", "_")) # Convert ID1 to be Compatible with OWL
    class2 = onto.search_one(iri="*" + id2.replace(":", "_")) # Convert ID2 to be Compatible with OWL

    if not class1 or not class2:
        return 0

    ancestors1 = get_ancestors(class1)
    ancestors2 = get_ancestors(class2)

    intersection = ancestors1 & ancestors2
    union = ancestors1 | ancestors2

    if not union:
        return 0

    return len(intersection) / len(union)  # Jaccard Similarity

def normalize_label(label):
    label = str(label).lower().strip() # Remove Case Sensitivity
    label = re.sub(r'\s*\(.*?\)', '', label)  # Remove Special Characters
    if label.endswith('s') and len(label) > 1:
        label = label[:-1]
    return label # Remove Plural

def fuzzy_match_label(label, choices, threshold=80):
    if label is None:
        return None
    result = process.extractOne(label, choices)
    if result is None:
        return None
    if len(result) == 2:
        match, score = result
    elif len(result) == 3:
        # Sometimes it returns (match, score, index)
        match, score, _ = result
    else:
        return None
    if score >= threshold:
        return match
    return None # Use Fuzzy Match if Exact Match is Null


for file_path in glob.glob(input_pattern):
    if '_similarity' in os.path.basename(file_path) or '_celltypist_' in os.path.basename(file_path):
        continue

    print(f"\nProcessing: {os.path.basename(file_path)}")
    df = pd.read_csv(file_path)

    colnames = df.columns.tolist()
    if len(colnames) < 2:
        print(f"  Skipping {file_path} — not enough columns.")
        continue

    author_col = colnames[0]
    llm_col = colnames[1]

    similarity_scores = []
    for _, row in df.iterrows():
        author_label = normalize_label(row[author_col])
        llm_label = normalize_label(row[llm_col])

        if author_label in ["nan", "none"] or pd.isna(author_label):
            author_label = None
        if llm_label in ["nan", "none"] or pd.isna(llm_label):
            llm_label = None

        fuzzy_used_author = False
        fuzzy_used_llm = False

        id1 = label_to_id_ci.get(author_label) if author_label else None
        id2 = label_to_id_ci.get(llm_label) if llm_label else None

        if id1 is None and author_label is not None:
            matched_label = fuzzy_match_label(author_label, label_to_id_ci.keys())
            if matched_label:
                fuzzy_used_author = True
            id1 = label_to_id_ci.get(matched_label)

        if id2 is None and llm_label is not None:
            matched_label = fuzzy_match_label(llm_label, label_to_id_ci.keys())
            if matched_label:
                fuzzy_used_llm = True
            id2 = label_to_id_ci.get(matched_label)

        sim = compute_similarity(id1, id2, G)
        print(f"Comparing {author_label} ({id1}) vs {llm_label} ({id2}) (fuzzy author ={fuzzy_used_author} and fuzzy llm ={fuzzy_used_llm}): sim={sim}")
        similarity_scores.append(sim)


    df['similarity_score'] = similarity_scores

    #out_path = file_path.replace(".csv", "_similarity.csv")
    #df.to_csv(out_path, index=False)
    #print(f"  Saved: {os.path.basename(out_path)}")


Processing: author_vs_Gemini2_5.csv
Comparing 01_hsc (CL:0000037) vs leukemic blast (CL:4070012) (fuzzy author =True and fuzzy llm =True): sim=0.3333333333333333
Comparing 02_early.eryth (None) vs erythroid progenitor cell (CL:0000038) (fuzzy author =False and fuzzy llm =False): sim=0
Comparing 03_late.eryth (None) vs erythroid lineage cell (CL:0000764) (fuzzy author =False and fuzzy llm =False): sim=0
Comparing 04_early.baso (None) vs None (None) (fuzzy author =False and fuzzy llm =False): sim=0
Comparing 05_cmp.lmpp (CL:0001059) vs mixed-phenotype acute leukemia blast (CL:0000055) (fuzzy author =True and fuzzy llm =True): sim=0.47058823529411764
Comparing 06_clp.1 (CL:0000051) vs b-lymphoid progenitor (CL:0000050) (fuzzy author =True and fuzzy llm =True): sim=0.8235294117647058
Comparing 07_gmp (CL:0000557) vs myeloid blast (CL:0000835) (fuzzy author =True and fuzzy llm =True): sim=0.56
Comparing 08_gmp.neut,"mixed-phenotype acute leukemia blasts, b/myeloid" (None) vs None (None) (f

CellTypist Annotated Cell Ontology Based Similarity

In [132]:
for file_path in glob.glob(input_pattern_cellTypist):
    if '_similarity' in os.path.basename(file_path):
        continue

    print(f"\nProcessing: {os.path.basename(file_path)}")
    df = pd.read_csv(file_path)

    colnames = df.columns.tolist()
    if len(colnames) < 2:
        print(f"  Skipping {file_path} — not enough columns.")
        continue

    author_col = colnames[0]
    cellTypist_col = colnames[1]

    similarity_scores = []

    for _, row in df.iterrows():
        author_label = normalize_label(row[author_col])
        cellTypist_label = normalize_label(row[cellTypist_col])

        fuzzy_used = False

        ids1 = label_to_id_ci.get(author_label)
        ids2 = label_to_id_ci_2.get(cellTypist_label)

        # Fuzzy match fallback if not found
        if not ids1 and author_label:
            match = fuzzy_match_label(author_label, label_to_id_ci.keys())
            if match:
                ids1 = label_to_id_ci[match]
                fuzzy_used_author = True

        if not ids2 and llm_label:
            match = fuzzy_match_label(cellTypist_label, label_to_id_ci_2.keys())
            if match:
                ids2 = label_to_id_ci_2[match]
                fuzzy_used_cellTypist = True

        # Normalize to sets
        if isinstance(ids1, str):
            ids1 = {ids1}
        if isinstance(ids2, str):
            ids2 = {ids2}

        max_sim = 0
        if ids1 and ids2:
            for id1 in ids1:
                for id2 in ids2:
                    sim = compute_similarity(id1, id2, G)
                    print(f"Comparing {author_label} ({id1}) vs {cellTypist_label} ({id2}) (Author Fuzzy={fuzzy_used_author} and Cell Typist Fuzzy={fuzzy_used_cellTypist}): sim={sim}")
                    if sim > max_sim:
                        max_sim = sim
                       
        else:
            print(f"Skipping row: {author_label} or {cellTypist_label} not found.")
        
        print(f"Max Sim {max_sim}") 
        similarity_scores.append(max_sim)

    df['similarity_score'] = similarity_scores
    #out_path = file_path.replace(".csv", "_similarity.csv")
    #df.to_csv(out_path, index=False)
    #print(f"  Saved: {os.path.basename(out_path)}")



Processing: author_vs_celltypist_Roy_Immune_All_High_new.csv
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000049) (Author Fuzzy=True and Cell Typist Fuzzy=True): sim=0.6111111111111112
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000557) (Author Fuzzy=True and Cell Typist Fuzzy=True): sim=0.5416666666666666
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000837) (Author Fuzzy=True and Cell Typist Fuzzy=True): sim=0.6470588235294118
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000936) (Author Fuzzy=True and Cell Typist Fuzzy=True): sim=0.7222222222222222
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000834) (Author Fuzzy=True and Cell Typist Fuzzy=True): sim=0.6190476190476191
Comparing 01_hsc (CL:0000037) vs hsc/mpp (CL:0000050) (Author Fuzzy=True and Cell Typist Fuzzy=True): sim=0.6842105263157895
Max Sim 0.7222222222222222
Skipping row: 02_early.eryth or memp not found.
Max Sim 0
Skipping row: 03_late.eryth or mid erythroid not found.
Max Sim 0
Skipping row: 04_early.baso or mast c

## Automatic Classifcation
Cell ontology based simlarity via a optimised system (celltypist annotation is detected and processed seperately via file name)

In [7]:
## Load Datasets and Mapping Reference (Text to Cell Ontology ID)
# Base Directory of Dataset
base_dir = os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Heimlich_et_al")
input_pattern = os.path.join(base_dir, "author_vs_*.csv")

## Mapping Reference 
# General 
mapping_df = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv"))
# CellTypist
mapping_df_cellTypist = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/CellTypist_to_Cell_Ontology.csv"))


In [4]:
# Load Cell Ontology Map (OWL)
print("Loading Cell Ontology (OWL)...")
G = get_ontology("http://purl.obolibrary.org/obo/cl.owl").load()
print("Ontology loaded.")

Loading Cell Ontology (OWL)...
Ontology loaded.


In [5]:
def get_ancestors(cls):
    return set(cls.ancestors()) if cls else set()

def compute_similarity(id1, id2, onto):
    if id1 is None or id2 is None:
        return 0

    class1 = onto.search_one(iri="*" + id1.replace(":", "_")) # Convert ID1 to be Compatible with OWL
    class2 = onto.search_one(iri="*" + id2.replace(":", "_")) # Convert ID2 to be Compatible with OWL

    if not class1 or not class2:
        return 0

    ancestors1 = get_ancestors(class1)
    ancestors2 = get_ancestors(class2)

    intersection = ancestors1 & ancestors2
    union = ancestors1 | ancestors2

    if not union:
        return 0

    return len(intersection) / len(union)  # Jaccard Similarity

def normalize_label(label):
    label = str(label).lower().strip()                      # Remove Case Sensitivity and Trim
    label = re.sub(r'^\d+_?', '', label)                    # Remove Leading Numbers and Underscore
    label = re.sub(r'\s*\(.*?\)', '', label)                # Remove text in parentheses
    #label = re.sub(r'"', '', label)                # Remove Quotation Marks
    if label.endswith('s') and len(label) > 1:
        label = label[:-1]                                  # Remove Plural
    return label.strip()

#def normalize_label(label):
#    label = str(label).lower().strip() # Remove Case Sensitivity
#    label = re.sub(r'\s*\(.*?\)', '', label)  # Remove Special Characters
#    if label.endswith('s') and len(label) > 1:
#        label = label[:-1]
#    return label # Remove Plural

def fuzzy_match_label(label, choices, threshold=80):
    if label is None:
        return None
    result = process.extractOne(label, choices)
    if result is None:
        return None
    if len(result) == 2:
        match, score = result
    elif len(result) == 3:
        # Sometimes it returns (match, score, index)
        match, score, _ = result
    else:
        return None
    if score >= threshold:
        return match
    return None # Use Fuzzy Match if Exact Match is Null
    

In [19]:
# Read in all .CSVs that are relavent
for file_path in glob.glob(input_pattern):
    basename = os.path.basename(file_path)
    
    if '_similarity' in basename:
        continue # Exclude already completed file (Already made similarity files)
    
    if '_celltypist_' in basename:
        if 'Low' in basename:
            # Isolate CellTypist Files with High Granularity
            print(f'\nProcessing: {basename} as High Granularity CellTypist Annotation')
            label_to_id_high = dict(zip(mapping_df_cellTypist['Low-hierarchy cell types'], mapping_df_cellTypist['Cell Ontology ID']))
            label_to_id_ci_high = {k.lower(): v for k, v in label_to_id_high.items()}
            
            df = pd.read_csv(file_path)

            colnames = df.columns.tolist()
            if len(colnames) < 2:
                print(f"  Skipping {file_path} — not enough columns.")
                continue

            author_col = colnames[0]
            cellTypist_high_col = colnames[1]

            similarity_scores = []
            for _, row in df.iterrows():
                author_label = normalize_label(row[author_col])
                cellTypist_high_label = normalize_label(row[cellTypist_high_col])

                if author_label in ["nan", "none"] or pd.isna(author_label):
                    author_label = None
                if cellTypist_high_label in ["nan", "none"] or pd.isna(cellTypist_high_label):
                    cellTypist_high_label = None

                fuzzy_used_author = False
                fuzzy_used_cellTypist_high = False

                id1 = label_to_id_ci.get(author_label) if author_label else None
                id2 = label_to_id_ci_high.get(cellTypist_high_label) if cellTypist_high_label else None

                if id1 is None and author_label is not None:
                    matched_label = fuzzy_match_label(author_label, label_to_id_ci.keys())
                    if matched_label:
                        fuzzy_used_author = True
                    id1 = label_to_id_ci.get(matched_label)

                if id2 is None and cellTypist_high_label is not None:
                    matched_label = fuzzy_match_label(cellTypist_high_label, label_to_id_ci_low.keys())
                    if matched_label:
                        fuzzy_used_cellTypist_high = True
                    id2 = label_to_id_ci_high.get(matched_label)

                sim = compute_similarity(id1, id2, G)
                print(f"Comparing {author_label} ({id1}) vs {cellTypist_high_label} ({id2}) (fuzzy author ={fuzzy_used_author} and fuzzy llm ={fuzzy_used_cellTypist_high}): sim={sim}")
                similarity_scores.append(sim)


            df['similarity_score'] = similarity_scores
            print(df)
            
        elif 'High' in basename:
            # Isolate CellTypist Files with Low Granularity
            print(f'\nProcessing: {basename} as Low Granularity CellTypist Annotation')
            label_to_id_low = (mapping_df_cellTypist
                .groupby('High-hierarchy cell types')['Cell Ontology ID']
                .apply(set)
                .to_dict()
            )
            label_to_id_ci_low = {k.lower(): v for k, v in label_to_id_low.items()}
            df = pd.read_csv(file_path)

            colnames = df.columns.tolist()
            if len(colnames) < 2:
                print(f"  Skipping {file_path} — not enough columns.")
                continue

            author_col = colnames[0]
            cellTypist_col = colnames[1]

            similarity_scores = []

            for _, row in df.iterrows():
                author_label = normalize_label(row[author_col])
                cellTypist_low_label = normalize_label(row[cellTypist_col])

                fuzzy_used_author = False
                fuzzy_used_cellTypist_low = False

                ids1 = label_to_id_ci.get(author_label)
                ids2 = label_to_id_ci_low.get(cellTypist_low_label)

                # Fuzzy match fallback if not found
                if not ids1 and author_label:
                    match = fuzzy_match_label(author_label, label_to_id_ci.keys())
                    if match:
                        ids1 = label_to_id_ci.get(normalize_label(match))
                        fuzzy_used_author = True

                if not ids2 and cellTypist_low_label:
                    match = fuzzy_match_label(cellTypist_low_label, label_to_id_ci_low.keys())
                if match:
                    ids2 = label_to_id_ci_low.get(match.lower())
                    if ids2:
                        fuzzy_used_cellTypist_low = True

                # Normalize to sets
                if isinstance(ids1, str):
                    ids1 = {ids1}
                if isinstance(ids2, str):
                    ids2 = {ids2}

                max_sim = 0
                if ids1 and ids2:
                    for id1 in ids1:
                        for id2 in ids2:
                            sim = compute_similarity(id1, id2, G)
                            print(f"Comparing {author_label} ({id1}) vs {cellTypist_low_label} ({id2}) (Author Fuzzy={fuzzy_used_author} and Cell Typist Fuzzy={fuzzy_used_cellTypist_low}): sim={sim}")
                            if sim > max_sim:
                                max_sim = sim
                       
                else:
                    id1_vals = ids1 if ids1 else [None]
                    id2_vals = ids2 if ids2 else [None]
                    for id1 in id1_vals:
                        for id2 in id2_vals:
                            print(f"Comparing {author_label} ({id1}) vs {cellTypist_low_label} ({id2}) (Author Fuzzy={fuzzy_used_author} and Cell Typist Fuzzy={fuzzy_used_cellTypist_low}): sim=Skipped (missing IDs)")
        
                print(f"Max Sim {max_sim}") 
                similarity_scores.append(max_sim)

            df['similarity_score'] = similarity_scores
            print(df)
            #out_path = file_path.replace(".csv", "_similarity.csv")
            #df.to_csv(out_path, index=False)
            #print(f"  Saved: {os.path.basename(out_path)}")
            
    else:
        # Isolate General Files
        print(f'\nProcessing: {basename} as General Annotation')
        label_to_id = dict(zip(mapping_df['label'], mapping_df['ontology_id']))
        label_to_id_ci = {k.lower(): v for k, v in label_to_id.items()}

        df = pd.read_csv(file_path, , quotechar='"')
    
        colnames = df.columns.tolist()
        if len(colnames) < 2:
            print(f"  Skipping {file_path} — not enough columns.")
            continue

        author_col = colnames[0]
        llm_col = colnames[1]

        similarity_scores = []
        for _, row in df.iterrows():
            author_label = normalize_label(row[author_col])
            llm_label = normalize_label(row[llm_col])

            if author_label in ["nan", "none"] or pd.isna(author_label):
                author_label = None
            if llm_label in ["nan", "none"] or pd.isna(llm_label):
                llm_label = None

            fuzzy_used_author = False
            fuzzy_used_llm = False

            id1 = label_to_id_ci.get(author_label) if author_label else None
            id2 = label_to_id_ci.get(llm_label) if llm_label else None

            if id1 is None and author_label is not None:
                matched_label = fuzzy_match_label(author_label, label_to_id_ci.keys())
                if matched_label:
                    fuzzy_used_author = True
                id1 = label_to_id_ci.get(matched_label)

            if id2 is None and llm_label is not None:
                matched_label = fuzzy_match_label(llm_label, label_to_id_ci.keys())
                if matched_label:
                    fuzzy_used_llm = True
                id2 = label_to_id_ci.get(matched_label)

            sim = compute_similarity(id1, id2, G)
            print(f"Comparing {author_label} ({id1}) vs {llm_label} ({id2}) (fuzzy author ={fuzzy_used_author} and fuzzy llm ={fuzzy_used_llm}): sim={sim}")
            similarity_scores.append(sim)


        df['similarity_score'] = similarity_scores
        print(df)

        #out_path = file_path.replace(".csv", "_similarity.csv")
        #df.to_csv(out_path, index=False)
        #print(f"  Saved: {os.path.basename(out_path)}")


Processing: author_vs_Qwen3.csv as General Annotation
Comparing platelet (CL:0000233) vs hematopoietic stem cell (CL:0000037) (fuzzy author =False and fuzzy llm =False): sim=0.43478260869565216
Comparing b cell (CL:0000236) vs None (None) (fuzzy author =False and fuzzy llm =False): sim=0
Comparing dendritic cell (CL:0001056) vs b cell (CL:0000236) (fuzzy author =False and fuzzy llm =False): sim=0.6
Comparing natural killer cell (CL:0000623) vs None (None) (fuzzy author =False and fuzzy llm =False): sim=0
Comparing cd4-positive, alpha-beta t cell (CL:0000624) vs myeloid progenitor (CL:0000049) (fuzzy author =False and fuzzy llm =True): sim=0.4166666666666667
Comparing cd8-positive, alpha-beta t cell (CL:0000625) vs hematopoietic progenitor cell (CL:0000837) (fuzzy author =False and fuzzy llm =True): sim=0.43478260869565216
Comparing erythroid lineage cell (CL:0000764) vs myeloid cell (CL:0000763) (fuzzy author =False and fuzzy llm =False): sim=0.9166666666666666
Comparing cd16-positive

## From the large datafiles

In [9]:
base_dirs = [
    #os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Heimlich_et_al"),
    #os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Granja_et_al"),
    #os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Simone_et_al"),
    #os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Jardine_et_al"),
    #os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Roy_et_al"),
    os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/Zhang_et_al"),
]

## Mapping Reference 
# General 
mapping_df = pd.read_csv(os.path.expanduser("~/Göran_Karlsson_Lab/benchLLM/cell_to_cell_ontology.csv"))
label_to_id = dict(zip(mapping_df['label'], mapping_df['ontology_id']))
label_to_id_ci = {k.lower(): v for k, v in label_to_id.items()} # Remove Case Sentivity

In [14]:
for base_dir in base_dirs:
    input_pattern = os.path.join(base_dir, "*_with_author.csv")
    for file_path in glob.glob(input_pattern):
        basename = os.path.basename(file_path)

        if '_similarity' in basename:
            continue  # Skip already processed files

        print(f'\nProcessing: {base_dir} as General Annotation')

        df = pd.read_csv(file_path, sep=';')
        colnames = df.columns.tolist()

        required_cols = ["Ontology Term", "Author Label"]
        if not all(col in colnames for col in required_cols):
            print(f"  Skipping {file_path} — required columns not found.")
            continue

        author_col = "Author Label"
        llm_col = "Ontology Term"
        llm_name_col = "Cell Type"

        similarity_scores = []

        for _, row in df.iterrows():
            author_label = normalize_label(row[author_col])
            llm_label = row[llm_col]
            llm_label_name = row[llm_name_col]
        
            if author_label in ["nan", "none"] or pd.isna(author_label):
                author_label = None
            if llm_label in ["nan", "none"] or pd.isna(llm_label):
                llm_label = None
        
            fuzzy_used_author = False
        
            id1 = label_to_id_ci.get(author_label) if author_label else None
            id2 = llm_label  # already in ontology format
        
            # First fuzzy match attempt if id1 is not found
            if id1 is None and author_label:
                matched_label = fuzzy_match_label(author_label, label_to_id_ci.keys())
                if matched_label:
                    fuzzy_used_author = True
                    id1 = label_to_id_ci.get(matched_label)
        
            # Second attempt if id1 is still CL:0000000 (unknown) and "+" can be expanded
            if id1 == "CL:0000000" and author_label and "+" in author_label:
                expanded_label = author_label.replace("+", "-positive")
                matched_label = fuzzy_match_label(expanded_label, label_to_id_ci.keys())
                if matched_label:
                    fuzzy_used_author = True
                    id1 = label_to_id_ci.get(matched_label)

            # Third Atteempt if id1 is still (None) and contains term early
            if id1 is None and author_label and ("-early" in author_label or "early-" in author_label):
                for replacement_term in ["precursor", "immature", "developing", "pre"]:
                    if "-early" in author_label:
                        expanded_label = author_label.replace("-early", f" {replacement_term}")
                    elif "early-" in author_label:
                        expanded_label = author_label.replace("early-", f"{replacement_term} ")
                    
                    matched_label = fuzzy_match_label(expanded_label, label_to_id_ci.keys())
                    if matched_label:
                        fuzzy_used_author = True
                        id1 = label_to_id_ci.get(matched_label)
                        break
        
            sim = compute_similarity(id1, id2, G)
            print(f"Comparing {author_label} ({id1}) vs {llm_label_name} ({id2}) "
                  f"(fuzzy author={fuzzy_used_author}): sim={sim}")
            similarity_scores.append(sim)

        df['similarity_score'] = similarity_scores
        
        # Optional save
        out_path = file_path.replace(".csv", "_similarity.csv")
        df.to_csv(out_path, sep=';', index=False)
        print(f"  Saved: {os.path.basename(out_path)}")




Processing: /Users/alexantill/Göran_Karlsson_Lab/benchLLM/Zhang_et_al as General Annotation
Comparing asdc (None) vs Dendritic Cell (CL_0001029) (fuzzy author=False): sim=0
Comparing hsc (CL:0000037) vs Hematopoietic Stem Cell (HSC) (CL_0000037) (fuzzy author=False): sim=1.0
Comparing mdp (CL:0002009) vs Pre-B cell (CL_0000817) (fuzzy author=False): sim=0.37037037037037035
Comparing mep (CL:0000050) vs Megakaryocyte-Erythroid Progenitor (MEP) (CL_0000050) (fuzzy author=False): sim=1.0
Comparing mkp (CL:0000553) vs Megakaryocyte (CL_0000556) (fuzzy author=False): sim=0.6111111111111112
Comparing mpp (CL:0000837) vs Hematopoietic stem cell (HSC) (CL_0000037) (fuzzy author=False): sim=0.6470588235294118
Comparing mac (CL:0000091) vs Macrophage (CL_0002476) (fuzzy author=True): sim=0.8
Comparing mast-early (CL:0002354) vs Basophil (CL_0000767) (fuzzy author=True): sim=0.38461538461538464
Comparing monocyte (CL:0001054) vs Monocyte Progenitor (CL_0000040) (fuzzy author=False): sim=0.433333