In [1]:
import pandas as pd
import numpy as np
import os
import glob
import re
from rapidfuzz import process
from owlready2 import get_ontology

In [2]:
# Load the Cell Ontology (CL) OWL file and keep the loaded ontology in `G`.
# `G` is the object later passed to the similarity functions, which query it
# with `G.search_one(iri=...)`.
# NOTE(review): the IRI is an http URL, so this presumably needs network access
# on a fresh kernel unless owlready2 resolves a local copy — confirm.
print("Loading Cell Ontology (OWL)...")
G = get_ontology("http://purl.obolibrary.org/obo/cl.owl").load()
print("Ontology loaded.")

Loading Cell Ontology (OWL)...
Ontology loaded.


In [3]:
# Dataset folders to scan for "*_with_author.csv" inputs. Each entry is written
# relative to the home directory and expanded with os.path.expanduser; only the
# entries left uncommented are processed.
base_dirs = list(map(os.path.expanduser, [
    #"~/Göran_Karlsson_Lab/benchLLM/Heimlich_et_al",
    #"~/Göran_Karlsson_Lab/benchLLM/Granja_et_al",
    #"~/Göran_Karlsson_Lab/benchLLM/Simone_et_al",
    #"~/Göran_Karlsson_Lab/benchLLM/Jardine_et_al",
    #"~/Göran_Karlsson_Lab/benchLLM/Roy_et_al",
    "~/Göran_Karlsson_Lab/benchLLM/Zhang_et_al",
]))

In [9]:
def get_ancestors(cls):
    """Return the ancestors of an ontology class as a set.

    Parameters
    ----------
    cls : object or None
        Anything exposing an ``ancestors()`` method. A falsy value
        (e.g. ``None``) is treated as "no class".

    Returns
    -------
    set
        ``set(cls.ancestors())``, or an empty set when ``cls`` is falsy.
    """
    if not cls:
        return set()
    return set(cls.ancestors())

def semantic_similarity(id1, id2, G):
    """Jaccard similarity between the ancestor sets of two ontology terms.

    Parameters
    ----------
    id1, id2 : str or None
        Term identifiers such as ``"CL:0000084"``. The ``":"`` is replaced by
        ``"_"`` and the result is matched against the end of a class IRI.
    G : object
        Ontology exposing ``search_one(iri=...)``; the returned class must
        provide ``ancestors()``.

    Returns
    -------
    float or int
        ``len(A1 & A2) / len(A1 | A2)`` for the two ancestor sets, or ``0`` when
        either identifier is missing, blank, not a string, or cannot be found
        in ``G``.
    """
    def _resolve(term_id):
        # Missing values read by pandas arrive as float NaN, not None; anything
        # that is not a string cannot be an identifier.
        if not isinstance(term_id, str):
            return None
        term_id = term_id.strip()
        # A blank id would produce the pattern "*", which matches an arbitrary
        # entity and would yield a meaningless non-zero similarity.
        if not term_id:
            return None
        return G.search_one(iri="*" + term_id.replace(":", "_"))

    class1 = _resolve(id1)
    if not class1:
        return 0
    class2 = _resolve(id2)
    if not class2:
        return 0

    ancestors1 = set(class1.ancestors())
    ancestors2 = set(class2.ancestors())
    union = ancestors1 | ancestors2
    if not union:
        return 0
    return len(ancestors1 & ancestors2) / len(union)

def cluster_semantic_similarity(set1, set2, G):
    """Average pairwise term similarity between two collections of term ids.

    Every element of ``set1`` is compared with every element of ``set2`` via
    ``semantic_similarity`` and the scores are averaged.

    Parameters
    ----------
    set1, set2 : iterable of term identifiers
    G : ontology object forwarded unchanged to ``semantic_similarity``

    Returns
    -------
    float or int
        Mean of all pairwise scores; ``0`` when either collection is empty.
    """
    if not (set1 and set2):
        return 0
    pair_scores = [
        semantic_similarity(left, right, G)
        for left in set1
        for right in set2
    ]
    if not pair_scores:
        return 0
    return sum(pair_scores) / len(pair_scores)

# Pairwise LLM agreement per author-labelled cluster.
#
# For every dataset folder in `base_dirs`, each "*_with_author.csv" file is
# read, the ontology terms each LLM assigned to each author label are
# collected, and every pair of LLMs is scored on the labels they share.
# Results are written to "llm_pairwise_similarity.csv" inside the SAME dataset
# folder, so enabling several folders no longer funnels every dataset into one
# hard-coded location.

OUTPUT_FILENAME = "llm_pairwise_similarity.csv"
OUTPUT_COLUMNS = ["LLM1", "LLM2", "Cluster", "Similarity", "LLM1_terms", "LLM2_terms"]

output_rows = []        # rows from every processed dataset, in processing order
llm_cluster_terms = {}  # {LLM: {author label: set of terms}} for the file being processed


def compute_llm_similarity(llm1, llm2, cluster_terms=None, rows=None):
    """Score two LLMs on the clusters they both annotated and record the rows.

    Parameters
    ----------
    llm1, llm2 : hashable
        Keys of ``cluster_terms`` identifying the two LLMs.
    cluster_terms : dict, optional
        ``{LLM: {cluster: set of term ids}}``. Defaults to the module-level
        ``llm_cluster_terms`` of the file currently being processed.
    rows : list, optional
        List that receives one dict per shared cluster plus one "OVERALL"
        dict. Defaults to the module-level ``output_rows``.

    Returns
    -------
    float or int
        Mean of the per-cluster similarities, or ``0`` (with a printed notice
        and no rows appended) when the two LLMs share no cluster.
    """
    if cluster_terms is None:
        cluster_terms = llm_cluster_terms
    if rows is None:
        rows = output_rows

    clusters1 = cluster_terms.get(llm1, {})
    clusters2 = cluster_terms.get(llm2, {})
    common_clusters = set(clusters1) & set(clusters2)
    if not common_clusters:
        print(f"No common clusters between {llm1} and {llm2}")
        return 0

    similarities = []
    for cid in sorted(common_clusters):
        set1 = clusters1[cid]
        set2 = clusters2[cid]
        sim = cluster_semantic_similarity(set1, set2, G)
        similarities.append(sim)

        rows.append({
            "LLM1": llm1,
            "LLM2": llm2,
            "Cluster": cid,
            "Similarity": round(sim, 4),
            # Missing terms are stored as None: they count in the score (as 0)
            # but cannot be sorted or joined with strings, so omit them here.
            "LLM1_terms": ", ".join(sorted(t for t in set1 if t is not None)),
            "LLM2_terms": ", ".join(sorted(t for t in set2 if t is not None))
        })

    overall_sim = sum(similarities) / len(similarities)
    rows.append({
        "LLM1": llm1,
        "LLM2": llm2,
        "Cluster": "OVERALL",
        "Similarity": round(overall_sim, 4),
        "LLM1_terms": "",
        "LLM2_terms": ""
    })
    return overall_sim


for base_dir in base_dirs:
    first_row_of_dir = len(output_rows)

    # Sorted so the row order of the output does not depend on directory
    # listing order; earlier similarity outputs are never re-read as inputs.
    input_files = sorted(
        path
        for path in glob.glob(os.path.join(base_dir, "*_with_author.csv"))
        if '_similarity' not in os.path.basename(path)
    )
    if not input_files:
        # Avoid replacing an existing results file with an empty one.
        print(f"\nNo '*_with_author.csv' input found in {base_dir}; nothing written.")
        continue

    for file_path in input_files:
        basename = os.path.basename(file_path)
        print(f'\nProcessing: {basename} as General Annotation')
        df = pd.read_csv(file_path, sep=';')

        # Empty 'Ontology Term' cells are read as NaN; store them as None so
        # that semantic_similarity scores them 0 instead of raising.
        grouped = (
            df.groupby(['LLM', 'Author Label'])['Ontology Term']
            .apply(lambda terms: {None if pd.isna(t) else t for t in terms})
            .reset_index()
        )

        llm_cluster_terms = {}
        for llm, cluster, terms in zip(
            grouped['LLM'], grouped['Author Label'], grouped['Ontology Term']
        ):
            llm_cluster_terms.setdefault(llm, {})[cluster] = terms

        # Pairwise comparison for all LLMs (each unordered pair once)
        llms = list(llm_cluster_terms)
        for i, llm1 in enumerate(llms):
            for llm2 in llms[i + 1:]:
                compute_llm_similarity(llm1, llm2, llm_cluster_terms, output_rows)

    # Save this dataset's rows next to its inputs
    output_df = pd.DataFrame(output_rows[first_row_of_dir:], columns=OUTPUT_COLUMNS)
    output_csv_path = os.path.join(base_dir, OUTPUT_FILENAME)
    output_df.to_csv(output_csv_path, index=False)
    print(f"\n✅ Similarity results saved to: {output_csv_path}")



Processing: cluster_descriptions_with_author.csv as General Annotation

✅ Similarity results saved to: ~/Göran_Karlsson_Lab/benchLLM/Zhang_et_al/llm_pairwise_similarity.csv
