In [None]:
'''
Mapping cell types to broader parent categories
'''

In [1]:
import requests
from collections import defaultdict, Counter
import time
import json
from typing import Dict, List, Set, Tuple
from tqdm import tqdm

class CellMapper:
    """Small client for the EBI Ontology Lookup Service (OLS) REST API.

    Resolves a cell-type name to a term id within ``self.ontology`` and
    fetches the labels of a term's hierarchical ancestors, parents or
    children. Successful lookups are memoized in ``self.cache`` so repeated
    queries do not hit the network again.
    """

    # Seconds to wait on one HTTP request; without a timeout `requests.get`
    # can block forever on a stalled connection.
    REQUEST_TIMEOUT = 30
    # Number of terms requested per page of a hierarchy listing.
    PAGE_SIZE = 100
    # Safety cap on how many pages of a single listing are followed.
    MAX_PAGES = 50
    # Pause (seconds) after each live request.
    REQUEST_DELAY = 0.1
    # Cache-key prefix of each relation -> name of the OLS endpoint serving it.
    _RELATION_ENDPOINTS = {
        'ancestors': 'hierarchicalAncestors',
        'parents': 'hierarchicalParents',
        'children': 'hierarchicalChildren',
    }

    def __init__(self):
        self.base_url = "https://www.ebi.ac.uk/ols/api"
        self.ontology = "cl"
        # Shared memo: cell name -> term id, and "<relation>_<id>" -> labels.
        self.cache = {}

    def find_cl_id(self, cell_name):
        """Look up the ontology short form (e.g. ``CL_0000084``) for a cell name.

        Args:
            cell_name: label to search for; the search is an exact match.

        Returns:
            The ``short_form`` of the first search hit, or ``None`` when the
            request fails, the response is not valid JSON, or there is no hit.
            Only hits are cached; misses are retried on the next call.
        """
        if cell_name in self.cache:
            return self.cache[cell_name]

        url = f"{self.base_url}/search"
        params = {'q': cell_name, 'ontology': self.ontology, 'exact': 'true'}

        try:
            response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
            data = response.json()
        except (requests.RequestException, ValueError):
            # Network failure or a body that is not JSON: treat as "not found".
            return None

        # Walk the payload defensively; an unexpected shape means "no hit".
        result = data.get('response') if isinstance(data, dict) else None
        docs = result.get('docs') if isinstance(result, dict) else None
        if not isinstance(docs, list) or not docs or not isinstance(docs[0], dict):
            return None

        cl_id = docs[0].get('short_form')
        self.cache[cell_name] = cl_id
        time.sleep(self.REQUEST_DELAY)
        return cl_id

    def _related_labels(self, cl_id, relation):
        """Fetch the labels of the CL terms related to ``cl_id``.

        Args:
            cl_id: term short form such as ``CL_0000255``; a falsy value
                (e.g. the ``None`` returned by a failed ``find_cl_id``)
                yields ``[]`` without any request.
            relation: one of the keys of ``_RELATION_ENDPOINTS``.

        Returns:
            List of labels of related terms whose ``obo_id`` starts with
            ``CL:``. All result pages are followed, up to ``MAX_PAGES``.
            Returns ``[]`` (uncached) if a request or payload error occurs.
        """
        if not cl_id:
            return []

        cache_key = f"{relation}_{cl_id}"
        if cache_key in self.cache:
            return self.cache[cache_key]

        iri = f"http://purl.obolibrary.org/obo/{cl_id}"
        # The term IRI is percent-encoded twice before being placed in the path.
        encoded_iri = requests.utils.quote(requests.utils.quote(iri, safe=''), safe='')
        url = (
            f"{self.base_url}/ontologies/{self.ontology}/terms/"
            f"{encoded_iri}/{self._RELATION_ENDPOINTS[relation]}"
        )
        params = {'size': self.PAGE_SIZE}

        labels = []
        try:
            for _ in range(self.MAX_PAGES):
                response = requests.get(url, params=params, timeout=self.REQUEST_TIMEOUT)
                time.sleep(self.REQUEST_DELAY)
                if response.status_code != 200:
                    break
                data = response.json()
                for term in data.get('_embedded', {}).get('terms', []):
                    obo_id = term.get('obo_id')
                    # Prefix match: a substring test would also accept ids
                    # from other ontologies such as "CLO:...".
                    if obo_id and obo_id.startswith('CL:'):
                        labels.append(term.get('label'))
                # Listings are paginated; keep going while a next page is advertised.
                next_href = data.get('_links', {}).get('next', {}).get('href')
                if not next_href:
                    break
                # The next-page link already carries its own query string.
                url, params = next_href, None
        except (requests.RequestException, ValueError, AttributeError, TypeError):
            # Request failure, invalid JSON, or a payload of unexpected shape.
            return []

        self.cache[cache_key] = labels
        return labels

    def get_ancestors(self, cl_id):
        """Return the labels of the hierarchical ancestors of ``cl_id`` (``[]`` on failure)."""
        return self._related_labels(cl_id, 'ancestors')

    def get_parents(self, cl_id):
        """Return the labels of the hierarchical parents of ``cl_id`` (``[]`` on failure)."""
        return self._related_labels(cl_id, 'parents')

    def get_children(self, cl_id):
        """Return the labels of the hierarchical children of ``cl_id`` (``[]`` on failure)."""
        return self._related_labels(cl_id, 'children')

In [2]:
# One shared CellMapper so its in-memory cache is reused by every lookup below.
cm = CellMapper()

In [3]:
# Broad categories: labels of the hierarchical children of
# CL_0000255 (eukaryotic cell), sorted alphabetically.
broad_categories = sorted(cm.get_children('CL_0000255'))

In [4]:
# Finer-grained categories added by hand on top of the ontology-derived ones.
extra_categories = ['T cell', 'B cell', 'neutrophil', 'monocyte', 'macrophage', 'plasma cell', 'natural killer cell', 'fibroblast', 'platelet', 'erythrocyte']
# Append only what is missing so re-running this cell does not pile up duplicates.
broad_categories = broad_categories + [name for name in extra_categories if name not in broad_categories]

In [5]:
def sum_across_tissues(tissue_to_ct_dict):
    """Add up per-tissue cell-type counts into a single tally.

    Args:
        tissue_to_ct_dict: mapping of tissue -> {cell type: count}.

    Returns:
        A ``collections.Counter`` mapping each cell type to its count summed
        over all tissues (an empty Counter for empty input). As with Counter
        addition, entries whose running total is not positive are dropped.
        The input mappings are not modified.
    """
    total = Counter()
    for counts in tissue_to_ct_dict.values():
        # In-place Counter addition avoids re-copying the running total on
        # every tissue while keeping the "positive counts only" semantics.
        total += Counter(counts)
    return total
# Load the per-tissue cell-type counts. The encoding is given explicitly so
# the read does not depend on the platform's default text encoding.
with open('cell_type_distributions/cell_type_distributions_08_15_2025_cleaned.json', 'r', encoding='utf-8') as file:
    tissue_to_cts = json.load(file)

# Cell types that remain after summing the counts across all tissues.
cell_types = list(sum_across_tissues(tissue_to_cts).keys())

In [9]:
# Number of distinct cell types across all tissues
len(cell_types)

143

In [6]:
# For every cell type:
#   1. record which broad categories it falls under (categorizations), and
#   2. tally how many cell types list each broad category (count_categories).
# A later step assigns each cell type the category with the highest tally.
broad_category_set = set(broad_categories)  # built once, not once per cell type
categorizations = {}
count_categories = {}
for cell_type in tqdm(cell_types):
    cell_id = cm.find_cl_id(cell_type)
    # No ontology match means there are no ancestors to fetch; the cell type
    # can still match a broad category through its own name.
    ancestors = cm.get_ancestors(cell_id) if cell_id else []
    # Sorted so the list order, and with it the later tie-break between
    # equally common categories, is the same on every run (set order is not).
    categories = sorted((set(ancestors) | {cell_type}) & broad_category_set)
    categorizations[cell_type] = categories

    for category in categories:
        count_categories[category] = count_categories.get(category, 0) + 1

100%|█████████████████████████████████████████| 143/143 [03:46<00:00,  1.58s/it]


In [7]:
# Manual overrides: cell types whose categories are set by hand instead of
# being taken from the ontology lookup.
#   hematopoietic precursor cell -> hematopoietic precursor cell
#   common myeloid progenitor    -> hematopoietic precursor cell
#   mucus secreting cell         -> secretory cell
#   mural cell                   -> perivascular cell
# Disabled override, kept for reference:
#   hematopoietic stem cell      -> stem cell
manual_overrides = {
    'hematopoietic precursor cell': ['hematopoietic precursor cell'],
    'common myeloid progenitor': ['hematopoietic precursor cell'],
    'mucus secreting cell': ['secretory cell'],
    'mural cell': ['perivascular cell'],
}
categorizations.update(manual_overrides)

# Re-derive the tallies from the final categorizations so each count is the
# number of cell types that list the category. Hard-coding the counts would
# overwrite a tally that already exists and would leave the categories that
# the overrides replaced over-counted.
count_categories = dict(Counter(
    category
    for categories in categorizations.values()
    for category in categories
))

In [17]:
# Top-level categorization: one coarse label per cell type.
# Stem/progenitor cell types share a single label; every other cell type gets
# whichever of its categories is listed by the most cell types (first one wins ties).
ct_to_category_top = {}
for cell_type, candidates in tqdm(categorizations.items()):
    tallies = [count_categories[candidate] for candidate in candidates]
    if not {'stem cell', 'progenitor cell'}.isdisjoint(candidates):
        top_label = 'progenitor/stem cell'
    else:
        top_label = candidates[tallies.index(max(tallies))]
    ct_to_category_top[cell_type] = top_label

100%|█████████████████████████████████████| 143/143 [00:00<00:00, 733234.07it/s]


In [8]:
# Fine-grained categorization.
# Rules are checked top to bottom; the first category found among a cell
# type's categories decides its label. Cell types matching no rule fall back
# to their most common category (first one wins ties).
priority_rules = [
    ('monocyte', 'monocyte'),
    ('stem cell', 'progenitor/stem cell'),
    ('progenitor cell', 'progenitor/stem cell'),
    ('fibroblast', 'fibroblast'),
    ('connective tissue cell', 'other connective tissue cell'),
    ('T cell', 'T cell'),
    ('B cell', 'B cell'),
    ('neutrophil', 'neutrophil'),
    ('macrophage', 'macrophage'),
    ('plasma cell', 'plasma cell'),
    ('natural killer cell', 'natural killer cell'),
    ('leukocyte', 'other leukocyte'),
    ('platelet', 'platelet'),
    ('erythrocyte', 'erythrocyte'),
    ('hematopoietic cell', 'other hematopoietic cell'),
]

ct_to_category = {}
for cell_type, candidates in tqdm(categorizations.items()):
    tallies = [count_categories[candidate] for candidate in candidates]
    fine_label = next(
        (assigned for marker, assigned in priority_rules if marker in candidates),
        None,
    )
    if fine_label is None:
        fine_label = candidates[tallies.index(max(tallies))]
    ct_to_category[cell_type] = fine_label

100%|█████████████████████████████████████| 143/143 [00:00<00:00, 473764.20it/s]


In [9]:
# Save the fine-grained mapping (cell type -> category) as indented JSON.
with open("cell_type_distributions/ct_categorized_ontology_08_16_2025.json", "w") as out_handle:
    json.dump(ct_to_category, fp=out_handle, indent=4)

In [None]:
# Save the top-level mapping (cell type -> category) as indented JSON.
with open("cell_type_distributions/ct_categorized_ontology_toplevel_08_15_2025.json", "w") as out_handle:
    json.dump(ct_to_category_top, fp=out_handle, indent=4)