# Comments

- AIBS CCFv3 terms currently have this shape:  `rdflib.term.Literal('Allen Transcriptome Type', datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))` in the **Brain Region Ontology**
- had to rename some of the references [here](https://docs.google.com/spreadsheets/d/1iUgqPszKkYQgkJlmpQSkeyFWcEoOxovsBkoLPtA3qPg/edit#gid=1180597294) to match how they are written in the "Source of transcriptome" column of the **Transcriptome Cell Type** sheet

# Cell Types Ontology

## Context

This notebook was put together as a result of discussion on this ticket: [MMB2022-32](https://bbpteam.epfl.ch/project/issues/browse/MMB2022-32)

## Imports

In [None]:
import json
import rdflib
import pandas as pd
from rdflib import RDF, RDFS, XSD, OWL, URIRef, BNode, SKOS
from rdflib.paths import OneOrMore
from bmo.ontologies import subontology_from_term
import pprint

## Helper functions

In [None]:
def add_term(label, parent_label=None):
    """Add a new OWL class labelled *label* to the global ``cell_type_ontology``.

    The class IRI is the ``celltypes`` namespace followed by *label* with all
    spaces removed. The class gets an English ``rdfs:label`` and, when
    *parent_label* is given, an ``rdfs:subClassOf`` link to every term whose
    English ``rdfs:label`` equals *parent_label*. If no such term exists the
    class is still added, just without a parent.
    """
    term_iri = rdflib.URIRef(f"https://bbp.epfl.ch/ontologies/core/celltypes/{label.replace(' ', '')}")
    new_triples = {
        (term_iri, RDFS.label, rdflib.term.Literal(label, lang='en')),
        (term_iri, RDF.type, OWL.Class),
    }
    if parent_label:
        parent_literal = rdflib.term.Literal(parent_label, lang='en')
        # Collect the parent links first; the graph is only modified afterwards.
        new_triples.update(
            (term_iri, RDFS.subClassOf, parent_iri)
            for parent_iri, _, _ in cell_type_ontology.triples((None, RDFS.label, parent_literal))
        )
    for triple in new_triples:
        cell_type_ontology.add(triple)

In [None]:
def add_relationship(graph, subject_id, relationship, object_id):
    """Attach an existential restriction to *subject_id* and return *graph*.

    A fresh blank node is typed as ``owl:Restriction`` with
    ``owl:onProperty relationship`` and ``owl:someValuesFrom object_id``, and
    *subject_id* is declared ``rdfs:subClassOf`` that blank node. The graph is
    modified in place; it is returned for convenience.
    """
    restriction = rdflib.term.BNode()
    restriction_triples = (
        (restriction, OWL.onProperty, relationship),
        (subject_id, RDFS.subClassOf, restriction),
        (restriction, RDF.type, OWL.Restriction),
        (restriction, OWL.someValuesFrom, object_id),
    )
    for triple in restriction_triples:
        graph.add(triple)
    return graph

In [None]:
def remove_relationship(graph, term, relationship):
    """Remove the restrictions on *relationship* that hang off *term*.

    Every object ``o`` of a triple ``(term, ?, o)`` that carries
    ``owl:onProperty relationship`` is treated as a restriction node: its
    ``owl:onProperty``, ``rdf:type owl:Restriction`` and all of its
    ``owl:someValuesFrom`` triples are removed, together with the
    ``rdfs:subClassOf`` link from *term* to it. Each removed restriction is
    pretty-printed. The graph is modified in place and returned.
    """
    # Both queries are materialised with list() before anything is removed:
    # mutating a graph while one of its lazy triples() generators is still
    # being consumed is only tolerated by some store implementations.
    for s, _, o in list(graph.triples((term, None, None))):
        for ss, _, _ in list(graph.triples((o, OWL.onProperty, relationship))):
            graph.remove((ss, OWL.onProperty, relationship))
            graph.remove((s, RDFS.subClassOf, ss))
            graph.remove((ss, RDF.type, OWL.Restriction))
            # None is a wildcard: drop every filler of the restriction.
            graph.remove((ss, OWL.someValuesFrom, None))
            pprint.pprint((ss, OWL.onProperty, relationship))
    return graph

## Load Brain Region Ontology

In [None]:
base = "" # Provide the directory into which you have downloaded the Cell Types and Brain Region Ontology from WebProtégé

In [None]:
brain_region_ontology = rdflib.Graph()
brain_region_ontology.parse(f"{base}/brainregion.ttl")

## Get available Cell Types

### Cell Type Ontology from WebProtégé

This ontology file was downloaded from [WebProtégé](https://webprotege.kcp.bbp.epfl.ch/#projects/968c9144-bca3-4436-bdb5-6529d46016b9/edit/Classes)

In [None]:
cell_type_ontology = rdflib.Graph()
cell_type_ontology.parse(f"{base}/celltypes.ttl")

In [None]:
len(list(cell_type_ontology.subjects()))

### All labels from Cell Type Ontology

In [None]:
# Text of every rdfs:label in the Cell Type Ontology. str() drops the language
# tag / datatype so the entries can be compared with plain Python strings.
labels = [
    str(label_literal)
    for _, _, label_literal in cell_type_ontology.triples((None, RDFS.label, None))
]

### ME-Type to T-Type compatibility mapping from Yann Roussel

This file was shared on this ticket: [MMB2022-32](https://bbpteam.epfl.ch/project/issues/browse/MMB2022-32)

In [None]:
with open(f"{base}/me_type_to_t_type_compatibility-1.json") as f:
    met_mapping = json.load(f)

#### Change L1_DLAC to L1_LAC and L1_SLAC to L1_SAC (since Yann is using old labels)

In [None]:
# Old m-type label fragment -> label fragment to use instead.
changes = {
    "L1_DLAC": "L1_LAC",
    "L1_SLAC": "L1_SAC",
}

for k, v in changes.items():
    # Iterate over a snapshot of the keys: the loop body inserts and deletes
    # entries, and doing that while iterating the dict itself can raise
    # RuntimeError or silently skip / revisit entries.
    for kk in list(met_mapping):
        if k in kk:
            new_key = kk.replace(k, v)
            print(new_key)
            # Move the value to the renamed key (the renamed entry ends up last).
            met_mapping[new_key] = met_mapping.pop(kk)

#### Get all t-types

In [None]:
# Distinct t-type names found across all values of the compatibility mapping.
t_types = list({
    t_type
    for compatible_t_types in met_mapping.values()
    for t_type in compatible_t_types
})

In [None]:
len(t_types)

#### Get all m-types

In [None]:
# Distinct m-types: for keys made of exactly three "_"-separated fragments,
# the m-type is the first two fragments joined back with "_".
m_types = list({
    "_".join(parts[:2])
    for parts in (key.split("_") for key in met_mapping)
    if len(parts) == 3
})

In [None]:
len(m_types)

#### Get all e-types

In [None]:
# Distinct e-types: the third fragment of every key made of exactly three
# "_"-separated fragments.
e_types = list({
    parts[2]
    for parts in (key.split("_") for key in met_mapping)
    if len(parts) == 3
})

In [None]:
len(e_types)

### Cell Types from Blue Brain Cell Atlas

The cell types were copied from the Blue Brain Cell Atlas (https://bbp.epfl.ch/nexus/cell-atlas/?all=1) and pasted into an Excel sheet.

In [None]:
cell_atlas_types = pd.read_excel("./Blue Brain Cell Atlas Cell Types.xlsx")

In [None]:
cell_atlas_types.head()

## Cell Types and missing data google sheet

The sheet was downloaded from [here](https://docs.google.com/spreadsheets/d/1iUgqPszKkYQgkJlmpQSkeyFWcEoOxovsBkoLPtA3qPg/edit#gid=642322419)

In [None]:
cell_types_g_sheet = pd.read_excel(f"{base}/mmb_month5/Cell Types and Missing Data - Version 1.xlsx", sheet_name="Transcriptome Cell Types")

In [None]:
cell_types_g_sheet

/!\ Note: had to rename some of the [references](https://docs.google.com/spreadsheets/d/1iUgqPszKkYQgkJlmpQSkeyFWcEoOxovsBkoLPtA3qPg/edit#gid=1180597294) to match how they are written in the "Source of transcriptome" column of the **Transcriptome Cell Type** sheet

In [None]:
references = pd.read_excel(f"{base}/mmb_month5/Cell Types and Missing Data - Version 1.xlsx", sheet_name="Notes")

In [None]:
references = dict(zip(references[6:11]["Unnamed: 0"], references[6:11]["Unnamed: 1"]))

In [None]:
references

## Check for missing labels in the Cell Types Ontology

### Check which Cell Atlas labels are not yet present

In [None]:
len(list(set(cell_atlas_types.Child)))

In [None]:
# Child cell types from the Cell Atlas sheet that have no matching label in
# the ontology (duplicates are kept at this stage).
missing_cell_atlas_types = [
    child for child in cell_atlas_types.Child if child not in labels
]

In [None]:
len(missing_cell_atlas_types)

In [None]:
len(list(set(cell_atlas_types.Parent)))

In [None]:
# Also record the distinct parent cell types that have no matching label.
missing_cell_atlas_types.extend(
    parent for parent in set(cell_atlas_types.Parent) if parent not in labels
)

In [None]:
missing_cell_atlas_types = list(set(missing_cell_atlas_types))

In [None]:
len(missing_cell_atlas_types)

In [None]:
missing_cell_atlas_types[:5]

### Check which T-types are not yet present

In [None]:
len(t_types)

In [None]:
# t-types from the compatibility mapping that have no matching ontology label.
missing_t_types = [t_type for t_type in t_types if t_type not in labels]

In [None]:
missing_t_types = list(set(missing_t_types))

In [None]:
len(missing_t_types)

In [None]:
missing_t_types[:5]

### Check which M-types are not yet present

In [None]:
# m-types from the compatibility mapping that have no matching ontology label.
missing_m_types = [m_type for m_type in m_types if m_type not in labels]

In [None]:
missing_m_types = list(set(missing_m_types))

In [None]:
len(missing_m_types)

In [None]:
missing_m_types[:5]

### Check which E-types are not yet present

In [None]:
# e-types from the compatibility mapping that have no matching ontology label.
missing_e_types = [e_type for e_type in e_types if e_type not in labels]

In [None]:
len(missing_e_types)

In [None]:
missing_e_types = list(set(missing_e_types))

## Add to the Cell Types Ontology

### Add new parent to t-types

In [None]:
# Make every known t-type a subclass of the term(s) labelled `parent_label`.
parent_label = "Neuron Transcriptomic Type"
# The parent lookup does not depend on the t-type, so it is done once.
parent_ids = [
    s for s, _, _ in cell_type_ontology.triples(
        (None, RDFS.label, rdflib.term.Literal(parent_label, lang='en'))
    )
]
for t in t_types:
    t_literal = rdflib.term.Literal(t, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))
    # Reset on every iteration: otherwise a t-type missing from the ontology
    # would silently reuse the term found for the previous t-type (or raise
    # NameError on the very first one).
    label_id = None
    for s, _, _ in cell_type_ontology.triples((None, RDFS.label, t_literal)):
        label_id = s  # if several terms share the label, the last one wins
    if label_id is None:
        print(f"No term labelled {t!r} found in the Cell Type Ontology; skipping")
        continue
    for parent_id in parent_ids:
        cell_type_ontology.add((label_id, RDFS.subClassOf, parent_id))

### Add link from m-types to t-types

In [None]:
# Remove existing canHaveTType restrictions from the m-types before new ones
# are added (presumably so that re-running the notebook does not stack
# duplicate restrictions).
# NOTE(review): terms are matched here by xsd:string-typed labels, whereas the
# cell that re-creates the links matches m-types by @en labels - confirm that
# both lookups find the same terms.
for label in m_types:
    label_literal = rdflib.term.Literal(label, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))
    # list(): remove_relationship mutates the graph this query iterates over.
    for s, _, _ in list(cell_type_ontology.triples((None, RDFS.label, label_literal))):
        cell_type_ontology = remove_relationship(cell_type_ontology, s, rdflib.term.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveTType"))

In [None]:
# Group the compatible t-types by m-type. Keys with exactly three
# "_"-separated fragments contribute their first two fragments as the m-type;
# any other key is used unchanged as the m-type.
m_type_to_t_type = {}
for me_type, compatible_t_types in met_mapping.items():
    parts = me_type.split("_")
    m_type = "_".join(parts[:2]) if len(parts) == 3 else me_type
    if m_type in m_type_to_t_type:
        # Merge with what was collected so far, dropping duplicates.
        merged = set(m_type_to_t_type[m_type] + compatible_t_types)
        m_type_to_t_type[m_type] = list(merged)
    else:
        m_type_to_t_type[m_type] = compatible_t_types

In [None]:
# Add a canHaveTType restriction from each m-type to each compatible t-type.
# m-types are looked up by @en label, t-types by xsd:string-typed label.
for k, v in m_type_to_t_type.items():
    # Reset per m-type so a failed lookup cannot reuse the previous term.
    m_type_id = None
    for s, _, _ in cell_type_ontology.triples((None, RDFS.label,  rdflib.term.Literal(k, lang="en"))):
        m_type_id = s  # if several terms share the label, the last one wins
    if m_type_id is None:
        print(f"No m-type labelled {k!r} found in the Cell Type Ontology; skipping")
        continue
    for vv in v:
        # Same reset for the t-type, for the same reason.
        t_type_id = None
        for s, _, _ in cell_type_ontology.triples((None, RDFS.label,  rdflib.term.Literal(vv, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))):
            t_type_id = s
        if t_type_id is None:
            print(f"No t-type labelled {vv!r} found in the Cell Type Ontology; skipping")
            continue
        cell_type_ontology = add_relationship(cell_type_ontology, m_type_id, rdflib.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveTType"), t_type_id)

### Add link from e-types to t-types

In [None]:
# Remove existing canHaveTType restrictions from the e-types before new ones
# are added (presumably so that re-running the notebook does not stack
# duplicate restrictions).
# NOTE(review): terms are matched here by xsd:string-typed labels, whereas the
# cell that re-creates the links matches e-types by @en labels - confirm that
# both lookups find the same terms.
for label in e_types:
    label_literal = rdflib.term.Literal(label, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))
    # list(): remove_relationship mutates the graph this query iterates over.
    for s, _, _ in list(cell_type_ontology.triples((None, RDFS.label, label_literal))):
        cell_type_ontology = remove_relationship(cell_type_ontology, s, rdflib.term.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveTType"))

In [None]:
# Group the compatible t-types by e-type (the last fragment of keys made of
# exactly three "_"-separated fragments; other keys are ignored).
e_type_to_t_type = {}
for me_type, compatible_t_types in met_mapping.items():
    parts = me_type.split("_")
    if len(parts) != 3:
        continue
    e_type = parts[-1]
    if e_type in e_type_to_t_type:
        # Merge with what was collected so far, dropping duplicates.
        merged = set(e_type_to_t_type[e_type] + compatible_t_types)
        e_type_to_t_type[e_type] = list(merged)
    else:
        e_type_to_t_type[e_type] = compatible_t_types

In [None]:
# Add a canHaveTType restriction from each e-type to each compatible t-type.
# e-types are looked up by @en label, t-types by xsd:string-typed label.
for k, v in e_type_to_t_type.items():
    # Reset per e-type so a failed lookup cannot reuse the previous term.
    e_type_id = None
    for s, _, _ in cell_type_ontology.triples((None, RDFS.label,  rdflib.term.Literal(k, lang="en"))):
        e_type_id = s  # if several terms share the label, the last one wins
    if e_type_id is None:
        print(f"No e-type labelled {k!r} found in the Cell Type Ontology; skipping")
        continue
    for vv in v:
        # Same reset for the t-type, for the same reason.
        t_type_id = None
        for s, _, _ in cell_type_ontology.triples((None, RDFS.label,  rdflib.term.Literal(vv, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))):
            t_type_id = s
        if t_type_id is None:
            print(f"No t-type labelled {vv!r} found in the Cell Type Ontology; skipping")
            continue
        cell_type_ontology = add_relationship(cell_type_ontology, e_type_id, rdflib.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveTType"), t_type_id)

### Add link from e-types to m-types

In [None]:
# Remove existing canHaveMType restrictions from the e-types before new ones
# are added (presumably so that re-running the notebook does not stack
# duplicate restrictions).
# NOTE(review): terms are matched here by xsd:string-typed labels, whereas the
# cell that re-creates the links matches e-types by @en labels - confirm that
# both lookups find the same terms.
for label in e_types:
    label_literal = rdflib.term.Literal(label, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))
    # list(): remove_relationship mutates the graph this query iterates over.
    for s, _, _ in list(cell_type_ontology.triples((None, RDFS.label, label_literal))):
        cell_type_ontology = remove_relationship(cell_type_ontology, s, rdflib.term.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveMType"))

In [None]:
me_types = list(set(met_mapping.keys()))

In [None]:
# Group m-types by e-type, using only the me-type keys made of exactly three
# "_"-separated fragments: the first two form the m-type, the last is the e-type.
e_type_to_m_type = {}
for me_type in me_types:
    parts = me_type.split("_")
    if len(parts) != 3:
        continue
    e_type = parts[-1]
    m_type = "_".join(parts[:2])
    if e_type in e_type_to_m_type:
        # Merge with what was collected so far, dropping duplicates.
        merged = set(e_type_to_m_type[e_type] + [m_type])
        e_type_to_m_type[e_type] = list(merged)
    else:
        e_type_to_m_type[e_type] = [m_type]

In [None]:
# Add a canHaveMType restriction from each e-type to each m-type it was seen
# with. Both e-types and m-types are looked up by @en label.
for k, v in e_type_to_m_type.items():
    # Reset per e-type so a failed lookup cannot reuse the previous term.
    e_type_id = None
    for s, _, _ in cell_type_ontology.triples((None, RDFS.label,  rdflib.term.Literal(k, lang="en"))):
        e_type_id = s  # if several terms share the label, the last one wins
    if e_type_id is None:
        print(f"No e-type labelled {k!r} found in the Cell Type Ontology; skipping")
        continue
    for vv in v:
        # Same reset for the m-type, for the same reason.
        m_type_id = None
        for s, _, _ in cell_type_ontology.triples((None, RDFS.label,  rdflib.term.Literal(vv, lang="en"))):
            m_type_id = s
        if m_type_id is None:
            print(f"No m-type labelled {vv!r} found in the Cell Type Ontology; skipping")
            continue
        cell_type_ontology = add_relationship(cell_type_ontology, e_type_id, rdflib.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveMType"), m_type_id)

### Add Blue Brain Cell Atlas types

In [None]:
# for t in missing_cell_atlas_types:
#     parent = cell_atlas_types[cell_atlas_types.Child==t].Parent.iloc[0]
#     add_term(t, parent)

### Add more t-types and information (brain region and paper reference) from the google sheet

In [None]:
set(cell_types_g_sheet["Brain Reigion (CCFv3 aligned)"])

In [None]:
# Maps a value of the sheet's "Brain Reigion (CCFv3 aligned)" column to the
# label(s) that are looked up as rdfs:label in the Brain Region Ontology.
# A plain string means one region; a list means the sheet value covers several
# regions, each of which gets its own canHaveBrainRegion restriction.
# Keys must match the sheet values character for character (note the trailing
# space in 'Field CA1, Prosubiculum ' and the 'Enthorinal' spellings, which map
# to the 'Entorhinal' labels).
brain_regions = {
 'Agranular insular area': 'Agranular insular area',
 'Area prostriata': 'Area prostriata',
 'Cerebellum': 'Cerebellum',
 'Cerebral Cortex': 'Cerebral cortex',
 'Cerebral Cortex - Hippocampal formation': ['Cerebral cortex', 'Hippocampal formation'],
 'Dentate gyrus': 'Dentate gyrus',
 'Enthorinal area, lateral part': 'Entorhinal area, lateral part',
 'Enthorinal area, medial part, dorsal zone': 'Entorhinal area, medial part, dorsal zone',
 'Entorhinal area': 'Entorhinal area',
 'Entorhinal area, lateral part': 'Entorhinal area, lateral part',
 'Entorhinal area, medial part, dorsal zone': 'Entorhinal area, medial part, dorsal zone',
 'Field CA1': 'Field CA1',
 'Field CA1, Prosubiculum ': ['Field CA1', 'Prosubiculum'],
 'Field CA2, Induseum griseum, Fasciola cinerea': ["Field CA2", "Induseum griseum", "Fasciola cinerea"],
 'Field CA3': 'Field CA3',
 'Hippocampal formation': 'Hippocampal formation',
 'Hippocampo-amygdalar transition area': 'Hippocampo-amygdalar transition area',
 'Hypothalamus': 'Hypothalamus',
 'Isocortex - Entorhinal area': ['Isocortex', 'Entorhinal area'],
 'Parasubiculum': 'Parasubiculum',
 'Postsubiculum - Presubiculum': ['Postsubiculum', 'Presubiculum'],
 'Prosubiculum': 'Prosubiculum',
 'Retrohippocampal region': 'Retrohippocampal region',
 'Retrosplenial area - Anterior cingulate area': ['Retrosplenial area', 'Anterior cingulate area'],
 'Retrosplenial area, ventral part- Postsubiculum - Presubiculum': ['Retrosplenial area, ventral part', 'Postsubiculum', 'Presubiculum'],
 'Striatum': 'Striatum',
 'Subiculum': 'Subiculum',
 'Thalamus': 'Thalamus'
}

In [None]:
# Remove existing canHaveBrainRegion restrictions from every t-type listed in
# the sheet before new ones are added (presumably so that re-running the
# notebook does not stack duplicate restrictions). Terms are matched by
# xsd:string-typed label.
for _, row in cell_types_g_sheet.iterrows():
    label = row["T Type"]
    label_literal = rdflib.term.Literal(label, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string'))
    # list(): remove_relationship mutates the graph this query iterates over.
    for s, _, _ in list(cell_type_ontology.triples((None, RDFS.label, label_literal))):
        cell_type_ontology = remove_relationship(cell_type_ontology, s, rdflib.term.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveBrainRegion"))

In [None]:
# For every t-type row of the sheet (except "Macrophage" and "Microglia"):
#   * create the term if its label is not yet in `labels`, otherwise reuse the
#     existing term carrying that label as an xsd:string literal;
#   * attach the paper reference as rdfs:seeAlso;
#   * attach one canHaveBrainRegion restriction per mapped brain region.
# A KeyError is raised if a "Source of transcriptome" value has no entry in
# `references`.
for _, row in cell_types_g_sheet.iterrows():
    label = row["T Type"]
    if label in ["Macrophage", "Microglia"]:
        continue
    see_also = references[row["Source of transcriptome"]]
    triples_to_add = set()
    if label not in labels:
        # '/' is replaced as well as spaces to build the IRI fragment.
        new_s = rdflib.URIRef(f"https://bbp.epfl.ch/ontologies/core/celltypes/{label.replace(' ', '').replace('/', '_')}")
        triples_to_add.add((new_s, RDFS.subClassOf, rdflib.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/BrainCellTranscriptomeType")))
        triples_to_add.add((new_s, RDFS.label, rdflib.term.Literal(label, lang='en')))
        triples_to_add.add((new_s, RDF.type, OWL.Class))
    else:
        # Reset first: `labels` ignores language tags / datatypes, so the label
        # can be known without any xsd:string-typed match. Without the reset
        # the term of the previous row would silently be reused.
        new_s = None
        for s, _, _ in cell_type_ontology.triples((None, RDFS.label, rdflib.term.Literal(label, datatype=rdflib.term.URIRef('http://www.w3.org/2001/XMLSchema#string')))):
            new_s = s  # if several terms share the label, the last one wins
        if new_s is None:
            print(f"Label {label!r} exists but no term carries it as an xsd:string literal; skipping")
            continue

    triples_to_add.add((new_s, RDFS.seeAlso, rdflib.URIRef(see_also)))
    # Column name spelling is kept as-is: it is the DataFrame column key.
    region_key = row["Brain Reigion (CCFv3 aligned)"]
    if region_key in brain_regions:
        brain_region_label = brain_regions[region_key]
        # A single label and a list of labels are handled by the same loop.
        if not isinstance(brain_region_label, list):
            brain_region_label = [brain_region_label]
        for br in brain_region_label:
            # Reset per region so a failed lookup cannot reuse the previous one.
            brain_region = None
            for s, _, _ in brain_region_ontology.triples((None, RDFS.label, rdflib.term.Literal(br))):
                brain_region = s
            if brain_region is None:
                print(f"No brain region labelled {br!r} found in the Brain Region Ontology; skipping")
                continue
            cell_type_ontology = add_relationship(cell_type_ontology, new_s, rdflib.term.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/canHaveBrainRegion"), brain_region)
    for el in triples_to_add:
        cell_type_ontology.add(el)

### Add more m-types from the google sheet

In [None]:
m_types_g_sheet = pd.read_excel(f"{base}/mmb_month5/Cell Types and Missing Data - Version 1.xlsx", sheet_name="Functional Types")

In [None]:
# NOTE(review): this cell repeats the body of `add_term` at top level. It does
# not define `label` or `parent_label` itself, so it runs with whatever values
# earlier cells left in those names and adds one class built from them
# (English rdfs:label, owl:Class type, and a subClassOf link to every term
# whose English label equals `parent_label`). It looks like a leftover paste
# rather than an intended step - confirm before relying on or removing it.
new_s = rdflib.URIRef(f"https://bbp.epfl.ch/ontologies/core/celltypes/{label.replace(' ', '')}")
triples_to_add = set()
if parent_label:
    for s, p, o in cell_type_ontology.triples((None, RDFS.label, rdflib.term.Literal(parent_label, lang='en'))):
        triples_to_add.add((new_s, RDFS.subClassOf, s))
triples_to_add.add((new_s, RDFS.label, rdflib.term.Literal(label, lang='en')))
triples_to_add.add((new_s, RDF.type, OWL.Class))
for el in triples_to_add:
    cell_type_ontology.add(el)

In [None]:
cell_type_ontology.namespace_manager.bind("skos", SKOS)

In [None]:
# Add one NeuronType subclass per distinct "Cell Type" entry of the form
# "<definition>(<label>)". Non-string cells and entries without "(" are
# skipped. The text before the first "(" becomes skos:definition (kept exactly
# as written, including any trailing space); the text inside the last pair of
# parentheses becomes the label, prefLabel, notation and IRI fragment.
for cell_type in set(m_types_g_sheet["Cell Type"]):
    if not isinstance(cell_type, str) or "(" not in cell_type:
        continue
    definition = cell_type.partition("(")[0]
    label = cell_type.rpartition("(")[2].partition(")")[0]
    new_s = rdflib.URIRef(f"https://bbp.epfl.ch/ontologies/core/celltypes/{label.replace(' ', '')}")
    parent = rdflib.term.URIRef("https://bbp.epfl.ch/ontologies/core/bmo/NeuronType")
    english_label = rdflib.term.Literal(label, lang='en')
    triples_to_add = {
        (new_s, RDFS.subClassOf, parent),
        (new_s, RDFS.label, english_label),
        (new_s, SKOS.prefLabel, english_label),
        (new_s, SKOS.notation, english_label),
        (new_s, RDF.type, OWL.Class),
        (new_s, SKOS.definition, rdflib.term.Literal(definition)),
    }
    for triple in triples_to_add:
        cell_type_ontology.add(triple)

## Serialize the updated Cell Types Ontology

In [None]:
# Write the updated ontology to disk. The format is stated explicitly so that
# the ``.ttl`` file really contains Turtle whatever the installed rdflib's
# default serialization format is.
cell_type_ontology.serialize(destination=f"{base}/cell-type-ontology.ttl", format="turtle")

The updated Cell Types Ontology then needs to be merged back into WebProtégé by:
1. Navigating to the [Cell Types Ontology](https://webprotege.kcp.bbp.epfl.ch/#projects/968c9144-bca3-4436-bdb5-6529d46016b9/edit/Classes)
2. Clicking on `Project` in the top right corner
3. Clicking on `Apply External Edits` 
4. Selecting the file that you have just serialised