# Cell Types Ontology

## Imports

In [1]:
import json
import rdflib
import pandas as pd
from rdflib import RDF, RDFS, XSD, OWL, URIRef, BNode, SKOS
from rdflib.paths import OneOrMore
import pprint

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.mappings import DictionaryMapping

from bmo.utils import BMO, MBA, BRAIN_REGION_ONTOLOGY_URI, NSG, SCHEMAORG, SHACL, NXV
from bmo.ontologies import subontology_from_term

## Helper functions

In [2]:
def has_relation(graph, subject_id, relationship, object_id):
    """Tell whether ``subject_id`` has an existential restriction towards ``object_id``.

    The function looks for a node ``r`` such that the graph contains all of::

        subject_id  rdfs:subClassOf     r
        r           owl:onProperty      relationship
        r           owl:someValuesFrom  object_id

    Both the property and the filler are checked on the *same* superclass
    node. Testing the two property paths independently would also accept a
    subject that has ``relationship`` on one restriction and ``object_id`` as
    the filler of a different, unrelated restriction.

    Parameters
    ----------
    graph : rdflib.Graph
        Graph to search.
    subject_id : rdflib.term.Identifier
        Class whose ``rdfs:subClassOf`` nodes are inspected.
    relationship : rdflib.term.Identifier
        Expected object of ``owl:onProperty``.
    object_id : rdflib.term.Identifier
        Expected object of ``owl:someValuesFrom``.

    Returns
    -------
    bool
        ``True`` if at least one superclass node carries both triples.
    """
    return any(
        (restriction, OWL.onProperty, relationship) in graph
        and (restriction, OWL.someValuesFrom, object_id) in graph
        for restriction in graph.objects(subject_id, RDFS.subClassOf)
    )

## Load Ontology

In [3]:
# Directory into which the Cell Types and Brain Region ontologies
# were downloaded from WebProtégé.
base = "../../ontologies/bbp"

## Get available Cell Types

In [4]:
# Load the Cell Types ontology from the directory configured in `base`, so that
# changing `base` is enough to point the notebook at another download location.
cell_type_ontology = rdflib.Graph()
cell_type_ontology.parse(f"{base}/celltypes.ttl", format="turtle")

<Graph identifier=N83a4d2729df341cc9b561ccabe293484 (<class 'rdflib.graph.Graph'>)>

## Load Excel file

In [5]:
# sheet_name=None loads every sheet; the result is indexed by sheet name below.
hip_dict = pd.read_excel(
    "../data/cell_metypes/CellTypesandMissingData-Version2-20230614_cleaned.xlsx",
    sheet_name=None,
)

In [6]:
# Sheets of the workbook that are ignored when collecting acronyms.
skiping = [
    'Etypes',
    'Mtypes',
    'hippocampus',
    'hippocampus - abbreviations',
    'cortex',
    'Notes',
    'Percent inhibitory by region',
    'basal ganglia',
    'basal ganglia simple',
]

## Add to the Cell Types Ontology

### All labels from Cell Type Ontology

In [7]:
# Accumulators for the rdfs:label, skos:prefLabel and skos:notation values
# found in the Cell Types ontology (filled in by the next cell).
all_labels, all_prefLables, all_notations = [], [], []

### Get all labels

In [8]:
# Harvest the string form of every label-like literal, one predicate at a time,
# into the matching accumulator list.
for predicate, bucket in (
    (RDFS.label, all_labels),
    (SKOS.prefLabel, all_prefLables),
    (SKOS.notation, all_notations),
):
    for s, p, o in cell_type_ontology.triples((None, predicate, None)):
        bucket.append(str(o))
    

In [9]:
# Acronyms gathered from the spreadsheet rows (filled in by the next cell).
all_acronyms = list()

In [10]:
# Walk every non-skipped sheet, grouped by brain region, and record the
# 'Acronym' of each row whose Nexus status cell is filled in.
status_column = 'Nexus Status (N=Not Ready ,Y=Yes ready,D=Done)'
for region in hip_dict:
    if region in skiping:
        continue
    rgn_df = hip_dict[region]
    brain_regions = rgn_df['BBP ATLAS Brain Region'].dropna().unique()
    for br in brain_regions:
        sub_df = rgn_df.loc[rgn_df['BBP ATLAS Brain Region'] == br]
        for irow, row in sub_df.iterrows():
            status = row[status_column]
            # NOTE(review): any non-empty status is accepted here, including
            # 'N' (Not Ready) -- confirm that this is intended.
            valid = False if pd.isna(status) else status.lower()
            if valid:
                all_acronyms.append(row['Acronym'])

In [11]:
# Every string by which a cell type can be referred to in the ontology.
all_possible_labels = {*all_labels, *all_prefLables, *all_notations}

In [12]:
# Acronyms that occur more than once in the spreadsheet, each reported once in
# order of first repetition. Testing `s not in set(all_acronyms)` can never be
# true, so duplicates are tracked with a running `seen` set instead.
# Missing (NaN) acronyms are ignored, as in the `missing` check further down.
seen = set()
repeated = []
for acronym in all_acronyms:
    if pd.isna(acronym):
        continue
    if acronym in seen and acronym not in repeated:
        repeated.append(acronym)
    seen.add(acronym)

In [13]:
# Fail loudly, naming the offending values, if any acronym was flagged as repeated.
assert not repeated, f"Repeated acronyms found in the spreadsheet: {repeated}"

In [19]:
# Acronyms from the spreadsheet that match no label, prefLabel or notation of
# the ontology; empty (NaN) acronym cells are left out.
missing = [
    acronym
    for acronym in all_acronyms
    if not (acronym in all_possible_labels or pd.isna(acronym))
]

In [20]:
# Fail loudly, naming the offending values, if any acronym is unknown to the ontology.
assert not missing, f"Acronyms missing from the Cell Types ontology: {missing}"