In [None]:
import json
import rdflib
from rdflib.namespace import OWL, RDF, RDFS, XSD

import networkx as nx
import pandas as pd

from zipfile import ZipFile

from bluegraph import PandasPGFrame
from bluegraph.backends.networkx import pgframe_to_networkx

In [None]:
def remove_empty_hierarchy_nodes(frame):
    """Collapse hierarchy nodes whose ``cell_set_alias`` is empty.

    The frame is converted to a NetworkX graph. Every edge from an aliased
    node to an alias-less parent is replaced by edges from that node to the
    parent's own out-neighbours, and the scan is repeated until it produces
    no change. Isolated nodes and the remaining alias-less nodes are then
    dropped from the graph, and ``frame`` is synchronised with it: nodes and
    edges missing from the graph are removed from the frame, and edges
    created by the rewiring are added with ``@type`` set to
    ``"IS_SUBCLASS_OF"``.

    Parameters
    ----------
    frame : PandasPGFrame
        Property graph whose node table has a ``cell_set_alias`` column.
        Modified in place.

    Returns
    -------
    None
    """
    def has_alias(node):
        # Truthiness test on the stored alias; a falsy alias marks a node
        # that should be bypassed.
        return bool(frame._nodes.loc[node, "cell_set_alias"])

    graph = pgframe_to_networkx(frame)

    while True:
        edges_to_remove = set()
        edges_to_add = set()
        for child in graph.nodes():
            if not has_alias(child):
                continue
            for _, parent in graph.out_edges(child):
                if has_alias(parent):
                    continue
                edges_to_remove.add((child, parent))
                for _, grandparent in graph.out_edges(parent):
                    edges_to_add.add((child, grandparent))

        # Apply the collected changes only after the scan, so the graph is
        # never mutated while it is being iterated.
        for source, target in edges_to_remove:
            graph.remove_edge(source, target)
        for source, target in edges_to_add:
            graph.add_edge(source, target)

        if not edges_to_remove and not edges_to_add:
            break

    graph.remove_nodes_from(list(nx.isolates(graph)))
    graph.remove_nodes_from([n for n in graph.nodes() if not has_alias(n)])

    graph_nodes = graph.nodes()
    graph_edges = graph.edges()
    frame.remove_nodes([n for n in frame.nodes() if n not in graph_nodes])
    frame.remove_edges([
        (s, t) for s, t in frame.edges() if (s, t) not in graph_edges
    ])

    # Snapshot the frame's edges once: this avoids calling ``frame.edges()``
    # and searching its result again for every single graph edge.
    existing_edges = set(frame.edges())
    new_edges = [
        (s, t) for s, t in graph_edges if (s, t) not in existing_edges
    ]
    frame.add_edges(new_edges)
    for s, t in new_edges:
        frame._edges.loc[(s, t), "@type"] = "IS_SUBCLASS_OF"

In [None]:
with ZipFile("../data/allen_ttypes_data_25_11_2021/dend.json.zip", 'r') as archive:
    # Unpack every member of the archive into the data directory.
    archive.extractall("../data/allen_ttypes_data_25_11_2021/")

In [None]:
# Load the extracted hierarchy JSON into memory.
with open("../data/allen_ttypes_data_25_11_2021/dend.json", "r") as hierarchy_file:
    type_hierarchy = json.load(hierarchy_file)

In [None]:
# Module-level accumulators filled by ``extract_data``.
edges = set()  # (child accession, parent accession) pairs
nodes = {}  # accession -> attribute dict

def extract_data(hierarchy, father=None):
    """Walk ``hierarchy`` recursively and record its descendants.

    Each child contributes the first entry of its ``node_attributes`` list
    (or of ``leaf_attributes`` when ``node_attributes`` is absent). That
    entry is stored in the module-level ``nodes`` under its
    ``cell_set_accession``. When ``father`` is truthy, the pair
    ``(child accession, father)`` is added to the module-level ``edges``.

    The attributes of ``hierarchy`` itself are never recorded, so the root
    passed to the top-level call does not appear in ``nodes``.

    Parameters
    ----------
    hierarchy : dict
        Tree node; its descendants are read from the optional ``"children"``
        entry.
    father : optional
        Accession of ``hierarchy``, used as the edge target for its children.
    """
    if "children" not in hierarchy:
        return

    for child in hierarchy["children"]:
        attrs_key = (
            "node_attributes" if "node_attributes" in child
            else "leaf_attributes"
        )
        attrs = child[attrs_key][0]
        accession = attrs["cell_set_accession"]
        nodes[accession] = attrs
        # Descend before linking, keeping ``nodes`` in pre-order.
        extract_data(child, accession)
        if father:
            edges.add((accession, father))

In [None]:
# Fill the module-level ``nodes`` and ``edges`` from the loaded hierarchy.
extract_data(type_hierarchy)

In [None]:
# Node table: one row per collected attribute dict, indexed by the
# accession (exposed as "@id").
node_df = (
    pd.DataFrame(nodes.values())
    .rename(columns={"cell_set_accession": "@id"})
    .set_index("@id")
)

In [None]:
# Edge table indexed by (source, target); every collected edge is typed
# as a subclass link.
edge_df = (
    pd.DataFrame(edges, columns=["@source_id", "@target_id"])
    .assign(**{"@type": "IS_SUBCLASS_OF"})
    .set_index(["@source_id", "@target_id"])
)
frame = PandasPGFrame.from_frames(node_df, edge_df)
# Apply ``node_prop_as_category`` to every node column.
for prop in frame._nodes.columns:
    frame.node_prop_as_category(prop)

In [None]:
# Drop hierarchy nodes without a ``cell_set_alias`` and rewire their
# children past them (mutates ``frame`` in place).
remove_empty_hierarchy_nodes(frame)

In [None]:
ontology_iri = rdflib.URIRef("https://bbp.epfl.ch/ontologies/core/ttypes")
ontology_title = rdflib.Literal(
    "Cell Transcriptome Types Ontology", datatype=XSD.string)

rdf_graph = rdflib.Graph()
# Ontology header: type declaration, title, label and preferred prefix.
for predicate, value in (
    (RDF.type, OWL.Ontology),
    (rdflib.URIRef("http://purl.org/dc/elements/1.1/title"), ontology_title),
    (RDFS.label, ontology_title),
    (
        rdflib.URIRef("http://purl.org/vocab/vann/preferredNamespacePrefix"),
        rdflib.Literal("ttypes", datatype=XSD.string),
    ),
):
    rdf_graph.add((ontology_iri, predicate, value))

In [None]:
# Template for class URIs; the placeholder receives the sanitized label.
uri_template = "https://bbp.epfl.ch/ontologies/core/ttypes/{}"

def create_url(uri_template, label):
    """Build a URI by inserting a sanitized ``label`` into ``uri_template``.

    Every space, slash and comma in ``label`` is replaced by an underscore;
    all other characters are left untouched.

    Parameters
    ----------
    uri_template : str
        Format string with a single ``{}`` placeholder.
    label : str
        Human-readable label to embed in the URI.

    Returns
    -------
    str
        ``uri_template`` formatted with the sanitized label.
    """
    sanitized = label.translate(str.maketrans(" /,", "___"))
    return uri_template.format(sanitized)

In [None]:
def create_class(graph, label, name=None, taxonomy_id=None):
    """Declare an OWL class for ``label`` in ``graph``.

    The class URI is built from ``label`` with ``create_url`` and the
    module-level ``uri_template``, and is printed. The class always gets an
    ``rdf:type owl:Class`` triple and an ``rdfs:label`` triple. When
    ``name`` or ``taxonomy_id`` is truthy, it is attached as a string
    literal under the ``cell_set_alias`` or ``taxonomy_id`` predicate
    respectively.

    Parameters
    ----------
    graph : rdflib.Graph
        Graph that receives the triples.
    label : str
        Class label; also the source of the class URI.
    name : optional
        Value stored under the ``cell_set_alias`` predicate.
    taxonomy_id : optional
        Value stored under the ``taxonomy_id`` predicate.
    """
    uri = create_url(uri_template, label)
    print(uri)

    subject = rdflib.URIRef(uri)
    graph.add((subject, RDF.type, OWL.Class))
    graph.add((subject, RDFS.label, rdflib.Literal(label, datatype=XSD.string)))

    optional_annotations = (
        ("https://bbp.epfl.ch/ontologies/core/ttypes/cell_set_alias", name),
        ("https://bbp.epfl.ch/ontologies/core/ttypes/taxonomy_id", taxonomy_id),
    )
    for predicate_uri, value in optional_annotations:
        if not value:
            continue
        graph.add((
            subject,
            rdflib.URIRef(predicate_uri),
            rdflib.Literal(value, datatype=XSD.string)
        ))

In [None]:
# Declare the two generic classes, passing only a label for each.
for root_label in ("Neuron Transcriptome Type", "Allen Transcriptome Type"):
    create_class(rdf_graph, root_label)

In [None]:
# Declare one OWL class per node of the frame.
for node in frame.nodes():
    node_row = frame._nodes.loc[node]
    # NOTE(review): on a row Series, ``.name`` is the Series' own name (the
    # row label, i.e. the node id), not a "name" column -- confirm this is
    # the value intended for ``create_class``'s ``name`` argument.
    label, name, taxonomy_id = (
        node_row.cell_set_alias,
        node_row.name,
        node_row.taxonomy_id,
    )
    create_class(rdf_graph, label, name, taxonomy_id)

In [None]:
# Translate every frame edge into an rdfs:subClassOf triple between the
# class URIs derived from the aliases of its endpoints.
for child, parent in frame.edges():
    subclass_uri = create_url(
        uri_template, frame._nodes.loc[child].cell_set_alias)
    superclass_uri = create_url(
        uri_template, frame._nodes.loc[parent].cell_set_alias)
    rdf_graph.add((
        rdflib.URIRef(subclass_uri),
        RDFS.subClassOf,
        rdflib.URIRef(superclass_uri),
    ))

In [None]:
allen_type = rdflib.URIRef(
    "https://bbp.epfl.ch/ontologies/core/ttypes/Allen_Transcriptome_Type")

# The Allen root is itself a subclass of the generic neuron t-type class.
rdf_graph.add((
    allen_type,
    RDFS.subClassOf,
    rdflib.URIRef("https://bbp.epfl.ch/ontologies/core/ttypes/Neuron_Transcriptome_Type")
))

# Attach each hard-coded top-level class directly under the Allen root.
for top_level_uri in (
    "https://bbp.epfl.ch/ontologies/core/ttypes/Astrocytes",
    "https://bbp.epfl.ch/ontologies/core/ttypes/DG",
    "https://bbp.epfl.ch/ontologies/core/ttypes/Gabaergic_neurons",
    "https://bbp.epfl.ch/ontologies/core/ttypes/Glutamatergic_neurons",
    "https://bbp.epfl.ch/ontologies/core/ttypes/Immune__Vasculature",
    "https://bbp.epfl.ch/ontologies/core/ttypes/Oligodendrocytes",
):
    rdf_graph.add((rdflib.URIRef(top_level_uri), RDFS.subClassOf, allen_type))

In [None]:
# Write the assembled graph to disk in Turtle syntax.
rdf_graph.serialize(destination="../data/ttypes.ttl", format="turtle")