In [None]:
import getpass
import requests
import json
import rdflib

from collections import defaultdict

from io import StringIO
import pandas as pd

from rdflib import RDF, RDFS, OWL, SKOS, XSD

from bluegraph import PandasPGFrame
from bluegraph.backends.neo4j import pgframe_to_neo4j

from bmo.ontologies import subontology_from_term
from bmo.kbs.gene_kbs import get_gene_data

## Get gene data

In [None]:
# Read the source CSV into a DataFrame; it must contain a "Gene" column,
# which the next cell turns into the index.
data = pd.read_csv("../data/ncmv3_transcription_data.csv")

In [None]:
# Index the table by gene name. The guard makes this cell safe to re-run:
# once "Gene" is the index it is no longer a column, so calling set_index
# a second time would raise KeyError.
if data.index.name != "Gene":
    data = data.set_index("Gene")

In [None]:
# Species identifier passed to get_gene_data for every gene lookup.
SPECIES = 'mus_musculus'

In [None]:
# The gene identifiers are the index values of `data`; show how many there are.
gene_selection = data.index.tolist()
len(gene_selection)

In [None]:
# Restrict to the first 20 genes (presumably to keep the per-gene lookups
# quick while testing -- TODO confirm; remove the slice for a full run).
gene_selection = gene_selection[:20]

In [None]:
# Fetch one record per selected gene with get_gene_data, keyed by gene name.
# Lookups are best-effort: a gene whose lookup fails is reported and skipped
# so that the remaining genes are still processed.
records = {}
for gene in gene_selection:
    if gene in records:
        # Already fetched (duplicate entry in gene_selection).
        continue
    try:
        records[gene] = get_gene_data(gene, SPECIES)
    except Exception as error:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop a long run, and
        # include the reason so skipped genes can be diagnosed.
        print("Skipped: ", gene, f"({type(error).__name__}: {error})")

In [None]:
# Save the fetched records as JSON so a later cell can reload them from disk.
with open("../data/gene_non_zero_expression.json", "w") as f:
    json.dump(records, f)

In [None]:
# Number of genes for which a record was fetched.
len(records)

## Merge gene data with the Gene Ontology

In [None]:
# Load the gene records from the JSON file; `records` is rebound to the
# deserialized content.
with open("../data/gene_non_zero_expression.json", "r") as f:
    records = json.load(f)

In [None]:
# Parse the Turtle file go.ttl into an in-memory rdflib graph.
go = rdflib.Graph()
go.parse("../../ontologies/external/go.ttl", format="turtle")

In [None]:
# Map every GO term id referenced by the gene records to a subject of the
# `go` graph whose oboInOwl#id literal equals that id (the first subject the
# graph yields). Ids without a matching subject are left out of the mapping.
obo_id_predicate = rdflib.URIRef("http://www.geneontology.org/formats/oboInOwl#id")

go_terms_to_include = {}
for record in records.values():
    if "go_edges" not in record:
        continue
    for _, term_id in record["go_edges"]:
        if term_id in go_terms_to_include:
            continue
        matching_subjects = go.subjects(
            obo_id_predicate,
            rdflib.Literal(term_id, datatype=XSD.string))
        first_subject = next(iter(matching_subjects), None)
        if first_subject is not None:
            go_terms_to_include[term_id] = first_subject

In [None]:
# Accumulate into a single graph the sub-ontology that subontology_from_term
# returns for each selected GO term.
selected_go = rdflib.Graph()
for term_uri in go_terms_to_include.values():
    selected_go += subontology_from_term(
        go, term_uri, top_down=False, closed=True)

In [None]:
# Convert the selected GO subgraph into a property-graph frame.
frame = PandasPGFrame.from_ontology(rdf_graph=selected_go, remove_prop_uris=True)
# frame.remove_node_properties("http://purl.obolibrary.org/obo/IAO_0000589")
frame.remove_isolated_nodes()
# Rename the "id" node property to "go_id"; a later cell looks up GO term
# nodes through the "go_id" column.
frame.rename_node_properties({"id": "go_id"})

In [None]:
# Add a node with id "Gene"; a later cell links every gene node to it.
frame.add_nodes(["Gene"])

In [None]:
# One row per gene record. The preferred label is copied into both the node
# identifier column ("@id") and the "label" column.
gene_df = (
    pd.DataFrame(records.values())
    .assign(**{
        "@id": lambda df: df["prefLabel"],
        "label": lambda df: df["prefLabel"],
    })
    .dropna()                # discard rows with any missing value
    .drop_duplicates("@id")  # keep the first row for each identifier
)

In [None]:
# Keep each gene id together with its GO edge list; the next cell drops
# "go_edges" from gene_df.
edges = gene_df[["@id", "go_edges"]]

In [None]:
# Register the genes as nodes of the frame and attach every remaining column
# of gene_df (all but "@id") as a node property.
gene_df = gene_df.drop(columns=["go_edges"])
# Flatten each xrefs mapping into a set of "key:value" strings.
gene_df["xrefs"] = gene_df["xrefs"].apply(
    lambda xref_map: {f"{db}:{ref}" for db, ref in xref_map.items()})
frame.add_nodes(gene_df["@id"])
property_columns = [column for column in gene_df.columns if column != "@id"]
for column in property_columns:
    frame.add_node_properties(gene_df[["@id", column]])

In [None]:
# Connect every gene node to the "Gene" node and tag each of those edges
# with the IS_INSTANCE_OF type.
instance_edges = [(gene_id, "Gene") for gene_id in gene_df["@id"]]
frame.add_edges(instance_edges)
for edge in instance_edges:
    frame._edges.loc[edge, "@type"] = {"IS_INSTANCE_OF"}

In [None]:
# Link every gene to the GO terms listed in its "go_edges" entry. Each entry
# is unpacked as a (relation, GO term id) pair; the relation name, with spaces
# replaced by underscores, becomes an edge type.

# Build the go_id -> node id lookup once instead of re-filtering the whole
# node table for every (gene, GO term) pair. setdefault keeps the first node
# carrying a given go_id, which is the node `.index[0]` on the filtered table
# would select. Only string ids are indexed (this skips missing values);
# assumes GO term ids in the records are strings -- TODO confirm.
node_by_go_id = {}
if "go_id" in frame._nodes.columns:
    for node_id, node_go_id in frame._nodes["go_id"].items():
        if isinstance(node_go_id, str):
            node_by_go_id.setdefault(node_go_id, node_id)

go_edges = defaultdict(set)
unmatched_go_terms = set()
for row in edges.to_dict("records"):
    source = row["@id"]
    for rel, go_term in row["go_edges"]:
        if go_term not in node_by_go_id:
            # The term has no node in the frame: skip the edge, but remember
            # the term so the omission is reported instead of silently lost.
            unmatched_go_terms.add(go_term)
            continue
        target = node_by_go_id[go_term]
        go_edges[(source, target)].add(rel.replace(" ", "_"))

if unmatched_go_terms:
    print(
        f"{len(unmatched_go_terms)} GO term id(s) have no node in the frame; "
        "edges to them were skipped")

frame.add_edges(go_edges.keys())
for e, types in go_edges.items():
    frame._edges.loc[e, "@type"] = types

# Replace spaces in node property names with underscores.
frame.rename_node_properties({
    p: p.replace(" ", "_") for p in frame.node_properties()
})

## Export to Neo4j

In [None]:
# Connection settings used by the Neo4j export: Bolt URI on localhost and
# the database user name.
uri = "bolt://127.0.0.1:7687"
username = "neo4j"

In [None]:
# Prompt for the password interactively instead of writing it in the notebook.
password = getpass.getpass()

In [None]:
# Export the assembled frame to the Neo4j instance at `uri`, using the
# credentials entered above.
pgframe_to_neo4j(
    frame, uri=uri, username=username, password=password, 
    node_label="TEST_ONTOLOGY_CLASS", 
#     node_types_as_labels=True,
    edge_types_as_labels=True)

In [None]:
# Display the frame's node table for inspection.
frame._nodes