In [1]:
import pandas as pd
from bioregistry import get_iri, normalize_curie
from rdflib import Graph, URIRef, Literal, Namespace, BNode
from rdflib.namespace import RDF, RDFS, XSD, SKOS, FOAF

* 'underscore_attrs_are_private' has been removed


In [2]:
import io
import pydotplus
from IPython.display import display, Image
from rdflib.tools.rdf2dot import rdf2dot


def visualize(g):
    """Render an rdflib graph inline as a PNG image.

    The graph is written as Graphviz DOT text by ``rdf2dot``, parsed with
    ``pydotplus``, converted to PNG bytes and shown through IPython's
    ``display``.

    Parameters
    ----------
    g : rdflib.Graph
        The graph to draw.
    """
    dot_buffer = io.StringIO()
    # ``opts`` is left at its default: rdf2dot takes an options mapping there,
    # not a set holding the ``display`` function.
    rdf2dot(g, dot_buffer)
    dot_graph = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
    png_bytes = dot_graph.create_png()
    display(Image(png_bytes))

In [3]:
# Load the combined annotation table, keep only the columns needed for the RDF
# export and drop every row that is missing any of them.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
# Built as one non-mutating chain (no ``inplace=True``) so the cell is idempotent
# and cannot trigger SettingWithCopyWarning on the column-sliced frame.
df = pd.read_pickle("../examples/combined_df.pkl")[
    ["identifier", "identifier.source", "target", "target.source", "Bgee", "DisGeNET"]
].dropna()
df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee,DisGeNET
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C2350344', 'disease_name..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C0751882', 'disease_name..."


In [4]:
# Take the row at position 36 as a Series and turn it back into a one-row frame
# (Series -> single-column frame -> transpose).
play_df = df.iloc[36].to_frame().T

In [5]:
# Show the single-row ``play_df`` frame as the cell output.
play_df

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee,DisGeNET
36,VAMP1,HGNC,ENSG00000139190,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C0751882', 'disease_name..."


In [11]:
# Namespaces used to mint the URIs below
SIO = Namespace("http://semanticscience.org/resource/")
HGNC = Namespace("http://bio2rdf.org/hgnc:")
OBO = Namespace("http://purl.obolibrary.org/obo/")
UMLS = Namespace("http://linkedlifedata.com/resource/umls/id/")
ENSEMBL = Namespace("https://identifiers.org/ensembl:")

# Create an RDF graph and register the prefixes used in the serialized output
g = Graph()
g.bind("foaf", FOAF)
g.bind("skos", SKOS)
g.bind("sio", SIO)

# Base nodes. None of them depends on the row being processed, so they are
# built once here instead of on every loop iteration.
disease_base_node = OBO["NCIT_C7057"]  # DiseaseOrDisorder
gene_disease_association = SIO["SIO_000983"]  # GeneDiseaseAssociation
gene_base_node = OBO["NCIT_C16612"]  # Gene
gene_symbol_base_node = OBO["NCIT_C43568"]  # GeneSymbol
source_base_node = URIRef("http://www.w3.org/ns/dcat#Dataset")  # Source
data_source_base_node = OBO["SLSO_0001122"]  # DataSource
score_base_node = OBO["NCIT_C25338"]  # Score

# Every association handled below comes from the DisGeNET column.
data_source_name = Literal("DisGeNET", datatype=XSD.string)

# NOTE(review): all triples are attached to the shared base nodes above, so the
# labels, scores and sources of different disease records (and of different
# rows) accumulate on the same subjects and cannot be told apart afterwards.
# Per-association nodes (e.g. BNode) would be needed to keep them separate.
for (
    source_idx,
    source_namespace,  # not used yet
    target_idx,
    target_namespace,  # not used yet
    bgee_expression_data,  # not used yet
    disgenet_disease_data,
) in play_df.itertuples(index=False, name=None):
    # Gene: Ensembl identifier
    gene_ensembl_uri = ENSEMBL[target_idx]
    g.add((gene_base_node, SKOS.exactMatch, gene_ensembl_uri))

    # Gene symbol: HGNC identifier, linked to the gene
    gene_symbol_uri = HGNC[source_idx]
    g.add((gene_symbol_base_node, SKOS.exactMatch, gene_symbol_uri))
    g.add((gene_base_node, SIO.is_represented_by, gene_symbol_base_node))

    for data in disgenet_disease_data:
        # The graph is a set of triples, so re-adding the record-independent
        # triples on each iteration is a no-op after the first time.
        g.add((gene_disease_association, SIO.refers_to, gene_base_node))
        g.add((gene_disease_association, SIO.refers_to, disease_base_node))

        # Disease name and identifier
        disease_label = Literal(data["disease_name"], datatype=XSD.string)
        g.add((disease_base_node, RDFS.label, disease_label))

        # bioregistry returns None when it cannot build an IRI for the CURIE;
        # URIRef(None) would fail, so only the exactMatch triple is skipped.
        disease_iri = get_iri(data["disease_id"])
        if disease_iri is not None:
            disease_node = URIRef(disease_iri)
            g.add((disease_base_node, SKOS.exactMatch, disease_node))

        # Association score
        score_val = Literal(data["score"], datatype=XSD.double)
        g.add((gene_disease_association, SIO.has_measured_value, score_base_node))
        g.add((score_base_node, SIO.has_value, score_val))

        # Evidence source reported for this record
        source_name = Literal(data["evidence_source"], datatype=XSD.string)
        g.add((gene_disease_association, SIO.has_source, source_base_node))
        g.add((source_base_node, SIO.has_value, source_name))

        # Database the record was taken from
        g.add((gene_disease_association, SIO.has_source, data_source_base_node))
        g.add((data_source_base_node, SIO.has_value, data_source_name))

In [12]:
# Write the graph to ``example.ttl`` in Turtle syntax.
g.serialize(destination="example.ttl", format="turtle")

<Graph identifier=N32090b00238a462dbba97e4e7722a8f9 (<class 'rdflib.graph.Graph'>)>