In [1]:
import pandas as pd
from bioregistry import get_iri, normalize_curie
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD, SKOS, FOAF

* 'underscore_attrs_are_private' has been removed


In [2]:
# Load the combined example frame and keep only the columns consumed by the
# RDF-building cell, dropping every row that has a missing value in any of them.
# The selection and the dropna are chained so no intermediate frame is mutated
# in place (in-place dropna on a column-selected frame can trigger pandas'
# SettingWithCopyWarning) and the cell gives the same result on every re-run.
# NOTE: unpickling can execute code embedded in the file; only load pickles
# that come from a trusted source.
df = pd.read_pickle("../examples/combined_df.pkl")[
    ["identifier", "identifier.source", "target", "target.source", "Bgee", "DisGeNET"]
].dropna()
df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee,DisGeNET
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C2350344', 'disease_name..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C0751882', 'disease_name..."


In [3]:
# Namespaces used to build IRIs
SIO = Namespace("http://semanticscience.org/resource/")
HGNC = Namespace("http://bio2rdf.org/hgnc:")
OBO = Namespace("http://purl.obolibrary.org/obo/")
# UMLS = Namespace("https://uts-ws.nlm.nih.gov/rest/semantic-network/2015AB/CUI/")
ENSEMBL = Namespace("https://identifiers.org/ensembl:")

# RDF graph that receives every triple, with prefixes registered for output
g = Graph()
g.bind("foaf", FOAF)
g.bind("skos", SKOS)
g.bind("sio", SIO)

# Constant schema nodes. Each IRI is built from the namespace it belongs to;
# the trailing comment gives the label the notebook uses for the term.
disease_base_node = OBO["NCIT_C7057"]  # DiseaseOrDisorder
gene_disease_association = SIO["SIO_000983"]  # GeneDiseaseAssociation
gene_base_node = OBO["NCIT_C16612"]  # Gene
gene_symbol_base_node = OBO["NCIT_C43568"]  # GeneSymbol
source_base_node = URIRef("http://www.w3.org/ns/dcat#Dataset")  # Source
data_source_base_node = OBO["SLSO_0001122"]  # DataSource
score_base_node = OBO["NCIT_C25338"]  # Score
experimental_process_node = SIO["SIO_001077"]  # ExperimentalProcess
anatomical_entity_base_node = OBO["UBERON_0001062"]  # AnatomicalEntity
life_cycle_base_node = OBO["UBERON_0000104"]  # LifeCycle
# NOTE(review): this is the same IRI as experimental_process_node above, so the
# two names denote a single node in the graph. Confirm which SIO term each one
# is meant to use.
gene_expression_value_base_node = SIO["SIO_001077"]  # GeneExpressionValue


def _iri_node(identifier, *, underscores_to_colons=False):
    """Resolve an identifier to a ``URIRef`` through bioregistry.

    :param identifier: CURIE-like value taken from a source record.
    :param underscores_to_colons: when True, every ``_`` in the identifier is
        replaced by ``:`` before resolution (for ids such as ``UBERON_0000178``).
    :returns: the resolved ``URIRef``, or ``None`` when ``identifier`` is not a
        string (e.g. NaN/None) or ``get_iri`` yields no IRI for it.
    """
    if not isinstance(identifier, str):
        return None
    curie = identifier.replace("_", ":") if underscores_to_colons else identifier
    iri = get_iri(curie)
    return URIRef(iri) if iri else None


# Walk the frame row by row and add the gene, DisGeNET and Bgee triples to `g`.
# NOTE(review): every subject used below is one of the constant schema nodes,
# so values coming from different rows and records accumulate on the same
# subjects (the graph does not keep which score/label belongs to which gene).
# Minting per-record nodes would change the produced graph, so it is left as is.
for (
    source_idx,
    source_namespace,  # unpacked for completeness; not used below
    target_idx,
    target_namespace,  # unpacked for completeness; not used below
    bgee_expression_data,
    disgenet_disease_data,
) in df.itertuples(index=False, name=None):
    # Gene identifiers: Ensembl id and HGNC symbol
    gene_ensembl_uri = ENSEMBL[target_idx]  # URI
    g.add((gene_base_node, SKOS.exactMatch, gene_ensembl_uri))

    gene_symbol_uri = HGNC[source_idx]  # URI
    g.add((gene_symbol_base_node, SKOS.exactMatch, gene_symbol_uri))
    g.add((gene_base_node, SIO.is_represented_by, gene_symbol_base_node))

    # Gene-disease associations (DisGeNET)
    for data in disgenet_disease_data:
        disease_node = _iri_node(data["disease_id"])
        if disease_node is None:
            # Missing id, or a CURIE bioregistry cannot resolve: skip the record
            # instead of failing on URIRef(None).
            continue

        g.add((gene_disease_association, SIO.refers_to, gene_base_node))
        g.add((gene_disease_association, SIO.refers_to, disease_base_node))

        g.add((disease_base_node, RDFS.label, Literal(data["disease_name"], datatype=XSD.string)))
        g.add((disease_base_node, SKOS.exactMatch, disease_node))

        score_val = Literal(data["score"], datatype=XSD.double)
        g.add((gene_disease_association, SIO.has_measured_value, score_base_node))
        g.add((score_base_node, SIO.has_value, score_val))

        source_name = Literal(data["evidence_source"], datatype=XSD.string)
        g.add((gene_disease_association, SIO.has_source, source_base_node))
        g.add((source_base_node, SIO.has_value, source_name))

        data_source_name = Literal("DisGeNET", datatype=XSD.string)
        g.add((gene_disease_association, SIO.has_source, data_source_base_node))
        g.add((data_source_base_node, SIO.has_value, data_source_name))

    # Expression schema: experimental process inputs/outputs and gene links
    g.add((experimental_process_node, SIO.has_input, gene_base_node))
    g.add((experimental_process_node, SIO.has_output, gene_expression_value_base_node))
    g.add((experimental_process_node, SIO.has_input, anatomical_entity_base_node))
    g.add((experimental_process_node, SIO.has_input, life_cycle_base_node))

    g.add((gene_base_node, SIO.is_associated_with, gene_expression_value_base_node))
    g.add((gene_base_node, SIO.is_associated_with, anatomical_entity_base_node))
    g.add((gene_base_node, SIO.is_associated_with, life_cycle_base_node))

    # Expression records (Bgee)
    for data in bgee_expression_data:
        anatomical_entity_node = _iri_node(
            data["anatomical_entity_id"], underscores_to_colons=True
        )
        if anatomical_entity_node is None:
            # Missing or unresolvable anatomical entity: skip the record
            continue

        g.add(
            (
                gene_expression_value_base_node,
                SIO.has_value,
                Literal(data["expression_level"], datatype=XSD.double),
            )
        )

        g.add((anatomical_entity_base_node, SKOS.exactMatch, anatomical_entity_node))
        g.add(
            (
                anatomical_entity_base_node,
                RDFS.label,
                Literal(data["anatomical_entity_name"], datatype=XSD.string),
            )
        )

        # The developmental stage id is resolved separately: a missing (NaN) or
        # unresolvable stage only drops the stage triples, not the whole record.
        life_cycle_node = _iri_node(
            data["developmental_stage_id"], underscores_to_colons=True
        )
        if life_cycle_node is not None:
            g.add((life_cycle_base_node, SKOS.exactMatch, life_cycle_node))
            g.add(
                (
                    life_cycle_base_node,
                    RDFS.label,
                    Literal(data["developmental_stage_name"], datatype=XSD.string),
                )
            )

In [4]:
# Write the graph to a Turtle file in the working directory; whatever
# serialize() returns is kept in `k`.
k = g.serialize(
    destination="biodatafuse-exp1.ttl",
    format="turtle",
    encoding="utf-8",
)