In [24]:
import warnings

import pandas as pd
from bioregistry import get_iri, normalize_curie
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS, XSD, SKOS, FOAF

In [25]:
# Load the combined dataframe and keep only the columns used for the RDF export,
# dropping every row that has a missing value in any of them.
# Built as a single chained expression (no inplace=True on a derived frame) so the
# cell can be re-run safely and does not trigger pandas' SettingWithCopyWarning.
# NOTE(review): read_pickle can execute arbitrary code while loading; only use it
# on pickle files from a trusted source.
df = (
    pd.read_pickle("../examples/combined_df.pkl")[
        ["identifier", "identifier.source", "target", "target.source", "Bgee", "DisGeNET"]
    ]
    .dropna()
)
df.head(2)

Unnamed: 0,identifier,identifier.source,target,target.source,Bgee,DisGeNET
0,AHR,HGNC,ENSG00000106546,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C2350344', 'disease_name..."
1,ALG14,HGNC,ENSG00000172339,Ensembl,"[{'anatomical_entity_id': 'UBERON_0000178', 'a...","[{'disease_id': 'umls:C0751882', 'disease_name..."


In [26]:
from rdflib import Graph, Namespace, URIRef, Literal, RDF, RDFS, XSD
import pandas as pd

# Namespaces used to build IRIs and to abbreviate them in serialized output
SIO = Namespace("http://semanticscience.org/resource/")
HGNC = Namespace("http://bio2rdf.org/hgnc:")
OBO = Namespace("http://purl.obolibrary.org/obo/")
UMLS = Namespace("https://uts-ws.nlm.nih.gov/rest/semantic-network/2015AB/CUI/")
ENSEMBL = Namespace("https://identifiers.org/ensembl:")
DCAT = Namespace('http://www.w3.org/ns/dcat#')
BIODATAFUSE = Namespace('https://biodatafuse.org/')  # TODO decide internal identifier for each node

# The RDF graph, with one prefix binding per namespace that should be abbreviated
g = Graph()
for _prefix, _namespace in (
    ("foaf", FOAF),
    ("skos", SKOS),
    ("sio", SIO),
    ("hgnc", HGNC),
    ("obo", OBO),
    ("umls", UMLS),
    ("dcat", DCAT),
    ("biodatafuse", BIODATAFUSE),  # TODO decide curie name
):
    g.bind(_prefix, _namespace)

# Constant schema: base IRIs under which per-row nodes are created.
# Indexing a Namespace yields the URIRef of the namespace IRI followed by the key.
disease_base_node = BIODATAFUSE["disease"]  # DiseaseOrDisorder
gene_disease_association = BIODATAFUSE["gene_disease_association"]  # GeneDiseaseAssociation
gene_base_node = BIODATAFUSE["gene"]  # Gene
gene_symbol_base_node = OBO["NCIT_C43568"]  # GeneSymbol
source_base_node = BIODATAFUSE["source"]  # Source
data_source_base_node = BIODATAFUSE["datasource"]  # DataSource
score_base_node = BIODATAFUSE["score"]  # Score
experimental_process_node = BIODATAFUSE["experimental_process"]  # ExperimentalProcess
anatomical_entity_base_node = BIODATAFUSE["anatomical_entity"]  # AnatomicalEntity
life_cycle_base_node = BIODATAFUSE["life_cycle"]  # LifeCycle
gene_expression_value_base_node = BIODATAFUSE["gene_expression_value"]  # GeneExpressionValue
gene_ensembl_uri = URIRef("https://identifiers.org/")

# Populate the graph: one gene node per dataframe row, with the row's DisGeNET
# gene-disease associations and Bgee expression records attached to it.
#
# NOTE(review): predicates such as SIO.refers_to / SIO.has_value produce IRIs built
# from labels (e.g. <.../resource/has_value>); confirm these are the IRIs you want
# rather than the numeric SIO identifiers.


def _resolve_iri(identifier, underscore_separated=False):
    """Resolve an identifier to a URIRef via bioregistry, or return None.

    Parameters
    ----------
    identifier : a CURIE string; any non-string value (e.g. NaN) yields None.
    underscore_separated : when True, every "_" in the identifier is replaced
        by ":" before resolution (the Bgee identifiers are written that way).

    Returns
    -------
    URIRef for the resolved IRI, or None when ``get_iri`` gives no IRI, so the
    caller can skip the record instead of handing None to URIRef.
    """
    if not isinstance(identifier, str):
        return None
    curie = identifier.replace("_", ":") if underscore_separated else identifier
    iri = get_iri(curie)
    return URIRef(iri) if iri else None


gene_ncit = URIRef(f'{OBO}NCIT_C16612')
g.add((gene_ncit, RDFS.label, Literal('Gene')))

# The DisGeNET data source is the same for every association, so build it once.
data_source_name = Literal("DisGeNET", datatype=XSD.string)
data_source_url = URIRef("https://disgenet.com/")
data_source_node = URIRef(data_source_url)  # TODO not sure what you intend here?

for i, data_row in enumerate(df.values):
    (
        source_idx,
        source_namespace,
        target_idx,
        target_namespace,
        bgee_expression_data,
        digenet_disease_data,
    ) = data_row
    id_number = f"{i:06d}"  # Zero-padded row number, used in every IRI minted for this row

    # Create and type the gene node
    gene_node = URIRef(f"{gene_base_node}/{id_number}/")
    g.add((gene_node, RDF.type, OBO.NCIT_C16612))

    # TODO gene symbol node: where is the gene symbol in the table? Once decided:
    #   - type it as OBO.NCIT_C43568
    #   - link it to gene_node (SKOS.exactMatch? SIO.is_represented_by?)
    # TODO the external identifier IRIs are not used yet; they can be built with
    #   URIRef(get_iri(normalize_curie(f"{source_namespace}:{source_idx}")))
    #   URIRef(get_iri(normalize_curie(f"{target_namespace}:{target_idx}")))

    # --- DisGeNET: gene-disease associations ---
    for data in digenet_disease_data:
        disease_id = data["disease_id"]
        if pd.isna(disease_id):
            continue

        disease_node = _resolve_iri(disease_id)
        if disease_node is None:
            warnings.warn(
                f"Skipping DisGeNET record for {source_idx}: cannot resolve {disease_id!r} to an IRI"
            )
            continue

        g.add((disease_node, RDF.type, OBO.NCIT_C7057))
        g.add((disease_node, RDFS.label, Literal(data["disease_name"], datatype=XSD.string)))

        # Association node linking the gene and the disease
        gene_disease_assoc_node = URIRef(
            f"{gene_disease_association}/{id_number}/{source_idx}_assoc_{disease_id}"
        )
        g.add((gene_disease_assoc_node, RDF.type, SIO.SIO_000983))
        g.add((gene_disease_assoc_node, SIO.refers_to, gene_node))
        g.add((gene_disease_assoc_node, SIO.refers_to, disease_node))

        # Association score
        score_node = URIRef(f"{score_base_node}/{id_number}/{source_idx}_{disease_id}")
        score_val = Literal(data["score"], datatype=XSD.double)
        g.add((score_node, RDF.type, OBO.NCIT_C25338))
        g.add((score_node, SIO.has_value, score_val))
        g.add((gene_disease_assoc_node, SIO.has_measured_value, score_node))

        # Evidence source reported for this association
        source_name = Literal(data["evidence_source"], datatype=XSD.string)
        evidence_source_node = URIRef(f"{source_base_node}/{id_number}/{source_idx}_{disease_id}")
        g.add((evidence_source_node, RDF.type, DCAT.Dataset))
        g.add((evidence_source_node, RDFS.label, source_name))
        g.add((gene_disease_assoc_node, SIO.has_source, evidence_source_node))

        # Data source the record was retrieved from (DisGeNET itself)
        g.add((data_source_node, RDF.type, OBO.SLSO_0001122))
        g.add((data_source_node, RDFS.label, data_source_name))
        g.add((gene_disease_assoc_node, SIO.has_source, data_source_node))

    # --- Bgee: gene expression per anatomical entity / developmental stage ---
    for data in bgee_expression_data:
        anatomical_entity_id = data["anatomical_entity_id"]
        if pd.isna(anatomical_entity_id):
            continue

        anatomical_entity_node = _resolve_iri(anatomical_entity_id, underscore_separated=True)
        developmental_stage_node = _resolve_iri(
            data["developmental_stage_id"], underscore_separated=True
        )
        if anatomical_entity_node is None or developmental_stage_node is None:
            warnings.warn(
                f"Skipping Bgee record for {source_idx}: cannot resolve "
                f"{anatomical_entity_id!r} / {data['developmental_stage_id']!r} to an IRI"
            )
            continue

        # NOTE(review): the process and life-cycle IRIs depend only on the row, so all
        # Bgee records of one gene share them — confirm this is intended.
        exp_process_node = URIRef(f"{experimental_process_node}/{id_number}/{source_idx}")
        life_cycle_node = URIRef(f"{life_cycle_base_node}/{id_number}/{source_idx}")
        gene_expression_value_node = URIRef(
            f"{gene_expression_value_base_node}/{id_number}/{source_idx}_{anatomical_entity_id}"
        )

        g.add((gene_node, SIO.is_associated_with, gene_expression_value_node))
        g.add((gene_node, SIO.is_associated_with, anatomical_entity_node))
        g.add((gene_node, SIO.is_associated_with, life_cycle_node))

        g.add((gene_expression_value_node, RDF.type, SIO.SIO_001077))
        g.add((
            gene_expression_value_node,
            SIO.has_value,
            Literal(data["expression_level"], datatype=XSD.double),
        ))

        g.add((anatomical_entity_node, RDF.type, OBO.UBERON_0001062))
        g.add((
            anatomical_entity_node,
            RDFS.label,
            Literal(data["anatomical_entity_name"], datatype=XSD.string),
        ))

        g.add((
            developmental_stage_node,
            RDFS.label,
            Literal(data["developmental_stage_name"], datatype=XSD.string),
        ))
        # Only the per-row life-cycle node is matched to the stage. The shared
        # life_cycle_base_node must not be matched to individual stages.
        g.add((life_cycle_node, SKOS.exactMatch, developmental_stage_node))
        g.add((life_cycle_node, RDF.type, developmental_stage_node))

        # NOTE(review): the process gets the same type as the expression value
        # (SIO_001077) — confirm whether a distinct process class was intended.
        g.add((exp_process_node, RDF.type, SIO.SIO_001077))
        g.add((exp_process_node, SIO.has_input, gene_node))
        g.add((exp_process_node, SIO.has_output, gene_expression_value_node))
        g.add((exp_process_node, SIO.has_input, anatomical_entity_node))
        g.add((exp_process_node, SIO.has_input, life_cycle_node))

print(g.serialize(format="turtle"))


@prefix biodatafuse: <https://biodatafuse.org/> .
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix sio: <http://semanticscience.org/resource/> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

<https://biodatafuse.org/experimental_process/000000/AHR> a sio:SIO_001077 ;
    sio:has_input obo:UBERON_0000178,
        <https://biodatafuse.org/gene/000000/>,
        <https://biodatafuse.org/life_cycle/000000/AHR> ;
    sio:has_output <https://biodatafuse.org/gene_expression_value/000000/AHR_UBERON_0000178> .

<https://biodatafuse.org/experimental_process/000001/ALG14> a sio:SIO_001077 ;
    sio:has_input obo:UBERON_0000178,
        <https://biodatafuse.org/gene/000001/>,
        <https://biodatafuse.org/life_cycle/000001/ALG14> ;
    sio:has_output <https://biodatafuse.org/gene_expression_value/000001/ALG14_UBERON_0000178> .


In [27]:
# Write the graph to a Turtle file; assigning the return value to `k` keeps the
# cell from echoing it as output.
k = g.serialize(
    destination="biodatafuse-exp1.ttl",
    format="turtle",
    encoding="utf-8",
)