In [None]:
import xml.etree.ElementTree as ET
import csv
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF
from collections import defaultdict
import os
from dotenv import load_dotenv

In [5]:
def read_tsv(file_tsv):
    """Collect the MOA / ACT source URLs listed for each drug in a TSV file.

    The file needs a header row with the columns ``DRUG_NAME``,
    ``MOA_SOURCE_URL`` and ``ACT_SOURCE_URL``. Drug names are lower-cased, so
    rows whose names differ only in case are merged into one entry.

    Args:
        file_tsv: Path of the UTF-8, tab-separated file to read.

    Returns:
        A ``defaultdict`` keyed by lower-cased drug name. Each value is a
        ``defaultdict(list)`` that may hold:

        * ``'MOA'``    - the non-empty ``MOA_SOURCE_URL`` values, in file order;
        * ``'ACT'``    - the non-empty ``ACT_SOURCE_URL`` values, in file order;
        * ``'NO_URL'`` - ``['null']``, set only for drugs that have neither
          kind of URL on any of their rows.

        A key is stored only once it holds at least one value. Rows without a
        drug name are skipped.

    Raises:
        KeyError: If a data row is read and one of the three expected columns
            is not in the header.
    """
    data = defaultdict(lambda: defaultdict(list))

    # newline='' leaves line-ending handling to the csv module, as its
    # documentation recommends for files passed to a reader.
    with open(file_tsv, 'r', encoding='utf-8', newline='') as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            drug_name = row['DRUG_NAME']
            moa_url = row['MOA_SOURCE_URL']
            act_url = row['ACT_SOURCE_URL']

            # Short rows yield None for missing fields; a nameless row cannot
            # be attributed to any drug.
            if not drug_name:
                continue

            # Register the drug even when the row carries no URL. Previously a
            # drug only entered the mapping when a URL was appended, which
            # made the NO_URL fallback below unreachable.
            urls = data[drug_name.lower()]
            if moa_url:
                urls['MOA'].append(moa_url)
            if act_url:
                urls['ACT'].append(act_url)

    for urls in data.values():
        # .get() avoids materialising empty 'MOA' / 'ACT' lists as a side
        # effect of the check.
        if not urls.get('MOA') and not urls.get('ACT'):
            urls['NO_URL'].append('null')
    return data

def generate_rdf_triples(xml_root, file_tsv):
    """Print a JSON-LD description of a study's drug intervention.

    The first ``condition``, ``intervention_type`` and ``intervention_name``
    elements found under ``xml_root`` are read. When the intervention type is
    ``drug`` (case-insensitive) and the lower-cased intervention name is a key
    of the mapping returned by ``read_tsv(file_tsv)``, the graph receives:

    * ``<drug> rdf:type ex:Drug``
    * ``<drug> ex:condition "<condition>"`` (only if the condition has text)
    * ``<drug> ex:intervention_name "<name>"``
    * ``<drug> ex:<url_type> <url>`` for every URL recorded for the drug

    Otherwise the graph stays empty. In both cases the graph is serialized as
    JSON-LD and printed.

    Args:
        xml_root: ``xml.etree.ElementTree.Element`` holding the study.
        file_tsv: Path of the TSV file handed to ``read_tsv``.

    Returns:
        None. The JSON-LD document is written to standard output.
    """
    data_tsv = read_tsv(file_tsv)

    graph = Graph()
    rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    ex = Namespace("http://example.org/")
    graph.bind("rdf", rdf)
    graph.bind("ex", ex)

    # findtext() returns None for an absent element instead of raising the
    # AttributeError that `find(...).text` produces; `or ''` also covers that
    # case before calling .lower().
    condition = xml_root.findtext('./condition')
    intervention_type = (xml_root.findtext('.//intervention_type') or '').lower()
    intervention_name = (xml_root.findtext('.//intervention_name') or '').lower()

    is_known_drug = (
        intervention_type == 'drug'
        and bool(intervention_name)
        and intervention_name in data_tsv
    )

    if is_known_drug:
        urls = data_tsv[intervention_name]

        # Spaces are not allowed in a URI, so they become underscores.
        subject = ex[intervention_name.replace(" ", "_")]
        graph.add((subject, RDF.type, ex['Drug']))
        if condition:
            # Skip the triple rather than emit a literal built from None / ''.
            graph.add((subject, ex['condition'], Literal(condition)))
        graph.add((subject, ex['intervention_name'], Literal(intervention_name)))

        for url_type, url_list in urls.items():
            for url in url_list:
                graph.add((subject, ex[url_type], URIRef(url)))

    # Serialize to JSON-LD
    json_ld_data = graph.serialize(format='json-ld', indent=4)
    print(json_ld_data)


In [6]:
# Load the environment variables from the .env file
load_dotenv()

# Input locations come from the environment so no path is hard-coded here:
#   FILE_XML - study XML parsed below
#   FILE_TSV - drug TSV passed to generate_rdf_triples
file_xml = os.getenv('FILE_XML')
file_tsv = os.getenv('FILE_TSV')

# os.getenv() returns None for an unset variable; fail here with a readable
# message instead of an opaque TypeError from ET.parse() / open() further down.
_missing_vars = [
    var_name
    for var_name, var_value in (('FILE_XML', file_xml), ('FILE_TSV', file_tsv))
    if not var_value
]
if _missing_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_vars)
        + ". Define them in the environment or in the .env file."
    )

tree = ET.parse(file_xml)
root = tree.getroot()

generate_rdf_triples(root, file_tsv)


[
    {
        "@id": "http://example.org/nifedipine",
        "@type": [
            "http://example.org/Drug"
        ],
        "http://example.org/MOA": [
            {
                "@id": "https://www.ebi.ac.uk/chembl/compound/inspect/CHEMBL193"
            },
            {
                "@id": "https://pubmed.ncbi.nlm.nih.gov/17276408"
            }
        ],
        "http://example.org/condition": [
            {
                "@value": "Congenital Adrenal Hyperplasia"
            }
        ],
        "http://example.org/intervention_name": [
            {
                "@value": "nifedipine"
            }
        ]
    }
]
