In [77]:
import os
import csv
import json
from rdflib import Graph, Namespace, URIRef, Literal
import urllib.parse

from dotenv import load_dotenv

In [78]:
def read_tsv_file(tsv_file):
    drugs = {}
    with open(tsv_file, 'r') as tsv:
        reader = csv.DictReader(tsv, delimiter='\t')
        for row in reader:
            drug_name = row['DRUG_NAME'].lower()
            moa_urls = row.get('MOA_SOURCE_URL', '').split(';')
            act_urls = row.get('ACT_SOURCE_URL', '').split(';')
            drugs[drug_name] = {'MOA': moa_urls if moa_urls != [''] else [], 'ACT': act_urls if act_urls != [''] else []}
    return drugs

In [79]:
def traverse_folder(folder):
    xml_files = []
    for root, dirs, files in os.walk(folder):
        for file in files:
            if file.endswith('.xml'):
                xml_files.append(os.path.join(root, file))
    return xml_files

In [80]:
import xml.etree.ElementTree as ET

def parse_xml(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Initialize variables
    disease = None
    intervention_type = None
    intervention_name = None
    study_url = None

    # Extract information from the XML file
    try:
        required_header = root.find(".//required_header")
        url = required_header.findtext("url")

        condition = root.find(".//condition")
        if condition is not None:
            disease = condition.text.lower()

        intervention_type_elem = root.find(".//intervention_type")
        if intervention_type_elem is not None:
            intervention_type = intervention_type_elem.text.lower()

        if intervention_type == 'drug':
            intervention_name_elem = root.find(".//intervention_name")
            if intervention_name_elem is not None:
                intervention_name = intervention_name_elem.text.lower()
            study_url = url
    except Exception as e:
        print(f"Error parsing XML file {xml_file}: {e}")
        return None

    return {
        'disease': disease,
        'intervention_type': intervention_type,
        'intervention_name': intervention_name,
        'study_url': study_url
    }


In [81]:
def create_rdf_graph(drugs, xml_files):
    RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    DCT = Namespace("http://purl.org/dc/terms/")
    # Create an RDF Graph
    graph = Graph()

    for drug, urls in drugs.items():
        drug_node = URIRef(f"drug:{urllib.parse.quote(drug)}")
        for url in urls['MOA']:
            graph.add((drug_node, DCT.source, URIRef(urllib.parse.quote(url))))
        for url in urls['ACT']:
            graph.add((drug_node, DCT.hasPart, URIRef(urllib.parse.quote(url))))

    for xml_file in xml_files:
        data = parse_xml(xml_file)
        if data is None:
            continue
        
        disease = data['disease']
        intervention_type = data['intervention_type']
        intervention_name = data['intervention_name']
        study_url = data['study_url']

        if intervention_type is None or intervention_type.lower() != 'drug':
            continue

        if None in [disease, intervention_name, study_url]:
            continue

        disease_node = URIRef(f"disease:{urllib.parse.quote(disease)}")
        study_node = URIRef(f"study:{urllib.parse.quote(study_url)}")

        graph.add((study_node, DCT.subject, disease_node))
        graph.add((study_node, DCT.relation, URIRef(urllib.parse.quote(intervention_name))))
        graph.add((disease_node, RDF.type, DCT.subject))
        graph.add((disease_node, DCT.relation, URIRef(urllib.parse.quote(intervention_name))))
    
    return graph


In [82]:
def serialize_rdf_to_json_ld(graph):
    return graph.serialize(format='json-ld', indent=4)

In [84]:
load_dotenv()
tsv_file = os.getenv('FILE_TSV')
folder = os.getenv('TEST_FILES')

drugs = read_tsv_file(tsv_file)
xml_files = traverse_folder(folder)
rdf_graph = create_rdf_graph(drugs, xml_files)
json_ld = serialize_rdf_to_json_ld(rdf_graph)

print(json_ld)

[
    {
        "@id": "drug:lofexidine",
        "http://purl.org/dc/terms/source": [
            {
                "@id": "https%3A//pubmed.ncbi.nlm.nih.gov/20040696"
            }
        ]
    },
    {
        "@id": "drug:acipimox",
        "http://purl.org/dc/terms/source": [
            {
                "@id": "https%3A//pubmed.ncbi.nlm.nih.gov/17705685"
            }
        ]
    },
    {
        "@id": "disease:metabolism%2C%20inborn%20errors",
        "@type": [
            "http://purl.org/dc/terms/subject"
        ],
        "http://purl.org/dc/terms/relation": [
            {
                "@id": "cyclosporine"
            }
        ]
    },
    {
        "@id": "drug:etelcalcetide",
        "http://purl.org/dc/terms/source": [
            {
                "@id": "http%3A//www.ema.europa.eu/docs/en_GB/document_library/EPAR_-_Product_Information/human/003995/WC500217100.pdf"
            }
        ]
    },
    {
        "@id": "study:https%3A//clinicaltrials.gov/ct2/sho