In [103]:
import os
import csv
import glob
import xml.etree.ElementTree as ET
from rdflib import Graph, Literal, BNode, Namespace, RDF, URIRef
from urllib.parse import quote

In [104]:
# Define the namespaces
n = Namespace("http://example.org/drug/")
d = Namespace("http://example.org/url_type/")
s = Namespace("http://example.org/study/")
c = Namespace("http://example.org/condition/")
no_url = Namespace("http://example.org/no_url/")

In [105]:
def process_tsv_file(g, tsv_file):
    drug_set = set()
    # Read the TSV file
    with open(tsv_file, 'r') as f:
        reader = csv.DictReader(f, delimiter='\t')
        for row in reader:
            drug = quote(row['DRUG_NAME'].lower())
            drug_set.add(drug)
            moa_urls = row.get('MOA_SOURCE_URL', '').split(',')
            act_urls = row.get('ACT_SOURCE_URL', '').split(',')

            # Create a node for the drug
            drug_node = n[drug]

            # Add the MOA URLs as properties of the drug
            for moa_url in moa_urls:
                moa_url = moa_url.strip()
                if moa_url:
                    g.add((drug_node, d.moa_url, Literal(moa_url)))

            # Add the ACT URLs as properties of the drug
            for act_url in act_urls:
                act_url = act_url.strip()
                if act_url:
                    g.add((drug_node, d.act_url, Literal(act_url)))

            # If there are no MOA or ACT URLs, add a NO_URL property
            if not moa_urls and not act_urls:
                g.add((drug_node, no_url.null, Literal("null")))
    return drug_set


In [106]:
def process_xml_file(g, xml_file, drug_set):
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Get the intervention name
    intervention_name_element = root.find('intervention_name')
    if intervention_name_element is None:
        print("No intervention name found in {}".format(xml_file))
        return
    intervention_name = quote(intervention_name_element.text.lower())


    # Check if the intervention name is in the set of drug names
    if intervention_name not in drug_set:
        print("Intervention name {} not found in drug set".format(intervention_name))
        return

    # Get the study URL
    study_url_element = root.find('required_header/url')
    if study_url_element is None:
        print("No study URL found in {}".format(xml_file))
        return
    study_url = study_url_element.text

    # Create a node for the study
    study_node = s[quote(study_url)]

    # Check the intervention type
    intervention_type_element = root.find('intervention_type')
    if intervention_type_element is None or intervention_type_element.text.lower() != 'drug':
        # print("No drug intervention found in {}".format(xml_file))
        return

    
    # Get the condition
    condition_element = root.find('condition')
    if condition_element is None:
        print("No condition found in {}".format(xml_file))
        return
    condition = condition_element.text.lower()

    # Create a node for the condition
    condition_node = c[condition]

    # Add the relationships to the graph
    g.add((n[intervention_name], d.is_used_for, condition_node))
    g.add((n[intervention_name], d.is_used_in, study_node))
    g.add((condition_node, c.is_treated_by, n[intervention_name]))
    g.add((condition_node, c.is_cited_in, study_node))


In [107]:
def create_graph():
    # Create a graph
    g = Graph()

    # Get the TSV file and folder from environment variables
    tsv_file = os.getenv('FILE_TSV')
    folder = os.getenv('FOLDER')

    # Process the TSV file
    drug_set = process_tsv_file(g, tsv_file)
    print(drug_set)

    # Walk through the folder and subfolders
    for dirpath, dirnames, filenames in os.walk(folder):
        for filename in filenames:
            if filename.endswith('.xml'):
                # Process each XML file
                process_xml_file(g, os.path.join(dirpath, filename), drug_set)

    # Serialize the graph in JSON-LD format
    json_ld = g.serialize(format='json-ld', indent=4)
    print(json_ld)

In [108]:
create_graph()

{'macimorelin', 'iguratimod', 'clindamycin', 'inebilizumab', 'nilvadipine', 'methazolamide', 'imipenem', 'methionine', 'triamterene', 'cefetamet%20pivoxil', 'permethrin', 'enoximone', 'lifitegrast', 'moperone', 'diosmin', 'desoximetasone', 'lutetium%20%28177lu%29%20oxodotreotide', 'sodium%20lauryl%20sulfate', 'ulipristal', 'delorazepam', 'streptomycin', 'chlorcyclizine', 'trazodone', 'pegaptanib', 'fomepizole', 'oxycodone', 'pimavanserin', 'copanlisib', 'pralmorelin', 'benzocaine', 'kainic%20acid', 'fenbendazole', 'azacitidine', 'idoxuridine', 'diosmetin', 'carbenicillin', 'clorindione', 'aripiprazole', 'betaxolol', 'dolutegravir', 'mercaptopurine', 'acotiamide', 'vasopressin', 'lenvatinib', 'aspartic%20acid', 'sarpogrelate', 'floxuridine', 'mupirocin', 'rescinnamine', 'brexpiprazole', 'armodafinil', 'tasosartan', 'inotersen%20', 'sitagliptin', 'naltrexone', 'trimipramine', 'budesonide', 'tolvaptan', 'clometacin', 'domperidone', 'drospirenone', 'tolterodine', 'sulfisoxazole%20acetyl', 