In [1]:
import csv
import os
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict

from dotenv import load_dotenv
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF

In [2]:
def read_tsv(file_tsv):
    """Index the source URLs of a tab-separated drug file by lowercased drug name.

    The file must have a header row containing the columns ``DRUG_NAME``,
    ``MOA_SOURCE_URL`` and ``ACT_SOURCE_URL``.

    Args:
        file_tsv: Path of the UTF-8 encoded TSV file.

    Returns:
        A nested ``defaultdict``: ``data[drug_name_lower][kind]`` is a list of
        lowercased URLs, where ``kind`` is ``'MOA'`` or ``'ACT'``.  A drug that
        appears in the file without any URL at all is still present and
        carries the single entry ``data[drug]['NO_URL'] == ['null']``.

    Raises:
        KeyError: If one of the required columns is missing from the header.
        OSError: If the file cannot be opened.
    """
    data = defaultdict(lambda: defaultdict(list))

    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires for files passed to a reader.
    with open(file_tsv, 'r', encoding='utf-8', newline='') as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            drug_name = row['DRUG_NAME']
            if not drug_name:
                # Short rows are padded with None by DictReader; a row without
                # a drug name cannot be indexed.
                continue

            # Register the drug even when the row has no URL, so that the
            # NO_URL marker below can actually be reached.
            urls = data[drug_name.lower()]

            # NOTE(review): URLs are lowercased as a whole, which also changes
            # the (potentially case-sensitive) path part -- confirm intended.
            if row['MOA_SOURCE_URL']:
                urls['MOA'].append(row['MOA_SOURCE_URL'].lower())
            if row['ACT_SOURCE_URL']:
                urls['ACT'].append(row['ACT_SOURCE_URL'].lower())

    # .get() is used so the check does not create empty 'MOA'/'ACT' entries.
    for urls in data.values():
        if not urls.get('MOA') and not urls.get('ACT'):
            urls['NO_URL'].append('null')

    return data


def generate_rdf_triples(xml_tree, tsv_file):
    """Build an RDF graph for one study XML tree, or return None to skip it.

    Only the first ``condition`` child of the root and the first
    ``intervention_type`` / ``intervention_name`` descendants are considered.
    A graph is produced only when the intervention type is ``drug`` and the
    lowercased intervention name is a key of the mapping returned by
    ``read_tsv(tsv_file)``.

    Args:
        xml_tree: Parsed XML tree exposing ``getroot()`` (for example the
            result of ``xml.etree.ElementTree.parse``).
        tsv_file: Path of the TSV file, forwarded to ``read_tsv``.

    Returns:
        An ``rdflib.Graph`` describing the drug (its type, condition, name and
        every URL listed for it in the TSV), or ``None`` when a required
        element is missing or empty, the intervention is not a drug, or the
        drug is not listed in the TSV.
    """
    study = xml_tree.getroot()
    condition_el = study.find('./condition')
    type_el = study.find('.//intervention_type')
    name_el = study.find('.//intervention_name')

    # Skip the file when any required element is absent ...
    if condition_el is None or type_el is None or name_el is None:
        return None
    # ... or present but empty: ElementTree reports .text as None in that
    # case, which would otherwise make .lower() raise AttributeError.
    if None in (condition_el.text, type_el.text, name_el.text):
        return None

    condition = condition_el.text
    intervention_type = type_el.text.lower()
    intervention_name = name_el.text.lower()  # TSV keys are lowercased too

    if intervention_type != 'drug':
        return None

    # The TSV is only parsed once the cheap XML checks have passed.
    # NOTE(review): it is still re-read on every call for a drug study;
    # consider parsing it once in the caller if this becomes a bottleneck.
    data_tsv = read_tsv(tsv_file)
    if intervention_name not in data_tsv:
        return None
    urls = data_tsv[intervention_name]

    graph = Graph()
    rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    ex = Namespace("http://example.org/")
    graph.bind("rdf", rdf)
    graph.bind("ex", ex)

    # Spaces are replaced so the drug name can be used as the local part of
    # the subject IRI.
    subject = ex[intervention_name.replace(" ", "_")]
    graph.add((subject, RDF.type, ex['Drug']))
    graph.add((subject, ex['condition'], Literal(condition)))
    graph.add((subject, ex['intervention_name'], Literal(intervention_name)))

    # One predicate per URL kind found in the TSV (the mapping's keys).
    for url_type in urls:
        for url in urls[url_type]:
            graph.add((subject, ex[url_type], URIRef(url)))

    return graph



In [3]:
# Load the environment variables from the .env file
load_dotenv()

folder = os.getenv('FOLDER')
tsv_file = os.getenv('FILE_TSV')

# Fail early with a clear message instead of an opaque TypeError raised from
# inside os.walk()/open() when a variable is not configured.
if not folder or not tsv_file:
    raise RuntimeError(
        "Both FOLDER and FILE_TSV must be set (in the environment or in the .env file)."
    )

graphs = []

# Iterate over all files in the directory and its subdirectories
for dirpath, dirnames, filenames in os.walk(folder):
    # Sorting in place steers os.walk, so the traversal (and therefore the
    # printed output) is the same on every run and every filesystem.
    dirnames.sort()
    for filename in sorted(filenames):
        if not filename.endswith('.xml'):
            continue
        file_xml = os.path.join(dirpath, filename)
        try:
            tree = ET.parse(file_xml)
        except ET.ParseError as exc:
            # One malformed file should not abort the whole run; report it on
            # stderr so it does not mix with the JSON-LD printed below.
            print(f"Skipping {file_xml}: invalid XML ({exc})", file=sys.stderr)
            continue
        graph = generate_rdf_triples(tree, tsv_file)
        if graph is not None:  # Only keep graphs of files that were not skipped
            graphs.append(graph)

# Print each study's graph as its own JSON-LD document.
for graph in graphs:
    json_ld_data = graph.serialize(format='json-ld', indent=4)
    print(json_ld_data)


[
    {
        "@id": "http://example.org/paclitaxel",
        "@type": [
            "http://example.org/Drug"
        ],
        "http://example.org/ACT": [
            {
                "@id": "https://pubmed.ncbi.nlm.nih.gov/11309480"
            }
        ],
        "http://example.org/MOA": [
            {
                "@id": "https://pubmed.ncbi.nlm.nih.gov/11309480"
            }
        ],
        "http://example.org/condition": [
            {
                "@value": "Primary Peritoneal Carcinoma"
            }
        ],
        "http://example.org/intervention_name": [
            {
                "@value": "paclitaxel"
            }
        ]
    }
]
[
    {
        "@id": "http://example.org/docetaxel",
        "@type": [
            "http://example.org/Drug"
        ],
        "http://example.org/MOA": [
            {
                "@id": "https://www.ebi.ac.uk/chembl/compound/inspect/chembl3545252"
            }
        ],
        "http://example.org/condition"