In [3]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
from rdflib import Graph, URIRef, Literal, Namespace
from rdflib.namespace import RDF, RDFS
from dotenv import load_dotenv
from urllib.parse import urlparse, unquote

In [4]:
# Load the environment variables from the .env file
load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message; otherwise ``os.listdir(None)`` would silently scan
            the current working directory instead of the intended folder.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            "Environment variable {!r} is not set; define it in the .env file.".format(name)
        )
    return value


# Input locations, all supplied through the environment / .env file
xml_folder_path = _require_env('TEST_FILES')  # folder scanned for *.xml files
tsv_file = _require_env('FILE_TSV')           # tab-separated drug table
MeSH_file = _require_env('MeSH')              # MeSH XML file

# Create the RDF graph
graph = Graph()

# Define RDF namespaces and bind
CT = Namespace("https://clinicaltrials.gov/")
DRUG = Namespace("https://drugcentral.org/drugcard/")
STUDY = Namespace("https://clinicaltrials.gov/ct2/show/")
MESH = Namespace("https://www.ncbi.nlm.nih.gov/mesh/?term=")

graph.bind('ct', CT)
graph.bind('drug', DRUG)
graph.bind('file', STUDY)
graph.bind('mesh', MESH)

# Dictionary to store information associated with each drug,
# keyed by the lower-cased drug name with whitespace replaced by '_'
drugs_info = {}

# Set to store drugs not present in the TSV
missing_drugs = set()

In [5]:
# Iterate over all XML files and record, per drug, the files that mention it
for filename in os.listdir(xml_folder_path):
    if not filename.endswith('.xml'):
        continue

    xml_file_path = os.path.join(xml_folder_path, filename)
    tree = ET.parse(xml_file_path)
    root = tree.getroot()

    # Only the first <intervention_type> / <intervention_name> elements found
    # in the document are considered
    intervention_type = root.find('.//intervention_type')
    intervention_name = root.find('.//intervention_name')

    # Skip files lacking either element or its text: calling .lower() on a
    # missing element/text would raise AttributeError and abort the whole scan
    if intervention_type is None or intervention_name is None:
        continue
    if not intervention_type.text or not intervention_name.text:
        continue

    # Check if the intervention_type field is "drug"
    if intervention_type.text.lower() != 'drug':
        continue

    # Key format: lower-case, whitespace runs collapsed to a single underscore
    drug_name_uri = '_'.join(intervention_name.text.lower().split())
    if not drug_name_uri:
        # Name consisted of whitespace only
        continue

    # Initialize the dictionary for the drug if it does not exist
    drug_entry = drugs_info.setdefault(drug_name_uri, {
        'moa_urls': [],
        'act_urls': [],
        'files': [],
        'mesh_info': None,
        'struct_id': None
    })

    # Add the current file to the list of files associated with the drug;
    # the membership test keeps the list duplicate-free when the cell is re-run
    if filename not in drug_entry['files']:
        drug_entry['files'].append(filename)

In [6]:
# Read the TSV file with drug information.
# The handle gets its own name so `tsv_file` keeps holding the path; rebinding it
# to the (closed) file object made this cell fail when executed a second time.
with open(tsv_file, 'r') as tsv_handle:
    tsv_raw = pd.read_csv(tsv_handle, delimiter='\t')

# Normalise the TSV names exactly like the `drugs_info` keys (lower-case,
# whitespace runs collapsed to '_'). Comparing the bare lower-cased name could
# never match a multi-word drug, because the keys hold underscores, not spaces.
tsv_name_keys = tsv_raw['DRUG_NAME'].str.lower().str.split().str.join('_')

# Filter the data to include only drugs mentioned in the XML files
tsv_data = tsv_raw[tsv_name_keys.isin(drugs_info.keys())]

# Iterate over the TSV rows that describe a drug found in the XML files
for index, row in tsv_data.iterrows():
    drug_name = row['DRUG_NAME']
    struct_id = row['STRUCT_ID']

    # Same normalisation as the filter above (lower-casing included, otherwise
    # a capitalised TSV name passes the filter and then raises KeyError here)
    drug_name_uri = '_'.join(drug_name.lower().split())
    drug_entry = drugs_info[drug_name_uri]

    # Add the source URLs to the dictionary for the drug
    for column, target in (('MOA_SOURCE_URL', 'moa_urls'), ('ACT_SOURCE_URL', 'act_urls')):
        source_url = row[column]
        # Test for a missing value BEFORE str(): str(NaN) is the text 'nan',
        # which pd.isna no longer recognises as missing
        if pd.isna(source_url):
            continue
        # Normalize the URL (decode percent-escapes)
        url_literal = Literal(unquote(str(source_url)))
        # Keep the list duplicate-free when the cell is re-run
        if url_literal not in drug_entry[target]:
            drug_entry[target].append(url_literal)

    drug_entry['struct_id'] = struct_id

In [7]:
# Read the MeSH XML file
mesh_tree = ET.parse(MeSH_file)
mesh_root = mesh_tree.getroot()

# Index the descriptors once (lower-cased descriptor name -> DescriptorUI text)
# instead of rescanning every DescriptorRecord for every drug.
mesh_ui_by_name = {}
for descriptor_record in mesh_root.findall('.//DescriptorRecord'):
    descriptor_name = descriptor_record.find('.//DescriptorName/String')
    descriptor_ui = descriptor_record.find('.//DescriptorUI')
    # Records without a usable name or UI cannot be matched or linked
    if descriptor_name is None or not descriptor_name.text:
        continue
    if descriptor_ui is None or not descriptor_ui.text:
        continue
    # setdefault keeps the first matching record in document order
    mesh_ui_by_name.setdefault(descriptor_name.text.lower(), descriptor_ui.text)

# Attach the MeSH descriptor to every drug whose name matches a descriptor name
for drug_name_uri, info in drugs_info.items():
    # Keys use '_' in place of spaces; restore the spaces for the name lookup
    drug_name = drug_name_uri.replace('_', ' ')
    descriptor_ui_text = mesh_ui_by_name.get(drug_name.lower())
    if descriptor_ui_text is not None:
        # Add MeSH information to the dictionary for the drug
        info['mesh_info'] = {
            'descriptor_ui': descriptor_ui_text
        }

In [8]:
# Add information to the RDF graph

# Drug keys present in the filtered TSV, computed once rather than per drug and
# normalised like the `drugs_info` keys (lower-case, whitespace -> '_') so that
# multi-word names compare equal.
known_drug_keys = set(
    tsv_data['DRUG_NAME'].dropna().str.lower().str.split().str.join('_')
)

for drug_name_uri, info in drugs_info.items():
    # Check if the drug is missing in the TSV (or never received a struct id,
    # in which case no meaningful drug URI can be built)
    if drug_name_uri not in known_drug_keys or info['struct_id'] is None:
        missing_drugs.add(drug_name_uri)
        continue

    # str() rather than repr(): repr() of a NumPy scalar is not guaranteed to
    # be the bare number, which would leak into the URI
    drug_uri = URIRef(DRUG[str(info['struct_id'])])
    graph.add((drug_uri, RDF.type, DRUG.Drug))
    graph.add((drug_uri, RDFS.label, Literal(drug_name_uri.replace('_', ' '))))

    # Attach every mechanism-of-action / activity source URL, skipping the
    # textual 'nan' placeholders that stand for missing values
    for predicate, url_literals in (
        (DRUG.moaSourceURL, info['moa_urls']),
        (DRUG.actSourceURL, info['act_urls']),
    ):
        for url_literal in url_literals:
            if url_literal.lower() != 'nan':
                graph.add((drug_uri, predicate, url_literal))

    # Connect each file mentioning the drug to the drug node
    # NOTE(review): the study URI keeps the '.xml' suffix of the file name;
    # confirm whether the bare study id is wanted instead.
    for filename in info['files']:
        file_uri = URIRef(STUDY[filename])
        graph.add((file_uri, RDF.type, STUDY.Study))
        graph.add((file_uri, STUDY.references, drug_uri))

    # Add MeSH information to the RDF graph (only when a descriptor UI is known)
    mesh_info = info['mesh_info']
    if mesh_info is not None and mesh_info.get('descriptor_ui'):
        descriptor_uri = URIRef(MESH[mesh_info['descriptor_ui']])
        graph.add((drug_uri, MESH.hasDescriptor, descriptor_uri))


In [9]:
# Render the RDF graph as JSON-LD text and show it
jsonld_data = graph.serialize(indent=4, format='json-ld')
print(jsonld_data)

# Report how many drugs (and which ones) were not present in the TSV
missing_count = len(missing_drugs)
print("\n\n", missing_count, " Drugs not present in TSV:", missing_drugs)

# Save the result to a JSON-LD file
# with open('output.jsonld', 'w') as f:
#     f.write(jsonld_data)

[
    {
        "@id": "https://clinicaltrials.gov/ct2/show/NCT00010075.xml",
        "@type": [
            "https://clinicaltrials.gov/ct2/show/Study"
        ],
        "https://clinicaltrials.gov/ct2/show/references": [
            {
                "@id": "https://drugcentral.org/drugcard/26"
            }
        ]
    },
    {
        "@id": "https://clinicaltrials.gov/ct2/show/NCT00010010.xml",
        "@type": [
            "https://clinicaltrials.gov/ct2/show/Study"
        ],
        "https://clinicaltrials.gov/ct2/show/references": [
            {
                "@id": "https://drugcentral.org/drugcard/1122"
            }
        ]
    },
    {
        "@id": "https://drugcentral.org/drugcard/26",
        "@type": [
            "https://drugcentral.org/drugcard/Drug"
        ],
        "http://www.w3.org/2000/01/rdf-schema#label": [
            {
                "@value": "fluorouracil"
            }
        ],
        "https://drugcentral.org/drugcard/moaSourceURL": [
   