# DiSignAtlas MESH terms

Structure
* 1. Get MESH terms from DiSignAtlas
* 2. Get MESH terms through DO DiSignAtlas

## 1. Get MESH terms from DiSignAtlas website

In [4]:
"""Get MESH terms from DiSignAtlas

Structure:
    1. Imports, Variables, Functions
    2. Load Data
    3. Get All External Links - Web Scraping
    4. Get MeSH tree terms
"""

# 1. Imports, Variables, Functions
# imports
import pandas as pd, numpy as np, sys, os
import logging, requests
import requests, pickle
from bs4 import BeautifulSoup
from tqdm.contrib.concurrent import process_map
import xml.etree.ElementTree as ET
from tqdm import tqdm

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
# variables
data_info_path = os.path.join(
    "..", "data", "DiSignAtlas", "Disease_information_Datasets.csv"
)
external_links_output_path = os.path.join(
    "..", "data", "DiSignAtlas", "external_links.pkl"
)
mesh_tree_terms_output_path = os.path.join(
    "..", "data", "DiSignAtlas", "mesh_tree_terms.pkl"
)

mesh_file_path = os.path.join("..", "data", "MeSH", "desc2023.xml")


# functions
def search_mesh(term):
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=mesh&term={term}&retmode=json"
    response = requests.get(url)
    data = response.json()
    return data


def get_external_links(dsaid):
    """Get External Links from DiSignAtlas
    Args:
        dsaid (str): DiSignAtlas ID
    Returns:
        extenal_links (list): List of external links"""
    # imports
    import re

    # URL of the webpage you want to extract data from
    url = f"http://www.inbirg.com/disignatlas/detail/{dsaid}"

    # Fetch the content of the webpage
    response = requests.get(url)
    webpage = response.content

    # Parse the HTML content
    soup = BeautifulSoup(webpage, "html.parser")

    # Extract all scripts from the html
    scripts = soup.find_all("script")

    # Extract the script with dbXrefs
    dbXrefs_script = None
    for script in scripts:
        if "dbXrefs" in script.text:
            dbXrefs_script = script.text
            break

    # if there is no dbXrefs script, return an empty list
    if dbXrefs_script is None:
        return []
    # Extract from the script dbXrefs
    # These are what appear as External Links in the webpage
    dbXrefs_match = re.search(r'dbXrefs = "(.*?)"', dbXrefs_script)

    dbXrefs_str = dbXrefs_match.group(1)
    dbXrefs_list = dbXrefs_str.split("|")

    return dbXrefs_list


def find_mesh_tree_terms(mesh_ids, file_path):
    # Load and parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Create a dictionary to store tree terms for each MeSH ID
    mesh_tree_terms_dict = {mesh_id: None for mesh_id in mesh_ids}

    # Iterate through the XML to find each MeSH ID
    for descriptor in root.findall(".//DescriptorRecord"):
        descriptor_id = descriptor.find("./DescriptorUI")
        if descriptor_id is not None and descriptor_id.text in mesh_ids:
            # Extract tree terms
            tree_numbers = [tn.text for tn in descriptor.findall(".//TreeNumber")]
            mesh_tree_terms_dict[descriptor_id.text] = tree_numbers

    return mesh_tree_terms_dict


# 2. Load Data
# load MeSH data
mesh_file_path


# load DiSignAtlas data
df_data = pd.read_csv(data_info_path)

logging.info(f"Data Shape: {df_data.shape}")
logging.info(
    f"Nº of signatures without NIH Concept ID: {df_data['diseaseid'].isnull().sum()}"
)
logging.info(f"Nº of unique diseases: {df_data['diseaseid'].nunique()}")


# 3. Get All External Links - Web Scraping
if os.path.exists(external_links_output_path):
    logging.info(f"Loading external links from  {external_links_output_path}")
    with open(external_links_output_path, "rb") as f:
        data = pickle.load(f)
        dsaids = data["dsaids"]
        external_links = data["external_links"]
else:
    logging.info(f"Saving External Links in {external_links_output_path}")
    dsaids = df_data["dsaid"].to_list()
    external_links = process_map(
        get_external_links, dsaids, chunksize=50, max_workers=48
    )

    # example of dsaids with no external links
    for dsaid, external_link in zip(dsaids, external_links):
        if len(external_link) == 0:
            logging.info(dsaid)
            break

    # save external links
    path_external_links = os.path.join()
    with open(path_external_links, "wb") as f:
        data_to_save = {"dsaids": dsaids, "external_links": external_links}

        pickle.dump(data_to_save, f)

logging.info(
    f"Nº of dsaids with no external links {len([x for x in external_links if len(x) == 0])}"
)

2024-03-27 15:35:47,728 - Data Shape: (10306, 12)
2024-03-27 15:35:47,729 - Nº of signatures without NIH Concept ID: 472
2024-03-27 15:35:47,729 - Nº of unique diseases: 1427
2024-03-27 15:35:47,731 - Loading external links from  ../data/DiSignAtlas/external_links.pkl
2024-03-27 15:35:47,744 - Nº of dsaids with no external links 1


In [5]:
logging.info(
    f"{len([element for elements in external_links for element in elements if not element.startswith('MeSH')])}"
)

not_found = list()
for dsaid, external_link in tqdm(zip(dsaids, external_links)):
    if dsaid in df_data[df_data["organism"] == "Homo sapiens"]["dsaid"].to_list():
        switch = False
        for element in external_link:
            if element.startswith("MeSH"):
                switch = True
        if not switch:
            not_found.append(dsaid)


logging.info(
    f"Nº of not found {len(not_found)}/{len(df_data[df_data['organism']=='Homo sapiens']['dsaid'].to_list())}"
)

2024-03-27 15:35:47,772 - 93693
10306it [00:07, 1292.25it/s]
2024-03-27 15:35:55,752 - Nº of not found 1475/7194


In [6]:
# 4. Get MeSH tree terms
# Filter out MeSH terms from External Links
mesh_ids = list()
for external_link in external_links:
    found_terms = list()
    for element in external_link:
        if element.startswith("MeSH"):
            found_terms.append(element.split("MeSH:")[1])
    mesh_ids.append(found_terms)

logging.info(
    f"Nº of signatures w/ MeSH terms {len([a for a in mesh_ids if len(a)>0])} / {len(external_links)}"
)

logging.info(
    f"Nº of signatures w/ >1 MeSH terms {len([a for a in mesh_ids if len(a)>1])}"
)


# get unique mesh ids
unique_mesh_ids = list({e for elements in mesh_ids for e in elements})

# Get MeSH tree symbols for each ID
d_mesh_tree_terms = find_mesh_tree_terms(unique_mesh_ids, mesh_file_path)

2024-03-27 15:35:55,765 - Nº of signatures w/ MeSH terms 8155 / 10306
2024-03-27 15:35:55,766 - Nº of signatures w/ >1 MeSH terms 1


In [7]:
# mesh tree terms for each dsaid
mesh_tree_terms = list()
for mesh_ids_sublist in tqdm(mesh_ids):
    _terms = list()
    for mesh_id in mesh_ids_sublist:
        if type(d_mesh_tree_terms.get(mesh_id)) is list:
            _terms.extend(d_mesh_tree_terms.get(mesh_id))

    mesh_tree_terms.append(_terms)

logging.info(
    f"Nº of singnatures w/ MeSH tree terms {len([a for a in mesh_tree_terms if len(a)>0])} / {len(mesh_tree_terms)}"
)

100%|██████████| 10306/10306 [00:00<00:00, 1399731.14it/s]
2024-03-27 15:36:10,903 - Nº of singnatures w/ MeSH tree terms 7790 / 10306


In [8]:
not_found = list()
found = list()
for dsaid, mesh_tree_terms_sublist in tqdm(
    zip(dsaids, mesh_tree_terms), total=len(dsaids)
):
    if dsaid in df_data[df_data["organism"] == "Homo sapiens"]["dsaid"].to_list():
        switch = False
        if len(mesh_tree_terms_sublist) < 1:
            not_found.append(dsaid)
        else:
            found.append(dsaid)


logging.info(
    f"Nº of not found {len(not_found)}/{len(df_data[df_data['organism']=='Homo sapiens']['dsaid'].to_list())}"
)

logging.info(
    f"Nº of found {len(found)}/{len(df_data[df_data['organism']=='Homo sapiens']['dsaid'].to_list())}"
)

100%|██████████| 10306/10306 [00:07<00:00, 1305.78it/s]
2024-03-27 15:36:18,803 - Nº of not found 1755/7194
2024-03-27 15:36:18,804 - Nº of found 5439/7194


In [9]:
if os.path.exists(mesh_tree_terms_output_path):
    logging.info(f"File exists {mesh_tree_terms_output_path}")
    with open(mesh_tree_terms_output_path, "rb") as f:
        data = pickle.load(f)
        mesh_tree_terms = data.get("mesh_tree_terms")
        mesh_dsaids = data.get("dsaids")
        mesh_ids = data.get("mesh_ids")
        dsaids_2_mesh_tree_terms = {k: v for k, v in zip(mesh_dsaids, mesh_tree_terms)}
        dsaids_2_mesh_ids = {k: v for k, v in zip(mesh_dsaids, mesh_ids)}
else:
    logging.info(f"Saving data to {mesh_tree_terms_output_path}")
    data = {"dsaids": dsaids, "mesh_tree_terms": mesh_tree_terms, "mesh_ids": mesh_ids}
    with open(mesh_tree_terms_output_path, "wb") as f:
        pickle.dump(data, f)

2024-03-27 15:36:18,809 - File exists ../data/DiSignAtlas/mesh_tree_terms.pkl


## 2. Get MESH terms through DO DiSignAtlas

## Tests

In [None]:
# def get_medgen_id(disease_id):
#     """Get Medgen ID
#     Args:
#         disease_id (str): Disease ID
#     Returns:
#         medgen_id (str): MedGen ID
#     """

#     url = f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
#     params = {"db": "medgen", "term": disease_id, "retmode": "json"}
#     response = requests.get(url, params=params)
#     data = response.json()
#     medgen_ids = data["esearchresult"]["idlist"]
#     assert (
#         len(medgen_ids) == 1
#     ), f"For {disease_id} more than one MedGen ID found: {medgen_ids}"
#     return medgen_ids[0]


# def get_mesh_terms_from_medgen_id(medgen_id):
#     """Get Medgen ID
#     Args:
#         medgen_id (str): Disease ID
#     Returns:
#         mesh_terms (list): List of MESH terms
#     """

#     url = f"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?"
#     params = {"db": "medgen", "id": medgen_id, "retmode": "json"}
#     response = requests.get(url, params=params)
#     data = response.json()
#     return data
#     medgen_ids = data["esearchresult"]["idlist"]
#     assert (
#         len(medgen_ids) == 1
#     ), f"For {disease_id} more than one MedGen ID found: {medgen_ids}"
#     return medgen_ids[0]


# id = get_medgen_id("C0002395")
# print(id)
# d = get_mesh_terms_from_medgen_id(id)

# from xml.etree import ElementTree as ET
# import html

# # The conceptmeta XML-like string, slightly truncated for brevity in explanation
# encoded_conceptmeta = d["result"]["1853"]["conceptmeta"]

# # Decode HTML entities and ASCII encodings to convert them back to XML tags and characters
# decoded_conceptmeta = html.unescape(encoded_conceptmeta[:5493])

# # Now, you can parse the XML
# root = ET.fromstring(decoded_conceptmeta)

# # Extract MeSH terms using the previously mentioned method
# mesh_terms = [name.text for name in root.findall(".//Name[@SAB='MSH']")]

# for term in mesh_terms:
#     print(term)