In [2]:
"""MeSH
Retrieve MeSH terms

1. Imports, Variables, Functions
2. Retrieve Data
"""

# 1. Imports, Variables, Functions
# imports
import xml.etree.ElementTree as ET

# variables
# Relative path of the MeSH descriptor XML file; it is the `file_path` argument
# handed to every parse_* call in this notebook.
# NOTE(review): the file name suggests the 2023 descriptor release — confirm the data provenance.
mesh_file_path = '../data/desc2023.xml'

# functions
def parse_mesh_data(file_path):
    """Parse a MeSH descriptor XML file and extract disease terms.

    A descriptor is treated as a disease term when at least one of its
    ``TreeNumberList/TreeNumber`` entries starts with ``'C'`` (the prefix that
    usually denotes the disease category in MeSH; adjust if your XML differs).
    Every matching descriptor contributes exactly one entry to each returned
    list, so the two lists are parallel and follow the record order of the file.

    Descriptors without a usable ``DescriptorName/String`` and tree-number
    elements without text are skipped instead of raising ``AttributeError``.

    Args:
        file_path: File name or open file object, passed straight to ``ET.parse``.

    Returns:
        tuple[list[str], list[str]]: ``(disease_terms, list_tree_numbers)``.
        ``disease_terms[i]`` is the descriptor name and ``list_tree_numbers[i]``
        is the first tree number of that descriptor that starts with ``'C'``.
    """
    root = ET.parse(file_path).getroot()

    disease_terms = []
    list_tree_numbers = []
    for descriptor in root.findall('DescriptorRecord'):
        # findtext yields None (element missing) or '' (element empty) rather
        # than raising, so malformed records can simply be skipped.
        term = descriptor.findtext('DescriptorName/String')
        if not term:
            continue

        for tree_number in descriptor.findall('TreeNumberList/TreeNumber'):
            number = tree_number.text
            # `number` is None for an empty <TreeNumber/> element.
            if number and number.startswith('C'):
                list_tree_numbers.append(number)
                disease_terms.append(term)
                break  # one entry per descriptor: avoids duplicate terms

    return disease_terms, list_tree_numbers


# 2. Retrieve Data
# The call parses the XML file at `mesh_file_path` and returns two lists:
# the disease term names and the tree numbers collected alongside them.
disease_terms,list_tree_numbers = parse_mesh_data(file_path=mesh_file_path)
# NOTE(review): this prints the complete list of terms; a slice such as
# disease_terms[:20] would keep the cell output readable.
print(disease_terms)

['Abdomen, Acute', 'Abdominal Injuries', 'Abdominal Neoplasms', 'Abetalipoproteinemia', 'Congenital Abnormalities', 'Abnormalities, Drug-Induced', 'Abnormalities, Multiple', 'Abnormalities, Radiation-Induced', 'Abortion, Spontaneous', 'Abortion, Habitual', 'Abortion, Incomplete', 'Abortion, Missed', 'Abortion, Septic', 'Abortion, Threatened', 'Abortion, Veterinary', 'Abruptio Placentae', 'Abscess', 'Peritonsillar Abscess', 'Acantholysis', 'Acanthosis Nigricans', 'Severe Acute Malnutrition', 'Shellfish Hypersensitivity', 'Symptom Flare Up', 'Obesity, Metabolically Benign', 'Cold Injury', 'War-Related Injuries', 'Late Onset Disorders', 'Compassion Fatigue', 'Fractures, Multiple', 'Drug Resistant Epilepsy', 'Autoimmune Hypophysitis', 'Canaliculitis', 'Incisional Hernia', 'Plasmablastic Lymphoma', 'Mammary Analogue Secretory Carcinoma', 'Acne Conglobata', 'Fused Kidney', 'Long Term Adverse Effects', 'Infectious Encephalitis', 'Waterborne Diseases', 'Unilateral Breast Neoplasms', 'Degloving

In [9]:
# Number of entries currently held in `disease_terms`.
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it, and `disease_terms` is rebound further down the notebook, so the
# recorded value may not come from the cell directly above — re-run top to
# bottom to confirm.
len(disease_terms)

5004

In [5]:
# Distinct top-level category numbers: the integer between the leading letter
# and the first "." of every collected tree number, in ascending order.
a = sorted({int(number.split(".")[0][1:]) for number in list_tree_numbers})
print(a)

[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


In [6]:
def parse_mesh_data_v2(file_path):
    """Parse a MeSH descriptor XML file and collect every 'C' tree number.

    For each descriptor, every ``TreeNumberList/TreeNumber`` whose text starts
    with ``'C'`` is collected (no early break), together with the descriptor's
    ``DescriptorName/String``. Both results are de-duplicated.

    De-duplication keeps the first occurrence of each value in file order, so
    the returned lists are identical from run to run (a plain ``set`` of
    strings has no stable order across interpreter sessions).

    Descriptors without a usable name and tree-number elements without text
    are skipped instead of raising ``AttributeError``.

    Args:
        file_path: File name or open file object, passed straight to ``ET.parse``.

    Returns:
        tuple[list[str], list[str]]: ``(disease_terms, list_tree_numbers)``,
        each holding unique values. The lists are independent of each other
        (not parallel): one descriptor may contribute several tree numbers.
    """
    root = ET.parse(file_path).getroot()

    # Dicts are used as insertion-ordered sets (values are irrelevant).
    unique_terms = {}
    unique_tree_numbers = {}

    for descriptor in root.findall('DescriptorRecord'):
        matching_numbers = [
            node.text
            for node in descriptor.findall('TreeNumberList/TreeNumber')
            # node.text is None for an empty <TreeNumber/> element
            if node.text and node.text.startswith('C')
        ]
        if not matching_numbers:
            continue

        # Looked up once per descriptor rather than once per tree number.
        term = descriptor.findtext('DescriptorName/String')
        if not term:
            continue

        unique_terms[term] = None
        for number in matching_numbers:
            unique_tree_numbers[number] = None

    disease_terms = list(unique_terms)
    list_tree_numbers = list(unique_tree_numbers)

    return disease_terms, list_tree_numbers

# Retrieve Data
disease_terms, list_tree_numbers = parse_mesh_data_v2(mesh_file_path)
# Distinct top-level category numbers: the integer between the leading letter
# and the first "." of every collected tree number, in ascending order.
a = sorted({int(number.split(".")[0][1:]) for number in list_tree_numbers})
print(a)


[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26]


In [7]:
def parse_main_C_terms(file_path):
    """Map each top-level 'C' tree number to the name of its descriptor.

    A tree number is top-level when it starts with ``'C'`` and contains no
    ``'.'`` (for example ``'C01'``). If the same tree number appears more than
    once, the descriptor encountered last wins.

    Args:
        file_path: File name or open file object, passed straight to ``ET.parse``.

    Returns:
        dict[str, str]: tree number -> ``DescriptorName/String`` text, in the
        order the tree numbers are first encountered in the file.
    """
    records = ET.parse(file_path).getroot().findall('DescriptorRecord')

    top_level = {}
    for record in records:
        for node in record.findall('TreeNumberList/TreeNumber'):
            code = node.text
            # Anything outside category C, or below its first level, is ignored.
            if not code.startswith('C') or "." in code:
                continue
            top_level[code] = record.find('DescriptorName/String').text

    return top_level

# Retrieve Data
# Build the {tree number: descriptor name} mapping of top-level C categories.
main_C_categories = parse_main_C_terms(file_path=mesh_file_path)
# One "<tree number>: <name>" line per category, in the dict's insertion order
# (i.e. not sorted by tree number).
for tree_number, term in main_C_categories.items():
    print(f"{tree_number}: {term}")


C12: Urogenital Diseases
C22: Animal Diseases
C14: Cardiovascular Diseases
C06: Digestive System Diseases
C19: Endocrine System Diseases
C11: Eye Diseases
C15: Hemic and Lymphatic Diseases
C20: Immune System Diseases
C01: Infections
C21: Disorders of Environmental Origin
C07: Stomatognathic Diseases
C05: Musculoskeletal Diseases
C16: Congenital, Hereditary, and Neonatal Diseases and Abnormalities
C04: Neoplasms
C10: Nervous System Diseases
C18: Nutritional and Metabolic Diseases
C24: Occupational Diseases
C09: Otorhinolaryngologic Diseases
C08: Respiratory Tract Diseases
C23: Pathological Conditions, Signs and Symptoms
C26: Wounds and Injuries
C17: Skin and Connective Tissue Diseases
C25: Chemically-Induced Disorders


In [8]:
def parse_C01_subcategories(file_path, parent_tree_number='C01'):
    """Extract the first-level subcategories directly under a MeSH tree number.

    A tree number is a direct child of ``parent_tree_number`` when it starts
    with ``parent_tree_number + '.'`` and has exactly one more ``'.'``-separated
    level than the parent (e.g. ``'C01.150'`` under ``'C01'``, but not
    ``'C01.150.252'``).

    Descriptors without a usable ``DescriptorName/String`` and tree-number
    elements without text are skipped instead of raising ``AttributeError``.

    Args:
        file_path: File name or open file object, passed straight to ``ET.parse``.
        parent_tree_number: Tree number whose direct children are wanted.
            Defaults to ``'C01'``, which reproduces the original behavior.

    Returns:
        list[tuple[str, str]]: ``(tree_number, descriptor_name)`` pairs in the
        order they are encountered in the file.
    """
    root = ET.parse(file_path).getroot()

    child_prefix = parent_tree_number + '.'
    # A direct child has exactly one more '.' than its parent.
    child_depth = parent_tree_number.count('.') + 1

    C01_subcategories = []
    for descriptor in root.findall('DescriptorRecord'):
        # findtext yields None/'' instead of raising for malformed records.
        term = descriptor.findtext('DescriptorName/String')
        if not term:
            continue

        for tree_number in descriptor.findall('TreeNumberList/TreeNumber'):
            number = tree_number.text
            # `number` is None for an empty <TreeNumber/> element.
            if (
                number
                and number.startswith(child_prefix)
                and number.count('.') == child_depth
            ):
                C01_subcategories.append((number, term))

    return C01_subcategories

# Retrieve Data
# List of (tree number, descriptor name) pairs for the first level under C01.
C01_subs = parse_C01_subcategories(file_path=mesh_file_path)
# One "<tree number>: <name>" line per pair, in the order returned by the
# parser (i.e. not sorted by tree number).
for tree_number, term in C01_subs:
    print(f"{tree_number}: {term}")

C01.936: Waterborne Diseases
C01.918: Vaccine-Preventable Diseases
C01.920: Vector Borne Diseases
C01.550: Latent Infection
C01.645: Persistent Infection
C01.175: Breakthrough Infections
C01.069: Aneurysm, Infected
C01.100: Arthritis, Infectious
C01.150: Bacterial Infections and Mycoses
C01.160: Bone Diseases, Infectious
C01.207: Central Nervous System Infections
C01.221: Communicable Diseases
C01.248: Cross Infection
C01.392: Focal Infection
C01.408: Gingivitis
C01.436: Hepatitis, Animal
C01.503: Laboratory Infection
C01.597: Opportunistic Infections
C01.610: Parasitic Diseases
C01.674: Pregnancy Complications, Infectious
C01.748: Respiratory Tract Infections
C01.778: Sexually Transmitted Diseases
C01.800: Skin Diseases, Infectious
C01.830: Suppuration
C01.861: Toxemia
C01.915: Urinary Tract Infections
C01.925: Virus Diseases
C01.947: Wound Infection
C01.973: Zoonoses
C01.375: Eye Infections
C01.685: Prosthesis-Related Infections
C01.234: Community-Acquired Infections
C01.820: Soft Ti