In [1]:
import numpy as np
import pandas as pd
import gzip
import xml.etree.ElementTree as ET

In [2]:
file_path = r"C:\Users\Eugenia\Downloads\desc2025\desc2025"  # or .xml if it's named like that
tree = ET.parse(file_path)
root = tree.getroot()
root

<Element 'DescriptorRecordSet' at 0x000002601FA751C0>

In [7]:
tags = set()

for elem in root.iter():
    tags.add(elem.tag)

print(sorted(tags))

['Abbreviation', 'AllowableQualifier', 'AllowableQualifiersList', 'Annotation', 'CASN1Name', 'Concept', 'Concept1UI', 'Concept2UI', 'ConceptList', 'ConceptName', 'ConceptRelation', 'ConceptRelationList', 'ConceptUI', 'ConsiderAlso', 'DateCreated', 'DateEstablished', 'DateRevised', 'Day', 'DescriptorName', 'DescriptorRecord', 'DescriptorRecordSet', 'DescriptorReferredTo', 'DescriptorUI', 'ECIN', 'ECOUT', 'EntryCombination', 'EntryCombinationList', 'EntryVersion', 'HistoryNote', 'Month', 'NLMClassificationNumber', 'OnlineNote', 'PharmacologicalAction', 'PharmacologicalActionList', 'PreviousIndexing', 'PreviousIndexingList', 'PublicMeSHNote', 'QualifierName', 'QualifierReferredTo', 'QualifierUI', 'RegistryNumber', 'RegistryNumberList', 'RelatedRegistryNumber', 'RelatedRegistryNumberList', 'ScopeNote', 'SeeRelatedDescriptor', 'SeeRelatedList', 'String', 'Term', 'TermList', 'TermUI', 'ThesaurusID', 'ThesaurusIDlist', 'TreeNumber', 'TreeNumberList', 'Year']


In [9]:
for child in root.iter():
    print(child.tag)
    break

DescriptorRecordSet


In [19]:
category_map = {
    "A": "Anatomy",
    "B": "Organisms",
    "C": "Diseases",
    "D": "Chemicals and Drugs",
    "E": "Analytical, Diagnostic and Therapeutic Techniques and Equipment",
    "F": "Psychiatry and Psychology",
    "G": "Phenomena and Processes",
    "H": "Disciplines and Occupations",
    "I": "Anthropology, Education, Sociology and Social Phenomena",
    "J": "Technology, Industry, and Agriculture",
    "K": "Humanities",
    "L": "Information Science",
    "M": "Named Groups",
    "N": "Health Care",
    "V": "Publication Characteristics",
    "Z": "Geographic Locations"
}

# Extract disease records with extra category info
records = []

for descriptor in root.findall('DescriptorRecord'):
    name = descriptor.find('DescriptorName/String').text
    ui = descriptor.find('DescriptorUI').text
    tree_numbers = [t.text for t in descriptor.findall('TreeNumberList/TreeNumber')]

    for tn in tree_numbers:
        if tn.startswith("C"):  # filter only diseases
            top_code = tn.split('.')[0]
            top_letter = tn[0]
            category_name = category_map.get(top_letter, "Unknown")
            records.append({
                'DescriptorUI': ui,
                'DescriptorName': name,
                'TreeNumber': tn,
                'TopCategory': top_code,
                'TopLetter': top_letter,
                'TopLetterName': category_name
            })

# Convert to DataFrame
df_diseases_full = pd.DataFrame(records)
df_diseases_full.head()

Unnamed: 0,DescriptorUI,DescriptorName,TreeNumber,TopCategory,TopLetter,TopLetterName
0,D000006,"Abdomen, Acute",C23.888.592.612.054.200,C23,C,Diseases
1,D000006,"Abdomen, Acute",C23.888.821.030.249,C23,C,Diseases
2,D000007,Abdominal Injuries,C26.017,C26,C,Diseases
3,D000008,Abdominal Neoplasms,C04.588.033,C04,C,Diseases
4,D000012,Abetalipoproteinemia,C16.320.565.398.500.440.500,C16,C,Diseases


In [18]:
df_categories

Unnamed: 0,TopCategory,TopLetter,TopLetterName
0,C23,C,Diseases
1,C26,C,Diseases
2,C04,C,Diseases
3,C16,C,Diseases
4,C18,C,Diseases
5,C12,C,Diseases
6,C01,C,Diseases
7,C22,C,Diseases
8,C07,C,Diseases
9,C08,C,Diseases


In [20]:
records = []

for descriptor in root.findall('DescriptorRecord'):
    name = descriptor.find('DescriptorName/String').text
    ui = descriptor.find('DescriptorUI').text
    tree_numbers = [t.text for t in descriptor.findall('TreeNumberList/TreeNumber')]

    # Filter only diseases (TreeNumbers starting with "C")
    for tn in tree_numbers:
        if tn.startswith("C"):
            # Extract top-level category code, e.g., C14
            top_code = tn.split('.')[0]
            records.append({
                #'DescriptorUI': ui,
                #'DescriptorName': name,
                #'TreeNumber': tn,
                'TopCategory': top_code
            })

df_diseases_full = pd.DataFrame(records)
df_diseases_full


Unnamed: 0,TopCategory
0,C23
1,C23
2,C26
3,C04
4,C16
...,...
13175,C19
13176,C23
13177,C12
13178,C14
