In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
import csv


# Input/output locations for this snapshot of the MedlinePlus health-topics export.
TOPICS_XML_PATH = 'data/mplus_topics_2025-11-19.xml'
TOPICS_CSV_PATH = 'data/medlineplus_topics_2025-11-19.csv'

# Column order of the flattened CSV; `topic_to_row` returns values in this order.
CSV_COLUMNS = [
    'id', 'title', 'url', 'language', 'meta-desc', 'date-created', 'also-called',
    'full-summary', 'groups', 'language-mapped-topics', 'mesh-descriptors',
    'other-languages', 'primary-institute', 'see-reference',
    'sites',
]

# Separator used when several child elements are collapsed into one CSV cell.
MULTI_VALUE_SEP = '; '


def _join_texts(elements):
    """Join the text of each element with '; ', treating missing text as ''."""
    return MULTI_VALUE_SEP.join(elt.text or '' for elt in elements)


def _labelled_texts(elements, attr_name):
    """Render each element as '<attribute value>:<text>' and join with '; '.

    A missing attribute or missing text becomes '' rather than the literal
    string 'None' that f-string interpolation would otherwise produce.
    """
    return MULTI_VALUE_SEP.join(
        f"{elt.get(attr_name) or ''}:{elt.text or ''}" for elt in elements
    )


def _site_summary(site):
    """Flatten one <site> element into 'title,url,category,organization,description'.

    Only the first `information-category`, `organization` and
    `standard-description` child is used (`findtext` returns the first match).
    Any missing piece is written as '' so the output never contains the
    literal text 'None'.
    """
    fields = (
        site.get('title'),
        site.get('url'),
        site.findtext('information-category'),
        site.findtext('organization'),
        site.findtext('standard-description'),
    )
    return ','.join(value or '' for value in fields)


def topic_to_row(topic):
    """Convert one <health-topic> element into a list of CSV cell values.

    Parameters
    ----------
    topic : xml.etree.ElementTree.Element
        A `health-topic` element.

    Returns
    -------
    list
        One value per entry of `CSV_COLUMNS`, in the same order. Scalar
        attributes and single child texts may be None (the csv writer emits
        None as an empty cell); repeated children are joined with '; '.
    """
    mesh_descriptors = MULTI_VALUE_SEP.join(
        descriptor.text or ''
        for mesh_heading in topic.findall('mesh-heading')
        for descriptor in mesh_heading.findall('descriptor')
    )

    return [
        topic.get('id'),
        topic.get('title'),
        topic.get('url'),
        topic.get('language'),
        topic.get('meta-desc'),
        topic.get('date-created'),
        _join_texts(topic.findall('also-called')),
        topic.findtext('full-summary'),
        _join_texts(topic.findall('group')),
        _labelled_texts(topic.findall('language-mapped-topic'), 'language'),
        mesh_descriptors,
        _labelled_texts(topic.findall('other-language'), 'vernacular-name'),
        topic.findtext('primary-institute'),
        # Join every see-reference instead of keeping only the first one, so
        # this column is handled like the other repeatable elements.
        _join_texts(topic.findall('see-reference')),
        # Combine site info (flatten major fields).
        MULTI_VALUE_SEP.join(_site_summary(site) for site in topic.findall('site')),
    ]


tree = ET.parse(TOPICS_XML_PATH)
root = tree.getroot()

# newline='' lets the csv module control line endings itself.
with open(TOPICS_CSV_PATH, 'w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(CSV_COLUMNS)
    # Only <health-topic> elements that are direct children of the root are exported.
    for topic in root.findall('health-topic'):
        writer.writerow(topic_to_row(topic))

In [3]:
import pandas as pd
# Read the exported topics back from disk to check that the CSV loads in pandas.
topics_csv_path = 'data/medlineplus_topics_2025-11-19.csv'
df = pd.read_csv(topics_csv_path)

# Per-column dtype and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2033 entries, 0 to 2032
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   id                      2033 non-null   int64 
 1   title                   2033 non-null   object
 2   url                     2033 non-null   object
 3   language                2033 non-null   object
 4   meta-desc               2032 non-null   object
 5   date-created            2033 non-null   object
 6   also-called             1090 non-null   object
 7   full-summary            2032 non-null   object
 8   groups                  2033 non-null   object
 9   language-mapped-topics  2032 non-null   object
 10  mesh-descriptors        1015 non-null   object
 11  other-languages         1016 non-null   object
 12  primary-institute       1619 non-null   object
 13  see-reference           1424 non-null   object
 14  sites                   2033 non-null   object
dtypes: i

In [4]:
# Preview the first five rows of the loaded topics.
df.head(n=5)

Unnamed: 0,id,title,url,language,meta-desc,date-created,also-called,full-summary,groups,language-mapped-topics,mesh-descriptors,other-languages,primary-institute,see-reference,sites
0,6308,A1C,https://medlineplus.gov/a1c.html,English,"If you are being tested for Type 2 diabetes, y...",12/22/2015,Glycohemoglobin; HbA1C; Hemoglobin A1C test,"<p>A1C is a blood test for <a href=""https://me...",Diagnostic Tests; Diabetes Mellitus,Spanish:Prueba de hemoglobina glicosilada (HbA1c),Glycated Hemoglobin,español:Spanish,National Institute of Diabetes and Digestive a...,Hemoglobin A1c,"A1C and eAG,https://diabetes.org/a1c-eag-conve..."
1,3061,Abdominal Pain,https://medlineplus.gov/abdominalpain.html,English,Stomach aches can be painful. Find out what mi...,01/07/2003,Bellyache,<p>Your abdomen extends from below your chest ...,Digestive System; Symptoms,Spanish:Dolor abdominal,Abdominal Pain,español:Spanish,,Bellyache,"Abdominal and Pelvic CT,https://www.radiologyi..."
2,122,Abortion,https://medlineplus.gov/abortion.html,English,An abortion is a medical procedure to end a pr...,03/19/2002,Induced Abortion,<p>An induced abortion is a procedure to end a...,Pregnancy and Reproduction; Female Reproductiv...,Spanish:Aborto,"Abortion, Induced","简体中文:Chinese, Simplified (Mandarin dialect); 繁...",,,"Abortion - medication,https://medlineplus.gov/..."
3,2238,Aborto,https://medlineplus.gov/spanish/abortion.html,Spanish,Un aborto es un procedimiento médico para term...,10/31/2006,Aborto terapéutico; Interrupción del embarazo,<p>Un aborto inducido es un procedimiento para...,Embarazo y reproducción; Sistema reproductor f...,English:Abortion,,,,,"Aborto médico,https://www.mayoclinic.org/es/te..."
4,3064,Abscesos,https://medlineplus.gov/spanish/abscess.html,Spanish,Los abscesos se llenan de pus y pueden aparece...,09/15/2010,,<p>Un absceso es una cavidad donde se acumula ...,Infecciones,English:Abscess,,,Instituto Nacional de Alergias y Enfermedades ...,,"Absceso,https://medlineplus.gov/spanish/ency/a..."


In [5]:
# (rows, columns) of the topics whose language is Spanish.
is_spanish = df['language'] == 'Spanish'
df.loc[is_spanish].shape

(1016, 15)

In [6]:
# (rows, columns) of the topics whose language is English.
is_english = df['language'] == 'English'
df.loc[is_english].shape

(1017, 15)

In [7]:
# Keep only the English-language topics and save them as a separate CSV
# (index=False: the row index is not written as a column).
english_rows = df['language'] == 'English'
df_upd = df.loc[english_rows]
df_upd.to_csv(
    'data/medlineplus_topics_english_2025-11-19.csv',
    index=False,
)