In [3]:
import xml.etree.ElementTree as ET
import pandas as pd

# Parse the MedlinePlus health-topics XML export
# (file obtained from https://medlineplus.gov/xml.html).
tree = ET.parse('/mplus_topics_2025-04-05.xml')
root = tree.getroot()

# Gather the attribute dict of every <health-topic> element that is a
# direct child of the root element.
data = [topic.attrib for topic in root.findall('health-topic')]

# Build the table: one row per topic, columns taken from the attribute names.
df = pd.DataFrame(data)

# Quick preview of the first rows.
print(df.head())

# Export the full table as CSV ('utf-8-sig' writes a leading BOM).
df.to_csv('health_topics.csv', index=False, encoding='utf-8-sig')


                                           meta-desc           title  \
0  If you are being tested for Type 2 diabetes, y...             A1C   
1  Stomach aches can be painful. Find out what mi...  Abdominal Pain   
2  An abortion is a medical procedure to end a pr...        Abortion   
3  Un aborto es un procedimiento médico para term...          Aborto   
4  Los abscesos se llenan de pus y pueden aparece...        Abscesos   

                                             url    id language date-created  
0               https://medlineplus.gov/a1c.html  6308  English   12/22/2015  
1     https://medlineplus.gov/abdominalpain.html  3061  English   01/07/2003  
2          https://medlineplus.gov/abortion.html   122  English   03/19/2002  
3  https://medlineplus.gov/spanish/abortion.html  2238  Spanish   10/31/2006  
4   https://medlineplus.gov/spanish/abscess.html  3064  Spanish   09/15/2010  


In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# 1. Keep only the rows whose 'language' column equals 'English'.
#    .copy() makes df_english an independent frame rather than a view of df.
df_english = df.loc[df['language'].eq('English')].copy()

# 2. Define a crawler function that grabs summary text
def fetch_summary(url):
    """Download a topic page and return the text of its summary block.

    The page is requested with a 10-second timeout and parsed with
    BeautifulSoup's 'html.parser'. The summary is taken from the
    ``<div id="topic-summary" class="syndicate">`` element; its text
    fragments are stripped and joined with single spaces.

    Args:
        url: Address of the page to fetch.

    Returns:
        The summary text, or None when the response status is not 200,
        when the summary div is not found, or when any exception is raised
        while fetching/parsing (the error is printed, not re-raised).
    """
    try:
        page = requests.get(url, timeout=10)
        if page.status_code == 200:
            parsed = BeautifulSoup(page.content, 'html.parser')
            summary_node = parsed.find('div', id='topic-summary', class_='syndicate')
            return summary_node.get_text(separator=' ', strip=True) if summary_node else None
        # Any non-200 response is treated as "no summary available".
        return None
    except Exception as err:
        # Best-effort crawl: report the failure and let the caller continue.
        print(f"Error fetching {url}: {err}")
        return None

# 3. Fetch the summary for every English topic URL, one request at a time.
summaries = []
for url in df_english['url']:
    print(f"Fetching: {url}")
    summary_text = fetch_summary(url)
    summaries.append(summary_text)
    # Pause one second between requests to avoid getting the IP blocked.
    time.sleep(1)

# 4. Attach the collected summaries as a new 'summary' column
#    (entries are None where fetch_summary returned None).
df_english['summary'] = summaries

# 5. Save the enriched table as CSV ('utf-8-sig' writes a leading BOM).
df_english.to_csv(
    'english_health_topics_with_summary.csv',
    index=False,
    encoding='utf-8-sig',
)

print("Get！！ english_health_topics_with_summary.csv")


Fetching: https://medlineplus.gov/a1c.html
Fetching: https://medlineplus.gov/abdominalpain.html
Fetching: https://medlineplus.gov/abortion.html
Fetching: https://medlineplus.gov/abscess.html
Fetching: https://medlineplus.gov/acne.html
Fetching: https://medlineplus.gov/acousticneuroma.html
Fetching: https://medlineplus.gov/acupuncture.html
Fetching: https://medlineplus.gov/acutebronchitis.html
Fetching: https://medlineplus.gov/acuteflaccidmyelitis.html
Fetching: https://medlineplus.gov/acutelymphocyticleukemia.html
Fetching: https://medlineplus.gov/acutemyeloidleukemia.html
Fetching: https://medlineplus.gov/addisondisease.html
Fetching: https://medlineplus.gov/adenoids.html
Fetching: https://medlineplus.gov/adhesions.html
Fetching: https://medlineplus.gov/adrenalglandcancer.html
Fetching: https://medlineplus.gov/adrenalglanddisorders.html
Fetching: https://medlineplus.gov/advancedirectives.html
Fetching: https://medlineplus.gov/aftersurgery.html
Fetching: https://medlineplus.gov/airpoll