In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

BASE = "https://www.who.int"

# Step 1: Get all health topic links from the main page
def get_topic_links(timeout=30):
    """Collect the health-topic links listed on the WHO health-topics index page.

    Args:
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A list of unique ``(name, url)`` tuples in the order they first appear
        on the page. ``name`` is the anchor's ``aria-label`` when present,
        otherwise its visible text; ``url`` is an absolute URL under
        ``{BASE}/health-topics/``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    url = f"{BASE}/health-topics"
    # Without a timeout a stalled connection would block the run forever.
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of parsing an error page into an empty topic list.
    res.raise_for_status()
    soup = BeautifulSoup(res.content, 'html.parser')

    topic_prefix = f"{BASE}/health-topics/"
    topics = []
    for item in soup.select('div.list-view--item a'):
        name = item.get('aria-label') or item.get_text(strip=True)
        href = item.get('href')
        if not href:
            continue
        # Resolve site-relative links ("/health-topics/x") against BASE so
        # they are not discarded by the absolute-prefix check below.
        if href.startswith("/") and not href.startswith("//"):
            href = f"{BASE}{href}"
        if href.startswith(topic_prefix):
            topics.append((name, href))

    # dict.fromkeys drops duplicates while keeping first-seen order, so
    # repeated runs visit topics in a stable, reproducible sequence.
    return list(dict.fromkeys(topics))

# Step 2: Scrape all tab sections (Overview, Impact, etc.) from each topic page
def scrape_topic_tabs(name, url, timeout=30):
    """Scrape the text of every tab section on a single health-topic page.

    Args:
        name: Display name of the topic; stored under the ``'topic'`` key.
        url: Absolute URL of the topic page; stored under the ``'url'`` key.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        A dict with ``'topic'`` and ``'url'`` plus one entry per
        ``div.singleTabWrapper`` block, keyed by the tab title and holding the
        tab's text. Titles that are missing or blank become ``"Unknown"``, and
        a title that is already used gets a ``" (2)"``, ``" (3)"``, ... suffix
        so no section overwrites another. On any failure the dict instead
        holds ``'topic'``, ``'url'`` and an ``'Error'`` message.
    """
    print(f"🔍 Scraping: {name} -> {url}")
    try:
        # Without a timeout a single stalled page would hang the whole crawl.
        res = requests.get(url, timeout=timeout)
        # Record HTTP errors as errors rather than as topics with no tabs.
        res.raise_for_status()
        soup = BeautifulSoup(res.content, 'html.parser')

        result = {'topic': name, 'url': url}

        for block in soup.select('div.singleTabWrapper'):
            # Extract tab title; fall back to "Unknown" when absent or blank.
            title_div = block.select_one('div.tabHeaderWrapper .content-block')
            tab_title = (title_div.get_text(strip=True) if title_div else "") or "Unknown"

            # Extract tab content
            content_div = block.select_one('div.tabContent div.sf_colsIn')
            content_text = content_div.get_text(separator=' ', strip=True) if content_div else ""

            # Keep keys unique: repeated titles (or a tab literally named
            # "topic"/"url") would otherwise silently replace earlier data.
            key = tab_title
            suffix = 2
            while key in result:
                key = f"{tab_title} ({suffix})"
                suffix += 1
            result[key] = content_text

        return result

    except Exception as e:
        # Deliberately broad: one bad page must not abort the whole crawl, so
        # the failure is reported and recorded in the returned row instead.
        print(f"❌ Error on {url}: {e}")
        return {'topic': name, 'url': url, 'Error': str(e)}

# Step 3: Discover the topic pages, then scrape them one at a time
topics = get_topic_links()
print(f"Found {len(topics)} health topics.")

results = []
for topic_name, topic_url in topics:
    results.append(scrape_topic_tabs(topic_name, topic_url))
    # Pause one second after each page to avoid hammering the server.
    time.sleep(1)

# Step 4: Turn the per-topic dicts into a table and write it out as CSV
df = pd.DataFrame(results)
df.to_csv(
    "who_health_topics_tabs_fixed.csv",
    index=False,  # omit the DataFrame index column from the file
    encoding='utf-8-sig',
)
print("✅ Done. Saved to who_health_topics_tabs_fixed.csv")


Found 198 health topics.
🔍 Scraping: Common goods for health -> https://www.who.int/health-topics/common-goods-for-health
🔍 Scraping: Social determinants of health -> https://www.who.int/health-topics/social-determinants-of-health
🔍 Scraping: Mental health -> https://www.who.int/health-topics/mental-health
🔍 Scraping: Radiation -> https://www.who.int/health-topics/radiation
🔍 Scraping: Scabies -> https://www.who.int/health-topics/scabies
🔍 Scraping: Infodemic -> https://www.who.int/health-topics/infodemic
🔍 Scraping: Biologicals -> https://www.who.int/health-topics/biologicals
🔍 Scraping: Pertussis -> https://www.who.int/health-topics/pertussis
🔍 Scraping: Measles -> https://www.who.int/health-topics/measles
🔍 Scraping: Adolescent health -> https://www.who.int/health-topics/adolescent-health
🔍 Scraping: Snakebite envenoming -> https://www.who.int/health-topics/snakebite
🔍 Scraping: Cardiovascular diseases -> https://www.who.int/health-topics/cardiovascular-diseases
🔍 Scraping: Meningitis -> https