# In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

def scrape_peopledaily(pages: int = 1) -> list:
    """Scrape the articles linked from the People Daily front page.

    Opens the front page in a headless Chrome session, records every link
    that has visible text, then visits each recorded link and extracts its
    date and paragraph text.

    Args:
        pages: Currently unused; only the front page is scanned. The
            parameter is kept so existing callers that pass it keep working.

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``date``,
        ``content``, ``source`` and ``label``. ``date`` is ``None`` when the
        visited page has no date element. Pages from which no paragraph text
        could be extracted are left out.
    """
    # Set up headless Selenium browser
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Optional: run in background
    options.add_argument('--disable-blink-features=AutomationControlled')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    base_url = "https://peopledaily.digital/"
    results = []

    try:
        driver.get(base_url)
        time.sleep(2)  # fixed pause before querying the page

        # Phase 1: snapshot (url -> title) while the front page is still the
        # current document. WebElements go stale as soon as the driver
        # navigates elsewhere, so nothing may be read from them once the
        # visiting loop below has started.
        candidates = {}
        for anchor in driver.find_elements(By.XPATH, "//a[contains(@href, '/')]"):
            url = anchor.get_attribute("href")

            # Filter out non-article links or repeated ones.
            # NOTE(review): these are plain substring tests, so a URL that
            # merely contains "tag" or "category" inside a longer word is
            # dropped too — confirm that is intended.
            if not url or url in candidates or "category" in url or "tag" in url:
                continue

            title = anchor.text.strip()
            if not title:
                # Not recorded, so a later anchor with the same href and
                # visible text can still supply the title.
                continue

            candidates[url] = title

        # Phase 2: visit each candidate (dicts keep insertion order, so the
        # pages are visited in the order the links appeared).
        for url, title in candidates.items():
            driver.get(url)
            time.sleep(2)

            # Try to extract date
            try:
                date = driver.find_element(By.CLASS_NAME, "content--date--date-time").text.strip()
            except NoSuchElementException:
                date = None

            # Try to extract content. The wrapper is matched on two classes
            # at once, which is a CSS selector rather than a single class
            # name, so it is spelled as one explicitly.
            try:
                wrapper = driver.find_element(By.CSS_SELECTOR, ".col-md-7.content")
            except NoSuchElementException:
                continue

            # Read each paragraph's text once; every .text access is a
            # separate WebDriver call.
            paragraph_texts = (p.text.strip() for p in wrapper.find_elements(By.TAG_NAME, "p"))
            content = "\n".join(text for text in paragraph_texts if text)

            # Add only if we have content
            if not content:
                continue

            results.append({
                "title": title,
                "url": url,
                "date": date,
                "content": content,
                "source": "People Daily",
                "label": "real",
            })

    finally:
        # Always shut the browser down, even if scraping raised.
        driver.quit()

    return results
