In [3]:
import undetected_chromedriver as uc
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import pandas as pd

# Scrape People Daily: collect article links from the front page, visit each
# one, extract its date and body text, and save the result to CSV.
#
# Produces (module level, usable by later cells):
#   unique_articles - de-duplicated {"title", "url"} records found on the front page
#   final_data      - list of dicts, one per article that yielded non-empty content
#   df              - DataFrame built from final_data
# Side effect: writes "peopledaily_articles_uc.csv" to the working directory.

# Start undetected Chrome browser
options = uc.ChromeOptions()
options.headless = True  # Run headless (no browser window)

driver = uc.Chrome(options=options)

# Target site
base_url = "https://peopledaily.digital/"

article_data = []
final_data = []

# Everything that touches the browser runs inside try/finally so the Chrome
# process is always shut down, even if the front-page load fails or the cell
# is interrupted part-way through the article loop.
try:
    driver.get(base_url)
    time.sleep(5)  # fixed pause after navigation; no explicit wait condition is used

    # Collect article URLs and titles: every anchor whose href starts with the site root
    articles = driver.find_elements(By.CSS_SELECTOR, "a[href^='https://peopledaily.digital/']")

    print("🔎 Extracting article links...")

    for link in articles:
        href = link.get_attribute('href')
        title = link.text.strip()

        # Keep only links with visible text, and skip any URL containing
        # "page" or "/tag/" (case-insensitive).
        if title and href and "page" not in href.lower() and "/tag/" not in href.lower():
            article_data.append({"title": title, "url": href})

    # De-duplicate by URL; when a URL appears more than once the last title seen wins.
    unique_articles = {item['url']: item for item in article_data}.values()

    print(f"📰 Visiting {len(unique_articles)} articles...")

    for article in unique_articles:
        try:
            driver.get(article['url'])
            time.sleep(3)  # fixed pause after navigation before querying the page

            # Get article date (None when the date element is absent)
            try:
                date_element = driver.find_element(By.CLASS_NAME, "content--date--date-time")
                date = date_element.text.strip()
            except NoSuchElementException:
                date = None

            # Get article content (None when the content container is absent)
            try:
                content_div = driver.find_element(By.CSS_SELECTOR, "div.col-md-7.content")
                paragraphs = content_div.find_elements(By.TAG_NAME, "p")
                # Read each paragraph's text once: every .text access is a
                # separate WebDriver call.
                paragraph_texts = (p.text.strip() for p in paragraphs)
                content = " ".join(text for text in paragraph_texts if text)
            except NoSuchElementException:
                content = None

            # Articles with no extracted text (None or empty string) are dropped.
            if content:
                final_data.append({
                    "source": "People Daily",
                    "title": article['title'],
                    "url": article['url'],
                    "date": date,
                    "content": content
                })

        except TimeoutException:
            print(f"⏱️ Timeout at {article['url']}")
        except Exception as e:
            # Best-effort scraping: report the failure and move on to the next article.
            print(f"⚠️ Error at {article['url']}: {e}")
finally:
    driver.quit()

df = pd.DataFrame(final_data)
print(f"\n✅ Done! Scraped {len(df)} full articles.")

df.to_csv("peopledaily_articles_uc.csv", index=False)


🔎 Extracting article links...
📰 Visiting 53 articles...

✅ Done! Scraped 42 full articles.


In [4]:
# Write the scraped articles to the project's raw-data folder (path is
# relative to the notebook's working directory); the index is not written.
raw_csv_path = "../Data/RawData/people_daily_articles.csv"
df.to_csv(raw_csv_path, index=False)