# Scrape the list of CNA articles from a single CNA topic page

In [5]:
# --------------------------
# CONFIG
# --------------------------
# `date` is used right here, before the main import block further down, so it
# is imported locally to keep this section runnable on its own.
from datetime import date

# Topic listing page to crawl; listing pages are requested as "<TOPIC_URL>?page=N".
TOPIC_URL = "https://www.channelnewsasia.com/topic/housing-and-development-board"

# Publication-date window used to filter articles (both ends inclusive).
START_DATE = date(2025, 1, 1)   # inclusive
END_DATE   = date(2025, 9, 30)  # inclusive

# Destination CSV for the collected (title, url, publish time) rows.
OUTPUT_CSV = "Output/cna_housing_and_development_board_20250101_20250930.csv"

# Upper bound on the number of listing pages requested (page=0 .. MAX_PAGES-1).
MAX_PAGES = 10

In [7]:
import csv
import json
import os
import re
import time
from datetime import date, datetime, timedelta, timezone
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# Polite crawling
# HTTP headers sent with every request made by fetch(): a desktop-Chrome
# User-Agent string and an English Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
# Passed as the `timeout` argument of requests.get() in fetch().
REQUEST_TIMEOUT = 20
# Pause inserted (via time.sleep) before each article request in main().
SLEEP_BETWEEN_REQUESTS = 1.0  # seconds


# --------------------------
# HELPERS
# --------------------------
def is_article_url(url: str) -> bool:
    """Return True when *url* is on a CNA host and not in an excluded section.

    Excluded: video ("/watch"), audio ("/podcast", "/listen"), "/profile",
    "/rss", "/weather" and the bare home page "/". The host must match one of
    the two CNA host names exactly; the path is compared case-insensitively.
    """
    parts = urlparse(url)
    allowed_hosts = ("www.channelnewsasia.com", "channelnewsasia.com")
    if parts.netloc not in allowed_hosts:
        return False

    lowered = parts.path.lower()

    # Sections rejected when the path *starts* with them.
    excluded_prefixes = ("/watch", "/listen", "/profile", "/rss", "/weather")
    # Sections rejected wherever they occur in the path.
    excluded_fragments = ("/watch/", "/podcast")

    if lowered == "/" or lowered.startswith(excluded_prefixes):
        return False
    return not any(fragment in lowered for fragment in excluded_fragments)


# Cheap pre-filter: the value must start with YYYY-MM-DD before we try to parse it.
ISO_DT_RE = re.compile(r"^\d{4}-\d{2}-\d{2}")

# Fixed UTC+8 offset; every parsed timestamp is normalised to it.
SGT = timezone(timedelta(hours=8))

# <meta> attribute filters that may carry the publish timestamp, tried in order.
_PUBLISH_META_ATTRS = (
    {"property": "article:published_time"},
    {"name": "pubdate"},
    {"name": "publish-date"},
    {"name": "date"},
    {"itemprop": "datePublished"},
    {"property": "og:pubdate"},
    {"name": "parsely-pub-date"},
    {"name": "cXenseParse:recs:publishtime"},
)


def _iso_to_sgt(value) -> datetime | None:
    """Parse an ISO-8601 string into a UTC+8 datetime, or return None."""
    if not isinstance(value, str):
        return None
    value = value.strip()
    if not ISO_DT_RE.match(value):
        return None
    try:
        # fromisoformat() historically rejects a trailing "Z", so spell it out.
        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))
        # NOTE: a value without an offset parses as naive, and astimezone()
        # then interprets it in the machine's local time zone.
        return parsed.astimezone(SGT)
    except (ValueError, OverflowError, OSError):
        return None


def parse_publish_datetime_from_article(html: str) -> datetime | None:
    """Extract the article's publish time from its HTML.

    Sources are tried in order and the first parseable value wins:
      1. JSON-LD blocks (``datePublished``, falling back to ``dateModified``),
      2. well-known ``<meta>`` tags,
      3. ``<time datetime=...>`` / ``<time content=...>`` elements.

    Args:
        html: Raw HTML of the article page.

    Returns:
        A timezone-aware datetime converted to UTC+8, or None when no
        parseable timestamp is found.
    """
    soup = BeautifulSoup(html, "html.parser")

    # 1) JSON-LD blocks
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (ValueError, RecursionError):
            # Malformed block: skip it, other blocks may still be usable.
            continue

        candidates = data if isinstance(data, list) else [data]
        for obj in candidates:
            if not isinstance(obj, dict):
                continue
            typ = obj.get("@type") or obj.get("@context")
            looks_like_article = (
                (isinstance(typ, str) and "Article" in typ)
                or obj.get("headline")
                or obj.get("datePublished")
            )
            if not looks_like_article:
                continue
            found = _iso_to_sgt(obj.get("datePublished") or obj.get("dateModified"))
            if found is not None:
                return found

    # 2) Meta tags
    for attrs in _PUBLISH_META_ATTRS:
        el = soup.find("meta", attrs=attrs)
        if el and el.get("content"):
            found = _iso_to_sgt(el["content"])
            if found is not None:
                return found

    # 3) <time> tags
    for t in soup.find_all("time"):
        found = _iso_to_sgt(t.get("datetime") or t.get("content") or "")
        if found is not None:
            return found

    return None


def fetch(url: str) -> str | None:
    """GET *url* and return the body decoded as UTF-8.

    Any failure (network error, timeout, non-2xx status, ...) is reported on
    stdout as a warning and turned into a None return instead of an exception.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        # Force UTF-8 instead of relying on the encoding guessed from headers.
        response.encoding = "utf-8"
        response.raise_for_status()
        body = response.text
    except Exception as exc:
        print(f"[WARN] GET failed: {url} | {exc}")
        return None
    return body


def within_range(dt: datetime) -> bool:
    """Tell whether *dt*, taken as a UTC calendar date, lies in [START_DATE, END_DATE]."""
    utc_day = dt.astimezone(timezone.utc).date()
    if utc_day < START_DATE:
        return False
    return utc_day <= END_DATE


def date_is_before_start(dt: datetime) -> bool:
    """Tell whether *dt*, taken as a UTC calendar date, falls before START_DATE."""
    utc_day = dt.astimezone(timezone.utc).date()
    return utc_day < START_DATE


def extract_listing_links(topic_html: str) -> list[str]:
    """Return the sorted, de-duplicated absolute article URLs linked from a listing page.

    In-page anchors (href starting with "#") are ignored. Every other href is
    resolved against TOPIC_URL and kept only if is_article_url() accepts it.
    """
    anchors = BeautifulSoup(topic_html, "html.parser").find_all("a", href=True)
    resolved = (
        urljoin(TOPIC_URL, anchor["href"])
        for anchor in anchors
        if not anchor["href"].startswith("#")
    )
    return sorted({link for link in resolved if is_article_url(link)})


# --------------------------
# MAIN
# --------------------------
def _article_title(article_html: str) -> str:
    """Return the og:title of the page, falling back to <title>, else ""."""
    soup = BeautifulSoup(article_html, "html.parser")
    meta_title = soup.find("meta", property="og:title")
    if meta_title and meta_title.get("content"):
        return meta_title["content"].strip()
    t = soup.find("title")
    return t.text.strip() if t else ""


def main():
    """Crawl the topic listing and write the in-range articles to OUTPUT_CSV.

    Listing pages ``?page=0 .. MAX_PAGES-1`` are fetched in turn. Every
    candidate link is fetched once, its publish time is parsed, and the
    article is kept when within_range() accepts it. Pagination stops early
    when a page is empty or unreachable, or when a page yields no in-range
    article and at least 10 articles older than START_DATE.

    The CSV has the columns ``title``, ``url`` and ``published_utc``. The
    last one holds ``dt.isoformat()`` of the datetime returned by the parser,
    so its UTC offset is whatever the parser produced.
    """
    print(f"Scraping CNA Housing articles from {START_DATE} to {END_DATE} ...")
    rows = []
    seen_urls = set()

    reached_older_than_start = False

    # Listing pages are zero-based.
    for page in range(MAX_PAGES):
        page_url = f"{TOPIC_URL}?page={page}"
        print(f"\n[Page {page}] {page_url}")
        html = fetch(page_url)
        if not html:
            print("No HTML returned; stopping.")
            break

        links = extract_listing_links(html)
        if not links:
            print("No candidate links found on page; stopping.")
            break

        print(f"Found {len(links)} candidate links; fetching articles...")
        page_old_count = 0
        page_in_range = 0

        for article_url in links:
            # Navigation links repeat on every listing page; fetch each URL once.
            if article_url in seen_urls:
                continue
            seen_urls.add(article_url)

            time.sleep(SLEEP_BETWEEN_REQUESTS)
            article_html = fetch(article_url)
            if not article_html:
                continue

            # Pages without a parseable publish date are skipped.
            dt = parse_publish_datetime_from_article(article_html)
            if not dt:
                continue

            if within_range(dt):
                rows.append({
                    "title": _article_title(article_html),
                    "url": article_url,
                    "published_utc": dt.isoformat(),
                })
                page_in_range += 1
            elif date_is_before_start(dt):
                page_old_count += 1

        print(f"Kept in range on this page: {page_in_range} | Older-than-start on this page: {page_old_count}")

        if page_in_range == 0 and page_old_count >= 10:
            reached_older_than_start = True
            print("Reached mostly items older than start date; stopping pagination.")
            break

    # URLs are already unique thanks to seen_urls, so only sorting is left.
    final_rows = sorted(rows, key=lambda r: r["published_utc"])

    # Create the output directory up front so a missing folder cannot discard
    # the result of the whole crawl.
    out_dir = os.path.dirname(OUTPUT_CSV)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write CSV (only the requested columns)
    fieldnames = ["title", "url", "published_utc"]
    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(final_rows)

    print(f"\nDone. Saved {len(final_rows)} rows to {OUTPUT_CSV}")
    if not reached_older_than_start:
        print("Note: You may increase MAX_PAGES if you think there are more pages within range.")

# Run the listing scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Scraping CNA Housing articles from 2025-01-01 to 2025-09-30 ...

[Page 0] https://www.channelnewsasia.com/topic/housing-and-development-board?page=0
Found 81 candidate links; fetching articles...
Kept in range on this page: 12 | Older-than-start on this page: 0

[Page 1] https://www.channelnewsasia.com/topic/housing-and-development-board?page=1
Found 74 candidate links; fetching articles...
Kept in range on this page: 9 | Older-than-start on this page: 0

[Page 2] https://www.channelnewsasia.com/topic/housing-and-development-board?page=2
Found 81 candidate links; fetching articles...
Kept in range on this page: 4 | Older-than-start on this page: 11

[Page 3] https://www.channelnewsasia.com/topic/housing-and-development-board?page=3
Found 78 candidate links; fetching articles...
Kept in range on this page: 0 | Older-than-start on this page: 12
Reached mostly items older than start date; stopping pagination.

Done. Saved 25 rows to Output/cna_housing_and_development_board_20250101_20250930.csv

# Scrape the contents of the articles listed in the output file

In [15]:
# --------------------------
# CONFIG
# --------------------------
# List of article URLs to scrape (read by read_urls_from_csv).
INPUT_CSV = "Output/cna_housing_and_development_board_20250101_20250930.csv"  # CSV with at least a 'url' column
# Destination workbook written by save_rows_to_excel.
OUTPUT_XLSX = "Output/articles_cna_housing_and_development_board_20250101_20250930.xlsx"

In [16]:
"""
Scrape CNA article pages (title, publish time in Singapore, cleaned text)
from a CSV list of URLs and save results to an Excel (.xlsx) file.

Updates in this version:
- Stops scraping body once it encounters the phrase "Sign up for our newsletters"
  (case-insensitive), so ad/housekeeping blocks at the end are excluded.
- Adds a 'content_chars' column (character count) to help detect truncation.

Input CSV must contain a 'url' column (case-insensitive). If not found,
the first column is treated as URL.
"""

import csv
import json
import os
import re
import time
from datetime import datetime, timedelta, timezone
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook

# HTTP headers sent with every request made by fetch(): a desktop-Chrome
# User-Agent string and an English Accept-Language preference.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}
# Passed as the `timeout` argument of requests.get() in fetch().
REQUEST_TIMEOUT = 20
SLEEP_BETWEEN_REQUESTS = 0.6  # seconds, be polite

# Fixed UTC+8 offset that parsed publish times are converted to.
SGT = timezone(timedelta(hours=8))
# Cheap pre-filter: a timestamp string must start with YYYY-MM-DD.
ISO_DT_RE = re.compile(r"^\d{4}-\d{2}-\d{2}")

# Phrase that signals start of housekeeping/ads; stop collecting at this
# (kept lowercase because it is compared against lowercased text).
STOP_MARKER = "sign up for our newsletters"

# Boilerplate lines to drop
# Each entry is a regex fragment; they are OR-ed together below and applied
# with re.search (case-insensitive), so a hit anywhere in a paragraph makes
# _clean_text() discard that whole paragraph.
# NOTE(review): the `$`-terminated entries are not anchored at the start, so
# any text that merely *ends* with e.g. "related" or "read" (including
# "spread") also matches -- confirm whether a leading `^` was intended.
_BOILERPLATE_PATTERNS = [
    r"Get CNA updates on WhatsApp",
    r"Subscribe to our newsletter",
    r"Related:?$",
    r"Recommended(?: for you)?:?$",
    r"READ:?$",
    r"READ ALSO:?$",
    r"Follow us on",
    r"Read more:?$",
    r"Get the CNA app",
    r"Get WhatsApp alerts",
    r"Also worth reading",
]
_BOILERPLATE_RE = re.compile("|".join(_BOILERPLATE_PATTERNS), re.IGNORECASE)

# Heuristic content class hints
# Matched against the space-joined CSS classes of <div>/<section> elements to
# guess which container holds the article body.
_CONTENT_CLASS_RE = re.compile(
    r"(article|story|post).*(content|body)|"
    r"(content|text|rich)(-|_)?(body|area|container)",
    re.IGNORECASE
)


# --------------------------
# HELPERS
# --------------------------
def ensure_dirs(path: str):
    """Create the parent directory of *path* (and any ancestors) if needed.

    Does nothing when *path* has no directory component.
    """
    parent = os.path.dirname(path)
    if not parent:
        return
    os.makedirs(parent, exist_ok=True)


def fetch(url: str) -> str | None:
    """Download *url* and return its body as text, or None on any failure.

    Failures (connection problems, timeouts, HTTP error statuses, ...) are
    printed as a warning rather than raised.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        # Decode as UTF-8 explicitly; a wrongly guessed charset is what
        # produces garbled curly quotes and dashes in the output.
        resp.encoding = "utf-8"
        resp.raise_for_status()
        page_text = resp.text
    except Exception as err:
        print(f"[WARN] GET failed: {url} | {err}")
        return None
    return page_text


def _clean_text(s: str) -> str:
    """Normalise whitespace in *s*; return "" if it matches a boilerplate pattern.

    Leading/trailing whitespace is removed, non-breaking spaces become plain
    spaces, and runs of spaces/tabs collapse to a single space.
    """
    normalized = s.strip().replace("\xa0", " ")
    normalized = re.sub(r"[ \t]+", " ", normalized)
    return "" if _BOILERPLATE_RE.search(normalized) else normalized


def _mojibake_byte(ch: str) -> bytes:
    """Map one character back to the single byte it was mis-decoded from."""
    try:
        return ch.encode("cp1252")
    except UnicodeEncodeError:
        # Bytes that cp1252 leaves undefined (e.g. 0x9D) surface as C1 control
        # characters; Latin-1 maps those straight back. This still raises for
        # characters beyond U+00FF, which the caller treats as "not mojibake".
        return ch.encode("latin1")


def _maybe_fix_misencoded(text: str) -> str:
    """Repair UTF-8 text that was wrongly decoded as Windows-1252.

    Such text shows curly quotes and dashes as three-character sequences
    beginning with U+00E2 U+20AC. When that marker is present, the text is
    turned back into bytes and decoded as UTF-8. If the round trip fails the
    text is returned unchanged, so this is a safe best-effort step.

    The euro sign in the marker is not representable in Latin-1, so a plain
    ``encode("latin1")`` can never succeed here; cp1252 is required.

    Args:
        text: Possibly mis-decoded text.

    Returns:
        The repaired text, or *text* itself when no repair applies.
    """
    marker = "\u00e2\u20ac"  # first two characters of a mis-decoded E2 80 xx sequence
    if marker not in text:
        return text
    try:
        raw = b"".join(_mojibake_byte(ch) for ch in text)
        return raw.decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def _truncate_after_marker(text: str) -> str:
    """Drop everything from the first STOP_MARKER occurrence (case-insensitive) onward.

    Text without the marker, and empty text, are returned unchanged. The kept
    part has trailing whitespace removed.
    """
    if not text:
        return text
    cut = text.lower().find(STOP_MARKER)
    if cut == -1:
        return text
    return text[:cut].rstrip()


def parse_publish_datetime_sgt(html: str) -> datetime | None:
    """Find the article's publish time and return it as an aware datetime in SGT (UTC+8).

    Lookup order: JSON-LD (datePublished, then dateModified), a list of
    well-known <meta> tags, then <time> elements. Returns None if none of
    them yields a parseable ISO timestamp.
    """

    def to_sgt(candidate):
        # Accept only strings that start like an ISO date; anything that
        # fails to parse or convert is treated as "no timestamp here".
        if not (isinstance(candidate, str) and ISO_DT_RE.match(candidate)):
            return None
        try:
            return datetime.fromisoformat(candidate.replace("Z", "+00:00")).astimezone(SGT)
        except Exception:
            return None

    doc = BeautifulSoup(html, "html.parser")

    # Source 1: JSON-LD (datePublished preferred over dateModified)
    for ld_script in doc.find_all("script", type="application/ld+json"):
        try:
            payload = json.loads(ld_script.string or "")
        except Exception:
            continue
        entries = payload if isinstance(payload, list) else [payload]
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            kind = entry.get("@type") or entry.get("@context")
            articleish = (
                (isinstance(kind, str) and "Article" in kind)
                or entry.get("headline")
                or entry.get("datePublished")
            )
            if not articleish:
                continue
            stamp = to_sgt(entry.get("datePublished") or entry.get("dateModified"))
            if stamp is not None:
                return stamp

    # Source 2: <meta> tags, in priority order
    meta_selectors = (
        {"property": "article:published_time"},
        {"name": "pubdate"},
        {"name": "publish-date"},
        {"name": "date"},
        {"itemprop": "datePublished"},
        {"property": "og:pubdate"},
        {"name": "parsely-pub-date"},
        {"name": "cXenseParse:recs:publishtime"},
    )
    for selector in meta_selectors:
        tag = doc.find("meta", attrs=selector)
        if not tag or not tag.get("content"):
            continue
        stamp = to_sgt(tag["content"].strip())
        if stamp is not None:
            return stamp

    # Source 3: <time> elements
    for time_tag in doc.find_all("time"):
        stamp = to_sgt((time_tag.get("datetime") or time_tag.get("content") or "").strip())
        if stamp is not None:
            return stamp

    return None


def extract_article_content(html: str) -> str:
    """Return cleaned plaintext of the article body, truncated at STOP_MARKER.

    Strategies are tried in order and the first one that yields enough text wins:
      1. ``articleBody`` of a JSON-LD "Article" object (more than 40 chars),
      2. the ``<article>`` element (``itemprop="articleBody"`` children first),
      3. ``<div>``/``<section>`` elements whose class matches _CONTENT_CLASS_RE,
      4. ``<main>`` or ``<body>`` as a last resort.
    Strategies 2-4 need more than 100 characters of paragraph text.

    Args:
        html: Raw HTML of the article page.

    Returns:
        Paragraphs joined by blank lines, or "" when nothing usable is found.
    """
    soup = BeautifulSoup(html, "html.parser")

    def finalize(text: str) -> str:
        # Shared post-processing for every strategy.
        return _truncate_after_marker(_maybe_fix_misencoded(text))

    # 1) JSON-LD articleBody
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (ValueError, RecursionError):
            # One malformed block must not stop us from reading the others.
            continue
        blocks = data if isinstance(data, list) else [data]
        for obj in blocks:
            if not (isinstance(obj, dict) and obj.get("@type") and "Article" in str(obj.get("@type"))):
                continue
            body = obj.get("articleBody")
            if not (isinstance(body, str) and len(body.strip()) > 40):
                continue
            cleaned = (_clean_text(x) for x in re.split(r"\n{2,}", body.strip()))
            text = "\n\n".join(part for part in cleaned if part)
            if text:
                return finalize(text)

    def collect_paragraphs(container):
        paras = []
        for p in container.find_all(["p", "h2", "h3"], recursive=True):
            # Captions, side notes and pull quotes are not body text.
            if p.find_parent(["figure", "figcaption", "aside", "blockquote"]):
                continue
            # get_text() keeps the whitespace between inline tags; joining
            # stripped fragments with "" would glue neighbouring words
            # together and could hide the stop marker.
            raw = re.sub(r"\s+", " ", p.get_text()).strip()
            # Stop if we hit the marker phrase in this paragraph
            if STOP_MARKER in raw.lower():
                break
            txt = _clean_text(raw)
            if txt:
                paras.append(txt)
        return paras

    def long_enough(paras) -> bool:
        return len(" ".join(paras)) > 100

    # 2) <article> block
    article_tag = soup.find("article")
    if article_tag:
        for c in article_tag.find_all(attrs={"itemprop": "articleBody"}):
            paras = collect_paragraphs(c)
            if long_enough(paras):
                return finalize("\n\n".join(paras))
        paras = collect_paragraphs(article_tag)
        if long_enough(paras):
            return finalize("\n\n".join(paras))

    # 3) Heuristic containers
    for div in soup.find_all(["div", "section"], class_=True):
        classes = " ".join(div.get("class") or [])
        if _CONTENT_CLASS_RE.search(classes):
            paras = collect_paragraphs(div)
            if long_enough(paras):
                return finalize("\n\n".join(paras))

    # 4) Last resort: main/body
    main_like = soup.find("main") or soup.body
    if main_like:
        paras = collect_paragraphs(main_like)
        if long_enough(paras):
            return finalize("\n\n".join(paras))

    return ""


def parse_title(html: str) -> str:
    """Return the page title: the og:title meta content if present, else <title>, else ""."""
    doc = BeautifulSoup(html, "html.parser")

    og_tag = doc.find("meta", property="og:title")
    og_value = og_tag.get("content") if og_tag else None
    if og_value:
        return og_value.strip()

    title_tag = doc.find("title")
    if not title_tag:
        return ""
    return title_tag.text.strip()


def is_article_url(url: str) -> bool:
    """Return True for CNA URLs outside the video, audio and utility sections.

    The host must be exactly one of the two CNA host names; the path check is
    case-insensitive.
    """
    parsed = urlparse(url)
    path = parsed.path.lower()

    on_cna = parsed.netloc in {"www.channelnewsasia.com", "channelnewsasia.com"}
    is_video = path.startswith("/watch") or "/watch/" in path
    is_audio = "/podcast" in path or path.startswith("/listen")
    is_utility = path.startswith(("/profile", "/rss", "/weather")) or path == "/"

    return on_cna and not (is_video or is_audio or is_utility)


def scrape_article(url: str) -> dict:
    """Fetch one article and return its title, URL, publish time and body text.

    Raises:
        RuntimeError: if the page could not be downloaded.
    """
    page = fetch(url)
    if not page:
        raise RuntimeError("Failed to fetch article HTML.")

    headline = parse_title(page)
    when = parse_publish_datetime_sgt(page)
    body = extract_article_content(page)

    record = {"title": headline, "url": url}
    # An unknown publish time is stored as an empty string.
    record["published_sgt_iso"] = when.isoformat() if when else ""
    record["content_text"] = body
    # Character count helps spot truncated or empty extractions.
    record["content_chars"] = len(body) if body else 0
    return record


def read_urls_from_csv(path: str) -> list[str]:
    """Read article URLs from a CSV file.

    Column selection:
      * If the first row contains a ``url`` cell (case-insensitive), that
        column is used and the first row is treated as a header.
      * Otherwise the first column is used. The first row is skipped as a
        header unless it is blank or its first cell already looks like a URL
        (headerless file), in which case it is kept as data.

    Blank rows, rows too short to contain the URL column, and empty URL
    cells are skipped.

    Args:
        path: Path of the CSV file (UTF-8, optional BOM).

    Returns:
        The URLs in file order, stripped of surrounding whitespace.
    """
    with open(path, "r", encoding="utf-8-sig", newline="") as f:
        rows = list(csv.reader(f))
    if not rows:
        return []

    header = [h.strip().lower() for h in rows[0]]
    url_idx = 0
    data_rows = rows
    if "url" in header:
        url_idx = header.index("url")
        data_rows = rows[1:]
    elif any(header) and not header[0].startswith(("http://", "https://")):
        # A real header without a 'url' column: fall back to the first column.
        data_rows = rows[1:]
    # else: blank first row or headerless file -> every row is data.

    urls = []
    for r in data_rows:
        # Guard against blank and ragged rows instead of raising IndexError.
        if len(r) <= url_idx:
            continue
        u = r[url_idx].strip()
        if u:
            urls.append(u)
    return urls


def save_rows_to_excel(rows: list[dict], output_path: str):
    """Write *rows* to an .xlsx workbook at *output_path*.

    The workbook has one sheet named "Articles" with a header row followed by
    one row per dict; missing keys become empty cells. Column widths are set
    to the longest cell text plus 2, capped at 80.
    """
    ensure_dirs(output_path)

    columns = ["title", "url", "published_sgt_iso", "content_text", "content_chars"]

    book = Workbook()
    sheet = book.active
    sheet.title = "Articles"

    sheet.append(columns)
    for record in rows:
        sheet.append([record.get(name, "") for name in columns])

    # Size each column to its widest cell so the sheet is readable on open.
    for col_cells in sheet.columns:
        letter = col_cells[0].column_letter
        widest = 0
        for cell in col_cells:
            try:
                rendered = "" if cell.value is None else str(cell.value)
            except Exception:
                continue
            widest = max(widest, len(rendered))
        sheet.column_dimensions[letter].width = min(widest + 2, 80)

    book.save(output_path)
    print(f"✅ Saved Excel file to: {output_path}")


def main():
    """Scrape every article URL listed in INPUT_CSV and save the results to OUTPUT_XLSX.

    URLs rejected by is_article_url() are skipped. A failure on one article
    is recorded and reported at the end without stopping the run.
    """
    print(f"Reading URLs from: {INPUT_CSV}")
    urls = read_urls_from_csv(INPUT_CSV)
    total = len(urls)
    print(f"Found {total} URL(s).")

    scraped = []
    failed = []

    for position, url in enumerate(urls, start=1):
        if not is_article_url(url):
            print(f"[{position}/{total}] Skipping non-article URL: {url}")
            continue

        print(f"[{position}/{total}] Scraping: {url}")
        time.sleep(SLEEP_BETWEEN_REQUESTS)
        try:
            scraped.append(scrape_article(url))
        except Exception as exc:
            print(f"[ERROR] {url} | {exc}")
            failed.append((url, str(exc)))

    save_rows_to_excel(scraped, OUTPUT_XLSX)

    if failed:
        print("\nSome URLs failed to scrape:")
        for bad_url, reason in failed:
            print(f"- {bad_url} | {reason}")

    print(f"\nDone. Scraped {len(scraped)} article(s).")


# Run the article scraper only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Reading URLs from: Output/cna_housing_and_development_board_20250101_20250930.csv
Found 25 URL(s).
[1/25] Scraping: https://www.channelnewsasia.com/singapore/worksite-death-tengah-hdb-plantation-edge-bto-indian-national-construction-workplace-safety-mom-4843271
[2/25] Scraping: https://www.channelnewsasia.com/singapore/19600-bto-flats-launched-2025-desmond-lee-4859601
[3/25] Scraping: https://www.channelnewsasia.com/singapore/singapore-not-averse-new-property-cooling-measures-desmond-lee-4860611
[4/25] Scraping: https://www.channelnewsasia.com/singapore/mogulsg-property-listings-ai-driven-tool-home-buyers-4918046
[5/25] Scraping: https://www.channelnewsasia.com/singapore/hdb-home-improvement-programme-29000-flats-4941461
[6/25] Scraping: https://www.channelnewsasia.com/singapore/hdb-february-2025-bto-sales-exercise-tanjong-rhu-queenstown-4943201
[7/25] Scraping: https://www.channelnewsasia.com/singapore/bto-new-flats-multi-agency-committee-support-residents-5061846
[8/25] Scraping: htt