# Scrape the list of Business Times (BT) articles from the BT residential property listing page

In [10]:
import csv
import time
from datetime import datetime, date
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

# --- Selenium imports ---
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# ================= Config =================
# Publication-date window for kept articles; both bounds are inclusive
# (see within_window).
START_DATE = date(2025, 1, 1)
END_DATE   = date(2025, 9, 30)

# Listing page that is scrolled to discover article links.
LISTING_URL = "https://www.businesstimes.com.sg/property/residential"
# NOTE: the date range in this filename is hard-coded; update it by hand if
# START_DATE / END_DATE change.
OUT_CSV     = "Output/bt_residential_2025-01-01_to_2025-09-30.csv"

# How many scroll-batches at most (safety cap)
MAX_SCROLL_BATCHES = 60

# Pause timings (seconds)
SCROLL_PAUSE_SEC = 1.2          # pause after each scroll
NEW_CARDS_SETTLE_SEC = 1.5      # give time for new cards to render
REQUEST_TIMEOUT = 20            # timeout passed to requests.get for article pages
PAUSE_BETWEEN_ARTICLE_REQUESTS = 0.3  # sleep after each processed article

# Accept these meta slots as "published_time".
# Each entry is (attribute name, attribute value) of a <meta> tag; they are
# tried in this order and the first one with a non-empty content wins.
PUBLISHED_META_SLOTS = [
    ("property", "article:published_time"),
    ("name",     "article:published_time"),
    ("name",     "parsely-pub-date"),
    ("itemprop", "datePublished"),
]

# IANA timezone name used for naive timestamps and for the output column.
SGT_TZ = "Asia/Singapore"

# Headers sent with every article request.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    )
}

# ================= Helpers =================
def to_abs(base, href):
    """Return *href* as an absolute URL, resolving it against *base* if needed.

    An href that already starts with ``http://`` or ``https://`` is returned
    unchanged. A missing or empty href yields ``None``.
    """
    if href:
        already_absolute = href.startswith(("http://", "https://"))
        return href if already_absolute else urljoin(base, href)
    return None

def same_domain(u):
    """Return True when the network location of URL *u* is a Business Times host."""
    allowed_hosts = ("www.businesstimes.com.sg", "businesstimes.com.sg")
    host = urlparse(u).netloc
    return host in allowed_hosts

def find_published_meta(soup):
    """Look up the article's published timestamp in its <meta> tags.

    The slots in PUBLISHED_META_SLOTS are tried in order. Returns a tuple
    ``(content, source)`` where *content* is the stripped ``content``
    attribute of the first matching tag and *source* is a selector-like
    label naming the slot that matched. Returns ``(None, None)`` when no
    slot has a non-empty ``content``.
    """
    for attr_name, attr_value in PUBLISHED_META_SLOTS:
        meta = soup.find("meta", attrs={attr_name: attr_value})
        if not meta:
            continue
        raw = meta.get("content")
        if raw:
            return raw.strip(), f'meta[{attr_name}="{attr_value}"]'
    return None, None

def parse_iso_like(iso_str):
    """Parse an ISO-8601-like timestamp string into an aware ``datetime``.

    ``datetime.fromisoformat`` is tried first, then a short list of explicit
    ``strptime`` formats. Surrounding whitespace is ignored and a trailing
    ``Z``/``z`` is treated as UTC. A timestamp without any UTC offset is
    assumed to be Singapore time (SGT_TZ).

    Returns ``None`` for empty input or when no format matches.
    """
    if not iso_str:
        return None
    s = iso_str.strip()
    if not s:
        return None
    # Only a *trailing* Z is the UTC designator; rewrite it to an explicit
    # offset so older fromisoformat implementations accept it.
    if s[-1] in "Zz":
        s = s[:-1] + "+00:00"
    dt = None
    try:
        dt = datetime.fromisoformat(s)
    except ValueError:
        for fmt in (
            "%Y-%m-%dT%H:%M:%S%z",
            "%Y-%m-%dT%H:%M:%S.%f%z",
            "%Y-%m-%d %H:%M:%S%z",
            "%Y-%m-%dT%H:%M",
            "%Y-%m-%d %H:%M:%S",
        ):
            try:
                dt = datetime.strptime(s, fmt)
                break
            except ValueError:
                continue
    if dt is not None and dt.tzinfo is None:
        # Naive timestamp: attach the Singapore zone rather than guessing UTC.
        from zoneinfo import ZoneInfo
        dt = dt.replace(tzinfo=ZoneInfo(SGT_TZ))
    return dt

def to_sgt(dt):
    """Return *dt* converted to the Singapore timezone (SGT_TZ).

    NOTE(review): callers in this file pass aware datetimes from
    parse_iso_like; behaviour for naive input follows ``astimezone``.
    """
    from zoneinfo import ZoneInfo

    singapore = ZoneInfo(SGT_TZ)
    return dt.astimezone(singapore)

def within_window(dt):
    """Return True when *dt*'s calendar date lies in [START_DATE, END_DATE], inclusive."""
    day = dt.date()
    too_early = day < START_DATE
    too_late = day > END_DATE
    return not (too_early or too_late)

def fetch_article_soup(url):
    """Download *url* and return its HTML parsed with ``html.parser``.

    The request uses the module HEADERS and REQUEST_TIMEOUT. Any request
    failure or non-2xx status propagates to the caller as an exception.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT, headers=HEADERS)
    response.raise_for_status()
    markup = response.text
    return BeautifulSoup(markup, "html.parser")

def extract_title(soup):
    """Return the stripped text of the first <h1>, or "" when there is none."""
    heading = soup.find("h1")
    if not heading:
        return ""
    return heading.get_text(strip=True)

# ================= Selenium setup (fixed) =================
# Build the Chrome launch options for the listing-page crawl.
chrome_options = Options()
chrome_options.add_argument("--headless=new")  # comment this out if you want to see the browser
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--window-size=1280,2000")
chrome_options.add_argument("--no-sandbox")

# Pass the driver binary via service= and the flags via options=.
# ChromeDriverManager().install() supplies the chromedriver path.
from selenium.webdriver.chrome.service import Service
service = Service(ChromeDriverManager().install())

# Start the browser and open the listing page.
driver = webdriver.Chrome(service=service, options=chrome_options)
driver.get(LISTING_URL)

# Wait (up to 20 s) until at least one anchor with an href is in the DOM.
# On failure, close the browser before aborting so it is not left running.
try:
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "a[href]"))
    )
except Exception:
    driver.quit()
    raise SystemExit("Failed to load the listing page or find anchors.")

# Accumulated in-window articles (dicts with title / url / published_sgt)
# and every link already examined, so each URL is fetched at most once.
rows = []
seen_links = set()

# For early-stopping heuristics
global_oldest_seen = None
batches_since_last_new_in_window = 0

# The crawl runs inside try/finally so the Chrome session is always shut down,
# even when the loop raises or is interrupted part-way through.
try:
    for batch in range(1, MAX_SCROLL_BATCHES + 1):
        # Scroll to bottom to trigger lazy load/infinite scroll
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_SEC)
        time.sleep(NEW_CARDS_SETTLE_SEC)

        # Collect same-domain anchors currently in the DOM
        anchors = driver.find_elements(By.CSS_SELECTOR, "a[href]")
        hrefs = set()
        for a in anchors:
            try:
                href = a.get_attribute("href")
            except Exception:
                # Best effort: skip anchors that cannot be read
                # (presumably elements that went stale after the scroll).
                continue
            if href and same_domain(href):
                hrefs.add(href)

        # Process only NEW links (dedup); sorted for a stable processing order
        new_links = sorted(hrefs - seen_links)
        if not new_links and batches_since_last_new_in_window >= 2:
            print(f"\nNo new links for 2 batches. Stopping at batch {batch}.")
            break

        # Per-batch counters for the summary printed below
        kept_this_batch = 0
        checked_meta_this_batch = 0
        drop_before_this_batch = 0
        drop_after_this_batch = 0
        no_meta_this_batch = 0
        errors_this_batch = 0
        oldest_this_batch = None

        for u in new_links:
            seen_links.add(u)

            # Quick cheap skip for obvious section roots
            path = urlparse(u).path.rstrip("/")
            if path in {"", "/", "/property", "/property/residential", "/property/commercial-industrial"}:
                continue

            # Fetch article page and check for published_time meta
            try:
                art = fetch_article_soup(u)
            except Exception:
                errors_this_batch += 1
                continue

            iso, src = find_published_meta(art)
            if not iso:
                no_meta_this_batch += 1
                continue

            checked_meta_this_batch += 1
            dt = parse_iso_like(iso)
            if not dt:
                # meta exists but unparsable; treat as no-date
                continue

            # Track the oldest publication date seen (per batch and overall)
            sgt_dt = to_sgt(dt)
            if (oldest_this_batch is None) or (sgt_dt.date() < oldest_this_batch):
                oldest_this_batch = sgt_dt.date()
            if (global_oldest_seen is None) or (sgt_dt.date() < global_oldest_seen):
                global_oldest_seen = sgt_dt.date()

            if within_window(sgt_dt):
                title = extract_title(art)
                rows.append({
                    "title": title,
                    "url": u,
                    "published_sgt": sgt_dt.strftime("%Y-%m-%d %H:%M"),
                })
                kept_this_batch += 1
            elif sgt_dt.date() < START_DATE:
                drop_before_this_batch += 1
            else:
                drop_after_this_batch += 1

            time.sleep(PAUSE_BETWEEN_ARTICLE_REQUESTS)

        # ---- Batch summary ----
        print(f"\n--- Batch {batch} summary ---")
        print(f"New links discovered:      {len(new_links)}")
        print(f"Meta-backed links checked: {checked_meta_this_batch}")
        print(f"Kept (in window):          {kept_this_batch}")
        print(f"Dropped before window:     {drop_before_this_batch}")
        print(f"Dropped after window:      {drop_after_this_batch}")
        print(f"No published_time meta:    {no_meta_this_batch}")
        print(f"Errors fetching articles:  {errors_this_batch}")
        print(f"Oldest date this batch:    {oldest_this_batch or '(none)'}")
        print(f"Global oldest seen:        {global_oldest_seen or '(none)'}")

        # Early-stop rule:
        # - if we have seen any dates and the global oldest is earlier than
        #   START_DATE, and nothing was kept in the last 2 batches, stop.
        if kept_this_batch > 0:
            batches_since_last_new_in_window = 0
        else:
            batches_since_last_new_in_window += 1

        if (global_oldest_seen is not None) and (global_oldest_seen < START_DATE) and (batches_since_last_new_in_window >= 2):
            print("\nReached older-than-start dates and no new in-window items for 2 batches. Stopping.")
            break
finally:
    # Tidy up Selenium
    driver.quit()

# ===== Output =====
# Write the kept articles to OUT_CSV, oldest first. "published_sgt" is a
# zero-padded "YYYY-MM-DD HH:MM" string, so sorting it as text is chronological.
rows.sort(key=lambda r: r["published_sgt"])

# Make sure the output folder exists so the scrape results are not lost to a
# FileNotFoundError at write time.
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)

with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["title", "url", "published_sgt"])
    w.writeheader()
    w.writerows(rows)

print("\n=== Overall ===")
print(f"Total kept (in window): {len(rows)}")
print(f"Saved to: {OUT_CSV}")



--- Batch 1 summary ---
New links discovered:      139
Meta-backed links checked: 30
Kept (in window):          15
Dropped before window:     0
Dropped after window:      15
No published_time meta:    100
Errors fetching articles:  0
Oldest date this batch:    2025-07-14
Global oldest seen:        2025-07-14

--- Batch 2 summary ---
New links discovered:      10
Meta-backed links checked: 10
Kept (in window):          10
Dropped before window:     0
Dropped after window:      0
No published_time meta:    0
Errors fetching articles:  0
Oldest date this batch:    2025-06-13
Global oldest seen:        2025-06-13

--- Batch 3 summary ---
New links discovered:      10
Meta-backed links checked: 10
Kept (in window):          10
Dropped before window:     0
Dropped after window:      0
No published_time meta:    0
Errors fetching articles:  0
Oldest date this batch:    2025-04-28
Global oldest seen:        2025-04-28

--- Batch 4 summary ---
New links discovered:      10
Meta-backed links ch

# Scrape the full article content for each URL in the output CSV from the previous step

In [5]:
# ======= CONFIG =======
# CSV of article URLs to fetch; read with pandas in main().
INPUT_CSV = "Output/bt_residential_2025-01-01_to_2025-09-30.csv"  # must contain a column named 'url'
# Excel workbook written by main(): the input rows plus the extracted content.
OUTPUT_XLSX = "Output/bt_articles_with_content.xlsx"

# If True, launch Chrome with Selenium so you can log in to BT once.
# If False, pages are fetched with plain requests (no login).
USE_SELENIUM_AUTH = True

In [6]:
import csv
import json
import time
import html
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Politeness
REQUEST_TIMEOUT = 25  # timeout passed to requests.get in fetch_html_requests
PAUSE_BETWEEN_REQUESTS = 0.5  # seconds

# Headers sent with every plain-requests fetch.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/122.0.0.0 Safari/537.36"
    )
}

# Some phrases/containers we typically want to skip if they appear in-paragraph.
# Matching is a case-sensitive substring test against each paragraph's text.
STOP_PHRASES = {
    "Get the BT app", "Get WhatsApp alerts", "Also read:", "Related:", "More on this topic",
    "Have a news tip?", "Sign up", "Subscribe to", "Unlimited access", "Already a subscriber"
}


# ======= REQUESTS MODE (no login) =======
def fetch_html_requests(url: str) -> str | None:
    """Fetch *url* with plain ``requests`` and return the decoded HTML.

    Best effort: any failure (network error, timeout, non-2xx status) is
    printed and reported as ``None`` instead of being raised.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        # Best-effort encoding normalisation.
        # Requests falls back to ISO-8859-1 for text/* responses whose
        # Content-Type carries no charset, which garbles UTF-8 pages. In that
        # case use the encoding detected from the body instead.
        charset_declared = "charset" in r.headers.get("Content-Type", "").lower()
        if not r.encoding:
            r.encoding = "utf-8"
        elif not charset_declared:
            r.encoding = r.apparent_encoding or "utf-8"
        return r.text
    except Exception as e:
        print(f"  [requests] fetch failed for {url}: {e}")
        return None


# ======= SELENIUM MODE (login once, use your subscription) =======
# Shared Selenium WebDriver; stays None until init_selenium() has been called.
driver = None


def init_selenium():
    """Start Chrome, open the BT home page and block until the user has logged in.

    The created WebDriver is stored in the module-level ``driver``. The
    function waits on ``input()`` so the user can complete the login in the
    opened browser window before scraping begins.
    """
    global driver
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    opts = Options()
    # Headless mode is left disabled so the Chrome UI is visible for the login:
    # opts.add_argument("--headless=new")
    for flag in ("--disable-gpu", "--window-size=1280,2000", "--no-sandbox"):
        opts.add_argument(flag)

    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()),
        options=opts,
    )

    # Land on the BT site so the user can sign in with their subscription
    driver.get("https://www.businesstimes.com.sg/")
    print("\nA Chrome window has opened. Please log in to The Business Times with your subscription.")
    print("Once you can access a paywalled article in that window, return here and press Enter to continue.")
    input("Press Enter after logging in successfully... ")


def fetch_html_selenium(url: str) -> str | None:
    """Load *url* in the shared Selenium driver and return the page source.

    Any failure is printed and reported as ``None`` rather than raised.
    """
    try:
        driver.get(url)
        # Fixed wait to let the page render (adjust if needed)
        time.sleep(2.0)
        page = driver.page_source
    except Exception as e:
        print(f"  [selenium] fetch failed for {url}: {e}")
        return None
    return page


# ======= CONTENT EXTRACTION =======
def parse_jsonld_article_body(soup: BeautifulSoup) -> str | None:
    """Return ``articleBody`` from the page's JSON-LD, or ``None`` if absent.

    Every ``<script type="application/ld+json">`` block is examined
    independently, so one malformed block does not hide a valid one later in
    the page. Within a block, top-level objects and the members of a
    top-level ``@graph`` list are considered. An object qualifies when any of
    its ``@type`` values (a string or a list of strings) contains "article"
    or "reportage", case-insensitively. The first non-empty string body found
    is HTML-unescaped, stripped and returned.
    """
    for tag in soup.find_all("script", type="application/ld+json"):
        txt = tag.string or tag.get_text()
        if not txt or not txt.strip():
            continue
        try:
            data = json.loads(txt.strip())
        except ValueError:
            # Malformed JSON in this block only; keep scanning the others.
            continue

        # The payload may be a single object or a list of objects, and
        # objects may wrap their nodes in an "@graph" list.
        top_level = data if isinstance(data, list) else [data]
        nodes = []
        for obj in top_level:
            nodes.append(obj)
            if isinstance(obj, dict) and isinstance(obj.get("@graph"), list):
                nodes.extend(obj["@graph"])

        for obj in nodes:
            if not isinstance(obj, dict):
                continue
            raw_type = obj.get("@type") or ""
            type_names = raw_type if isinstance(raw_type, list) else [raw_type]
            is_article = any(
                isinstance(name, str)
                and ("article" in name.lower() or "reportage" in name.lower())
                for name in type_names
            )
            if not is_article:
                continue
            body = obj.get("articleBody")
            if isinstance(body, str) and body.strip():
                return html.unescape(body).strip()
    return None


def extract_article_text_from_dom(soup: BeautifulSoup) -> str:
    """
    Fallback extractor: build the article text from <p> elements.

    The search root is the first <article>; failing that, the first match of
    a short list of content-like selectors; failing that, the whole document.
    Paragraphs that are empty or contain any STOP_PHRASES entry are dropped.
    If the chosen root yields nothing, every <p> in the document is tried.
    Paragraphs are joined with blank lines and HTML entities are unescaped.
    """
    def usable_paragraphs(container):
        # Non-empty paragraph texts that contain no utility / promotion phrase
        kept = []
        for node in container.find_all("p"):
            text = node.get_text(" ", strip=True)
            if text and not any(phrase in text for phrase in STOP_PHRASES):
                kept.append(text)
        return kept

    root = soup.find("article")
    if root is None:
        # First content-ish container that exists, tried in this order
        fallback_selectors = ("main", "section", "div[role='main']", "div.content", "div.article")
        hits = (soup.select_one(sel) for sel in fallback_selectors)
        root = next((hit for hit in hits if hit), None)
    if root is None:
        root = soup  # last resort: whole document

    # Very last resort when the root has no usable paragraph: scan all <p>
    paras = usable_paragraphs(root) or usable_paragraphs(soup)

    # Blank lines between paragraphs keep some of the original structure
    joined = "\n\n".join(paras).strip()
    return html.unescape(joined)


def extract_article_content(html_text: str) -> str:
    """Extract the article text from raw HTML.

    The JSON-LD ``articleBody`` is preferred when present and longer than
    80 characters; otherwise the text is assembled from DOM paragraphs.
    """
    soup = BeautifulSoup(html_text, "html.parser")

    jsonld_body = parse_jsonld_article_body(soup)
    # Length check guards against stub / teaser bodies
    use_jsonld = bool(jsonld_body) and len(jsonld_body) > 80
    if use_jsonld:
        return jsonld_body.strip()
    return extract_article_text_from_dom(soup)


# ======= MAIN =======
def main():
    """Fetch every URL listed in INPUT_CSV and save the rows plus content to OUTPUT_XLSX.

    Adds two columns to the input table: ``content`` (extracted article text,
    empty on failure) and ``content_chars`` (its length). Pages are fetched
    through the logged-in Selenium browser when USE_SELENIUM_AUTH is true,
    otherwise with plain requests.

    Raises:
        SystemExit: if the input CSV has no ``url`` column (case-insensitive).
    """
    global driver

    # Load URLs
    df = pd.read_csv(INPUT_CSV)
    # Normalise column name
    if "url" not in df.columns:
        # try case-insensitive; str() guards against non-string column labels
        for c in df.columns:
            if str(c).lower().strip() == "url":
                df.rename(columns={c: "url"}, inplace=True)
                break
    if "url" not in df.columns:
        raise SystemExit("Input CSV must contain a 'url' column.")

    contents = []
    char_counts = []

    # try/finally guarantees the browser is closed even if scraping fails or
    # is interrupted part-way through.
    try:
        if USE_SELENIUM_AUTH:
            init_selenium()

        for i, url in enumerate(df["url"], start=1):
            # Missing / blank URL cells keep their row but get empty content
            if not isinstance(url, str) or not url.strip():
                contents.append("")
                char_counts.append(0)
                continue

            print(f"[{i}/{len(df)}] Fetching: {urlparse(url).path}")
            time.sleep(PAUSE_BETWEEN_REQUESTS)

            # Fetch HTML (requests or selenium)
            html_text = fetch_html_selenium(url) if USE_SELENIUM_AUTH else fetch_html_requests(url)
            if not html_text:
                print("    -> fetch failed; leaving content empty")
                contents.append("")
                char_counts.append(0)
                continue

            # Extract content
            content = extract_article_content(html_text)
            n_chars = len(content)

            print(f"    -> {n_chars} characters")
            contents.append(content)
            char_counts.append(n_chars)
    finally:
        # Clean up Selenium
        if USE_SELENIUM_AUTH and driver is not None:
            driver.quit()

    # Add columns and save to Excel
    df["content"] = contents
    df["content_chars"] = char_counts

    # Make sure output folder exists
    Path(OUTPUT_XLSX).parent.mkdir(parents=True, exist_ok=True)
    df.to_excel(OUTPUT_XLSX, index=False)
    print(f"\nSaved {len(df)} rows to {OUTPUT_XLSX}")


if __name__ == "__main__":
    main()



A Chrome window has opened. Please log in to The Business Times with your subscription.
Once you can access a paywalled article in that window, return here and press Enter to continue.


Press Enter after logging in successfully...  


[1/58] Fetching: /property/residential/river-valley-apartments-sale-s56-million
    -> 2732 characters
[2/58] Fetching: /property/residential/good-class-bungalow-sales-volume-2025-expected-match-if-not-exceed-last-years-tally
    -> 8545 characters
[3/58] Fetching: /property/residential/dont-spare-asset-rich-cash-poor-private-homeowners-paying-higher-property-taxes
    -> 6448 characters
[4/58] Fetching: /property/residential/measured-bids-tengah-housing-plot-bullish-play-dairy-farm-site-developers-stay-conservative
    -> 6484 characters
[5/58] Fetching: /property/hdb-supply-19600-bto-flats-2025-amid-continued-rise-resale-prices
    -> 3922 characters
[6/58] Fetching: /property/residential/jadescape-owner-sells-unit-s4-4-million-profit-after-5-years-topping-q4-resale-gains
    -> 5595 characters
[7/58] Fetching: /property/residential/hdb-resale-prices-climb-9-7-2024-rise-double-2023
    -> 6093 characters
[8/58] Fetching: /property/residential/ditch-vip-inside-track-new-condo-launches