# Standard Chartered Scraper

In [2]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [4]:
# HTTP headers passed to the requests.get calls below. Sends a browser-style
# User-Agent instead of the library default.
# NOTE(review): presumably needed because the site rejects the default
# python-requests agent — confirm.
headers = {"User-Agent": "Mozilla/5.0"}

def extract_tables(soup):
    """Flatten every ``<table>`` found in *soup* into one delimited string.

    Cells (``<td>``/``<th>``) within a row are joined with " | ", rows within
    a table with " / ", and tables with " || ". Rows that contain no cells and
    tables that end up with no rows are left out.

    Args:
        soup: A parsed document (or any node) exposing ``find_all``.

    Returns:
        The flattened text of all tables, or "" when none qualify.
    """
    flattened = []
    for table_node in soup.find_all("table"):
        # Fetch each row's cells once, then keep only rows that have any.
        cells_per_row = (
            row_node.find_all(["td", "th"])
            for row_node in table_node.find_all("tr")
        )
        row_texts = [
            " | ".join(cell.get_text(strip=True) for cell in cells)
            for cells in cells_per_row
            if cells
        ]
        if not row_texts:
            continue
        flattened.append(" / ".join(row_texts))
    return " || ".join(flattened)

# Step 1: Get sitemap HTML
sitemap_url = "https://www.sc.com/sg/sitemap/"
# Use the same timeout as the per-page requests so a stalled connection
# cannot hang the whole run.
response = requests.get(sitemap_url, headers=headers, timeout=10)
# Fail fast on an HTTP error: without a valid sitemap there is nothing to
# scrape, and parsing an error page would silently yield zero (or junk) URLs.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Step 2: Collect every site-relative link under /sg/ from the sitemap.
# NOTE: the only filter applied is the "/sg/" prefix; nothing here restricts
# the list to a particular section of the site.
all_links = soup.find_all("a", href=True)
base_url = "https://www.sc.com"
personal_urls = [
    base_url + link["href"]
    for link in all_links
    if link["href"].startswith("/sg/")
]

# Remove duplicates while keeping first-seen order, so the scrape order and
# the CSV row order are reproducible between runs (a set has arbitrary order).
personal_urls = list(dict.fromkeys(personal_urls))
print(f"✅ Found {len(personal_urls)} personal banking pages.")

# Step 3: Scrape each URL and save content to CSV
with open("sc_personal_banking_content.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow([
        "URL", "Title", "H1", "Subheadings", "Paragraphs", "Bullet Points", "Tables"
    ])

    for i, url in enumerate(personal_urls):
        print(f"[{i+1}/{len(personal_urls)}] Scraping: {url}")
        try:
            page = requests.get(url, headers=headers, timeout=10)
            # Treat 4xx/5xx responses as failures so error pages are recorded
            # as ERROR rows instead of being scraped as real content.
            page.raise_for_status()
            page_soup = BeautifulSoup(page.text, "html.parser")

            # Title (look the tag up once and reuse it)
            title_tag = page_soup.find("title")
            title = title_tag.text.strip() if title_tag is not None else ""

            # H1
            h1_tag = page_soup.find("h1")
            h1 = h1_tag.text.strip() if h1_tag is not None else ""

            # H2s
            h2s = [h2.text.strip() for h2 in page_soup.find_all("h2")]
            subheadings = " | ".join(h2s)

            # Paragraphs (skip ones that are empty after stripping)
            paragraphs = [
                p.text.strip()
                for p in page_soup.find_all("p")
                if p.text.strip()
            ]
            paragraph_text = " | ".join(paragraphs)

            # Bullet points (skip ones that are empty after stripping)
            bullets = [
                li.text.strip()
                for li in page_soup.find_all("li")
                if li.text.strip()
            ]
            bullet_text = " | ".join(bullets)

            # Tables
            tables = extract_tables(page_soup)

            # Write to CSV
            writer.writerow([
                url, title, h1, subheadings, paragraph_text, bullet_text, tables
            ])

        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not abort the
            # run. The failure is logged and recorded as an ERROR row, with the
            # exception text stored in the last ("Tables") column.
            print(f"❌ Failed to scrape {url}: {e}")
            writer.writerow([url, "ERROR", "", "", "", "", str(e)])

        # Pause between requests to avoid hammering the server.
        time.sleep(1)

print("✅ Scraping complete. Results saved to 'sc_personal_banking_content.csv'")


✅ Found 81 personal banking pages.
[1/81] Scraping: https://www.sc.com/sg/priority/
[2/81] Scraping: https://www.sc.com/sg/borrow/mortgages/green-mortgage/
[3/81] Scraping: https://www.sc.com/sg/insurance/allianz-home-protect/
[4/81] Scraping: https://www.sc.com/sg/save/savings-accounts/esaver/
[5/81] Scraping: https://www.sc.com/sg/rewards-programmes/ars/
[6/81] Scraping: https://www.sc.com/sg/borrow/mortgages/mortgageone/
[7/81] Scraping: https://www.sc.com/sg/borrow/mortgages/loanrepricing/
[8/81] Scraping: https://www.sc.com/sg/insurance/msig-maidplus/
[9/81] Scraping: https://www.sc.com/sg/pricing-guide/
[10/81] Scraping: https://www.sc.com/sg/wealth/investment/livefx/
[11/81] Scraping: https://www.sc.com/sg/find/?referId=pb-insure
[12/81] Scraping: https://www.sc.com/sg/borrow/mortgages/
[13/81] Scraping: https://www.sc.com/sg/market-outlook/
[14/81] Scraping: https://www.sc.com/sg/credit-cards/journey-credit-card/
[15/81] Scraping: https://www.sc.com/sg/borrow/travel/
[16/81] Sc