# OCBC Scraper

In [77]:
import requests
from bs4 import BeautifulSoup
import bs4
import csv
import time
import re

import selenium
from selenium import webdriver

In [78]:
# OCBC product pages to scrape, grouped by product family.

# Deposit-account and investment-account pages
# (URLs under /deposits/ and /investments/).
accounts_links = [
    "https://www.ocbc.com/personal-banking/deposits/360-savings-account?pid=Mass:accounts:360Acc:AlwaysOn:Acquisition:Sep:2022:internal:::peb-banner:evergreen:::CFSPersonalise",
    "https://www.ocbc.com/personal-banking/deposits/multi-currency-global-savings-account",
    "https://www.ocbc.com/personal-banking/deposits/bonus-plus-savings-account",
    "https://www.ocbc.com/personal-banking/deposits/multi-currency-usd-current-account",
    "https://www.ocbc.com/personal-banking/deposits/monthly-savings-account",
    "https://www.ocbc.com/personal-banking/deposits/fixed-deposit-account",
    "https://www.ocbc.com/personal-banking/deposits/myown-account",
    "https://www.ocbc.com/personal-banking/deposits/child-development-account",
    "https://www.ocbc.com/personal-banking/deposits/mighty-savers-child-savings-account",
    "https://www.ocbc.com/personal-banking/deposits/passbook-savings-account",
    "https://www.ocbc.com/personal-banking/deposits/statement-passbook-savings-account",
    "https://www.ocbc.com/personal-banking/deposits/basic-current-account",
    "https://www.ocbc.com/personal-banking/investments/online-equities-account",
    "https://www.ocbc.com/personal-banking/investments/unit-trusts/what-are-unit-trusts",
    "https://www.ocbc.com/personal-banking/investments/roboinvest",
    "https://www.ocbc.com/personal-banking/investments/precious-metals-account"
]

# Card pages (credit cards plus the debit card page).
card_links = [
    "https://www.ocbc.com/personal-banking/cards/infinity-unlimited-cashback-card",
    "https://www.ocbc.com/personal-banking/cards/rewards-card",
    "https://www.ocbc.com/personal-banking/cards/365-cashback-credit-card",
    "https://www.ocbc.com/personal-banking/cards/90-degrees-travel-credit-card.page",
    "https://www.ocbc.com/personal-banking/cards/bnpl-nxt-card.page",
    "https://www.ocbc.com/premier-banking/our-solutions/cards/premier-visa-infinite",
    "https://www.ocbc.com/personal-banking/cards/great-eastern-cashflo-credit-card.page",
    "https://www.ocbc.com/personal-banking/cards/platinum-rewards-credit-card.page",
    "https://www.ocbc.com/personal-banking/cards/best-denki-rewards-credit-card.page",
    "https://www.ocbc.com/personal-banking/cards/arts-rewards-credit-card.page",
    "https://www.ocbc.com/personal-banking/cards/ocbc-debit-card.page",
]

# Loan pages.
loan_links = [
    "https://www.ocbc.com/personal-banking/loans/easicredit-flexible-repayment-cash-loan",
    "https://www.ocbc.com/personal-banking/loans/car-loans",
    "https://www.ocbc.com/personal-banking/loans/new-purchase-of-hdb-private-property",
]

# Insurance pages.
insurance_links = [
    "https://www.ocbc.com/personal-banking/insurance/investment-linked-insurance-plan-2",
    "https://www.ocbc.com/personal-banking/insurance/great-careshield",
    "https://www.ocbc.com/personal-banking/insurance/great-term-guard",
    "https://www.ocbc.com/personal-banking/insurance/explorer-travel-insurance"
]

# Full scrape queue: accounts, then cards, then loans, then insurance.
links = accounts_links + card_links + loan_links + insurance_links

In [79]:
# Sanity check: number of URLs queued for scraping.
len(links)


34

## Header Chunk

In [80]:
# Filters
def header_filter(tag: bs4.element.Tag):
    """Return True for a <div> with class 'com_slider' that contains no <a> element.

    Intended to be passed to ``find_all`` as a tag predicate.
    """
    if tag.name != "div":
        return False
    if "com_slider" not in tag.get("class", []):
        return False
    # Slider blocks that contain a link are rejected.
    contains_link = tag.find("a")
    return not contains_link

def section_filter(tag: bs4.element.Tag):
    """Return True for a content <div> whose id starts with 'section-'.

    The tag must also carry the 'section' class, and its id must not be one of
    the explicitly skipped section ids listed below.
    """
    if tag.name != "div":
        return False

    section_id = tag.get("id", "")
    if not section_id.startswith("section-"):
        return False

    # Sections that are deliberately left out of the scrape.
    skipped_ids = frozenset((
        "section-interestcalculator",
        "section-articles",
        "section-stitch",
        "section-bankingservices",
    ))
    if section_id in skipped_ids:
        return False

    return "section" in tag.get("class", [])


In [81]:
def extract_metadata_tags(url):
    """Build a comma-separated category label from keywords found in ``url``.

    Each category is added when any of its keywords occurs anywhere in the URL
    string (plain substring match). Categories appear in the fixed order of
    the table below. Returns "general" when nothing matches.
    """
    # (keywords, label) pairs, in output order.
    keyword_labels = (
        (("insurance",), "insurance"),
        (("loan", "loans"), "loans"),
        (("credit-card", "cards"), "credit cards"),
        (("save", "deposit"), "savings"),
        (("invest",), "investments"),
        (("wealth",), "wealth management"),
    )

    matched = [
        label
        for keywords, label in keyword_labels
        if any(keyword in url for keyword in keywords)
    ]

    if not matched:
        return "general"
    return ", ".join(matched)

In [82]:
def _extract_header_rows(main, page_fields):
    """Return one row per non-empty <span> inside the page's 'com_slider' header blocks."""
    header_blocks = main.find_all(header_filter)
    print(f"Found {len(header_blocks)} matching 'com_slider' sections.")

    rows = []
    for block in header_blocks:
        current_subheader = ""
        for tag in block.descendants:
            # .descendants also yields text nodes; only real tags are inspected.
            if not isinstance(tag, bs4.element.Tag):
                continue
            if tag.name == "b":
                # The most recent <b> labels the spans that follow it.
                current_subheader = tag.get_text(strip=True)
            elif tag.name == "span":
                text = tag.get_text(strip=True)
                if text:
                    rows.append({**page_fields, "subheader": current_subheader, "text": text})
    return rows


def _extract_section_rows(main, page_fields):
    """Return one row per non-empty <p>/<li> inside each 'section-*' content div."""
    sections = main.find_all(section_filter, recursive=True)
    print(f"✅ Found {len(sections)} structured section divs.")

    rows = []
    for section in sections:
        print(f"🔍 Scraping section: {section.get('id', 'No ID')}")

        current_h3 = ""
        current_b = ""
        current_subcat = ""

        # Fallback subheader derived from the section id.
        section_id = section.get("id", "")
        fallback_subheader = (
            section_id.replace("section-", "").replace("-", " ").title() if section_id else ""
        )

        # NOTE(review): find_all is recursive, so a <p> nested in an <li> yields
        # two rows with overlapping text. Confirm whether de-duplication is wanted.
        for tag in section.find_all(["h1", "h2", "h3", "b", "p", "li", "div"]):
            text = tag.get_text(strip=True)
            if not text:
                continue

            tag_name = tag.name.lower()
            classes = tag.get("class", [])

            if tag_name == "h3":
                current_h3 = text
            elif tag_name == "b":
                current_b = text
            elif tag_name == "div" and "subcategory" in classes:
                current_subcat = text
            elif tag_name in ("p", "li"):
                # Subheader priority: b > subcategory > h3 > section fallback
                subheader = current_b or current_subcat or current_h3 or fallback_subheader
                rows.append({**page_fields, "subheader": subheader, "text": text})
    return rows


def _write_rows_csv(rows, output_path):
    """Write the scraped rows to ``output_path`` as UTF-8 CSV; report (not raise) write errors."""
    print("Writing to CSV...")
    fieldnames = ["url", "title", "subtitle", "subheader", "text", "tag"]
    try:
        with open(output_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(rows)

        print(f"Data written to {output_path}.")
    except Exception as e:
        # Best-effort: a failed write is reported, not propagated.
        print(f"Error writing to CSV: {e}")


def scraper(links, output_path="ocbc_data.csv", page_load_wait=5):
    """Scrape each URL in ``links`` with headless Chrome and write the text to a CSV file.

    For every page, the first <h1> and <h2> under ``section#main-wrapper``
    become the title and subtitle. Text is collected from the 'com_slider'
    header blocks and from the 'section-*' content divs. Each row holds
    url, title, subtitle, subheader, text and tag.

    Args:
        links: Iterable of page URLs to visit, in order.
        output_path: Destination CSV path (default "ocbc_data.csv").
        page_load_wait: Seconds to sleep after each navigation before reading
            the page source (default 5).

    Returns:
        None. Results are written to ``output_path``; progress is printed.

    Notes:
        * A page without ``section#main-wrapper`` is skipped, so the rows
          already collected from other pages are still written.
        * The browser is always shut down, even when scraping raises.
    """
    # Initialise Selenium
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)

    rows = []
    try:
        for url in links:
            print(f"Current URL: {url}")

            driver.get(url)
            time.sleep(page_load_wait)

            soup = BeautifulSoup(driver.page_source, "html.parser")

            main = soup.find("section", id="main-wrapper")
            if not main:
                print("❌ Could not find main content area with id='main-wrapper'")
                continue
            print("✅ Found main content section.")

            # Page-level fields are the same for every row of this page.
            h1 = main.find("h1")
            h2 = main.find("h2")
            page_fields = {
                "url": url,
                "title": h1.get_text(strip=True) if h1 else "",
                "subtitle": h2.get_text(strip=True) if h2 else "",
                "tag": extract_metadata_tags(url),
            }

            # Scrape header content
            rows.extend(_extract_header_rows(main, page_fields))

            # Diagnostic listing of every 'section-*' div on the page,
            # including the ones section_filter will reject.
            candidates = soup.find_all("div", id=lambda x: x and x.startswith("section-"))
            print(f"Found {len(candidates)} divs with id starting 'section-':")
            for c in candidates:
                print(f"  -> id: {c['id']}, class: {c.get('class', [])}")

            # Scrape sectional content
            rows.extend(_extract_section_rows(main, page_fields))
    finally:
        # Always release the browser process, even on failure.
        driver.quit()

    # Write to CSV
    _write_rows_csv(rows, output_path)
    

In [83]:
# Run the scrape over every queued URL; results are written to ocbc_data.csv.
scraper(links)

Current URL: https://www.ocbc.com/personal-banking/deposits/360-savings-account?pid=Mass:accounts:360Acc:AlwaysOn:Acquisition:Sep:2022:internal:::peb-banner:evergreen:::CFSPersonalise
✅ Found main content section.
Found 1 matching 'com_slider' sections.
Found 11 divs with id starting 'section-':
  -> id: section-importantnotice, class: ['section', 'section-both', 'bg-grey']
  -> id: section-howitworks, class: ['section', 'section-both']
  -> id: section-interestcalculator, class: ['section', 'section-both', 'bg-grey', 'noprint']
  -> id: section-trackyourbonuseligibilityeasily, class: ['section', 'section-both']
  -> id: section-stitch, class: ['section', 'section-both', 'bg-grey']
  -> id: section-moneylock, class: ['section', 'section-both']
  -> id: section-articles, class: ['section', 'section-both', 'bg-grey', 'noprint']
  -> id: section-beforeyouapply, class: ['section', 'section-both']
  -> id: section-bankingservices, class: ['section', 'section-both', 'bg-grey']
  -> id: secti