# Citibank Scraper

In [1]:
import requests
from bs4 import BeautifulSoup
import csv
import time

In [2]:
def extract_tables(soup):
    """Flatten every ``<table>`` found under *soup* into one string.

    Cell texts (``<td>`` and ``<th>``, whitespace-stripped) within a row are
    joined with " | ", rows within a table with " / ", and tables with
    " || ". Rows that contain no cells and tables that end up with no rows
    are left out. Returns an empty string when nothing was collected.
    """
    rendered_tables = []
    for table_tag in soup.find_all("table"):
        # Lazily build the list of cell texts for each row of this table.
        cells_per_row = (
            [cell.get_text(strip=True) for cell in row_tag.find_all(["td", "th"])]
            for row_tag in table_tag.find_all("tr")
        )
        rendered_rows = [" | ".join(cells) for cells in cells_per_row if cells]
        if not rendered_rows:
            continue
        rendered_tables.append(" / ".join(rendered_rows))
    return " || ".join(rendered_tables)

# Step 1: Download and parse the sitemap
sitemap_url = "https://www.citibank.com.sg/sitemap.xml"
# Sent with every request made by this script.
headers = {"User-Agent": "Mozilla/5.0"}
# Bound the wait so an unresponsive server cannot hang the run, and fail
# loudly on an HTTP error instead of parsing an error page as a sitemap
# (which would silently produce zero URLs).
response = requests.get(sitemap_url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.content, "xml")

# Step 2: Extract all URLs from the sitemap
# <loc> values may be padded with whitespace/newlines; strip them so the
# URLs can be requested as-is, and drop entries that are empty afterwards.
urls = [loc_text for loc_text in (loc.text.strip() for loc in soup.find_all("loc")) if loc_text]

# Step 3: Filter personal banking URLs
# A URL is kept when it contains at least one of these path fragments.
personal_banking_keywords = [
    "/personal-banking/",
    "/credit-cards/",
    "/loans/",
    "/investments/",
    "/insurance/",
]
personal_urls = [url for url in urls if any(keyword in url for keyword in personal_banking_keywords)]

print(f"Found {len(personal_urls)} personal banking pages.")

# Step 4: Scrape each personal banking page
# One CSV row is written per URL: either the extracted text fields, or an
# "ERROR" row whose last column carries the exception message.
with open("citibank_personal_banking_data.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["URL", "Title", "H1", "Subheadings", "Paragraphs", "Bullet Points", "Tables"])

    total = len(personal_urls)
    for i, url in enumerate(personal_urls, start=1):
        print(f"[{i}/{total}] Scraping: {url}")
        try:
            page = requests.get(url, headers=headers, timeout=10)
            # Treat 4xx/5xx responses as failures so that error pages are
            # recorded as "ERROR" rows rather than stored as page content.
            page.raise_for_status()
            page_soup = BeautifulSoup(page.text, "html.parser")

            # Extract title (look the tag up once; empty string if absent)
            title_tag = page_soup.find("title")
            title = title_tag.text.strip() if title_tag is not None else ""

            # Extract h1 (first one only; empty string if absent)
            h1_tag = page_soup.find("h1")
            h1 = h1_tag.text.strip() if h1_tag is not None else ""

            # Extract all h2 subheadings
            h2s = [h2.text.strip() for h2 in page_soup.find_all("h2")]
            subheadings = " | ".join(h2s)

            # Extract all non-empty paragraphs
            paragraph_texts = (p.text.strip() for p in page_soup.find_all("p"))
            paragraphs = [text for text in paragraph_texts if text]
            paragraph_text = " | ".join(paragraphs)

            # Extract all non-empty bullet points
            bullet_texts = (li.text.strip() for li in page_soup.find_all("li"))
            bullets = [text for text in bullet_texts if text]
            bullet_text = " | ".join(bullets)

            # Extract tables
            tables = extract_tables(page_soup)

            # Write to CSV
            writer.writerow([url, title, h1, subheadings, paragraph_text, bullet_text, tables])

        except Exception as e:
            # Deliberately broad: this is the per-page boundary of a
            # best-effort crawl, so one bad page must not abort the run.
            print(f"Failed to scrape {url}: {e}")
            writer.writerow([url, "ERROR", "", "", "", "", str(e)])

        # Pause between requests to avoid hammering the server.
        time.sleep(1)

print("Scraping complete. Data saved to 'citibank_personal_banking_data.csv'.")


Found 105 personal banking pages.
[1/105] Scraping: https://www.citibank.com.sg/credit-cards/
[2/105] Scraping: https://www.citibank.com.sg/gcb/loans/home.htm
[3/105] Scraping: https://www.citibank.com.sg/personal-banking/
[4/105] Scraping: https://www.citibank.com.sg/gcb/insurance/creditInsure.htm
[5/105] Scraping: https://www.citibank.com.sg/gcb/insurance/creditinsure_insurance.htm
[6/105] Scraping: https://www.citibank.com.sg/gcb/insurance/health-insurance.htm
[7/105] Scraping: https://www.citibank.com.sg/gcb/insurance/in_ap_cds.htm
[8/105] Scraping: https://www.citibank.com.sg/gcb/insurance/insurance-products.htm
[9/105] Scraping: https://www.citibank.com.sg/gcb/insurance/travel-insurance.htm
[10/105] Scraping: https://www.citibank.com.sg/loans/mortgage/learn-and-help
[11/105] Scraping: https://www.citibank.com.sg/loans/mortgage/manage-mortgage
[12/105] Scraping: https://www.citibank.com.sg/loans/mortgage/mortgage-calculator
[13/105] Scraping: https://www.citibank.com.sg/gcb/loans/