In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options 
import csv
import time
import re
from bs4 import BeautifulSoup
import os
import pandas as pd

In [2]:
# Set Chrome driver options
chrome_options = Options() 

chrome_options.add_argument('--headless=new')  
chrome_options.add_argument('--disable-dev-shm-usage') 
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-position=-2400,-2400')
chrome_options.add_argument('--disable-blink-features=AutomationControlled')

chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option('useAutomationExtension', False)
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36')

# Set ChromeDriver path
service = Service()

# Initialize driver
driver = webdriver.Chrome(service=service, options=chrome_options)

Setting up Chrome driver...
Chrome driver initialized successfully


In [3]:
print("Opening main page to fetch property links...")
# Open main page
driver.get('https://www.99.co/singapore/rent/hdb')
page = driver.page_source
soup = BeautifulSoup(page, "html.parser")
print(soup)

Opening main page to fetch property links...
<html class="__className_d041f4" lang="en"><head><meta charset="utf-8"/><link crossorigin="" href="http://router-web-dev.sg-dev.svc.cluster.local" rel="preconnect"/><link crossorigin="" href="http://router-web-dev.sg-dev.svc.cluster.local" rel="dns-prefetch"/><link crossorigin="" href="http://router-web-prod.sg-prod.svc.cluster.local" rel="preconnect"/><link crossorigin="" href="http://router-web-prod.sg-prod.svc.cluster.local" rel="dns-prefetch"/><link crossorigin="" href="https://pic2.99.co" rel="preconnect"/><link crossorigin="" href="https://pic2.99.co" rel="dns-prefetch"/><link crossorigin="" href="https://spaassets.99.co" rel="preconnect"/><link crossorigin="" href="https://spaassets.99.co" rel="dns-prefetch"/><link crossorigin="" href="htpps://images.prismic.io" rel="preconnect"/><link crossorigin="" href="htpps://images.prismic.io" rel="dns-prefetch"/><link crossorigin="" href="https://img.youtube.com" rel="preconnect"/><link crossor

In [4]:
rent_links = []

In [5]:
csv_filename = 'hdb_links.csv'
if not os.path.exists(csv_filename):
    with open(csv_filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['property_link'])
    print(f"Created new file: {csv_filename}")

def save_links_to_csv(links, filename):
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)
        for link in links:
            writer.writerow([link])
    print(f"Saved {len(links)} links to {filename}")

In [6]:
page_num = 1
while True:
    try:
        print(f"Scraping page {page_num}...")
        time.sleep(10)
        
        page = driver.page_source
        soup = BeautifulSoup(page, "html.parser")
        
        current_page_links = []
        
        links = soup.find_all('a', class_='Heading_heading5__sWyfX CellHeading_headingText__gRv8Q Heading_baseColor__xWzRr')
        for link in links:
            href = link.get('href')
            if href:
                elem = "https://www.99.co" + href
                current_page_links.append(elem)
                rent_links.append(elem)
        
        save_links_to_csv(current_page_links, csv_filename)

        #if len(rent_links)>=6000:
            #break

        if len(rent_links)>=200:
            break
        
        next_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//a[@aria-label='Next page']"))
        )
        
        if 'aria-disabled' in next_button.get_attribute('class'):
            print("Reached the last page")
            break
            
        next_button.click()
        page_num += 1
        
    except TimeoutException:
        print("Timeout waiting for next page button - might be the last page")
        break
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        break

Scraping page 1...
Saved 36 links to hdb_links.csv
Scraping page 2...
Saved 36 links to hdb_links.csv
Scraping page 3...
Saved 36 links to hdb_links.csv
Scraping page 4...
Saved 36 links to hdb_links.csv
Scraping page 5...
Saved 36 links to hdb_links.csv
Scraping page 6...
Saved 36 links to hdb_links.csv


In [7]:
# Read property links from CSV
with open('hdb_links.csv', 'r') as f:
    reader = csv.DictReader(f)
    property_links = [row['property_link'] for row in reader]

print(f"Loaded {len(property_links)} links from CSV")

Loaded 216 links from CSV


In [8]:
columns = ['property_link', 'name', 'beds', 'baths', 'size', 'price_per_sqft', 'floor_level', 
          'furnishing', 'built_year', 'tenure', 'property_type', 'amenities', 'description']
csv_filename = "hdb_details.csv"

if os.path.exists(csv_filename):
    df = pd.read_csv(csv_filename)
    
    for col in columns:
        if col not in df.columns:
            df[col] = ''
            
    df = df.reindex(columns=columns)
    
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"Updated CSV with columns: {', '.join(df.columns)}")
else:
    df = pd.DataFrame(columns=columns)
    
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"Created new file: {csv_filename} with columns: {', '.join(columns)}")

Created new file: hdb_details.csv with columns: property_link, name, beds, baths, size, price_per_sqft, floor_level, furnishing, built_year, tenure, property_type, amenities, description


In [9]:
def extract_property_info(soup, property_link):
    name = ""
    beds = ""
    baths = ""
    size = ""
    pps = ""
    floor = ""
    furn = ""
    year = ""
    ten = ""
    ptype = ""
    description_str = ""
    amenities_str = ""
    room_type = ""
    
    # Room name
    name = soup.find('h1', class_='Heading_heading1__rfsmG Overview_title__t3sng Heading_baseColor__xWzRr')
    if name:
        name = name.get_text().strip()
    else:
        name = ""

    # Room type and size
    room_type = soup.find("div", class_="Overview_items__2Cmab")
    if room_type:
        room_type = room_type.get_text().strip()
        
        elements = [elem.strip() for elem in room_type.split('·')]
        
        # Initialize variables
        size = ""
        beds = ""
        baths = ""
        
        # Process each element
        for i, elem in enumerate(elements):
            if 'sqft' in elem:
                size = elem.split('(')[0].replace('sqft', '').strip()
            
            if 'Bed' in elem:  # This will match both 'Bed' and 'Beds'
                beds = elem.replace('Beds', '').replace('Bed', '').strip()
                
            if 'Bath' in elem:  # This will match both 'Bath' and 'Baths'
                baths = elem.replace('Baths', '').replace('Bath', '').strip()


    # Price per sqft
    target_price = soup.find("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_label__ZTXLo", string="Price/sqft")
    if target_price:
        price_per_sqrt = target_price.find_next("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_text__wDVAb")
        if price_per_sqrt:
            pps_text = price_per_sqrt.get_text().strip()
            pps = float(re.search(r'[\d.]+', pps_text).group()) if re.search(r'[\d.]+', pps_text) else None
    else:
        pps = ""

    # Floor Level
    floor_level = soup.find("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_label__ZTXLo", string="Floor Level")
    if floor_level:
        floor = floor_level.find_next("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_text__wDVAb")
        if floor:
            floor = floor.get_text().strip()
    else:
        floor = ""

    # Furnishing
    furnishing = soup.find("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_label__ZTXLo", string="Furnishing")
    if furnishing:
        furn = furnishing.find_next("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_text__wDVAb")
        if furn:
            furn = furn.get_text().strip()
    else:
        furn = ""

    # Built year
    built_year = soup.find("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_label__ZTXLo", string="Built year")
    if built_year:
        year = built_year.find_next("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_text__wDVAb")
        if year:
            year = year.get_text().strip()
    else:
        year = ""

    # Tenure
    tenure = soup.find("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_label__ZTXLo", string="Tenure")
    if tenure:
        ten = tenure.find_next("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_text__wDVAb")
        if ten:
            ten = ten.get_text().strip()
    else:
        ten = ""

    # Property type
    prop_type = soup.find("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_label__ZTXLo", string="Property type")
    if prop_type:
        ptype = prop_type.find_next("td", class_="KeyValueDescription_cell__cMNpj KeyValueDescription_text__wDVAb")
        if ptype:
            ptype = ptype.get_text().strip()
    else:
        ptype = ""

    # Description
    description = soup.find("h2", class_="Heading_heading4__UnFza components_section__title__MQTjb Heading_baseColor__xWzRr", string="Description")
    if description:
        description_div = description.find_next("div", class_="Description_collapse__Jn4WL")
        if description_div:
            description_str = description_div.get_text().strip()
    else:
        description_str = ""
    
    
    # Amenities
    amenities = soup.find("h2", class_="Heading_heading4__UnFza components_section__title__MQTjb Heading_baseColor__xWzRr", string="Amenities")
    if amenities:
        amenities_divs = soup.find_all("div", class_="Amenities_amenity___j1_u")
        amenities = []
        for div in amenities_divs:
            amenity_text = div.find("p", class_="Body_body1__BxpBP Amenities_amenityLabel__Ke5Rk Body_baseColor__4vUEC")
            if amenity_text:
                amenities.append(amenity_text.get_text().strip())
        
        # Join all amenities with space
        amenities_str = " ".join(amenities)
    else:
        amenities_str = ""
    
    return {
        'property_link': property_link,
        'name': name,
        'beds': beds,
        'baths': baths,
        'size': size,
        'price_per_sqft': pps,
        'floor_level': floor,
        'furnishing': furn,
        'built_year': year,
        'tenure': ten,
        'property_type': ptype,
        'amenities': amenities_str,
        'description': description_str
    }

In [10]:
def get_property_soup(url):
    try:
        print(f"Accessing: {url}")
        driver.get(url)
        time.sleep(2)
        page = driver.page_source
        return BeautifulSoup(page, "html.parser")
    except Exception as e:
        print(f"Error accessing {url}: {str(e)}")
        return None

In [11]:
# Main processing loop
processed_links = set(df['property_link'].tolist()) if not df.empty else set()
print(f"Found {len(processed_links)} already processed links")

unprocessed_links = [link for link in property_links if link not in processed_links]
print(f"Found {len(unprocessed_links)} links to process")

for _, property_link in enumerate(unprocessed_links):
    try:
        soup = get_property_soup(property_link)
        if soup:
            property_info = extract_property_info(soup, property_link)
            if property_info['name'] and property_info['name'].strip():
                df = pd.concat([df, pd.DataFrame([property_info])], ignore_index=True)
                df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
                print(f"Successfully processed: {property_link}")
            else:
                print(f"Skipped {property_link} due to empty name")
            time.sleep(2)
    except Exception as e:
        print(f"Error processing {property_link}: {str(e)}")
        continue

Found 0 already processed links
Found 216 links to process
Accessing: https://www.99.co/singapore/rent/property/116a-rivervale-drive-hdb-g8bgGfFB6xipR2sLcgo7jW


  df = pd.concat([df, pd.DataFrame([property_info])], ignore_index=True)


Successfully processed: https://www.99.co/singapore/rent/property/116a-rivervale-drive-hdb-g8bgGfFB6xipR2sLcgo7jW
Accessing: https://www.99.co/singapore/rent/property/74-bedok-north-road-hdb-gbmzVS7YFWs3LHrZQfZGmU
Successfully processed: https://www.99.co/singapore/rent/property/74-bedok-north-road-hdb-gbmzVS7YFWs3LHrZQfZGmU
Accessing: https://www.99.co/singapore/rooms/property/86-telok-blangah-heights-hdb-ktxgvy9UjE7rjLkhRDSo8Z
Successfully processed: https://www.99.co/singapore/rooms/property/86-telok-blangah-heights-hdb-ktxgvy9UjE7rjLkhRDSo8Z
Accessing: https://www.99.co/singapore/rent/property/409-woodlands-street-41-hdb-mjtkmjFaTYsG6dY8ds6Bzq
Successfully processed: https://www.99.co/singapore/rent/property/409-woodlands-street-41-hdb-mjtkmjFaTYsG6dY8ds6Bzq
Accessing: https://www.99.co/singapore/rooms/property/515-jelapang-road-hdb-aS8LAyAEqUVbdnakJmqp8A
Successfully processed: https://www.99.co/singapore/rooms/property/515-jelapang-road-hdb-aS8LAyAEqUVbdnakJmqp8A
Accessing: https