# This is to scrape the HDB application rates data from various sources

In [23]:
# let's import some libraries

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service


In [24]:
# Correct path to the ChromeDriver executable
service = Service('./chromedriver-win64/chromedriver.exe')  # Update the path if needed
driver = webdriver.Chrome(service=service)

In [25]:
# Open the Python website
driver.get("https://www.propertyguru.com.sg/property-guides/bto-application-rate-which-hdb-estate-easiest-62148")

In [26]:
element = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div[1]/div[4]/div[1]/article/div[1]/figure[4]/div')

In [28]:
text_body = element.text

In [20]:
driver.quit()

In [30]:
text_body.split('\n')
# Split the text into lines

['Project(s) No. of units (3-, 4- and 5-Room) No. of applicants Avg. application rate',
 'Canberra Vista (Sembawang, Non-mature) 3-room: 124',
 '',
 '4-room: 385',
 '',
 '5-room/ 3Gen: 266',
 '',
 '',
 '',
 'Total rooms:775',
 '3-room: 847',
 '4-room: 3,048',
 '5-room/ 3Gen: 3,738',
 '= 7,633',
 '3-room: 6.8',
 '4-room: 7.9',
 '5-room/ 3Gen: 14.1',
 '= 9.8',
 'Toa Payoh Ridge / Kim Keat Ripples (Toa Payoh, Mature)',
 '3-room: 102',
 '4-room: 1,211',
 '= 1,313',
 '3-room: 935',
 '4-room: 11,684',
 '= 12,619',
 '3-room: 9.2',
 '4-room: 9.6',
 '= 9.6']

In [69]:
def extract_data(text):
    """
    This function extracts the number of rooms from the given text.
    Args:
        text (str): The input text containing information about rooms.
    Returns:
        int: The number of rooms extracted from the text. And the number of applications.
    If no valid room number is found, it returns 0.
    """
    # Split the text into words
    words = text.split()
    num_rm = ''

    for word in words:
        # Check if the word contains '-room' and extract the number
        if '-room' in word:
            try:
                num_rm = (word.split('-')[0].strip())
            except ValueError:
                # If the number cannot be converted, skip this word
                continue
        
        if num_rm.isdigit() and word.isdigit():
            # Return the number of rooms as an integer
            return int(num_rm), int(word)

    # Return 0 if no valid room number is found
    return 0,0

def extract_relevant(text_body_ls):

    project_dict = {}
    project_details = retrieve_project_details(text_body_ls)

    for i in text_body_ls:
        if i == '':
            continue
        num_rm, number = extract_data(i)
        if num_rm not in project_dict:
            project_dict[num_rm] = [number]
        elif len(project_dict[num_rm]) > 2:
            return project_details,project_dict
        else:
            # Append the number to the list for the corresponding number of rooms
            project_dict[num_rm].append(number)



(3, 12619)

In [46]:
def separate_by_mature(text_body_ls):
    """
    Separates items in the list into groups based on occurrences of 'Mature'.
    Args:
        text_body_ls (list): List of strings to process.
    Returns:
        list: A list of groups, where each group is a list of items between 'Mature' occurrences.
    """
    groups = []
    current_group = []

    for item in text_body_ls:
        # Check if the item contains 'Mature'
        if 'Mature' in item or 'Non-mature' in item:
            # If there's an existing group, add it to the groups list
            if current_group:
                groups.append(current_group)
                current_group = []
        # Add the current item to the current group
        current_group.append(item)

    # Add the last group if it exists
    if current_group:
        groups.append(current_group)

    return groups


result = separate_by_mature(text_body.split('\n'))
for group in result:
    print(group)

def retrieve_project_details(text_ls):

    for i in text_ls:
        if 'Mature' in i or 'Non-mature' in i:
            name = i.split('(')[0].strip()
            details = i.split('(')[1].split(')')[0].strip()
            details = details.split(',')
            return name, details

retrieve_project_details()

['Project(s) No. of units (3-, 4- and 5-Room) No. of applicants Avg. application rate']
['Canberra Vista (Sembawang, Non-mature) 3-room: 124', '', '4-room: 385', '', '5-room/ 3Gen: 266', '', '', '', 'Total rooms:775', '3-room: 847', '4-room: 3,048', '5-room/ 3Gen: 3,738', '= 7,633', '3-room: 6.8', '4-room: 7.9', '5-room/ 3Gen: 14.1', '= 9.8']
['Toa Payoh Ridge / Kim Keat Ripples (Toa Payoh, Mature)', '3-room: 102', '4-room: 1,211', '= 1,313', '3-room: 935', '4-room: 11,684', '= 12,619', '3-room: 9.2', '4-room: 9.6', '= 9.6']


TypeError: retrieve_project_details() missing 1 required positional argument: 'text_ls'

In [42]:
result = separate_by_mature(text_body.split('\n'))

In [56]:
result[1]

['Canberra Vista (Sembawang, Non-mature) 3-room: 124',
 '',
 '4-room: 385',
 '',
 '5-room/ 3Gen: 266',
 '',
 '',
 '',
 'Total rooms:775',
 '3-room: 847',
 '4-room: 3,048',
 '5-room/ 3Gen: 3,738',
 '= 7,633',
 '3-room: 6.8',
 '4-room: 7.9',
 '5-room/ 3Gen: 14.1',
 '= 9.8']

In [None]:


extract_data(result[1][0])

(3, 124)

In [60]:
result[1][0].split()

['Canberra', 'Vista', '(Sembawang,', 'Non-mature)', '3-room:', '124']

In [70]:
extract_relevant(result[1])

(('Canberra Vista', ['Sembawang', ' Non-mature']),
 {3: [124, 847], 4: [385], 5: [266], 0: [0, 0, 0]})