In [1]:
# import required packages

import pandas as pd
import re
import time
# import undetected_chromedriver as the site has anti bot countermeasure
import undetected_chromedriver as uc 
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


In [27]:
# helper functions

def click_by_xpath(driver, xpath):
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

def accept_cookies_policy(driver):
    try:
        # accept cookies policy
        click_by_xpath(driver, "//button[normalize-space()='Accept Cookies']")
        time.sleep(3)
    except:
        # catch exception and do nothing
        pass

def load_all_projects(driver):
    try:
        while True:
            # find load more button for ready to buy projects
            soup = BeautifulSoup(driver.page_source,'html.parser')
            project_title = soup.find("h3", string='Ready to Buy Projects')
            project_div = project_title.parent
            load_more_button = project_div.find('button', attrs={'class': 'load-more-button'})
            if load_more_button is None:
                # end the load if load more button does not exists
                return
            
            # get load more index
            match = re.search('(\d+)', load_more_button.text)
            if match is None:
                # end the loop if load more button contains no integer value
                return
            load_more_index = int(match.group(0))     
            print(f'Load more: {load_more_index}')

            # get load more button element and scroll into view so selenium can click on it
            load_more_xpath = f"//button[normalize-space()='Load More ({load_more_index})']"
            load_more_element = driver.find_element(By.XPATH, load_more_xpath)
            driver.execute_script('arguments[0].scrollIntoView(false)', load_more_element)
            time.sleep(3)
            # click load more button
            click_by_xpath(driver, load_more_xpath)
            time.sleep(3)
    except Exception as e:
        if hasattr(e, 'message'):
            print(f'load_all_projects: {e.message}')
        else:
            print(f'load_all_projects: {e}')
            
def get_price(price):
    if price is None:
        return 'N/A'
    match = re.search(r'\D*(\d+.\d+)', price)
    if match:
        price = float(re.sub(r'[^0-9.]', '', match.group(0)))
        return f'{price:.2f}'
    return 'N/A'

def get_element_text(elements, tag, attrs):
    try:    
        element = elements.find(tag, attrs=attrs);
        if element:
            return element.text.strip()
        return None
    except:
        return None





In [5]:
# scrape ready to buy projects from property guru web site
url='https://www.propertyguru.com.my/new-property-launch'
driver = uc.Chrome()
driver.get(url)
driver.maximize_window() 

# wait for cookies policy window
time.sleep(3)
accept_cookies_policy(driver)

load_all_projects(driver)


Load more: 87
Load more: 79
Load more: 71
Load more: 63
Load more: 55
Load more: 47
Load more: 39
Load more: 31
Load more: 23
Load more: 15
Load more: 7


In [6]:
html_content = driver.page_source

In [59]:
def get_project_basic(project):
    title = get_element_text(project, "h4", attrs={"class":"project-listing-card__title"})
    address = get_element_text(project, "span", attrs={"class":"project-listing-card__address"})
    link_element = project.find("a", attrs={"class":"actionable-link"})
    project_link = link_element['href']
    image_link = link_element.find("img")['src']
    
    return {
        'title': title,
        'Tenure'
        'address': address,
        'project_link': project_link,
        'image_link': image_link
    }
    
def get_projects(driver, project_list):
    # find all ready to buy projects
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    project_title = soup.find("h3", string='Ready to Buy Projects')
    project_div = project_title.parent
    projects = project_div.find_all('div', attrs={"class": "project-listing-card-root"})
    print(f'Total projects: {len(projects)}')
    for project in projects:
        project_list.append(get_project_basic(project))


In [52]:
project_list = []
get_projects(driver, project_list)

Total projects: 50


In [53]:
import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(project_list)

[   {   'address': 'Jalan Ss 16/1, Subang Jaya, Mukim Damansara, Subang Jaya, '
                   'Selangor',
        'image_link': 'https://my1-cdn.pgimgs.com/project-listing-project/39482719/PLPHO.227248119.V550/Pinnacle-SJ-Subang-Jaya-Malaysia.jpg',
        'project_link': 'https://www.propertyguru.com.my/property-listing/project/pinnacle-sj-for-sale-by-pinnacle-homes-sdn-bhd-39482719',
        'title': 'Pinnacle SJ'},
    {   'address': 'Taman Ponderosa, Johor Bahru, Johor',
        'image_link': 'https://my1-cdn.pgimgs.com/project-listing-project/38285089/PLPHO.215481037.V550/Ponderosa-Vista-2-Storey-Semi-Detached-Johor-Bahru-Malaysia.jpg',
        'project_link': 'https://www.propertyguru.com.my/property-listing/project/ponderosa-vista-2-storey-semi-detached-for-sale-by-pandan-baru-sdn-bhd-38285089',
        'title': 'Ponderosa Vista 2-Storey Semi-Detached'},
    {   'address': 'Setia AlamImpian, Shah Alam, Selangor',
        'image_link': 'https://my1-cdn.pgimgs.com/project-lis

In [40]:
print(project_list[0]['project_link'])

https://www.propertyguru.com.my/property-listing/project/pinnacle-sj-for-sale-by-pinnacle-homes-sdn-bhd-39482719


In [58]:
driver.get(project_list[10]['project_link'])
driver.maximize_window() 

show_more_xpath = "//button[normalize-space()='Show more']"
show_more_element = driver.find_element(By.XPATH, show_more_xpath)
driver.execute_script('arguments[0].scrollIntoView(false)', show_more_element)
time.sleep(5)
click_by_xpath(driver, show_more_xpath)


ElementClickInterceptedException: Message: element click intercepted: Element is not clickable at point (444, 1006)
  (Session info: chrome=121.0.6167.86)
Stacktrace:
	GetHandleVerifier [0x00971673+52979]
	(No symbol) [0x008F7961]
	(No symbol) [0x007DDD3D]
	(No symbol) [0x0081B79B]
	(No symbol) [0x00819EB7]
	(No symbol) [0x00817F1B]
	(No symbol) [0x008170BD]
	(No symbol) [0x0080E010]
	(No symbol) [0x008344EC]
	(No symbol) [0x0080DA4E]
	(No symbol) [0x00834784]
	(No symbol) [0x0084B3FE]
	(No symbol) [0x00834286]
	(No symbol) [0x0080C063]
	(No symbol) [0x0080CECD]
	GetHandleVerifier [0x00C88D83+3294723]
	GetHandleVerifier [0x00CC6CC2+3548482]
	GetHandleVerifier [0x00CC1C9C+3527964]
	GetHandleVerifier [0x00A0870E+671630]
	(No symbol) [0x00901EB4]
	(No symbol) [0x008FD808]
	(No symbol) [0x008FD92D]
	(No symbol) [0x008EF7E0]
	BaseThreadInitThunk [0x76007BA9+25]
	RtlInitializeExceptionChain [0x772DBD2B+107]
	RtlClearBits [0x772DBCAF+191]


In [47]:
soup = BeautifulSoup(driver.page_source, 'html.parser')
project_info = soup.find("div", attrs={"class":"col-lg-8 col-md-12"})

title = get_element_text(project, "h1", attrs={"class":"title"})

In [54]:
project_list[10]

{'title': 'The Senai Garden',
 'address': 'Jalan Impian Senai Utama, Taman Impian Senai,, Senai, Kulai, Johor',
 'project_link': 'https://www.propertyguru.com.my/property-listing/project/the-senai-garden-for-sale-by-kcc-development-m-sdn-bhd-33698731',
 'image_link': 'https://sg1-cdn.pgimgs.com/projectnet-project/2373/ZPPHO.132170192.R550X550/The-Senai-Garden-Kulai-Malaysia.jpg'}