In [9]:
# import required packages
import os
import pandas as pd
import re
import time
# import undetected_chromedriver as the site has anti bot countermeasure
import undetected_chromedriver as uc 
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [22]:
# helper functions
def remove_html_tag(text):
    return BeautifulSoup(text, "lxml").text

def click_by_xpath(driver, xpath):
    WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, xpath))).click()

def get_by_xpath(driver, xpath):
    try:
        return WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
    except:
        return None
    
def hover_and_click_by_xpath(driver, xpath):
    element = get_by_xpath(driver, xpath)
    action = ActionChains(driver)
    action.move_to_element(element).perform()
    time.sleep(1)
    click_by_xpath(driver, xpath)
    
def get_element_text(elements, tag, attrs):
    try:    
        element = elements.find(tag, attrs=attrs);
        if element:
            return element.text.strip()
        return None
    except:
        return None

def get_price(text):
    if text is None:
        return None
    return float(re.sub(r'[^0-9.]', '', text))

def get_square_feet(text):
    match = re.search(r'(\d+) sqft', text)
    if match:
        return int(match.group(1))
    return None

def get_year(text):
    match = re.search(r'(\d{4})$', text)
    if match:
        return match.group(1)
    return 'N/A'
    
def accept_cookies_policy(driver):
    try:
        # accept cookies policy
        click_by_xpath(driver, "//button[normalize-space()='Accept Cookies']")
        time.sleep(3)
    except:
        # catch exception and do nothing
        pass
    
def load_all_projects(driver):
    try:
        while True:
            # find load more button for ready to buy projects
            soup = BeautifulSoup(driver.page_source,'html.parser')
            project_title = soup.find("h3", string='Ready to Buy Projects')
            project_div = project_title.parent
            load_more_button = project_div.find('button', attrs={'class': 'load-more-button'})
            if load_more_button is None:
                # end the load if load more button does not exists
                return            
            # get load more index
            match = re.search('(\d+)', load_more_button.text)
            if match is None:
                # end the loop if load more button contains no integer value
                return
            load_more_index = int(match.group(0))     
            print(f'Load more: {load_more_index}')
            # get load more button element and scroll into view so selenium can click on it
            load_more_xpath = f"//button[normalize-space()='Load More ({load_more_index})']"
            load_more_element = get_by_xpath(driver, load_more_xpath)
            driver.execute_script('arguments[0].scrollIntoView(false)', load_more_element)
            time.sleep(3)
            # click load more button
            click_by_xpath(driver, load_more_xpath)
            time.sleep(3)
    except Exception as e:
        if hasattr(e, 'message'):
            print(f'load_all_projects: {e.message}')
        else:
            print(f'load_all_projects: {e}')

def get_projects(driver, project_list):
    # find all ready to buy projects
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    project_title = soup.find("h3", string='Ready to Buy Projects')
    project_div = project_title.parent
    projects = project_div.find_all("div", attrs={"class": "project-listing-card-root"})
    print(f'Total projects: {len(projects)}')
    for project in projects:
        project_list.extend(get_project_info(project))

def get_project_info(project):
    title = get_element_text(project, "h4", attrs={"class":"project-listing-card__title"})
    address = get_element_text(project, "span", attrs={"class":"project-listing-card__address"})
    launched_in = get_element_text(project, "div", attrs={"class":"project-listing-card__status"})
    labels = project.find_all("div", attrs={"class":"project-listing-card__labels--pill"})
    tenure = labels[0].text.strip()
    type = labels[1].text.strip()
    link_element = project.find("a", attrs={"class":"actionable-link"})
    project_link = link_element["href"]
    image_link = link_element.find("img")['src']
    project_basic_info = {
        'title': title,
        'type': type,
        'tenure': tenure,
        'launched_in': launched_in,
        'address': address,        
        'project_link': project_link,
        'image_link': image_link
    }    
    return get_project_detail(driver, project_basic_info)
    
def get_project_detail(driver, project_info):
    print(f"Getting project detail for: {project_info['title']}")
    project_list = []
    # go to project link
    driver.get(project_info['project_link'])
    show_more_xpath = "//button[normalize-space()='Show more']"
    show_more_button = get_by_xpath(driver, show_more_xpath)
    # check if show more button exists. If exists,
    # click on it to load the complete project description
    if show_more_button:
        driver.execute_script('arguments[0].scrollIntoView(false)', show_more_button)
        time.sleep(1)
        # show more button blocked by sticky div
        # hover over to the element and click
        hover_and_click_by_xpath(driver, show_more_xpath)
    # load page source to BeatifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # get project detail section
    project_div = soup.find("div", attrs={"class":"col-lg-8 col-md-12"})
    description = get_element_text(project_div, "div", attrs={"class":"description"})
    developer_div = project_div.find("div", string='Developer')
    developer = developer_div.next_sibling.text.strip()
    bedrooms_types_nav = project_div.find("nav", attrs={'data-automation-id':'unit-types-navbar'})
    project_info['description'] = description
    project_info['developer'] = developer
    if bedrooms_types_nav:
        bedrooms_types = bedrooms_types_nav.find_all("a")
        for bedrooms_type in bedrooms_types:
            bedrooms_type_xpath = f"//a[normalize-space()='{bedrooms_type.text.strip()}']"
            bedrooms_type_tab = get_by_xpath(driver, bedrooms_type_xpath)
            driver.execute_script('arguments[0].scrollIntoView(false)', bedrooms_type_tab)
            time.sleep(1)
            hover_and_click_by_xpath(driver, bedrooms_type_xpath)
            time.sleep(2)
            # reload page source
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            project_div = soup.find("div", attrs={"class":"col-lg-8 col-md-12"})
            price = get_element_text(project_div, "span", attrs={"class":"price__label"})
            if price is None:
                # skip is price span does not exists
                continue
            project_list.extend(get_unit_type_info(driver, project_info, project_div))
    else:
        project_list.extend(get_unit_type_info(driver, project_info, project_div))
    return project_list

def get_unit_type_info(driver, project_info, project_div):
    project_list = []
    unit_types_div = project_div.find("div", attrs={'class':'property-unit-type-selection-root'})
    unit_types = unit_types_div.find_all("div", attrs={'class':'box'})
    tab_idx = 0
    for unit_type in unit_types:
        if tab_idx > 0:
            # click on tab to load unit type summary for second tab onwards
            unit_type_label = get_element_text(unit_type, "div", attrs={"class":"box__label"})        
            unit_type_xpath = f"//div[normalize-space()='{unit_type_label}']"
            try:
                # try with click unit type tab with full label value
                unit_type_tab = get_by_xpath(driver, unit_type_xpath)
                time.sleep(1)
                hover_and_click_by_xpath(driver, unit_type_xpath)
                time.sleep(2)
            except:
                # try with click unit type tab with partial label value
                unit_type_xpath = f"//div[@class='box__label'][contains(text(),'{unit_type_label}')]"
                unit_type_tab = get_by_xpath(driver, unit_type_xpath)
                time.sleep(1)
                hover_and_click_by_xpath(driver, unit_type_xpath)
                time.sleep(2)
        tab_idx += 1
        # reload page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        project_div = soup.find("div", attrs={"class":"col-lg-8 col-md-12"})
        price = get_element_text(project_div, "span", attrs={"class":"price__label"})
        if price is None:
            # skip is price span does not exists
            continue
        summary_div = project_div.find("div", attrs={"class":"summary"})
        summary_span = summary_div.find_all("span") 
        bedroom = summary_span[0].text.strip()
        bathroom = summary_span[1].text.strip()
        square_feet = get_square_feet(summary_span[2].text.strip())
        price_per_square_feet = get_price(summary_span[3].text.strip())
        # calculate price from square feet and price per square feet
        # as price in span in some projects not showing the correct value
        price = square_feet * price_per_square_feet
        furnishing = get_element_text(project_div, "span", attrs={"class":"furnishing__value"})
        project_list.append({
            'title': project_info['title'],
            'type': project_info['type'],
            'tenure': project_info['tenure'],
            'price': f'{price:.2f}',
            'square_feet': square_feet,
            'price_per_square_feet': price_per_square_feet,
            'bedroom': bedroom,
            'bathroom': bathroom,
            'furnishing': furnishing,
            'launched_in': get_year(project_info['launched_in']),
            'address': project_info['address'],
            'developer': project_info['developer'],
            'description': project_info['description'],
            'project_link': project_info['project_link'],
            'image_link': project_info['image_link'],
        })
    return project_list

In [3]:
# scrape ready to buy projects from property guru web site
url='https://www.propertyguru.com.my/new-property-launch'
driver = uc.Chrome()
driver.get(url)
driver.maximize_window() 

# wait for cookies policy window
time.sleep(3)
accept_cookies_policy(driver)
time.sleep(1)
load_all_projects(driver)
project_list = []
get_projects(driver, project_list)

data_folder = '../data'
# create data folder if not exists
if not os.path.exists(data_folder): 
    os.makedirs(data_folder)

today_date = datetime.now()
data_folder = '../data'
filename = f'property_guru_ready_to_buy_projects_{today_date.strftime("%Y-%m-%d")}.csv'
df = pd.DataFrame(project_list)
df.to_csv(f'{data_folder}/{filename}', sep=',', encoding='utf-8', index=False)

Load more: 87
Load more: 79
Load more: 71
Load more: 63
Load more: 55
Load more: 47
Load more: 39
Load more: 31
Load more: 23
Load more: 15
Load more: 7
Total projects: 54
Getting project detail for: Pinnacle SJ
Getting project detail for: Ponderosa Vista 2-Storey Semi-Detached
Getting project detail for: Ferrous 2
Getting project detail for: BON KIARA
Getting project detail for: Blooming Residence
Getting project detail for: Candella
Getting project detail for: SouthPlace 2 Residences
Getting project detail for: Miranda Hill
Getting project detail for: Sejati Lakeside 2
Getting project detail for: Tiara Sendayan
Getting project detail for: Aderyn
Getting project detail for: The Senai Garden
Getting project detail for: Residensi Dian II
Getting project detail for: Hana Residences
Getting project detail for: Interpoint
Getting project detail for: Sunway Flora Residences
Getting project detail for: Pearl Garden - Panorama Lapangan Mutiara
Getting project detail for: Senadi Hills
Getting 

FileExistsError: [WinError 183] Cannot create a file when that file already exists: '../data'

In [4]:
today_date = datetime.now()
data_folder = '../data'
filename = f'property_guru_ready_to_buy_projects_{today_date.strftime("%Y-%m-%d")}.csv'
df = pd.DataFrame(project_list)
df.to_csv(f'{data_folder}/{filename}', sep=',', encoding='utf-8', index=False)

In [40]:
print(project_list[0]['project_link'])

https://www.propertyguru.com.my/property-listing/project/pinnacle-sj-for-sale-by-pinnacle-homes-sdn-bhd-39482719


In [44]:
#driver.get(project_list[11]['project_link'])
#driver.get('https://www.propertyguru.com.my/property-listing/project/the-senai-garden-for-sale-by-kcc-development-m-sdn-bhd-33698731')
driver.get('https://www.propertyguru.com.my/property-listing/project/sky-trees-commercial-for-sale-by-bukit-indah-johor-sdn-bhd-38861360')
driver.maximize_window() 




In [14]:
project_list

[]

In [None]:
url='https://www.propertyguru.com.my/property-listing/project/southplace-2-residences-for-sale-by-tropicana-metropark-sdn-bhd-35069647'
driver = uc.Chrome()
driver.get(url)
driver.maximize_window() 

# wait for cookies policy window
time.sleep(3)
accept_cookies_policy(driver)
time.sleep(1)

project_basic_info = {
        'title': 'title',
        'type': 'type',
        'tenure': 'tenure',
        'launched_in': 'launched_in',
        'address': 'address',        
        'project_link': 'project_link',
        'image_link': 'image_link',
        'developer': '',
        'description': ''
    }   

project_list = []

show_more_xpath = "//button[normalize-space()='Show more']"
show_more_button = get_by_xpath(driver, show_more_xpath)
# check if show more button exists. If exists,
# click on it to load the complete project description
if show_more_button:
    driver.execute_script('arguments[0].scrollIntoView(false)', show_more_button)
    time.sleep(1)
    # show more button blocked by sticky div
    # hover over to the element and click
    hover_and_click_by_xpath(driver, show_more_xpath)

# load page source to BeatifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
# get project detail section
project_div = soup.find("div", attrs={"class":"col-lg-8 col-md-12"})

description = get_element_text(project_div, "div", attrs={"class":"description"})

developer_div = project_div.find("div", string='Developer')
developer = developer_div.next_sibling.text.strip()

bedrooms_types_nav = project_div.find("nav", attrs={'data-automation-id':'unit-types-navbar'})
if bedrooms_types_nav:
    bedrooms_types = bedrooms_types_nav.find_all("a")
    for bedrooms_type in bedrooms_types:
        bedrooms_type_xpath = f"//a[normalize-space()='{bedrooms_type.text.strip()}']"
        bedrooms_type_tab = get_by_xpath(driver, bedrooms_type_xpath)
        driver.execute_script('arguments[0].scrollIntoView(false)', bedrooms_type_tab)
        time.sleep(1)
        hover_and_click_by_xpath(driver, bedrooms_type_xpath)
        time.sleep(2)
        # reload page source
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        project_div = soup.find("div", attrs={"class":"col-lg-8 col-md-12"})
        price = get_element_text(project_div, "span", attrs={"class":"price__label"})
        if price is None:
            # skip is price span does not exists
            continue
        project_list.extend(get_unit_type_info(driver, project_basic_info, project_div))
else:
    project_list.extend(get_unit_type_info(driver, project_basic_info, project_div))


In [24]:
import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(project_list)

[   {   'address': 'address',
        'bathroom': '1',
        'bedroom': '1',
        'description': '',
        'developer': '',
        'furnishing': 'Partially Furnished',
        'image_link': 'image_link',
        'launched_in': 'N/A',
        'price': '505997.36',
        'price_per_square_feet': 867.92,
        'project_link': 'project_link',
        'square_feet': 583,
        'tenure': 'tenure',
        'title': 'title',
        'type': 'type'},
    {   'address': 'address',
        'bathroom': '2',
        'bedroom': '2',
        'description': '',
        'developer': '',
        'furnishing': 'Partially Furnished',
        'image_link': 'image_link',
        'launched_in': 'N/A',
        'price': '588003.34',
        'price_per_square_feet': 775.73,
        'project_link': 'project_link',
        'square_feet': 758,
        'tenure': 'tenure',
        'title': 'title',
        'type': 'type'},
    {   'address': 'address',
        'bathroom': '2',
        'bedroom': '3',
 