This notebook scrapes newly listed property data from the __[iProperty.com.my](https://www.iproperty.com.my/new-property)__ website.

<div class="alert alert-block alert-info">
Import required packages
</div>

In [1]:
import os
import pandas as pd
import re
import time
# use undetected_chromedriver because the site has anti-bot countermeasures
import undetected_chromedriver as uc 
from bs4 import BeautifulSoup
from datetime import datetime
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

<div class="alert alert-block alert-info">
General helper functions
</div>

In [2]:
def click_by_xpath(driver, xpath):
    """Click the element matched by ``xpath`` once it becomes clickable.

    Waits up to 10 seconds for the element; any exception raised by the wait
    or by the click itself is propagated to the caller.
    """
    locator = (By.XPATH, xpath)
    wait = WebDriverWait(driver, 10)
    target = wait.until(EC.element_to_be_clickable(locator))
    target.click()

def get_by_xpath(driver, xpath):
    """Return the element matched by ``xpath``, or ``None`` if it cannot be found.

    Waits up to 10 seconds for the element to be present in the DOM. This is a
    best-effort lookup: any ordinary error during the wait yields ``None``.
    """
    try:
        return WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, xpath))
        )
    except Exception:
        # `except Exception` (not a bare `except:`) so that KeyboardInterrupt
        # and SystemExit still propagate while the wait is in progress.
        return None
    
def get_element_text(elements, tag, attrs):
    """Return the stripped text of the first ``tag`` matching ``attrs``.

    ``elements`` is any object exposing ``find(tag, attrs=...)``. Returns
    ``None`` when nothing matches or when the lookup fails with an ordinary
    error (for example ``elements`` itself is ``None``).
    """
    try:
        element = elements.find(tag, attrs=attrs)
        if element is None:
            return None
        return element.text.strip()
    except Exception:
        # Best-effort lookup; a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit, which must be allowed to propagate.
        return None

def get_price(text):
    """Parse a price string such as ``'RM 1,234,567'`` into a float.

    Every character other than digits and ``.`` is removed before conversion.
    Returns ``None`` when ``text`` is ``None`` or when what remains is not a
    valid number (e.g. ``'Contact for price'`` or an empty string).

    NOTE: a text holding several numbers (e.g. a price range) collapses into
    one digit string, because the separators between them are stripped.
    """
    if text is None:
        return None
    digits = re.sub(r'[^0-9.]', '', text)
    try:
        return float(digits)
    except ValueError:
        # nothing numeric left (or a malformed number): treat as "no price"
        return None

<div class="alert alert-block alert-info">
Page content related helper functions
</div>

In [3]:
def get_title(text):
    """Return the part of ``text`` after its first colon, stripped.

    When there is no colon followed by at least one character, ``text`` is
    returned unchanged.
    """
    found = re.search(r':([\w\W]+)$', text)
    return found.group(1).strip() if found else text

def close_policy_window(driver):
    """Dismiss the policy message pop-up if it is shown.

    Clicks the button labelled 'Close Message'. The pop-up is optional, so a
    failure to find or click the button is deliberately ignored.
    """
    try:
        click_by_xpath(driver, "//button[@aria-label='Close Message']")
    except Exception:
        # Pop-up absent or not clickable: nothing to close. `except Exception`
        # keeps KeyboardInterrupt/SystemExit propagating, unlike a bare except.
        pass

def get_next_page(soup):
    """Work out whether the listing has a page after the active one.

    Looks for the first ``div`` with ``data-test-id="pagination-wrapper"``,
    finds its ``li`` with class ``active`` and reads the page number from that
    item's next sibling.

    Returns:
        ``(True, next_page_number)`` when a following page exists, otherwise
        ``(False, None)``. Any error while reading the pagination (for example
        a next sibling whose text is not a number) is printed and also
        reported as ``(False, None)``.
    """
    try:
        page_navigation = soup.find_all('div', attrs={"data-test-id": "pagination-wrapper"})
        if not page_navigation:
            return False, None
        active_page = page_navigation[0].find('li', attrs={"class": "active"})
        if active_page is None or not active_page.next_sibling:
            # no highlighted page, or the active page is the last entry
            return False, None
        return True, int(active_page.next_sibling.text.strip())
    except Exception as e:
        print(e)
        return False, None

def get_project_basic_info(driver, soup, project_list):
    """Collect title and link of every priced project on a listing page.

    Args:
        driver: unused here; kept so existing callers keep working.
        soup: parsed listing page.
        project_list: list that receives one ``{'title', 'link'}`` dict per
            project; it is modified in place.

    Cards without a hyperlink wrapper (advertisements), without a price,
    priced 'Contact for price', or missing a title/link are skipped, so one
    malformed card does not abort the whole scrape.
    """
    base_url = 'https://www.iproperty.com.my'
    project_ul = soup.find("ul", attrs={"data-test-id": "listing-list"})
    if project_ul is None:
        # page has no listing container, so there is nothing to collect
        return
    for project in project_ul.find_all("li"):
        project_div = project.find('div', attrs={"data-test-id": "Hyperlink"})
        if project_div is None:
            # skip advertisement ul element
            continue
        price_div = project_div.find('div', attrs={"class": "listing-price"})
        if price_div is None or price_div.text.strip() == 'Contact for price':
            # skip if no price info
            continue
        # the title is looked up next to the price first, then anywhere in
        # the price's parent element
        price_sibling = price_div.next_sibling
        title_h2 = price_sibling.find('h2') if price_sibling is not None else None
        if title_h2 is None:
            title_h2 = price_div.parent.find('h2')
        link_a = project.find("a", attrs={"class": "depth-listing-card-link"})
        href = link_a.get("href") if link_a is not None else None
        if title_h2 is None or not href:
            # skip cards that lack a title or a detail-page link
            continue
        project_list.append({
            'title': get_title(title_h2.text.strip()),
            'link': f'{base_url}{href}'
        })

def get_projects(driver, project_list):
    """Walk through every listing page and gather the basic project info.

    For each page the current page source is parsed, its projects are appended
    to ``project_list`` (in place), and the pagination item for the following
    page is clicked. Stops when ``get_next_page`` reports no further page.
    """
    while True:
        page_soup = BeautifulSoup(driver.page_source, 'html.parser')
        get_project_basic_info(driver, page_soup, project_list)

        more_pages, page_number = get_next_page(page_soup)
        if not more_pages:
            return

        print(f'Go next page: {page_number}')
        pager_xpath = f"//div//ul//li[normalize-space()='{page_number}']"
        # bring the pagination item into view so that selenium can click it
        pager_item = driver.find_element(By.XPATH, pager_xpath)
        driver.execute_script('arguments[0].scrollIntoView(false)', pager_item)
        time.sleep(3)
        click_by_xpath(driver, pager_xpath)
        # give the next page time to load
        time.sleep(10)

def _next_sibling_text(node):
    """Return the stripped text of ``node.next_sibling``, or ``None`` if the node or sibling is missing."""
    sibling = getattr(node, 'next_sibling', None)
    if sibling is None:
        return None
    return sibling.text.strip()


def _text_at(nodes, index):
    """Return the stripped text of ``nodes[index]``, or ``None`` when the index is out of range."""
    if index < len(nodes):
        return nodes[index].text.strip()
    return None


def get_project_info(soup, title, link):
    """Extract one record per priced unit type from a project detail page.

    Args:
        soup: parsed project detail page.
        title: project title, copied into every record.
        link: project URL, copied into every record.

    Returns:
        A list of dicts with keys ``title``, ``status``, ``type``, ``tenure``,
        ``land_title``, ``price``, ``built_up_size``, ``bedroom``,
        ``bathroom``, ``car_park``, ``address``, ``description`` and ``link``.
        The list is empty when the page has no summary section (e.g. an error
        page) or no floor plan section. Fields whose markup is missing are
        recorded as ``None``; unit types without a price are skipped.
    """
    project_list = []
    summary_div = soup.find('div', attrs={'class': 'property-summary-ppp'})
    if summary_div is None:
        # not a project page (e.g. error page after exhausted retries)
        return project_list
    # status text is read from the first child of the element preceding the summary
    status_children = getattr(summary_div.previous_sibling, 'contents', None)
    status = status_children[0].text.strip() if status_children else None
    address = get_element_text(soup, "span", attrs={"class": "property-address"})
    property_type_span = soup.find("span", string='Property type')
    property_type = _next_sibling_text(getattr(property_type_span, 'parent', None))
    tenure_span = soup.find("span", string='Tenure')
    tenure = _next_sibling_text(getattr(tenure_span, 'parent', None))
    # NOTE(review): this takes the first AttributeItemTitle on the page and
    # assumes it is the land title entry - confirm against the live markup.
    land_title_div = soup.find("div", attrs={"data-test-id": "AttributeItemTitle"})
    land_title = _next_sibling_text(land_title_div)
    description = get_element_text(soup, "pre", attrs={"class": "property-description"})
    floor_plan_h2 = soup.find("h2", string='Floor Plan Information')
    floor_plan_list = getattr(floor_plan_h2, 'next_sibling', None)
    if floor_plan_list is None:
        # skip if no floor plan information
        return project_list
    for unit_type in floor_plan_list.find_all("li"):
        price_text = _next_sibling_text(unit_type.find("span", string='Starting price: '))
        if price_text is None or price_text == "Contact for price":
            # skip if no price info
            continue
        configuration_span = unit_type.find("span", string='Configuration: ')
        configuration_box = getattr(configuration_span, 'next_sibling', None)
        configurations = configuration_box.find_all("span") if configuration_box is not None else []
        built_up_size_span = unit_type.find("span", string='Built-up size: ')
        project_list.append({
            'title': title,
            'status': status,
            'type': property_type,
            'tenure': tenure,
            'land_title': land_title,
            'price': get_price(price_text),
            'built_up_size': _next_sibling_text(built_up_size_span),
            # the values are read from the spans at odd positions (1, 3, 5)
            'bedroom': _text_at(configurations, 1),
            'bathroom': _text_at(configurations, 3),
            'car_park': _text_at(configurations, 5),
            'address': address,
            'description': description,
            'link': link
        })
    return project_list

<div class="alert alert-block alert-info">
Scrape newly listed property data from the iProperty website
</div>

In [4]:
# Scrape the new-property listing, visit every project page and save the
# collected unit records to a dated CSV file under ../data.
url = 'https://www.iproperty.com.my/new-property/'
# number of reloads attempted after the first load hits the error page
max_reloads = 3

projects = []
project_list = []
driver = uc.Chrome()
try:
    driver.maximize_window()
    driver.get(url)

    # wait policy window to show up
    time.sleep(3)
    close_policy_window(driver)
    time.sleep(1)

    # default value for search combo box is All States
    click_by_xpath(driver, "//button[@class='ant-btn ant-btn-primary']")
    time.sleep(3)

    # get link for new launch projects
    get_projects(driver, projects)
    print(f'Total projects: {len(projects)}')

    # get project info for each project
    for project in projects:
        print(f"Getting project info for {project['title']}")
        reload_count = 0
        while True:
            driver.get(project['link'])
            # wait 3 seconds for page to be fully loaded
            time.sleep(3)
            # load page source to BeautifulSoup
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            error_page_body = soup.find('body', attrs={'class': 'errorPage'})
            summary_div = soup.find('div', attrs={'class': 'property-summary-ppp'})
            if error_page_body is None or summary_div is not None:
                # stop reloading: not an error page, or the summary section exists
                break
            reload_count += 1
            if reload_count > max_reloads:
                # give up after the allowed number of reloads
                break
            print(f"Hit error page, retry ({reload_count}) for {project['title']}")
            # wait 10 seconds before reloading the page
            time.sleep(10)
        if summary_div is None:
            # without a summary section there is nothing to parse; move on
            # instead of letting one bad page abort the whole run
            print(f"Failed to get project info for {project['title']}")
            continue
        try:
            project_info = get_project_info(soup, project['title'], project['link'])
        except Exception as e:
            # one page with unexpected markup must not lose all scraped data
            print(f"Failed to parse project info for {project['title']}: {e}")
            continue
        project_list.extend(project_info)
finally:
    # always close the browser, even when scraping fails part-way
    driver.quit()

# create data folder if it does not exist
data_folder = '../data'
os.makedirs(data_folder, exist_ok=True)

# save scraped data into a csv file
today_date = datetime.now()
filename = f'iproperty_new_listed_projects_{today_date.strftime("%Y-%m-%d")}.csv'
df = pd.DataFrame(project_list)
df.to_csv(os.path.join(data_folder, filename), sep=',', encoding='utf-8', index=False)


Go next page: 2
Go next page: 3
Go next page: 4
Go next page: 5
Go next page: 6
Go next page: 7
Go next page: 8
Go next page: 9
Go next page: 10
Go next page: 11
Go next page: 12
Go next page: 13
Total projects: 216
Getting project info for Pangsapuri Saujana Indah, Molek
Getting project info for R Suite Chancery Residences
Getting project info for Amverton Greens
Getting project info for Residensi Dian II
Getting project info for The MINH
Getting project info for JHomes
Getting project info for Summerwoods
Getting project info for MHP 3 Residences
Getting project info for Maya Ara Residences
Getting project info for Triara
Getting project info for Suasana Ainsdale
Getting project info for Casira 3
Getting project info for Nadira 3
Getting project info for Harmoni Permai
Getting project info for Pagoh Special Economic Z
Getting project info for Sarjana Promenade
Getting project info for Elmina Green 7
Getting project info for Ilham Residence 3
Getting project info for The Eighth
Gettin