In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

In [2]:
# Defining Function To Load the Page

def wait_for_page_to_load(driver, wait):
    title = driver.title
    try:
        wait.until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
    except:
        print(f"The webpage {title} did not get fully loaded. \n")
    else:
        print(f"The webpage {title} did get fully loaded. \n")

In [None]:
# Initialize Chrome options
chrome_options = Options()
chrome_options.add_argument("--disable-http2")  
chrome_options.add_argument("--incognito")  
chrome_options.add_argument("--disable-blink-features=AutomationControlled")  
chrome_options.add_argument("--ignore-certificate-errors")  
chrome_options.add_argument("--enable-features=NetworkServiceInProcess")  
chrome_options.add_argument("--disable-features=NetworkService")  
chrome_options.add_argument("start-maximized")  # start full screen

# Set a custom User-Agent (important for scraping)
chrome_options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

# Initializing Driver Object

driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()

# Explicit Wait

wait = WebDriverWait(driver, 5)

# Accessing the target webpage
url = "https://www.99acres.com/"
driver.get(url)
wait_for_page_to_load(driver, wait)

# Identifying Search Bars and Entering the Text
try:
    search_bar = wait.until(
        EC.presence_of_element_located((By.XPATH,'//*[@id="keyword2"]'))
    
    )
except:
    print("Timeout While locating Search bar. \n")
else:
    search_bar.send_keys("Chennai")
    time.sleep(2)

# Selecting Valid Option From List

try:
    valid_options = wait.until(
        EC.element_to_be_clickable((By.XPATH,'//*[@id="0"]'))
    )
except:
    print("Timeout while locating valid search options. \n")
else:
    valid_options.click()
    time.sleep(2)

# Click On Search Button

try:
    search_button = wait.until(
        EC.element_to_be_clickable((By.XPATH,'//*[@id="searchform_search_btn"]'))
    )

except:
    print("Timeout while locating search button. \n")
else:
    search_button.click()
    wait_for_page_to_load(driver, wait)
    time.sleep(3)

# Adjust The Budget Slider

try:
    slider = wait.until(
        EC.element_to_be_clickable((By.XPATH,'//*[@id="budgetLeftFilter_max_node"]'))
    )

except:
    print("Timeout While Clicking On Budget slider circle. \n")

else:
    actions = ActionChains(driver)
    (
        actions.click_and_hold(slider).
        move_by_offset(-73,0).
        release().
        perform()
    )

    time.sleep(2)

# Filter Results to show Geniune Listing
# 1. Verified

verified = wait.until(
    EC.element_to_be_clickable((By.XPATH,'/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[3]/span[2]'))
)
verified.click()
time.sleep(2)

# 2. Ready To Move
ready_to_move = wait.until(
    EC.element_to_be_clickable((By.XPATH,'/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[5]/span[2]'))
)

ready_to_move.click()
time.sleep(2)

# Moving to right side to unhide remaining Filters

while True:
    try:
        filter_right_button = wait.until(
            EC.presence_of_element_located((By.XPATH,"//i[contains(@class,'iconS_Common_24 icon_upArrow cc__rightArrow')]"))
        )
    except:
        print("We have reached the end.")
        break
    else:
        filter_right_button.click()
        time.sleep(2)

# 3. With Photos
with_photos = wait.until(
    EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[6]/span[2]'))
)
with_photos.click()
time.sleep(2)

# 4. With Videos
with_videos = wait.until(
    EC.element_to_be_clickable((By.XPATH, "/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[7]/span[2]"))
)
with_videos.click()
time.sleep(3)

# Navigate Pages and Extract Data
data = []
page_count = 0
while True:
    page_count +=1
    try:
        time.sleep(3)
        # Wait until "Next Page >" button is present in DOM
        wait.until(EC.presence_of_element_located((By.XPATH, "//a[normalize-space()='Next Page >']")))
    except:
        print(f"It has navigated all the {page_count} pages.")
        break
    else:
        # Try clicking with retry to avoid stale element issues

        try:
            next_page_button = wait.until(
                EC.element_to_be_clickable((By.XPATH, "//a[normalize-space()='Next Page >']"))
            )
            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_page_button)
            time.sleep(2)

        # Scraping The Data
            rows = driver.find_elements(By.CLASS_NAME,"tupleNew__contentWrap")
            for row in rows:
                # Property Name
                try:
                    name = row.find_element(By.CLASS_NAME,"tupleNew__headingNrera").text
                except:
                    name= np.nan
                
                # Property Location
                try:
                    location = row.find_element(By.CLASS_NAME,"tupleNew__propType").text
                except:
                    location= np.nan

                # Property Price
                try:
                    price = row.find_element(By.CLASS_NAME,"tupleNew__priceValWrap").text
                except:
                    price= np.nan

                # Property Area And Size
                try:
                    elements = row.find_elements(By.CLASS_NAME,"tupleNew__area1Type")
                except:
                    area, bhk = [np.nan, np.nan]
                else:
                    area, bhk = [ele.text for ele in elements]


                # break
                property = {
                    "name": name,
                    "location": location,
                    "price": price,
                    "area": area,
                    "bhk" : bhk
                }
                data.append(property)

            next_page_button.click()
            # break  # success → exit retry loop
        except StaleElementReferenceException:
            print("Stale element, retrying...")
            time.sleep(2)

# Scraping Data From Last Page
# Scraping The Data
rows = driver.find_elements(By.CLASS_NAME,"tupleNew__contentWrap")
for row in rows:
    # Property Name
    try:
        name = row.find_element(By.CLASS_NAME,"tupleNew__headingNrera").text
    except:
        name= np.nan
                
    # Property Location
    try:
        location = row.find_element(By.CLASS_NAME,"tupleNew__propType").text
    except:
        location= np.nan

    # Property Price
    try:
        price = row.find_element(By.CLASS_NAME,"tupleNew__priceValWrap").text
    except:
        price= np.nan

    # Property Area And Size
    try:
        elements = row.find_elements(By.CLASS_NAME,"tupleNew__area1Type")
    except:
        area, bhk = [np.nan, np.nan]
    else:
        area, bhk = [ele.text for ele in elements]


    # break
    property = {
                    "name": name,
                    "location": location,
                    "price": price,
                    "area": area,
                    "bhk" : bhk
                }
    data.append(property)
    time.sleep(2)
    driver.quit()
    


The webpage India Real Estate Property Site - Buy Sell Rent Properties Portal - 99acres.com did get fully loaded. 

The webpage Property in Chennai - Real Estate in Chennai did get fully loaded. 

We have reached the end.
Stale element, retrying...
Stale element, retrying...
It has navigated all the 46 pages.


In [46]:
# Creating Pandas Dataframe

df = pd.DataFrame(data)
df = df.drop_duplicates()
df = df.apply(lambda col: col.str.strip().str.lower() if col.dtype == "object" else col)
df["has_rating"] = df["name"].str.contains("\n").astype(int)
# Cleaning Name Column
df["name"] = df["name"].str.replace("\n[0-9.]+","",regex = True).str.strip()
# Cleaning Location Column
df["location"] = df["location"].str.replace("chennai","").str.strip()
df["location"] = df["location"].str.replace(",$","",regex=True).str.strip()
df["location"] = df["location"].str.split("in").str[-1].str.strip()
# Cleaning Price Column
df["price"] = df["price"].str.replace("₹","")
df["price"] = df["price"].apply(
    lambda val: float(val.replace("lac", "").strip())
    if isinstance(val, str) and "lac" in val.lower()
    else (
        float(val.replace("cr", "").strip()) * 100
        if isinstance(val, str) and "cr" in val.lower()
        else np.nan
    )
)
# Cleaning Area Column
df["area"] = df["area"].str.replace("sqft","").str.replace(",","").str.strip().astype(int)
# Cleaning BHK column
df["bhk"] = df["bhk"].str.replace("bhk","").str.strip().astype(int)
# Renaming Columns
df.rename(columns={"price":"price_in_lakhs",
                   "area":"area_in_sqft"},inplace=True)
#Reset Index
df.reset_index(drop = True,inplace=True)
df.head()

Unnamed: 0,name,location,price_in_lakhs,area_in_sqft,bhk,has_rating
0,happy homes,kolathur,85.83,1162,3,0
1,snd sai swaraj,velachery,105.0,1400,3,0
2,prince courtyard,egmore,443.0,2110,3,0
3,s and p the address,"mambakkam, vandalur kelambakkam road",220.0,1500,3,0
4,panayur,ganallur,150.0,2250,3,0


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1146 entries, 0 to 1145
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            1146 non-null   object 
 1   location        1146 non-null   object 
 2   price_in_lakhs  1145 non-null   float64
 3   area_in_sqft    1146 non-null   int64  
 4   bhk             1146 non-null   int64  
 5   has_rating      1146 non-null   int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 53.8+ KB


In [51]:
# Exporting To Excel

df.to_csv("Data/chennai_properties-99acres.csv",index = False)