In [1]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc

In [2]:
# ----- SCRAPING THE DATA -----
def wait_for_page_to_load(driver, wait):
	title = driver.title
	try:
		wait.until(
			lambda d: d.execute_script("return document.readyState") == "complete"
		)
	except:
		print(f"The webpage \"{title}\" did not get fully laoded.\n")
	else:
		print(f"The webpage \"{title}\" did get fully laoded.\n")
  

# This code does not detect captchas
options = uc.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-blink-features=AutomationControlled")
options.add_argument("--incognito")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--enable-features=NetworkServiceInProcess")
options.add_argument("--disable-features=NetworkService")


In [None]:
driver = uc.Chrome(options=options,use_subprocess=True)
driver.maximize_window()

# explicit wait
wait = WebDriverWait(driver, 5)

# accessing the target webpage
url = "https://www.99acres.com/"
driver.get(url)
wait_for_page_to_load(driver, wait)

# identify and enter text into search bar
try:
    search_bar = wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="keyword2"]')))

except:
    print("Timeout Exception: The search bar is not found.\n")
    
else:
    search_bar.clear()
    search_bar.send_keys("Chennai")
    
# selecting valid option from list
try:
    vaild_option =wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="0"]')))

except:
    print("Timeout Exception: The valid option is not found.\n")

else:
    vaild_option.click()
    
# click on Search button
try:
    search_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform_search_btn"]')))

except:
    print("Timeout Exception: The Search button is not found.\n")
    
else:
    search_button.click()
    wait_for_page_to_load(driver, wait)


# Adjust the Budget Slider
try:
    slider = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="budgetLeftFilter_max_node"]')))

except:
    print("Timeout Exception: The Budget Slider is not found.\n")

else:
    actions = ActionChains(driver)
    (
        actions
        .click_and_hold(slider) # Click and hold the slider
        .move_by_offset(-73, 0) # Adjust this value as needed
        .release()              # Release the slider
        .perform()
    ) 
    time.sleep(1) 


# filter results to show genuine listings
# 1. Verified
verified = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[3]'))
                      )
verified.click()
time.sleep(1)

# 2. Ready To Move
ready_to_move = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="app"]/div/div/div[4]/div[3]/div[1]/div[3]/section/div/div/div/div/div[1]/div/div[5]'))
                           )
ready_to_move.click()  
time.sleep(1)


# moving to the right side to unhide remaining filters
while True:
    try:
        filter_right_button = wait.until(EC.presence_of_element_located((
            By.XPATH, "//i[@class='iconS_Common_24 icon_upArrow cc__rightArrow']"
        )))
    except:
        print("Timeout Exception: The right arrow button is not found.\n")
        break
    else:
        filter_right_button.click()
           

# 3. With Photos
with_photos = wait.until(EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[6]/span[2]')))
with_photos.click() 
time.sleep(1)

# 4. With Videos
with_videos = wait.until(EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[7]/span[2]')))
with_videos.click()
time.sleep(1)

# navigate pages and extract data
data = []
page_count = 0
while True:
	page_count += 1
	try:
		time.sleep(2)
		next_page_button = driver.find_element(By.XPATH, "//a[normalize-space()='Next Page >']")
	except:
		print(f"Timeout because we have navigated all the {page_count} pages.\n")
		break
	else:
		try:
			driver.execute_script("window.scrollBy(0, arguments[0].getBoundingClientRect().top - 100);", next_page_button)
			time.sleep(1)
	
			# Scrape the data
			rows = driver.find_elements(By.CLASS_NAME, "tupleNew__contentWrap")
			for row in rows:
				# Property Name
				try:
					name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
				except:
					name = 	np.nan
					print("Name not found")

				# Property location
				try:
					location = row.find_element(By.CSS_SELECTOR, ".tupleNew__tupleHeadingTopaz, .tupleNew__tupleHeading").text
				except:
					location = np.nan
					print("Location not found")
				
				# Property price
				try:
					price = row.find_element(By.CLASS_NAME, "tupleNew__priceValWrap").text
				except:
					price = np.nan
					print("Price not found")

				# property area and bhk
				try:
					elements = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
				except:
					area, bhk =[np.nan, np.nan]
					print("Area and BHK not found")
				else:
					area, bhk = [ele.text for ele in elements]
				
				# store data in dictionary			
				property = {
						"name": name,
						"location": location,
						"price": price,
						"area": area,
						"bhk": bhk	
					}
				# append dictionary to data list
				data.append(property)	
	
			# click on Next Page button
			wait.until(
				EC.element_to_be_clickable((By.XPATH, "//a[normalize-space()='Next Page >']"))
			).click()
			time.sleep(3)
		except:
			print("Timeout while clicking on \"Next Page\".\n")
# Scrape the data
rows = driver.find_elements(By.CLASS_NAME, "tupleNew__contentWrap")
for row in rows:
	# Property Name
	try:
		name = row.find_element(By.CLASS_NAME, "tupleNew__headingNrera").text
	except:
		name = 	np.nan
		print("Name not found")

	# Property location
	try:
		location = row.find_element(By.CSS_SELECTOR, ".tupleNew__tupleHeadingTopaz, .tupleNew__tupleHeading, .tupleNew__tupleHeadingPlat").text
	except:
		location = np.nan
		print("Location not found")
	
	# Property price
	try:
		price = row.find_element(By.CLASS_NAME, "tupleNew__priceValWrap").text
	except:
		price = np.nan
		print("Price not found")

	# property area and bhk
	try:
		elements = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
	except:
		area, bhk =[np.nan, np.nan]
		print("Area and BHK not found")
	else:
		area, bhk = [ele.text for ele in elements]
	
	# store data in dictionary			
	property = {
			"name": name,
			"location": location,
			"price": price,
			"area": area,
			"bhk": bhk	
		}
 
	# append dictionary to data list
	data.append(property)
  
time.sleep(1)   
driver.quit()

The webpage "India Real Estate Property Site - Buy Sell Rent Properties Portal - 99acres.com" did get fully laoded.

The webpage "Property in Chennai - Real Estate in Chennai" did get fully laoded.

Timeout Exception: The right arrow button is not found.

Timeout while clicking on "Next Page".

Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Location not found
Timeout because we have navigated all the 38 pages.



In [128]:
df = pd.DataFrame(data)

In [129]:
df

Unnamed: 0,name,location,price,area,bhk
0,Navganesh eden,"3 BHK Flat in Kolathur, Chennai",₹72.04 Lac,"1,181 sqft",3 BHK
1,Shree Varu Homes Gruham,"3 BHK Flat in Vadapalani, Chennai South",₹1.59 Cr,"1,328 sqft",3 BHK
2,Prince Courtyard,"3 BHK Flat in Egmore, Chennai",₹4.43 Cr,"2,110 sqft",3 BHK
3,S and P The Address,"3 Bedroom House in Mambakkam, Vandalur Kelamba...",₹2.2 Cr,"1,500 sqft",3 BHK
4,"MKB Nagar, Vyasarpadi, Chennai, Chennai North","4 Bedroom House in MKB Nagar, Vyasarpadi, Chennai",₹1.75 Cr,"1,600 sqft",4 BHK
...,...,...,...,...,...
1093,Excel vijay jagat,"2 BHK Flat in Thoraipakkam, Chennai",₹55 Lac,820 sqft,2 BHK
1094,Shri Vana Durga apartment,"2 BHK Flat in Madambakkam, Chennai",₹60 Lac,"1,241 sqft",2 BHK
1095,Shri Vana Durga Apartment,"1 BHK Flat in Madambakkam, Chennai",₹28 Lac,617 sqft,1 BHK
1096,s c giri homes,"3 BHK Flat in Korattur, Chennai",₹47 Lac,"1,000 sqft",3 BHK


In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098 entries, 0 to 1097
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   name      1098 non-null   object
 1   location  1084 non-null   object
 2   price     1098 non-null   object
 3   area      1098 non-null   object
 4   bhk       1098 non-null   object
dtypes: object(5)
memory usage: 43.0+ KB


In [136]:
df['price'].str.lower().replace("₹", "", regex=True).str.strip().isna().sum()



np.int64(0)

In [179]:
df_properties = (
    pd
    .DataFrame(data)
    .drop_duplicates()
    .apply(lambda col: col.str.strip().str.lower() if col.dtype == "object" else col)
    .assign(
        starred = lambda df_: df_['name'].str.extract(r'\n([\d.]+)')[0].fillna(0).astype(float),
        name = lambda df_: (
            df_['name']
            .str.replace("\n[0-9.]+", "", regex=True)
            .str.strip()
            .replace("adroit district s", "adroit district's")
        ),
        location = lambda df_: (
            df_['location']
            .str.replace("chennai", "")
            .str.strip()
            .str.replace(",$", "", regex=True)
            .str.split("in")
            .str[-1]
            .str.strip()
        ),
        price = lambda df_: (
            df_['price']
            .loc[df_['price'].str.strip().str.lower() != 'price on request']
            .str.replace("₹", "")
            .str.strip()
            .apply(lambda val: float(val.replace("lac", "").strip()) if "lac" in val else float(val.replace("cr", "").strip()) * 100)
        ),
        area = lambda df_: (
            df_['area']
            .str.replace("sqft", "", regex=True)
            .str.strip()
			.str.replace(",", "")
            .pipe(lambda ser: pd.to_numeric(ser))
        ),
        bhk=lambda df_: (
			df_['bhk']
			.str.replace("bhk", "")
			.str.strip()
			.pipe(lambda ser: pd.to_numeric(ser))
		)
    )
    .rename(columns={
		"price": "price (in lacs)",
		"area": "area (in sqft)"
	})
    .reset_index(drop=True)
    .to_csv("chennai-properties-99acres.csv", index=False)
    
)


In [185]:
df_properties = pd.read_csv("chennai-properties-99acres.csv")
df_properties.head()

Unnamed: 0,name,location,price (in lacs),area (in sqft),bhk,starred
0,navganesh eden,kolathur,72.04,1181,3,0.0
1,shree varu homes gruham,"vadapalani, south",159.0,1328,3,0.0
2,prince courtyard,egmore,443.0,2110,3,0.0
3,s and p the address,"mambakkam, vandalur kelambakkam road",220.0,1500,3,0.0
4,"mkb nagar, vyasarpadi, chennai, chennai north","mkb nagar, vyasarpadi",175.0,1600,4,0.0


np.int64(1)