In [4]:
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [5]:
# ----- SCRAPING THE DATA -----

def wait_for_page_to_load(driver, wait):
	"""Wait until the page reports ``document.readyState == "complete"``.

	The page title is read once up front and used in the status message that
	is printed afterwards; nothing is returned.

	Parameters
	----------
	driver
		Object exposing ``title``; it is only read here.
	wait
		Object exposing ``until(condition)``. The condition passed in calls
		``execute_script("return document.readyState")`` on the argument it
		receives.

	Returns
	-------
	None
		The outcome is reported on stdout only.
	"""
	title = driver.title
	try:
		wait.until(
			lambda d: d.execute_script("return document.readyState") == "complete"
		)
	except Exception:
		# `Exception` rather than a bare `except:` so that KeyboardInterrupt and
		# SystemExit still stop the notebook instead of being reported as a
		# page-load failure.
		print(f"The webpage \"{title}\" did not get fully loaded.\n")
	else:
		print(f"The webpage \"{title}\" did get fully loaded.\n")

In [6]:
# Chrome launch arguments, added in the order listed
chrome_options = Options()
for launch_argument in (
    "--disable-http2",
    "--incognito",
    "--disable-blink-features=AutomationControlled",
    "--ignore-certificate-errors",
    "--enable-features=NetworkServiceInProcess",
    "--disable-features=NetworkService",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
):
    chrome_options.add_argument(launch_argument)

# start the browser with those options and maximise its window
driver = webdriver.Chrome(options=chrome_options)
driver.maximize_window()

# shared explicit wait, constructed with a timeout argument of 5
wait = WebDriverWait(driver, 5)

# open the site that is going to be scraped
url = "https://www.99acres.com/"
driver.get(url)

In [7]:
# Each step below waits for its element and prints a message if the wait
# times out. Only TimeoutException is caught: a bare `except:` would also
# swallow Ctrl-C and unrelated driver errors and report them as timeouts.

# identify and enter text into search bar
try:
	search_bar = wait.until(
		EC.presence_of_element_located((By.XPATH, '//*[@id="keyword2"]'))
	)
except TimeoutException:
	print("Timeout while locating Search Bar.\n")
else:
	search_bar.send_keys("Kolkata")
	# fixed pause after typing, before looking for the option with id "0"
	time.sleep(5)

# selecting valid option from list
try:
	valid_option = wait.until(
		EC.element_to_be_clickable((By.XPATH, '//*[@id="0"]'))
	)
except TimeoutException:
	print("Timeout while locating valid search option.\n")
else:
	valid_option.click()
	time.sleep(2)

# click on Search button
try:
	search_button = wait.until(
		EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform_search_btn"]'))
	)
except TimeoutException:
	print("Timeout while clicking on \"Search\" button.\n")
else:
	search_button.click()
	wait_for_page_to_load(driver, wait)

The webpage "Property in Kolkata - Real Estate in Kolkata" did get fully laoded.



In [None]:
# numpy is already imported in the notebook's first cell, so the duplicate
# import that used to sit here has been dropped.

# adjust the Budget slider
try:
	slider = wait.until(
		EC.element_to_be_clickable((By.XPATH, '//*[@id="budgetLeftFilter_max_node"]'))
	)
except TimeoutException:
	print("Timeout while clicking on Budget slider circle.\n")
else:
	# press the slider handle, drag it by an x-offset of -73 (y unchanged), release
	actions = ActionChains(driver)
	(
		actions
		.click_and_hold(slider)
		.move_by_offset(-73, 0)
		.release()
		.perform()
	)
	time.sleep(2)

# filter results to show genuine listings
# 1. Verified
verified = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[3]/span[2]'))
)
verified.click()
time.sleep(1)

# 2. Ready To Move
# NOTE(review): this XPath is identical to the one used for "Verified" above.
# Unless the filter strip re-orders itself after the first click, this clicks
# the same element a second time instead of selecting "Ready To Move" --
# confirm against the live page and correct the XPath if needed.
ready_to_move = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[1]/div[3]/span[2]'))
)
ready_to_move.click()
time.sleep(1)


# moving to the right side to unhide remaining filters
# Keep clicking the right-arrow for as long as the wait can still locate it.
# Only a timeout ends the loop; with a bare `except:` a Ctrl-C would have been
# swallowed here and treated as "all filters uncovered".
while True:
	try:
		filter_right_button = wait.until(
			EC.presence_of_element_located((By.XPATH, "/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/i[1]"))
		)
	except TimeoutException:
		print("Timeout because we have uncovered all filters.\n")
		break
	else:
		filter_right_button.click()
		time.sleep(1)


# 3. With Photos
with_photos = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[6]/span[2]'))
)
with_photos.click()
time.sleep(2)

# 4. With Videos
with_videos = wait.until(
	EC.element_to_be_clickable((By.XPATH, '/html[1]/body[1]/div[1]/div[1]/div[1]/div[4]/div[3]/div[1]/div[3]/section[1]/div[1]/div[1]/div[1]/div[1]/div[2]/div[1]/div[7]/span[2]'))
)
with_videos.click()
time.sleep(4)

# navigate pages and extract data

# CSS selector matching one listing card (two class variants are used)
ROW_SELECTOR = ".tupleNew__contentWrap, .PseudoTupleRevamp__contentWrapAb"
# XPath of the pagination link that leads to the following page
NEXT_PAGE_XPATH = "//a[normalize-space()='Next Page >']"


def _text_or_nan(row, css_selector):
    """Return the text of the first element in ``row`` matching ``css_selector``.

    Returns ``np.nan`` when the lookup raises a ``WebDriverException``
    (for example because no element matches).
    """
    try:
        return row.find_element(By.CSS_SELECTOR, css_selector).text
    except WebDriverException:
        return np.nan


def _area_and_bhk(row):
    """Return ``(area, bhk)`` text for one listing card.

    * two or more ``.tupleNew__area1Type`` elements: the first two texts are
      taken as area and BHK, in that order;
    * exactly one: it is the area, BHK is read from ``.configs__ccl1``;
    * none: area is NaN, BHK is read from ``.configs__ccl1``.

    If reading the ``.tupleNew__area1Type`` elements fails, both values are NaN.
    """
    try:
        elements = row.find_elements(By.CSS_SELECTOR, ".tupleNew__area1Type")
        if len(elements) >= 2:
            return elements[0].text, elements[1].text
        area = elements[0].text if elements else np.nan
    except WebDriverException:
        return np.nan, np.nan
    return area, _text_or_nan(row, ".configs__ccl1")


def _scrape_current_page(driver):
    """Return one record (dict) per listing card found on the current page.

    Every record has the keys ``Name``, ``Location``, ``Price``, ``Area`` and
    ``BHK``; a field that cannot be read is stored as NaN.
    """
    listings = []
    for row in driver.find_elements(By.CSS_SELECTOR, ROW_SELECTOR):
        area, bhk = _area_and_bhk(row)
        listings.append(
            {
                "Name": _text_or_nan(row, ".tupleNew__headingNrera, .PseudoTupleRevamp__headNrating"),
                "Location": _text_or_nan(row, ".tupleNew__propType, .PseudoTupleRevamp__w400Ml4"),
                "Price": _text_or_nan(row, ".tupleNew__priceValWrap, .configs__ccl2"),
                "Area": area,
                "BHK": bhk,
            }
        )
    return listings


data = []
page_count = 0
try:
    while True:
        page_count += 1
        try:
            next_page_button = driver.find_element(By.XPATH, NEXT_PAGE_XPATH)
        except NoSuchElementException:
            # No "Next Page" link means this is the last page: scrape it and stop.
            data.extend(_scrape_current_page(driver))
            print(f"Timeout because we have navigated all the {page_count} pages. \n")
            break

        page_scraped = False
        try:
            # scroll so that the pagination link sits 100px below the viewport top
            driver.execute_script(
                "window.scrollBy(0, arguments[0].getBoundingClientRect().top - 100);",
                next_page_button,
            )
            time.sleep(2)

            data.extend(_scrape_current_page(driver))
            page_scraped = True

            wait.until(
                EC.element_to_be_clickable((By.XPATH, NEXT_PAGE_XPATH))
            ).click()
            time.sleep(5)
        except WebDriverException:
            print("TimeOut on clicking on \"Next Page\".\n")
            # Collect the page we are stuck on exactly once: skip it if its
            # rows were already appended before the click failed.
            if not page_scraped:
                data.extend(_scrape_current_page(driver))
            break

    time.sleep(3)
finally:
    # always close the browser, even if scraping stopped with an error
    driver.quit()

In [None]:
#len(data)

In [None]:
#print(page_count)

In [None]:
def _price_to_lakhs(raw_price):
    """Convert one lower-cased price string into a number of lakhs.

    * non-string input (e.g. NaN for a missing price) -> NaN
    * any text containing "request" -> 0.0
    * "a - b" ranges -> mean of the bounds, each bound converted with its own
      unit; a bound without a unit borrows the unit of a later bound
    * "cr" amounts are multiplied by 100, "lac"/"l" amounts are kept as-is

    A price with no recognisable unit at all is multiplied by 100, which is
    what the previous inline expression did -- NOTE(review): confirm that
    unit-less prices really are in crore.

    Raises ``ValueError`` if a bound is not a plain number once the currency
    sign, thousands separators and unit are removed.
    """
    if not isinstance(raw_price, str):
        return np.nan

    text = raw_price.lower().replace("₹", "").replace(",", "")
    if "request" in text:
        return 0.0

    def unit_factor(bound):
        if "cr" in bound:
            return 100.0
        if "l" in bound:  # matches both "lac" and "l"
            return 1.0
        return None

    def strip_unit(bound):
        # longest tokens first so that "lac" is not left behind as "ac"
        for token in ("crores", "crore", "lakhs", "lakh", "lacs", "lac", "cr", "l"):
            bound = bound.replace(token, "")
        return bound.strip()

    bounds = [bound.strip() for bound in text.split("-")]
    factors = [unit_factor(bound) for bound in bounds]
    fallback = next((f for f in reversed(factors) if f is not None), 100.0)
    amounts = [
        float(strip_unit(bound)) * (factor if factor is not None else fallback)
        for bound, factor in zip(bounds, factors)
    ]
    return sum(amounts) / len(amounts)


def _area_to_sqft(area):
    """Convert a Series of "<number> sqft" strings to floats.

    Thousands separators are removed before the conversion, and missing
    areas are replaced with the median of the available ones.
    """
    numeric = (
        area
        .str.replace("sqft", "", regex=False)
        .str.replace(",", "", regex=False)
        .str.strip()
        .astype(float)
    )
    return numeric.fillna(numeric.median())


df_properties = (
    pd.DataFrame(data)
    .drop_duplicates()
    # normalise every text column: trim whitespace and lower-case
    .apply(lambda col: col.str.strip().str.lower() if col.dtype == "object" else col)
    .assign(
        # 1 when the name contains a line break, else 0; computed from the
        # name as scraped, i.e. before the Name clean-up just below.
        # na=False keeps rows with a missing name from breaking the int cast.
        is_starred=lambda df_: df_.Name.str.contains("\n", na=False).astype(int),
        # drop the "\n<digits/dots>" suffix from the name
        Name=lambda df_: (
            df_.Name
            .str.replace("\n[0-9.]+", "", regex=True)
            .str.strip()
        ),
        # keep only what follows " in " (or a leading "in "), no space after commas
        Location=lambda df_: (
            df_.Location
            .str.split(" in ").str[-1]
            .str.strip()
            .str.replace(", ", ",", regex=False)
            .str.replace(r"^in\s+", "", regex=True)
        ),
        Price=lambda df_: df_.Price.map(_price_to_lakhs),
        Area=lambda df_: _area_to_sqft(df_.Area),
        # leading number of "<n> bhk ..."; rows without one (e.g. "land") become 0
        BHK=lambda df_: (
            df_.BHK
            .str.extract(r"(\d+)\s*bhk", expand=False)
            .fillna(0)
            .astype(int)
        ),
    )
    .rename(columns={"Price": "Price_lakhs", "Area": "Area_sqft"})
)

# to_excel() returns None, so it must not be the last link of the chain that
# is assigned to df_properties -- write the file as a separate step.
df_properties.to_excel("Kolkata_Properties-99acres.xlsx", index=False)

df_properties


### Data Cleaning

In [None]:
# distinct values of the Name column
df_properties["Name"].unique()

array(['shanti kunj - hindmotor', 'akashlina apartment - konnagar',
       'merlin rise', 'f residences merlin', 'orbit dakshini',
       'godrej blue', 'merlin niyasa', 'godrej prakriti\n4.2',
       'siddha suburbia\n3.9', 'crown at aquaview', 'vindhya primrose',
       'ora heights', 'merlin serenia', 'optima',
       'shriram grand city\n3.7', 'purti tatsam', 'cellesta',
       'deeshari palm villa', 'kamalika', 'srijan spacia',
       'merlin skygaze', 'merlin avana', 'ganguly 4sight superia',
       'swapnabhumi township', 'gokul vista', 'siddha serena',
       'quintessa', 'ust heights', 'urban lakes phase 2',
       'chitrakut heights phase 2', 'dtc sojon', 'merlin isle',
       'dtc palm grove', 'ps sansara', 'vinayak amara', 'dtc good earth',
       'morya phase 2', 'natural city madhyamgram', 'urban vista',
       'srijan town square', 'shriram sunshine 2', 'the avalon heights',
       'rishi ventoso phase 2', 'shriram symphony', 'shree miraya',
       'vayu', 'kappa gamma',

In [None]:
# list the distinct names that contain a line break ("\n")
df_properties[df_properties["Name"].str.contains("\n")]["Name"].unique()


array(['godrej prakriti\n4.2', 'siddha suburbia\n3.9',
       'shriram grand city\n3.7', 'prasad rare earth\n3.5',
       'siddha waterfront\n3.8', 'swayam city\n3.3',
       'alcove new kolkata sangam\n3.9', 'ps one 10\n4.2',
       'bengal peerless avidipta phase 2\n4.0',
       'siddha eden lakeville\n3.7'], dtype=object)

In [None]:
# first value of the Location column
df_properties["Location"].iloc[0]

'2 bhk flat in hindmotor, uttarpara'

In [None]:
# the whole Location column
df_properties["Location"]

0      2 bhk flat in hindmotor, uttarpara
1         2 bhk flat in konnagar, hooghly
2                    in rajarhat, kolkata
3                    in rajarhat, kolkata
4                  in bara nagar, kolkata
                      ...                
97                 in joka, kolkata south
98                    in entally, kolkata
99            in em bypass, kolkata south
100               in madhyamgram, kolkata
101                in joka, kolkata south
Name: Location, Length: 102, dtype: object

In [None]:
# pieces of the first location after splitting on " in "
df_properties["Location"].str.split(" in ").iloc[0]


['2 bhk flat', 'hindmotor, uttarpara']

In [None]:
# first location reduced to the part after " in ", with ", " tightened to ","
(
    df_properties["Location"]
    .str.split(" in ")
    .str.get(-1)
    .str.strip()
    .str.replace(", ", ",")
    .iloc[0]
)


'hindmotor,uttarpara'

In [None]:
# distinct values of the Location column
df_properties["Location"].unique()

array(['hindmotor,uttarpara', 'konnagar,hooghly', 'rajarhat,kolkata',
       'bara nagar,kolkata', 'new alipore,kolkata', 'anandapur,em bypass',
       'sodepur,kolkata north', 'baruipur,kolkata south',
       'new town,kolkata', 'narendrapur,kolkata', 'tiljala,em bypass',
       'bt road,kolkata', 'uttarpara,hooghly', 'lake town,kolkata',
       'southern bypass,kolkata', 'chandannagar,hooghly',
       'madhyamgram,kolkata', 'tollygunge,kolkata',
       'nager bazar,jessore road', 'kankurgachi,kolkata',
       'diamond harbour road,kolkata south', 'joka,kolkata south',
       'picnic garden,em bypass', 'golabari,bandhaghat,howrah',
       'garia,kolkata', 'kaikhali,vip road',
       'belgharia expressway,kolkata', 'rajpur,southern bypass',
       'bally,howrah', 'serampore,hooghly', 'beleghata,kolkata',
       'manicktala,kolkata north', 'em bypass,kolkata south',
       'bangur,lake town', 'entally,kolkata', 'khardah,bt road',
       'santragachi,howrah', 'shibpur,howrah', 'barasat,k

In [None]:
# distinct first characters of the price strings
# NOTE(review): needs a string column named "Price"; the cleaning cell renames
# it to "Price_lakhs" and makes it numeric.
df_properties.Price.str.get(0).unique()

array(['₹', 'p'], dtype=object)

In [None]:
# distinct last space-separated tokens of the price strings
# NOTE(review): needs a string column named "Price"; the cleaning cell renames
# it to "Price_lakhs" and makes it numeric.
df_properties.Price.str.split(" ").str.get(-1).unique()

array(['lac', 'l', 'cr', 'request'], dtype=object)

In [None]:
# dtype of the Price column
# NOTE(review): the cleaning cell renames this column to "Price_lakhs".
df_properties.Price.dtype

dtype('float64')

In [None]:
# distinct results of checking each area string for "sqft"
# NOTE(review): needs a string column named "Area"; the cleaning cell renames
# it to "Area_sqft" and makes it numeric.
df_properties.Area.str.contains("sqft").unique()

array([True, nan], dtype=object)

In [None]:
# distinct values of the BHK column
df_properties.BHK.unique()

array([nan, '2 bhk apartment', '3 bhk apartment', '1 bhk apartment',
       '3 bhk villa', '4 bhk apartment', '4 bhk villa', 'land',
       '5 bhk apartment'], dtype=object)