In [1]:
import os
import time
import random
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

In [2]:
def _safe_extract(soup, tag, attr_type, attr_value):
    """Pull stripped text out of a parsed page without ever raising.

    Args:
        soup: Parsed page exposing ``find`` / ``find_all`` (a BeautifulSoup object).
        tag: Tag name to look for, e.g. ``'span'``.
        attr_type: ``'id'`` or ``'class'`` to fetch the first match, or
            ``'all_class'`` to fetch every element with the given class.
        attr_value: The id or class string to match.

    Returns:
        The stripped text of the first match for ``'id'`` / ``'class'``;
        a list of stripped texts (possibly empty) for ``'all_class'``;
        ``np.nan`` when nothing matches, ``attr_type`` is unknown, or the
        lookup fails.
    """
    try:
        if attr_type == "id":
            element = soup.find(tag, id=attr_value)
        elif attr_type == "class":
            element = soup.find(tag, class_=attr_value)
        elif attr_type == "all_class":
            return [item.text.strip() for item in soup.find_all(tag, class_=attr_value)]
        else:
            return np.nan
        return element.text.strip() if element else np.nan
    except Exception:
        # Best-effort extraction: a malformed element yields NaN. Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit propagate.
        return np.nan


def _extract_property(soup, region, link):
    """Build one output record (dict of column name -> value) from a property detail page."""
    # Try the area spans in order and stop at the first non-empty value;
    # if none is usable, the result of the last lookup is kept.
    area = np.nan
    for span_id in ('superArea_span', 'superbuiltupArea_span', 'builtupArea_span', 'carpetArea_span'):
        area = _safe_extract(soup, 'span', 'id', span_id)
        if pd.notna(area) and area != '':
            break

    # Prefer the floor-number label and fall back to the total-floor span.
    floor = _safe_extract(soup, 'span', 'id', 'Floor_Num_Label')
    if pd.isnull(floor):
        floor = _safe_extract(soup, 'span', 'id', 'Total_Floor')

    # The value is read from the second-to-last details block; guard the
    # index so a page with fewer than two blocks yields NaN instead of an
    # IndexError that would discard the whole record.
    details = soup.find_all('div', class_='component__details')
    available_from = details[-2].text.strip() if len(details) >= 2 else np.nan

    return {
        "Region": f"Bangalore {region.title()}",
        "Link": link,
        "Bedroom": _safe_extract(soup, 'span', 'id', 'bedRoomNum'),
        "Bathroom": _safe_extract(soup, 'span', 'id', 'bathroomNum'),
        "Balcony": _safe_extract(soup, 'span', 'id', 'balconyNum'),
        "Additional_rooms": _safe_extract(soup, 'span', 'id', 'additionalRooms'),
        "Area": area,
        "Facing": _safe_extract(soup, 'span', 'id', 'Facing_Label'),
        "Furnishing": _safe_extract(soup, 'span', 'id', 'furnishingLabel'),
        "Rating": _safe_extract(soup, 'div', 'class', 'display_l_semiBold'),
        "Address": _safe_extract(soup, 'span', 'class', 'component__pdPropAddress'),
        "Nearby": _safe_extract(soup, 'span', 'all_class', 'NearByLocation__infoText'),
        "Power_backup": _safe_extract(soup, 'span', 'id', 'Powerbackup_Label'),
        "Parking": _safe_extract(soup, 'span', 'id', 'Reserved_Parking_Label'),
        "Charges": _safe_extract(soup, 'span', 'id', 'electricityWaterCharges'),
        "Posted_By_and_On": _safe_extract(soup, 'span', 'id', 'postedOnAndByLabel'),
        "Age": _safe_extract(soup, 'span', 'id', 'Age_Label'),
        "Pet_friendly": _safe_extract(soup, 'span', 'id', 'PetFriendly'),
        "Floor": floor,
        "Available_from": available_from,
        "Available_for": _safe_extract(soup, 'span', 'id', 'availableForLabel'),
        "Type": _safe_extract(soup, 'div', 'class', 'component__pdPropConfSide component__rentHeading pd__rentHeading'),
        "Advance": _safe_extract(soup, 'div', 'class', 'component__tableTooltip'),
        # Same selector as "Address"; kept so the output columns are unchanged.
        "Location": _safe_extract(soup, 'span', 'class', 'component__pdPropAddress'),
        "Rent": _safe_extract(soup, 'span', 'id', 'pdPrice2')
    }


def scrape_99acres(num_iterations=10):
    """Scrape randomly chosen 99acres Bangalore rental listings into an Excel file.

    Each iteration picks a random region and a random listing page within that
    region's configured page range, samples up to 5 property links from the
    page, and extracts one record per property. Records are appended to any
    rows already present in the output workbook, and the combined data is
    written back when the run ends (including when it is interrupted or fails).

    Args:
        num_iterations: Number of random listing pages to visit.

    Returns:
        None. Results are written to
        ``99acres_Bangalore_scraped_data/99acres_scraped_data.xlsx``.
    """
    folder_name = "99acres_Bangalore_scraped_data"
    os.makedirs(folder_name, exist_ok=True)
    final_file = os.path.join(folder_name, "99acres_scraped_data.xlsx")

    # Region slug -> (first page, last page) range to sample from.
    regions = {
        "central": (5, 21),
        "north": (5, 100),
        "south": (5, 100),
        "east": (5, 100),
        "west": (5, 42)
    }

    # Load previous results BEFORE launching the browser so an unreadable
    # workbook fails fast instead of leaving an orphaned Chrome process.
    all_data = []
    if os.path.exists(final_file):
        all_data = pd.read_excel(final_file).to_dict(orient='records')

    chrome_options = Options()
    # Uncomment to run without a visible browser window.
    #chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-blink-features=AutomationControlled")
    chrome_options.add_argument("--disable-infobars")
    chrome_options.add_argument("--disable-extensions")
    chrome_options.add_argument("--disable-gpu")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.0.0 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        for _ in range(num_iterations):
            region = random.choice(list(regions.keys()))
            start_page, end_page = regions[region]
            page_number = random.randint(start_page, end_page)

            try:
                print(f"Scraping {region.title()} - Page {page_number}...")
                url = f'https://www.99acres.com/property-for-rent-in-bangalore-{region}-ffid-page-{page_number}'
                driver.get(url)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "html")))
                # Randomised pause between requests.
                time.sleep(random.uniform(5, 10))

                soup = BeautifulSoup(driver.page_source, 'lxml')
                anchors = soup.find_all('a', class_="tupleNew__propertyHeading ellipsis")
                # Skip anchors without an href; driver.get(None) would just fail.
                links = [a.get("href") for a in anchors if a.get("href")]

                if not links:
                    print(f"No properties found on page {page_number}.")
                    continue

                selected_links = random.sample(links, min(5, len(links)))

                for link in selected_links:
                    try:
                        driver.get(link)
                        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "html")))
                        time.sleep(random.uniform(4, 7))

                        new_soup = BeautifulSoup(driver.page_source, 'lxml')
                        all_data.append(_extract_property(new_soup, region, link))
                        print(f"Scraped property: {link}")

                        time.sleep(random.uniform(5, 8))
                    except Exception as e:
                        # One bad property must not abort the rest of the page.
                        print(f"Error processing link {link}: {e}")
            except Exception as e:
                # One bad listing page must not abort the remaining iterations.
                print(f"Error scraping page {page_number}: {e}")
    finally:
        # Persist first so a failing driver.quit() cannot lose scraped rows;
        # the nested finally still closes the browser if the save fails.
        try:
            pd.DataFrame(all_data).to_excel(final_file, index=False)
            print(f"Data saved in '{final_file}'")
        finally:
            driver.quit()

In [3]:
# Visit 20 randomly chosen listing pages; results are appended to the Excel file.
scrape_99acres(num_iterations=20)

Scraping South - Page 58...
Scraped property: https://www.99acres.com/1-bhk-bedroom-independent-builder-floor-for-rent-in-sector-4-hsr-layout-bangalore-south-600-sq-ft-spid-J80871885
Scraped property: https://www.99acres.com/3-bhk-bedroom-apartment-flat-for-rent-in-sattva-salarpuria-greenage-bommanahalli-bangalore-south-1700-sq-ft-spid-N80601939
Scraped property: https://www.99acres.com/2-bhk-bedroom-independent-builder-floor-for-rent-in-7th-phase-jp-nagar-bangalore-south-1200-sq-ft-spid-E80922215
Scraped property: https://www.99acres.com/2-bhk-bedroom-independent-builder-floor-for-rent-in-sector-2-hsr-layout-bangalore-south-1200-sq-ft-spid-E80829645
Scraped property: https://www.99acres.com/1-bhk-bedroom-independent-builder-floor-for-rent-in-sector-3-hsr-layout-bangalore-south-600-sq-ft-spid-G80876275
Scraping North - Page 95...
Scraped property: https://www.99acres.com/3-bhk-bedroom-apartment-flat-for-rent-in-embassy-lake-terraces-hebbal-kempapura-bangalore-north-3789-sq-ft-spid-G801