In [1]:
import os
import time
import random
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

### Description:
This script scrapes rental property listings from 99acres.com for the Bangalore Central region.
It automates the browser using Selenium, extracts data with BeautifulSoup,
and saves the scraped details in Excel files.

### Key functionalities:
- Uses a headless Chrome browser for scraping.
- Iterates over multiple pages within a specified range.
- Extracts various property details (e.g., area, rent, furnishing, location).
- Handles missing data gracefully with a `safe_extract` function.
- Saves processed data in Excel format per page.
- Maintains a separate log for unprocessed property links.
- Introduces random delays to mimic human behavior and avoid detection.

In [2]:
def scrape_99acres(start_page, end_page):
    """Scrape 99acres Bangalore Central rental listings for a range of result pages.

    For every result page from ``start_page`` to ``end_page`` (inclusive) the
    listing page is opened in headless Chrome, each property link found on it
    is visited, and the extracted fields are written to
    ``99acres_rentals_page_<n>.xlsx`` inside the folder
    ``99acres_Bangalore_Central_scraped_data``. Links that raise while being
    processed are appended to ``99acres_unprocessed.xlsx`` (sheet
    ``Unprocessed Links``) in the same folder.

    Args:
        start_page: First result-page number to scrape.
        end_page: Last result-page number to scrape (inclusive).

    Returns:
        None. Results are written to disk and progress is printed.
    """
    folder_name = "99acres_Bangalore_Central_scraped_data"
    os.makedirs(folder_name, exist_ok=True)

    unprocessed_file = os.path.join(folder_name, "99acres_unprocessed.xlsx")

    def safe_extract(soup, tag, attr_type, attr_value):
        """Return the stripped text of a tag looked up by id/class, or NaN.

        ``attr_type`` "id" and "class" return the first match's text;
        "all_class" returns a list with the text of every match. Any other
        ``attr_type``, a missing element, or a lookup error yields NaN.
        """
        try:
            if attr_type == "all_class":
                elements = soup.find_all(tag, class_=attr_value)
                return [item.text.strip() for item in elements]
            if attr_type == "id":
                element = soup.find(tag, id=attr_value)
            elif attr_type == "class":
                element = soup.find(tag, class_=attr_value)
            else:
                return np.nan
            return element.text.strip() if element else np.nan
        except Exception:
            return np.nan

    def extract_property(soup, link):
        """Build the record (one spreadsheet row) for a parsed property page."""
        # Try the area spans in order and keep the first non-empty value.
        area = ''
        for area_id in ('superArea_span', 'superbuiltupArea_span', 'builtupArea_span', 'carpetArea_span'):
            area = safe_extract(soup, 'span', 'id', area_id)
            if area != '' and pd.notna(area):
                break

        # Not guarded on purpose: a page with no 'component__details' block
        # raises IndexError here and the caller records the link as unprocessed.
        # NOTE(review): presumably this doubles as a "page really loaded" check — confirm.
        details = soup.find_all('div', class_='component__details')
        available_from = details[len(details) - 2].text.strip()

        floor = safe_extract(soup, 'span', 'id', 'Floor_Num_Label')
        if pd.isnull(floor):
            floor = safe_extract(soup, 'span', 'id', 'Total_Floor')

        return {
            "Link": link,
            "Bedroom": safe_extract(soup, 'span', 'id', 'bedRoomNum'),
            "Bathroom": safe_extract(soup, 'span', 'id', 'bathroomNum'),
            "Balcony": safe_extract(soup, 'span', 'id', 'balconyNum'),
            "Additional_rooms": safe_extract(soup, 'span', 'id', 'additionalRooms'),
            "Area": area,
            "Facing": safe_extract(soup, 'span', 'id', 'Facing_Label'),
            "Furnishing": safe_extract(soup, 'span', 'id', 'furnishingLabel'),
            "Rating": safe_extract(soup, 'div', 'class', 'display_l_semiBold'),
            "Address": safe_extract(soup, 'span', 'class', 'component__pdPropAddress'),
            "Nearby": safe_extract(soup, 'span', 'all_class', 'NearByLocation__infoText'),
            "Power_backup": safe_extract(soup, 'span', 'id', 'Powerbackup_Label'),
            "Parking": safe_extract(soup, 'span', 'id', 'Reserved_Parking_Label'),
            "Charges": safe_extract(soup, 'span', 'id', 'electricityWaterCharges'),
            "Posted_By_and_On": safe_extract(soup, 'span', 'id', 'postedOnAndByLabel'),
            "Age": safe_extract(soup, 'span', 'id', 'Age_Label'),
            "Pet_friendly": safe_extract(soup, 'span', 'id', 'PetFriendly'),
            "Floor": floor,
            "Avaliable_from": available_from,
            "Available_for": safe_extract(soup, 'span', 'id', 'availableForLabel'),
            "Type": safe_extract(soup, 'div', 'class', 'component__pdPropConfSide component__rentHeading pd__rentHeading'),
            "Advance": safe_extract(soup, 'div', 'class', 'component__tableTooltip'),
            "Location": safe_extract(soup, 'span', 'class', 'component__pdPropAddress'),
            "Rent": safe_extract(soup, 'span', 'id', 'pdPrice2')
        }

    chrome_options = Options()
    for flag in (
        "--headless",
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.0.0 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    unprocessed_links = []

    # try/finally so the browser process is released even if the run is
    # interrupted or an unexpected error escapes the per-page handling.
    try:
        for page_number in range(start_page, end_page + 1):
            page_data = []

            try:
                print(f"Scraping page {page_number}...")

                url = f'https://www.99acres.com/property-for-rent-in-bangalore-central-ffid-page-{page_number}'
                driver.get(url)

                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "html")))
                time.sleep(random.uniform(5, 10))

                soup = BeautifulSoup(driver.page_source, 'lxml')
                links = [a.get("href") for a in soup.find_all('a', class_="tupleNew__propertyHeading ellipsis")]

                if not links:
                    # Nothing is recorded for such a page, so the message says so.
                    print(f"No properties found on page {page_number}. Skipping page.")
                    continue

                for link in links:
                    try:
                        driver.get(link)
                        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "html")))
                        time.sleep(random.uniform(4, 7))

                        new_soup = BeautifulSoup(driver.page_source, 'lxml')
                        page_data.append(extract_property(new_soup, link))

                        print(f"Scraped property: {link}")

                    except Exception as e:
                        print(f"Error processing link {link}: {e}")
                        unprocessed_links.append({"Page": page_number, "Link": link})
                        continue

                    time.sleep(random.uniform(5, 8))

            except Exception as e:
                print(f"Error scraping page {page_number}: {e}")
                continue

            df_page = pd.DataFrame(page_data)

            file_name = os.path.join(folder_name, f"99acres_rentals_page_{page_number}.xlsx")
            df_page.to_excel(file_name, index=False)
    finally:
        driver.quit()

    # Explicit columns keep the "Page"/"Link" header even when nothing failed,
    # so the workbook always has the two columns a later retry pass reads.
    df_unprocessed_links = pd.DataFrame(unprocessed_links, columns=["Page", "Link"])

    if os.path.exists(unprocessed_file):
        # read_excel on the path closes the workbook before it is rewritten below.
        df_old_links = pd.read_excel(unprocessed_file, sheet_name="Unprocessed Links")
        non_empty = [frame for frame in (df_old_links, df_unprocessed_links) if not frame.empty]
        if non_empty:
            df_unprocessed_links = pd.concat(non_empty, ignore_index=True)

    df_unprocessed_links.to_excel(unprocessed_file, sheet_name="Unprocessed Links", index=False)

    print(f"Data saved in '{folder_name}'")

In [3]:
# Scrape result page 21 only (start and end page are both 21).
scrape_99acres(start_page=21, end_page=21)

Scraping page 21...
Scraped property: https://www.99acres.com/studio-apartment-flat-for-rent-in-divyam-apartments-ulsoor-bangalore-central-120-sq-ft-r1-spid-P77468985
Scraped property: https://www.99acres.com/2-bhk-bedroom-independent-builder-floor-for-rent-in-kamraj-road-bangalore-central-1200-sq-ft-spid-J78623779
Scraped property: https://www.99acres.com/studio-apartment-flat-for-rent-in-brigade-road-bangalore-central-250-sq-ft-spid-J78502119
Scraped property: https://www.99acres.com/2-bhk-bedroom-independent-builder-floor-for-rent-in-sudhama-nagar-bangalore-central-500-sq-ft-spid-Y78317321
Scraped property: https://www.99acres.com/2-bhk-bedroom-apartment-flat-for-rent-in-langford-and-richmond-town-richmond-town-bangalore-central-1000-sq-ft-r2-spid-E60729090
Scraped property: https://www.99acres.com/1-bhk-bedroom-independent-builder-floor-for-rent-in-ulsoor-bangalore-central-450-sq-ft-r1-spid-R76861511
Scraped property: https://www.99acres.com/1-bhk-bedroom-independent-builder-floor-

### Description:
This script retries scraping rental property listings from 99acres.com that were previously unprocessed. 
It reads unprocessed links from an Excel file, extracts property details using Selenium and BeautifulSoup, 
and appends the collected data to the corresponding page file from the initial scraping attempt.

### Key functionalities:
- Reads unprocessed property links and their corresponding page numbers from an Excel file.
- Uses a Chrome browser to scrape data (unlike the first pass, this retry does not enable headless mode).
- Extracts various property details (e.g., area, rent, furnishing, location).
- Handles missing data with a `safe_extract` function.
- Appends newly scraped data to the same page file it originally belonged to.
- Updates the unprocessed links file with any links that still fail to load.
- Introduces random delays to mimic human behavior and avoid detection.

In [4]:
def scrape_Unprocessed_links(folder_name, headless=False):
    """Retry the property links recorded in ``99acres_unprocessed.xlsx``.

    Reads the first sheet of ``<folder_name>/99acres_unprocessed.xlsx``, taking
    the first column as the page number and the second as the property link.
    Each link is opened in Chrome; on success the extracted row is appended to
    ``99acres_rentals_page_<page>.xlsx`` in the same folder. Links that raise
    again are collected and written back, replacing the previous contents of
    the unprocessed workbook.

    Args:
        folder_name: Folder holding the per-page workbooks and the
            unprocessed-links workbook.
        headless: When True, Chrome is started with ``--headless``. Defaults
            to False, which keeps the visible-browser behaviour this function
            has always had.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    unprocessed_file = os.path.join(folder_name, "99acres_unprocessed.xlsx")

    df = pd.read_excel(unprocessed_file)
    # A workbook written from an empty list has no rows (and possibly no
    # columns); positional column access below would raise IndexError.
    if df.empty or df.shape[1] < 2:
        print(f"No unprocessed links found in '{unprocessed_file}'")
        return
    pending = list(zip(df.iloc[:, 0], df.iloc[:, 1]))

    def safe_extract(soup, tag, attr_type, attr_value):
        """Return the stripped text of a tag looked up by id/class, or NaN.

        ``attr_type`` "id" and "class" return the first match's text;
        "all_class" returns a list with the text of every match. Any other
        ``attr_type``, a missing element, or a lookup error yields NaN.
        """
        try:
            if attr_type == "all_class":
                elements = soup.find_all(tag, class_=attr_value)
                return [item.text.strip() for item in elements]
            if attr_type == "id":
                element = soup.find(tag, id=attr_value)
            elif attr_type == "class":
                element = soup.find(tag, class_=attr_value)
            else:
                return np.nan
            return element.text.strip() if element else np.nan
        except Exception:
            return np.nan

    def extract_property(soup, link):
        """Build the record (one spreadsheet row) for a parsed property page."""
        # Try the area spans in order and keep the first non-empty value.
        area = ''
        for area_id in ('superArea_span', 'superbuiltupArea_span', 'builtupArea_span', 'carpetArea_span'):
            area = safe_extract(soup, 'span', 'id', area_id)
            if area != '' and pd.notna(area):
                break

        # Not guarded on purpose: a page with no 'component__details' block
        # raises IndexError here and the caller keeps the link as unprocessed.
        # NOTE(review): presumably this doubles as a "page really loaded" check — confirm.
        details = soup.find_all('div', class_='component__details')
        available_from = details[len(details) - 2].text.strip()

        floor = safe_extract(soup, 'span', 'id', 'Floor_Num_Label')
        if pd.isnull(floor):
            floor = safe_extract(soup, 'span', 'id', 'Total_Floor')

        return {
            "Link": link,
            "Bedroom": safe_extract(soup, 'span', 'id', 'bedRoomNum'),
            "Bathroom": safe_extract(soup, 'span', 'id', 'bathroomNum'),
            "Balcony": safe_extract(soup, 'span', 'id', 'balconyNum'),
            "Additional_rooms": safe_extract(soup, 'span', 'id', 'additionalRooms'),
            "Area": area,
            "Facing": safe_extract(soup, 'span', 'id', 'Facing_Label'),
            "Furnishing": safe_extract(soup, 'span', 'id', 'furnishingLabel'),
            "Rating": safe_extract(soup, 'div', 'class', 'display_l_semiBold'),
            "Address": safe_extract(soup, 'span', 'class', 'component__pdPropAddress'),
            "Nearby": safe_extract(soup, 'span', 'all_class', 'NearByLocation__infoText'),
            "Power_backup": safe_extract(soup, 'span', 'id', 'Powerbackup_Label'),
            "Parking": safe_extract(soup, 'span', 'id', 'Reserved_Parking_Label'),
            "Charges": safe_extract(soup, 'span', 'id', 'electricityWaterCharges'),
            "Posted_By_and_On": safe_extract(soup, 'span', 'id', 'postedOnAndByLabel'),
            "Age": safe_extract(soup, 'span', 'id', 'Age_Label'),
            "Pet_friendly": safe_extract(soup, 'span', 'id', 'PetFriendly'),
            "Floor": floor,
            "Avaliable_from": available_from,
            "Available_for": safe_extract(soup, 'span', 'id', 'availableForLabel'),
            "Type": safe_extract(soup, 'div', 'class', 'component__pdPropConfSide component__rentHeading pd__rentHeading'),
            "Advance": safe_extract(soup, 'div', 'class', 'component__tableTooltip'),
            "Location": safe_extract(soup, 'span', 'class', 'component__pdPropAddress'),
            "Rent": safe_extract(soup, 'span', 'id', 'pdPrice2')
        }

    chrome_options = Options()
    if headless:
        chrome_options.add_argument("--headless")
    for flag in (
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(flag)
    chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.0.0 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    unprocessed_links = []

    # try/finally so the browser process is released even if the run is
    # interrupted or an unexpected error escapes the per-link handling.
    try:
        for page_number, link in pending:
            try:
                driver.get(link)
                WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "html")))
                time.sleep(random.uniform(4, 7))

                new_soup = BeautifulSoup(driver.page_source, 'lxml')
                record = extract_property(new_soup, link)

                print(f"Scraped property: {link}")

            except Exception as e:
                print(f"Error processing link {link}: {e}")
                unprocessed_links.append({"Page": page_number, "Link": link})
                continue

            time.sleep(random.uniform(5, 8))

            # Persist after every link: append the row to the workbook of the
            # page it came from, creating that workbook if it does not exist.
            df_page = pd.DataFrame([record])
            file_name = os.path.join(folder_name, f"99acres_rentals_page_{page_number}.xlsx")

            if os.path.exists(file_name):
                df_old = pd.read_excel(file_name)
                df_page = pd.concat([df_old, df_page], ignore_index=True)

            df_page.to_excel(file_name, index=False)
    finally:
        driver.quit()

    # Explicit columns keep the "Page"/"Link" header even when every retry
    # succeeded, so the rewritten workbook always carries both columns.
    df_unprocessed_links = pd.DataFrame(unprocessed_links, columns=["Page", "Link"])
    df_unprocessed_links.to_excel(unprocessed_file, sheet_name="Unprocessed Links", index=False)

    print(f"Data saved in '{folder_name}'")

In [5]:
# Retry the links recorded as unprocessed in the Bangalore Central output folder.
scrape_Unprocessed_links(folder_name="99acres_Bangalore_Central_scraped_data")

Scraped property: https://www.99acres.com/3-bhk-bedroom-apartment-flat-for-rent-in-cooke-town-bangalore-central-1750-sq-ft-r1-spid-E78896081
Scraped property: https://www.99acres.com/2-bhk-bedroom-apartment-flat-for-rent-in-deepa-residency-someshwarpura-bangalore-central-1100-sq-ft-spid-K80075533
Data saved in '99acres_Bangalore_Central_scraped_data'
