In [None]:
import csv
import time
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager


In [None]:
def setup_driver(headless_mode):
    """Setup the Chrome WebDriver with optional headless mode."""
    chrome_options = Options()
    if headless_mode:
        chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")  # Bypass OS security model
    chrome_options.add_argument("--disable-gpu")  # Disable GPU hardware acceleration
    chrome_options.add_argument("--window-size=1920x1080")
    chrome_options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems
    chrome_options.add_argument(
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36')

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
    return driver


In [1]:

def scrape_square_yards_data(output_file, max_pages, headless_mode=True):
    driver = setup_driver(headless_mode)

    driver.maximize_window()
    url = 'https://www.squareyards.com/sale/property-for-sale-in-bangalore'
    driver.get(url)

    property_type = []
    address = []
    price_range = []
    total_area = []
    all_features = []
    listing_type = []
    amenities = []
    project_info_text = []
    price = []

    for page_num in range(1, max_pages + 1):
        driver.get(f"https://www.squareyards.com/sale/property-for-sale-in-bangalore?page={page_num}")
        time.sleep(np.random.randint(4, 5))  # Random delay to avoid detection
        
        elements = driver.find_elements(By.XPATH, "//div[contains(@class, 'dseprojectdata') and contains(@class, 'tileNo_')]")
        print(f"Scraping page {page_num}, found {len(elements)} elements")
        
        for element in elements:
            try:
                driver.execute_script("arguments[0].scrollIntoView(true);", element)
                time.sleep(np.random.uniform(1, 2))  # Small delay for stability
                element.click()
                driver.switch_to.window(driver.window_handles[1])

                # Retrieve price
                price_element = WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div[2]/div[1]/div[2]/div[1]/strong"))
                )
                price.append(price_element.text)

                # Retrieve project information
                project_info_element = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.ID, "projectInformation"))
                )
                project_info_text.append(project_info_element.text)

                # Retrieve amenities
                amenities_elements = driver.find_elements(By.XPATH, "/html/body/div[6]/div/div/div[1]/div[3]/div[2]/div/table")
                amenities_text = ", ".join([amenity.text for amenity in amenities_elements]) if amenities_elements else "No amenities found"
                amenities.append(amenities_text)

                driver.close()
                driver.switch_to.window(driver.window_handles[0])

            except Exception as e:
                print(f"Error processing element: {e}")
                continue

        # Write data to CSV after each page
        mode = 'a' if page_num > 1 else 'w'
        with open(output_file, mode, newline='', encoding="utf-8") as csvfile:
            fieldnames = ['price', 'project_info_text', 'amenities']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            if mode == 'w':
                writer.writeheader()
            for i in range(len(price)):
                writer.writerow({
                    'price': price[i],
                    'project_info_text': project_info_text[i],
                    'amenities': amenities[i]
                })
        
        print(f"Page {page_num} data saved")

    driver.quit()



Enter the output CSV file name:  newdatacheck
Enter the number of pages to scrape:  1
Enter 'headless' for headless mode, or press Enter for standard mode:  


Scraping page 1, found 41 elements
Error processing element: list index out of range
Error processing element: list index out of range
Page 1 data saved


In [None]:
# Run the scraper
output_file = input("Enter the output CSV file name: ") + ".csv"
max_pages = int(input("Enter the number of pages to scrape: "))
browser_mode = input("Enter 'headless' for headless mode, or press Enter for standard mode: ")
headless_mode = True if browser_mode.lower() == 'headless' else False

scrape_square_yards_data(output_file, max_pages, headless_mode)
