In [2]:
import os
import time
import random
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

def scrape_floors(folder_name, start_page, end_page):
    """Backfill missing 'Floor' values in saved 99acres page workbooks.

    For each page number in ``start_page..end_page`` (inclusive) the workbook
    ``{folder_name}/99acres_rentals_page_{page}.xlsx`` is loaded. Every row
    whose 'Floor' cell is null has its 'Link' opened in Chrome, and the text of
    the ``<span id="Total_Floor">`` element (``None`` when the page has no such
    element) is stored in that row's 'Floor' cell. The workbook is then
    overwritten in place.

    Args:
        folder_name: Directory that holds the per-page workbooks.
        start_page: First page number to process.
        end_page: Last page number to process (inclusive).

    Raises:
        Exceptions from ``pd.read_excel`` / ``DataFrame.to_excel`` and the
        ``KeyError`` for a workbook without a 'Floor' column propagate to the
        caller. Errors while scraping a single row are printed and skipped.
        In every case the browser is closed before the function exits.
    """
    chrome_options = Options()
    for argument in (
        "--disable-blink-features=AutomationControlled",
        "--disable-infobars",
        "--disable-extensions",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-dev-shm-usage",
    ):
        chrome_options.add_argument(argument)
    # The Chrome major version in the user agent is randomised once per call.
    chrome_options.add_argument(f"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{random.randint(90, 110)}.0.0.0 Safari/537.36")

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        for page in range(start_page, end_page + 1):
            file_path = f'{folder_name}/99acres_rentals_page_{page}.xlsx'
            if not os.path.exists(file_path):
                print(f"File not found: {file_path}")
                continue

            df = pd.read_excel(file_path)

            pending = df.index[df['Floor'].isnull()]
            if pending.empty:
                # Nothing to fill in, so leave the workbook untouched.
                continue

            # A column that is entirely empty is read as float64; make it
            # object so the scraped strings can be stored without a dtype
            # conflict.
            df['Floor'] = df['Floor'].astype(object)

            try:
                for index in pending:
                    try:
                        url = df.at[index, 'Link']
                        if not isinstance(url, str) or not url.strip():
                            print(f"Skipping index {index} on page {page}: no usable link")
                            continue

                        driver.get(url)
                        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "html")))
                        # Random pause between requests.
                        time.sleep(random.uniform(4, 7))

                        new_soup = BeautifulSoup(driver.page_source, 'lxml')
                        element = new_soup.find('span', id='Total_Floor')
                        floor_value = element.text.strip() if element else None

                        df.at[index, 'Floor'] = floor_value
                        print(f"Updated index {index} on page {page} with floor value: {floor_value}")

                    except Exception as e:
                        # Best effort: one bad listing must not stop the page.
                        print(f"Error processing index {index} on page {page}: {e}")
            finally:
                # Save whatever was collected, even if the run is interrupted
                # part-way through the page.
                df.to_excel(file_path, index=False)
    finally:
        # Always release the browser, including on errors and interrupts.
        driver.quit()

In [None]:
# Backfill floor values for the Bangalore East workbooks, pages 91 through 100.
scrape_floors(
    folder_name='99acres_Bangalore_East_scraped_data',
    start_page=91,
    end_page=100,
)

Updated index 1 on page 91 with floor value: 1 Floors
Updated index 4 on page 91 with floor value: 1 Floors
Updated index 5 on page 91 with floor value: 1 Floors
Updated index 10 on page 91 with floor value: None
Updated index 12 on page 91 with floor value: 2 Floors
Updated index 15 on page 91 with floor value: 1 Floors
Updated index 20 on page 91 with floor value: 2 Floors
Updated index 21 on page 91 with floor value: 4 Floors
Updated index 22 on page 91 with floor value: 2 Floors
Updated index 2 on page 92 with floor value: 3 Floors
Updated index 9 on page 92 with floor value: 1 Floors
Updated index 11 on page 92 with floor value: 4 Floors
Updated index 13 on page 92 with floor value: 1 Floors
Updated index 14 on page 92 with floor value: 1 Floors
Updated index 8 on page 93 with floor value: 2 Floors
Updated index 11 on page 93 with floor value: None
Updated index 13 on page 93 with floor value: 1 Floors
Updated index 14 on page 93 with floor value: 3 Floors
Updated index 15 on page