In [1]:
# Import necessary Selenium and Python modules
from selenium import webdriver  # Controls the browser
from selenium.webdriver.common.by import By  # Allows selection of elements by tag, class, etc.
from selenium.webdriver.chrome.service import Service  # Manages ChromeDriver service
from selenium.webdriver.chrome.options import Options  # Configures browser settings
from selenium.webdriver.support.ui import WebDriverWait  # Waits for elements to load
from selenium.webdriver.support import expected_conditions as EC  # Defines wait conditions
import pandas as pd  # For storing and exporting data
import time  # For sleep delays
import random  # For randomized retry delays
 
# -------------------------------
# Browser configuration
# -------------------------------
# Command-line flags handed to Chrome at launch:
#   --start-maximized         open the window maximized
#   --disable-gpu             turn off GPU acceleration
#   --window-size=1920,1080   request a fixed window size
# "--headless" is deliberately not included; per the original author's note
# it may prevent dynamic content from loading.
options = Options()
for chrome_flag in ("--start-maximized", "--disable-gpu", "--window-size=1920,1080"):
    options.add_argument(chrome_flag)

# Start Chrome with the flags above. `wait` is a shared explicit wait whose
# timeout is 10 seconds; later steps use it to block until elements exist.
driver = webdriver.Chrome(service=Service(), options=options)
wait = WebDriverWait(driver, 10)

# -------------------------------
# Step 1: Collect all car listing URLs via scrolling
# -------------------------------
# The listing page loads more cards as it is scrolled. Keep scrolling to the
# bottom and harvesting card hrefs until three consecutive scrolls produce
# nothing new.
base_url = "https://www.cars24.com/buy-used-hyundai-cars-mumbai/?sort=bestmatch&storeCityId=2378"
driver.get(base_url)  # Open the main listing page
time.sleep(5)  # Give the first batch of cards time to render

car_urls = []          # Unique car URLs, in order of first appearance
seen_urls = set()      # Membership set backing the de-duplication
last_count = 0         # Number of card elements found on the previous scroll
same_count_rounds = 0  # Consecutive scrolls that produced nothing new

while True:
    # Scroll to the bottom of the page to trigger dynamic loading
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(3)  # Wait for newly loaded cars to appear

    # Every card currently in the DOM (earlier cards are re-read each pass)
    car_elements = driver.find_elements(By.CLASS_NAME, "styles_carCardWrapper__sXLIp")

    # Read each href exactly once: every get_attribute call is a round-trip
    # to the browser, so calling it twice per card doubles the cost.
    hrefs = (car.get_attribute("href") for car in car_elements)
    urls = [href for href in hrefs if href]

    # Mark a URL as seen the moment it is accepted, so a URL that appears
    # twice within the same pass is only collected once.
    new_urls = []
    for url in urls:
        if url not in seen_urls:
            seen_urls.add(url)
            new_urls.append(url)
    car_urls.extend(new_urls)

    # Log progress
    print(f"Currently visible cars: {len(car_elements)} | Collected unique: {len(car_urls)}")

    # A round is idle only when the card count is unchanged AND no new URL
    # turned up; an unchanged count with fresh URLs still counts as progress.
    if len(car_elements) == last_count and not new_urls:
        same_count_rounds += 1
    else:
        same_count_rounds = 0

    last_count = len(car_elements)

    # Stop scrolling once three consecutive rounds were idle
    if same_count_rounds >= 3:
        print("Reached bottom of the listings.")
        break

print(f"\nTotal unique car URLs collected: {len(car_urls)}\n")

# -------------------------------
# Step 2: Visit each car page and extract details
# -------------------------------
# For every collected URL, open the detail page and read model, location,
# price and selected "Know Your Car" specs. Each URL gets up to MAX_ATTEMPTS
# tries; URLs that never succeed are recorded in `failed_urls` instead of
# being dropped without a trace.
MAX_ATTEMPTS = 3  # Page loads per car before giving up
SPEC_KEYS = {"Fuel", "KM driven", "Transmission"}  # Spec labels to keep

carList = []      # One dict per successfully scraped car
failed_urls = []  # URLs that still failed after MAX_ATTEMPTS (for a later re-run)

for idx, car_url in enumerate(car_urls):
    for attempt in range(MAX_ATTEMPTS):
        try:
            driver.get(car_url)  # Open car detail page
            time.sleep(2)  # Wait for page load

            # Main details: each wait blocks until the element is in the DOM
            # or raises on timeout, which the except below turns into a retry.
            car_model = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "styles_carName__xzcd4"))
            ).text
            location = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "styles_carLocation__UrZVn"))
            ).text
            price = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "styles_price__3yE9i"))
            ).text.split("\n")

            carData = {
                "Car Model": car_model,
                "Location": location,
                "Price": price[0],  # Only the first line of the price element's text
            }

            # "Know Your Car" section: take the first content container and
            # all <p> tags inside it (labels and values).
            specs_section = wait.until(
                EC.presence_of_element_located((By.ID, "CATALOG_CDP_KNOW_YOUR_CAR"))
            )
            specs_container = specs_section.find_elements(By.CLASS_NAME, "styles_content__KtXDs")[0]
            specs_items = specs_container.find_elements(By.TAG_NAME, "p")

            # Fetch each element's text once, then pair every <p> with the one
            # that follows it; a pair is kept when its first text is a wanted label.
            spec_texts = [item.text.strip() for item in specs_items]
            for key, value in zip(spec_texts, spec_texts[1:]):
                if key in SPEC_KEYS:
                    carData[key] = value

            # Log car data and add to master list
            print(f"Option {idx}: {carData}")
            carList.append(carData)
            break  # Exit retry loop on success

        except Exception as e:
            # Deliberately broad: any failure on this page (timeout, missing
            # container, stale element, ...) just triggers another attempt.
            print(f"Attempt {attempt + 1} failed for car {idx}: {e}")
            if attempt + 1 < MAX_ATTEMPTS:
                # Back off a random interval, but only if a retry will follow
                time.sleep(random.uniform(1, 3))
    else:
        # for/else: reached only when no attempt hit `break`, i.e. all failed
        failed_urls.append(car_url)
        print(f"Giving up on car {idx} after {MAX_ATTEMPTS} attempts: {car_url}")

# -------------------------------
# Step 3: Save all scraped data to a CSV file
# Step 4: Close the browser session
# -------------------------------
# The save runs inside try/finally so the browser is shut down even when
# building the DataFrame or writing the CSV raises (e.g. the file is locked
# or the directory is not writable); otherwise the Chrome session would be
# left running.
try:
    df = pd.DataFrame(carList)  # One row per scraped car; missing specs become NaN
    df.to_csv("hyundai_cars_mumbai.csv", index=False)  # Save as CSV without the index column
    print(f"\nAll {len(carList)} cars saved to 'hyundai_cars_mumbai.csv'")
finally:
    driver.quit()  # Shut down the browser



Currently visible cars: 120 | Collected unique: 120
Currently visible cars: 291 | Collected unique: 291
Currently visible cars: 291 | Collected unique: 291
Currently visible cars: 291 | Collected unique: 291
Currently visible cars: 291 | Collected unique: 291
Reached bottom of the listings.

Total unique car URLs collected: 291

Option 0: {'Car Model': '2019 Hyundai Grand i10 SPORTZ 1.2 KAPPA VTVT', 'Location': 'Goregaon, Mumbai', 'Price': '₹3.55 lakh', 'Fuel': 'CNG', 'KM driven': '82,657 km', 'Transmission': 'Manual'}
Option 1: {'Car Model': '2016 Hyundai Elite i20 SPORTZ 1.2', 'Location': 'Goregaon, Mumbai', 'Price': '₹3.24 lakh', 'Fuel': 'Petrol', 'KM driven': '82,473 km', 'Transmission': 'Manual'}
Option 2: {'Car Model': '2021 Hyundai AURA S 1.2 CNG', 'Location': 'Regency Anantam, Dombivli East', 'Price': '₹5.8 lakh', 'Fuel': 'CNG', 'KM driven': '6,916 km', 'Transmission': 'Manual'}
Option 3: {'Car Model': '2022 Hyundai AURA S 1.2 CNG', 'Location': 'Regency Anantam, Dombivli East',