In [1]:
import csv
import os
import re
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [None]:

# --- CONFIGURATION ---
# Search-result pages to crawl, one entry per make. None of these URLs carries
# the "pakwheels-inspected" filter, so every listing is picked up.
# Uncomment a line to include that make in the run.
TARGET_URLS = [
    # "https://www.pakwheels.com/used-cars/search/-/mk_honda/",
    "https://www.pakwheels.com/used-cars/search/-/mk_toyota/",
    # "https://www.pakwheels.com/used-cars/search/-/mk_suzuki/",
    # "https://www.pakwheels.com/used-cars/search/-/mk_hyundai/",
    # "https://www.pakwheels.com/used-cars/search/-/mk_kia/"
]

# CSV file the scraped rows are appended to.
OUTPUT_FILE = "pakwheels_silver_data.csv"
# Upper bound on result pages walked per URL (20 pages = ~500-600 cars per brand).
MAX_PAGES_PER_URL = 20

# --- SETUP CHROME ---
options = webdriver.ChromeOptions()
for flag in (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
    # "--headless",  # Optional: enable this to run in the background
):
    options.add_argument(flag)

# Resolve the chromedriver binary first, then start the browser session with it.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Args:
        text: The raw string to normalise; may be ``None`` or empty.

    Returns:
        The normalised string, or the sentinel ``"N/A"`` when ``text`` is
        ``None``, empty, or consists solely of whitespace.
    """
    if not text:
        return "N/A"
    collapsed = " ".join(text.split())
    # Whitespace-only input collapses to "", which must map to the same
    # sentinel as missing input so the CSV never gets a blank cell.
    return collapsed or "N/A"

# Column order of the output CSV.
CSV_FIELDS = ['url', 'title_version', 'model_year', 'mileage', 'engine',
              'transmission', 'fuel', 'price', 'description']

# XPath of each icon-table spec on a listing page, keyed by output column.
SPEC_XPATHS = {
    'model_year': "//span[contains(@class, 'year')]/following-sibling::p",
    'mileage': "//span[contains(@class, 'millage')]/following-sibling::p",
    'fuel': "//span[contains(@class, 'type')]/following-sibling::p",
    'transmission': "//span[contains(@class, 'transmission')]/following-sibling::p",
}

# Boilerplate line removed from the seller's description text.
SELLER_FOOTER = "Mention PakWheels.com when calling Seller to get a good deal"

LIST_WAIT_SECONDS = 10    # max wait for the result list to appear
DETAIL_PAGE_PAUSE = 0.5   # pause after opening a listing page
NEXT_PAGE_PAUSE = 2       # pause after opening the next result page


def _text_or_na(root, by, selector):
    """Return the stripped text of the first match under ``root``, or "N/A".

    Only Selenium errors (missing/stale element, ...) are treated as "field
    absent"; anything else, including KeyboardInterrupt, propagates.
    """
    try:
        text = root.find_element(by, selector).text.strip()
    except WebDriverException:
        return "N/A"
    return text or "N/A"


def _engine_capacity(driver):
    """Return the <li> text following the "Engine Capacity" label, or "N/A"."""
    try:
        items = driver.find_element(By.ID, "scroll_car_detail").find_elements(By.TAG_NAME, "li")
        # The spec list is a flat run of <li> items where the value item
        # directly follows its label item, so walk adjacent pairs.
        for label_item, value_item in zip(items, items[1:]):
            if "Engine Capacity" in label_item.text:
                return value_item.text.strip() or "N/A"
    except WebDriverException:
        pass
    return "N/A"


def _seller_description(driver):
    """Return the whitespace-normalised seller comments, or "N/A"."""
    try:
        header = driver.find_element(By.ID, "scroll_seller_comments")
        raw = header.find_element(By.XPATH, "./following-sibling::div[1]").text
    except WebDriverException:
        return "N/A"
    return clean_text(raw.replace(SELLER_FOOTER, "").strip())


def _harvest_listing_links(driver):
    """Return the unique used-car listing URLs on the current result page.

    Each anchor's href is read once, and first-seen order is preserved so
    listings are visited in page order.
    """
    hrefs = (ad.get_attribute("href") for ad in driver.find_elements(By.CSS_SELECTOR, "a.car-name"))
    return list(dict.fromkeys(href for href in hrefs if href and "/used-cars/" in href))


def _scrape_car(driver, link):
    """Open ``link`` and return one CSV row (dict keyed by ``CSV_FIELDS``).

    Fields that cannot be located are left as "N/A"; a failure to load the
    page itself raises and is handled by the caller.
    """
    driver.get(link)
    # Short pause is enough since no heavy inspection report is loaded.
    time.sleep(DETAIL_PAGE_PAUSE)

    data = dict.fromkeys(CSV_FIELDS, 'N/A')
    data['url'] = link
    for field, xpath in SPEC_XPATHS.items():
        data[field] = _text_or_na(driver, By.XPATH, xpath)
    data['engine'] = _engine_capacity(driver)
    data['title_version'] = _text_or_na(driver, By.TAG_NAME, "h1")
    data['price'] = _text_or_na(driver, By.CLASS_NAME, "price-box")
    data['description'] = _seller_description(driver)
    return data


try:
    print("--- STARTING SILVER VACUUM (Volume Scraper) ---")

    # A zero-byte file (e.g. left behind by an aborted run) still needs a header.
    needs_header = not os.path.isfile(OUTPUT_FILE) or os.path.getsize(OUTPUT_FILE) == 0

    # Open in append mode so repeated runs accumulate rows in one file.
    with open(OUTPUT_FILE, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=CSV_FIELDS)

        if needs_header:
            writer.writeheader()
            print("-> New file created. Header written.")
        else:
            print("-> Appending to existing file...")

        for target_url in TARGET_URLS:
            print(f"\n\n>>> TARGETING: {target_url}")
            driver.get(target_url)

            for page_count in range(1, MAX_PAGES_PER_URL + 1):
                print(f"\n--- Processing Page {page_count} ---")

                try:
                    WebDriverWait(driver, LIST_WAIT_SECONDS).until(
                        EC.presence_of_element_located((By.CLASS_NAME, "well"))
                    )
                except TimeoutException:
                    print("Timeout waiting for list. Moving on.")
                    break

                # Harvest links
                car_urls = _harvest_listing_links(driver)
                print(f"-> Found {len(car_urls)} cars.")

                # Capture the next-page URL now: visiting the listings below
                # navigates away from this result page.
                try:
                    next_page_url = driver.find_element(By.CSS_SELECTOR, "li.next_page a").get_attribute("href")
                except WebDriverException:
                    next_page_url = None

                # Visit cars
                for link in car_urls:
                    try:
                        writer.writerow(_scrape_car(driver, link))
                    except Exception as e:
                        # Best effort: one bad listing must not stop the batch,
                        # but report it instead of dropping it silently.
                        print(f"   ! Skipped {link} ({type(e).__name__})")

                if next_page_url:
                    driver.get(next_page_url)
                    time.sleep(NEXT_PAGE_PAUSE)
                else:
                    print("End of list.")
                    break

except Exception as e:
    print(f"CRITICAL ERROR: {e}")
finally:
    # Runs on normal completion, on errors, and on Ctrl-C alike.
    driver.quit()
    print("Batch Complete.")

--- STARTING SILVER VACUUM (Volume Scraper) ---
-> Appending to existing file...


>>> TARGETING: https://www.pakwheels.com/used-cars/search/-/mk_toyota/

--- Processing Page 1 ---
-> Found 25 cars.

--- Processing Page 2 ---
-> Found 26 cars.

--- Processing Page 3 ---
-> Found 25 cars.

--- Processing Page 4 ---
-> Found 26 cars.

--- Processing Page 5 ---
-> Found 26 cars.

--- Processing Page 6 ---
-> Found 26 cars.

--- Processing Page 7 ---
-> Found 31 cars.

--- Processing Page 8 ---
-> Found 31 cars.
