In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import re

In [4]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import re

# --- CONFIGURATION ---
# Search-results page the crawl starts from (the URL filters on make "honda",
# model "civic" and the "pakwheels-inspected" certification).
START_URL = "https://www.pakwheels.com/used-cars/search/-/mk_honda/md_civic/cert_pakwheels-inspected/"
# CSV file that receives one row per scraped listing.
OUTPUT_FILE = "pakwheels_gold_data_final.csv"
# Upper bound on the number of result pages walked in one run.
MAX_PAGES_TO_SCRAPE = 5

# --- SETUP CHROME ---
options = webdriver.ChromeOptions()
for chrome_flag in (
    "--start-maximized",
    "--disable-blink-features=AutomationControlled",
    # "--headless",  # Uncomment if you want it invisible later
):
    options.add_argument(chrome_flag)

# The driver path returned by webdriver_manager is handed to Selenium through
# a Service object; the browser window opens as soon as Chrome() is built.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: Raw string to normalise; may be ``None`` or empty.

    Returns:
        The text with leading/trailing whitespace removed and internal
        whitespace runs (spaces, tabs, newlines) reduced to one space, or
        ``"N/A"`` when nothing is left (``None``, empty or whitespace-only
        input).
    """
    if not text:
        return "N/A"
    # split() without arguments discards all whitespace, so whitespace-only
    # input joins back to "" and is reported with the same placeholder as
    # missing text instead of leaving an empty CSV cell.
    return " ".join(text.split()) or "N/A"

def extract_report_score(driver):
    """Read the "<score> / 10" rating from the page currently loaded in *driver*.

    Args:
        driver: Selenium WebDriver whose current page is searched.

    Returns:
        The first decimal score found in the page's body text, as a string
        (e.g. ``"8.5"``), or ``"N/A"`` when no such pattern exists or the
        page text cannot be read.
    """
    try:
        body_text = driver.find_element(By.TAG_NAME, "body").text
        # (?!\d) stops "8.5/100" from being mistaken for a score out of 10.
        match = re.search(r"(\d+\.\d+)\s*/\s*10(?!\d)", body_text)
    except Exception:
        # Best effort: an unreadable page means "no score". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        return "N/A"
    return match.group(1) if match else "N/A"

# --- SCRAPER ---
# Walks up to MAX_PAGES_TO_SCRAPE result pages starting at START_URL, opens
# every listing linked from each page and writes one CSV row per listing to
# OUTPUT_FILE (opened in 'w' mode, so each run overwrites the file).

# XPath template for the icon table: the value sits in the <p> that follows a
# <span> whose class contains the given fragment.
_XPATH_ICON_VALUE = "//span[contains(@class, '{}')]/following-sibling::p"
# (CSV column, class fragment) pairs for the icon table. 'millage' is the
# spelling the selector must use according to the original author's note.
_ICON_FIELDS = (
    ('model_year', 'year'),
    ('mileage', 'millage'),
    ('fuel', 'type'),
    ('transmission', 'transmission'),
)
# Boilerplate sentence stripped from the seller-comments text.
_SELLER_BOILERPLATE = "Mention PakWheels.com when calling Seller to get a good deal"


def _text_or_default(driver, by, selector, default="N/A"):
    """Return the stripped text of the first element matching the locator.

    Every field is scraped best-effort, so any lookup failure yields
    *default* instead of raising. Only Exception is caught, which keeps
    Ctrl-C / kernel interrupts working during a long run.
    """
    try:
        return driver.find_element(by, selector).text.strip()
    except Exception:
        return default


def _collect_listing_urls(driver):
    """Return the unique listing URLs on the current results page, in page order."""
    hrefs = (
        anchor.get_attribute("href")
        for anchor in driver.find_elements(By.CSS_SELECTOR, "a.car-name")
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # visit order is reproducible between runs.
    return list(dict.fromkeys(h for h in hrefs if h and "/used-cars/" in h))


def _next_page_url(driver):
    """Return the href of the "next page" link, or None when there is none."""
    try:
        return driver.find_element(By.CSS_SELECTOR, "li.next_page a").get_attribute("href")
    except Exception:
        return None


def _engine_capacity(driver):
    """Return the list item that follows the "Engine Capacity" label, or "N/A"."""
    try:
        items = driver.find_element(By.ID, "scroll_car_detail").find_elements(By.TAG_NAME, "li")
        texts = [item.text.strip() for item in items]
    except Exception:
        return "N/A"
    # Labels and values are consecutive <li> entries, so scan adjacent pairs.
    for label, value in zip(texts, texts[1:]):
        if "Engine Capacity" in label:
            return value
    return "N/A"


def _seller_description(driver):
    """Return the cleaned seller comments of the current listing, or "N/A"."""
    try:
        header = driver.find_element(By.ID, "scroll_seller_comments")
        raw = header.find_element(By.XPATH, "./following-sibling::div[1]").text
    except Exception:
        return "N/A"
    return clean_text(raw.replace(_SELLER_BOILERPLATE, "").strip())


def _inspection_score(driver):
    """Follow the listing's inspection-report link and return its score.

    This navigates the browser away from the listing page, so it must be
    the last field scraped for a listing.
    """
    try:
        report_url = driver.find_element(
            By.CSS_SELECTOR, "a[href*='carsure-reports']"
        ).get_attribute("href")
        driver.get(report_url)
        time.sleep(1)
        return extract_report_score(driver)
    except Exception:
        return "N/A"


def _scrape_car(driver, link):
    """Open the listing at *link* and return its CSV row as a dict.

    Individual fields fall back to "N/A"; only a failure to load the
    listing page itself propagates to the caller.
    """
    driver.get(link)
    time.sleep(1)  # Fast but safe

    data = {
        'url': link,
        'title_version': 'N/A', 'model_year': 'N/A',
        'mileage': 'N/A', 'engine': 'N/A', 'transmission': 'N/A', 'fuel': 'N/A',
        'price': 'N/A', 'description': 'N/A', 'inspection_score': 'N/A'
    }

    # --- A. ICONS TABLE (Year, Mileage, Fuel, Trans) ---
    for field, class_fragment in _ICON_FIELDS:
        data[field] = _text_or_default(driver, By.XPATH, _XPATH_ICON_VALUE.format(class_fragment))

    # --- B. SPEC LIST (Engine Capacity) ---
    data['engine'] = _engine_capacity(driver)

    # --- C. TITLE & PRICE ---
    data['title_version'] = _text_or_default(driver, By.TAG_NAME, "h1")
    data['price'] = _text_or_default(driver, By.CLASS_NAME, "price-box")

    # --- D. DESCRIPTION ---
    data['description'] = _seller_description(driver)

    # --- E. SCORE (leaves the listing page, keep last) ---
    data['inspection_score'] = _inspection_score(driver)
    return data


try:
    print("--- STARTING GOLD VACUUM FINAL (With 'Millage' Fix) ---")
    driver.get(START_URL)

    with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as file:
        fieldnames = ['url', 'title_version', 'model_year', 'mileage', 'engine', 'transmission', 'fuel', 'price', 'description', 'inspection_score']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        page_count = 0

        while page_count < MAX_PAGES_TO_SCRAPE:
            page_count += 1
            print(f"\n--- PREPARING PAGE {page_count} ---")

            # A timeout here is reported by the outer handler and ends the run.
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "well")))

            # 1. HARVEST LINKS
            car_urls = _collect_listing_urls(driver)
            print(f"-> Found {len(car_urls)} cars.")

            # 2. SECURE NEXT PAGE (must be read before leaving the results page)
            next_page_url = _next_page_url(driver)

            # 3. VISIT CARS
            for link in car_urls:
                print(f"Visiting: {link[-30:]}...")
                try:
                    row = _scrape_car(driver, link)
                except Exception as e:
                    # One listing that fails to load must not discard the rest.
                    print(f"Skipping car: {e}")
                    continue
                writer.writerow(row)
                # Push the row to disk now so an aborted run keeps what it scraped.
                file.flush()

            # 4. NEXT PAGE (skip the load when the page budget is already spent)
            if not next_page_url or page_count >= MAX_PAGES_TO_SCRAPE:
                break
            driver.get(next_page_url)
            time.sleep(3)

except Exception as e:
    print(f"ERROR: {e}")
finally:
    # Runs on errors and on Ctrl-C alike, so the browser never stays open.
    driver.quit()
    print("Vacuum Complete.")

--- STARTING GOLD VACUUM FINAL (With 'Millage' Fix) ---

--- PREPARING PAGE 1 ---
-> Found 25 cars.
Visiting: or-sale-in-rawalpindi-10789686...
Visiting: 0-for-sale-in-karachi-10908996...
Visiting: 12-for-sale-in-lahore-10855869...
Visiting: 8-for-sale-in-karachi-10773820...
Visiting: or-sale-in-rawalpindi-10778379...
Visiting: 5-for-sale-in-karachi-10864618...
Visiting: 7-for-sale-in-karachi-10909673...
Visiting: 20-for-sale-in-lahore-10787406...
Visiting: 18-for-sale-in-lahore-10823857...
Visiting: for-sale-in-islamabad-10889590...
Visiting: 13-for-sale-in-lahore-10801349...
Visiting: for-sale-in-islamabad-10872301...
Visiting: 6-for-sale-in-karachi-10846814...
Visiting: -for-sale-in-peshawar-10716941...
Visiting: 13-for-sale-in-lahore-10882023...
Visiting: 21-for-sale-in-lahore-10873477...
Visiting: for-sale-in-islamabad-10772062...
Visiting: 7-for-sale-in-karachi-10771405...
Visiting: 19-for-sale-in-multan-10899649...
Visiting: 0-for-sale-in-karachi-10760297...
Visiting: or-sale-in

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import time
import re
import os # <--- NEW: Needed to check if file exists

# --- CONFIGURATION ---
# UNCOMMENT JUST ONE LINE AT A TIME TO RUN "ONE MAKE AT A TIME"
# (with every entry commented out the list is empty and nothing is scraped)
TARGET_URLS = [
    # "https://www.pakwheels.com/used-cars/search/-/mk_honda/cert_pakwheels-inspected/",
    # "https://www.pakwheels.com/used-cars/search/-/mk_toyota/cert_pakwheels-inspected/",
    # "https://www.pakwheels.com/used-cars/search/-/mk_suzuki/cert_pakwheels-inspected/", 
    # "https://www.pakwheels.com/used-cars/search/-/mk_hyundai/cert_pakwheels-inspected/",
    # "https://www.pakwheels.com/used-cars/search/-/mk_kia/cert_pakwheels-inspected/"
]

# CSV file shared by all batches; rows from each run are appended to it.
OUTPUT_FILE = "pakwheels_gold_data_MIXED.csv"
# Upper bound on the result pages walked for each entry in TARGET_URLS.
MAX_PAGES_PER_URL = 2

# --- SETUP CHROME ---
options = webdriver.ChromeOptions()
for chrome_flag in ("--start-maximized", "--disable-blink-features=AutomationControlled"):
    options.add_argument(chrome_flag)

# The driver path returned by webdriver_manager is handed to Selenium through
# a Service object; the browser window opens as soon as Chrome() is built.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

def clean_text(text):
    """Collapse every run of whitespace in *text* into a single space.

    Args:
        text: Raw string to normalise; may be ``None`` or empty.

    Returns:
        The text with leading/trailing whitespace removed and internal
        whitespace runs (spaces, tabs, newlines) reduced to one space, or
        ``"N/A"`` when nothing is left (``None``, empty or whitespace-only
        input).
    """
    if not text:
        return "N/A"
    # split() without arguments discards all whitespace, so whitespace-only
    # input joins back to "" and is reported with the same placeholder as
    # missing text instead of leaving an empty CSV cell.
    return " ".join(text.split()) or "N/A"

def extract_report_score(driver):
    """Read the "<score> / 10" rating from the page currently loaded in *driver*.

    Args:
        driver: Selenium WebDriver whose current page is searched.

    Returns:
        The first decimal score found in the page's body text, as a string
        (e.g. ``"8.5"``), or ``"N/A"`` when no such pattern exists or the
        page text cannot be read.
    """
    try:
        body_text = driver.find_element(By.TAG_NAME, "body").text
        # (?!\d) stops "8.5/100" from being mistaken for a score out of 10.
        match = re.search(r"(\d+\.\d+)\s*/\s*10(?!\d)", body_text)
    except Exception:
        # Best effort: an unreadable page means "no score". Catching
        # Exception (not a bare except) lets Ctrl-C / SystemExit through.
        return "N/A"
    return match.group(1) if match else "N/A"

# --- BATCH SCRAPER ---
# For every URL in TARGET_URLS, walks up to MAX_PAGES_PER_URL result pages,
# opens each listing linked from them and appends one CSV row per listing to
# OUTPUT_FILE, so several runs (one make at a time) accumulate in one file.

# XPath template for the icon table: the value sits in the <p> that follows a
# <span> whose class contains the given fragment.
_XPATH_ICON_VALUE = "//span[contains(@class, '{}')]/following-sibling::p"
# (CSV column, class fragment) pairs for the icon table; note the selector
# for mileage is spelled 'millage'.
_ICON_FIELDS = (
    ('model_year', 'year'),
    ('mileage', 'millage'),
    ('fuel', 'type'),
    ('transmission', 'transmission'),
)
# Boilerplate sentence stripped from the seller-comments text.
_SELLER_BOILERPLATE = "Mention PakWheels.com when calling Seller to get a good deal"


def _text_or_default(driver, by, selector, default="N/A"):
    """Return the stripped text of the first element matching the locator.

    Every field is scraped best-effort, so any lookup failure yields
    *default* instead of raising. Only Exception is caught, which keeps
    Ctrl-C / kernel interrupts working during a long run.
    """
    try:
        return driver.find_element(by, selector).text.strip()
    except Exception:
        return default


def _collect_listing_urls(driver):
    """Return the unique listing URLs on the current results page, in page order."""
    hrefs = (
        anchor.get_attribute("href")
        for anchor in driver.find_elements(By.CSS_SELECTOR, "a.car-name")
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # visit order is reproducible between runs.
    return list(dict.fromkeys(h for h in hrefs if h and "/used-cars/" in h))


def _next_page_url(driver):
    """Return the href of the "next page" link, or None when there is none."""
    try:
        return driver.find_element(By.CSS_SELECTOR, "li.next_page a").get_attribute("href")
    except Exception:
        return None


def _engine_capacity(driver):
    """Return the list item that follows the "Engine Capacity" label, or "N/A"."""
    try:
        items = driver.find_element(By.ID, "scroll_car_detail").find_elements(By.TAG_NAME, "li")
        texts = [item.text.strip() for item in items]
    except Exception:
        return "N/A"
    # Labels and values are consecutive <li> entries, so scan adjacent pairs.
    for label, value in zip(texts, texts[1:]):
        if "Engine Capacity" in label:
            return value
    return "N/A"


def _seller_description(driver):
    """Return the cleaned seller comments of the current listing, or "N/A"."""
    try:
        header = driver.find_element(By.ID, "scroll_seller_comments")
        raw = header.find_element(By.XPATH, "./following-sibling::div[1]").text
    except Exception:
        return "N/A"
    return clean_text(raw.replace(_SELLER_BOILERPLATE, "").strip())


def _inspection_score(driver):
    """Follow the listing's inspection-report link and return its score.

    This navigates the browser away from the listing page, so it must be
    the last field scraped for a listing.
    """
    try:
        report_url = driver.find_element(
            By.CSS_SELECTOR, "a[href*='carsure-reports']"
        ).get_attribute("href")
        driver.get(report_url)
        time.sleep(1)
        return extract_report_score(driver)
    except Exception:
        return "N/A"


def _scrape_car(driver, link):
    """Open the listing at *link* and return its CSV row as a dict.

    Individual fields fall back to "N/A"; only a failure to load the
    listing page itself propagates to the caller.
    """
    driver.get(link)
    time.sleep(1)

    data = {
        'url': link, 'title_version': 'N/A', 'model_year': 'N/A',
        'mileage': 'N/A', 'engine': 'N/A', 'transmission': 'N/A', 'fuel': 'N/A',
        'price': 'N/A', 'description': 'N/A', 'inspection_score': 'N/A'
    }

    # SPECS
    for field, class_fragment in _ICON_FIELDS:
        data[field] = _text_or_default(driver, By.XPATH, _XPATH_ICON_VALUE.format(class_fragment))
    data['engine'] = _engine_capacity(driver)

    # CORE DATA
    data['title_version'] = _text_or_default(driver, By.TAG_NAME, "h1")
    data['price'] = _text_or_default(driver, By.CLASS_NAME, "price-box")
    data['description'] = _seller_description(driver)

    # SCORE (leaves the listing page, keep last)
    data['inspection_score'] = _inspection_score(driver)
    return data


try:
    print("--- STARTING SINGLE-BATCH GOLD VACUUM ---")

    # A header is only needed when the file is missing OR exists but is empty;
    # checking existence alone would leave an empty file without a header.
    file_exists = os.path.isfile(OUTPUT_FILE) and os.path.getsize(OUTPUT_FILE) > 0

    # OPEN IN 'a' (APPEND) MODE
    with open(OUTPUT_FILE, mode='a', newline='', encoding='utf-8') as file:
        fieldnames = ['url', 'title_version', 'model_year', 'mileage', 'engine', 'transmission', 'fuel', 'price', 'description', 'inspection_score']
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # ONLY WRITE HEADER IF FILE IS NEW
        if not file_exists:
            writer.writeheader()
            print("-> New file created. Header written.")
        else:
            print("-> Found existing file. Appending new data...")

        for target_url in TARGET_URLS:
            print(f"\n\n>>> TARGETING: {target_url}")
            driver.get(target_url)

            page_count = 0
            while page_count < MAX_PAGES_PER_URL:
                page_count += 1
                print(f"\n--- Processing Page {page_count} ---")

                try:
                    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, "well")))
                except Exception:
                    print("Timeout waiting for list. Moving on.")
                    break

                car_urls = _collect_listing_urls(driver)
                print(f"-> Found {len(car_urls)} cars.")

                # Get Next Page (must be read before leaving the results page)
                next_page_url = _next_page_url(driver)

                # Visit Cars
                for link in car_urls:
                    print(f"Visiting: {link[-30:]}...")
                    try:
                        writer.writerow(_scrape_car(driver, link))
                        # Flush so the row really is on disk before the next
                        # listing; otherwise it sits in the file buffer.
                        file.flush()
                    except Exception as e:
                        print(f"Skipping car: {e}")
                        continue

                if not next_page_url:
                    print("End of list for this brand.")
                    break
                if page_count >= MAX_PAGES_PER_URL:
                    # Page budget spent: skip the pointless load + 3 s sleep.
                    break
                driver.get(next_page_url)
                time.sleep(3)

except Exception as e:
    print(f"CRITICAL ERROR: {e}")
finally:
    # Runs on errors and on Ctrl-C alike, so the browser never stays open.
    driver.quit()
    print("Batch Complete.")

--- STARTING SINGLE-BATCH GOLD VACUUM ---
-> Found existing file. Appending new data...


>>> TARGETING: https://www.pakwheels.com/used-cars/search/-/mk_kia/cert_pakwheels-inspected/

--- Processing Page 1 ---
-> Found 25 cars.
Visiting: or-sale-in-rawalpindi-10819479...
Visiting: 2-for-sale-in-karachi-10731686...
Visiting: for-sale-in-islamabad-10806448...
Visiting: 20-for-sale-in-lahore-10824922...
Visiting: 21-for-sale-in-lahore-10888656...
Visiting: 4-for-sale-in-karachi-10910070...
Visiting: or-sale-in-faisalabad-10498380...
Skipping car: HTTPConnectionPool(host='localhost', port=52434): Read timed out. (read timeout=120)
Visiting: 1-for-sale-in-karachi-10714040...
Visiting: 0-for-sale-in-karachi-10899670...
Visiting: or-sale-in-gujranwala-10907306...
Visiting: -for-sale-in-peshawar-10857089...
Visiting: 21-for-sale-in-lahore-10790588...
Visiting: or-sale-in-faisalabad-10903057...
Visiting: 20-for-sale-in-lahore-10786573...
Visiting: 1-for-sale-in-karachi-10793977...
Visiting: 4-f