In [1]:
import requests
from bs4 import BeautifulSoup
import re
import csv
import time

In [2]:
# --- CONFIGURATION ---
START_URL = "https://www.pakwheels.com/used-cars/search/-/mk_honda/md_civic/cert_pakwheels-inspected/"
OUTPUT_FILE = "pakwheels_gold_data.csv"
MAX_PAGES = 3 

# The "Perfect Disguise" Headers (Keep these!)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

def get_soup(url):
    try:
        # We are using standard requests now
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")
    except Exception as e:
        print(f"Error fetching {url}: {e}")
        return None

def extract_car_data(car_url):
    """ Visits a single car page and extracts data. """
    soup = get_soup(car_url)
    if not soup: return None
    
    data = {
        'url': car_url,
        'price': 'N/A',
        'description': 'N/A',
        'inspection_score': 'N/A'
    }

    # 1. Get Price
    try:
        price_box = soup.find(class_="price-box")
        if price_box:
            data['price'] = price_box.get_text(strip=True)
    except: pass

    # 2. Get Description (Sibling Strategy)
    try:
        header = soup.find(id="scroll_seller_comments")
        if header:
            desc_div = header.find_next_sibling("div")
            if desc_div:
                data['description'] = desc_div.get_text(separator=" ", strip=True)
    except: pass

    # 3. Get Inspection Score
    try:
        # Find report link
        report_link = soup.find("a", href=re.compile("carsure-reports"))
        if report_link:
            report_url = report_link['href']
            if not report_url.startswith("http"):
                report_url = "https://www.pakwheels.com" + report_url
            
            # Deep Dive into Report
            report_soup = get_soup(report_url)
            if report_soup:
                # Look for "X.X / 10"
                score_pattern = re.compile(r"(\d+\.\d+)\s*/\s*10")
                found_text = report_soup.find(string=score_pattern)
                if found_text:
                    match = score_pattern.search(found_text)
                    if match:
                        data['inspection_score'] = match.group(1)
                        print(f"   -> Score Found: {data['inspection_score']}/10")
    except Exception as e:
        print(f"   -> Error getting score: {e}")

    return data

# --- MAIN EXECUTION ---
print(f"--- STARTING GOLD VACUUM (Requests Mode) ---")
print(f"Target: {START_URL}")

with open(OUTPUT_FILE, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['url', 'price', 'description', 'inspection_score'])
    writer.writeheader()

    current_url = START_URL
    page_count = 0

    while current_url and page_count < MAX_PAGES:
        page_count += 1
        print(f"\nScanning Page {page_count}: {current_url}")
        
        soup = get_soup(current_url)
        if not soup: break

        # --- UPDATED LINK FINDER (The Fix) ---
        # Instead of trusting one class name, we find ALL links and filter them.
        all_links = soup.find_all("a", href=True)
        car_urls = []
        
        for link in all_links:
            href = link['href']
            # Heuristic: Valid car links contain '/used-cars/' but NOT 'search' or 'payment'
            if "/used-cars/" in href and "search" not in href and "pakwheels-inspected" not in href:
                full_link = "https://www.pakwheels.com" + href if not href.startswith("http") else href
                
                # Avoid duplicates in the list
                if full_link not in car_urls:
                    car_urls.append(full_link)

        print(f"Found {len(car_urls)} potential car ads. Processing...")
        
        if len(car_urls) == 0:
            print("WARNING: Still found 0 ads. Saving HTML debug file.")
            with open("debug_fail.html", "w", encoding="utf-8") as f:
                f.write(str(soup))
            break

        for link in car_urls:
            print(f"Processing: {link[-30:]}...") # Print last 30 chars of URL
            
            car_data = extract_car_data(link)
            
            if car_data:
                writer.writerow(car_data)
                time.sleep(1) # Be polite

        # Find Next Page
        next_btn = soup.find("li", class_="next_page")
        if next_btn and next_btn.find("a"):
            next_url_part = next_btn.find("a")['href']
            current_url = "https://www.pakwheels.com" + next_url_part if not next_url_part.startswith("http") else next_url_part
        else:
            print("\nNo 'Next Page' button found. Job Complete.")
            current_url = None

print("\n--- VACUUM COMPLETE ---")

--- STARTING GOLD VACUUM (Requests Mode) ---
Target: https://www.pakwheels.com/used-cars/search/-/mk_honda/md_civic/cert_pakwheels-inspected/

Scanning Page 1: https://www.pakwheels.com/used-cars/search/-/mk_honda/md_civic/cert_pakwheels-inspected/


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Found 0 potential car ads. Processing...

--- VACUUM COMPLETE ---
