# CARFAX WEB SCRAPING 2
- Web Scraping for Ford Mach-E and Edge MY 21-24 from CarFax


In [19]:
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select  # Add this line

import pandas as pd
import time
import random


In [5]:
AUTH = 
SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'
def main():
    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        print('Connected! Navigating...')
        driver.get("https://www.carfax.com/cars-for-sale")
        print('Taking page screenshot to file page.png')
        driver.get_screenshot_as_file('./page.png')
        print('Navigated! Scraping page content...')
        html = driver.page_source
        print(html)
if __name__ == '__main__':
  main()

Connecting to Scraping Browser...
Connected! Navigating...
Taking page screenshot to file page.png
Navigated! Scraping page content...
<html lang="en"><head><script async="" src="https://tags.creativecdn.com/QSd357enBhkveALlVkv9.js"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/destination?id=AW-973448781&amp;l=dataLayer&amp;cx=c&amp;gtm=45He4b70v72113301za200"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/destination?id=AW-975668885&amp;l=dataLayer&amp;cx=c&amp;gtm=45He4b70v72113301za200"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/destination?id=DC-9112615&amp;l=dataLayer&amp;cx=c&amp;gtm=45He4b70v72113301za200"></script><script type="text/javascript" async="" src="https://static.criteo.net/js/ld/ld.js"></script><script type="text/javascript" async="" src="https://www.googletagmanager.com/gtag/js?id=G-KN603HK3Y1&amp;l=dataLayer&amp;cx=c&amp;gtm=4

In [14]:
# Create a function that outputs a random number between 0.75 and 1.25
def get_random_delay():
    """Returns a random float between 0.75 and 1.25 seconds"""
    return random.uniform(0.75, 1.25)


In [57]:
def search_carfax(driver, make='Ford', model='Mustang Mach-E', min_year='2021', zip_code='30309', radius='Unlimited'):
    """Navigate to CarFax and perform search"""
    try:
        print("Attempting to search CarFax...")
        # Navigate to main page
        driver.get("https://www.carfax.com/cars-for-sale")
        time.sleep(2)  # Brief pause for page load
        
        print("Connected to website")
        
        print("Finding and selecting make...")
        make_input = driver.find_element(by=By.ID, value="undefined-make-input")
        select_make = Select(make_input)
        make_input.click()
        time.sleep(get_random_delay())

        print("Attempting to find 'Ford' element...")
        select_make.select_by_visible_text(make)
        time.sleep(get_random_delay())

        print("Make selected successfully!")

        print("Finding and selecting model...")
        model_input = driver.find_element(by=By.ID, value="undefined-model-input")
        select_model = Select(model_input)
        model_input.click()
        time.sleep(get_random_delay())

        print("Attempting to select model element...")
        select_model.select_by_visible_text(model)
        time.sleep(get_random_delay())

        print("Make and model selected successfully!")

        print("Finding and clicking location change button...")
        loc_change_button = driver.find_element(By.CSS_SELECTOR, "button.button[data-theme='blue-flat'][type='button']").click()
        time.sleep(get_random_delay())
        print("Location change button clicked successfully!")

        print("Finding and selecting zip code input...")
        zip_input = driver.find_element(By.CSS_SELECTOR, "input[aria-label='ZIP Code']")
        zip_input.clear()
        zip_input.send_keys(zip_code)
        time.sleep(get_random_delay())

        print("Zip code input filled successfully!")

        print("Finding and selecting distance dropdown...")
        distance_dropdown = driver.find_element(By.CSS_SELECTOR, "select[aria-label='Search Radius']")
        distance_dropdown.click()
        select_distance = Select(distance_dropdown)
        time.sleep(get_random_delay())

        print("Selecting 'Unlimited' distance option...")
        select_distance.select_by_visible_text('Unlimited')
        time.sleep(get_random_delay())

        print("Distance selected successfully!")

        
        #Take screenshot of zip code input
        driver.get_screenshot_as_file('./zip_code_input.png')


        # print("Finding and clicking apply button...")
        # apply_button = driver.find_element(By.CSS_SELECTOR, "button.button[data-theme='blue'][data-large='true']").click()
        # apply_button.click()
        # time.sleep(get_random_delay())
        # print("Apply button clicked successfully!")
        print("Handling overlay and clicking apply button...")
        try:
            # Close overlay if present
            overlay_close_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.overlay[title='Close dialog']"))
            )
            overlay_close_button.click()
            WebDriverWait(driver, 10).until(
                EC.invisibility_of_element_located((By.CSS_SELECTOR, "button.overlay[title='Close dialog']"))
            )

            # Now find and click the Apply button using JavaScript
            apply_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "button.button[data-theme='blue'][data-large='true']"))
            )
            driver.execute_script("arguments[0].click();", apply_button)
            print("Apply button clicked successfully!")
            time.sleep(get_random_delay())

        except Exception as e:
            print(f"Error clicking apply button: {str(e)}") 
            return False





        return True
        
    except Exception as e:
        print(f"Error during search: {str(e)}")
        return False

In [4]:
def scrape_listings(driver):
    """Scrape vehicle listings from all pages"""
    all_vehicles = []
    page = 1
    
    while True:
        print(f"Scraping page {page}...")
        
        # Wait for listings to appear
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "listing-card"))
        )
        
        # Find all vehicle listings on current page
        listings = driver.find_elements(By.CLASS_NAME, "listing-card")
        
        for listing in listings:
            try:
                # Get basic info from listing card
                vehicle = {
                    'CARFAX Value': listing.find_element(By.CLASS_NAME, "price").text,
                    'Mileage': listing.find_element(By.CLASS_NAME, "mileage").text,
                    'Location': listing.find_element(By.CLASS_NAME, "dealer-name").text,
                    'VIN #': listing.find_element(By.CLASS_NAME, "vin").text
                }
                
                # Get detailed info by clicking into listing
                detailed_info = get_detailed_vehicle_info(driver, listing)
                if detailed_info:
                    vehicle.update(detailed_info)
                    all_vehicles.append(vehicle)
                
                time.sleep(1)  # Brief pause between listings
                
            except Exception as e:
                print(f"Error scraping listing: {str(e)}")
                continue
        
        # Check for next page
        if not handle_pagination(driver):
            break
            
        page += 1
    
    return all_vehicles

In [5]:
def get_detailed_vehicle_info(driver, listing):
    """Get detailed information by clicking into each vehicle listing"""
    try:
        # Store the main window handle
        main_window = driver.current_window_handle
        
        # Click on the listing to open in new tab
        listing_link = listing.find_element(By.CLASS_NAME, "listing-link")
        listing_link.click()
        time.sleep(3)
        
        # Switch to the new tab
        new_window = [window for window in driver.window_handles if window != main_window][0]
        driver.switch_to.window(new_window)
        
        # Wait for details to load
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "vehicle-info"))
        )
        
        # Extract detailed information
        vehicle = {}
        try:
            # Basic info (already have some from listing page)
            title_elem = driver.find_element(By.CLASS_NAME, "vehicle-title")
            title_parts = title_elem.text.split()
            vehicle['Year'] = title_parts[0]
            vehicle['Make'] = 'Ford'
            vehicle['Model'] = ' '.join(title_parts[2:]) if len(title_parts) > 2 else 'N/A'
            
            # Try to get trim level
            try:
                vehicle['Trim'] = driver.find_element(By.CLASS_NAME, "trim-level").text
            except:
                vehicle['Trim'] = 'N/A'
            
            # Vehicle history details
            try:
                history_section = driver.find_element(By.CLASS_NAME, "vehicle-history")
                vehicle['Accidents'] = history_section.find_element(By.XPATH, "//span[contains(text(), 'Accidents')]/../following-sibling::div").text
                vehicle['Damage'] = history_section.find_element(By.XPATH, "//span[contains(text(), 'Damage')]/../following-sibling::div").text
                vehicle['Service History'] = history_section.find_element(By.XPATH, "//span[contains(text(), 'Service History')]/../following-sibling::div").text
                vehicle['# Previous Owners'] = history_section.find_element(By.XPATH, "//span[contains(text(), 'Owner')]/../following-sibling::div").text
            except:
                vehicle['Accidents'] = 'N/A'
                vehicle['Damage'] = 'N/A'
                vehicle['Service History'] = 'N/A'
                vehicle['# Previous Owners'] = 'N/A'
            
            # Additional details
            try:
                specs_section = driver.find_element(By.CLASS_NAME, "vehicle-specs")
                vehicle['Drive Type'] = specs_section.find_element(By.XPATH, "//span[contains(text(), 'Drive Type')]/../following-sibling::div").text
            except:
                vehicle['Drive Type'] = 'N/A'
            
            # Recalls and usage
            try:
                vehicle['Open Recalls'] = driver.find_element(By.XPATH, "//span[contains(text(), 'Open Recalls')]/../following-sibling::div").text
                vehicle['Personal Vehicle'] = driver.find_element(By.XPATH, "//span[contains(text(), 'Personal Use')]/../following-sibling::div").text
            except:
                vehicle['Open Recalls'] = 'N/A'
                vehicle['Personal Vehicle'] = 'N/A'
            
        finally:
            # Close the detail tab and switch back to main window
            driver.close()
            driver.switch_to.window(main_window)
            
        return vehicle
        
    except Exception as e:
        print(f"Error getting detailed info: {str(e)}")
        if driver.current_window_handle != main_window:
            driver.close()
            driver.switch_to.window(main_window)
        return None

In [6]:
def handle_pagination(driver):
    """Check and handle pagination if multiple pages exist"""
    try:
        next_button = driver.find_element(By.CLASS_NAME, "pagination-next")
        if "disabled" not in next_button.get_attribute("class"):
            next_button.click()
            time.sleep(3)
            return True
        return False
    except:
        return False

In [58]:
def main():
    # Your existing Bright Data setup
    AUTH = open('keys/brightdata_key.txt', 'r').read()
    SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'
    
    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    
    # Models to search for
    models = ['Mustang Mach-E', 'Edge']
    all_vehicles = []
    
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        print('Connected! Starting scraping...')
        
        for model in models:
            # Perform search for each model
            if search_carfax(driver, model=model):
                # Scrape the results
                vehicles = scrape_listings(driver)
                all_vehicles.extend(vehicles)
                print(f"Found {len(vehicles)} vehicles for {model}")
                time.sleep(2)  # Brief pause between searches
    
    # Save results to CSV
    if all_vehicles:
        df = pd.DataFrame(all_vehicles)
        df.to_csv('ford_vehicles.csv', index=False)
        print(f"Saved {len(all_vehicles)} vehicles to ford_vehicles.csv")
    else:
        print("No vehicles found")

if __name__ == '__main__':
    main()

Connecting to Scraping Browser...
Connected! Starting scraping...
Attempting to search CarFax...
Connected to website
Finding and selecting make...
Attempting to find 'Ford' element...
Make selected successfully!
Finding and selecting model...
Attempting to select model element...
Make and model selected successfully!
Finding and clicking location change button...
Location change button clicked successfully!
Finding and selecting zip code input...
Zip code input filled successfully!
Finding and selecting distance dropdown...
Selecting 'Unlimited' distance option...
Distance selected successfully!
Handling overlay and clicking apply button...
Error clicking apply button: Message: element click intercepted: Element <button aria-hidden="true" class="overlay overlay__visible" tabindex="-1" title="Close dialog"></button> is not clickable at point (768, 347). Other element would receive the click: <span>...</span>
  (Session info: chrome=130.0.6723.92)

Attempting to search CarFax...
Connect

KeyboardInterrupt: 