# CARFAX WEB SCRAPING 3
- Web Scraping for Ford Mach-E and Edge MY 21-24 from CarFax by first initializing the search


In [1]:
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select  # Add this line
from bs4 import BeautifulSoup

import pandas as pd
import time
import random


In [13]:
def scrape_carfax(driver, url):
    # Navigate to the given URL
    print("Opening provided URL...")
    driver.get(url)
    driver.get_screenshot_as_file("images/step_1_page_loaded.png")  # Take screenshot after loading URL
    time.sleep(2)  # Brief pause for the page load

    # Set radius to "Unlimited" if needed
    print("Setting search radius to Unlimited...")
    try:
        set_distance_to_unlimited(driver)
    except Exception as e:
        print(f"Could not set search radius to Unlimited: {str(e)}")

    # Scrape listings
    print("Scraping vehicle listings...")
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    listings = soup.find_all('div', class_='srp-list-item')  # Adjust class if necessary

    # Initialize a DataFrame to store results
    columns = [
        "Make", "Year", "Model", "Trim", "VIN #", "CARFAX Value", "Mileage", 
        "Accidents", "Damage", "Service History", "# Previous Owners", 
        "Drive Type", "Open Recalls", "Personal Vehicle", "Location"
    ]
    data = pd.DataFrame(columns=columns)

    # Iterate over listings and extract details
    for listing in listings:
        try:
            make = "Ford"  # Predefined
            year = listing.find('span', {'class': 'year'}).text.strip()
            model = "Mustang Mach-E"  # Predefined in URL
            trim = listing.find('span', {'class': 'trim'}).text.strip()
            vin = listing['data-vin'] if 'data-vin' in listing.attrs else None
            carfax_value = listing.find('div', {'class': 'carfax-value'}).text.strip()
            mileage = listing.find('div', {'class': 'mileage'}).text.strip()
            accidents = listing.find('div', {'class': 'accidents'}).text.strip()
            damage = listing.find('div', {'class': 'damage'}).text.strip()
            service_history = listing.find('div', {'class': 'service-history'}).text.strip()
            owners = listing.find('div', {'class': 'owners'}).text.strip()
            drive_type = listing.find('div', {'class': 'drive-type'}).text.strip()
            recalls = listing.find('div', {'class': 'open-recalls'}).text.strip()
            personal = listing.find('div', {'class': 'personal-use'}).text.strip()
            location = listing.find('div', {'class': 'location'}).text.strip()
            
            # Append extracted data to DataFrame
            data = data.append({
                "Make": make, "Year": year, "Model": model, "Trim": trim, "VIN #": vin, 
                "CARFAX Value": carfax_value, "Mileage": mileage, "Accidents": accidents, 
                "Damage": damage, "Service History": service_history, 
                "# Previous Owners": owners, "Drive Type": drive_type, 
                "Open Recalls": recalls, "Personal Vehicle": personal, "Location": location
            }, ignore_index=True)

        except Exception as e:
            print(f"Error extracting data for listing: {str(e)}")
            continue

    # Export to Excel
    output_path = '/mnt/data/CARFAX_Data_Scraped.xlsx'
    data.to_excel(output_path, index=False)
    print(f"Data has been exported to {output_path}")

In [16]:
def set_distance_to_unlimited(driver):
    # Step 1: Expand the accordion if it's not already open
    print("Clicking the Location accordion to reveal the distance dropdown...")
    try:
        # Use JavaScript to ensure the accordion is open
        accordion_element = driver.find_element(By.CSS_SELECTOR, "details.accordion_expander")
        driver.execute_script("arguments[0].setAttribute('open', 'true');", accordion_element)
        time.sleep(1)  # Brief pause to ensure the accordion content is visible
        
        # Take screenshot after expanding accordion
        driver.get_screenshot_as_file("images/1_accordion_expanded.png")
        print("Screenshot saved: 1_accordion_expanded.png")
        
    except Exception as e:
        print(f"Failed to expand the accordion: {e}")
        # Take screenshot of error state
        driver.get_screenshot_as_file("images/error_accordion.png")
        print("Error screenshot saved: error_accordion.png")
        return

    # Locate the distance dropdown
    print("Locating the distance dropdown to set radius to Unlimited...")
    try:
        # Wait for the dropdown to be present in the DOM and visible
        distance_dropdown = WebDriverWait(driver, 10).until(
            EC.visibility_of_element_located((By.CSS_SELECTOR, "select[aria-label='Search Radius']"))
        )

        # Initialize Select and set the dropdown to "Unlimited"
        select_distance = Select(distance_dropdown)
        select_distance.select_by_visible_text("Unlimited")
        print("Search radius set to Unlimited.")
    except Exception as e:
        print(f"Error setting search radius to Unlimited: {e}")


In [None]:
driver = webdriver.Chrome()  # Assuming ChromeDriver is in PATH
url = "https://www.carfax.com/Used-Ford-Mustang-Mach-E-Atlanta-GA_w9809_c16158"  # Example URL
scrape_carfax(driver, url)
driver.quit()

In [17]:
def main():
    # Your existing Bright Data setup
    AUTH = open('keys/brightdata_key.txt', 'r').read()
    SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'
    
    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    
    
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        print('Connected! Starting scraping...')
        url = "https://www.carfax.com/Used-Ford-Mustang-Mach-E-Atlanta-GA_w9809_c16158"  # Example URL
        scrape_carfax(driver, url)
        

if __name__ == '__main__':
    main()

Connecting to Scraping Browser...
Connected! Starting scraping...
Opening provided URL...
Setting search radius to Unlimited...
Clicking the Location accordion to reveal the distance dropdown...
Screenshot saved: 1_accordion_expanded.png
Locating the distance dropdown to set radius to Unlimited...
Error setting search radius to Unlimited: Message: 

Scraping vehicle listings...


OSError: Cannot save file into a non-existent directory: '\mnt\data'