# CARFAX WEB SCRAPING 4
- Web Scraping for Ford Mach-E and Edge MY 21-24 from CarFax by first initializing the search


In [5]:
import pytest
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import Remote, ChromeOptions
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection

import pandas as pd
import time
import random


In [3]:
def scrape_carfax(driver, url):
    # Navigate to the given URL
    print("Opening provided URL...")
    driver.get(url)
    driver.get_screenshot_as_file("images/step_1_page_loaded.png")  # Take screenshot after loading URL
    time.sleep(2)  # Brief pause for the page load

    # Set radius to "Unlimited" if needed
    print("Setting search radius to Unlimited...")
    try:
        set_distance_to_unlimited(driver)
    except Exception as e:
        print(f"Could not set search radius to Unlimited: {str(e)}")

    # Scrape listings
    print("Scraping vehicle listings...")
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    listings = soup.find_all('div', class_='srp-list-item')  # Adjust class if necessary

    # Initialize a DataFrame to store results
    columns = [
        "Make", "Year", "Model", "Trim", "VIN #", "CARFAX Value", "Mileage", 
        "Accidents", "Damage", "Service History", "# Previous Owners", 
        "Drive Type", "Open Recalls", "Personal Vehicle", "Location"
    ]
    data = pd.DataFrame(columns=columns)

    # Iterate over listings and extract details
    for listing in listings:
        try:
            make = "Ford"  # Predefined
            year = listing.find('span', {'class': 'year'}).text.strip()
            model = "Mustang Mach-E"  # Predefined in URL
            trim = listing.find('span', {'class': 'trim'}).text.strip()
            vin = listing['data-vin'] if 'data-vin' in listing.attrs else None
            carfax_value = listing.find('div', {'class': 'carfax-value'}).text.strip()
            mileage = listing.find('div', {'class': 'mileage'}).text.strip()
            accidents = listing.find('div', {'class': 'accidents'}).text.strip()
            damage = listing.find('div', {'class': 'damage'}).text.strip()
            service_history = listing.find('div', {'class': 'service-history'}).text.strip()
            owners = listing.find('div', {'class': 'owners'}).text.strip()
            drive_type = listing.find('div', {'class': 'drive-type'}).text.strip()
            recalls = listing.find('div', {'class': 'open-recalls'}).text.strip()
            personal = listing.find('div', {'class': 'personal-use'}).text.strip()
            location = listing.find('div', {'class': 'location'}).text.strip()
            
            # Append extracted data to DataFrame
            data = data.append({
                "Make": make, "Year": year, "Model": model, "Trim": trim, "VIN #": vin, 
                "CARFAX Value": carfax_value, "Mileage": mileage, "Accidents": accidents, 
                "Damage": damage, "Service History": service_history, 
                "# Previous Owners": owners, "Drive Type": drive_type, 
                "Open Recalls": recalls, "Personal Vehicle": personal, "Location": location
            }, ignore_index=True)

        except Exception as e:
            print(f"Error extracting data for listing: {str(e)}")
            continue

    # Export to Excel
    output_path = '/mnt/data/CARFAX_Data_Scraped.xlsx'
    data.to_excel(output_path, index=False)
    print(f"Data has been exported to {output_path}")

In [7]:
def set_distance_to_unlimited(driver):

    driver.set_window_size(1550, 830)
    print("Window size set to 1550x830")
    driver.get_screenshot_as_file('images/screenshot1.png')
    driver.execute_script("window.scrollTo(0,156)")
    print("Scrolled to position 156")
    driver.get_screenshot_as_file('images/screenshot2.png')
    driver.find_element(By.CSS_SELECTOR, ".accordion > div:nth-child(1) .accordion_header").click()
    print("Clicked on accordion header")
    driver.get_screenshot_as_file('images/screenshot3.png')
    time.sleep(1)
    driver.find_element(By.XPATH, "//div[@id=\'srp-filter-container\']/div[2]/div[3]/div[2]/div/details/div/div/div/div/select").click()
    dropdown = driver.find_element(By.XPATH, "//div[@id=\'srp-filter-container\']/div[2]/div[3]/div[2]/div/details/div/div/div/div/select")
    dropdown.find_element(By.XPATH, "//option[. = 'Unlimited']").click()
    print("Clicked on select dropdown")
    driver.get_screenshot_as_file('images/screenshot4.png')
    time.sleep(1.2)


    # self.driver.find_element(By.XPATH, "//div[@id=\'srp-filter-container\']/div[2]/div[3]/div[2]/div/details/div/div/div/div/select").click()
    # dropdown = self.driver.find_element(By.XPATH, "//div[@id=\'srp-filter-container\']/div[2]/div[3]/div[2]/div/details/div/div/div/div/select")
    # dropdown.find_element(By.XPATH, "//option[. = '50']").click()
    # self.driver.find_element(By.XPATH, "//div[@id=\'srp-filter-container\']/div[2]/div[3]/div[2]/div/details/div/div/div/div/select").click()
    # dropdown = self.driver.find_element(By.XPATH, "//div[@id=\'srp-filter-container\']/div[2]/div[3]/div[2]/div/details/div/div/div/div/select")
    # dropdown.find_element(By.XPATH, "//option[. = 'Unlimited']").click()


In [None]:
driver = webdriver.Chrome()  # Assuming ChromeDriver is in PATH

url = "https://www.carfax.com/Used-Ford-Mustang-Mach-E-Atlanta-GA_w9809_c16158"  # Example URL
scrape_carfax(driver, url)
driver.quit()

In [8]:
def main():
    # Your existing Bright Data setup
    AUTH = open('keys/brightdata_key.txt', 'r').read()
    SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'
    
    print('Connecting to Scraping Browser...')
    sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome')
    
    
    with Remote(sbr_connection, options=ChromeOptions()) as driver:
        print('Connected! Starting scraping...')
        url = "https://www.carfax.com/Used-Ford-Mustang-Mach-E-Atlanta-GA_w9809_c16158"  # Example URL
        
        scrape_carfax(driver, url)
        

if __name__ == '__main__':
    main()

Connecting to Scraping Browser...
Connected! Starting scraping...
Opening provided URL...
Setting search radius to Unlimited...
Window size set to 1550x830
Scrolled to position 156
Clicked on accordion header
Could not set search radius to Unlimited: Message: javascript error: {"status":11,"value":"Element is not currently visible and may not be manipulated"}
  (Session info: chrome=130.0.6723.92)

Scraping vehicle listings...


NameError: name 'BeautifulSoup' is not defined