In [10]:
#load packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from bs4 import BeautifulSoup
from IPython.display import display
import pandas as pd
 


In [None]:
# Set up the driver (make sure to specify the path to your WebDriver)
service = Service(r'C:\Users\amyfo\.wdm\drivers\chromedriver\win64\132.0.6834.159\chromedriver-win32\chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Set a timeout for the WebDriver
driver.set_page_load_timeout(60)

# Base URL
base_url = "https://www.edmunds.com/inventory/srp.html?inventorytype=used%2Ccpo&make=volvo&model=volvo%7Cxc60"

# Initialize an empty list to store car data
volvo_data = []

# Loop through pages
page_number = 1
max_retries = 3

while True:
    # Construct the URL with the current page number
    url = f"{base_url}&pagenumber={page_number}"
    retires = 0
    try:
        driver.get(url)
        break
    except TimeoutException:
        retries += 1
        print(f"Page {page_number} timed out. Retrying ({retries}/{max_retries})...")
        if retries == max_retries:
            print(f"Page {page_number} failed to load after {max_retries} retries. Skipping to the next page.")
            page_number += 1
            continue

    try:
        # Wait until the listings load
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.CLASS_NAME, "vehicle-info"))
        )

        # Get the HTML content of the page
        html_content = driver.page_source

        # Parse the HTML content with BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all divs with class 'vehicle-info'
        car_listings = soup.find_all('div', class_='vehicle-info')

        # Debug print
        print(f"Page {page_number}: Number of car listings found: {len(car_listings)}")

        # If no car listings are found, break the loop
        if not car_listings:
            break

        # Extract car listings from the results container
        for car in car_listings:
            title = car.find('div', class_='size-16 text-cool-gray-10 fw-bold mb-0_5')
            description = car.find('div', class_='fw-normal size-14 text-cool-gray-30')
            # Getting the pricing details container
            pricing_details = car.find('div', class_='pricing-details')
            # Pulling the price from within the heading to exclude the extra price details
            price = pricing_details.find('span', class_='heading-3') if pricing_details else None
            # Add mileage which is in ul class=mb_0 and then within span class text-cool-gray-30
            mileage = car.find('ul', class_='mb-0').find('span', class_='text-cool-gray-30')

            if title and description and price:
                volvo_data.append({
                    'Title': title.text.strip(),
                    'Description': description.text.strip(),
                    'Price': price.text.strip(),
                    'Mileage': mileage.text.strip()
                })
                # Debug print
                print(f"Car found: {title.text.strip()}, {description.text.strip()}, {price.text.strip()}, {mileage.text.strip()}")


    except Exception as e:
        print(f"An error occurred on page {page_number}: {e}")
        break

    # Increment the page number
    page_number += 1

# Close the driver
driver.quit()

# Create a DataFrame from the car data
volvo_df = pd.DataFrame(volvo_data)

#save the data to a csv file\
volvo_df.to_csv('volvo_data.csv', index=False)

Page 1: Number of car listings found: 21
Car found: 2019 Volvo XC60, T6 Inscription  4dr SUV, $27,293, 39,106 miles
Car found: 2012 Volvo XC60, T6  4dr SUV, $11,799, 83,670 miles
Car found: 2023 Volvo XC60, B5 Core  4dr SUV, $31,789, 24,514 miles
Car found: 2022 Volvo XC60, Recharge Plug-In Hybrid T8 Inscription Extended Range  4dr SUV, $45,482, 22,964 miles
Car found: 2020 Volvo XC60, T5 Momentum  4dr SUV, $27,998, 48,894 miles
Car found: 2023 Volvo XC60, B6 Ultimate Bright  4dr SUV, $37,990, 32,922 miles
Car found: 2022 Volvo XC60, Recharge Plug-In Hybrid T8 R-Design Extended Range  4dr SUV, $40,675, 30,553 miles
Car found: 2022 Volvo XC60, Recharge Plug-In Hybrid T8 Inscription  4dr SUV w/Prod. End 11/21, $45,850, 11,527 miles
Car found: 2013 Volvo XC60, T6  4dr SUV, $7,902, 165,954 miles
Car found: 2022 Volvo XC60, B5 Momentum  4dr SUV, $28,998, 49,321 miles
Car found: 2015 Volvo XC60, T5 Premier  4dr SUV, $18,998, 65,123 miles
Car found: 2017 Volvo XC60, T5 Dynamic  4dr SUV, $19,9

ReadTimeoutError: HTTPConnectionPool(host='localhost', port=49261): Read timed out. (read timeout=120)

In [9]:

# Extract the year from the title and create a new column for it, make it an integer
volvo_df['Year'] = volvo_df['Title'].str.extract(r'(\d{4})').astype(int)

# Remove the year from the title
volvo_df['Title'] = volvo_df['Title'].str.replace(r'\d{4}', '', regex=True).str.strip()

#make price an integer
volvo_df['Price'] = volvo_df['Price'].str.replace('$', '').str.replace(',', '').astype(int)

#remove the word miles from the mileage column and make mileage an integer
volvo_df['Mileage'] = volvo_df['Mileage'].str.replace(' miles', '').str.replace(',', '').astype(int)

# Display the DataFrame as a pretty table
display(volvo_df)


ValueError: cannot convert float NaN to integer

In [None]:
#scatterplot using plotly
fig = px.scatter(volvo_df, x='Mileage', y='Price', title='Price vs Year', color="Year")
fig.update_layout(height=800)
fig.update_yaxes(tickprefix="$")
fig.update_layout(title="Volvo XC60 Price vs Mileage")
fig.show()