In [1]:
#load packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import display
import pandas as pd
 


In [None]:
# Set up the driver (make sure to specify the path to your WebDriver)
service = Service(r'C:\Users\amyfo\.wdm\drivers\chromedriver\win64\132.0.6834.159\chromedriver-win32\chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Set a timeout for the WebDriver
driver.set_page_load_timeout(180)

# Base URL
base_url = "https://www.edmunds.com/inventory/srp.html?inventorytype=used%2Ccpo&make=volvo&model=volvo%7Cxc60"

# Initialize an empty list to store car data
volvo_data = []

# Loop through pages, if the program quit at page "n", start the application again on that page
# for example, if the program quit at page 10, start the application again on page 11 by setting page_number = 10
page_number = 10
max_retries = 3

while True:
    page_number += 1

    # Construct the URL with the current page number
    url = f"{base_url}&pagenumber={page_number}"
    retries = 0
    while retries < max_retries:
        try:
            driver.get(url)
            break
        except TimeoutException:
            print(f"Page {page_number} timed out.")
            retries += 1
            if retries == max_retries:
                print(f"Page {page_number} failed to load after {max_retries} retries. Skipping to the next page.")
                page_number += 1
                continue

    try:
        # Wait until the listings load
        WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.CLASS_NAME, "vehicle-info"))
        )

        # Find all divs with class 'vehicle-info'
        car_listings = driver.find_elements(By.CLASS_NAME, 'vehicle-info')

        # Debug print
        print(f"Page {page_number}: Number of car listings found: {len(car_listings)}")

        # If no car listings are found, break the loop
        if not car_listings:
            break

        # Extract car listings from the results container
        for car in car_listings:
            title = car.find_element(By.CLASS_NAME, 'size-16.text-cool-gray-10.fw-bold.mb-0_5').text.strip()
            description = car.find_element(By.CLASS_NAME, 'fw-normal.size-14.text-cool-gray-30').text.strip()
            # Getting the pricing details container
            pricing_details = car.find_element(By.CLASS_NAME, 'pricing-details')
            # Pulling the price from within the heading to exclude the extra price details
            price = pricing_details.find_element(By.CLASS_NAME, 'heading-3').text.strip() if pricing_details else None
            # Add mileage which is in ul class=mb_0 and then within span class text-cool-gray-30
            mileage = car.find_element(By.CSS_SELECTOR, 'ul.mb-0 span.text-cool-gray-30').text.strip()

            if title and description and price:
                volvo_data.append({
                    #saving the page number to keep track of what data was scraped
                    'Page': page_number,
                    'Title': title,
                    'Description': description,
                    'Price': price,
                    'Mileage': mileage
                })
                # Debug print
                print(f"Car found: {title}, {description}, {price}, {mileage}")

        # save each batch of page data to a csv
        volvo_df = pd.DataFrame(volvo_data)
        with open('volvo_data.csv', 'a', newline='', encoding='utf-8') as f:
            volvo_df.to_csv(f, header=f.tell()==0, index=False)

        # Clear the volvo_data list to avoid duplicates
        volvo_data.clear()

    except Exception as e:
        print(f"An error occurred on page {page_number}: {e}")
        break

    # Increment the page number
    page_number += 1

# Close the driver
driver.quit()

Page 11: Number of car listings found: 21
Car found: 2023 Volvo XC60, B5 Core 4dr SUV, $32,998, 28,998 miles
Car found: 2021 Volvo XC60, Recharge Plug-In Hybrid T8 R-Design 4dr SUV, $38,000, 29,224 miles
Car found: 2020 Volvo XC60, T5 Inscription 4dr SUV, $27,998, 34,022 miles
Car found: 2022 Volvo XC60, B5 Inscription 4dr SUV, $37,990, 15,825 miles
Car found: 2021 Volvo XC60, T5 Momentum 4dr SUV, $26,998, 36,953 miles
Car found: 2021 Volvo XC60, Recharge Plug-In Hybrid T8 Inscription 4dr SUV, $41,998, 10,811 miles
Car found: 2022 Volvo XC60, B5 Momentum 4dr SUV, $31,998, 30,448 miles
Car found: 2022 Volvo XC60, B5 Inscription 4dr SUV, $37,998, 11,933 miles
Car found: 2019 Volvo XC60, T6 Momentum 4dr SUV, $25,998, 52,060 miles
Car found: 2023 Volvo XC60, B5 Plus Dark 4dr SUV, $39,998, 11,837 miles
Car found: 2021 Volvo XC60, T5 Momentum 4dr SUV, $26,998, 39,091 miles
Car found: 2024 Volvo XC60, B5 Plus 4dr SUV, $41,998, 12,549 miles
Car found: 2022 Volvo XC60, B5 R-Design 4dr SUV, $30,

In [None]:
# load the data from the csv
volvo_df = pd.read_csv('volvo_data.csv')

# remove any rows where the price is set to "Not Priced"
volvo_df = volvo_df[volvo_df['Price'] != 'Not Priced']

# Extract the year from the title and create a new column for it, make it an integer
volvo_df['Year'] = volvo_df['Title'].str.extract(r'(\d{4})').astype(int)

# Remove the year from the title
volvo_df['Title'] = volvo_df['Title'].str.replace(r'\d{4}', '', regex=True).str.strip()

# make price an integer
volvo_df['Price'] = volvo_df['Price'].str.replace('$', '').str.replace(',', '').astype(int)

# remove the word miles from the mileage column and make mileage an integer
volvo_df['Mileage'] = volvo_df['Mileage'].str.replace(' miles', '').str.replace(',', '').astype(int)

# preview the cleaned data
display(volvo_df)


Unnamed: 0,Page,Title,Description,Price,Mileage,Year
0,1,Volvo XC60,T6 Inscription 4dr SUV,27293,39106,2019
1,1,Volvo XC60,B6 Ultimate Bright 4dr SUV,37175,32922,2023
2,1,Volvo XC60,B5 Core 4dr SUV,31789,24514,2023
3,1,Volvo XC60,T6 4dr SUV,11799,83670,2012
4,1,Volvo XC60,Recharge Plug-In Hybrid T8 Inscription Extende...,45482,22964,2022
...,...,...,...,...,...,...
246,19,Volvo XC60,T6 4dr SUV,19990,47405,2016
247,19,Volvo XC60,T5 Dynamic 4dr SUV,19590,67977,2017
249,19,Volvo XC60,T6 4dr SUV,7999,142252,2011
250,19,Volvo XC60,T5 Drive-E Premier 4dr SUV,16590,61404,2015


In [None]:
#save the new, cleaned data to a csv
volvo_df.to_csv('volvo_data_clean.csv', index=False)


In [None]:
#pull the new csv as a dataframe
volvo_df_clean = pd.read_csv('volvo_data_clean.csv')

#scatterplot using plotly
fig = px.scatter(volvo_df_clean, x='Mileage', y='Price', title='Price vs Year', color="Year")
fig.update_layout(height=800)
fig.update_yaxes(tickprefix="$")
fig.update_layout(title="Volvo XC60 Price vs Mileage")
fig.show()


In [13]:
#use the volvo_df_clean to fit a linear regression model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into training and testing sets
x = volvo_df_clean[['Mileage']]
y = volvo_df_clean['Price']


# Create and fit the linear regression model

volvo_model = LinearRegression()

#model.fit(x.values.reshape(-1, 1), y) 
volvo_model.fit(x, y) 
#plot the model
fig = go.Figure()
fig.add_trace(go.Scatter(x=x['Mileage'], y=y, mode='markers', name='Actual'))
fig.add_trace(go.Scatter(x=x['Mileage'], y=volvo_model.predict(x), mode='lines', name='Predicted'))
fig.update_layout(title="Linear Regression Model", height=800)
fig.update_xaxes(title="Mileage")
fig.update_yaxes(title="Price")
fig.show()



In [None]:
# Access model parameters
print("Slope:", volvo_model.coef_[0])
print("Intercept:", volvo_model.intercept_)

# Calculate the R^2 score
r2 = volvo_model.score(x, y)

#get the summary statistics
y_pred = volvo_model.predict(x)
mse = mean_squared_error(y, y_pred)
rmse = mse ** 0.5

#print the model coefficients
print(f"Model coefficients: {volvo_model.coef_}")

#print the regression equation
print(f"Regression equation: predicted_price = { round(volvo_model.intercept_, 2)} {round(volvo_model.coef_[0], 2)} * mileage")



Model coefficients: [-0.26109015]
Regression equation: predicted_price = 41290.63 -0.26 * mileage


In [15]:
#predict the price of a volvo based on mileage
mileage = 50000
predicted_price = volvo_model.predict([[mileage]])[0]
print(f"Predicted price for a car with {mileage} miles: ${predicted_price:.2f}")


Predicted price for a car with 50000 miles: $28494.13



X does not have valid feature names, but LinearRegression was fitted with feature names

