In [30]:
#load packages
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import display
import pandas as pd
 


# Scraping Edmunds.com for vehicle data

In [None]:
# Scrape used/CPO Volvo XC60 listings from Edmunds one results page at a time.
# Each page's rows are appended to volvo_data.csv so an interrupted run can be
# resumed without losing what was already collected.

# Set up the driver (make sure to specify the path to your WebDriver)
service = Service(r'C:\Users\amyfo\.wdm\drivers\chromedriver\win64\132.0.6834.159\chromedriver-win32\chromedriver.exe')
driver = webdriver.Chrome(service=service)

# Set a timeout for the WebDriver
driver.set_page_load_timeout(180)

# Base URL
base_url = "https://www.edmunds.com/inventory/srp.html?inventorytype=used%2Ccpo&make=volvo&model=volvo%7Cxc60"

# Initialize an empty list to store car data
volvo_data = []

# page_number holds the LAST page that was already scraped; the loop increments it
# before every request. To resume after the program quit on page 10, set
# page_number = 10 and the run starts on page 11 (use 0 to start from page 1).
page_number = 10
max_retries = 3

# Give up instead of retrying forever when several pages in a row cannot be loaded
max_consecutive_failures = 3
consecutive_failures = 0


def _first_text(parent, selector):
    """Return the stripped text of the first CSS match under `parent`, or None if nothing matches."""
    matches = parent.find_elements(By.CSS_SELECTOR, selector)
    return matches[0].text.strip() if matches else None


try:
    while True:
        # The ONLY place the page counter advances, so no page is skipped
        page_number += 1

        # Construct the URL with the current page number
        url = f"{base_url}&pagenumber={page_number}"

        # Try to load the page up to max_retries times
        page_loaded = False
        for _ in range(max_retries):
            try:
                driver.get(url)
                page_loaded = True
                break
            except TimeoutException:
                print(f"Page {page_number} timed out.")

        if not page_loaded:
            print(f"Page {page_number} failed to load after {max_retries} retries. Skipping to the next page.")
            consecutive_failures += 1
            if consecutive_failures >= max_consecutive_failures:
                print(f"Stopping: {consecutive_failures} pages in a row failed to load.")
                break
            # Skip parsing: the browser may still be showing the previous page's listings
            continue
        consecutive_failures = 0

        try:
            # Wait until the listings load; past the last results page this times out
            # and the handler below ends the run
            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.CLASS_NAME, "vehicle-info"))
            )

            # Find all divs with class 'vehicle-info'
            car_listings = driver.find_elements(By.CLASS_NAME, 'vehicle-info')

            # Debug print
            print(f"Page {page_number}: Number of car listings found: {len(car_listings)}")

            # If no car listings are found, stop paging
            if not car_listings:
                break

            # Extract the fields of each listing. Compound class lists are written as
            # explicit CSS selectors rather than being passed as a single class name.
            for car in car_listings:
                title = _first_text(car, '.size-16.text-cool-gray-10.fw-bold.mb-0_5')
                description = _first_text(car, '.fw-normal.size-14.text-cool-gray-30')
                # Price comes from the heading inside the pricing container,
                # which excludes the extra price details
                price = _first_text(car, '.pricing-details .heading-3')
                # Mileage is in ul class=mb-0 and then within span class text-cool-gray-30
                mileage = _first_text(car, 'ul.mb-0 span.text-cool-gray-30')

                if title and description and price and mileage:
                    volvo_data.append({
                        # saving the page number to keep track of what data was scraped
                        'Page': page_number,
                        'Title': title,
                        'Description': description,
                        'Price': price,
                        'Mileage': mileage
                    })
                    # Debug print
                    print(f"Car found: {title}, {description}, {price}, {mileage}")
                else:
                    # One incomplete listing should not abort the whole scrape
                    print(f"Page {page_number}: skipped a listing with missing fields.")

            # Save each batch of page data to the csv (header only when the file is empty)
            if volvo_data:
                volvo_df = pd.DataFrame(volvo_data)
                with open('volvo_data.csv', 'a', newline='', encoding='utf-8') as f:
                    volvo_df.to_csv(f, header=f.tell() == 0, index=False)

            # Clear the volvo_data list to avoid duplicates
            volvo_data.clear()

        except Exception as e:
            print(f"An error occurred on page {page_number}: {e}")
            break
finally:
    # Always release the browser, even if the run is interrupted
    driver.quit()

Page 11: Number of car listings found: 21
Car found: 2023 Volvo XC60, B5 Core 4dr SUV, $32,998, 28,998 miles
Car found: 2021 Volvo XC60, Recharge Plug-In Hybrid T8 R-Design 4dr SUV, $38,000, 29,224 miles
Car found: 2020 Volvo XC60, T5 Inscription 4dr SUV, $27,998, 34,022 miles
Car found: 2022 Volvo XC60, B5 Inscription 4dr SUV, $37,990, 15,825 miles
Car found: 2021 Volvo XC60, T5 Momentum 4dr SUV, $26,998, 36,953 miles
Car found: 2021 Volvo XC60, Recharge Plug-In Hybrid T8 Inscription 4dr SUV, $41,998, 10,811 miles
Car found: 2022 Volvo XC60, B5 Momentum 4dr SUV, $31,998, 30,448 miles
Car found: 2022 Volvo XC60, B5 Inscription 4dr SUV, $37,998, 11,933 miles
Car found: 2019 Volvo XC60, T6 Momentum 4dr SUV, $25,998, 52,060 miles
Car found: 2023 Volvo XC60, B5 Plus Dark 4dr SUV, $39,998, 11,837 miles
Car found: 2021 Volvo XC60, T5 Momentum 4dr SUV, $26,998, 39,091 miles
Car found: 2024 Volvo XC60, B5 Plus 4dr SUV, $41,998, 12,549 miles
Car found: 2022 Volvo XC60, B5 R-Design 4dr SUV, $30,

## Cleaning the scraped Edmunds data

In [31]:
# Clean the raw scraped listings: drop unpriced rows, split the model year out of
# the title, and convert price and mileage from display strings to integers.

# load the data from the csv
volvo_df = pd.read_csv('volvo_data.csv')

# remove any rows where the price is set to "Not Priced"
# (.copy() so the column assignments below act on an independent frame, not a slice)
volvo_df = volvo_df[volvo_df['Price'] != 'Not Priced'].copy()

# Extract the year from the title and create a new column for it, make it an integer
# (expand=False returns a Series instead of a one-column DataFrame)
volvo_df['Year'] = volvo_df['Title'].str.extract(r'(\d{4})', expand=False).astype(int)

# Remove the year from the title
volvo_df['Title'] = volvo_df['Title'].str.replace(r'\d{4}', '', regex=True).str.strip()

# make price an integer
# regex=False is explicit because '$' is the end-of-string anchor when treated as a
# regex, which was the default before pandas 2.0 and would strip nothing
volvo_df['Price'] = (
    volvo_df['Price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# remove the word miles from the mileage column and make mileage an integer
volvo_df['Mileage'] = (
    volvo_df['Mileage']
    .str.replace(' miles', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)

# preview the cleaned data
display(volvo_df)


Unnamed: 0,Page,Title,Description,Price,Mileage,Year
0,1,Volvo XC60,T6 Inscription 4dr SUV,27293,39106,2019
1,1,Volvo XC60,B6 Ultimate Bright 4dr SUV,37175,32922,2023
2,1,Volvo XC60,B5 Core 4dr SUV,31789,24514,2023
3,1,Volvo XC60,T6 4dr SUV,11799,83670,2012
4,1,Volvo XC60,Recharge Plug-In Hybrid T8 Inscription Extende...,45482,22964,2022
...,...,...,...,...,...,...
246,19,Volvo XC60,T6 4dr SUV,19990,47405,2016
247,19,Volvo XC60,T5 Dynamic 4dr SUV,19590,67977,2017
249,19,Volvo XC60,T6 4dr SUV,7999,142252,2011
250,19,Volvo XC60,T5 Drive-E Premier 4dr SUV,16590,61404,2015


In [32]:
# Persist the cleaned listings (without the index) so the modelling cells can reload them
volvo_df.to_csv(path_or_buf='volvo_data_clean.csv', index=False)


# Predicting the price of a Volvo XC60

In [33]:
#pull the new csv as a dataframe
volvo_df_clean = pd.read_csv('volvo_data_clean.csv')

#scatterplot of price against mileage, coloured by model year
# The title is set once here; it previously started as 'Price vs Year', which did
# not describe the plotted axes and was overwritten immediately afterwards.
fig = px.scatter(volvo_df_clean, x='Mileage', y='Price', title='Volvo XC60 Price vs Mileage', color="Year")
fig.update_layout(height=800)
fig.update_yaxes(tickprefix="$")
fig.show()


In [35]:
# Fit a simple linear regression of price on mileage for the cleaned Volvo listings
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Explanatory variable as a one-column DataFrame, response as a Series
x = volvo_df_clean[['Mileage']]
y = volvo_df_clean['Price']

# The model is fit on every row; no train/test split is performed in this cell
volvo_model = LinearRegression().fit(x, y)

# Overlay the fitted line on the observed listings
v_fig = go.Figure(
    data=[
        go.Scatter(x=x['Mileage'], y=y, mode='markers', name='Actual'),
        go.Scatter(x=x['Mileage'], y=volvo_model.predict(x), mode='lines', name='Predicted'),
    ]
)
v_fig.update_layout(title="Linear Regression Model for Volvo XC60", height=800)
v_fig.update_xaxes(title="Mileage")
v_fig.update_yaxes(title="Price")
v_fig.show()



In [36]:
# Summarise the Volvo fit: R^2, root-mean-squared error and the fitted equation.
# All metrics are computed on the same rows the model was fitted on.
from sklearn.metrics import mean_squared_error

# Calculate the R^2 score
r2 = round(volvo_model.score(x, y), 2)

# Calculate the mean squared error
mse = mean_squared_error(y, volvo_model.predict(x))

# Calculate the standard error (reported here as the root mean squared error)
standard_error = round(mse ** 0.5, 2)

#print the regression equation
# The '+' between intercept and slope was missing; this now matches the RAV4 cell's format
print(f"Regression equation: predicted_price = {round(volvo_model.intercept_, 2)} + {round(volvo_model.coef_[0], 2)} * mileage")
print(f"R^2: {r2}")
print(f"Standard Error: {standard_error}")



Regression equation: predicted_price = 41459.0 -0.26 * mileage
R^2: 0.71
Standard Error: 4906.31


# Predicting the price of a Rav4

In [37]:
# Load the RAV4 listings and convert price, mileage and year to integers.

#open the csv file called toyotarav4 and read it as a dataframe
toyota_df = pd.read_csv('toyotarav4.csv')

#convert the price, mileage and year to integer
# regex=False is explicit because '$' is the end-of-string anchor when treated as a
# regex, which was the default before pandas 2.0 and would strip nothing
toyota_df['price'] = (
    toyota_df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)
toyota_df['mileage'] = (
    toyota_df['mileage']
    .str.replace(' miles', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(int)
)
#rename mileage to Mileage (same column name the Volvo data uses)
toyota_df = toyota_df.rename(columns={'mileage': 'Mileage'})
toyota_df['year'] = toyota_df['year'].astype(int)

#preview the first 5 rows of data
display(toyota_df.head())



Unnamed: 0,year,price,Mileage
0,2019,22953,41706
1,2023,30798,36517
2,2023,28489,46231
3,2022,23189,58206
4,2023,41553,5794


In [38]:
# Scatter of RAV4 price (y) against mileage (x), coloured by model year
fig = px.scatter(
    toyota_df,
    x='Mileage',
    y='price',
    title='Price vs Mileage',
    color="year",
)
# Final figure size and title
fig.update_layout(height=800, title="Toyota RAV4 Price vs Mileage")
# Dollar-prefixed ticks and readable axis labels
fig.update_yaxes(tickprefix="$", title="Price")
fig.update_xaxes(title="Mileage")
fig.show()


In [39]:
from sklearn.linear_model import LinearRegression

# Regress RAV4 price on mileage: one-column DataFrame in, price Series out
explanatory = toyota_df[['Mileage']]
response = toyota_df['price']

# Create the model and fit it on every RAV4 row in one step
rav4_model = LinearRegression().fit(explanatory, response)

# Observed listings as markers with the fitted line drawn over them
t_fig = go.Figure(
    data=[
        go.Scatter(x=explanatory['Mileage'], y=response, mode='markers', name='Actual'),
        go.Scatter(x=explanatory['Mileage'], y=rav4_model.predict(explanatory), mode='lines', name='Predicted'),
    ]
)
t_fig.update_layout(title="Linear Regression Model for Rav4", height=800)
t_fig.update_xaxes(title="Mileage")
t_fig.update_yaxes(title="Price")
t_fig.show()

In [40]:
# Summarise the RAV4 fit on the same rows it was fitted on
rav4_fitted = rav4_model.predict(explanatory)

# R^2 score, rounded for display
rav_r2 = round(rav4_model.score(explanatory, response), 2)

# Mean squared error and its square root (reported below as the standard error)
rav_mse = mean_squared_error(response, rav4_fitted)
rav_standard_error = round(rav_mse ** 0.5, 2)

# Rounded coefficients for the printed equation
rav4_intercept = round(rav4_model.intercept_, 2)
rav4_slope = round(rav4_model.coef_[0], 2)

#print the Rav4 regression equation and fit statistics
print(f"Regression equation: predicted_price = {rav4_intercept} + {rav4_slope} * mileage")
print(f"R^2: {rav_r2}")
print(f"Standard Error: {rav_standard_error}")


Regression equation: predicted_price = 37392.46 + -0.16 * mileage
R^2: 0.74
Standard Error: 4051.23


# Comparing the Toyota Rav4 to the Volvo XC60

In [41]:
# Overlay both vehicles on one figure: observed listings first, then the two fitted lines
fig = go.Figure()
fig.add_traces([
    go.Scatter(x=x['Mileage'], y=y, mode='markers', name='Volvo XC60'),
    go.Scatter(x=explanatory['Mileage'], y=response, mode='markers', name='Toyota RAV4'),
    go.Scatter(x=x['Mileage'], y=volvo_model.predict(x), mode='lines', name='Volvo XC60 Model'),
    go.Scatter(x=explanatory['Mileage'], y=rav4_model.predict(explanatory), mode='lines', name='Toyota RAV4 Model'),
])
fig.update_layout(
    title="Linear Regression Models for Volvo XC60 and Toyota RAV4",
    height=800,
)
fig.update_xaxes(title="Mileage")
fig.update_yaxes(title="Price")
fig.show()


In [43]:
# Predict the purchase price of each vehicle at a chosen mileage and compare them.
import warnings

# Silence only sklearn's feature-name warning (emitted when a model fitted on a
# DataFrame is later given unnamed input) instead of hiding every warning for the
# rest of the notebook.
warnings.filterwarnings("ignore", message="X does not have valid feature names")

# Predict both the volvo and rav4 price based on mileage
pred_mileage = 40000
# Same column name the models were fitted with, so the input carries valid feature names
pred_input = pd.DataFrame({'Mileage': [pred_mileage]})
volvo_predicted_price = volvo_model.predict(pred_input)
rav4_predicted_price = rav4_model.predict(pred_input)

print(f"Predicted price for a Volvo XC60 with {pred_mileage} miles: ${volvo_predicted_price[0]:.0f}")
print(f"Predicted price for a Toyota RAV4 with {pred_mileage} miles: ${rav4_predicted_price[0]:.0f}")

#state the price difference, rounded to the nearest dollar
price_difference = abs(volvo_predicted_price - rav4_predicted_price)
# predict() returns one-element arrays; compare the scalar values explicitly
if volvo_predicted_price[0] > rav4_predicted_price[0]:
    print(f"At {pred_mileage}, the Volvo XC60 is ${round(price_difference[0])} more than the Rav4.")
else:
    print(f"At {pred_mileage}, the Toyota Rav4 is ${round(price_difference[0])} more than the Volvo XC60.")

Predicted price for a Volvo XC60 with 40000 miles: $31087
Predicted price for a Toyota RAV4 with 40000 miles: $30806
At 40000, the Volvo XC60 is $281 more than the Rav4.


# Calculating cost of ownership

The ownership costs below were taken from Edmunds.com. Note that these costs cover the first five years of ownership. Predicting the ownership costs of these vehicles when purchased gently used would require more time than I have at the moment, so I decided to use the five-year totals as a rough estimate.

Of note, Edmunds does call out that the Volvo XC60 typically depreciates $32,044 in the first five years. The Toyota Rav4 depreciates by $14,412. I did not add depreciation into the costs below, since this was taken care of by the linear regression model.

In [45]:
# Five-year ownership cost components for each vehicle (dollars)

#Volvo XC60 ownership data
v_fuel, v_maint, v_repairs, v_insurance = 12232, 4809, 2049, 6564
#total cost of ownership for the volvo
volvo_tco = sum((v_fuel, v_maint, v_repairs, v_insurance))

#Toyota Rav4 ownership data
t_fuel, t_maint, t_repairs, t_insurance = 8618, 5244, 854, 5523
#total cost of ownership for the rav4
rav4_tco = sum((t_fuel, t_maint, t_repairs, t_insurance))

#difference in total cost of ownership (positive when the Volvo costs more)
tco_difference = volvo_tco - rav4_tco

tco_summary = [
    f"For the first five years, the Volvo XC60 has a total cost of ownership of ${volvo_tco} and the Toyota Rav4 has a total cost of ownership of ${rav4_tco}.",
    f"The Volvo XC60 is ${tco_difference} more expensive to own than the Toyota Rav4 for the first five years.",
]
print("\n".join(tco_summary))

For the first five years, the Volvo XC60 has a total cost of ownership of $25654 and the Toyota Rav4 has a total cost of ownership of $20239.
The Volvo XC60 is $5415 more expensive to own than the Toyota Rav4 for the first five years.


In [48]:
# Predict both the volvo and rav4 price at the mileage where I expect to sell them
end_mileage = 150000
# Use the column name the models were fitted with, so prediction does not depend on
# sklearn's feature-name warning being suppressed elsewhere in the notebook
end_input = pd.DataFrame({'Mileage': [end_mileage]})
volvo_end_price = volvo_model.predict(end_input)
rav4_end_price = rav4_model.predict(end_input)

#print the predicted end price for both cars (predict() returns one-element arrays)
print(f"Predicted end price for a Volvo XC60 with {end_mileage} miles: ${volvo_end_price[0]:.0f}")
print(f"Predicted end price for a Toyota Rav4 with {end_mileage} miles: ${rav4_end_price[0]:.0f}")



Predicted end price for a Volvo XC60 with 150000 miles: $2564
Predicted end price for a Toyota Rav4 with 150000 miles: $12695


## Calculating predicted long term ownership costs
I took the original purchase price at 40,000 miles, then added the five year ownership cost for each vehicle to get the rough, true cost of ownership. Finally, I subtracted the predicted value at 150,000 miles to approximate the net ownership cost.



In [51]:
# Net ownership cost = predicted purchase price + five-year ownership cost
# - predicted resale price. The predictions are one-element arrays, so the
# results are too.
volvo_net_cost = volvo_predicted_price + volvo_tco - volvo_end_price
rav4_net_cost = rav4_predicted_price + rav4_tco - rav4_end_price

# How much more the Volvo costs than the RAV4 over the ownership period
net_cost_difference = volvo_net_cost - rav4_net_cost

# Report each vehicle's net cost, then the gap between them
net_cost_summary = [
    f"The net ownership cost for the Volvo XC60 is ${volvo_net_cost[0]:.0f}",
    f"The net ownership cost for the Toyota Rav4 is ${rav4_net_cost[0]:.0f}",
    f"When purchasing at {pred_mileage} and selling at {end_mileage} and considering cost of ownership, "
    f"the Volvo XC60 is ${net_cost_difference[0]:.0f} more expensive to own than the Toyota Rav4.",
]
print("\n".join(net_cost_summary))

The net ownership cost for the Volvo XC60 is $54177
The net ownership cost for the Toyota Rav4 is $38350
When purchasing at 40000 and selling at 150000 and considering cost of ownership, the Volvo XC60 is $15826 more expensive to own than the Toyota Rav4.
