In [1]:
import pandas as pd
import streamlit as st
import plotly.express as px

In [2]:
car_data = pd.read_csv('/Users/cslewicki/Desktop/Sprint4Project/vehicles_us.csv')
# The above dataset contains various details about cars that are being advertised.

In [3]:
car_data.info()
# It's always a good idea to use .info() to see which columns, if any, have missing values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [4]:
car_data = car_data[~car_data['model_year'].isna()]
car_data['model_year'] = car_data['model_year'].astype(int)
# It doesn't make sense to advertise a car without telling its model year, so those null rows are now dropped.
# Furthermore, it doesn't make sense for the model years to be float values. They are now integer values.

In [5]:
car_data = car_data[~car_data['cylinders'].isna()]
car_data['cylinders'] = car_data['cylinders'].astype(int)
# It doesn't make sense to advertise a car without telling its number of cylinders, so those null rows are now dropped.
# Furthermore, it doesn't make sense for the number of cylinders to be float values. They are now integer values.

In [6]:
car_data = car_data[~car_data['odometer'].isna()]
car_data['odometer'] = car_data['odometer'].astype(int)
# It doesn't make sense to advertise a car without telling its number of miles driven, so those null rows are now dropped.
# Furthermore, I have never seen an odometer use float values. The odometer values are now all integers.

In [7]:
car_data['paint_color'] = car_data['paint_color'].fillna('Unknown')
# It seems appropriate to me to replace missing paint color values with 'Unknown'.

In [8]:
car_data['is_4wd'] = car_data['is_4wd'].replace(1.0, 'Yes')
car_data['is_4wd'] = car_data['is_4wd'].fillna('No')
# I noticed that all of the is_4wd values were either 1.0 or NaN. I think it is more intuitive to use the words "Yes" and "No" instead.

In [9]:
car_data.info()
# It is never a bad idea to double check and make sure that the desired changes to the dataset were successfully made.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 36419 entries, 0 to 51523
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   price         36419 non-null  int64 
 1   model_year    36419 non-null  int64 
 2   model         36419 non-null  object
 3   condition     36419 non-null  object
 4   cylinders     36419 non-null  int64 
 5   fuel          36419 non-null  object
 6   odometer      36419 non-null  int64 
 7   transmission  36419 non-null  object
 8   type          36419 non-null  object
 9   paint_color   36419 non-null  object
 10  is_4wd        36419 non-null  object
 11  date_posted   36419 non-null  object
 12  days_listed   36419 non-null  int64 
dtypes: int64(5), object(8)
memory usage: 3.9+ MB


In [10]:
print('The possible conditions of the cars are...')
print(car_data['condition'].unique())
print()
print('The possible fuel types used by the cars are...')
print(car_data['fuel'].unique())
print()
print('The possible transmissions of the cars are...')
print(car_data['transmission'].unique())
print()
print('Each car in the dataset is one of the following types...')
print(car_data['type'].unique())

# It's useful to know what all the possible names for each of these categories are.

The possible conditions of the cars are...
['good' 'like new' 'excellent' 'fair' 'salvage' 'new']

The possible fuel types used by the cars are...
['gas' 'diesel' 'hybrid' 'other' 'electric']

The possible transmissions of the cars are...
['automatic' 'manual' 'other']

Each car in the dataset is one of the following types...
['SUV' 'sedan' 'pickup' 'truck' 'coupe' 'hatchback' 'van' 'wagon'
 'mini-van' 'convertible' 'other' 'bus' 'offroad']


In [11]:
car_data = car_data.reset_index(drop = True)
# Since many of the rows in the original dataset have been dropped, it makes sense to reset the index.

In [12]:
print(car_data['price'].describe())
# It's useful to know the summary statistics of the prices of the cars.

count     36419.000000
mean      12187.675417
std       10076.739680
min           1.000000
25%        5000.000000
50%        9000.000000
75%       16900.000000
max      375000.000000
Name: price, dtype: float64


In [13]:
price_data = px.histogram(car_data[car_data['price'] <= 50000], 
                          color_discrete_sequence = ['blue'],
                          labels = {'price': 'Price of Car (In Dollars)'},
                          nbins = 10,
                          opacity = 0.5,
                          title = 'Number of Advertised Cars By Price ($50,000 Max)',
                          x = 'price') 

price_data.update_traces(marker_line_color = 'black', marker_line_width = 1.5)

price_data.update_xaxes(tickvals = [0, 5000, 10000, 15000, 20000, 25000, 30000, 35000, 40000, 45000, 50000],
                        ticktext = [0, '5K', '10K', '15K', '20K', '25K', '30K', '35K', '40K', '45K', '50K']) 

price_data.update_yaxes(title_text = 'Number of Cars', 
                        tickvals = [0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000],
                        ticktext = [0, '1K', '2K', '3K', '4K', '5K', '6K', '7K', '8K', '9K', '10K'])

price_data.show()

NOTE: By using the line of code print(len(car_data[car_data['price'] <= 50000])) I found out that there are 36,248 cars with a price less than or equal to $50,000 being advertised. In contrast, using print(len(car_data[car_data['price'] > 50000])) I found out that only 171 of the cars cost more than $50,000. The y–axis ticks on the histogram go up by 1,000 each time, hence I think it is reasonable to exclude the remaining 171 as they would be difficult to show on the histogram anyways.

In [14]:
print(car_data['model_year'].describe())
# It's useful to know the summary statistics of the model years of the cars.

count    36419.000000
mean      2009.753425
std          6.265305
min       1908.000000
25%       2006.000000
50%       2011.000000
75%       2014.000000
max       2019.000000
Name: model_year, dtype: float64


In [15]:
model_year_data = px.histogram(car_data[car_data['model_year'] >= 2000], 
                               color_discrete_sequence = ['red'],
                               labels = {'model_year': 'Manufacture Year'},
                               opacity = 0.5,
                               title = 'Number of Advertised Cars That Were Manufactured In Or After 2000',
                               x = 'model_year') 

model_year_data.update_traces(marker_line_color = 'black', marker_line_width = 1.5)

model_year_data.update_xaxes(tickvals = [2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 
                                         2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019]) 

model_year_data.update_yaxes(title_text = 'Number of Cars', 
                             tickvals = [0, 250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500])

model_year_data.show()

NOTE: By using the line of code print(len(car_data[car_data['model_year'] >= 2000])) I found out that there are 34,554 cars that were manufactured on or after 2000 being advertised. In contrast, using print(len(car_data[car_data['model_year'] < 2000])) I found out that only 1,865 of the cars were manufactured before 2000. Regarding these 1,865 there is wide spread in their manufacture years, with the oldest car being from 1908. Due to this spread, it is reasonable to exclude them from the histogram.

In [16]:
print(car_data['odometer'].describe())
# It's useful to know the summary statistics of the number of miles driven by the cars.

count     36419.000000
mean     115323.632747
std       65068.650067
min           0.000000
25%       69854.000000
50%      113000.000000
75%      155000.000000
max      990000.000000
Name: odometer, dtype: float64


In [17]:
odometer_data = px.histogram(car_data[car_data['odometer'] > 0][car_data['odometer'] <= 300000], 
                             color_discrete_sequence = ['green'],
                             labels = {'odometer': 'Number of Miles Driven'},
                             nbins = 30, 
                             opacity = 0.5,
                             title = 'Number of Used Cars By Number of Miles Driven',
                             x = 'odometer') 

odometer_data.update_traces(marker_line_color = 'black', marker_line_width = 1.5)

odometer_data.update_xaxes(tickvals = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 
                                       170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000]) 

odometer_data.update_yaxes(title_text = 'Number of Used Cars', tickvals = [0, 250, 500, 750, 1000, 1250, 1500, 1750, 2000, 2250, 2500])

odometer_data.show()


Boolean Series key will be reindexed to match DataFrame index.



NOTE: By using the line of code print(len(car_data[car_data['odometer'] > 300000])) I found out that there are 237 cars that have been driven more than 300,000 miles. These are undoubtedly extreme outliers. The "run–of–the–mill" car can drive up to around 200,000 miles before breaking down, and higher quality cars can drive up to around 300,000 miles. Hence, I think it is reasonable to exclude those 237 cars from the histogram.

In [18]:
print(car_data['days_listed'].describe())
# It's useful to know the summary statistics of the number of days the cars have been listed.

count    36419.000000
mean        39.649798
std         28.119391
min          0.000000
25%         19.000000
50%         33.000000
75%         53.000000
max        271.000000
Name: days_listed, dtype: float64


In [19]:
days_listed_data = px.histogram(car_data[car_data['days_listed'] <= 150], 
                                color_discrete_sequence = ['gray'],
                                labels = {'days_listed': 'Number of Days Car Has Been Listed'},
                                nbins = 16,
                                opacity = 0.5,
                                title = 'Number of Cars By Number of Days Listed',
                                x = 'days_listed') 

days_listed_data.update_traces(marker_line_color = 'black', marker_line_width = 1.5)

days_listed_data.update_xaxes(tickvals = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]) 

days_listed_data.update_yaxes(title_text = 'Number of Cars',
                              tickvals = [500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000])

days_listed_data.show()

NOTE: By using the line of code print(len(car_data[car_data['days_listed'] > 150])) I found out that there are 167 cars that have been listed for more than 150 days. Since the ticks on the y–axis of the histogram go up by 500 each time, in seems reasonable to me to exclude those 167 from the histogram, as they would be difficult to show on it anyways.

In [20]:
miles_vs_price = px.scatter(car_data[car_data['odometer'] > 0][car_data['odometer'] <= 300000][car_data['price'] <= 50000], 
                            x = 'odometer', 
                            y = 'price', 
                            title = 'Number of Miles Driven By Used Cars vs Price Scatterplot',
                            trendline = 'ols',
                            trendline_color_override = 'orange')

miles_vs_price.update_xaxes(title_text = 'Number of Miles Driven',
                            tickvals = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 
                                        160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000]) 

miles_vs_price.update_yaxes(title_text = 'Price of Car')

miles_vs_price.show()


Boolean Series key will be reindexed to match DataFrame index.



In [21]:
miles_vs_days = px.scatter(car_data[car_data['odometer'] > 0][car_data['odometer'] <= 300000][car_data['days_listed'] <= 150], 
                           x = 'odometer', 
                           y = 'days_listed', 
                           title = 'Number of Miles Driven By Used Cars vs Days Listed Scatterplot')

miles_vs_days.update_xaxes(title_text = 'Number of Miles Driven',
                            tickvals = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 
                                        160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000]) 

miles_vs_days.update_yaxes(title_text = 'Number of Days Listed', tickvals = [0, 30, 60, 90, 120, 150])

miles_vs_days.show()


Boolean Series key will be reindexed to match DataFrame index.

