Project Car Sales

In [154]:
import pandas as pd
import streamlit as st 
import plotly.express as px 

# Load the vehicle listings from the CSV at the relative path below and
# print the first 20 rows as a quick look at the raw columns.
data_car_sales = pd.read_csv('../vehicles_us.csv')
preview_rows = data_car_sales.head(20)
print(preview_rows)

    price  model_year                     model  condition  cylinders fuel  \
0    9400      2011.0                    bmw x5       good        6.0  gas   
1   25500         NaN                ford f-150       good        6.0  gas   
2    5500      2013.0            hyundai sonata   like new        4.0  gas   
3    1500      2003.0                ford f-150       fair        8.0  gas   
4   14900      2017.0              chrysler 200  excellent        4.0  gas   
5   14990      2014.0              chrysler 300  excellent        6.0  gas   
6   12990      2015.0              toyota camry  excellent        4.0  gas   
7   15990      2013.0               honda pilot  excellent        6.0  gas   
8   11500      2012.0               kia sorento  excellent        4.0  gas   
9    9200      2008.0               honda pilot  excellent        NaN  gas   
10  19500      2011.0  chevrolet silverado 1500  excellent        8.0  gas   
11   8990      2012.0              honda accord  excellent      

In [155]:
# Summarise column dtypes and non-null counts to see which columns have
# missing values that need handling in the cells below.
data_car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


In [156]:
# `is_4wd` is loaded as float64 with missing entries. Treat a missing entry as
# "not four-wheel drive" and store the flag as an integer:
# 1 represents four-wheel drive, 0 represents the contrary.
is_4wd_filled = data_car_sales['is_4wd'].fillna(0)
data_car_sales['is_4wd'] = is_4wd_filled.astype(int)


In [157]:
# Replace missing car colours with the placeholder label 'unknown'.
paint_color_missing = data_car_sales['paint_color'].isna()
data_car_sales['paint_color'] = data_car_sales['paint_color'].mask(paint_color_missing, 'unknown')


In [158]:
# Fill missing cylinder counts with the median cylinder count of vehicles
# that share the same `type`.
cylinders_by_type = data_car_sales.groupby('type')['cylinders']
data_car_sales['cylinders'] = cylinders_by_type.transform(
    lambda group: group.fillna(group.median())
)
data_car_sales.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,unknown,1,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,unknown,0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,0,2019-04-02,28


In [159]:
# Fill missing model years with the median model year of vehicles that
# share the same `type`.
model_year_by_type = data_car_sales.groupby('type')['model_year']
data_car_sales['model_year'] = model_year_by_type.transform(
    lambda group: group.fillna(group.median())
)
data_car_sales.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,unknown,1,2018-06-23,19
1,25500,2011.0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,unknown,0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,0,2019-04-02,28


In [160]:
# Fill missing odometer readings with the median reading of vehicles from the
# same model year.
#
# A model year whose odometer values are ALL missing has a NaN median, so a
# group-only fill would silently leave those rows as NaN. Filling the original
# column (instead of replacing it with the transform result) also keeps every
# observed reading untouched. Any row still missing after the per-year fill
# falls back to the overall median of the observed readings.
odometer_year_median = data_car_sales.groupby('model_year')['odometer'].transform('median')
odometer_overall_median = data_car_sales['odometer'].median()
data_car_sales['odometer'] = (
    data_car_sales['odometer']
    .fillna(odometer_year_median)
    .fillna(odometer_overall_median)
)
data_car_sales.head()

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,unknown,1,2018-06-23,19
1,25500,2011.0,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,0,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,161397.0,automatic,pickup,unknown,0,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,0,2019-04-02,28


In [161]:
# Display the dataframe with Streamlit.
# Page header shown above the table.
st.header("Car Sales")
# Render the cleaned dataframe as a table in the Streamlit app.
st.dataframe(data_car_sales)



DeltaGenerator()

In [162]:
# Checkbox that controls whether the vehicle-type histogram below is split by
# condition (checked, the default) or drawn as plain counts.
color_by_condition = st.checkbox('Color by Condition', value=True)

# Histogram of odometer readings for cars whose condition is 'good' and whose
# odometer reading is at most 30000.
is_good_condition = data_car_sales['condition'] == 'good'
is_low_mileage = data_car_sales['odometer'] <= 30000
filtered_dcs = data_car_sales[is_good_condition & is_low_mileage]
fig_hist = px.histogram(
    filtered_dcs,
    x="odometer",
    nbins=5,
    title="Good Condition Cars with Low Mileage",
)
st.plotly_chart(fig_hist)

# Count of cars per vehicle type; the colouring and the title follow the
# checkbox state.
type_hist_options = (
    {'color': 'condition', 'title': "Count of Cars By Vehicle Type and Condition"}
    if color_by_condition
    else {'title': "Count of Cars By Vehicle Type"}
)
fig_hist2 = px.histogram(data_car_sales, x='type', **type_hist_options)
st.plotly_chart(fig_hist2)



DeltaGenerator()

In [163]:
# Histogram that shows the count of cars by their types.
# The title previously contained the typo "Tpye". With the title corrected,
# this figure is identical to the one the previous cell draws when
# "Color by Condition" is unchecked, so an explicit `key` is passed to keep the
# two charts distinct (recent Streamlit releases can reject two identical
# charts that have no distinguishing key).
fig_hist2 = px.histogram(data_car_sales, x='type', title="Count of Cars By Vehicle Type")
st.plotly_chart(fig_hist2, key="vehicle_type_count_hist")



DeltaGenerator()

In [164]:
# Scatter plot of odometer reading against model year, one colour per vehicle
# type; hovering a point also shows its model and paint colour.
hover_columns = ["model", "paint_color"]
fig_cmvy = px.scatter(
    data_car_sales,
    x="model_year",
    y="odometer",
    color="type",
    hover_data=hover_columns,
    title="Car Mileage vs Year",
)
st.plotly_chart(fig_cmvy)



DeltaGenerator()

In [165]:
# Relationship between condition and model year: a histogram of model years
# with the bars coloured by condition, shown under its own section header.
st.header('Histogram of Condition vs Model_Year')
fig_cmy = px.histogram(
    data_car_sales,
    x='model_year',
    color='condition',
)
st.plotly_chart(fig_cmy)



DeltaGenerator()

In [166]:
# Violin plot of mileage by make.
# The chart is titled "by Make", but it previously grouped by the full `model`
# string (e.g. "ford f-150"), producing one violin per model rather than per
# make. The make is derived here as the first whitespace-separated token of
# `model` -- this assumes every model string starts with the manufacturer name;
# verify against the dataset for multi-word makes.
# `assign` builds a temporary frame, so `data_car_sales` itself is not modified.
mileage_by_make = data_car_sales.assign(
    make=data_car_sales['model'].str.split(n=1).str[0]
)
fig_vmm = px.violin(mileage_by_make, x='make', y='odometer', title="Mileage Distribution by Make", box=True, points="all", color='make')

# Fixed figure size so the per-make violins stay legible.
fig_vmm.update_layout(width=900, height=600)

st.plotly_chart(fig_vmm)



DeltaGenerator()