In [54]:
import pandas as pd
import streamlit as sl
import plotly.express as px

# Read the raw vehicle listings into a DataFrame.
df = pd.read_csv("vehicles_us.csv")

# Show the column dtypes / non-null counts first, then the frame itself.
df.info()
display(df)

# Sanity-check the bottom of the price range.
lowest_price = df['price'].min()
print("The lowest sale price is", lowest_price)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
dtypes: float64(4), int64(2), object(7)
memory usage: 5.1+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013.0,nissan maxima,like new,6.0,gas,88136.0,automatic,sedan,black,,2018-10-03,37
51521,2700,2002.0,honda civic,salvage,4.0,gas,181500.0,automatic,sedan,white,,2018-11-14,22
51522,3950,2009.0,hyundai sonata,excellent,4.0,gas,128000.0,automatic,sedan,blue,,2018-11-15,32
51523,7455,2013.0,toyota corolla,good,4.0,gas,139573.0,automatic,sedan,black,,2018-07-02,71


The lowest sale price is 1


A sale price of $1.00 doesn't make any sense. Either the car was donated or that just means the data is missing.
I want to compare Condition vs Price and Model Year vs Price. Sales of $1 will not help my analysis, so I will filter out all rows containing a sale price of $1.

I'm going to change model_year to an integer type for less memory and replace missing values with 0.

I'm going to change odometer to an integer type for less memory. Also, I will change any missing values to -1. This makes more sense than putting 0: I wouldn't want anyone else to believe the odometer reads zero. I want it to be clear the value is missing.

There are missing values in is_4wd. A value of 1 should mean the car is indeed four-wheel drive. So instead of leaving the missing values, I will fill them with 0 to represent vehicles without four-wheel drive.

Likewise, I will replace missing paint_color values with "unknown".

I will convert date_posted to the datetime data type.

In [55]:
# Derive the manufacturer from the first word of the model name.
# The vectorised .str accessor replaces a per-row Python lambda and yields
# a missing value (instead of raising) if a model entry is empty or missing.
df['manufacturer'] = df['model'].str.split().str[0]

# model_year: replace missing values with 0, then cast to nullable integer.
df['model_year'] = df['model_year'].fillna(0).astype('Int64')

# odometer: replace missing values with -1 (a sentinel that cannot be
# mistaken for a genuine zero reading), then cast to nullable integer.
df['odometer'] = df['odometer'].fillna(-1).astype('Int64')

# is_4wd: replace missing values with 0 (treated as "not four-wheel drive"),
# then cast to nullable integer.
df['is_4wd'] = df['is_4wd'].fillna(0).astype('Int64')

# paint_color: label missing values as 'unknown'.
df['paint_color'] = df['paint_color'].fillna('unknown')

# date_posted: parse the YYYY-MM-DD strings into datetime values.
df['date_posted'] = pd.to_datetime(df['date_posted'], format='%Y-%m-%d')

# Drop the $1 listings; only rows priced above 1 are kept.
df = df.query("price > 1")

# Re-inspect dtypes / non-null counts and the cleaned frame.
df.info()
display(df)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 50727 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   price         50727 non-null  int64         
 1   model_year    50727 non-null  Int64         
 2   model         50727 non-null  object        
 3   condition     50727 non-null  object        
 4   cylinders     45538 non-null  float64       
 5   fuel          50727 non-null  object        
 6   odometer      50727 non-null  Int64         
 7   transmission  50727 non-null  object        
 8   type          50727 non-null  object        
 9   paint_color   50727 non-null  object        
 10  is_4wd        50727 non-null  Int64         
 11  date_posted   50727 non-null  datetime64[ns]
 12  days_listed   50727 non-null  int64         
 13  manufacturer  50727 non-null  object        
dtypes: Int64(3), datetime64[ns](1), float64(1), int64(2), object(7)
memory usage: 6.0+ MB


Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,manufacturer
0,9400,2011,bmw x5,good,6.0,gas,145000,automatic,SUV,unknown,1,2018-06-23,19,bmw
1,25500,0,ford f-150,good,6.0,gas,88705,automatic,pickup,white,1,2018-10-19,50,ford
2,5500,2013,hyundai sonata,like new,4.0,gas,110000,automatic,sedan,red,0,2019-02-07,79,hyundai
3,1500,2003,ford f-150,fair,8.0,gas,-1,automatic,pickup,unknown,0,2019-03-22,9,ford
4,14900,2017,chrysler 200,excellent,4.0,gas,80903,automatic,sedan,black,0,2019-04-02,28,chrysler
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51520,9249,2013,nissan maxima,like new,6.0,gas,88136,automatic,sedan,black,0,2018-10-03,37,nissan
51521,2700,2002,honda civic,salvage,4.0,gas,181500,automatic,sedan,white,0,2018-11-14,22,honda
51522,3950,2009,hyundai sonata,excellent,4.0,gas,128000,automatic,sedan,blue,0,2018-11-15,32,hyundai
51523,7455,2013,toyota corolla,good,4.0,gas,139573,automatic,sedan,black,0,2018-07-02,71,toyota


In [None]:
# Streamlit dashboard code, kept disabled by wrapping it in a triple-quoted
# string. Nothing inside the string is executed; because the string is the
# last expression of the cell, Jupyter simply echoes it as the cell output.
# NOTE(review): the disabled code calls `st.*`, but the import cell at the
# top of this notebook binds streamlit to the alias `sl` -- reconcile the
# alias before re-enabling this code.
'''
#distribution of vehicle type by the manufacturer
st.header('Vehicle types by manfacturer')
#create a plotly histogram figure
fig = px.histogram(df, x ='manufacturer', color = 'type')
#display the figure with streamlit
st.write(fig)

#histogram of condition vs model year
st.header("Histogram of `condition` vs `model_year`")
fig2 = px.histogram(df, x = 'model_year', color = 'condition')
st.write(fig2)

st.header('Compare price distribution between manufacturers')
# get a list of car manufacturers
manufac_list = sorted(df['manufacturer'].unique())
# get user's inputs from a dropdown menu
manufacturer_1 = st.selectbox(
                              label='Select manufacturer 1', # title of the select box
                              options=manufac_list, # options listed in the select box
                              index=manufac_list.index('chevrolet') # default pre-selected option
                              )
# repeat for the second dropdown menu
manufacturer_2 = st.selectbox(
                              label='Select manufacturer 2',
                              options=manufac_list, 
                              index=manufac_list.index('hyundai')
                              )
# filter the dataframe 
mask_filter = (df['manufacturer'] == manufacturer_1) | (df['manufacturer'] == manufacturer_2)
df_filtered = df[mask_filter]

# add a checkbox if a user wants to normalize the histogram
normalize = st.checkbox('Normalize histogram', value=True)
if normalize:
    histnorm = 'percent'
else:
    histnorm = None

# create a plotly histogram figure
fig3 = px.histogram(df_filtered,
                      x='price',
                      nbins=30,
                      color='manufacturer',
                      histnorm=histnorm,
                      barmode='overlay')
# display the figure with streamlit
st.write(fig3)'''

'\n#distribution of vehicle type by the manufacturer\nst.header(\'Vehicle types by manfacturer\')\n#create a plotly histogram figure\nfig = px.histogram(df, x =\'manufacturer\', color = \'type\')\n#display the figure with streamlit\nst.write(fig)\n\n#histogram of condition vs model year\nst.header("Histogram of `condition` vs `model_year`")\nfig2 = px.histogram(df, x = \'model_year\', color = \'condition\')\nst.write(fig2)\n\nst.header(\'Compare price distribution between manufacturers\')\n# get a list of car manufacturers\nmanufac_list = sorted(df[\'manufacturer\'].unique())\n# get user\'s inputs from a dropdown menu\nmanufacturer_1 = st.selectbox(\n                              label=\'Select manufacturer 1\', # title of the select box\n                              options=manufac_list, # options listed in the select box\n                              index=manufac_list.index(\'chevrolet\') # default pre-selected option\n                              )\n# repeat for the second dropd