In [2]:
import streamlit as st
import pandas as pd
import plotly_express as px

# Load the vehicle listings; the path is relative to the notebook's working directory.
df = pd.read_csv('../vehicles_us.csv')

# Derive the manufacturer from the first whitespace-separated token of `model`
# (e.g. 'ford f-150' -> 'ford'). The vectorised `.str` accessor replaces the
# per-row lambda and yields NaN, rather than raising, for a blank or missing model.
df['manufacturer'] = df['model'].str.split().str[0]


In [3]:
# Preview the first five rows to sanity-check the columns, including the
# derived `manufacturer` column.
df.head(n=5)

Unnamed: 0,price,model_year,model,condition,cylinders,fuel,odometer,transmission,type,paint_color,is_4wd,date_posted,days_listed,manufacturer
0,9400,2011.0,bmw x5,good,6.0,gas,145000.0,automatic,SUV,,1.0,2018-06-23,19,bmw
1,25500,,ford f-150,good,6.0,gas,88705.0,automatic,pickup,white,1.0,2018-10-19,50,ford
2,5500,2013.0,hyundai sonata,like new,4.0,gas,110000.0,automatic,sedan,red,,2019-02-07,79,hyundai
3,1500,2003.0,ford f-150,fair,8.0,gas,,automatic,pickup,,,2019-03-22,9,ford
4,14900,2017.0,chrysler 200,excellent,4.0,gas,80903.0,automatic,sedan,black,,2019-04-02,28,chrysler


In [4]:
# Column-by-column summary: dtype and non-null count for every column.
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51525 entries, 0 to 51524
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   price         51525 non-null  int64  
 1   model_year    47906 non-null  float64
 2   model         51525 non-null  object 
 3   condition     51525 non-null  object 
 4   cylinders     46265 non-null  float64
 5   fuel          51525 non-null  object 
 6   odometer      43633 non-null  float64
 7   transmission  51525 non-null  object 
 8   type          51525 non-null  object 
 9   paint_color   42258 non-null  object 
 10  is_4wd        25572 non-null  float64
 11  date_posted   51525 non-null  object 
 12  days_listed   51525 non-null  int64  
 13  manufacturer  51525 non-null  object 
dtypes: float64(4), int64(2), object(8)
memory usage: 5.5+ MB


In [5]:
# Total number of NaN cells across every column of the frame
# (per-column counts first, then summed into one figure).
missing_cells = df.isna().sum().sum()
print("Number of missing values", missing_cells)

Number of missing values 51991


In [6]:
# Keep only rows that have a value in every column.
# NOTE(review): this shrinks the frame from 51525 rows to 14852. `is_4wd` has
# only 25572 non-null entries, so it accounts for most of the dropped rows.
# If NaN in `is_4wd` actually means "not 4WD", filling it with 0 before
# dropping would retain far more data -- TODO confirm with the data owner.
df = df.dropna(axis=0, how='any')

In [7]:
# Re-count NaN cells after the drop; the expected total is now zero.
remaining_missing = df.isna().sum().sum()
print("Number of missing values", remaining_missing)

Number of missing values 0


In [8]:
# Count rows that exactly repeat an earlier row across all columns
# (the first occurrence of each row is not counted as a duplicate).
df.duplicated(keep='first').sum()

0

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) for the odometer
# readings; used to pick sensible axis limits for the plots below.
df.loc[:, 'odometer'].describe()

count     14852.000000
mean     117713.691287
std       65610.650451
min           0.000000
25%       71597.500000
50%      115665.500000
75%      158378.000000
max      866000.000000
Name: odometer, dtype: float64

In [37]:
# Summary statistics for the listing price; the maximum is far above the
# 75th percentile, so the price axis needs an outlier toggle.
df.loc[:, 'price'].describe()

count     14852.000000
mean      15446.921155
std       11393.676296
min           1.000000
25%        6989.500000
50%       13500.000000
75%       21988.000000
max      375000.000000
Name: price, dtype: float64

In [40]:
# Scatter plot of listing price (x) against odometer reading (y), coloured by
# vehicle condition, with a checkbox that widens the axes to include outliers.

st.header('Distribution of Prices by Odometer Miles')

# Colour per condition. The insertion order (best to worst condition) is also
# reused as the legend/category order, so the two can never drift apart.
condition_colors = {
    "new": "green",
    "like new": "blue",
    "excellent": "orange",
    "good": "yellow",
    "fair": "red",
    "salvage": "black",
}

fig_scatter_new = px.scatter(
    df,
    x='price',
    y='odometer',
    color='condition',
    color_discrete_map=condition_colors,
    category_orders={'condition': list(condition_colors)},
)

# The explicit key keeps this widget distinct from the other 'Show Outliers'
# checkbox in the app; two checkboxes with identical label and default and no
# key make Streamlit raise a duplicate-widget error.
show_outliers_scatter = st.checkbox(
    'Show Outliers', value=False, key='show_outliers_scatter'
)

if show_outliers_scatter:
    # Fit the axes to the data (plus 5% headroom) so that no point is clipped;
    # fixed caps would hide the largest prices and odometer readings.
    price_range = [0, float(df['price'].max()) * 1.05]
    odometer_range = [0, float(df['odometer'].max()) * 1.05]
else:
    # Default view: zoom in on the bulk of the listings.
    price_range = [0, 100000]
    odometer_range = [0, 350000]

fig_scatter_new.update_layout(
    xaxis_title='Price',
    yaxis_title='Odometer',
    xaxis_range=price_range,
    yaxis_range=odometer_range,
)

st.plotly_chart(fig_scatter_new)

In [42]:
# Histogram of listing prices, stacked by vehicle condition, with checkboxes
# for the marginal rug plot and for widening the price axis to include outliers.

st.header('Distribution of Prices by Vehicle Condition')

# Read the rug toggle before building the figure so that it actually controls
# the marginal plot. It defaults to True because the rug was previously always
# drawn, so the initial render is unchanged.
show_rug = st.checkbox('Show Rug Plot', value=True)

# Colour per condition. The insertion order (best to worst condition) is also
# reused as the legend/category order, so the two can never drift apart.
condition_colors = {
    "new": "green",
    "like new": "blue",
    "excellent": "orange",
    "good": "yellow",
    "fair": "red",
    "salvage": "black",
}

fig_histo_new = px.histogram(
    df,
    x='price',
    color='condition',
    marginal='rug' if show_rug else None,
    color_discrete_map=condition_colors,
    category_orders={'condition': list(condition_colors)},
)

# The explicit key keeps this widget distinct from the other 'Show Outliers'
# checkbox in the app; two checkboxes with identical label and default and no
# key make Streamlit raise a duplicate-widget error.
show_outliers_histo = st.checkbox(
    'Show Outliers', value=False, key='show_outliers_histo'
)

if show_outliers_histo:
    # Fit the axis to the data (plus 5% headroom) so the highest-priced
    # listings are not clipped by a fixed cap.
    price_range = [0, float(df['price'].max()) * 1.05]
else:
    # Default view: zoom in on the bulk of the listings.
    price_range = [0, 100000]

fig_histo_new.update_layout(
    xaxis_title='Price',
    yaxis_title='Count',
    xaxis_range=price_range,
)

st.plotly_chart(fig_histo_new)


DeltaGenerator(_root_container=0, _provided_cursor=None, _parent=None, _block_type=None, _form_data=None)