In [None]:
import pandas as pd
import streamlit as st
import pandas as pd
import plotly.express as px

This notebook explores and preprocesses the `vehicles_us.csv` dataset, focusing on identifying patterns and trends in vehicle advertisements. The goal is to clean and prepare the data for building a web application.

In [2]:
# Load the raw vehicle listings from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="vehicles_us.csv")

In [31]:
# Plain-text preview of the first ten listings.
print(df.iloc[:10])

   price  model_year           model  condition  cylinders fuel  odometer  \
0   9400      2011.0          bmw x5       good        6.0  gas  145000.0   
1  25500      2011.0      ford f-150       good        6.0  gas   88705.0   
2   5500      2013.0  hyundai sonata   like new        4.0  gas  110000.0   
3   1500      2003.0      ford f-150       fair        8.0  gas       0.0   
4  14900      2017.0    chrysler 200  excellent        4.0  gas   80903.0   
5  14990      2014.0    chrysler 300  excellent        6.0  gas   57954.0   
6  12990      2015.0    toyota camry  excellent        4.0  gas   79212.0   
7  15990      2013.0     honda pilot  excellent        6.0  gas  109473.0   
8  11500      2012.0     kia sorento  excellent        4.0  gas  104174.0   
9   9200      2008.0     honda pilot  excellent        6.0  gas  147191.0   

  transmission    type paint_color  is_4wd date_posted  days_listed  
0    automatic     SUV     Unknown     1.0  2018-06-23           19  
1    automat

In [37]:
# Fill simple gaps with placeholder values. The original mapping also carried
# a "year" key, but the frame has no such column (the year lives in
# `model_year`, handled below); pandas silently ignores unknown keys, so that
# entry did nothing and has been dropped. Reassigning instead of using
# inplace=True keeps the cell safe to re-run.
df = df.fillna({"price": 0, "condition": "unknown", "odometer": 0, "model": "unknown"})

# Impute per-model numeric attributes with the median of the same model.
# A model whose rows are all missing the value has a NaN group median, so
# fall back to the column-wide median for those rows.
# NOTE(review): `cylinders` was not filled by the original cell even though
# the null report further down lists 0 missing cylinders; it is imputed here
# so a fresh Restart-and-Run-All reproduces that report. This is a no-op if
# the raw column has no gaps -- confirm against the raw CSV.
for col in ("model_year", "cylinders"):
    per_model_median = df.groupby("model")[col].transform("median")
    df[col] = df[col].fillna(per_model_median).fillna(df[col].median())

# A missing four-wheel-drive flag is treated as "not 4WD".
df["is_4wd"] = df["is_4wd"].fillna(0)

# Listings without a colour get an explicit placeholder category.
df["paint_color"] = df["paint_color"].fillna("Unknown")
 

In [34]:

# Report how many fully duplicated rows exist before removing them.
print(f"Number of duplicate rows: {df.duplicated().sum()}")

# Keep the first occurrence of each duplicated row (the pandas default).
df = df.drop_duplicates(keep="first")

# Per-column count of remaining missing values, printed under its heading.
print("Missing values by column:", df.isna().sum(), sep="\n")

Number of duplicate rows: 0
Missing values by column:
price           0
model_year      0
model           0
condition       0
cylinders       0
fuel            0
odometer        0
transmission    0
type            0
paint_color     0
is_4wd          0
date_posted     0
days_listed     0
dtype: int64


In [30]:
# Page-level heading for the Streamlit dashboard.
st.header(body="Car Price and Condition Dashboard")



DeltaGenerator()

In [18]:
# Optional filter: when the box is ticked, keep only listings priced above
# 5000. Note that this rebinds `df`, so every later cell sees the filtered
# frame.
if st.checkbox("Show only cars with price > $5,000"):
    df = df.loc[df["price"].gt(5000)]



In [19]:
# Section heading, then a 30-bin histogram of listing prices.
st.subheader("Histogram: Distribution of Car Prices")
fig_hist = px.histogram(
    data_frame=df,
    x="price",
    nbins=30,
    title="Car Price Distribution",
)
st.plotly_chart(fig_hist)



DeltaGenerator()

In [33]:
# Section heading, then price against mileage with one colour per condition.
st.subheader("Scatter Plot: Price vs Odometer")
fig_scatter = px.scatter(
    data_frame=df,
    x="odometer",
    y="price",
    color="condition",
    # Extra columns shown in the tooltip when hovering over a point.
    hover_data=["model_year", "model"],
    labels={"odometer": "Odometer (miles)", "price": "Price (USD)"},
    title="Price vs Odometer by Condition",
)
st.plotly_chart(fig_scatter)



DeltaGenerator()

In [15]:
# Optional preview: show the first five rows of the current frame on demand.
if st.checkbox("Show Raw Dataset"):
    st.write("Dataset Preview", df.head(n=5))




- Newer vehicles and those in better condition are priced higher.
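- Missing values were filled using simple assumptions: `model_year` with the median year of the same model, `is_4wd` with 0 (not four-wheel drive), `paint_color` with "Unknown", and the remaining gaps with placeholder values.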
- The dataset is now clean and ready for use in the Streamlit app.