In [39]:
import streamlit as st
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
# Load the vehicles CSV from the working directory; later cells read and mutate `df`.
df = pd.read_csv(filepath_or_buffer='./vehicles_us.csv')

In [40]:
# Analyzing Trends Across Vehicle Types, Models, and More
# In this project I take a dataset covering many vehicle types and models and analyze how its trends relate to other factors. These factors include the odometer reading, the model, the model year, the vehicle type, and the price, among others.

In [41]:
# For every row, the median 'model_year' among rows sharing its 'model'
# (transform keeps the result aligned to df's index).
median_years = df.groupby('model')['model_year'].transform('median')

# Replace missing 'model_year' entries with that per-model median.
df['model_year'] = df['model_year'].fillna(median_years)

In [42]:
# Missing 'cylinders': substitute the median cylinder count of the same 'model'.
cylinder_medians = df.groupby('model')['cylinders'].transform('median')
df['cylinders'] = df['cylinders'].fillna(cylinder_medians)

# Missing 'odometer': substitute the mean odometer of rows that share both
# 'model' and 'model_year' (a mean here, unlike the medians used above).
# NOTE(review): a group with no observed odometer value has a NaN mean, so its
# rows presumably stay NaN after this step — confirm that is acceptable.
odometer_means = df.groupby(['model', 'model_year'])['odometer'].transform('mean')
df['odometer'] = df['odometer'].fillna(odometer_means)

In [44]:
# Function to remove outliers based on the IQR method
def remove_outliers(df, column, multiplier=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - multiplier * IQR`` and ``Q3 + multiplier * IQR``,
    both inclusive, where Q1 and Q3 are the 25th and 75th percentiles of
    ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is only read; the result is a new selection.
    column : str
        Name of the numeric column the fences are computed from.
    multiplier : float, default 1.5
        Fence width in IQR units. The default reproduces the conventional
        1.5 * IQR rule; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``column`` value is within the fences.
        Rows where ``column`` is NaN are excluded, because NaN never
        compares as in-range.
    """
    q1, q3 = df[column].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr
    # between() is inclusive on both ends, i.e. lower <= value <= upper.
    return df[df[column].between(lower_bound, upper_bound)]

# Drop IQR outliers on 'model_year' first, then on 'price' among the rows that remain.
# NOTE(review): the plotting cells visible in this notebook read `df`, not
# `data_cleaned` — confirm whether they were meant to use the cleaned frame.
data_cleaned = (
    df
    .pipe(remove_outliers, 'model_year')
    .pipe(remove_outliers, 'price')
)

In [36]:
# Section title displayed above the chart
st.header("The Average Vehicle Price by Model Year")
# Mean price for each model year, returned as a flat two-column frame
avg_price_per_year = df.groupby('model_year', as_index=False)['price'].mean()
# Line chart relating a vehicle's model year to the average price for that year
fig = px.line(
    avg_price_per_year,
    x='model_year',
    y='price',
    labels={'model_year': 'Model Year', 'price': 'Average Price'},
)
fig.update_xaxes(tickangle=45)
st.plotly_chart(fig)



DeltaGenerator()

In [45]:
st.header("The Vehicle Condition by Model Year")
# Overlaid, density-normalised histograms of model year, one colour per condition (Plotly)
fig2 = px.histogram(
    data_frame=df,
    x='model_year',
    color='condition',
    histnorm='probability density',
    barmode='overlay',
)

st.plotly_chart(fig2)

# Distinct values of the 'model' column; one histogram trace is built per value
unique_models = df['model'].unique()

# Price histogram for each model. Only the trace whose model equals the first
# entry of unique_models starts out visible.
traces = [
    go.Histogram(
        x=df.loc[df['model'] == model_name, 'price'],
        name=model_name,
        opacity=0.75,
        histnorm='probability density',
        visible=(model_name == unique_models[0]),
    )
    for model_name in unique_models
]



In [None]:
# Assemble the per-model price histograms into a single figure
fig = go.Figure(data=traces)
# The traces and dropdown entries are built from the 'model' column, so the
# section title names models rather than manufacturers.
st.header("Price Distribution by Model")
# Add one dropdown entry per model. Selecting an entry applies a visibility
# mask that is True only at that model's position in unique_models, which
# hides every other trace.
fig.update_layout(
    xaxis_title='Price',
    yaxis_title='Density',
    barmode='overlay',
    updatemenus=[
        {
            'buttons': [
                {
                    'label': m,
                    'method': 'update',
                    'args': [{'visible': [m == model for model in unique_models]}],
                }
                for m in unique_models
            ],
            'direction': 'down',
            'showactive': True,
        }
    ],
)

# Show the figure using Streamlit
st.plotly_chart(fig)

In [None]:
# Scatter plot of price against odometer or days listed (Plotly)
st.header("The Odometer and Days Listed vs Price")

# Checkbox that switches the x-axis column from 'odometer' to 'days_listed'
use_days_listed = st.checkbox('Change x-axis to Days Listed')
x_axis = 'days_listed' if use_days_listed else 'odometer'

# Readable axis title: 'odometer' -> 'Odometer', 'days_listed' -> 'Days Listed'
# (str.capitalize() alone would render 'Days_listed').
x_axis_label = x_axis.replace('_', ' ').title()

# Build the figure once, for the selected column only; a figure created before
# the checkbox is read would just be discarded.
fig = px.scatter(
    df,
    x=x_axis,
    y='price',
    opacity=0.36,
    labels={x_axis: x_axis_label, 'price': 'Price'},
)

# The x-axis range depends on which column is plotted
if x_axis == 'odometer':
    fig.update_xaxes(range=[0, 500000])  # fixed window for 'odometer'
else:
    fig.update_xaxes(range=[0, df['days_listed'].max()])  # full span of 'days_listed'

# The y-axis range is the same for both x-axis choices
fig.update_yaxes(range=[0, 100000])

# Display the chart in Streamlit
st.plotly_chart(fig)