##  Importing Necessary Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.graph_objects as g
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.express as px
import matplotlib.cm as cm
from scipy.stats import gaussian_kde
%matplotlib inline
sns.set()


In [None]:
df = pd.read_csv(r"cleaned_car_prices.csv")
df.head()

## Exploratory Data Analysis

### Univariate Analysis

#### Distributions

In [None]:
# Side-by-side histograms of the two main numeric columns.
# Each entry: (dataframe column, trace name, bar colour); list order = subplot column order.
histogram_specs = [
    ('sell_price', 'Selling Price', 'blue'),
    ('odometer', 'Odometer Readings', 'green'),
]

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Distribution of Selling Price", "Distribution of Odometer Readings"),
)

for subplot_col, (column_name, trace_name, bar_color) in enumerate(histogram_specs, start=1):
    fig.add_trace(
        go.Histogram(x=df[column_name], nbinsx=30, name=trace_name, marker_color=bar_color),
        row=1,
        col=subplot_col,
    )

# xaxis/yaxis belong to the left panel, xaxis2/yaxis2 to the right panel.
fig.update_layout(**{
    'title_text': "Distribution of Selling Price and Odometer Readings",
    'xaxis_title': "Selling Price",
    'yaxis_title': "Frequency",
    'xaxis2_title': "Odometer (Miles)",
    'yaxis2_title': "Frequency",
    'showlegend': False,
    'template': "plotly_white",
})

fig.show()

 Most cars are sold between **10k and 30k** and have been driven between **20k and 60k miles**.

In [None]:

# Number of listings per condition rating, one colour per rating.
condition_hist_options = dict(
    x='condition',
    color='condition',
    title='Distribution of Car Condition',
    labels={'condition': 'Condition', 'count': 'Frequency'},
    color_discrete_sequence=px.colors.sequential.Plasma,
    text_auto=True,
)
fig = px.histogram(df, **condition_hist_options)

# Keyword order is kept as-is: later title assignments are applied after the font sizes.
condition_layout = {
    'title_font_size': 16,
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
    'xaxis_title': 'Condition',
    'yaxis_title': 'Frequency',
    'showlegend': False,
    'template': "plotly_white",
    'bargap': 0.1,
}
fig.update_layout(**condition_layout)

# Light dotted horizontal grid lines to make bar heights easier to read.
fig.update_yaxes(
    showgrid=True,
    gridwidth=1,
    gridcolor='lightgray',
    griddash='dot',
)

fig.show()

 Most cars are in condition **3**, followed by conditions **5** and **2**, indicating that the majority of cars are in good to excellent condition.


In [None]:
# Ten most frequent models, as a two-column frame: model name and number of listings.
model_frequencies = df['model'].value_counts()
top_10_models = (
    model_frequencies
    .nlargest(10)
    .rename_axis('model')
    .reset_index(name='count')
)

# Horizontal bars, coloured and labelled by the listing count.
fig = px.bar(
    top_10_models,
    x='count',
    y='model',
    orientation='h',
    title='Top 10 Car Models',
    labels={'count': 'Count', 'model': 'Model'},
    color='count',
    color_continuous_scale='RdBu',
    text='count',
)

fig.update_layout(**{
    'title_font_size': 16,
    'title_font_weight': 'bold',
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
    'xaxis_title': 'Count',
    'yaxis_title': 'Model',
    'template': "plotly_white",
    'showlegend': False,
})

# Same tick label size on both axes.
for update_axes in (fig.update_xaxes, fig.update_yaxes):
    update_axes(tickfont={'size': 12})

fig.show()

 The **ALTIMA** is the most popular model with **26,513** occurrences, followed by **FUSION** and **F-150**.

In [None]:
# Ten most frequent trims, as a two-column frame: trim name and number of listings.
trim_frequencies = df['trim'].value_counts()
top_10_trims = (
    trim_frequencies
    .nlargest(10)
    .rename_axis('trim')
    .reset_index(name='count')
)

# Horizontal bars with one qualitative colour per trim; counts are printed on the bars.
trim_bar_options = dict(
    x='count',
    y='trim',
    orientation='h',
    title='Top 10 Car Trims',
    labels={'count': 'Count', 'trim': 'Trim'},
    color='trim',
    color_discrete_sequence=px.colors.qualitative.Set2,
    text_auto=True,
)
fig = px.bar(top_10_trims, **trim_bar_options)

fig.update_layout(**{
    'title_font_size': 16,
    'title_font_weight': 'bold',
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
    'xaxis_title': 'Count',
    'yaxis_title': 'Trim',
    'template': "plotly_white",
    'showlegend': False,
    'bargap': 0.2,
})

# Identical tick/title font settings for the x axis first, then the y axis.
axis_fonts = {'tickfont': {'size': 12}, 'title_font': {'size': 14}}
fig.update_xaxes(**axis_fonts)
fig.update_yaxes(**axis_fonts)

fig.show()

 The **BASE** trim is the most common with **56,046** occurrences, followed by **SE** and **LX**.

In [None]:
# Ten most frequent body types, as a two-column frame: body type and number of listings.
body_frequencies = df['body'].value_counts()
top_10_body = (
    body_frequencies
    .nlargest(10)
    .rename_axis('body')
    .reset_index(name='count')
)

# Horizontal bars on a continuous Viridis scale driven by the count.
body_bar_options = dict(
    x='count',
    y='body',
    orientation='h',
    title='Top 10 Car Body Types',
    labels={'count': 'Count', 'body': 'Body Type'},
    color='count',
    color_continuous_scale='Viridis',
    text_auto=True,
)
fig = px.bar(top_10_body, **body_bar_options)

fig.update_layout(**{
    'title_font_size': 16,
    'title_font_weight': 'bold',
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
    'xaxis_title': 'Count',
    'yaxis_title': 'Body Type',
    'template': "plotly_white",
    'showlegend': False,
    'bargap': 0.2,
})

# Same tick label size on both axes.
for update_axes in (fig.update_xaxes, fig.update_yaxes):
    update_axes(tickfont={'size': 12})

fig.show()


**SEDAN** is the most common body type with **122,597** occurrences, followed by **SUV** and **HATCHBACK**.

In [None]:
# Three most frequent trims; the 'Sale Amount' column holds the number of listings per trim.
top_3_trims = (
    df['trim']
    .value_counts()
    .nlargest(3)
    .rename_axis('Trim')
    .reset_index(name='Sale Amount')
)

# Vertical bars shaded by the listing count.
fig = px.bar(
    top_3_trims,
    x='Trim',
    y='Sale Amount',
    text='Sale Amount',
    title='Top 3 Trim Sales',
    color='Sale Amount',
    color_continuous_scale='Blues',
)

# Print the value above each bar and outline the bars.
bar_outline = {'line': {'color': 'darkblue', 'width': 1}}
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    marker=bar_outline,
)

fig.update_layout(
    title_font={'size': 18, 'color': 'darkblue', 'family': 'Arial'},
    xaxis_title='Trim',
    yaxis_title='Sale Amount',
    xaxis={'tickfont': {'size': 12}},
    yaxis={'tickfont': {'size': 12}},
    width=800,
    height=500,
    template='plotly_white',
)

fig.show()


**BASE** has the highest number of sales, followed by **SE** and **LX**.

#### Some Questions And Insights:

##### What is the shape of cars on the market?

In [None]:



# Five most frequent body types, as a two-column frame: body type and number of listings.
top_5_body = (
    df['body']
    .value_counts()
    .nlargest(5)
    .rename_axis('body')
    .reset_index(name='count')
)

# Vertical bars on a continuous viridis scale driven by the count.
top_body_bar_options = dict(
    x='body',
    y='count',
    title='Top 5 Most Common Body Types',
    labels={'body': 'Body Type', 'count': 'Count'},
    color='count',
    color_continuous_scale='viridis',
    text_auto=True,
)
fig = px.bar(top_5_body, **top_body_bar_options)

fig.update_layout(**{
    'title_font_size': 16,
    'xaxis_title_font_size': 14,
    'yaxis_title_font_size': 14,
    'xaxis_title': 'Body Type',
    'yaxis_title': 'Count',
    'template': "plotly_white",
    'bargap': 0.2,
})

# Slanted category labels on the x axis; plain ticks on the y axis.
fig.update_xaxes(tickfont={'size': 12}, tickangle=45)
fig.update_yaxes(tickfont={'size': 12})

fig.show()

The distribution of car bodies indicates a strong preference for sedans and SUVs in the market.

##### What are the days and months in which the sale is most frequent?

In [None]:
# Number of sales per calendar month and per day of week, ordered by the index value.
sales_by_month = df['sale_month'].value_counts().sort_index()
sales_by_dayofweek = df['sale_dayofweek'].value_counts().sort_index()

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Number of Sales by Month", "Number of Sales by Day of Week"),
)

# One entry per panel, left to right.
sales_panels = [
    {
        'counts': sales_by_month,
        'colors': px.colors.sequential.Viridis,
        'name': 'Sales by Month',
        'x_title': "Month",
    },
    {
        'counts': sales_by_dayofweek,
        'colors': px.colors.sequential.Plasma,
        'name': 'Sales by Day of Week',
        'x_title': "Day of Week",
    },
]

for panel_col, panel in enumerate(sales_panels, start=1):
    fig.add_trace(
        go.Bar(
            x=panel['counts'].index,
            y=panel['counts'].values,
            marker_color=panel['colors'],
            name=panel['name'],
        ),
        row=1,
        col=panel_col,
    )

fig.update_layout(**{
    'title_text': "Sales Analysis",
    'title_font_size': 20,
    'title_font_weight': 'bold',
    'showlegend': False,
    'template': "plotly_white",
    'width': 1000,
    'height': 500,
})

# Axis titles: the x title differs per panel, the y title is shared.
for panel_col, panel in enumerate(sales_panels, start=1):
    fig.update_xaxes(
        title_text=panel['x_title'],
        title_font_size=14,
        tickfont_size=12,
        row=1,
        col=panel_col,
    )
    fig.update_yaxes(
        title_text="Number of Sales",
        title_font_size=14,
        tickfont_size=12,
        row=1,
        col=panel_col,
    )

fig.show()

Tuesday is the busiest day for sales, and February is the best-selling month in the data.

##### What year is the best-selling car?

In [None]:
# Histogram of listings by the car's model year; bar heights are row counts.
fig = px.histogram(
    df,
    x='year',
    title='Number of Cars Sold by Year',
    labels={'year': 'Year', 'count': 'Number of Sales'},
    color_discrete_sequence=px.colors.sequential.Viridis,
    text_auto=True
)

fig.update_layout(
    title={
        'text': 'Number of Cars Sold by Year',
        'font': {'size': 18, 'weight': 'bold'}
    },
    xaxis_title='Year',
    yaxis_title='Number of Sales',
    xaxis=dict(
        tickangle=-45,
        tickfont={'size': 12},
        title_font={'size': 14}
    ),
    yaxis=dict(
        tickfont={'size': 12},
        title_font={'size': 14},
        showgrid=True,
        gridcolor='lightgray',
        griddash='dot'
    ),
    plot_bgcolor='white',
    bargap=0.1
)

# Histogram traces expose the binned count as `value`; this trace never sets
# `text`, so the previous '%{text:,}' template had nothing to format.
# '%{value:,}' prints the count with thousands separators.
fig.update_traces(
    texttemplate='%{value:,}',
    textposition='outside'
)

fig.show()

2012 cars are the best sellers

##### What are the best-selling brands on the market?

In [None]:
# Ten most frequent makes, as a two-column frame: make and number of listings.
make_frequencies = df['make'].value_counts()
top_makes = (
    make_frequencies
    .head(10)
    .rename_axis('make')
    .reset_index(name='count')
)

top_makes_bar_options = dict(
    x='make',
    y='count',
    title='Top 10 Car Makes by Sales Count',
    labels={'make': 'Car Make', 'count': 'Number of Cars Sold'},
    color='count',
    color_continuous_scale='Viridis',
    text_auto=True,
)
fig = px.bar(top_makes, **top_makes_bar_options)

# Axis styling: slanted make names, dotted translucent horizontal grid.
make_axis = {
    'tickangle': -45,
    'tickfont': {'size': 12},
    'title_font': {'size': 14},
}
count_axis = {
    'tickfont': {'size': 12},
    'title_font': {'size': 14},
    'showgrid': True,
    'gridcolor': 'rgba(0,0,0,0.2)',
    'griddash': 'dot',
}

fig.update_layout(
    title={
        'text': 'Top 10 Car Makes by Sales Count',
        'font': {'size': 16, 'weight': 'bold'},
    },
    xaxis_title='Car Make',
    yaxis_title='Number of Cars Sold',
    xaxis=make_axis,
    yaxis=count_axis,
    plot_bgcolor='white',
    width=1200,
    height=700,
)

# Thousands-separated count above each bar, with a thin black outline.
fig.update_traces(
    texttemplate='%{y:,}',
    textposition='outside',
    marker_line_color='black',
    marker_line_width=0.5,
)

fig.show()


Ford, Chevrolet, and then Nissan are the brands most in demand.

##### What is the average price in the market?

In [None]:
# Histogram of selling prices with a kernel density estimate (KDE) overlaid
# on the same count axis.

# Number of equal-width bins; the resulting bin width is also the factor that
# converts the KDE (a probability density) into expected counts per bin.
N_PRICE_BINS = 30

prices = df['sell_price'].dropna()
price_min, price_max = prices.min(), prices.max()
bin_width = (price_max - price_min) / N_PRICE_BINS

fig = px.histogram(
    df,
    x='sell_price',
    nbins=N_PRICE_BINS,
    title='Distribution of Selling Price',
    labels={'sell_price': 'Selling Price'},
    color_discrete_sequence=['lightblue'],
    opacity=0.8,
    marginal=None
)

# `nbins` is only an upper bound for plotly's automatic binning, so pin the
# bin origin and size explicitly; otherwise the bin width used for the KDE
# scaling below would not match what is drawn.
fig.update_traces(
    xbins=dict(start=price_min, size=bin_width),
    selector=dict(type='histogram')
)

kde = gaussian_kde(prices)
x_range = np.linspace(price_min, price_max, 1000)
# density * N * bin_width = expected count per histogram bin. The previous
# code multiplied by the spacing of `x_range` (1/999 of the range) instead of
# the bin width, which drew the curve far below the bars, and used a row
# count that included missing prices.
kde_values = kde(x_range) * len(prices) * bin_width

fig.add_trace(
    go.Scatter(
        x=x_range,
        y=kde_values,
        mode='lines',
        line=dict(color='darkblue', width=2),
        name='KDE'
    )
)

fig.update_layout(
    title={
        'text': 'Distribution of Selling Price',
        'font': {'size': 16}
    },
    xaxis_title='Selling Price',
    yaxis_title='Frequency',
    xaxis=dict(
        title_font={'size': 14},
        tickfont={'size': 12},
        showgrid=False
    ),
    yaxis=dict(
        title_font={'size': 14},
        tickfont={'size': 12},
        showgrid=True,
        gridcolor='lightgray',
        griddash='dot',
        gridwidth=1
    ),
    bargap=0.05,
    plot_bgcolor='white',
    hovermode='x unified'
)

# Outline only the histogram bars (the KDE line is a scatter trace).
fig.update_traces(
    marker=dict(line=dict(color='black', width=1)),
    selector=dict(type='histogram')
)

fig.show()

Most of the selling prices fall in the range of 10,000 to 15,000.

##### Are most cars on the market automatic?

In [None]:
# Number of listings per transmission type.
transmission_counts = (
    df['transmission']
    .value_counts()
    .rename_axis('transmission')
    .reset_index(name='count')
)

# Donut chart of the transmission shares.
transmission_pie_options = dict(
    names='transmission',
    values='count',
    title='Distribution of Transmission Types',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hole=0.3,
)
fig = px.pie(transmission_counts, **transmission_pie_options)

# Label each slice with its share and name; thin black slice borders.
fig.update_traces(
    textinfo='percent+label',
    textfont_size=14,
    marker={'line': {'color': '#000000', 'width': 1}},
)

# Text placed at the centre of the figure: total number of rows in df.
total_cars_annotation = {
    'text': f"Total Cars: {len(df):,}",
    'x': 0.5,
    'y': 0.5,
    'font_size': 18,
    'showarrow': False,
}

fig.update_layout(
    title={
        'text': "Are Most Cars Automatic?",
        'font': {'size': 20, 'family': 'Arial', 'color': 'darkblue'},
        'x': 0.5,
    },
    legend={'font': {'size': 12}},
    annotations=[total_cars_annotation],
)

fig.show()

96.9% Automatic and 3.1% Manual

In [None]:
# Upper-case state / territory name -> two-letter postal code (alphabetical order).
state_abbreviations = {
    'ALABAMA': 'AL', 'ARIZONA': 'AZ', 'CALIFORNIA': 'CA', 'COLORADO': 'CO',
    'FLORIDA': 'FL', 'GEORGIA': 'GA', 'HAWAII': 'HI', 'ILLINOIS': 'IL',
    'INDIANA': 'IN', 'LOUISIANA': 'LA', 'MARYLAND': 'MD', 'MASSACHUSETTS': 'MA',
    'MICHIGAN': 'MI', 'MINNESOTA': 'MN', 'MISSISSIPPI': 'MS', 'MISSOURI': 'MO',
    'NEBRASKA': 'NE', 'NEVADA': 'NV', 'NEW JERSEY': 'NJ', 'NEW MEXICO': 'NM',
    'NEW YORK': 'NY', 'NORTH CAROLINA': 'NC', 'OHIO': 'OH', 'OKLAHOMA': 'OK',
    'OREGON': 'OR', 'PENNSYLVANIA': 'PA', 'PUERTO RICO': 'PR', 'SOUTH CAROLINA': 'SC',
    'TENNESSEE': 'TN', 'TEXAS': 'TX', 'UTAH': 'UT', 'VIRGINIA': 'VA',
    'WASHINGTON': 'WA', 'WISCONSIN': 'WI',
}

# Upper-case the state column, then swap known names for their codes;
# values without an entry in the mapping are left as they are.
state_upper = df['state'].str.upper()
df['state_abbr'] = state_upper.replace(state_abbreviations)

In [None]:
# Number of listings per state code, as a two-column frame: state and count.
state_counts = (
    df['state_abbr']
    .value_counts()
    .rename_axis('state')
    .reset_index(name='count')
)

In [None]:


# US map shaded by the number of listings per state code.
choropleth_options = dict(
    locations='state',
    locationmode='USA-states',
    color='count',
    scope="usa",
    color_continuous_scale="Blues",
    title="Distribution of Car Sales Across US States 🚗",
    labels={'count': 'Number of Cars Sold'},
)
fig = px.choropleth(state_counts, **choropleth_options)

# The layout title below replaces the one passed to px.choropleth.
map_title = {
    'text': "Car Sales Distribution by State",
    'font': {'size': 20, 'color': 'navy'},
    'x': 0.5,
}
map_colors = {
    'landcolor': 'lightgray',
    'lakecolor': 'white',
    'bgcolor': 'white',
}
fig.update_layout(
    title=map_title,
    geo=map_colors,
    margin=dict(r=0, t=40, l=0, b=0),
)

# Hover box: state code in bold, then the thousands-separated count.
fig.update_traces(hovertemplate="<b>%{location}</b><br>Sales: %{z:,}")

fig.show()

Most sales are made in Florida and then California.

### Bivariate Analysis

#### Some Visualizations to understand the data in a better way

In [None]:
# Mean selling price by odometer reading, at two resolutions:
#   - floored into 5000-mile buckets (left panel)
#   - per distinct odometer value (right panel)
newdata = df[['sell_price', 'odometer']].copy()
newdata['odometer_group'] = (newdata['odometer'] // 5000) * 5000
avg_price_by_group = (
    newdata
    .groupby('odometer_group')['sell_price']
    .mean()
    .reset_index()
)
avg_price_by_mileage = (
    df
    .groupby('odometer')['sell_price']
    .mean()
    .reset_index()
)

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=(
        "Average Selling Price vs Odometer (Rounded to Nearest 5000 Miles)",
        "Average Selling Price by Odometer",
    ),
)

bucketed_trace = go.Scatter(
    x=avg_price_by_group['odometer_group'],
    y=avg_price_by_group['sell_price'],
    mode='lines+markers',
    marker={'color': 'green', 'size': 8},
    line={'color': 'green', 'width': 2},
    name='Grouped by 5000 Miles',
)
per_value_trace = go.Scatter(
    x=avg_price_by_mileage['odometer'],
    y=avg_price_by_mileage['sell_price'],
    mode='markers',
    marker={'color': 'blue', 'size': 4, 'opacity': 0.5},
    name='Raw Data',
)
fig.add_trace(bucketed_trace, row=1, col=1)
fig.add_trace(per_value_trace, row=1, col=2)

fig.update_layout(
    title_text="Relationship Between Odometer and Selling Price",
    title_font={'size': 24, 'color': 'navy'},
    showlegend=False,
    template="plotly_white",
    width=1200,
    height=500,
)

# Both panels share the same axis titles and fonts.
for panel_col in (1, 2):
    fig.update_xaxes(
        title_text="Odometer (Miles)",
        title_font={'size': 14},
        tickfont={'size': 12},
        row=1,
        col=panel_col,
    )
    fig.update_yaxes(
        title_text="Average Selling Price ($)",
        title_font={'size': 14},
        tickfont={'size': 12},
        row=1,
        col=panel_col,
    )

fig.show()

The more miles a car has traveled, the lower its value in the market.

In [None]:

# Selling price against the `mmr` column, with rows ordered by mmr so the
# line runs left to right.
df_sorted = df.sort_values('mmr')

mmr_line_options = dict(
    x='mmr',
    y='sell_price',
    title='Selling Price vs MMR',
    labels={'mmr': 'MMR', 'sell_price': 'Selling Price ($)'},
)
fig = px.line(df_sorted, **mmr_line_options)

# Light grey grid on a white background; one hover box per x position.
fig.update_layout(
    plot_bgcolor='white',
    xaxis={
        'showgrid': True,
        'gridcolor': 'lightgray',
        'title_font': {'size': 14},
    },
    yaxis={
        'showgrid': True,
        'gridcolor': 'lightgray',
        'title_font': {'size': 14},
    },
    hovermode='x unified',
)

fig.update_traces(hovertemplate="<b>MMR:</b> %{x}<br><b>Selling Price:</b> %{y}$")

fig.show()

The selling price and the MMR are approximately equal

In [None]:
# Selling price by condition: mean price per raw condition value (top panel)
# and price distribution per labelled condition band (bottom panel).
avg_price_by_condition = df.groupby('condition')['sell_price'].mean().reset_index()

# Band labels in ascending order; bins are (0, 1], (1, 2], ... (4, 5], with
# include_lowest=True so a condition of exactly 0 also lands in the first band.
condition_labels = ['Very Poor', 'Poor', 'Average', 'Good', 'Excellent']
df['condition_group'] = pd.cut(df['condition'],
                               bins=[0, 1, 2, 3, 4, 5],
                               labels=condition_labels,
                               include_lowest=True)


fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=(
        "Average Selling Price by Condition",
        "Box Plot of Selling Price by Condition Group"
    ),
    vertical_spacing=0.15
)


fig.add_trace(
    go.Scatter(
        x=avg_price_by_condition['condition'],
        y=avg_price_by_condition['sell_price'],
        mode='lines+markers',
        marker=dict(color='royalblue', size=10),
        line=dict(color='royalblue', width=3),
        name='Average Price'
    ),
    row=1, col=1
)


for i, group in enumerate(condition_labels):
    # Filter once per band instead of once per axis.
    group_rows = df[df['condition_group'] == group]
    fig.add_trace(
        go.Box(
            x=group_rows['condition_group'],
            y=group_rows['sell_price'],
            name=group,
            marker_color=px.colors.qualitative.Set2[i],
            boxpoints='outliers'
        ),
        row=2, col=1
    )


fig.update_layout(
    title_text="Car Price Analysis by Condition",
    title_font=dict(size=24, color='navy', family='Arial'),
    showlegend=False,
    template="plotly_white",
    height=800,
    margin=dict(t=100)
)


fig.update_xaxes(
    title_text="Condition",
    title_font=dict(size=16),
    tickfont=dict(size=14),
    row=1, col=1
)
fig.update_yaxes(
    title_text="Average Selling Price ($)",
    title_font=dict(size=16),
    tickfont=dict(size=14),
    gridcolor='lightgray',
    gridwidth=1,
    row=1, col=1
)


fig.update_xaxes(
    title_text="Condition Group",
    title_font=dict(size=16),
    tickfont=dict(size=14),
    row=2, col=1
)
# Dotted guide lines for the box plot are drawn via the axis grid itself.
# The earlier approach read `fig.layout.yaxis2.tickvals`, which is None when
# ticks are chosen automatically, so no lines were ever added and the same
# figure was simply displayed a second time.
fig.update_yaxes(
    title_text="Selling Price ($)",
    title_font=dict(size=16),
    tickfont=dict(size=14),
    gridcolor='lightgray',
    gridwidth=1,
    griddash='dot',
    row=2, col=1
)

fig.show()

The average selling price of cars significantly increases with better condition ratings. Notably, cars rated as "Excellent" command the highest prices, while those rated as "Very Poor" show a wide price range, indicating potential losses for sellers.

In [None]:
# Mean odometer reading for each model year.
odometer_by_year = df.groupby('year')['odometer']
avg_odometer = odometer_by_year.mean().reset_index()

fig = px.line(
    avg_odometer,
    x='year',
    y='odometer',
    markers=True,
    title='Year vs. Odometer',
    labels={'year': 'Year', 'odometer': 'Odometer (Miles)'},
)

# Axis styling: dotted light-grey grid on both axes.
year_axis = {
    'title': 'Year',
    'title_font': {'size': 14},
    'tickfont': {'size': 12},
    'showgrid': True,
    'gridcolor': 'lightgray',
    'gridwidth': 1,
    'griddash': 'dot',
}
odometer_axis = {
    'title': 'Odometer (Miles)',
    'title_font': {'size': 14},
    'tickfont': {'size': 12},
    'showgrid': True,
    'gridcolor': 'lightgray',
    'gridwidth': 1,
    'griddash': 'dot',
}

fig.update_layout(
    title={
        'text': "Year vs. Odometer",
        'font': {'size': 18, 'color': 'black', 'family': 'Arial', 'weight': 'bold'},
    },
    xaxis=year_axis,
    yaxis=odometer_axis,
    plot_bgcolor='white',
    width=1200,
    height=600,
)

# Green line with black-outlined markers, plus a custom hover box.
fig.update_traces(
    line={'color': 'green', 'width': 2},
    marker={
        'color': 'green',
        'size': 8,
        'line': {'color': 'black', 'width': 1},
    },
    hovertemplate="<b>Year:</b> %{x}<br><b>Odometer:</b> %{y:,} miles",
)

fig.show()

In general, the newer the car, the fewer miles it has traveled compared with older cars.

In [None]:
# Mean condition rating for each model year.
condition_by_year = df.groupby('year')['condition']
avg_condition = condition_by_year.mean().reset_index()

fig = px.line(
    avg_condition,
    x='year',
    y='condition',
    markers=True,
    title='Average Condition of Sold Cars Over the Years',
)

# Axis styling: dotted light-grey grid; the y axis is fixed to the 1-5 range.
year_axis = {
    'title': 'Year',
    'title_font': {'size': 14},
    'tickfont': {'size': 12},
    'showgrid': True,
    'gridcolor': 'lightgray',
    'gridwidth': 1,
    'griddash': 'dot',
}
condition_axis = {
    'title': 'Condition of Sold Car',
    'title_font': {'size': 14},
    'tickfont': {'size': 12},
    'range': [1, 5],
    'showgrid': True,
    'gridcolor': 'lightgray',
    'gridwidth': 1,
    'griddash': 'dot',
}

fig.update_layout(
    title={
        'text': "Average Condition of Sold Cars Over the Years",
        'font': {'size': 16, 'color': 'black', 'family': 'Arial', 'weight': 'bold'},
    },
    xaxis=year_axis,
    yaxis=condition_axis,
    plot_bgcolor='white',
    width=1000,
    height=600,
)

# Dashed blue line with black-outlined markers, plus a custom hover box.
fig.update_traces(
    line={'color': 'royalblue', 'width': 2, 'dash': 'dash'},
    marker={
        'color': 'royalblue',
        'size': 8,
        'line': {'color': 'black', 'width': 1},
    },
    hovertemplate="<b>Year:</b> %{x}<br><b>Condition:</b> %{y:.2f}",
)

fig.show()

Older cars are in worse condition than newer ones, because a car's condition deteriorates over time.

##### Analysis of Car Sales and Total Revenue Over Years

In [None]:
# Per model year: number of priced listings and the sum of their selling prices.
p = df.groupby('year')['sell_price'].agg(['count', 'sum'])
p.columns = ['sales_count', 'total_revenue']

# Bars use the left axis (count); the line uses a second y axis on the right (revenue).
sales_count_bars = go.Bar(
    x=p.index,
    y=p['sales_count'],
    name='Sales Count',
)
revenue_line = go.Scatter(
    x=p.index,
    y=p['total_revenue'],
    yaxis='y2',
    mode='lines+markers',
    line={'color': 'green'},
    name='Total Revenue',
)

fig = go.Figure()
fig.add_traces([sales_count_bars, revenue_line])

fig.update_layout(
    yaxis={'title': 'Car Sold'},
    yaxis2={'title': 'Total Revenue', 'overlaying': 'y', 'side': 'right'},
    title='Distribution of Car Year and Total Revenue',
    template='plotly_white',
)

fig.show()

##### What are the average car prices in the market?

In [None]:
def categorize(price):
    """Map a selling price to a price tier.

    Returns 'Economical' for prices below 10000, 'Medium' for prices from
    10000 up to (but not including) 20000, and 'High' for everything else.
    """
    if price < 10000:
        return 'Economical'
    if price < 20000:
        return 'Medium'
    return 'High'

# Tag every row with its price tier, then average the selling price per (make, tier).
df['category'] = df['sell_price'].apply(categorize)
df_avg = df.groupby(['make', 'category'], as_index=False)['sell_price'].mean()

# One stacked subplot per tier, most expensive tier on top.
categories = ['High', 'Medium', 'Economical']
colors = ['#a1c9f4', '#8de5a1', '#ffb482']

fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=("High", "Medium", "Economical"),
    vertical_spacing=0.08,
)

for subplot_row, (category, bar_color) in enumerate(zip(categories, colors), start=1):
    in_category = df_avg['category'] == category
    filtered = df_avg[in_category].sort_values('sell_price', ascending=False)

    # Bars ordered from the highest to the lowest average; labels are whole dollars.
    tier_bars = go.Bar(
        x=filtered['make'],
        y=filtered['sell_price'],
        marker_color=bar_color,
        text=filtered['sell_price'].round(0).astype(int),
        texttemplate='%{text:,}',
        textposition='outside',
        name=category,
    )
    fig.add_trace(tier_bars, row=subplot_row, col=1)

fig.update_layout(
    title_text="Average Selling Price by Make and Category",
    title_font={'size': 28, 'color': 'navy', 'family': 'Arial'},
    showlegend=False,
    height=1200,
    plot_bgcolor='white',
    margin={'t': 100},
)

# With no row/col given, these apply to the axes of all three subplots.
fig.update_xaxes(
    tickangle=45,
    tickfont={'size': 14},
    title_text="Make",
)
fig.update_yaxes(
    title_text="Average Price ($)",
    tickprefix="$",
    gridcolor='lightgray',
    gridwidth=1,
    griddash='dot',
    title_font={'size': 16},
)

fig.show()

In [None]:

# Selling price distribution per price tier, cheapest tier first.
tier_box_options = dict(
    x='category',
    y='sell_price',
    color='category',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    category_orders={'category': ['Economical', 'Medium', 'High']},
)
fig = px.box(df, **tier_box_options)

tier_axis = {
    'tickfont': {'size': 12},
    'title_font': {'size': 14},
}
price_axis = {
    'tickfont': {'size': 12},
    'title_font': {'size': 14},
    'gridcolor': 'lightgray',
    'gridwidth': 1,
    'griddash': 'dot',
    'showgrid': True,
}

fig.update_layout(
    title={
        'text': 'Box Plot of Selling Prices by Category',
        'font': {'size': 18, 'family': 'Arial'},
    },
    xaxis_title='Category',
    yaxis_title='Selling Price ($)',
    xaxis=tier_axis,
    yaxis=price_axis,
    plot_bgcolor='white',
    boxmode='group',
    width=1000,
    height=600,
    showlegend=False,
)

# Dotted grid on the y axis, including minor grid lines.
fig.update_yaxes(
    gridwidth=1,
    gridcolor='lightgray',
    griddash='dot',
    minor_griddash="dot",
)

fig.show()

##### Category Breakdown:

High Category: Features luxury brands like BMW and Mercedes, appealing to affluent consumers seeking prestige.

Medium Category: Targets middle-class buyers with reliable and efficient options.

Economical Category: Attracts budget-conscious consumers, often including compact and older models

##### Which states generate the most sales?

In [None]:

# Total selling price per state, largest first; everything past the top five
# is summed into a single 'Other' slice.
state_sales = (
    df
    .groupby('state')['sell_price']
    .sum()
    .sort_values(ascending=False)
)
top_5_states = state_sales.nlargest(5)
other_sales = state_sales.iloc[5:].sum()

slice_names = list(top_5_states.index) + ['Other']
slice_totals = list(top_5_states.values) + [other_sales]
sales_data = pd.DataFrame({'state': slice_names, 'sales': slice_totals})

colors = ['#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854', '#ffd92f']

state_pie_options = dict(
    names='state',
    values='sales',
    color_discrete_sequence=colors,
    title='Percentage of Sales by State (Top 5 + Other)',
    hole=0.3,
    labels={'sales': 'Total Sales'},
)
fig = px.pie(sales_data, **state_pie_options)

# The first slice is pulled out a little further than the other five.
slice_pull = [0.05] + [0.02] * 5
fig.update_traces(
    textinfo='percent+label',
    marker={'line': {'color': 'black', 'width': 1}},
    pull=slice_pull,
)

fig.update_layout(
    title={
        'text': "Percentage of Sales by State (Top 5 + Other)",
        'font': {'size': 18, 'family': 'Arial', 'color': 'black'},
        'x': 0.5,
    },
    uniformtext_minsize=14,
    uniformtext_mode='hide',
    showlegend=False,
    width=800,
    height=800,
)

fig.show()

The highest total sales are in Florida, California, Pennsylvania, Texas, and Georgia, in line with the number of cars sold in each state.

##### Which brands have the most models?

In [None]:

# Number of DISTINCT models offered by each make, largest first.
# `nunique()` is used rather than `count()`: `count()` returns the number of
# listings per make, which measures sales volume, not how many models a brand has.
brand_car = (
    df.groupby('make')['model']
    .nunique()
    .reset_index()
    .sort_values('model', ascending=False)
)

# Keep the four makes with the most models and fold the rest into one 'Other' slice.
# NOTE: `top_10` holds four rows; the name is kept unchanged from the earlier version.
top_10 = brand_car.head(4)
other_total = brand_car.iloc[4:]['model'].sum()
other_data = pd.DataFrame({'make': ['Other'], 'model': [other_total]})
combin_data = pd.concat([top_10, other_data])


# One viridis colour per slice.
viridis_palette = sns.color_palette('viridis', len(combin_data)).as_hex()


fig = px.pie(
    combin_data,
    names='make',
    values='model',
    title='Brand By Models Count',
    color_discrete_sequence=viridis_palette,
    hole=0.3
)


fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    texttemplate='%{label}<br>%{percent:.2%}',
    rotation=270,
    marker=dict(line=dict(color='white', width=1))
)


fig.update_layout(
    title={
        'text': "Brand By Models Count",
        'font': {'size': 20, 'family': 'Arial', 'color': 'black'},
        'x': 0.5
    },
    paper_bgcolor='lavender',
    uniformtext_minsize=12,
    uniformtext_mode='hide',
    showlegend=False,
    width=800,
    height=600
)

fig.show()

Ford is the brand with the largest number of models, followed by Chevrolet.

##### Who are the most profitable sellers?

In [None]:
# Five sellers with the largest summed selling price.
revenue_by_seller = df.groupby('seller')['sell_price'].sum()
top_sellers = revenue_by_seller.nlargest(5).reset_index()

top_sellers

In [None]:
# Horizontal bar chart of the five sellers with the largest summed selling price.
fig, ax = plt.subplots(figsize=(12, 8))

sns.barplot(data=top_sellers, x='sell_price', y='seller', palette='viridis', ax=ax)

ax.set_title('Top 5 Sellers by total selling price', fontsize=16)
ax.set_xlabel('Total Sales ($)', fontsize=14)
ax.set_ylabel('Seller', fontsize=14)

plt.show()

Insight on New Selling Prices by Sellers

##### 1.Price Analysis:


The data shows that the seller Nissan Infiniti LT tops the list with a total selling price of $456,580,011. This could indicate that this seller's cars are either highly popular or include luxury models, reflecting a significant market value.

##### 2.Company Comparison:

Ford Motor Credit Company LLC ranks second with a price of $306,871,800, followed by The Hertz Corporation at $232,114,601. This highlights the influence of well-established brands and their impact on the car sales market.

##### Which body type is the most profitable?

In [None]:

# Mean selling price per body type, most expensive first.
avg_body_prices = (
    df
    .groupby('body')['sell_price']
    .mean()
    .reset_index()
    .sort_values(by='sell_price', ascending=False)
)

body_price_bar_options = dict(
    x='body',
    y='sell_price',
    title='Average Selling Price by Body Type',
    labels={'body': 'Body Type', 'sell_price': 'Average Selling Price ($)'},
    color='sell_price',
    color_continuous_scale='RdBu',
    text_auto=True,
)
fig = px.bar(avg_body_prices, **body_price_bar_options)

# Vertical category labels without an x grid; dotted horizontal grid on the price axis.
body_axis = {
    'title': 'Body Type',
    'title_font': {'size': 14},
    'tickfont': {'size': 12},
    'tickangle': -90,
    'showgrid': False,
}
price_axis = {
    'title': 'Average Selling Price ($)',
    'title_font': {'size': 14},
    'tickfont': {'size': 12},
    'showgrid': True,
    'gridcolor': 'lightgray',
    'griddash': 'dot',
}

fig.update_layout(
    title={
        'text': "Average Selling Price by Body Type",
        'font': {'size': 18, 'family': 'Arial', 'color': 'black'},
        'x': 0.5,
    },
    xaxis=body_axis,
    yaxis=price_axis,
    plot_bgcolor='white',
    width=1200,
    height=600,
)

# Whole-dollar label above each bar, with a thin black outline.
fig.update_traces(
    marker_line_color='black',
    marker_line_width=0.5,
    texttemplate='%{y:$,.0f}',
    textposition='outside',
)

fig.show()

### Multivariate Analysis

##### Top 3 sales models in the top 6 brands sales

In [None]:

# For the six makes with the largest summed selling price, show a pie of their
# three best-selling models (by summed selling price) plus an 'Other' slice.

# Summed selling price per (make, model) pair.
total_sales = df.groupby(['make', 'model']).agg({'sell_price': 'sum'}).reset_index()
top_makes = total_sales.groupby('make')['sell_price'].sum().nlargest(6).index

in_top_makes = total_sales['make'].isin(top_makes)
top_make_sales = total_sales[in_top_makes]

# Row labels of the three largest models within each make; dropping the 'make'
# level leaves the original row labels of `total_sales`.
top_model_rows = (
    top_make_sales.groupby('make')['sell_price']
    .nlargest(3)
    .reset_index(level=0, drop=True)
    .index
)
top_models = top_make_sales.loc[top_model_rows]

# Everything else from the same makes goes into 'Other'. Rows are excluded by
# row label rather than by model name, so a model name shared by two makes is
# only excluded for the make where it is actually in the top three.
other_models = total_sales[in_top_makes & ~total_sales.index.isin(top_models.index)]
other_sales = other_models.groupby('make').agg({'sell_price': 'sum'}).reset_index()
other_sales['model'] = 'Other'

combined_top_models = pd.concat([top_models, other_sales], ignore_index=True)

# Plain list, used both for the subplot titles and for the loop below so that
# title order and pie order cannot diverge.
make_names = list(combined_top_models['make'].unique())

fig = make_subplots(
    rows=3,
    cols=2,
    specs=[[{'type':'pie'}, {'type':'pie'}],
           [{'type':'pie'}, {'type':'pie'}],
           [{'type':'pie'}, {'type':'pie'}]],
    subplot_titles=make_names
)

# `matplotlib.cm.get_cmap` is no longer available in recent matplotlib
# releases; `plt.get_cmap` returns the same colormap.
cmap = plt.get_cmap('Set3')
colors = [f'rgb({int(r*255)},{int(g*255)},{int(b*255)})'
          for r, g, b in cmap.colors]

for i, make in enumerate(make_names):
    data = combined_top_models[combined_top_models['make'] == make]
    labels = data['model']
    values = data['sell_price']

    # Pull the aggregated 'Other' slice away from the named models.
    pull = [0.1 if label == 'Other' else 0 for label in labels]

    # Subplot titles are assigned left-to-right, top-to-bottom, so the pie for
    # title i must go to the same row-major cell (the earlier formula placed
    # odd-numbered makes in column 2, under the neighbouring make's title).
    grid_row, grid_col = divmod(i, 2)

    fig.add_trace(
        go.Pie(
            labels=labels,
            values=values,
            name=make,
            marker_colors=colors[:len(labels)],
            pull=pull,
            textinfo='percent+label',
            texttemplate='%{label}<br>%{percent:.1%}',
            rotation=90
        ),
        row=grid_row + 1,
        col=grid_col + 1
    )

fig.update_layout(
    title_text="Top Selling Models by Make",
    title_font_size=24,
    height=1200,
    width=1400,
    showlegend=False
)

# Subplot titles are stored as layout annotations, so the total is appended
# with add_annotation; passing `annotations=[...]` to update_layout would
# replace (and thereby remove) the titles.
fig.add_annotation(
    text=f"Total Sales: ${df['sell_price'].sum():,.0f}",
    x=0.5,
    y=1.05,
    xref='paper',
    yref='paper',
    showarrow=False,
    font_size=18
)

fig.update_traces(
    hoverinfo='label+percent+value',
    textfont_size=14,
    marker=dict(line=dict(color='white', width=1))
)

fig.show()

##### Make vs. Transmission vs. Selling Price by Condition

In [None]:

# Mean selling price per (make, transmission, condition group) for the four makes
# with the most rows in `df`.
subset_makes = df['make'].value_counts().index[:4]
df_subset = df[df['make'].isin(subset_makes)]
aggregated = df_subset.groupby(['make', 'transmission', 'condition_group'], as_index=False)['sell_price'].mean()

# One colour per condition group; the palette is cycled so that having more groups
# than palette entries cannot raise an IndexError.
condition_groups = df['condition_group'].unique()
palette = px.colors.qualitative.Set2
colors = [palette[j % len(palette)] for j in range(len(condition_groups))]


fig = make_subplots(
    rows=2,
    cols=2,
    subplot_titles=list(subset_makes),
    vertical_spacing=0.15,
    horizontal_spacing=0.1,
    specs=[[{'type': 'bar'}, {'type': 'bar'}],
           [{'type': 'bar'}, {'type': 'bar'}]]
)


for i, make in enumerate(subset_makes):
    # Row-major placement, matching the order of `subplot_titles`.
    row = (i // 2) + 1
    col = (i % 2) + 1

    make_data = aggregated[aggregated['make'] == make]

    for j, condition in enumerate(condition_groups):
        condition_data = make_data[make_data['condition_group'] == condition]

        fig.add_trace(
            go.Bar(
                x=condition_data['transmission'],
                y=condition_data['sell_price'],
                name=condition,
                # A shared legend group lets the single legend entry (shown for the
                # first subplot only) toggle this condition in all four subplots.
                legendgroup=str(condition),
                marker_color=colors[j],
                showlegend=(i == 0),
                text=condition_data['sell_price'].round(0),
                texttemplate='%{text:$,.0f}',
                textposition='outside'
            ),
            row=row,
            col=col
        )


fig.update_layout(
    title_text='Make vs. Transmission vs. Selling Price by Condition',
    title_x=0.5,
    title_y=0.95,
    height=800,
    width=1000,
    barmode='group',
    uniformtext_minsize=10,
    margin=dict(t=100),
    plot_bgcolor='white',
    legend=dict(
        title=dict(text='Condition Group'),
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

# update_xaxes / update_yaxes without row/col apply to every subplot, so all four
# panels get the same axis title and grid styling (layout `xaxis=` / `yaxis=` would
# only touch the first subplot).
fig.update_xaxes(title_text="Transmission", showgrid=False)
fig.update_yaxes(gridcolor='lightgray')

# The y-axis title is only drawn on the left-hand column.
fig.update_yaxes(title_text="Average Selling Price ($)", col=1)

fig.show()

In [None]:

# Total selling price split by transmission, restricted to the ten makes with the
# highest total selling price.
top_makes = df.groupby('make')['sell_price'].sum().nlargest(10).index
is_top_make = df['make'].isin(top_makes)
filtered_df = df[is_top_make]
aggregated_data = (
    filtered_df
    .groupby(['make', 'transmission'])['sell_price']
    .sum()
    .reset_index()
)

# Horizontal bars: one row per make, one coloured segment per transmission type.
bar_options = dict(
    x='sell_price',
    y='make',
    color='transmission',
    orientation='h',
    title='Top 10 Makes vs. Transmission vs. Total Selling Price',
    labels={'sell_price': 'Total Selling Price', 'make': 'Car Make'},
    color_discrete_sequence=px.colors.qualitative.Pastel,
    text='sell_price',
    height=600,
)
fig = px.bar(aggregated_data, **bar_options)

# Bar labels: whole dollars inside each segment, with a faint outline on the bars.
fig.update_traces(
    texttemplate='%{text:$,.0f}',
    textposition='inside',
    textfont=dict(size=12),
    marker=dict(line=dict(color='rgba(0,0,0,0.2)', width=1)),
)

# Figure-level settings; makes are ordered by their total across all segments.
fig.update_layout(
    uniformtext=dict(minsize=8, mode='hide'),
    xaxis=dict(tickprefix='$', tickformat=',.0f'),
    yaxis=dict(categoryorder='total ascending'),
    legend=dict(title=dict(text='Transmission Type')),
    hovermode='y unified',
    plot_bgcolor='rgba(0,0,0,0)',
    margin=dict(l=100, r=20, t=80, b=20),
)

# Axis titles and grid lines.
fig.update_xaxes(
    title_text='Total Selling Price',
    tickprefix='$',
    showgrid=True,
    gridcolor='lightgray',
)
fig.update_yaxes(
    title_text='Car Make',
    ticksuffix='  ',
    showgrid=False,
)

fig.show()

In [None]:
# The ten rows with the highest selling price, reduced to the descriptive columns.
detail_columns = ['make', 'model', 'year', 'condition', 'sell_price']
top_10_expensive_cars_full_details = (
    df[detail_columns]
    .sort_values(by='sell_price', ascending=False)
    .head(10)
)

# Show the table with the notebook's rich DataFrame rendering.
display(top_10_expensive_cars_full_details)

##### Top 10 best-selling models by number of cars sold


In [None]:
# Label each row as "<make> <model> <sale_year>" and count rows per (label, condition).
df['full_model'] = df['make'] + ' ' + df['model'] + ' ' + df['sale_year'].astype(str)
model_n = df.groupby(['full_model', 'condition']).size().reset_index(name='count')

# The ten labels with the most rows overall, summed across conditions.
top_models = model_n.groupby('full_model')['count'].sum().reset_index() \
                   .sort_values('count', ascending=False).head(10)

model_n_filtered = model_n[model_n['full_model'].isin(top_models['full_model'])]

# Plotly Express treats a numeric colour column as continuous (one trace plus a
# colour bar), which makes barmode='group' and the legend title ineffective. The
# condition values are therefore converted to string labels, with an explicit
# order so the legend and bar groups follow the sorted condition values.
condition_order = [str(value) for value in sorted(model_n_filtered['condition'].unique())]
plot_data = model_n_filtered.assign(condition=model_n_filtered['condition'].astype(str))

fig = px.bar(
    plot_data,
    x='full_model',
    y='count',
    color='condition',
    category_orders={'condition': condition_order},
    barmode='group',
    title='Top 10 Car Soldout count',
    labels={'count': 'Number Of Cars', 'full_model': 'Model Name'},
    height=600,
    width=1200
)


fig.update_layout(
    xaxis=dict(
        title='Model Name',
        title_font=dict(size=15),
        tickfont=dict(size=14),
        tickangle=-90,
        # Order the models by their total count across all conditions.
        categoryorder='total descending'
    ),
    yaxis=dict(
        title='Number Of Cars',
        title_font=dict(size=15),
        tickfont=dict(size=14)
    ),
    legend_title_text='Condition',
    legend=dict(
        font=dict(size=12),
        bgcolor='rgba(255,255,255,0.8)'
    ),
    title_font=dict(size=26),
    plot_bgcolor='white',
    bargap=0.2
)


fig.update_traces(
    hovertemplate="<b>%{x}</b><br>Count: %{y}"
)

fig.show()

##### 3D Plot: Year vs. Condition vs. Selling Price

In [None]:
# Marker styling: every point is coloured by its selling price on the Viridis scale.
price_marker = dict(
    size=5,
    color=df['sell_price'],
    colorscale='Viridis',
    opacity=0.7,
    colorbar=dict(title='Selling Price'),
)

# One 3D point per row of df: x = year, y = condition, z = selling price.
condition_trace = go.Scatter3d(
    x=df['year'],
    y=df['condition'],
    z=df['sell_price'],
    mode='markers',
    marker=price_marker,
)
fig = go.Figure(data=[condition_trace])

# Axis titles for the 3D scene, plus figure size and margins.
scene_axes = dict(
    xaxis=dict(title='Year', title_font=dict(size=14)),
    yaxis=dict(title='Condition (1-5)', title_font=dict(size=14)),
    zaxis=dict(title='Selling Price', title_font=dict(size=14)),
)
fig.update_layout(
    title='3D Plot: Year vs Condition vs Selling Price',
    scene=scene_axes,
    width=800,
    height=600,
    margin=dict(l=0, r=0, b=0, t=40),
)

fig.show()

##### 3D Plot: Year vs. Odometer vs. Selling Price

In [None]:
# 3D scatter of year (x) vs. odometer (y) vs. selling price (z), with each point
# coloured by its selling price.
# The figure must be bound to `fig`: the update_layout / show calls below act on
# `fig`, so binding it to any other name would re-style and re-display whatever
# figure an earlier cell left in `fig`.
fig = go.Figure(data=[
    go.Scatter3d(
        x=df['year'],
        y=df['odometer'],
        z=df['sell_price'],
        mode='markers',
        marker=dict(
            size=5,
            color=df['sell_price'],
            colorscale='Viridis',
            opacity=0.8,
            colorbar=dict(title='Selling Price')
        )
    )
])


fig.update_layout(
    title='Year vs. Odometer vs. Selling Price',
    scene=dict(
        xaxis=dict(title='Year'),
        yaxis=dict(title='Odometer'),
        zaxis=dict(title='Selling Price')
    ),
    width=1000,
    height=700
)

fig.show()

In [None]:
# Pairwise correlation (DataFrame.corr default method) between the key numeric columns.
# No matplotlib figure is created here: nothing is drawn in this cell, so creating
# and showing one would only emit an empty plot.
correlation_matrix = df[['condition', 'odometer', 'mmr', 'sell_price']].corr()

# Display the matrix as a table in the cell output.
correlation_matrix

In [None]:
# Draw the heatmap on its own explicitly sized axes; a figure created in an earlier
# cell is not reused here, so the size has to be set in this cell.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 6))

sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    # Correlations lie in [-1, 1]; fixing the range keeps the midpoint of the
    # diverging palette at zero correlation.
    vmin=-1,
    vmax=1,
    ax=heatmap_ax
)

heatmap_ax.set_title('Correlation Heatmap of Key Variables')

plt.show()