In [13]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

## Data Analysis

In [14]:
# Read the BMW sales CSV (path is relative to the notebook's working
# directory) and preview the first rows as the cell output.
csv_path = '..//data//BMW_Car_Sales_Classification.csv'
df_data = pd.read_csv(csv_path)
df_data.head()

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low


In [15]:
# Print the frame's schema summary: column names, non-null counts, dtypes and memory usage.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50000 non-null  object 
 1   Year                  50000 non-null  int64  
 2   Region                50000 non-null  object 
 3   Color                 50000 non-null  object 
 4   Fuel_Type             50000 non-null  object 
 5   Transmission          50000 non-null  object 
 6   Engine_Size_L         50000 non-null  float64
 7   Mileage_KM            50000 non-null  int64  
 8   Price_USD             50000 non-null  int64  
 9   Sales_Volume          50000 non-null  int64  
 10  Sales_Classification  50000 non-null  object 
dtypes: float64(1), int64(4), object(6)
memory usage: 4.2+ MB


In [16]:
# Report whether any fully duplicated rows exist and, if they do,
# replace df_data with a de-duplicated copy.
has_duplicates = df_data.duplicated().any()
if not has_duplicates:
    print('Duplicated data not found')
else:
    print('Founded duplicated data')
    df_data = df_data.drop_duplicates()

Duplicated data not found


## Variable Analysis

In [17]:
# Total sales volume per manufacturing year.
# Only Sales_Volume is aggregated: a bare .sum() over the grouped frame also
# aggregates every other column, and on pandas >= 2.0 that includes the text
# columns (string concatenation), which is slow and not needed for this plot.
df_yearly = df_data.groupby('Year', as_index=False)['Sales_Volume'].sum()

# Row label of the year with the highest total sales volume.
max_id = df_yearly['Sales_Volume'].idxmax()

fig_1 = px.line(
    df_yearly,
    x='Year',
    y='Sales_Volume',
    title=' Total Sales by Manufacturing Year',
    markers=True
)
fig_1.update_layout(
    xaxis_title='Year of Manufacture',
    yaxis_title='Total Sales Volume',
    title_x=0.5,
    hovermode='x unified',
    font=dict(family="Arial", size=14)
)

# Thousands separators on the y axis ticks.
fig_1.update_yaxes(tickformat=",")
fig_1.show()

# Extract the values before formatting: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
best_year = df_yearly.loc[max_id, 'Year']
best_volume = df_yearly.loc[max_id, 'Sales_Volume']
print(f'Best-selling car manufacturing year: {best_year} '
      f'with {best_volume:,} units.')

Best-selling car manufacturing year: 2022 with 17,920,946 units.


In [18]:
# Total sales volume per (manufacturing year, region) pair.
# Only Sales_Volume is aggregated: a bare .sum() over the grouped frame also
# aggregates every other column, and on pandas >= 2.0 that includes the text
# columns (string concatenation), which is slow and not needed for this plot.
df_yearly_region = (
    df_data.groupby(['Year', 'Region'], as_index=False)['Sales_Volume'].sum()
)

# Row label of the (year, region) pair with the highest total sales volume.
max_id = df_yearly_region['Sales_Volume'].idxmax()

# One line per region.
fig_2 = px.line(
    df_yearly_region,
    x='Year',
    y='Sales_Volume',
    color='Region',
    title=' Total Sales by Manufacturing Year',
    markers=True
)
fig_2.update_layout(
    xaxis_title='Year of Manufacture',
    yaxis_title='Total Sales Volume',
    title_x=0.5,
    hovermode='x unified',
    font=dict(family="Arial", size=14)
)

# Thousands separators on the y axis ticks.
fig_2.update_yaxes(tickformat=",")
fig_2.show()

# Extract the values before formatting: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
best_region = df_yearly_region.loc[max_id, 'Region']
best_region_volume = df_yearly_region.loc[max_id, 'Sales_Volume']
best_region_year = df_yearly_region.loc[max_id, 'Year']
print(f'Best-selling car Region: {best_region} '
      f'with {best_region_volume:,} units '
      f'of cars manufactured in {best_region_year}.')

Best-selling car Region: North America with 3,117,538 units of cars manufactured in 2019.


In [19]:
# Total sales volume per (model, region) pair.
# Only Sales_Volume is aggregated: a bare .sum() over the grouped frame also
# aggregates every other column, and on pandas >= 2.0 that includes the text
# columns (string concatenation), which is slow and not needed for this plot.
df_model_sell = (
    df_data.groupby(['Model', 'Region'], as_index=False)['Sales_Volume'].sum()
)

# Per-model totals across all regions, largest first; the index gives the
# left-to-right order of the bars.
model_totals = (
    df_model_sell.groupby('Model')['Sales_Volume']
    .sum()
    .sort_values(ascending=False)
)
model_order = model_totals.index

fig_3 = px.histogram(
    df_model_sell,
    x='Model',
    y='Sales_Volume',
    color='Region',
    # category_orders is documented to take lists of values, so hand it a
    # plain list rather than a pandas Index.
    category_orders={'Model': list(model_order)},
    title='Sales Volume by Car Model and Region',
    text_auto=True
)

fig_3.update_layout(
    xaxis_title='Car Model',
    yaxis_title='Total Sales Volume',
    barmode='stack',
    title_x=0.5,
    template='plotly_white',
    font=dict(family="Arial", size=14),
    legend_title_text='Region',
    xaxis_tickangle=-45
)

# Thousands separators on the y axis ticks.
fig_3.update_yaxes(tickformat=",")

fig_3.show()

# The first entry of the descending totals is the top seller, so read it
# directly instead of re-querying the frame. This also avoids nesting
# same-quote f-strings, which is a SyntaxError before Python 3.12.
top_model = model_order[0]
top_units = model_totals.iloc[0]
print(f'Top selling model: {top_model} '
      f'with {top_units:,} units')

Top selling model: 7 Series with 23,786,466 units


In [65]:
# Total sales volume per (fuel type, sales classification) pair.
# Only Sales_Volume is aggregated: a bare .sum() over the grouped frame also
# aggregates every other column, and on pandas >= 2.0 that includes the text
# columns (string concatenation), which is slow and not needed for this plot.
df_fuel_sales = (
    df_data.groupby(['Fuel_Type', 'Sales_Classification'], as_index=False)['Sales_Volume'].sum()
)

fig_4 = px.bar(
    df_fuel_sales,
    x='Fuel_Type',
    y='Sales_Volume',
    color='Sales_Classification',
    text_auto=True,
    title='Sales Volume by Fuel Type and Classification'
)

# barmode='group' places the classifications side by side for each fuel type.
fig_4.update_layout(
    xaxis_title='Fuel Type',
    yaxis_title='Total Sales Volume',
    barmode='group',
    title_x=0.5,
    template='plotly_white',
    font=dict(family="Arial", size=14),
    legend_title_text='Sales Classification',
    xaxis_tickangle=-45
)

# Thousands separators on the y axis ticks.
fig_4.update_yaxes(tickformat=",")

fig_4.show()


In [21]:
# Price distribution per transmission type, split by sales classification.
# box=True draws a box plot inside each violin. The layout and axis updates
# are chained because each update_* call returns the figure it modified.
fig_5 = (
    px.violin(
        df_data,
        x='Transmission',
        y='Price_USD',
        color='Sales_Classification',
        box=True,
        title='Price Distribution by Transmission Type and Sales Classification',
    )
    .update_layout(
        title_x=0.5,
        template='plotly_white',
        xaxis_title='Transmission Type',
        yaxis_title='Price (USD)',
        legend_title_text='Sales Classification',
        font=dict(family="Arial", size=14),
    )
    # Dollar prefix and thousands separators on the price axis.
    .update_yaxes(tickprefix="$", tickformat=",")
)

fig_5.show()


In [64]:
# Total sales volume per (engine size, sales classification) pair.
# Only Sales_Volume is aggregated: a bare .sum() over the grouped frame also
# aggregates every other column, and on pandas >= 2.0 that includes the text
# columns (string concatenation), which is slow and not needed for this plot.
df_eng_sales = (
    df_data.groupby(['Engine_Size_L', 'Sales_Classification'], as_index=False)['Sales_Volume'].sum()
)

# eng_ord was referenced here without being defined anywhere in the notebook,
# so the cell raised NameError on a fresh kernel. Define it locally.
# NOTE(review): ascending engine size is assumed to be the intended order - confirm.
eng_ord = sorted(df_eng_sales['Engine_Size_L'].unique().tolist())

fig_6 = px.histogram(
    df_eng_sales,
    x='Engine_Size_L',
    y='Sales_Volume',
    color='Sales_Classification',
    barmode='group',
    category_orders={'Engine_Size_L': eng_ord},
    text_auto=True,
    title='🔩 Sales Volume by Engine Size and Sales Classification'
)

fig_6.update_layout(
    xaxis_title='Engine Size (Liters)',
    yaxis_title='Total Sales Volume',
    title_x=0.5,
    template='plotly_white',
    font=dict(family="Arial", size=14),
    legend_title_text='Sales Classification'
)

# Thousands separators on the y axis ticks.
fig_6.update_yaxes(tickformat=",")

fig_6.show()


In [None]:
# Pairwise correlation of the numeric columns only (text columns are skipped
# via numeric_only=True), rendered as an annotated heatmap.
corr_matrix = df_data.corr(numeric_only=True)

# The axis and layout updates are chained because each update_* call returns
# the figure it modified.
fig_7 = (
    px.imshow(
        corr_matrix,
        title="Correlation Matrix of Numerical Features",
        color_continuous_scale="RdBu_r",
        aspect="auto",
        text_auto=True,
    )
    # Put the column labels above the heatmap.
    .update_xaxes(side="top")
    .update_layout(
        coloraxis_colorbar=dict(title="Correlation"),
        font=dict(family="Arial", size=14),
        template='plotly_white',
        title_x=0.5,
    )
)

fig_7.show()
