In [8]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [9]:
# Read the retail sales transactions from the Kaggle input directory
# and preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer=r'/kaggle/input/retail-sales-dataset/retail_sales_dataset.csv'
)
df.head(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


In [10]:
# Give the price and total columns shorter names; both renames are applied
# to `df` in place by a single call.
df.rename(
    columns={
        'Price per Unit': 'Unit Price',
        'Total Amount': 'Revenue',
    },
    inplace=True,
)
df.head(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Unit Price,Revenue
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100


*Which product category has the highest average price per unit?*

In [11]:
# Mean unit price for each product category, one row per category.
high_avg_per = (
    df.groupby('Product Category')
      .agg(avg_price=('Unit Price', 'mean'))
      .reset_index()
)

# Bar chart of the averages; bar colour encodes the same value as bar height.
fig = px.bar(
    data_frame=high_avg_per,
    x='Product Category',
    y='avg_price',
    color='avg_price',
    color_continuous_scale='Burg',
    title='Average Unit Price by Product Category',
)

# Axis titles and slanted category labels, grouped per axis.
fig.update_layout(
    xaxis=dict(title=dict(text='Product Category'), tickangle=-45),
    yaxis=dict(title=dict(text='Average Unit Price')),
)

fig.show()

# Last expression of the cell: show the summary table under the chart.
high_avg_per


Unnamed: 0,Product Category,avg_price
0,Beauty,184.055375
1,Clothing,174.287749
2,Electronics,181.900585


*How does the quantity sold vary across different product categories?*

In [12]:
# Total number of units sold in each product category.
quantity_sold_per_product_category = (
    df.groupby('Product Category')['Quantity']
      .sum()
      .reset_index()
)

# One bar per category; colour follows the quantity on the 'Jet' scale.
fig = px.bar(
    data_frame=quantity_sold_per_product_category,
    x='Product Category',
    y='Quantity',
    color='Quantity',
    color_continuous_scale='Jet',
    title='Product Categories based on Quantity Sold',
)

# Axis titles and slanted category labels, grouped per axis.
fig.update_layout(
    xaxis=dict(title=dict(text='Product Category'), tickangle=-45),
    yaxis=dict(title=dict(text='Quantity Sold')),
)

fig.show()

# Last expression of the cell: show the totals table under the chart.
quantity_sold_per_product_category

Unnamed: 0,Product Category,Quantity
0,Beauty,771
1,Clothing,894
2,Electronics,849


*Are there any product categories with unusually high or low total amounts?*

In [13]:
# Flag product categories whose total revenue is unusually high or low.
# Z-score = (category total - mean of totals) / standard deviation of totals
category_tot = df.groupby('Product Category')['Revenue'].sum().reset_index()

# A category is reported as unusual when |z| exceeds this cut-off.
Z_THRESHOLD = 2

# Mean of the per-category totals
mean_tot = category_tot['Revenue'].mean()

# Sample standard deviation of the per-category totals (pandas default, ddof=1)
std_tot = category_tot['Revenue'].std()

# Z-score of each category total. If the totals are all equal or there is a
# single category, this is NaN and the category falls through to 'Normal'.
category_tot['z_score'] = (category_tot['Revenue'] - mean_tot) / std_tot

# NOTE: with n groups and the sample standard deviation, |z| can never exceed
# (n - 1) / sqrt(n). For three categories that bound is about 1.15, so a
# cut-off of 2 cannot flag anything here; treat an all-'Normal' result as
# "not testable with so few groups" rather than as evidence of no outliers.
category_tot['Unusuals'] = 'Normal'
category_tot.loc[category_tot['z_score'] > Z_THRESHOLD, 'Unusuals'] = 'High'
category_tot.loc[category_tot['z_score'] < -Z_THRESHOLD, 'Unusuals'] = 'Low'

# Display the result without the helper column (category_tot itself keeps it).
category_tot.drop(columns=['z_score'])


Unnamed: 0,Product Category,Revenue,Unusuals
0,Beauty,143515,Normal
1,Clothing,155580,Normal
2,Electronics,156905,Normal


In [15]:
# Preview the last five rows of the renamed frame.
df.tail(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Unit Price,Revenue
995,996,2023-05-16,CUST996,Male,62,Clothing,1,50,50
996,997,2023-11-17,CUST997,Male,52,Beauty,3,30,90
997,998,2023-10-29,CUST998,Female,23,Beauty,4,25,100
998,999,2023-12-05,CUST999,Female,36,Electronics,3,50,150
999,1000,2023-04-12,CUST1000,Male,47,Electronics,4,30,120
