In [2]:
# Step 1: Business Questions (CRITICAL)

# Before writing any code, define the questions this analysis should answer:

# How are sales trending over time?
# Which product categories generate the most revenue?
# Do gender or age groups affect spending?
# Who are the high-value customers?
# What drives total revenue the most?

In [4]:
# Step 2: Load & Inspect Data

import pandas as pd

# Read the raw retail transactions; the path is relative to the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='Datasets/retail_sales_dataset.csv')

In [6]:
# Preview the first four rows to sanity-check the columns that were loaded.
dataset.head(n=4)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500


In [8]:
# Summarise column names, non-null counts and dtypes to spot missing values
# and columns that still need a type conversion (e.g. Date loaded as object).
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Transaction ID    1000 non-null   int64 
 1   Date              1000 non-null   object
 2   Customer ID       1000 non-null   object
 3   Gender            1000 non-null   object
 4   Age               1000 non-null   int64 
 5   Product Category  1000 non-null   object
 6   Quantity          1000 non-null   int64 
 7   Price per Unit    1000 non-null   int64 
 8   Total Amount      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB


In [10]:
# Step 3: Parse the Date column
# Date is read from the CSV as plain strings (object dtype); convert it to
# datetime so the .dt accessor can be used on it.

dataset['Date'] = dataset['Date'].pipe(pd.to_datetime)

In [14]:
# Step 4: Feature Engineering
# Derive calendar features (numeric year, numeric month, month name) from Date.

dataset['year'], dataset['month'], dataset['month_name'] = (
    dataset['Date'].dt.year,
    dataset['Date'].dt.month,
    dataset['Date'].dt.month_name(),
)

In [16]:
# Confirm the new year / month / month_name columns were added as expected.
dataset.head(n=5)

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount,year,month,month_name
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150,2023,11,November
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000,2023,2,February
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30,2023,1,January
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500,2023,5,May
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100,2023,5,May


In [18]:
# Age Group
# Bucket each customer's age into a labelled band. pd.cut builds right-closed
# intervals by default, i.e. (0, 18], (18, 35], (35, 60], (60, 100]; ages
# outside (0, 100] become NaN.
dataset['age_group'] = pd.cut(
    x=dataset['Age'],
    bins=[0, 18, 35, 60, 100],
    labels=['Teen', 'Young Adult', 'Adult', 'Senior'],
    right=True,
)

In [20]:
# Display the frame to check the new age_group column. pandas truncates the
# rendering to the first and last rows; NOTE(review): consider dataset.head()
# or dataset.sample(5) to keep the saved output small.
dataset

Unnamed: 0,Transaction ID,Date,Customer ID,Gender,Age,Product Category,Quantity,Price per Unit,Total Amount,year,month,month_name,age_group
0,1,2023-11-24,CUST001,Male,34,Beauty,3,50,150,2023,11,November,Young Adult
1,2,2023-02-27,CUST002,Female,26,Clothing,2,500,1000,2023,2,February,Young Adult
2,3,2023-01-13,CUST003,Male,50,Electronics,1,30,30,2023,1,January,Adult
3,4,2023-05-21,CUST004,Male,37,Clothing,1,500,500,2023,5,May,Adult
4,5,2023-05-06,CUST005,Male,30,Beauty,2,50,100,2023,5,May,Young Adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2023-05-16,CUST996,Male,62,Clothing,1,50,50,2023,5,May,Senior
996,997,2023-11-17,CUST997,Male,52,Beauty,3,30,90,2023,11,November,Adult
997,998,2023-10-29,CUST998,Female,23,Beauty,4,25,100,2023,10,October,Young Adult
998,999,2023-12-05,CUST999,Female,36,Electronics,3,50,150,2023,12,December,Adult


In [22]:
# Step 5: Core KPIs
# Total revenue: the sum of Total Amount over every transaction row.

total_revenue = dataset['Total Amount'].agg('sum')
total_revenue

456000

In [24]:
# Average transaction value: the mean of Total Amount across all rows.

avg_transaction_value = dataset['Total Amount'].agg('mean')
avg_transaction_value

456.0

In [26]:
# Total transactions: count distinct Transaction IDs (missing IDs excluded)
# rather than rows, so a repeated ID is only counted once.

total_transactions = dataset['Transaction ID'].nunique(dropna=True)
total_transactions

1000

In [28]:
# Step 6: Sales Trend Over Time
# Revenue per (year, month). as_index=False keeps the group keys as ordinary
# columns, giving a flat frame with a default integer index.

monthly_sales = dataset.groupby(['year', 'month'], as_index=False)['Total Amount'].sum()

monthly_sales

Unnamed: 0,year,month,Total Amount
0,2023,1,35450
1,2023,2,44060
2,2023,3,28990
3,2023,4,33870
4,2023,5,53150
5,2023,6,36715
6,2023,7,35465
7,2023,8,36960
8,2023,9,23620
9,2023,10,46580


In [30]:
# Step 7: Product Category Performance
# Total revenue per product category, highest-earning category first.

category_sales = (
    dataset['Total Amount']
    .groupby(dataset['Product Category'])
    .sum()
    .sort_values(ascending=False)
)

category_sales

Product Category
Electronics    156905
Clothing       155580
Beauty         143515
Name: Total Amount, dtype: int64

In [34]:
# Step 8: Customer Segmentation Insights
# Revenue by Gender: one row per gender with its summed Total Amount.

gender_sales = dataset.groupby('Gender', as_index=False)['Total Amount'].sum()
gender_sales


Unnamed: 0,Gender,Total Amount
0,Female,232840
1,Male,223160


In [42]:
# Revenue by Age Group
# age_group is a categorical column (created with pd.cut). Grouping on a
# categorical without an explicit `observed` argument emits a pandas
# FutureWarning because the default is changing from False to True.
# Passing observed=False pins the current behaviour: every age band appears
# in the result, even one with no transactions, and the warning is silenced.
age_group_sales = (
    dataset.groupby('age_group', observed=False)['Total Amount']
    .sum()
    .reset_index()
)
age_group_sales


  dataset.groupby('age_group')['Total Amount']


Unnamed: 0,age_group,Total Amount
0,Teen,11215
1,Young Adult,171815
2,Adult,239745
3,Senior,33225


In [44]:
# Step 9: High-Value Customers
# Lifetime spend per customer, largest spender first.

customer_value = (
    dataset['Total Amount']
    .groupby(dataset['Customer ID'])
    .sum()
    .sort_values(ascending=False)
)

# Keep the five highest totals.
# NOTE(review): customers with equal totals are not ordered by any secondary
# key, so which tied customers land in the top five is arbitrary.
top_customers = customer_value.iloc[:5]
top_customers

Customer ID
CUST487    2000
CUST476    2000
CUST773    2000
CUST503    2000
CUST093    2000
Name: Total Amount, dtype: int64

In [46]:
# Step 10: What Actually Drives Revenue?
# Pairwise Pearson correlation between quantity, unit price and total amount.
# NOTE(review): Total Amount looks like Quantity x Price per Unit in the
# previewed rows, so strong correlations with it are expected - confirm.

dataset.loc[:, ['Quantity', 'Price per Unit', 'Total Amount']].corr(method='pearson')

Unnamed: 0,Quantity,Price per Unit,Total Amount
Quantity,1.0,0.017501,0.373707
Price per Unit,0.017501,1.0,0.851925
Total Amount,0.373707,0.851925,1.0
