In [None]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio

In [None]:
# Global look-and-feel, applied once before any figure is built.
# Five blue hex colours, darkest first; the Plotly charts below receive this
# list through color_discrete_sequence.
custom_palette = [
    "#041E42",
    "#1B365D",
    "#4B6FA5",
    "#7DA0CA",
    "#A7C7E7",
]
# Seaborn theme and Plotly default template.
sns.set_theme(palette="deep", style="whitegrid")
pio.templates.default = "plotly_white"

In [None]:
# Read the raw sales CSV into the working DataFrame `df`.
# NOTE(review): the path is absolute ('/content/...', looks like a Colab upload
# location) — confirm it exists before running in another environment.
df = pd.read_csv(
    filepath_or_buffer='/content/Sales_Data.csv',
    encoding='unicode_escape',
)

In [None]:
# Check the number of rows and columns of the freshly loaded frame;
# `shape` is a (rows, columns) tuple.
df.shape

(11257, 13)

In [None]:
# Preview the first five rows of the imported data.
df.iloc[:5]

Unnamed: 0,User_ID,Cust_name,Product_ID,Age,Age Group,Gender,State,Zone,Zipcode,Profession,Product_Category,Orders,Amount
0,1002903.0,Anvi,P00125942,27.0,26-35,Female,Maharashtra,West,,Healthcare,Sports,4.0,20500.0
1,1000732.0,Shanta,P00110942,34.0,26-35,Female,Andhra Pradesh,South,,Govt,Sports,2.0,25360.0
2,1001990.0,Sheetal,P00118542,16.0,0-17,Female,Uttar Pradesh,Central,,Automobile,Health,4.0,29350.0
3,1001425.0,Virendra,P00237842,16.0,0-17,M,Karnataka,South,,Construction,Clothing,6.0,23500.0
4,1000588.0,Vishal,P00057942,28.0,26-35,M,Gujarat,West,,Food Processing,Electronics,4.0,23870.0


In [None]:
# Field details: print each column's name, non-null count and dtype,
# plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11257 entries, 0 to 11256
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   User_ID           11245 non-null  float64
 1   Cust_name         11245 non-null  object 
 2   Product_ID        11245 non-null  object 
 3   Age               11245 non-null  float64
 4   Age Group         11245 non-null  object 
 5   Gender            11245 non-null  object 
 6   State             11245 non-null  object 
 7   Zone              11245 non-null  object 
 8   Zipcode           0 non-null      float64
 9   Profession        11245 non-null  object 
 10  Product_Category  11245 non-null  object 
 11  Orders            11245 non-null  float64
 12  Amount            11245 non-null  float64
dtypes: float64(5), object(8)
memory usage: 1.1+ MB


Data Cleaning

In [None]:
# Delete the blank 'Zipcode' column (df.info() reports 0 non-null values for it).
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df.drop(columns=['Zipcode'], inplace=True, errors='ignore')

In [None]:
# List the columns still available after the blank column was removed.
df.columns

Index(['User_ID', 'Cust_name', 'Product_ID', 'Age', 'Age Group', 'Gender',
       'State', 'Zone', 'Profession', 'Product_Category', 'Orders', 'Amount'],
      dtype='object')

In [None]:
# Count the missing (null) values in every column.
df.isna().sum()

Unnamed: 0,0
User_ID,12
Cust_name,12
Product_ID,12
Age,12
Age Group,12
Gender,12
State,12
Zone,12
Profession,12
Product_Category,12


In [None]:
# Remove rows in which every column is null (how='all'); rows with only
# some values missing are kept. The frame is modified in place.
df.dropna(axis=0, how='all', inplace=True)

In [None]:
# Re-check the (rows, columns) shape after the cleaning steps above.
df.shape

(11245, 12)

In [None]:
# Normalise the Gender column: the short code 'M' becomes 'Male';
# every other value is left untouched.
df['Gender'] = df['Gender'].replace({'M': 'Male'})

In [None]:
# Show only the rows whose Gender is 'Male'.
is_male = df['Gender'].eq('Male')
df.loc[is_male]

Unnamed: 0,User_ID,Cust_name,Product_ID,Age,Age Group,Gender,State,Zone,Profession,Product_Category,Orders,Amount
3,1001425.0,Virendra,P00237842,16.0,0-17,Male,Karnataka,South,Construction,Clothing,6.0,23500.0
4,1000588.0,Vishal,P00057942,28.0,26-35,Male,Gujarat,West,Food Processing,Electronics,4.0,23870.0
5,1000588.0,Suuraj,P00057942,28.0,26-35,Male,Himachal Pradesh,Northern,Food Processing,Electronics,3.0,23860.0
8,1003224.0,Kushal,P00205642,35.0,26-35,Male,Uttar Pradesh,Central,Govt,Beauty,4.0,23809.0
11,1003829.0,Harsh,P00200842,34.0,26-35,Male,Delhi,Central,Banking,Health,2.0,23770.0
...,...,...,...,...,...,...,...,...,...,...,...,...
11249,1005446.0,Sheetal,P00297742,53.0,51-55,Male,Gujarat,West,Healthcare,Health,3.0,382.0
11250,1005446.0,Sheetal,P00297742,53.0,51-55,Male,Madhya Pradesh,Central,Healthcare,Health,2.0,382.0
11252,1000695.0,Manning,P00296942,19.0,18-25,Male,Maharashtra,West,Chemical,Health,1.0,370.0
11253,1004089.0,Reichenbach,P00171342,33.0,26-35,Male,Haryana,Northern,Healthcare,Health,5.0,367.0


In [None]:
# describe() returns summary statistics of the DataFrame's numeric columns
# (count, mean, std, min, quartiles and max).
df.describe()

Unnamed: 0,User_ID,Age,Orders,Amount
count,11245.0,11245.0,11245.0,11245.0
mean,1003004.0,35.415651,3.500311,9461.934237
std,1716.207,12.756369,1.713706,5234.426634
min,1000001.0,12.0,1.0,188.0
25%,1001492.0,27.0,2.0,5443.0
50%,1003065.0,33.0,4.0,8109.0
75%,1004429.0,43.0,5.0,12683.0
max,1006040.0,92.0,6.0,29350.0


Total transaction count by gender, shown as a bar chart

In [None]:
# Total transaction count per gender (one row of the data = one transaction).
gender_counts = df.loc[:, 'Gender'].value_counts()
print(gender_counts)

Gender
Female    7837
Male      3408
Name: count, dtype: int64


In [None]:
# Tally transactions per gender as a two-column frame: Gender, Count.
gender_counts = (
    df['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='Count')
)

# Interactive bar chart: one bar per gender, labelled with its count.
fig = px.bar(
    data_frame=gender_counts,
    x='Gender',
    y='Count',
    color='Gender',
    text='Count',
    title='Gender Distribution',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textfont={'size': 16})
fig.update_layout(font={'size': 16}, title_font={'size': 24})
fig.show()

*Conclusion : Females do more transactions than males.*

Gender-wise distribution, shown as a pie chart

In [None]:
# Transactions per gender, relabelled to the columns Gender / Count.
gender_counts = (
    df['Gender']
    .value_counts()
    .reset_index()
    .set_axis(['Gender', 'Count'], axis=1)
)

# Interactive donut chart (hole=0.4); each slice shows its share and label.
fig = px.pie(
    data_frame=gender_counts,
    names='Gender',
    values='Count',
    color='Gender',
    hole=0.4,
    title='Gender Distribution (Pie Chart)',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textfont={'size': 16}, textinfo='percent+label')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*Conclusion : Females account for about 69.7% of transactions (7837) versus about 30.3% for males (3408) — roughly 2.3 times as many transactions.*

Gender-wise total sales amount, shown as a bar chart

In [None]:
# Total sales Amount per gender, returned as a flat Gender / Amount table.
df.groupby('Gender', as_index=False)['Amount'].agg('sum')

Unnamed: 0,Gender,Amount
0,Female,74461610.49
1,Male,31937840.0


In [None]:
# Sum Amount per gender, then order the rows from largest to smallest total.
gen_sales = df.groupby('Gender', as_index=False)['Amount'].sum()
gen_sales = gen_sales.sort_values(by='Amount', ascending=False)

# Bar chart of the totals; each bar carries its value as a whole number above it.
fig = px.bar(
    data_frame=gen_sales,
    x='Gender',
    y='Amount',
    color='Gender',
    text='Amount',
    title='Gender-wise Total Sales Amount',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*Conclusion : Females account for a higher total sales amount than males.*

Age Group wise Transactions Count in Bar Chart

In [None]:
# Transaction count per age group, largest first. The resulting index order
# (sns_order) fixes the x-axis category order here and in the next chart.
age_counts = df['Age Group'].value_counts()
age_counts = age_counts.sort_values(ascending=False)
sns_order = age_counts.index

# Histogram over the raw rows: one bar per age group, in sns_order.
fig = px.histogram(
    data_frame=df,
    x='Age Group',
    color='Age Group',
    title='Transactions by Age Group',
    category_orders={'Age Group': sns_order.tolist()},
    color_discrete_sequence=custom_palette,
)
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

Age Group and Gender Distribution

In [None]:
# Grouped histogram: transactions per age group, split into side-by-side bars
# by gender. Reuses sns_order (computed in the age-group count cell) so the
# x-axis order matches the previous chart.
fig = px.histogram(
    data_frame=df,
    x='Age Group',
    color='Gender',
    barmode='group',
    title='Age Group and Gender-wise Transactions',
    category_orders={'Age Group': sns_order.tolist()},
    color_discrete_sequence=custom_palette,
)
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*Conclusion : From the above graphs we can see that most of the buyers are females in the 26-35 yrs age group.*

Age Group-wise Total Sales Amount

In [None]:
# Total Amount per age group, ordered from highest to lowest total.
sales_age = df.groupby('Age Group', as_index=False)['Amount'].sum()
sales_age = sales_age.sort_values(by='Amount', ascending=False)

# Bar chart of those totals with the rounded value printed above each bar.
fig = px.bar(
    data_frame=sales_age,
    x='Age Group',
    y='Amount',
    color='Age Group',
    text='Amount',
    title='Age Group-wise Total Sales Amount',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

Top 5 States by Orders

In [None]:
# Sum Orders per state, sort descending and keep the five largest totals.
order_state = df.groupby(['State'], as_index=False)['Orders'].sum()
order_state = order_state.sort_values(by='Orders', ascending=False)
order_state = order_state.head(5)
print(order_state)

             State  Orders
14   Uttar Pradesh  6669.0
10     Maharashtra  5328.0
7        Karnataka  4506.0
2            Delhi  3893.0
9   Madhya Pradesh  3217.0


In [None]:
# Five states with the highest summed Orders, largest first.
order_state = df.groupby('State', as_index=False)['Orders'].sum()
order_state = order_state.sort_values(by='Orders', ascending=False).head(5)

# Bar chart of the top five; each bar is labelled with its order total.
fig = px.bar(
    data_frame=order_state,
    x='State',
    y='Orders',
    color='State',
    text='Orders',
    title='Top 5 States by Orders',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

Top 5 States by Sales Amount

In [None]:
# Five states with the highest summed sales Amount, largest first.
sales_state = df.groupby('State', as_index=False)['Amount'].sum()
sales_state = sales_state.sort_values(by='Amount', ascending=False).head(5)

# Bar chart of the top five; each bar is labelled with its rounded total.
fig = px.bar(
    data_frame=sales_state,
    x='State',
    y='Amount',
    color='State',
    text='Amount',
    title='Top 5 States by Sales Amount',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*From the above graphs we can see that most of the orders and most of the total sales amount come from Uttar Pradesh, Maharashtra and Karnataka.*


Product Category Analysis

In [None]:
# Number of transactions per product category, as Product_Category / Count.
cat_counts = (
    df['Product_Category']
    .value_counts()
    .rename_axis('Product_Category')
    .reset_index(name='Count')
)

# Bar chart of the counts with the value printed above each bar.
fig = px.bar(
    data_frame=cat_counts,
    x='Product_Category',
    y='Count',
    color='Product_Category',
    text='Count',
    title='Product Category-wise Transactions',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*Conclusion : Product category Beauty has most transactions.*

Amount by Product Category (Top 5)

In [None]:
# Five product categories with the highest summed Amount, largest first.
sales_cat = df.groupby('Product_Category', as_index=False)['Amount'].sum()
sales_cat = sales_cat.sort_values(by='Amount', ascending=False).head(5)

# Bar chart of the top five; each bar is labelled with its rounded total.
fig = px.bar(
    data_frame=sales_cat,
    x='Product_Category',
    y='Amount',
    color='Product_Category',
    text='Amount',
    title='Top 5 Product Categories by Sales Amount',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

Product Category and Gender-wise Sales

In [None]:
# Total Amount for every (product category, gender) pair.
sales_pro = (
    df
    .groupby(['Product_Category', 'Gender'], as_index=False)['Amount']
    .agg('sum')
)

# Grouped bars: one cluster per category, one bar per gender inside it.
fig = px.bar(
    data_frame=sales_pro,
    x='Product_Category',
    y='Amount',
    color='Gender',
    barmode='group',
    text='Amount',
    title='Product Category and Gender-wise Sales',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*From the above graphs we can see that most of the buyers are female.*

Profession Analysis

In [None]:
# Number of transactions per profession, as Profession / Count.
prof_counts = (
    df['Profession']
    .value_counts()
    .rename_axis('Profession')
    .reset_index(name='Count')
)

# Bar chart of the counts with the value printed above each bar.
fig = px.bar(
    data_frame=prof_counts,
    x='Profession',
    y='Count',
    color='Profession',
    text='Count',
    title='Profession-wise Transactions',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*Conclusion : IT professionals perform the most transactions*

Top 3 Professions by Sales

In [None]:
# Three professions with the highest summed Amount, largest first.
sales_prof = df.groupby('Profession', as_index=False)['Amount'].sum()
sales_prof = sales_prof.sort_values(by='Amount', ascending=False).head(3)

# Bar chart of the top three; each bar is labelled with its rounded total.
fig = px.bar(
    data_frame=sales_prof,
    x='Profession',
    y='Amount',
    color='Profession',
    text='Amount',
    title='Top 3 Professions by Sales Amount',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

*From the above graphs we can see that most of the buyers work in the IT, Healthcare and Aviation sectors.*

Top 10 Products by Orders

In [None]:
# Ten product IDs with the highest summed Orders, largest first.
top_products = df.groupby('Product_ID', as_index=False)['Orders'].sum()
top_products = top_products.sort_values(by='Orders', ascending=False).head(10)

# Bar chart of the top ten; each bar is labelled with its order total.
fig = px.bar(
    data_frame=top_products,
    x='Product_ID',
    y='Orders',
    color='Product_ID',
    text='Orders',
    title='Top 10 Products by Orders',
    color_discrete_sequence=custom_palette,
)
fig.update_traces(textposition='outside', texttemplate='%{text:.0f}')
fig.update_layout(font={'size': 16}, title_font={'size': 20})
fig.show()

## Conclusion:

### *Females in the 26-35 yrs age group from Uttar Pradesh, Maharashtra and Karnataka, working in IT, Healthcare and Aviation, are more likely to buy products from the Beauty, Sports and Electronics categories.*