# Data Cleaning

In [None]:
# import python libraries

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt # visualizing data
%matplotlib inline
import seaborn as sns

In [None]:
# Load the raw sales export into the working DataFrame `df`.
# The 'unicode_escape' codec is passed for decoding — presumably the file is
# not valid UTF-8; TODO confirm against the data source.
df = pd.read_csv(
    filepath_or_buffer='Diwali Sales Data.csv',
    encoding='unicode_escape',
)

#### Rows And Columns

In [None]:
df.shape  # tuple of (number of rows, number of columns)

## Dataset

In [None]:
df.head()  # first 5 entries (rows) of the dataset

### Information About Dataset

In [None]:
# Print a concise summary of the frame: columns, non-null counts and dtypes
df.info()

In [None]:
# Drop the unrelated/blank columns.
# errors='ignore' keeps this cell idempotent: re-running it after the columns
# are already gone is a no-op instead of raising KeyError. The result is bound
# back to `df` rather than mutating in place.
df = df.drop(columns=['Status', 'unnamed1'], errors='ignore')

In [None]:
# Boolean null mask (True where a value is missing).
# Shown as a bare last expression so the notebook renders it as a table, and
# limited to the first rows so the whole frame is not dumped into the output.
df.isnull().head()

### Null Values Sum

In [None]:
# Number of missing values in each column
df.isna().sum()

In [None]:
# Remove every row that has at least one missing value (modifies `df` in place)
df.dropna(axis=0, how='any', inplace=True)

### DataType Changing

In [None]:
# Cast the 'Amount' column to an integer dtype and store it back under the same name.
# NOTE(review): presumably 'Amount' is loaded as float, in which case the cast
# discards any fractional part — confirm this is intended.
df['Amount'] = df['Amount'].astype(dtype='int')

In [None]:
# Confirm the dtype of 'Amount' after the cast
df['Amount'].dtype

In [None]:
# List the column labels currently present in the frame
df.columns

In [None]:
# Preview of the frame with 'Marital_Status' renamed to 'Shaadi'.
# rename() returns a new DataFrame; the result is not assigned back, so `df`
# itself is unchanged. It must stay that way: later cells still read the
# 'Marital_Status' column. Only the first rows are displayed to keep the
# output short.
df.rename(columns={'Marital_Status': 'Shaadi'}).head()

In [None]:
# describe() returns summary statistics of the data in the DataFrame
# (count, mean, std, min, quartiles, max); with default arguments on a
# mixed-type frame pandas reports the numeric columns.
df.describe()

In [None]:
# Summary statistics restricted to the three columns of interest
df[
    ['Age', 'Orders', 'Amount']
].describe()

# Exploratory Data Analysis

### Gender

In [None]:
# Bar chart of the number of records per gender, with each bar annotated
# with its count.
ax = sns.countplot(data=df, x='Gender')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# Total purchase amount per gender, largest first, drawn as a bar chart
sales_gen = (
    df.groupby(['Gender'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
)

sns.barplot(data=sales_gen, x='Gender', y='Amount')

*From the above graphs we can see that most of the buyers are female, and that the purchasing power of women is greater than that of men.*

### Age

In [None]:
# Set the figure size BEFORE plotting: the figure.figsize rc setting is read
# when a figure is created, so configuring it after countplot() would leave
# this chart at the previous size and only affect later cells.
sns.set(rc={'figure.figsize': (9, 5)})

# Number of records per age group, split by gender, with counts on the bars
ax = sns.countplot(data=df, x='Age Group', hue='Gender')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# Total purchase amount per age group, largest first
sales_age = (
    df.groupby(['Age Group'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
)

sns.barplot(data=sales_age, x='Age Group', y='Amount')

*From the above graphs we can see that most of the buyers are women in the 26-35 yrs age group.*

### State

In [None]:
# Ten states with the highest total number of orders, largest first
sales_state = (
    df.groupby(['State'], as_index=False)['Orders']
    .sum()
    .sort_values(by='Orders', ascending=False)
    .head(10)
)

# Wide figure on a white grid background
sns.set(rc={'figure.figsize': (16, 5)})
sns.set_style("whitegrid")

# One 'coolwarm' shade per plotted state
custom_colors = sns.color_palette(palette='coolwarm', n_colors=len(sales_state))

# 'hue' is passed explicitly (same column as x) to avoid the palette warning;
# the legend would only repeat the x tick labels, so it is switched off.
state_ax = sns.barplot(
    data=sales_state,
    x='State',
    y='Orders',
    hue='State',
    palette=custom_colors,
    legend=False,
)

# Title and axis labels
state_ax.set_title('Total Number of Orders by State', fontsize=16, fontweight='bold', color='darkblue')
state_ax.set_xlabel('State', fontsize=14, fontweight='bold')
state_ax.set_ylabel('Orders', fontsize=14, fontweight='bold')

plt.show()

*From above graphs we can see that most of the orders & total sales/amount are from Uttar Pradesh, Maharashtra and Karnataka respectively*


### Marital Status

In [None]:
# Set the figure size BEFORE plotting: the figure.figsize rc setting is read
# when a figure is created, so configuring it after countplot() would not
# resize this chart.
sns.set(rc={'figure.figsize': (5, 4)})

# Number of records per marital status, with counts on the bars
ax = sns.countplot(data=df, x='Marital_Status')

for container in ax.containers:
    ax.bar_label(container)

In [None]:
# Total purchase amount per (marital status, gender) pair, largest first.
# The notebook-wide name `sales_state` is reused here for the aggregate.
sales_state = (
    df.groupby(['Marital_Status', 'Gender'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
)

sns.set(rc={'figure.figsize': (6, 5)})
sns.barplot(
    data=sales_state,
    x='Marital_Status',
    y='Amount',
    hue='Gender',
)

*From above graphs we can see that most of the buyers are married (women) and they have high purchasing power*

### Occupation

In [None]:
# Wide figure so every occupation label fits on the x axis
sns.set(rc={'figure.figsize': (20, 5)})

# Number of records per occupation, with counts on the bars
ax = sns.countplot(x='Occupation', data=df)
for container in ax.containers:
    ax.bar_label(container)

In [None]:
# Total purchase amount per occupation, largest first
sales_state = (
    df.groupby(['Occupation'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
)

# Wide figure for the many occupation labels
sns.set(rc={'figure.figsize': (20, 5)})

# One 'coolwarm' shade per distinct occupation in the data
num_categories = df['Occupation'].nunique()
custom_colors = sns.color_palette(palette='coolwarm', n_colors=num_categories)

# hue mirrors x so the palette is applied per bar; legend is switched off
sns.barplot(
    data=sales_state,
    x='Occupation',
    y='Amount',
    hue='Occupation',
    palette=custom_colors,
    legend=False,
)

*From the above graphs we can see that most of the buyers are working in the IT, Healthcare and Aviation sectors.*

### Product Category

In [None]:
# Extra-wide figure so every product category label fits on the x axis
sns.set(rc={'figure.figsize': (25, 7)})

# Number of records per product category, with counts on the bars
ax = sns.countplot(x='Product_Category', data=df)
for container in ax.containers:
    ax.bar_label(container)


In [None]:
# Ten product categories with the highest total purchase amount, largest first
sales_state = (
    df.groupby(['Product_Category'], as_index=False)['Amount']
    .sum()
    .sort_values(by='Amount', ascending=False)
    .head(10)
)

# Build a palette sized to the bars actually drawn. Reusing `custom_colors`
# from the Occupation cell made this cell depend on that cell having run and
# on its palette length (number of occupations), which need not match the
# number of categories plotted here.
category_palette = sns.color_palette('coolwarm', len(sales_state))

sns.set(rc={'figure.figsize': (20, 5)})
sns.barplot(
    data=sales_state,
    x='Product_Category',
    y='Amount',
    hue='Product_Category',
    palette=category_palette,
    legend=False,
)

*From above graphs we can see that most of the sold products are from Food, Clothing and Electronics category*

In [None]:
# Ten products with the highest total number of orders, largest first
sales_state = (
    df.groupby(['Product_ID'], as_index=False)['Orders']
    .sum()
    .sort_values(by='Orders', ascending=False)
    .head(10)
)

# Build a palette sized to the bars actually drawn. Reusing `custom_colors`
# from the Occupation cell made this cell depend on that cell having run and
# on its palette length (number of occupations), which need not match the
# number of products plotted here.
product_palette = sns.color_palette('coolwarm', len(sales_state))

sns.set(rc={'figure.figsize': (20, 5)})
sns.barplot(
    data=sales_state,
    x='Product_ID',
    y='Orders',
    hue='Product_ID',
    palette=product_palette,
    legend=False,
)

In [None]:
# Top 10 most sold products (same ranking as the seaborn chart of orders per
# Product_ID), drawn with the pandas plotting API.
fig1, ax1 = plt.subplots(figsize=(12, 7))

# nlargest() already returns the values in descending order, so no extra
# sort is needed.
top_products = df.groupby('Product_ID')['Orders'].sum().nlargest(10)

# Draw on the axes created above explicitly instead of relying on them being
# the "current" axes.
top_products.plot(kind='bar', ax=ax1)
ax1.set_title('Top 10 Most Sold Products')
ax1.set_ylabel('Orders')

plt.show()

### Conclusion:

## Married women aged 26-35 yrs from UP, Maharashtra and Karnataka, working in IT, Healthcare and Aviation, are more likely to buy products from the Food, Clothing and Electronics categories