In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime

# Importing Data

In [2]:
# Load the BigBasket product list; index_col=0 makes the CSV's first column the row index.
data = pd.read_csv(
    '../input/bigbasket-entire-product-list-28k-datapoints/BigBasket Products.csv',
    index_col=0,
)

In [3]:
# Preview the first ten rows of the loaded frame.
data.head(10)

# Basic Analysis

In [4]:
# Print the frame summary: column dtypes and non-null counts.
data.info()

In [5]:
# List the column labels.
data.columns

In [6]:
# (rows, columns) of the frame.
data.shape

In [7]:
# Number of distinct values in each column.
data.nunique()

In [8]:
# Distinct category values, in order of first appearance.
list(data['category'].unique())

In [9]:
# Distinct sub-category values, in order of first appearance.
data['sub_category'].unique()

In [10]:
# Count of missing values per column.
data.isna().sum()

In [11]:
# Number of rows per brand, most frequent brand first.
data['brand'].value_counts()

In [12]:
# Number of rows that exactly repeat an earlier row.
data.duplicated().sum()

In [13]:
# Remove exact duplicate rows from `data` in place (the first occurrence is kept),
# then re-count duplicates to confirm none remain.
# NOTE(review): this mutates the frame shown by earlier cells; their outputs are stale after this runs.
data.drop_duplicates(inplace=True)
data.duplicated().sum()

In [14]:
# Non-null product-name count for every (category, sub_category) pair.
# Bracket selection is used because `product` is also the name of a DataFrame method.
count_prod = (
    data.groupby(['category', 'sub_category'])['product']
    .count()
    .to_frame('Total_Prods')
)
count_prod

In [15]:
# Non-null `type` count for every (category, sub_category) pair.
count_type = (
    data.groupby(['category', 'sub_category'])['type']
    .count()
    .to_frame('Total_Type')
)
count_type

In [16]:
# Keep only the columns needed for the price rankings below.
price_data = data[
    ['product', 'brand', 'type', 'sale_price']
]
price_data.head()

# Products with highest price

In [17]:
# Ten rows with the highest sale price.
highest = price_data.sort_values(by='sale_price', ascending=False).head(10)
highest

# Products with lowest price

In [18]:
# Ten rows with the lowest sale price.
lowest = price_data.sort_values(by='sale_price', ascending=True).head(10)
lowest

# Top Rated Products

In [19]:
# Products rated 4.5 or higher, highest rating first.
top_rate = (
    data[data['rating'] >= 4.5]
    .sort_values(by='rating', ascending=False)
)
top_rate[['product', 'brand', 'rating']]

# Lowest Rated Products

In [20]:
# Products rated 2 or lower, lowest rating first.
low_rate = (
    data[data['rating'] <= 2]
    .sort_values(by='rating')
)
low_rate[['product', 'brand', 'rating']]

# Products with most discount

In [21]:
# Work on a copy so the discount column added later does not end up on `data`.
discount_data = data.copy()
discount_data.head()

In [22]:
# Discount expressed as a percentage of the market price, rounded to two decimals.
discount_data['Discount_Percentage'] = (
    (discount_data['market_price'] - discount_data['sale_price'])
    / discount_data['market_price']
    * 100
).round(2)
discount_data.head()

In [23]:
# Ten rows with the largest discount percentage.
top_discount = (
    discount_data[['product', 'brand', 'Discount_Percentage']]
    .sort_values(by='Discount_Percentage', ascending=False)
    .head(10)
)
top_discount

# Products with a discount of 50% or more

In [24]:
# Rows whose discount is 50% or more, then how many of them have a product name.
above50 = discount_data[discount_data['Discount_Percentage'].ge(50.0)]
above50['product'].count()

# Products with a discount of 5% or less

In [25]:
# Rows whose discount is 5% or less, then how many of them have a product name.
below5 = discount_data[discount_data['Discount_Percentage'].le(5.0)]
below5['product'].count()

# Visualizations:

## No. of Products Per Brand

In [26]:
# Rows per brand, most frequent first.
brand_prod = data['brand'].value_counts()
brand_prod

In [27]:
# Brand -> row count as a two-column frame ('brand', 'count'), largest count first.
data_df = (
    data.groupby('brand')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
data_df.head()

In [28]:
# Horizontal bar chart of the 15 brands with the most products, each bar labelled with its count.
plt.figure(figsize=(16, 12))
kl = sns.barplot(data=data_df.head(15), x='count', y='brand', palette='rocket')
kl.bar_label(kl.containers[0])
kl.set(
    xlabel="Count of Products",
    ylabel='Brand Name',
    title='No. of Products Per Brand',
)
plt.show()

## No. of Products Per Category

In [29]:
# Category -> row count as a two-column frame ('category', 'count'), largest count first.
category_data = (
    data.groupby('category')
    .size()
    .reset_index(name='count')
    .sort_values(by='count', ascending=False)
)
category_data.head()

In [30]:
# Horizontal bar chart of product counts for every category, each bar labelled with its count.
plt.figure(figsize=(18, 9))
cd = sns.barplot(data=category_data, x='count', y='category', palette='Paired')
cd.bar_label(cd.containers[0])
cd.set(
    xlabel='No. of Products',
    ylabel='Categories',
    title="Number of Products Per Category",
)
plt.show()

In [31]:
# Distinct categories, in order of first appearance; iterated by the per-category charts.
cat = list(data['category'].unique())
cat

In [32]:
# Flatten the (category, sub_category) counts into columns, ordered by category
# and then by product count, both descending.
cat_subcat = (
    count_prod
    .sort_values(by=['category', 'Total_Prods'], ascending=False)
    .reset_index()
)
cat_subcat

## All Categories with sub-category product count

In [33]:
# One donut chart per category showing how its products split across sub-categories.
# Plotly builds and renders its own figure, so no matplotlib figure is opened here:
# a plt.figure() call in this loop would only create an empty figure per category
# that is never drawn on or closed.
for i in cat:
    # Filter once and reuse the rows for both the labels and the values.
    subcat_rows = cat_subcat[cat_subcat['category']==i]
    labels_subcat = subcat_rows['sub_category'].tolist()
    values_subcat = subcat_rows['Total_Prods'].tolist()
    hui=px.pie(names=labels_subcat,values=values_subcat,hole=0.4,title=i)
    hui.show()

## No. of Products Per Rating

In [34]:
# Histogram of ratings in half-point bins.
# Bin edges run 1.0, 1.5, ..., 5.5; align='left' centres each bar on its bin's left edge.
plt.figure(figsize=(15, 6))
axi = plt.hist(
    data['rating'],
    bins=np.arange(1, 6, 0.5),
    edgecolor="black",
    align='left',
    linewidth=2,
)
plt.gca().set(
    xlabel="Rating",
    ylabel="No of products",
    title="No of products per rating",
)
plt.show()

## Sub-category box plot of Beauty & Hygiene with sale_price

In [35]:
# Restrict to the 'Beauty & Hygiene' category and keep the columns used by the box plot,
# ordered by ascending sale price.
BH_df = data[data['category']=='Beauty & Hygiene']
beauty_df = BH_df[['category','sub_category','sale_price']].sort_values('sale_price')
# Show one example row; the fixed random_state makes the displayed row the same on every run.
beauty_df.sample(n=1, random_state=42)

In [36]:
# Box plot of sale price for each sub-category of Beauty & Hygiene.
plt.figure(figsize=(20,15))
sns.boxplot(data=beauty_df,y='sub_category',x='sale_price')
plt.xlabel("Sales Price")
# The y axis plots `sub_category`, so it is labelled as such rather than "Category".
plt.ylabel("Sub-category")
plt.title("Sub-category Distribution of Beauty & Hygiene Products")
plt.show()

## Average Discount Percentage VS Category

In [37]:
# Discount percentage by category. Each category has many rows; seaborn's lineplot
# aggregates repeated x values (mean by default), which is what the title's "Avg" refers to.
plt.figure(figsize=(24, 12))
sns.lineplot(
    data=discount_data,
    x='category',
    y='Discount_Percentage',
)
plt.xticks(rotation=90)  # category names are long; rotate them so they do not overlap
plt.gca().set_title("Avg Discount Percentage VS Category")
plt.show()