In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



import os
# Collect the full path of every file below the Kaggle input directory,
# in the same order os.walk yields them, then print one path per line.
input_files = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)



In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Ignore replce() FutureWarning.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


In [None]:
# Path of the sales CSV inside the Kaggle input directory.
datapath = "/kaggle/input/sales-of-a-supermarket/supermarket_sales.csv"

# Read the raw table; every later cell works on this frame.
supermarket = pd.read_csv(filepath_or_buffer=datapath)

# Preview the first five rows.
supermarket.head(5)

# 01. Data Cleaning

### Check for nulls and duplicates

In [None]:
# Count fully duplicated rows and missing cells across the whole frame.
duplicate_rows = supermarket.duplicated().sum()
null_cells = supermarket.isna().sum().sum()

print(f'{duplicate_rows} duplicates found')
print(f'{null_cells} nulls found')

### Inspect each column's data type and non-null count

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
supermarket.info()

# 02. Sales Performance

## Branches

### How are the branches performing in terms of total sales?

In [None]:
# Sum the 'Total' column for each value of 'City'.
city = supermarket.groupby('City')['Total'].sum()
print(city)

# Shared pie-chart palette; later cells reuse `colors`.
colors = ['#3B979B','#ACB292','#C49F7D']

# Pie chart of each city's share of the summed 'Total'.
ax = city.plot.pie(autopct='%1.1f%%', colors=colors, startangle=180)
ax.axis('equal')  # equal axis scaling keeps the pie circular
ax.set_title('Sales of Branches')

plt.show()

# 03. Analysing product line

### Which product line contributes the most revenue?

In [None]:
# Sum 'Total' per product line, sorted ascending.
product_line = (
    supermarket
    .groupby('Product line')['Total']
    .sum()
    .sort_values()
    .reset_index(name='Total sale')
)
print(product_line)

# Horizontal bar chart, coloured by the summed value.
fig = px.bar(
    product_line,
    x='Total sale',
    y='Product line',
    color='Total sale',
    orientation='h',
)

# Title and figure size.
fig.update_layout(title='Sales of Product Lines', width=700, height=500)
fig.show()

### Peak months and Peak hours

In [None]:
# Parse the 'Date' column into datetimes so the .dt accessors can be used.
supermarket['Date'] = pd.to_datetime(supermarket['Date'])

# Add a year column.
supermarket['year'] = supermarket['Date'].dt.year
# Only the last expression of a cell is displayed, so the years are printed
# explicitly instead of being evaluated and silently dropped.
print(supermarket['year'].unique())

# Add a month column; the distinct months are shown as the cell's output.
supermarket['month'] = supermarket['Date'].dt.month
supermarket['month'].unique()

In [None]:
import plotly.graph_objects as go

# Sum 'Total' for each value of the 'month' column.
trend = supermarket.groupby('month')['Total'].sum().reset_index()

# viz
fig = go.Figure()

# bar
fig.add_trace(go.Bar(
    x=trend['month'],
    y=trend['Total'],
    name='Total Sales',
    marker_color='skyblue',
    width=0.3
))

# line drawn over the same values as the bars
fig.add_trace(go.Scatter(
    x=trend['month'],
    y=trend['Total'],
    name='Trend Line',
    mode='lines+markers',  # draw both the connecting line and the point markers
    line=dict(color='blue', width=1)  # line colour and width
))

# layouts
# NOTE(review): the tick labels assume the data only covers months 1-3
# (January-March) - confirm against the dataset.
fig.update_layout(
    title='Sales by Month',
    width=700,
    height=500,
    xaxis=dict(
        tickvals=[1, 2, 3],
        ticktext=['January', 'February', 'March'],
        title='Month'
    ),
    yaxis=dict(title='Total Sales')
)

# Render explicitly, like the other plotly cells, instead of relying on the
# return value of update_layout being the cell's last expression.
fig.show()

In [None]:
# Parse the time-of-day strings once; both derived columns come from this
# result, so 'Hour' no longer depends on re-parsing datetime.time objects.
parsed_time = pd.to_datetime(supermarket['Time'], format='%H:%M:%S')
supermarket['Time'] = parsed_time.dt.time
supermarket['Hour'] = parsed_time.dt.hour

# Sum of 'Total' for each hour.
hourly_sales = supermarket.groupby('Hour')['Total'].sum()

# Number of rows for each hour.
hourly_transactions = supermarket.groupby('Hour').size()

# Two stacked panels: sales on top, row counts below.
fig, (sales_ax, count_ax) = plt.subplots(2, 1, figsize=(8, 6))

# Sales each hour
hourly_sales.plot(kind='bar', ax=sales_ax, color='skyblue')
sales_ax.set_title('Hourly Sales Total')
sales_ax.set_xlabel('Hour of the Day')
sales_ax.set_ylabel('Total Sales')

# Transaction counts each hour
hourly_transactions.plot(kind='bar', ax=count_ax, color='orange')
count_ax.set_title('Hourly Transactions')
count_ax.set_xlabel('Hour of the Day')
count_ax.set_ylabel('Number of Transactions')

plt.tight_layout()
plt.show()

# 04. Customer Behavior

## Type of customers and their choice of products







### Gender

In [None]:
# Sum 'Total' for each value of 'Gender'.
gender_sale = supermarket.groupby('Gender')['Total'].sum()
print(gender_sale)

# Pie chart of each gender's share; `colors` is the palette defined in the
# branch-sales cell, so that cell must have been run first.
ax = gender_sale.plot(kind = 'pie', autopct='%1.1f%%', colors = colors, startangle= 90)
ax.axis('equal')
plt.title('Total Sales by Gender')

# Finish with plt.show() so the cell renders the figure instead of echoing
# the Text object returned by plt.title.
plt.show()

In [None]:
# Number of rows for every (product line, gender) pair.
gender = (
    supermarket
    .groupby(['Product line', 'Gender'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: one bar per gender within each product line.
fig = px.bar(
    gender,
    x='Product line',
    y='Count',
    color='Gender',
    barmode='group',
)

# Figure size and title.
fig.update_layout(height=500, width=900, title='Product line by Gender')
fig.show()

### Customer Type (Member or Non-member)

In [None]:
# Sum 'Total' for each value of 'Customer type'.
Ctype_sale = supermarket.groupby('Customer type')['Total'].sum()
print(Ctype_sale)

# Pie chart of each customer type's share; `colors` is the palette defined in
# the branch-sales cell, so that cell must have been run first.
ax = Ctype_sale.plot(kind = 'pie', autopct='%1.1f%%', colors = colors, startangle=90)
ax.axis('equal')
plt.title('Total Sales by Customer type')

# Finish with plt.show() so the cell renders the figure instead of echoing
# the Text object returned by plt.title.
plt.show()

In [None]:
# Number of rows for every (product line, customer type) pair.
# Stored under its own name so the `gender` counts from the earlier cell are
# not overwritten with unrelated data.
customer_type_counts = (
    supermarket
    .groupby(['Product line', 'Customer type'])
    .size()
    .reset_index(name='Count')
)

# Grouped bar chart: one bar per customer type within each product line.
fig = px.bar(
    customer_type_counts,
    x='Product line',
    y='Count',
    color='Customer type',
    barmode='group',
)

# Figure size and title.
fig.update_layout(
    height=500,
    width=900,
    title='Product line by Customer type'
)
fig.show()