# Commercial Store Data

<h2>Import Libraries</h2>

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import plotly.express as px


<h2> Load the Data</h2>

In [25]:
storeData=pd.read_csv("data.csv", low_memory=False)

In [26]:
#to display  max values
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
#create dataframe 
df = pd.DataFrame(storeData)

## Inspect data

In [27]:
# show data
df.head(100)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,12/1/2010 8:26,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,12/1/2010 8:26,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,12/1/2010 8:28,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,12/1/2010 8:28,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,12/1/2010 8:34,1.69,13047.0,United Kingdom


In [28]:
# summary of data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27865 entries, 0 to 27864
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   InvoiceNo    27865 non-null  object 
 1   StockCode    27865 non-null  object 
 2   Description  27752 non-null  object 
 3   Quantity     27865 non-null  int64  
 4   InvoiceDate  27865 non-null  object 
 5   UnitPrice    27865 non-null  float64
 6   CustomerID   17928 non-null  float64
 7   Country      27865 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 1.7+ MB


In [29]:
""" 
    InvoiceNo:invoice no: of customer
    StockCode:unique identifier assigned to each product 
    Description:product name
    Quantity:quantity of product 
    InvoiceDate:date of purchase
    UnitPrice:price per unit item
    CustomerID:unique ID of customer
    Country:country of customer
    TotalValue:total amount spend"""
#column information
df.columns

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')

In [30]:
"""Count: The number of non-null entries.
Mean: The average value.
Standard Deviation (std): The spread of the data.
Min: The minimum value.
25%: The 25th percentile (first quartile).
50%: The 50th percentile (median).
75%: The 75th percentile (third quartile).
Max: The maximum value."""
# Descriptive statistics
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,27865.0,27865.0,17928.0
mean,7.307088,6.770915,15582.891789
std,67.768182,171.499338,1755.083386
min,-9360.0,0.0,12347.0
25%,1.0,1.45,14205.0
50%,2.0,2.51,15628.0
75%,6.0,4.25,17218.0
max,2880.0,13541.33,18269.0


## Handle Missing Values

In [31]:
# Check for missing values
print(df.isnull().sum())


InvoiceNo         0
StockCode         0
Description     113
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID     9937
Country           0
dtype: int64


In [32]:
#  Remove rows with missing values
df = df.dropna()


## Remove Duplicates

In [33]:
# Check for duplicates
print(df.duplicated().sum())

407


In [34]:
# Remove duplicates
df = df.drop_duplicates()

## Convert Data Types

In [35]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
df['CustomerID'] = df['CustomerID'].astype('int'
df['InvoiceNo'] = df['InvoiceNo'].astype('int'))

SyntaxError: invalid syntax. Perhaps you forgot a comma? (646013866.py, line 2)

##  Outliers

In [None]:
Q1 = df['Quantity'].quantile(0.25)
Q3 = df['Quantity'].quantile(0.75)
IQR = Q3 - Q1

# outlier boundaries
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers
outliers = df[(df['Quantity'] < lower_bound) | (df['Quantity'] > upper_bound)]
outliers

In [None]:
#removing
dfCleaned = df[(df['Quantity'] >= lower_bound) & (df['Quantity'] <= upper_bound)]

## Store Cleaned Data

In [None]:
dfCleaned.to_csv('cleanedStoreData.csv', index=False)

## cleaned data

In [None]:
cleanedStoreData=pd.read_csv("data.csv", low_memory=False)
df1 = pd.DataFrame(cleanedStoreData)

In [None]:
# Add new column 'Total Value' which is Quantity * Unit Price
df1['TotalValue'] = df1['Quantity'] * df1['UnitPrice']
# data


# "Store data"

In [None]:
df1.head(100)

In [None]:
# Total rows and columns
df1.shape

In [None]:
#to find customer who has spend maximum money in single  item purchase
df1.loc[df1["TotalValue"].argmax()][["CustomerID","Description","TotalValue"]]

# Product List

In [None]:
#total number of uniqe products 
print(len(df1.Description.unique()))
#product List
print("\n".join(map(str, df1.Description.unique())))

# Countries List

In [None]:
#total number of uniqe countries 
print(len(df1.Country.unique()))
#Country List
print("\n".join(df1.Country.unique()))

### Total money spend by customers(country wise)

In [None]:
salesByCountry = df1.groupby('Country')['TotalValue'].sum().sort_values(ascending=False)

# to dataframe
salesByCountrydf = salesByCountry.reset_index()
salesByCountrydf.columns = ['Country', 'TotalExpense']

# Print the DataFrame
salesByCountrydf

### total quantity of products sold overall

In [None]:
totalProductsSold = df1.groupby('Description')['Quantity'].sum().sort_values(ascending=False).head(100)
# show data
totalProductsSold


###  total amount spend  by customer over all

In [None]:
CustomerTotalSpending = df1.groupby('CustomerID')['TotalValue'].sum().sort_values(ascending=False).head(100)
# show data
CustomerTotalSpending

## Customers from each country

In [None]:
customerCounts = df.groupby('Country')['CustomerID'].nunique().sort_values(ascending=False)
customerCounts

### daily sales amount

In [None]:
df1['InvoiceDate'] = pd.to_datetime(df1['InvoiceDate'])
dailySales = df1.groupby(df1['InvoiceDate'].dt.to_period('D'))['TotalValue'].sum()
dailySalesdf = dailySales.reset_index()
dailySalesdf.columns = ['Date', 'TotalExpense']
# Extracting the day from the date for ploting graph
dailySalesdf['Day'] = dailySalesdf['Date'].dt.day
dailySalesdf


# Visualizations

## Graph to show Total Expense by Country in Log Scale

In [None]:
plt.figure(figsize=(10, 6))
plt.bar(salesByCountrydf['Country'], salesByCountrydf['TotalExpense'], color='skyblue')
plt.yscale('log')
plt.xlabel('Country')
plt.ylabel('Total Expense (log scale)')
plt.title('Total Expense by Country (Log Scale)')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Graph to show daily  Expense

In [None]:
plt.figure(figsize=(14, 6))
plt.bar(dailySalesdf['Day'], dailySalesdf['TotalExpense'], color='skyblue')
plt.xlabel('Day')
plt.ylabel('Total Expense')
plt.title('Daily Expense')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()

## Graph to show customer total spending

In [None]:
plt.figure(figsize=(17,8))
CustomerTotalSpending.plot(kind='bar')
plt.title('Customer Total Spending')
plt.xlabel('CustomerID')
plt.ylabel('Total Spending')
plt.tight_layout()
plt.show()

## Graph to show customer total quantity of product sold

In [None]:
plt.figure(figsize=(17,8))
totalProductsSold.plot(kind='bar')
plt.title('Total quantity of product sold')
plt.xlabel('Quantity')
plt.ylabel('Description')
plt.tight_layout()
plt.show()

In [None]:
# pairwise relationships between quantity and unit price with respect to country
sns.pairplot(df, vars=['Quantity', 'UnitPrice'], hue='Country')

In [None]:
customerCounts = customerCounts.reset_index()
fig = px.bar(customerCounts, x='Country', y='CustomerID', title='Number of Customers by Country')
fig.show()

In [None]:
#  distribution of a quantity  variable
sns.histplot(df['Quantity'], bins=30, kde=True, color='blue')

In [None]:
# relationships between quantity and unitprice variables
sns.scatterplot(x='Quantity', y='UnitPrice', data=df)