## Shopper Spectrum: Customer Segmentation and Product Recommendations in E-Commerce

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Load the raw transactions export into the working frame.
RAW_CSV_PATH = "online_retail.csv"
df = pd.read_csv(RAW_CSV_PATH)

### Data Cleaning

In [3]:
# Preview the first rows to see which columns the file provides.
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2022-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2022-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2022-12-01 08:26:00,3.39,17850.0,United Kingdom


In [4]:
# Count rows that have no CustomerID.
df['CustomerID'].isna().sum()


np.int64(135080)

In [5]:
# (rows, columns) of the frame before any cleaning is applied.
df.shape

(541909, 8)

In [6]:
# Apply the cleaning filters one at a time, printing the frame size after each
# step so the number of rows removed by every rule is visible.
print("Before cleaning:", df.shape)

# 1. Rows without a CustomerID are dropped.
df = df.dropna(subset=['CustomerID'])
print("After removing missing CustomerID:", df.shape)

# 2. Invoices whose number starts with 'C' are treated as cancellations.
is_cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
df = df[~is_cancelled]
print("After removing cancelled invoices:", df.shape)

# 3. Keep only rows with a strictly positive quantity and unit price.
has_positive_quantity = df['Quantity'].gt(0)
has_positive_price = df['UnitPrice'].gt(0)
df = df[has_positive_quantity & has_positive_price]
print("After removing invalid quantity/price:", df.shape)


Before cleaning: (541909, 8)
After removing missing CustomerID: (406829, 8)
After removing cancelled invoices: (397924, 8)
After removing invalid quantity/price: (397884, 8)


In [7]:
# Number of rows still carrying a zero or negative quantity.
df['Quantity'].le(0).sum()

np.int64(0)

In [8]:
# Number of rows still carrying a unit price of exactly zero.
df['UnitPrice'].eq(0).sum()

np.int64(0)

In [9]:
# Post-cleaning sanity checks.
# A notebook cell displays only its final expression, so the missing-CustomerID
# check is enforced with an assertion instead of being left as a bare expression
# whose result would be silently discarded.
n_missing_customer = df['CustomerID'].isnull().sum()
assert n_missing_customer == 0, f"{n_missing_customer} rows still have no CustomerID"

# Cancelled invoices (InvoiceNo starting with 'C') that remain; this count is
# the value displayed as the cell output.
df['InvoiceNo'].astype(str).str.startswith('C').sum()

np.int64(0)

In [13]:

# Snapshot the cleaned frame and write it to disk without the index column.
CLEANED_CSV_PATH = "online_retail_cleaned.csv"
df1 = df.copy()
df1.to_csv(CLEANED_CSV_PATH, index=False)


In [14]:
# Count zero-priced rows in the saved snapshot.
df1['UnitPrice'].eq(0).sum()

np.int64(0)

In [15]:
# Display the Python type of the snapshot object.
type(df1)


pandas.core.frame.DataFrame

### Exploratory Data Analysis (EDA)

In [16]:
# Parse the invoice timestamps and add a per-line revenue column
# (TotalPrice = Quantity * UnitPrice).
# `df` is the result of row filtering in the cleaning step, and item assignment
# on such a frame can trigger pandas' SettingWithCopyWarning. `assign` returns a
# new frame instead: InvoiceDate is replaced in place and TotalPrice is appended
# as the last column, matching the column layout of the item-assignment form.
df = df.assign(
    InvoiceDate=pd.to_datetime(df['InvoiceDate']),
    TotalPrice=df['Quantity'] * df['UnitPrice'],
)


In [17]:
# Distinct invoice numbers per country, largest first; show the top ten.
invoices_per_country = df.groupby('Country')['InvoiceNo'].nunique()
country_txn = invoices_per_country.sort_values(ascending=False)
country_txn.head(10)


Country
United Kingdom    16646
Germany             457
France              389
EIRE                260
Belgium              98
Netherlands          94
Spain                90
Portugal             57
Australia            57
Switzerland          51
Name: InvoiceNo, dtype: int64

In [18]:
import matplotlib.pyplot as plt

# Bar chart of the ten countries with the most distinct invoices, drawn on an
# explicitly created Axes rather than through the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 4))
country_txn.head(10).plot(kind='bar', ax=ax)
ax.set_title("Top 10 Countries by Number of Transactions")
ax.set_ylabel("Number of Transactions")
ax.set_xlabel("Country")
plt.show()


ModuleNotFoundError: No module named 'matplotlib'