1. Import libraries and set up

In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Nice plotting style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Display all columns and reasonable row limits
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [36]:
# 2. Load the raw data
raw_path = "../data_raw/Online_Retail.xlsx"

df = pd.read_excel(raw_path)

print(f"Dataset shape: {df.shape}")
df.head(10)

Dataset shape: (541909, 8)


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [37]:
# 3. Basic info and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [38]:
# 4. Check for missing values
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [39]:
# 5. Look at key columns
# InvoiceNo — check for cancellations
print("Sample InvoiceNo:")
print(df['InvoiceNo'].astype(str).head(20))

# Quantity — negative values = returns
print("\nQuantity distribution:")
print(df['Quantity'].describe())

# UnitPrice — should be positive
print("\nUnitPrice distribution:")
print(df['UnitPrice'].describe())

# Country
print("\nCountries:")
print(df['Country'].value_counts().head(10))

Sample InvoiceNo:
0     536365
1     536365
2     536365
3     536365
4     536365
5     536365
6     536365
7     536366
8     536366
9     536367
10    536367
11    536367
12    536367
13    536367
14    536367
15    536367
16    536367
17    536367
18    536367
19    536367
Name: InvoiceNo, dtype: object

Quantity distribution:
count    541909.000000
mean          9.552250
std         218.081158
min      -80995.000000
25%           1.000000
50%           3.000000
75%          10.000000
max       80995.000000
Name: Quantity, dtype: float64

UnitPrice distribution:
count    541909.000000
mean          4.611114
std          96.759853
min      -11062.060000
25%           1.250000
50%           2.080000
75%           4.130000
max       38970.000000
Name: UnitPrice, dtype: float64

Countries:
Country
United Kingdom    495478
Germany             9495
France              8557
EIRE                8196
Spain               2533
Netherlands         2371
Belgium             2069
Switzerland     

In [40]:
# 6. Identify obvious data issues
# Cancelled invoices (start with 'C')
cancelled = df['InvoiceNo'].astype(str).str.startswith('C')
print(f"Cancelled invoices: {cancelled.sum()}")

# Negative quantity (returns)
returns = df['Quantity'] < 0
print(f"Negative quantity rows: {returns.sum()}")

# Zero or negative unit price
bad_price = df['UnitPrice'] <= 0
print(f"UnitPrice <= 0: {bad_price.sum()}")

# Missing CustomerID
missing_customer = df['CustomerID'].isnull()
print(f"Missing CustomerID: {missing_customer.sum()}")

Cancelled invoices: 9288
Negative quantity rows: 10624
UnitPrice <= 0: 2517
Missing CustomerID: 135080


In [41]:
# === 7. Full Cleaning and Proper Data Types ===

# Start fresh from raw data for reproducibility
df_clean = df.copy()

# 1. Remove rows with missing CustomerID (required for customer-level analysis)
df_clean = df_clean.dropna(subset=['CustomerID'])

# 2. Convert to proper data types
df_clean['CustomerID'] = df_clean['CustomerID'].astype('int64')          # Unique ID → integer
df_clean['InvoiceNo'] = df_clean['InvoiceNo'].astype('string')          # Alphanumeric code
df_clean['StockCode'] = df_clean['StockCode'].astype('string')          # Product code
df_clean['Description'] = df_clean['Description'].astype('string')      # Text description
df_clean['Country'] = df_clean['Country'].astype('category')            # Finite categories → memory efficient

# Quantity is already int64 — keep it (handles negative returns correctly)
# UnitPrice is float64 — keep as-is

# 3. Convert InvoiceDate to datetime (if not already)
df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])

# 4. Create TotalPrice feature
df_clean['TotalPrice'] = df_clean['Quantity'] * df_clean['UnitPrice']

# 5. Filter out invalid rows
df_clean = df_clean[df_clean['UnitPrice'] > 0]                           # Remove free/postage/errors
df_clean = df_clean[df_clean['Quantity'] != 0]                           # Optional: remove zero quantity

# 6. Remove extreme outliers (common practice — justify in README/interview)
# These are very high-value single transactions that can skew RFM
df_clean = df_clean[df_clean['TotalPrice'] < 5000]
df_clean = df_clean[df_clean['Quantity'].abs() < 10000]                 # abs() to catch large returns

# 7. Final check
print(f"Final cleaned shape: {df_clean.shape}")
print("\nData types after optimization:")
print(df_clean.dtypes)
print("\nFirst few rows:")
df_clean.head()

Final cleaned shape: (406780, 9)

Data types after optimization:
InvoiceNo      string[python]
StockCode      string[python]
Description    string[python]
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID              int64
Country              category
TotalPrice            float64
dtype: object

First few rows:


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,TotalPrice
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [42]:
# 8. Save the cleaned dataset
clean_path = "../data_clean/online_retail_clean.csv"
df_clean.to_csv(clean_path, index=False)

print(f"Cleaned data saved to {clean_path}")

Cleaned data saved to ../data_clean/online_retail_clean.csv


In [43]:
# 9. Quick summary stats on cleaned data
print("Summary after cleaning:")
print(df_clean.describe(include='all'))

Summary after cleaning:
       InvoiceNo StockCode                         Description       Quantity  \
count     406780    406780                              406780  406780.000000   
unique     22180      3683                                3895            NaN   
top       576339    85123A  WHITE HANGING HEART T-LIGHT HOLDER            NaN   
freq         542      2077                                2070            NaN   
mean         NaN       NaN                                 NaN      12.009693   
min          NaN       NaN                                 NaN   -9360.000000   
25%          NaN       NaN                                 NaN       2.000000   
50%          NaN       NaN                                 NaN       5.000000   
75%          NaN       NaN                                 NaN      12.000000   
max          NaN       NaN                                 NaN    4800.000000   
std          NaN       NaN                                 NaN      45.636975   

   

In [44]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 406780 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406780 non-null  string        
 1   StockCode    406780 non-null  string        
 2   Description  406780 non-null  string        
 3   Quantity     406780 non-null  int64         
 4   InvoiceDate  406780 non-null  datetime64[ns]
 5   UnitPrice    406780 non-null  float64       
 6   CustomerID   406780 non-null  int64         
 7   Country      406780 non-null  category      
 8   TotalPrice   406780 non-null  float64       
dtypes: category(1), datetime64[ns](1), float64(2), int64(2), string(3)
memory usage: 28.3 MB
