## Import Libraries

In [1]:

import pandas as pd
import os
import numpy as np


## Load Raw Data from Excel Sheets

In [2]:
data_folder = '../data'
excel_file_name = 'online_retail_dataset.xlsx' # The actual Excel file name
excel_file_path = os.path.join(data_folder, excel_file_name)
print(f"Loading data from: {excel_file_path}")

# Load data from individual sheets
df_2009_2010 = pd.read_excel(excel_file_path, sheet_name='Year 2009-2010')
df_2010_2011 = pd.read_excel(excel_file_path, sheet_name='Year 2010-2011')

# Combine both datasets into a single DataFrame
df_raw = pd.concat([df_2009_2010, df_2010_2011], ignore_index=True)

print(f"Raw data loaded and combined. Initial shape: {df_raw.shape}")
print("First 5 rows of raw combined data:")
display(df_raw.head())

Loading data from: ../data\online_retail_dataset.xlsx
Raw data loaded and combined. Initial shape: (1067371, 8)
First 5 rows of raw combined data:


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom


## Initial Data Inspection

In [3]:
# Initial Data Inspection
print("Raw Data Info:")
df_raw.info()

print("\nMissing values in raw data:")
print(df_raw.isnull().sum())

Raw Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1067371 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   Invoice      1067371 non-null  object        
 1   StockCode    1067371 non-null  object        
 2   Description  1062989 non-null  object        
 3   Quantity     1067371 non-null  int64         
 4   InvoiceDate  1067371 non-null  datetime64[ns]
 5   Price        1067371 non-null  float64       
 6   Customer ID  824364 non-null   float64       
 7   Country      1067371 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 65.1+ MB

Missing values in raw data:
Invoice             0
StockCode           0
Description      4382
Quantity            0
InvoiceDate         0
Price               0
Customer ID    243007
Country             0
dtype: int64


## Handle Missing CustomerID and Convert CustomerID Type

In [5]:
# Handle Missing CustomerID and Convert CustomerID Type
initial_rows = df_raw.shape[0]
df_cleaned = df_raw.copy() # Work on a copy to avoid modifying the original raw DataFrame

# Remove rows where CustomerID is missing (crucial for customer-level analysis)
df_cleaned.dropna(subset=['Customer ID'], inplace=True)
print(f"Removed {initial_rows - df_cleaned.shape[0]} rows with missing Customer ID.")

# Convert 'CustomerID' to string to avoid numerical issues and ensure consistent type
df_cleaned['Customer ID'] = df_cleaned['Customer ID'].astype(str)
print("Converted 'Customer ID' to string type.")

print("\nShape after handling missing Customer ID:", df_cleaned.shape)
print("Missing values after Customer ID handling:")
print(df_cleaned.isnull().sum())

Removed 243007 rows with missing Customer ID.
Converted 'Customer ID' to string type.

Shape after handling missing Customer ID: (824364, 8)
Missing values after Customer ID handling:
Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


## df_cleaned

In [6]:
df_cleaned

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.10,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067366,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
1067367,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
1067368,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France
1067369,581587,22138,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680.0,France


## shape and size of the df_cleaned dataset

In [7]:
df_cleaned.shape, df_cleaned.size

((824364, 8), 6594912)

## Is there is any null values ?

In [8]:
df_cleaned.isnull().sum()

Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64

In [9]:
df_cleaned.isnull().sum().sum()

np.int64(0)

## Is there is any duplicate values ?

In [11]:
df_cleaned[df_cleaned.duplicated]

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
371,489517,21912,VINTAGE SNAKES & LADDERS,1,2009-12-01 11:34:00,3.75,16329.0,United Kingdom
383,489517,22130,PARTY CONE CHRISTMAS DECORATION,6,2009-12-01 11:34:00,0.85,16329.0,United Kingdom
384,489517,22319,HAIRCLIPS FORTIES FABRIC ASSORTED,12,2009-12-01 11:34:00,0.65,16329.0,United Kingdom
385,489517,21913,VINTAGE SEASIDE JIGSAW PUZZLES,1,2009-12-01 11:34:00,3.75,16329.0,United Kingdom
386,489517,21821,GLITTER STAR GARLAND WITH BELLS,1,2009-12-01 11:34:00,3.75,16329.0,United Kingdom
...,...,...,...,...,...,...,...,...
1067136,581538,22068,BLACK PIRATE TREASURE CHEST,1,2011-12-09 11:34:00,0.39,14446.0,United Kingdom
1067150,581538,23318,BOX OF 6 MINI VINTAGE CRACKERS,1,2011-12-09 11:34:00,2.49,14446.0,United Kingdom
1067153,581538,22992,REVOLVER WOODEN RULER,1,2011-12-09 11:34:00,1.95,14446.0,United Kingdom
1067160,581538,22694,WICKER STAR,1,2011-12-09 11:34:00,2.10,14446.0,United Kingdom


In [12]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 824364 entries, 0 to 1067370
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      824364 non-null  object        
 1   StockCode    824364 non-null  object        
 2   Description  824364 non-null  object        
 3   Quantity     824364 non-null  int64         
 4   InvoiceDate  824364 non-null  datetime64[ns]
 5   Price        824364 non-null  float64       
 6   Customer ID  824364 non-null  object        
 7   Country      824364 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(1), object(5)
memory usage: 56.6+ MB


## categorical , numerical and datatime variables in the dataset . 

In [13]:
categorical_features = df_cleaned.select_dtypes(include=['object']).columns.tolist()
numericaL_features = df_cleaned.select_dtypes(include=['int64','float64']).columns.tolist()
date_time_columns = df_cleaned.select_dtypes(include=['datetime64[ns]']).columns.tolist()
print(categorical_features)
print(len(categorical_features))
print(date_time_columns)
print(len(date_time_columns))
print(numericaL_features)
print(len(numericaL_features))
print(df_cleaned.columns)
print(len(df_cleaned.columns))

['Invoice', 'StockCode', 'Description', 'Customer ID', 'Country']
5
['InvoiceDate']
1
['Quantity', 'Price']
2
Index(['Invoice', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'Price', 'Customer ID', 'Country'],
      dtype='object')
8


## Convert InvoiceDate to Datetime and Handle Invalid Dates

In [17]:
# Convert InvoiceDate to Datetime and Handle Invalid Dates
initial_rows = df_cleaned.shape[0]

# Convert 'InvoiceDate' to datetime objects. 'coerce' will turn unparseable dates into NaT (Not a Time)
df_cleaned['InvoiceDate'] = pd.to_datetime(df_cleaned['InvoiceDate'], errors='coerce')
print("Converted 'InvoiceDate' to datetime, coercing errors to NaT.")

# Remove rows where InvoiceDate conversion failed (i.e., became NaT)
df_cleaned.dropna(subset=['InvoiceDate'], inplace=True)
print(f"Removed {initial_rows - df_cleaned.shape[0]} rows with invalid InvoiceDate.")

print("\nShape after handling InvoiceDate:", df_cleaned.shape)
print("Missing values after InvoiceDate handling:")
print(df_cleaned.isnull().sum())

Converted 'InvoiceDate' to datetime, coercing errors to NaT.
Removed 0 rows with invalid InvoiceDate.

Shape after handling InvoiceDate: (824364, 8)
Missing values after InvoiceDate handling:
Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


## Convert Quantity/UnitPrice to Numeric and Handle Missing Values After Coercion

In [20]:
# Convert Quantity/UnitPrice to Numeric and Handle Missing Values After Coercion
initial_rows = df_cleaned.shape[0]

# Convert 'Quantity' and 'UnitPrice' to numeric, coercing errors to NaN
df_cleaned['Quantity'] = pd.to_numeric(df_cleaned['Quantity'], errors='coerce')
df_cleaned['Price'] = pd.to_numeric(df_cleaned['Price'], errors='coerce')
print("Ensured 'Quantity' and 'Price' are numeric, coercing errors to NaN.")

# Remove rows where Quantity or UnitPrice became NaN after coercion
df_cleaned.dropna(subset=['Quantity', 'Price'], inplace=True)
print(f"Removed {initial_rows - df_cleaned.shape[0]} rows where Quantity or UnitPrice became NaN.")

print("\nShape after handling numeric conversions:", df_cleaned.shape)
print("Missing values after numeric handling:")
print(df_cleaned.isnull().sum())

Ensured 'Quantity' and 'Price' are numeric, coercing errors to NaN.
Removed 0 rows where Quantity or UnitPrice became NaN.

Shape after handling numeric conversions: (824364, 8)
Missing values after numeric handling:
Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Country        0
dtype: int64


## Filter Out Negative/Zero Quantity and UnitPrice

In [None]:
# Filter Out Negative/Zero Quantity and UnitPrice
# Filter out rows with non-positive Quantity (likely returns or errors, not purchases)
initia_rows = df_cleaned.shape[0]
df_cleaned = df_cleaned[df_cleaned['Quantity'] >= 1]
print(f"Filtered out {initial_rows - df_cleaned.shape[0]} rows with non-positive 'Quantity'.")

# Filter out rows with non-positive UnitPrice (invalid sales)
initial_rows = df_cleaned.shape[0]
df_cleaned = df_cleaned[df_cleaned['Price'] > 0]
print(f"Filtered out {initial_rows - df_cleaned.shape[0]} rows with non-positive 'Price'.")

print("\nShape after filtering Quantity and Price:", df_cleaned.shape)

Filtered out 18744 rows with non-positive 'Quantity'.
Filtered out 71 rows with non-positive 'Price'.

Shape after filtering Quantity and Price: (805549, 8)


In [22]:
# Remove Duplicate Rows
initial_rows = df_cleaned.shape[0]
df_cleaned.drop_duplicates(inplace=True)
print(f"Removed {initial_rows - df_cleaned.shape[0]} duplicate rows.")

print("\nShape after removing duplicates:", df_cleaned.shape)

Removed 26124 duplicate rows.

Shape after removing duplicates: (779425, 8)


## Calculate TotalPrice

In [None]:
# Calculate TotalPrice
df_cleaned['TotalPrice'] = df_cleaned['Quantity'] * df_cleaned['Price']
print("Calculated 'TotalPrice' column (Quantity * Price).")

print("\nFirst 5 rows with new 'TotalPrice' column:")
display(df_cleaned.head())

Calculated 'TotalPrice' column (Quantity * Price).

First 5 rows with new 'TotalPrice' column:


Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country,TotalPrice
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,2009-12-01 07:45:00,6.95,13085.0,United Kingdom,83.4
1,489434,79323P,PINK CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
2,489434,79323W,WHITE CHERRY LIGHTS,12,2009-12-01 07:45:00,6.75,13085.0,United Kingdom,81.0
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,2009-12-01 07:45:00,2.1,13085.0,United Kingdom,100.8
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,2009-12-01 07:45:00,1.25,13085.0,United Kingdom,30.0


## Final Cleaned Data Inspection

In [24]:
# Final Cleaned Data Inspection
print("\n--- Final Cleaned Data Overview ---")
print(f"Final shape of the cleaned data: {df_cleaned.shape}")
print("Final Data Info:")
df_cleaned.info()
print("\nMissing values in final cleaned data:")
print(df_cleaned.isnull().sum())

# Display a summary of numerical columns to check for outliers/distribution
print("\nDescriptive statistics of numerical columns in cleaned data:")
display(df_cleaned.describe())


--- Final Cleaned Data Overview ---
Final shape of the cleaned data: (779425, 9)
Final Data Info:
<class 'pandas.core.frame.DataFrame'>
Index: 779425 entries, 0 to 1067370
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      779425 non-null  object        
 1   StockCode    779425 non-null  object        
 2   Description  779425 non-null  object        
 3   Quantity     779425 non-null  int64         
 4   InvoiceDate  779425 non-null  datetime64[ns]
 5   Price        779425 non-null  float64       
 6   Customer ID  779425 non-null  object        
 7   Country      779425 non-null  object        
 8   TotalPrice   779425 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 59.5+ MB

Missing values in final cleaned data:
Invoice        0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
Price          0
Customer ID    0
Count

Unnamed: 0,Quantity,InvoiceDate,Price,TotalPrice
count,779425.0,779425,779425.0,779425.0
mean,13.48937,2011-01-03 01:44:42.593475584,3.218488,22.291823
min,1.0,2009-12-01 07:45:00,0.001,0.001
25%,2.0,2010-07-02 14:39:00,1.25,4.95
50%,6.0,2010-12-02 14:09:00,1.95,12.48
75%,12.0,2011-08-01 13:44:00,3.75,19.8
max,80995.0,2011-12-09 12:50:00,10953.5,168469.6
std,145.855814,,29.67614,227.427075


## Save Cleaned Data

In [25]:
# Save Cleaned Data

# Define the path to save the cleaned data
output_data_folder = '../data' # Relative path from notebooks to data folder
output_file_name = 'online_retail_dataset_preprocessing.xlsx'
output_file_path = os.path.join(output_data_folder, output_file_name)

print(f"Attempting to save cleaned data to: {output_file_path}")

# Save the cleaned DataFrame to an Excel file
# index=False prevents pandas from writing the DataFrame index as a column in the Excel file
df_cleaned.to_excel(output_file_path, index=False)

print(f"Cleaned data successfully saved to '{output_file_path}'.")

Attempting to save cleaned data to: ../data\online_retail_dataset_preprocessing.xlsx
Cleaned data successfully saved to '../data\online_retail_dataset_preprocessing.xlsx'.
