Step 1: Import Required Libraries

In [1]:
import pandas as pd  # For data handling
import numpy as np  # For numerical operations
from sklearn.preprocessing import StandardScaler, LabelEncoder  # For scaling & encoding
from sklearn.impute import SimpleImputer  # For missing value handling

Step 2: Load the Dataset

In [2]:
# Define the path to the cleaned dataset
file_path = "../data/cleaned_data.csv"

# Load the dataset
df = pd.read_csv(file_path)

# Display first few rows
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,01-12-2010 8.26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,01-12-2010 8.26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,01-12-2010 8.26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,01-12-2010 8.26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,01-12-2010 8.26,3.39,17850.0,United Kingdom


Step 3: Handle Missing Values

In [3]:
# Check for missing values
print("\nMissing Values Before Handling:\n")
print(df.isnull().sum())


Missing Values Before Handling:

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64


In [4]:
# Use mean imputation for numerical features
num_imputer = SimpleImputer(strategy="mean")

# Use mode imputation for categorical features
cat_imputer = SimpleImputer(strategy="most_frequent")

# Separate numerical and categorical features
num_cols = df.select_dtypes(include=[np.number]).columns
cat_cols = df.select_dtypes(include=[object]).columns

# Apply imputers
df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Verify missing values are handled
print("\nMissing Values After Handling:\n")
print(df.isnull().sum())


Missing Values After Handling:

InvoiceNo      0
StockCode      0
Description    0
Quantity       0
InvoiceDate    0
UnitPrice      0
CustomerID     0
Country        0
dtype: int64


Step 4: Encode Categorical Variables

In [5]:
# Apply label encoding to categorical features
label_encoder = LabelEncoder()
for col in cat_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Display the dataset after encoding
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,0,3536,3918,6.0,708,2.55,17850.0,36
1,0,2794,3926,6.0,708,3.39,17850.0,36
2,0,3044,913,8.0,708,2.75,17850.0,36
3,0,2985,1910,6.0,708,3.39,17850.0,36
4,0,2984,2911,6.0,708,3.39,17850.0,36


Step 5: Feature Scaling

In [6]:
# Standardize numerical features for better clustering
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Display scaled data
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,0,3536,3918,-0.016289,708,-0.021301,1.725758,36
1,0,2794,3926,-0.016289,708,-0.01262,1.725758,36
2,0,3044,913,-0.007118,708,-0.019234,1.725758,36
3,0,2985,1910,-0.016289,708,-0.01262,1.725758,36
4,0,2984,2911,-0.016289,708,-0.01262,1.725758,36


Step 6: Save the Processed Dataset

In [7]:
# Save the processed dataset for clustering
df.to_csv("../data/processed_data.csv", index=False)

print("\nProcessed dataset saved successfully as 'processed_data.csv'")


Processed dataset saved successfully as 'processed_data.csv'
