In [1]:
# retail_data_cleaning.ipynb

import pandas as pd
import numpy as np
import os

# Load the dataset
file_path = 'retail_data.csv'  # Adjust path as needed
df = pd.read_csv(file_path)

# Display initial shape and sample data
print("Initial shape:", df.shape)
df.head()

# ----------------------------
# Clean Missing Data
# ----------------------------
# Drop columns with too many missing values (threshold can be adjusted)
df.dropna(thresh=len(df) * 0.5, axis=1, inplace=True)

# Fill numerical columns with median
for col in df.select_dtypes(include=[np.number]).columns:
    df[col].fillna(df[col].median(), inplace=True)

# Fill categorical columns with mode
for col in df.select_dtypes(include=["object"]).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Drop rows missing critical fields (if any)
if 'Product_ID' in df.columns and 'Total_Sales' in df.columns:
    df.dropna(subset=['Product_ID', 'Total_Sales'], inplace=True)

# ----------------------------
# Remove Duplicates
# ----------------------------
df.drop_duplicates(inplace=True)

# ----------------------------
# Metadata Changes: Rename or Convert Types
# ----------------------------
# Example renaming
rename_dict = {
    'Store Code': 'Store_ID',
    'Sales Value': 'Total_Sales',
    'Transaction Date': 'Date'
}
df.rename(columns=rename_dict, inplace=True)

# Convert date column to datetime if applicable
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# ----------------------------
# Apply Transformations
# ----------------------------
# Add text length column if there's a description field
if 'Product_Description' in df.columns:
    df['Description_Length'] = df['Product_Description'].apply(lambda x: len(str(x)))

# Create buckets for Total_Sales
if 'Total_Sales' in df.columns:
    df['Sales_Bucket'] = pd.qcut(df['Total_Sales'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# ----------------------------
# Save cleaned dataset for AutoML
# ----------------------------
outfile = 'retail_data_cleaned.csv'
df.to_csv(outfile, index=False)
print(f"Cleaned dataset saved as: {outfile}")


Initial shape: (1000000, 78)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)


Cleaned dataset saved as: retail_data_cleaned.csv


In [3]:
print(df.columns.tolist())

['customer_id', 'age', 'gender', 'income_bracket', 'loyalty_program', 'membership_years', 'churned', 'marital_status', 'number_of_children', 'education_level', 'occupation', 'transaction_id', 'transaction_date', 'product_id', 'product_category', 'quantity', 'unit_price', 'discount_applied', 'payment_method', 'store_location', 'transaction_hour', 'day_of_week', 'week_of_year', 'month_of_year', 'avg_purchase_value', 'purchase_frequency', 'last_purchase_date', 'avg_discount_used', 'preferred_store', 'online_purchases', 'in_store_purchases', 'avg_items_per_transaction', 'avg_transaction_value', 'total_returned_items', 'total_returned_value', 'total_sales', 'total_transactions', 'total_items_purchased', 'total_discounts_received', 'avg_spent_per_category', 'max_single_purchase_value', 'min_single_purchase_value', 'product_name', 'product_brand', 'product_rating', 'product_review_count', 'product_stock', 'product_return_rate', 'product_size', 'product_weight', 'product_color', 'product_mater