In [None]:
# for all the data quality issues we identified in the data-quality-issues.ipynb file, we will write a script to clean the data
import os
import pandas as pd

# Ensure the cleansed directory exists
os.makedirs("../data/cleansed", exist_ok=True)

# Define file paths
raw_folder = "../data/raw"
cleansed_folder = "../data/cleansed"

# Load datasets
products_df = pd.read_csv(os.path.join(raw_folder, "products.csv"))
transactions_df = pd.read_csv(os.path.join(raw_folder, "transaction.csv"))
users_df = pd.read_csv(os.path.join(raw_folder, "user.csv"))

In [4]:
# --- Cleaning Process ---

# 1. Handle Missing Values
products_df.fillna({"CATEGORY_1": "Unknown", "CATEGORY_2": "Unknown", "CATEGORY_3": "Unknown", 
                    "CATEGORY_4": "Unknown", "MANUFACTURER": "Unknown", "BRAND": "Unknown"}, inplace=True)
transactions_df["BARCODE"].fillna(-1, inplace=True)  # Placeholder for missing barcodes
users_df.fillna({"LANGUAGE": "unknown", "GENDER": "unknown", "STATE": "unknown"}, inplace=True)

# 2. Convert Data Types
transactions_df["BARCODE"] = transactions_df["BARCODE"].astype("Int64")  # Convert barcode to integer
products_df["BARCODE"] = products_df["BARCODE"].astype("Int64") 

# Convert date columns to datetime format
date_columns = ["PURCHASE_DATE", "SCAN_DATE", "CREATED_DATE", "BIRTH_DATE"]
for col in date_columns:
    if col in transactions_df:
        transactions_df[col] = pd.to_datetime(transactions_df[col], errors="coerce")
    if col in users_df:
        users_df[col] = pd.to_datetime(users_df[col], errors="coerce")

# 3. Remove Duplicates
products_df.drop_duplicates(inplace=True)
transactions_df.drop_duplicates(inplace=True)
users_df.drop_duplicates(inplace=True)

# --- Save cleaned datasets ---
products_df.to_csv(os.path.join(cleansed_folder, "products_cleaned.csv"), index=False)
transactions_df.to_csv(os.path.join(cleansed_folder, "transactions_cleaned.csv"), index=False)
users_df.to_csv(os.path.join(cleansed_folder, "users_cleaned.csv"), index=False)

print("✅ Data cleaning complete! Cleaned files saved in 'data/cleansed' folder.")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  transactions_df["BARCODE"].fillna(-1, inplace=True)  # Placeholder for missing barcodes


✅ Data cleaning complete! Cleaned files saved in 'data/cleansed' folder.


In [5]:
# summary checks after cleaning to verify that the updates were successfully applied. 
# Reload the cleaned data
products_cleaned_df = pd.read_csv(os.path.join(cleansed_folder, "products_cleaned.csv"))
transactions_cleaned_df = pd.read_csv(os.path.join(cleansed_folder, "transactions_cleaned.csv"))
users_cleaned_df = pd.read_csv(os.path.join(cleansed_folder, "users_cleaned.csv"))

# Function to generate a summary report
def data_summary(df, name):
    print(f"\nSummary Report for {name}")
    print("=" * 40)
    
    # Missing values
    print("\nMissing Values:\n", df.isnull().sum())
    
    # Data types
    print("\nData Types:\n", df.dtypes)
    
    # Summary statistics
    print("\nSummary Statistics:\n", df.describe(include="all"))
    
    # Duplicates
    print("\nDuplicate Rows:", df.duplicated().sum())

# Run summaries
data_summary(products_cleaned_df, "Products (Cleaned)")
data_summary(transactions_cleaned_df, "Transactions (Cleaned)")
data_summary(users_cleaned_df, "Users (Cleaned)")



Summary Report for Products (Cleaned)

Missing Values:
 CATEGORY_1         0
CATEGORY_2         0
CATEGORY_3         0
CATEGORY_4         0
MANUFACTURER       0
BRAND              0
BARCODE         3968
dtype: int64

Data Types:
 CATEGORY_1       object
CATEGORY_2       object
CATEGORY_3       object
CATEGORY_4       object
MANUFACTURER     object
BRAND            object
BARCODE         float64
dtype: object

Summary Statistics:
                CATEGORY_1 CATEGORY_2 CATEGORY_3 CATEGORY_4 MANUFACTURER  \
count              845337     845337     845337     845337       845337   
unique                 28        122        345        128         4355   
top     Health & Wellness      Candy    Unknown    Unknown      Unknown   
freq               512686     120898      60563     777884       226464   
mean                  NaN        NaN        NaN        NaN          NaN   
std                   NaN        NaN        NaN        NaN          NaN   
min                   NaN        NaN    