In [1]:
import os

In [2]:
from minio import Minio
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_extract, col, when, length 
from pyspark.sql.types import NumericType, IntegerType, LongType, FloatType, DoubleType, DecimalType
from pyspark.ml.feature import Imputer
import pyspark.sql.functions as F
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [3]:
JAR_PATH_1 = os.path.abspath("./jars/hadoop-aws-3.4.0.jar")
JAR_PATH_2 = os.path.abspath("./jars/aws-sdk-s3-2.29.52.jar")

JARS_LIST = f"{JAR_PATH_1},{JAR_PATH_2}"

In [4]:
spark = (
    SparkSession.builder.appName("Cleaning")
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262",
    )
    .config("spark.driver.host", "127.0.0.1")
    .config("spark.driver.bindAddress", "0.0.0.0")
    .config("spark.jars.repositories", "https://repo1.maven.org/maven2/")
    .config("spark.hadoop.fs.s3a.endpoint", os.getenv("MINIO_ENDPOINT"))
    .config("spark.hadoop.fs.s3a.access.key", os.getenv("MINIO_ACCESS_KEY"))
    .config("spark.hadoop.fs.s3a.secret.key", os.getenv("MINIO_SECRET_KEY"))
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .getOrCreate()
)

KeyboardInterrupt: 

In [5]:
minio_client = Minio(
    os.getenv("MINIO_ENDPOINT"),
    access_key=os.getenv("MINIO_ACCESS_KEY"),
    secret_key=os.getenv("MINIO_SECRET_KEY"),
    secure=False,
)

bucket_name = "pulse-bucket-1"

In [6]:
# Ensure Spark context is active and JARs are loaded
spark.sparkContext.setLogLevel("WARN")

objects = minio_client.list_objects(bucket_name, prefix="mapped_", recursive=True)
dataframes = {}
for obj in objects:
    df = spark.read.option("header", "true").option("inferSchema", "true").csv(
        f"s3a://{bucket_name}/{obj.object_name}"
    )
    object_name = obj.object_name.replace("mapped_", "").replace(".csv", "")
    dataframes[object_name] = df
    print(f"Loaded {object_name} with {df.count()} rows")

25/11/20 22:35:05 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


Loaded addresses with 1552 rows
Loaded categories with 207 rows
Loaded customer_sessions with 5175 rows
Loaded customers with 1035 rows
Loaded inventory with 1552 rows
Loaded marketing_campaigns with 596 rows
Loaded order_items with 5175 rows
Loaded orders with 2587 rows
Loaded payments with 3622 rows
Loaded products with 1035 rows
Loaded reviews with 3105 rows
Loaded shopping_cart with 3105 rows
Loaded suppliers with 1035 rows
Loaded wishlist with 2070 rows


In [7]:
for table in dataframes.keys():
    df = dataframes[table]

    for column in df.columns:
        if column.endswith("_id") and not column.startswith("session_id"):
            df = df.withColumn(
                column,
                when(
                    regexp_extract(col(column), r"(\d+)", 1) == "",
                    None,
                ).otherwise(regexp_extract(col(column), r"(\d+)", 1)),
            )
            df = df.withColumn(column, col(column))

    dataframes[table] = df
for table, dataframe in dataframes.items():
    print(f"Table: {table}")
    dataframe.show(3)

Table: addresses
+----------+----------+-------------------+-----------+--------------+
|address_id|      city|     state_province|postal_code|       country|
+----------+----------+-------------------+-----------+--------------+
|      5555|Las Palmas|          Maryland*|     J8E7L5|       Bermuda|
|      5938|  Gillfort|Nordrhein-Westfalen|      34985|         Congo|
|      5060|       NaN|                NaN|    00000  |United Kingdom|
+----------+----------+-------------------+-----------+--------------+
only showing top 3 rows

Table: categories
+-----------+----------+---------------+
|category_id|  category|   sub_category|
+-----------+----------+---------------+
|        695|  Colthing|      Wholesale|
|        659|Mirrorless|Limited Edition|
|        571|     Suits|      Wholesale|
+-----------+----------+---------------+
only showing top 3 rows

Table: customer_sessions
+----------+-----------+--------------------+--------------------+-----------+---------------+------------

25/11/20 22:35:14 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


cast schema for correct datatypes

In [8]:
from pyspark.sql.types import *
from pyspark.sql.functions import col

def cast_dataframes(dataframes):
    # 1. Addresses
    if "addresses" in dataframes:
        dataframes["addresses"] = dataframes["addresses"].select(
            col("address_id").cast(StringType()),
            col("city").cast(StringType()),
            col("state_province").cast(StringType()),
            col("postal_code").cast(StringType()),
            col("country").cast(StringType())
        )
        print("Cast addresses DataFrame")

    # 2. Customers
    if "customers" in dataframes:
        dataframes["customers"] = dataframes["customers"].select(
            col("customer_id").cast(StringType()),
            col("gender").cast(StringType()),
            col("date_of_birth").cast(DateType()),
            col("account_status").cast(StringType()),
            col("address_id").cast(StringType()),
            col("city").cast(StringType()),
            col("state_province").cast(StringType()),
            col("postal_code").cast(StringType()),
            col("country").cast(StringType()),
            col("account_created_at").cast(TimestampType()),
            col("last_login_date").cast(DateType()),
            col("is_active").cast(BooleanType())
        )
        print("Cast customers DataFrame")

    # 3. Suppliers
    if "suppliers" in dataframes:
        dataframes["suppliers"] = dataframes["suppliers"].select(
            col("supplier_id").cast(StringType()),
            col("supplier_rating").cast(FloatType()),
            col("supplier_status").cast(StringType()),
            col("is_preferred").cast(BooleanType()),
            col("is_verified").cast(BooleanType()),
            col("contract_start_date").cast(DateType()),
            col("contract_end_date").cast(DateType()),
            col("city").cast(StringType()),
            col("state").cast(StringType()),
            col("zip_code").cast(StringType()),
            col("country").cast(StringType())
        )
        print("Cast suppliers DataFrame")

    # 4. Categories
    if "categories" in dataframes:
        dataframes["categories"] = dataframes["categories"].select(
            col("category_id").cast(StringType()),
            col("category").cast(StringType()),
            col("sub_category").cast(StringType())
        )
        print("Cast categories DataFrame")

    # 5. Products
    if "products" in dataframes:
        dataframes["products"] = dataframes["products"].select(
            col("product_id").cast(StringType()),
            col("product_name").cast(StringType()),
            col("sku").cast(StringType()),
            col("category_id").cast(StringType()),
            col("category").cast(StringType()),
            col("sub_category").cast(StringType()),
            col("brand").cast(StringType()),
            col("supplier_id").cast(StringType()),
            col("cost_price").cast(FloatType()),
            col("sell_price").cast(FloatType()),
            col("launch_date").cast(DateType()),
            col("weight").cast(FloatType()),
            col("dimensions").cast(StringType()),
            col("color").cast(StringType()),
            col("size").cast(StringType()),
            col("material").cast(StringType())
        )
        print("Cast products DataFrame")

    # 6. Inventory
    if "inventory" in dataframes:
        dataframes["inventory"] = dataframes["inventory"].select(
            col("inventory_id").cast(StringType()),
            col("product_id").cast(StringType()),
            col("supplier_id").cast(StringType()),
            col("stock_quantity").cast(IntegerType()),
            col("reserved_quantity").cast(IntegerType()),
            col("minimum_stock_level").cast(IntegerType()),
            col("last_restocked_date").cast(DateType()),
            col("storage_cost").cast(FloatType())
        )
        print("Cast inventory DataFrame")

    # 7. Wishlist
    if "wishlist" in dataframes:
        dataframes["wishlist"] = dataframes["wishlist"].select(
            col("wishlist_id").cast(StringType()),
            col("customer_id").cast(StringType()),
            col("product_id").cast(StringType()),
            col("added_date").cast(DateType()),
            col("purchased_date").cast(DateType()),
            col("removed_date").cast(DateType())
        )
        print("Cast wishlist DataFrame")

    # 8. Shopping Cart
    if "shopping_cart" in dataframes:
        dataframes["shopping_cart"] = dataframes["shopping_cart"].select(
            col("cart_id").cast(StringType()),
            col("customer_id").cast(StringType()),
            col("session_id").cast(StringType()),
            col("product_id").cast(StringType()),
            col("quantity").cast(IntegerType()),
            col("unit_price").cast(FloatType()),
            col("added_date").cast(DateType()),
            col("cart_status").cast(StringType())
        )
        print("Cast shopping_cart DataFrame")

    # 9. Orders
    if "orders" in dataframes:
        dataframes["orders"] = dataframes["orders"].select(
            col("order_id").cast(StringType()),
            col("customer_id").cast(StringType()),
            col("order_status").cast(StringType()),
            col("subtotal").cast(FloatType()),
            col("tax_amount").cast(FloatType()),
            col("shipping_cost").cast(FloatType()),
            col("total_discount").cast(FloatType()),
            col("total_amount").cast(FloatType()),
            col("currency").cast(StringType()),
            col("order_placed_at").cast(TimestampType()),
            col("order_shipped_at").cast(DateType()),
            col("order_delivered_at").cast(DateType())
        )
        print("Cast orders DataFrame")

    # 10. Order Items
    if "order_items" in dataframes:
        dataframes["order_items"] = dataframes["order_items"].select(
            col("order_item_id").cast(StringType()),
            col("order_id").cast(StringType()),
            col("product_id").cast(StringType()),
            col("quantity").cast(IntegerType()),
            col("discount_amount").cast(FloatType()),
            col("product_cost").cast(FloatType())
        )
        print("Cast order_items DataFrame")

    # 11. Payments
    if "payments" in dataframes:
        dataframes["payments"] = dataframes["payments"].select(
            col("payment_id").cast(StringType()),
            col("order_id").cast(StringType()),
            col("payment_method").cast(StringType()),
            col("payment_provider").cast(StringType()),
            col("payment_status").cast(StringType()),
            col("transaction_id").cast(StringType()),
            col("processing_fee").cast(FloatType()),
            col("refund_amount").cast(FloatType()),
            col("refund_date").cast(DateType()),
            col("payment_date").cast(DateType())
        )
        print("Cast payments DataFrame")

    # 12. Reviews
    if "reviews" in dataframes:
        dataframes["reviews"] = dataframes["reviews"].select(
            col("review_id").cast(StringType()),
            col("product_id").cast(StringType()),
            col("customer_id").cast(StringType()),
            col("rating").cast(IntegerType()),
            col("review_title").cast(StringType()),
            col("review_desc").cast(StringType()),
            col("review_date").cast(TimestampType())
        )
        print("Cast reviews DataFrame")

    # 13. Marketing Campaigns
    if "marketing_campaigns" in dataframes:
        dataframes["marketing_campaigns"] = dataframes["marketing_campaigns"].select(
            col("campaign_id").cast(StringType()),
            col("campaign_name").cast(StringType()),
            col("campaign_type").cast(StringType()),
            col("start_date").cast(DateType()),
            col("end_date").cast(DateType()),
            col("budget").cast(FloatType()),
            col("spent_amount").cast(FloatType()),
            col("impressions").cast(IntegerType()),
            col("clicks").cast(IntegerType()),
            col("conversions").cast(IntegerType()),
            col("target_audience").cast(StringType()),
            col("campaign_status").cast(StringType())
        )
        print("Cast marketing_campaigns DataFrame")

    # 14. Customer Sessions
    if "customer_sessions" in dataframes:
        dataframes["customer_sessions"] = dataframes["customer_sessions"].select(
            col("session_id").cast(StringType()),
            col("customer_id").cast(StringType()),
            col("session_start").cast(TimestampType()),
            col("session_end").cast(TimestampType()),
            col("device_type").cast(StringType()),
            col("referrer_source").cast(StringType()),
            col("pages_viewed").cast(IntegerType()),
            col("products_viewed").cast(IntegerType()),
            col("conversion_flag").cast(BooleanType()),
            col("cart_abandonment_flag").cast(BooleanType())
        )
        print("Cast customer_sessions DataFrame")

    print("\n‚úÖ All DataFrames cast successfully!")
    return dataframes

dataframes = cast_dataframes(dataframes)

Cast addresses DataFrame
Cast customers DataFrame
Cast suppliers DataFrame
Cast categories DataFrame
Cast products DataFrame
Cast inventory DataFrame
Cast wishlist DataFrame
Cast shopping_cart DataFrame
Cast orders DataFrame
Cast order_items DataFrame
Cast payments DataFrame
Cast reviews DataFrame
Cast marketing_campaigns DataFrame
Cast customer_sessions DataFrame

‚úÖ All DataFrames cast successfully!


In [9]:
dataframes["customers"].columns

['customer_id',
 'gender',
 'date_of_birth',
 'account_status',
 'address_id',
 'city',
 'state_province',
 'postal_code',
 'country',
 'account_created_at',
 'last_login_date',
 'is_active']

Merging  addresses table with Customers
and categories table with products if exists

In [10]:
def merge():
      if not "addresses" in dataframes:
            print("Addresses DataFrame is missing.")

      if "addresses" in dataframes and "customers" in dataframes:
            dataframes["addresses"].createOrReplaceTempView("addresses")
            dataframes["customers"].createOrReplaceTempView("customers")

            customers = spark.sql("""
            SELECT c.customer_id, c.gender, c.date_of_birth, c.account_status,
                  a.city, a.state_province, a.postal_code, a.country,
                  c.account_created_at,c.last_login_date,c.is_active
            FROM customers c
            LEFT JOIN addresses a
                  ON c.address_id = a.address_id
            """)
            dataframes["customers"] = customers
            print("Merged addresses into customers.")
            dataframes.pop("addresses", None)

      if not "categories" in dataframes:
            print("Categories DataFrame is missing.")
            return
      
      if "categories" in dataframes and "products" in dataframes:
            dataframes["categories"].createOrReplaceTempView("categories")
            dataframes["products"].createOrReplaceTempView("products")

            products = spark.sql("""
            SELECT p.product_id, p.product_name, p.sku, cat.category, cat.sub_category,
                  p.brand, p.supplier_id, p.cost_price, p.sell_price, p.launch_date,
                  p.weight, p.dimensions, p.color, p.size, p.material
            FROM products p
            LEFT JOIN categories cat
                  ON p.category_id = cat.category_id
            """)
            dataframes["products"] = products
            print("Merged categories into products.")
            dataframes.pop("categories", None)
merge()

Merged addresses into customers.
Merged categories into products.


# Cleaning null values and Duplicate values 

In [11]:
def check_dups():
    for name, df in dataframes.items():
        dup_rows = df.groupBy(*df.columns).count().filter("count > 1")
        row_count = dup_rows.count()
        print(f"The number of duplicate rows in {name} is: {row_count}")
check_dups()

The number of duplicate rows in customer_sessions is: 65
The number of duplicate rows in customers is: 28
The number of duplicate rows in inventory is: 33
The number of duplicate rows in marketing_campaigns is: 11
The number of duplicate rows in order_items is: 104
The number of duplicate rows in orders is: 37
The number of duplicate rows in payments is: 48
The number of duplicate rows in products is: 18
The number of duplicate rows in reviews is: 40
The number of duplicate rows in shopping_cart is: 47
The number of duplicate rows in suppliers is: 13
The number of duplicate rows in wishlist is: 42


In [12]:
def drop_dups():
    for table in dataframes.keys():
        dataframes[table] = dataframes[table].dropDuplicates()
drop_dups()


In [13]:
check_dups()

The number of duplicate rows in customer_sessions is: 0
The number of duplicate rows in customers is: 0
The number of duplicate rows in inventory is: 0
The number of duplicate rows in marketing_campaigns is: 0
The number of duplicate rows in order_items is: 0
The number of duplicate rows in orders is: 0
The number of duplicate rows in payments is: 0
The number of duplicate rows in products is: 0
The number of duplicate rows in reviews is: 0
The number of duplicate rows in shopping_cart is: 0
The number of duplicate rows in suppliers is: 0
The number of duplicate rows in wishlist is: 0


In [14]:
def drop_null_rows(table, col_name):
    if table in dataframes:
        df = dataframes[table]
        if col_name in df.columns:
            before = df.count()
            cleaned = df.filter(F.col(col_name).isNotNull())
            dataframes[table] = cleaned
            after = cleaned.count()
            print(f"Removed {before - after} rows from '{table}' where '{col_name}' is NULL")
        else:
            print(f"Column '{col_name}' not found in '{table}'")    
    else:
        print(f"Table '{table}' not found in dataframes")

Dropping all rows from all tables where primary key and foreign keys are null 

In [15]:
all_ids = ["session_id","customer_id", "address_id", "product_id", "supplier_id", "order_id", "order_item_id", "payment_id", "campaign_id","cart_id", "review_id", "wishlist_id"]
for table in dataframes.keys():
    for col in dataframes[table].columns:
        if col in all_ids:
            drop_null_rows(table, col)

Removed 0 rows from 'customer_sessions' where 'session_id' is NULL
Removed 1898 rows from 'customer_sessions' where 'customer_id' is NULL
Removed 10 rows from 'customers' where 'customer_id' is NULL
Removed 42 rows from 'inventory' where 'product_id' is NULL
Removed 70 rows from 'inventory' where 'supplier_id' is NULL
Removed 19 rows from 'marketing_campaigns' where 'campaign_id' is NULL
Removed 42 rows from 'order_items' where 'order_item_id' is NULL
Removed 127 rows from 'order_items' where 'order_id' is NULL
Removed 104 rows from 'order_items' where 'product_id' is NULL
Removed 41 rows from 'orders' where 'order_id' is NULL
Removed 35 rows from 'orders' where 'customer_id' is NULL
Removed 33 rows from 'payments' where 'payment_id' is NULL
Removed 81 rows from 'payments' where 'order_id' is NULL
Removed 15 rows from 'products' where 'product_id' is NULL
Removed 41 rows from 'products' where 'supplier_id' is NULL
Removed 29 rows from 'reviews' where 'review_id' is NULL
Removed 71 rows

In [16]:
def check_nulls():
    for df in dataframes.values():
        null_counts = df.select([F.sum(F.col(c).isNull().cast("int")).alias(c) for c in df.columns])
        null_counts.show()
check_nulls()

+----------+-----------+-------------+-----------+-----------+---------------+------------+---------------+---------------+---------------------+
|session_id|customer_id|session_start|session_end|device_type|referrer_source|pages_viewed|products_viewed|conversion_flag|cart_abandonment_flag|
+----------+-----------+-------------+-----------+-----------+---------------+------------+---------------+---------------+---------------------+
|         0|          0|          280|        135|          0|              0|         188|            103|            134|                   27|
+----------+-----------+-------------+-----------+-----------+---------------+------------+---------------+---------------+---------------------+

+-----------+------+-------------+--------------+----+--------------+-----------+-------+------------------+---------------+---------+
|customer_id|gender|date_of_birth|account_status|city|state_province|postal_code|country|account_created_at|last_login_date|is_active|

Filling null values fo non numeric columns 

In [17]:
def fill_null():
    if "customers" in dataframes.keys():
        dataframes["customers"] = dataframes["customers"].fillna({
            "gender": "Unknown",
            "account_status": "Unknown",
            "city": "Unknown",
            "state_province": "Unknown",
            "postal_code": "00000",
            "country": "Unknown",
            "date_of_birth": "1900-01-01",
            "account_created_at": "1900-01-01",
            "last_login_date": "1900-01-01",
            "is_active": "false"
        })
    else:
        print("Customers DataFrame is missing.")

    if "suppliers" in dataframes.keys():
        dataframes["suppliers"] = dataframes["suppliers"].fillna({
            "supplier_rating": 0.0,
            "supplier_status": "Unknown",
            "is_preferred": "false",
            "is_verified": "false",
            "contract_start_date": "1900-01-01",
            "contract_end_date": "1900-01-01",
            "city": "Unknown",
            "state": "Unknown",
            "zip_code": "00000",
            "country": "Unknown",
        })
    else:
        print("Suppliers DataFrame is missing.")

    if "products" in dataframes.keys():
        dataframes["products"] = dataframes["products"].fillna({
            "product_name": "Unknown",
            "sku": "Unknown",
            "category": "Unknown",
            "sub_category": "Unknown",
            "brand": "Unknown",
            "launch_date": "1900-01-01",
            "weight": "0.0",
            "dimensions": "Unknown",
            "color": "Unknown",
            "size": "Unknown",
            "material": "Unknown"
        })
    else:
        print("Products DataFrame is missing.")

    if "wishlist" in dataframes.keys():
        dataframes["wishlist"] = dataframes["wishlist"].fillna({
            "added_date": "1900-01-01",
            "purchased_date": "1900-01-01",
            "removed_date": "1900-01-01"
        })
    else:
        print("Wishlist DataFrame is missing.")

    if "shopping_cart" in dataframes.keys():
        dataframes["shopping_cart"] = dataframes["shopping_cart"].fillna({
            "added_date": "1900-01-01",
            "cart_status": "Unknown"
        })
    else:
        print("Shopping Cart DataFrame is missing.")    
        
    if "inventory" in dataframes.keys():
        dataframes["inventory"] = dataframes["inventory"].fillna({
            "last_restocked_date": "1900-01-01"
        })
    else:
        print("Inventory DataFrame is missing.")

    if "customer_sessions" in dataframes.keys():
        dataframes["customer_sessions"] = dataframes["customer_sessions"].fillna({
            "session_start": "1900-01-01",
            "session_end": "1900-01-01",
            "device_type": "Unknown",
            "referrer_source": "Unknown",
            "pages_viewed": 0,
            "products_viewed": 0,
            "conversion_flag": "false",
            "cart_abandonment_flag": "false"
        })
    else:
        print("Customer Sessions DataFrame is missing.")

    if "reviews" in dataframes.keys():
        dataframes["reviews"] = dataframes["reviews"].fillna({
            "review_date": "1900-01-01",
            "review_title": "Unknown",
            "review_desc": "Unknown"
        })
    else:
        print("Reviews DataFrame is missing.")

    if "orders" in dataframes.keys():
        dataframes["orders"] = dataframes["orders"].fillna({
            "order_status": "Unknown",
            "order_placed_at": "1900-01-01",
            "order_shipped_at": "1900-01-01",
            "order_delivered_at": "1900-01-01",
            "currency": "Unknown",
        })
    else:
        print("Orders DataFrame is missing.")

    if "payments" in dataframes.keys():
        dataframes["payments"] = dataframes["payments"].fillna({
            "payment_method": "Unknown",
            "payment_status": "Unknown",
            "payment_date": "1900-01-01",
            "transaction_id": "Unknown ",
            "payment_provider": "Unknown",
            "refund_date": "1900-01-01"
        })
    else:
        print("Payments DataFrame is missing.")

    if "marketing_campaigns" in dataframes.keys():
        dataframes["marketing_campaigns"] = dataframes["marketing_campaigns"].fillna({
            "start_date": "1900-01-01",
            "end_date": "1900-01-01",
            "campaign_type": "Unknown",
            "campaign_status": "Unknown",
            "campaign_name": "Unknown",
            "target_audience": "Unknown",
            "impressions": 0,
            "clicks": 0,
            "conversions": 0
        })
    else:
        print("Marketing Campaigns DataFrame is missing.")

fill_null()

Imputing missing numeric values

In [18]:


def impute_missing_values(table, numeric_cols):

    total_rows = dataframes[table].count()
    print(f"Total rows: {total_rows}")

    non_null_counts = dataframes[table].select([F.count(F.col(c)).alias(c) for c in numeric_cols]).collect()[0]

    all_null_cols = []
    valid_numeric_cols = []

    for col_name in numeric_cols:
        non_null_count = non_null_counts[col_name]
        if non_null_count == 0:
            all_null_cols.append(col_name)
            print(f"üö´ {col_name}: ALL NULL - will fill with 0")
        else:
            valid_numeric_cols.append(col_name)
            null_count = total_rows - non_null_count
            print(f"‚úÖ {col_name}: {non_null_count} non-null, {null_count} null - will impute")

    print(f"\nAll-NULL columns: {all_null_cols}")
    print(f"Valid columns for imputation: {valid_numeric_cols}")


    if all_null_cols:
        fill_dict = {col: 0 for col in all_null_cols}
        dataframes[table] = dataframes[table].fillna(fill_dict)
        print(f"‚úÖ Filled all-NULL columns with 0: {all_null_cols}")


    if valid_numeric_cols:
        imputer = Imputer(inputCols=valid_numeric_cols, outputCols=valid_numeric_cols).setStrategy("median")
        dataframes[table] = imputer.fit(dataframes[table]).transform(dataframes[table])
        print(f"‚úÖ Successfully imputed columns with median: {valid_numeric_cols}")
    else:
        print("‚ö†Ô∏è No valid columns found for imputation")

    print("\n" + "="*50)
    print("üîç Final check for NULL values in inventory:")
    print("="*50)


In [19]:
def impute_all():
    all_ids = ["session_id","customer_id", "address_id", "product_id", "supplier_id", "order_id", "order_item_id", "payment_id", "campaign_id","cart_id", "review_id", "wishlist_id"]
    for table in dataframes.keys():
        numeric_cols = [field.name for field in dataframes[table].schema.fields 
                        if isinstance(field.dataType, (IntegerType, LongType, FloatType, DoubleType, DecimalType))] 
        numeric_cols = [col for col in numeric_cols if col not in all_ids]
        if numeric_cols:
            print(f"\nImputing missing values for table: {table}")
            impute_missing_values(table, numeric_cols)
        else:
            print(f"\nNo numeric columns found in table: {table}, skipping imputation.")

impute_all()


Imputing missing values for table: customer_sessions
Total rows: 3158
‚úÖ pages_viewed: 3158 non-null, 0 null - will impute
‚úÖ products_viewed: 3158 non-null, 0 null - will impute

All-NULL columns: []
Valid columns for imputation: ['pages_viewed', 'products_viewed']
‚úÖ Successfully imputed columns with median: ['pages_viewed', 'products_viewed']

üîç Final check for NULL values in inventory:

No numeric columns found in table: customers, skipping imputation.

Imputing missing values for table: inventory
Total rows: 1385
‚úÖ stock_quantity: 1358 non-null, 27 null - will impute
‚úÖ reserved_quantity: 1357 non-null, 28 null - will impute
‚úÖ minimum_stock_level: 1358 non-null, 27 null - will impute
‚úÖ storage_cost: 1367 non-null, 18 null - will impute

All-NULL columns: []
Valid columns for imputation: ['stock_quantity', 'reserved_quantity', 'minimum_stock_level', 'storage_cost']
‚úÖ Successfully imputed columns with median: ['stock_quantity', 'reserved_quantity', 'minimum_stock_lev

In [20]:
check_nulls()

+----------+-----------+-------------+-----------+-----------+---------------+------------+---------------+---------------+---------------------+
|session_id|customer_id|session_start|session_end|device_type|referrer_source|pages_viewed|products_viewed|conversion_flag|cart_abandonment_flag|
+----------+-----------+-------------+-----------+-----------+---------------+------------+---------------+---------------+---------------------+
|         0|          0|            0|          0|          0|              0|           0|              0|              0|                    0|
+----------+-----------+-------------+-----------+-----------+---------------+------------+---------------+---------------+---------------------+

+-----------+------+-------------+--------------+----+--------------+-----------+-------+------------------+---------------+---------+
|customer_id|gender|date_of_birth|account_status|city|state_province|postal_code|country|account_created_at|last_login_date|is_active|

Standardizing Numerical columns by removing outliers 

In [21]:
def remove_outliers(table_name, columns):
    if table_name not in dataframes:
        print(f"Table {table_name} not found")
        return
    
    df = dataframes[table_name]
    result_df = df
    
    for column in columns:
        if column not in df.columns:
            print(f"Column {column} not found in {table_name}")
            continue
            
        print(f"\nProcessing outliers for {column} in {table_name}...")
        quantiles = result_df.approxQuantile(column, [0.01, 0.99], 0.0)
        low_cutoff, high_cutoff = quantiles[0], quantiles[1]
        
        print(f"  {column} - Low cutoff: {low_cutoff}, High cutoff: {high_cutoff}")
        if low_cutoff < 0:
            low_cutoff = 0
            print(f"  Adjusted Low cutoff for {column} to 0 since it was negative.")
            
        before_count = result_df.count()
        result_df = result_df.filter(
            (F.col(column) >= low_cutoff) & (F.col(column) <= high_cutoff)
        )
        after_count = result_df.count()
        
        removed = before_count - after_count
        print(f"  Removed {removed} outlier rows based on {column}")
        
        
       
    dataframes[table_name] = result_df
    print(f"\n‚úÖ Completed outlier removal for {table_name}")
   


In [22]:
def remove_all_outliers():
    all_ids = ["session_id","customer_id", "address_id", "product_id", "supplier_id", "order_id", "order_item_id", "payment_id", "campaign_id","cart_id", "review_id", "wishlist_id"]
    for table in dataframes.keys():
        numeric_cols = [field.name for field in dataframes[table].schema.fields
                        if isinstance(field.dataType, (IntegerType, LongType, FloatType, DoubleType, DecimalType))
                       ]
        numeric_cols = [column_name for column_name in numeric_cols if column_name not in all_ids]
        if numeric_cols:
            print(f"\nRemoving outliers for table: {table}")  
            remove_outliers(table, numeric_cols)
        else:
            print(f"\nNo numeric columns found in table: {table}, skipping outlier removal.")

remove_all_outliers()


Removing outliers for table: customer_sessions

Processing outliers for pages_viewed in customer_sessions...
  pages_viewed - Low cutoff: -4.0, High cutoff: 30.0
  Adjusted Low cutoff for pages_viewed to 0 since it was negative.
  Removed 71 outlier rows based on pages_viewed

Processing outliers for products_viewed in customer_sessions...
  products_viewed - Low cutoff: 0.0, High cutoff: 20.0
  Removed 28 outlier rows based on products_viewed

‚úÖ Completed outlier removal for customer_sessions

No numeric columns found in table: customers, skipping outlier removal.

Removing outliers for table: inventory

Processing outliers for stock_quantity in inventory...
  stock_quantity - Low cutoff: 0.0, High cutoff: 952.0
  Removed 13 outlier rows based on stock_quantity

Processing outliers for reserved_quantity in inventory...
  reserved_quantity - Low cutoff: 0.0, High cutoff: 462.0
  Removed 13 outlier rows based on reserved_quantity

Processing outliers for minimum_stock_level in invent

In [23]:
def validate_dates_and_timestamps():
    
    from pyspark.sql.functions import current_date, current_timestamp, when, col
    from pyspark.sql.types import DateType, TimestampType
    
    print("üïí Validating dates and timestamps...")
    
    for table_name, df in dataframes.items():
        print(f"\nüìÖ Processing {table_name}...")
        
        
        date_timestamp_cols = []
        for field in df.schema.fields:
            if isinstance(field.dataType, (DateType, TimestampType)):
                date_timestamp_cols.append((field.name, field.dataType))
        
        if not date_timestamp_cols:
            print(f"  ‚úÖ No date/timestamp columns found in {table_name}")
            continue
            
        result_df = df
        
        for col_name, col_type in date_timestamp_cols:
            print(f"  üîç Checking {col_name} ({col_type})...")
            
            if isinstance(col_type, DateType):
                
                future_count = result_df.filter(col(col_name) > current_date()).count()
                
                if future_count > 0:
                    print(f"    ‚ö†Ô∏è Found {future_count} future dates in {col_name}")
                    result_df = result_df.withColumn(
                        col_name,
                        when(col(col_name) > current_date(), current_date())
                        .otherwise(col(col_name))
                    )
                    print(f"    ‚úÖ Updated {future_count} future dates to current date")
                else:
                    print(f"    ‚úÖ No future dates found in {col_name}")
                    
            elif isinstance(col_type, TimestampType):
                
                future_count = result_df.filter(col(col_name) > current_timestamp()).count()
                
                if future_count > 0:
                    print(f"    ‚ö†Ô∏è Found {future_count} future timestamps in {col_name}")
                    result_df = result_df.withColumn(
                        col_name,
                        when(col(col_name) > current_timestamp(), current_timestamp())
                        .otherwise(col(col_name))
                    )
                    print(f"    ‚úÖ Updated {future_count} future timestamps to current timestamp")
                else:
                    print(f"    ‚úÖ No future timestamps found in {col_name}")
        
        dataframes[table_name] = result_df
    
    print("\nüéâ Date and timestamp validation completed!")


validate_dates_and_timestamps()

üïí Validating dates and timestamps...

üìÖ Processing customer_sessions...
  üîç Checking session_start (TimestampType())...
    ‚ö†Ô∏è Found 27 future timestamps in session_start
    ‚úÖ Updated 27 future timestamps to current timestamp
  üîç Checking session_end (TimestampType())...
    ‚ö†Ô∏è Found 25 future timestamps in session_end
    ‚úÖ Updated 25 future timestamps to current timestamp

üìÖ Processing customers...
  üîç Checking date_of_birth (DateType())...
    ‚úÖ No future dates found in date_of_birth
  üîç Checking account_created_at (TimestampType())...
    ‚úÖ No future timestamps found in account_created_at
  üîç Checking last_login_date (DateType())...
    ‚ö†Ô∏è Found 8 future dates in last_login_date
    ‚úÖ Updated 8 future dates to current date

üìÖ Processing inventory...
  üîç Checking last_restocked_date (DateType())...
    ‚úÖ No future dates found in last_restocked_date

üìÖ Processing marketing_campaigns...
  üîç Checking start_date (DateType())..

In [24]:
dataframes["suppliers"].show(5)

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+---------------+-----------------+--------+-----------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|           city|            state|zip_code|    country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+---------------+-----------------+--------+-----------+
|      10669|           3.38|      suspended|        true|       true|         2025-06-02|       2025-11-20|           ÁáïÂ∏Ç|        Guip√∫zcoa|406-2179|Bi√©lorussie|
|      10757|           4.04|       active  |       false|       true|         2022-12-20|       2025-11-20|Barre-sur-Alves|Baden-W√ºrttemberg|446-7296|     Belize|
|      10733|           0.13|         active|        true|       true|         2025-03-05|       2025-11-20|         ÂêàËÇ•Â∏Ç|        La Coru√±a|   55417|      Macao|
|      

## Handling Gibberish Values

In [25]:
from pyspark.sql.functions import col, length, trim, when


def detect_gibberish_patterns():
    """
    Cleans specific columns with known gibberish or invalid data patterns 
    (like postal codes, dimensions, or status columns) across all tables.
    Invalid values are replaced with lit(NULL) (represented by F.lit(None)).
    """

    print("\n" + "=" * 60)
    print("üîç DETECTING AND CLEANING GIBBERISH PATTERNS (Specific Columns)")
    print("=" * 60)

    # 1. Clean postal/zip code columns - Relaxed for international formats (Issue 2)
    # Replaced 'Unknown' with lit(NULL) (Issue 1)
    postal_columns = {"customers": "postal_code", "suppliers": "zip_code"}

    for table, col_name in postal_columns.items():
        if table in dataframes:
            df = dataframes[table]
            if col_name in df.columns:
                df = df.withColumn(
                    col_name,
                    when(
                        # Check for characters outside of letters, numbers, space, and hyphen
                        (col(col_name).rlike(r"[^a-zA-Z0-9 -]")) 
                        # Check for excessive length (permissive for international codes)
                        | (length(trim(col(col_name))) > 15) 
                        # Check for repeating patterns (e.g., AAAA, 1111)
                        | (col(col_name).rlike(r"(.)\1{3,}")), 
                        F.lit(None),  # Replace with lit(NULL)
                    ).otherwise(trim(col(col_name))),
                )
                dataframes[table] = df
                print(f"‚úÖ Cleaned {col_name} in {table} (using flexible international check)")

    # 2. Clean dimensions column in products - Updated for 2D/3D and 'x'/'*' separators (Issue 3)
    # Replaced 'Unknown' with lit(NULL) (Issue 1)
    if "products" in dataframes:
        df = dataframes["products"]
        if "dimensions" in df.columns:
            # Regex accepts:
            # - Numeric/decimal values ([\d\.])
            # - Separators 'x', 'X', or '*' ([xX*])
            # - Optional second segment for 3D/3-part dimensions ((?:[xX*][\d\.]+)?\s*$)
            dimension_pattern = r"^\s*[\d\.]+[xX*][\d\.]+(?:[xX*][\d\.]+)?\s*$"
            
            df = df.withColumn(
                "dimensions",
                when(
                    # Only keep values that match the expected dimension format
                    col("dimensions").rlike(dimension_pattern),
                    trim(col("dimensions"))
                ).otherwise(F.lit(None)), # Replace with lit(NULL)
            )
            dataframes["products"] = df
            print("‚úÖ Cleaned dimensions in products (Updated to accept 2D/3D and '*/x')")

    # 3. Clean state/province columns - Generic check for gibberish/non-alpha characters
    for table, col_name in {"customers": "state_province", "suppliers": "state"}.items():
        if table in dataframes:
            df = dataframes[table]
            if col_name in df.columns:
                df = df.withColumn(
                    col_name,
                    when(
                        col(col_name).rlike(r".*[*@#$%^&].*"), 
                        F.lit(None) # Replace with lit(NULL)
                    ).otherwise(col(col_name)),
                )
                dataframes[table] = df
                print(f"‚úÖ Cleaned {col_name} in {table} (special chars check)")

    # 4. Clean city columns - Generic check for gibberish/numbers in city names
    for table in dataframes.keys():
        df = dataframes[table]
        if "city" in df.columns:
            df = df.withColumn(
                "city",
                when(
                    col("city").rlike(r".*[*@#$%^&0-9].*"), # Special chars or numbers
                    F.lit(None), # Replace with lit(NULL)
                ).otherwise(col("city")),
            )
            dataframes[table] = df
            print(f"‚úÖ Cleaned city in {table} (special chars/numbers check)")

    # 5. Clean country columns - Generic check for gibberish/numbers in country names
    for table in dataframes.keys():
        df = dataframes[table]
        if "country" in df.columns:
            df = df.withColumn(
                "country",
                when(
                    col("country").rlike(r".*[*@#$%^&0-9].*"), # Special chars or numbers
                    F.lit(None), # Replace with lit(NULL)
                ).otherwise(col("country")),
            )
            dataframes[table] = df
            print(f"‚úÖ Cleaned country in {table} (special chars/numbers check)")

    # 6. Validate status-type columns (e.g., order_status, account_status)
    status_columns = {
        "customers": ["account_status"],
        "orders": ["order_status", "delivery_status"],
        "payments": ["payment_status"],
        "suppliers": ["supplier_status"],
        "shopping_cart": ["cart_status"],
        "marketing_campaigns": ["campaign_status"],
    }
    
    valid_statuses = [
        "Active", "Inactive", "Pending", 
        "Shipped", "Delivered", "Cancelled", 
        "Completed", "Failed", "Success", 
        "Open", "Closed"
    ]

    for table, cols in status_columns.items():
        if table in dataframes:
            df = dataframes[table]
            for col_name in cols:
                if col_name in df.columns:
                    df = df.withColumn(
                        col_name,
                        when(
                            (col(col_name).isNull()) | (col(col_name).isin(valid_statuses)),
                            col(col_name)
                        ).otherwise(F.lit(None)) # Replace invalid status with lit(NULL)
                    )
                    dataframes[table] = df
                    print(f"‚úÖ Cleaned {col_name} in {table} (status check)")

    # 7. Validate gender column
    if "customers" in dataframes and "gender" in dataframes["customers"].columns:
        df = dataframes["customers"]
        valid_genders = ["Male", "Female", "Other", "Prefer Not to Say", "X"]
        df = df.withColumn(
            "gender",
            when(
                (col("gender").isNull()) | (col("gender").isin(valid_genders)),
                col("gender")
            ).otherwise(F.lit(None)) # Replace invalid gender with lit(NULL)
        )
        dataframes["customers"] = df
        print("‚úÖ Cleaned gender in customers (gender check)")

    print("=" * 60)
    print("‚úÖ PATTERN DETECTION COMPLETED")
    print("=" * 60)
detect_gibberish_patterns()


üîç DETECTING AND CLEANING GIBBERISH PATTERNS (Specific Columns)
‚úÖ Cleaned postal_code in customers (using flexible international check)
‚úÖ Cleaned zip_code in suppliers (using flexible international check)
‚úÖ Cleaned dimensions in products (Updated to accept 2D/3D and '*/x')
‚úÖ Cleaned state_province in customers (special chars check)
‚úÖ Cleaned state in suppliers (special chars check)
‚úÖ Cleaned city in customers (special chars/numbers check)
‚úÖ Cleaned city in suppliers (special chars/numbers check)
‚úÖ Cleaned country in customers (special chars/numbers check)
‚úÖ Cleaned country in suppliers (special chars/numbers check)
‚úÖ Cleaned account_status in customers (status check)
‚úÖ Cleaned order_status in orders (status check)
‚úÖ Cleaned payment_status in payments (status check)
‚úÖ Cleaned supplier_status in suppliers (status check)
‚úÖ Cleaned cart_status in shopping_cart (status check)
‚úÖ Cleaned campaign_status in marketing_campaigns (status check)
‚úÖ Cleaned gender 

In [26]:
dataframes["suppliers"].show()

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|             city|            state|zip_code|             country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|      10669|           3.38|           NULL|        true|       true|         2025-06-02|       2025-11-20|             ÁáïÂ∏Ç|        Guip√∫zcoa|406-2179|         Bi√©lorussie|
|      10757|           4.04|           NULL|       false|       true|         2022-12-20|       2025-11-20|  Barre-sur-Alves|Baden-W√ºrttemberg|446-7296|              Belize|
|      10733|           0.13|           NULL|        true|       true|         2025-03-05|       2025-11-20|           Â

In [27]:
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType, StringType
import re


def clean_text_columns(dataframes):
    """
    Clean gibberish from text columns across all tables using linguistic analysis.

    Args:
        dataframes (dict): Dictionary of table names to DataFrames

    Returns:
        dict: Updated dictionary with cleaned text
    """

    def is_gibberish_text(text):
        """Detect gibberish strings based on character ratios and patterns."""
        if not text or len(str(text)) < 3:
            return False

        text = str(text).lower()

        # Skip if text contains only alphanumeric (likely an ID/code)
        if text.replace("_", "").replace("-", "").isalnum() and any(
            c.isdigit() for c in text
        ):
            return False

        vowels = len(re.findall(r"[aeiou]", text))
        vowel_ratio = vowels / len(text)

        # English text typically has 30-40% vowels
        if vowel_ratio < 0.15 or vowel_ratio > 0.7:
            return True

        # Check for excessive consonant clusters (5+ in a row)
        if re.search(r"[bcdfghjklmnpqrstvwxyz]{5,}", text):
            return True

        # Check for repeating patterns (same char 4+ times)
        if re.search(r"(.)\1{3,}", text):
            return True

        return False

    is_gibberish_udf = udf(is_gibberish_text, BooleanType())

    print("\n" + "=" * 60)
    print("üìù CLEANING TEXT COLUMNS FOR GIBBERISH")
    print("=" * 60)

    # Patterns to identify columns that should NOT be cleaned
    skip_patterns = [
        "id",
        "key",
        "sku",
        "code",
        "zip",
        "postal",
        "dimension",
        "transaction",
    ]

    for table_name, df in dataframes.items():
        print(f"\nüîç Checking {table_name}...")

        # Get string columns, excluding IDs and codes
        string_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StringType)
            and not any(pattern in field.name.lower() for pattern in skip_patterns)
        ]

        for col_name in string_cols:
            gibberish_count = df.filter(is_gibberish_udf(col(col_name))).count()

            if gibberish_count > 0:
                df = df.withColumn(
                    col_name,
                    when(is_gibberish_udf(col(col_name)), F.lit(None)).otherwise(
                        col(col_name)
                    ),
                )
                print(f"  ‚úÖ Fixed {gibberish_count} gibberish values in {col_name}")

        dataframes[table_name] = df

    print("\n" + "=" * 60)
    print("‚úÖ TEXT CLEANING COMPLETED")
    print("=" * 60)

    return dataframes
    """
    Clean gibberish from text columns across all tables using linguistic analysis.

    Uses a UDF to detect gibberish based on:
    - Vowel-to-consonant ratios
    - Excessive consonant clusters
    - Character repetition patterns

    Args:
        dataframes (dict): Dictionary of table names to DataFrames

    Returns:
        dict: Updated dictionary with cleaned text
    """

    def is_gibberish_text(text):
        """
        UDF to detect gibberish strings based on character ratios and patterns.
        """
        if not text or text in ["Unknown", "NULL", None]:
            return False

        text = str(text).lower()

        # Skip very short text
        if len(text) < 3:
            return False

        vowels = len(re.findall(r"[aeiou]", text))

        if len(text) > 3:
            vowel_ratio = vowels / len(text)
            # English text typically has 30-40% vowels
            if vowel_ratio < 0.15 or vowel_ratio > 0.7:
                return True

            # Check for excessive consonant clusters (4+ in a row)
            if re.search(r"[bcdfghjklmnpqrstvwxyz]{4,}", text):
                return True

            # Check for repeating patterns (same char 4+ times)
            if re.search(r"(.)\1{3,}", text):
                return True

        return False

    # Register UDF locally within function
    is_gibberish_udf = udf(is_gibberish_text, BooleanType())
    print("\n" + "=" * 60)
    print("üìù CLEANING TEXT COLUMNS FOR GIBBERISH")
    print("=" * 60)

    for table_name, df in dataframes.items():
        print(f"\nüîç Checking {table_name}...")

        # Get all string columns except special ones
        string_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StringType)
            and field.name
            not in ["sku", "zip_code", "postal_code", "dimensions", "transaction_id"]
        ]

        for col_name in string_cols:
            gibberish_count = df.filter(is_gibberish_udf(col(col_name))).count()

            if gibberish_count > 0:
                df = df.withColumn(
                    col_name,
                    when(is_gibberish_udf(col(col_name)), F.lit(None)).otherwise(
                        col(col_name)
                    ),
                )
                print(f"  ‚úÖ Fixed {gibberish_count} gibberish values in {col_name}")

        dataframes[table_name] = df

    print("\n" + "=" * 60)
    print("‚úÖ TEXT CLEANING COMPLETED")
    print("=" * 60)

    return dataframes

In [28]:
dataframes["suppliers"].show()

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|             city|            state|zip_code|             country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|      10669|           3.38|           NULL|        true|       true|         2025-06-02|       2025-11-20|             ÁáïÂ∏Ç|        Guip√∫zcoa|406-2179|         Bi√©lorussie|
|      10757|           4.04|           NULL|       false|       true|         2022-12-20|       2025-11-20|  Barre-sur-Alves|Baden-W√ºrttemberg|446-7296|              Belize|
|      10733|           0.13|           NULL|        true|       true|         2025-03-05|       2025-11-20|           Â

In [29]:
from pyspark.sql.functions import col, regexp_replace, when, trim
import re

def clean_numeric_strings():
    """Clean string columns with numeric formatting issues - fully general"""
    
    print("\n" + "="*60)
    print("üî¢ CLEANING NUMERIC STRING COLUMNS")
    print("="*60)
    
    # 1. Clean all ID columns - remove leading zeros
    for table_name, df in dataframes.items():
        id_columns = [col_name for col_name in df.columns if col_name.endswith("_id")]
        
        for col_name in id_columns:
            # Remove leading zeros
            df = df.withColumn(
                col_name,
                regexp_replace(col(col_name), "^0+(?=\\d)", "")
            )
        
        if id_columns:
            dataframes[table_name] = df
            print(f"‚úÖ Cleaned {len(id_columns)} ID columns in {table_name}")
    
    # 2. Clean all integer columns that might have text values
    from pyspark.sql.types import IntegerType, LongType, ShortType
    
    for table_name, df in dataframes.items():
        integer_cols = [field.name for field in df.schema.fields 
                       if isinstance(field.dataType, (IntegerType, LongType, ShortType))]
        
        for col_name in integer_cols:
            # Skip ID columns (already handled)
            if col_name.endswith("_id"):
                continue
            
            # Check if there are non-numeric values when cast to string
            non_numeric_count = df.filter(
                ~col(col_name).cast("string").rlike("^-?[0-9]+$")
            ).count()
            
            if non_numeric_count > 0:
                # Determine default value based on column name
                default_value = 1 if "quantity" in col_name.lower() else 0
                
                df = df.withColumn(
                    col_name,
                    when(
                        col(col_name).cast("string").rlike("^-?[0-9]+$"), 
                        col(col_name)
                    ).otherwise(default_value)
                )
                print(f"  ‚úÖ Cleaned {non_numeric_count} non-numeric values in {table_name}.{col_name}")
        
        dataframes[table_name] = df
    
    # 3. Clean all decimal/float columns
    from pyspark.sql.types import FloatType, DoubleType, DecimalType
    
    for table_name, df in dataframes.items():
        decimal_cols = [field.name for field in df.schema.fields 
                       if isinstance(field.dataType, (FloatType, DoubleType, DecimalType))]
        
        for col_name in decimal_cols:
            # Check for non-numeric text values
            non_numeric_count = df.filter(
                ~col(col_name).cast("string").rlike("^-?[0-9]+(\\.[0-9]+)?$")
            ).count()
            
            if non_numeric_count > 0:
                df = df.withColumn(
                    col_name,
                    when(
                        col(col_name).cast("string").rlike("^-?[0-9]+(\\.[0-9]+)?$"), 
                        col(col_name)
                    ).otherwise(0.0)
                )
                print(f"  ‚úÖ Cleaned {non_numeric_count} non-numeric values in {table_name}.{col_name}")
        
        dataframes[table_name] = df
    
    # 4. Handle columns with numeric status codes (convert to text)
    for table_name, df in dataframes.items():
        status_columns = [col_name for col_name in df.columns if col_name.endswith("_status")]
        
        for status_col in status_columns:
            # Check if status column has numeric values like "0", "1"
            numeric_status_count = df.filter(
                col(status_col).rlike("^[0-9]$")
            ).count()
            
            if numeric_status_count > 0:
                # Convert common numeric codes to text
                df = df.withColumn(
                    status_col,
                    when(col(status_col) == "1", "Active")
                    .when(col(status_col) == "0", "Inactive")
                    .when(col(status_col) == "2", "Pending")
                    .otherwise(col(status_col))
                )
                print(f"  ‚úÖ Converted {numeric_status_count} numeric status codes in {table_name}.{status_col}")
        
        dataframes[table_name] = df
    
    # 5. Clean transaction/reference IDs - remove special characters
    for table_name, df in dataframes.items():
        transaction_cols = [col_name for col_name in df.columns 
                          if "transaction" in col_name.lower() or col_name == "sku"]
        
        for col_name in transaction_cols:
            df = df.withColumn(
                col_name,
                regexp_replace(col(col_name), "[^a-zA-Z0-9-]", "")
            )
        
        if transaction_cols:
            dataframes[table_name] = df
            print(f"‚úÖ Cleaned transaction IDs in {table_name}")
    
    # 6. Trim all numeric string columns
    for table_name, df in dataframes.items():
        string_cols = [field.name for field in df.schema.fields 
                      if isinstance(field.dataType, StringType)]
        
        for col_name in string_cols:
            # Check if column looks numeric
            sample_value = df.select(col_name).filter(col(col_name).isNotNull()).first()
            if sample_value and sample_value[0]:
                if re.match(r'^[\d\s.,-]+$', str(sample_value[0])):
                    # Trim whitespace from numeric strings
                    df = df.withColumn(col_name, trim(col(col_name)))
        
        dataframes[table_name] = df
    
    print("="*60)
    print("‚úÖ NUMERIC STRING CLEANUP COMPLETED")
    print("="*60)

clean_numeric_strings()


üî¢ CLEANING NUMERIC STRING COLUMNS
‚úÖ Cleaned 2 ID columns in customer_sessions
‚úÖ Cleaned 1 ID columns in customers
‚úÖ Cleaned 3 ID columns in inventory
‚úÖ Cleaned 1 ID columns in marketing_campaigns
‚úÖ Cleaned 3 ID columns in order_items
‚úÖ Cleaned 2 ID columns in orders
‚úÖ Cleaned 3 ID columns in payments
‚úÖ Cleaned 2 ID columns in products
‚úÖ Cleaned 3 ID columns in reviews
‚úÖ Cleaned 4 ID columns in shopping_cart
‚úÖ Cleaned 1 ID columns in suppliers
‚úÖ Cleaned 3 ID columns in wishlist
‚úÖ Cleaned transaction IDs in payments
‚úÖ Cleaned transaction IDs in products
‚úÖ NUMERIC STRING CLEANUP COMPLETED


In [30]:
dataframes["suppliers"].show()

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|             city|            state|zip_code|             country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|      10669|           3.38|           NULL|        true|       true|         2025-06-02|       2025-11-20|             ÁáïÂ∏Ç|        Guip√∫zcoa|406-2179|         Bi√©lorussie|
|      10757|           4.04|           NULL|       false|       true|         2022-12-20|       2025-11-20|  Barre-sur-Alves|Baden-W√ºrttemberg|446-7296|              Belize|
|      10733|           0.13|           NULL|        true|       true|         2025-03-05|       2025-11-20|           Â

In [31]:
def clean_whitespace_issues():
    """Remove excessive whitespace and formatting issues - fully general"""

    print("\n" + "=" * 60)
    print("üßπ CLEANING WHITESPACE AND FORMATTING")
    print("=" * 60)

    from pyspark.sql.types import StringType

    for table_name, df in dataframes.items():
        # Get all string columns
        string_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StringType)
        ]

        if not string_cols:
            continue

        for col_name in string_cols:
            # 1. Trim leading/trailing whitespace
            df = df.withColumn(col_name, trim(col(col_name)))

            # 2. Replace multiple spaces with single space
            df = df.withColumn(col_name, regexp_replace(col(col_name), "\\s+", " "))

            # 3. Remove trailing special characters
            df = df.withColumn(col_name, regexp_replace(col(col_name), "[*]+$", ""))

            # 4. Remove leading special characters
            df = df.withColumn(col_name, regexp_replace(col(col_name), "^[*]+", ""))

            # 5. Clean up excessive quotes
            df = df.withColumn(col_name, regexp_replace(col(col_name), '"{2,}', '"'))

            # 6. Remove trailing quotes
            df = df.withColumn(col_name, regexp_replace(col(col_name), '"+$', ""))

        dataframes[table_name] = df
        print(f"‚úÖ Cleaned {len(string_cols)} string columns in {table_name}")

    print("=" * 60)
    print("‚úÖ WHITESPACE CLEANUP COMPLETED")
    print("=" * 60)


clean_whitespace_issues()


üßπ CLEANING WHITESPACE AND FORMATTING
‚úÖ Cleaned 4 string columns in customer_sessions
‚úÖ Cleaned 7 string columns in customers
‚úÖ Cleaned 3 string columns in inventory
‚úÖ Cleaned 5 string columns in marketing_campaigns
‚úÖ Cleaned 3 string columns in order_items
‚úÖ Cleaned 4 string columns in orders
‚úÖ Cleaned 6 string columns in payments
‚úÖ Cleaned 11 string columns in products
‚úÖ Cleaned 5 string columns in reviews
‚úÖ Cleaned 5 string columns in shopping_cart
‚úÖ Cleaned 6 string columns in suppliers
‚úÖ Cleaned 3 string columns in wishlist
‚úÖ WHITESPACE CLEANUP COMPLETED


In [32]:
dataframes["suppliers"].show()

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|             city|            state|zip_code|             country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+-----------------+-----------------+--------+--------------------+
|      10669|           3.38|           NULL|        true|       true|         2025-06-02|       2025-11-20|             ÁáïÂ∏Ç|        Guip√∫zcoa|406-2179|         Bi√©lorussie|
|      10757|           4.04|           NULL|       false|       true|         2022-12-20|       2025-11-20|  Barre-sur-Alves|Baden-W√ºrttemberg|446-7296|              Belize|
|      10733|           0.13|           NULL|        true|       true|         2025-03-05|       2025-11-20|           Â

In [33]:
def clean_mixed_scripts():
    """Remove non-ASCII characters from all text columns - fully general"""

    print("\n" + "=" * 60)
    print("üåê CLEANING MIXED SCRIPTS AND NON-ASCII CHARACTERS")
    print("=" * 60)

    from pyspark.sql.types import StringType

    for table_name, df in dataframes.items():
        # Get all string columns
        string_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StringType)
        ]

        # Skip ID columns and codes
        text_cols = [
            col_name
            for col_name in string_cols
            if not col_name.endswith("_id") and "code" not in col_name.lower()
        ]

        if not text_cols:
            continue

        print(f"\n  üîß Processing {table_name}...")

        for col_name in text_cols:
            # Count non-ASCII before cleaning
            non_ascii_count = df.filter(col(col_name).rlike(".*[^\x00-\x7f].*")).count()

            if non_ascii_count > 0:
                # Determine replacement value based on column type
                replacement_value = "Unknown"
                if "name" in col_name.lower():
                    replacement_value = "Unknown"
                elif "title" in col_name.lower():
                    replacement_value = "No Title"
                elif "desc" in col_name.lower():
                    replacement_value = "No description"
                elif "city" in col_name.lower():
                    replacement_value = "Unknown"
                elif "state" in col_name.lower() or "province" in col_name.lower():
                    replacement_value = "Unknown"
                elif "country" in col_name.lower():
                    replacement_value = "Unknown"

                # Replace non-ASCII characters
                df = df.withColumn(
                    col_name,
                    when(
                        col(col_name).rlike(".*[^\x00-\x7f].*"), replacement_value
                    ).otherwise(col(col_name)),
                )

                print(
                    f"    ‚úÖ Cleaned {non_ascii_count} rows with non-ASCII in {col_name}"
                )

        dataframes[table_name] = df

    print("\n" + "=" * 60)
    print("‚úÖ MIXED SCRIPTS CLEANUP COMPLETED")
    print("=" * 60)


clean_mixed_scripts()


üåê CLEANING MIXED SCRIPTS AND NON-ASCII CHARACTERS

  üîß Processing customer_sessions...

  üîß Processing customers...
    ‚úÖ Cleaned 80 rows with non-ASCII in city
    ‚úÖ Cleaned 77 rows with non-ASCII in state_province
    ‚úÖ Cleaned 93 rows with non-ASCII in country

  üîß Processing marketing_campaigns...

  üîß Processing orders...

  üîß Processing payments...

  üîß Processing products...
    ‚úÖ Cleaned 24 rows with non-ASCII in product_name
    ‚úÖ Cleaned 37 rows with non-ASCII in category
    ‚úÖ Cleaned 10 rows with non-ASCII in sub_category
    ‚úÖ Cleaned 2 rows with non-ASCII in brand

  üîß Processing reviews...
    ‚úÖ Cleaned 16 rows with non-ASCII in review_title
    ‚úÖ Cleaned 10 rows with non-ASCII in review_desc

  üîß Processing shopping_cart...

  üîß Processing suppliers...
    ‚úÖ Cleaned 329 rows with non-ASCII in city
    ‚úÖ Cleaned 110 rows with non-ASCII in state
    ‚úÖ Cleaned 351 rows with non-ASCII in country

‚úÖ MIXED SCRIPTS CLEAN

In [34]:
dataframes["suppliers"].show()

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+----------------+----------+--------+--------------------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|            city|     state|zip_code|             country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+----------------+----------+--------+--------------------+
|      10669|           3.38|           NULL|        true|       true|         2025-06-02|       2025-11-20|         Unknown|   Unknown|406-2179|             Unknown|
|      10757|           4.04|           NULL|       false|       true|         2022-12-20|       2025-11-20| Barre-sur-Alves|   Unknown|446-7296|              Belize|
|      10733|           0.13|           NULL|        true|       true|         2025-03-05|       2025-11-20|         Unknown|   Unknown|   55417|               Macao

In [35]:
def validate_all_cleaned_data():
    """Final validation of all cleaned data"""

    print("\n" + "=" * 60)
    print("üîç FINAL DATA VALIDATION")
    print("=" * 60)

    issues_found = False

    # 1. Check for gibberish patterns in all text columns
    for table_name, df in dataframes.items():
        from pyspark.sql.types import StringType

        string_cols = [
            field.name
            for field in df.schema.fields
            if isinstance(field.dataType, StringType)
        ]

        for col_name in string_cols:
            # Check for excessive special characters
            special_char_count = df.filter(
                col(col_name).rlike(".*[*@#$%^&]{2,}.*")
            ).count()

            if special_char_count > 0:
                print(
                    f"‚ö†Ô∏è {table_name}.{col_name}: {special_char_count} rows with multiple special characters"
                )
                issues_found = True

            # Check for excessive whitespace
            whitespace_count = df.filter(col(col_name).rlike(".*\\s{3,}.*")).count()

            if whitespace_count > 0:
                print(
                    f"‚ö†Ô∏è {table_name}.{col_name}: {whitespace_count} rows with excessive whitespace"
                )
                issues_found = True

            # Check for non-ASCII in important columns
            if col_name in [
                "city",
                "country",
                "state",
                "state_province",
                "product_name",
                "brand",
            ]:
                non_ascii_count = df.filter(
                    col(col_name).rlike(".*[^\x00-\x7f].*")
                ).count()

                if non_ascii_count > 0:
                    print(
                        f"‚ö†Ô∏è {table_name}.{col_name}: {non_ascii_count} rows with non-ASCII characters"
                    )
                    issues_found = True

    if not issues_found:
        print("‚úÖ All data passed validation checks!")
    else:
        print("\n‚ö†Ô∏è Some issues found - review the warnings above")

    print("=" * 60)


# Run validation
validate_all_cleaned_data()


üîç FINAL DATA VALIDATION
‚ö†Ô∏è marketing_campaigns.target_audience: 5 rows with multiple special characters


                                                                                




In [36]:
dataframes["suppliers"].show()

+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+----------------+----------+--------+--------------------+
|supplier_id|supplier_rating|supplier_status|is_preferred|is_verified|contract_start_date|contract_end_date|            city|     state|zip_code|             country|
+-----------+---------------+---------------+------------+-----------+-------------------+-----------------+----------------+----------+--------+--------------------+
|      10669|           3.38|           NULL|        true|       true|         2025-06-02|       2025-11-20|         Unknown|   Unknown|406-2179|             Unknown|
|      10757|           4.04|           NULL|       false|       true|         2022-12-20|       2025-11-20| Barre-sur-Alves|   Unknown|446-7296|              Belize|
|      10733|           0.13|           NULL|        true|       true|         2025-03-05|       2025-11-20|         Unknown|   Unknown|   55417|               Macao

In [37]:
dataframes["products"].show(1000)

+----------+--------------------+-------------+--------------------+--------------------+------------+-----------+----------+----------+-----------+------+----------+-----------+-----------------+---------------+
|product_id|        product_name|          sku|            category|        sub_category|       brand|supplier_id|cost_price|sell_price|launch_date|weight|dimensions|      color|             size|       material|
+----------+--------------------+-------------+--------------------+--------------------+------------+-----------+----------+----------+-----------+------+----------+-----------+-----------------+---------------+
|      1388|Samsung Casual Shoes|SA-SP0388-NAV|      Children Books|            Standard|     Saucony|      10646|    288.73|     428.1| 2022-12-01| 0.596|  35x19x10|        Red|               43|          Nylon|
|      1112|Sony Running Shoe...|SO-HM0112-TUR|            Cookware|            Featured|       Canon|      10727|    421.55|    773.67| 2022-06-09|

In [38]:
dataframes["customers"].show(1000)

+-----------+------+-------------+--------------+--------------------+--------------------+-----------+--------------------+-------------------+---------------+---------+
|customer_id|gender|date_of_birth|account_status|                city|      state_province|postal_code|             country| account_created_at|last_login_date|is_active|
+-----------+------+-------------+--------------+--------------------+--------------------+-----------+--------------------+-------------------+---------------+---------+
|       1726|  NULL|   2003-12-27|        Active|           Cookmouth|     Rheinland-Pfalz|     B8X8E7|             Unknown|2023-09-13 00:05:57|     2024-12-28|     true|
|       1413|  Male|   1989-10-01|          NULL|        North Andrew|            Albacete|      44687|              Egipto|2023-03-07 00:56:37|     2024-08-09|     true|
|       1003| Other|   1991-06-20|        Active|             Fischer|             Arizona|      41316|               Korea|2025-11-07 13:01:19| 

In [39]:
from io import BytesIO

for table, df in dataframes.items():
    pdf = df.toPandas()
    csv_buffer = BytesIO()
    pdf.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)
    file_name = "cleaned_" + table + ".csv"
    minio_client.put_object(
        bucket_name,
        file_name,
        csv_buffer,
        length=len(csv_buffer.getvalue()),
        content_type="text/csv",
    )
    print(f"‚úÖ Saved {file_name} ({len(pdf)} rows)")
    csv_buffer.close()

‚úÖ Saved cleaned_customer_sessions.csv (3059 rows)
‚úÖ Saved cleaned_customers.csv (1018 rows)
‚úÖ Saved cleaned_inventory.csv (1321 rows)
‚úÖ Saved cleaned_marketing_campaigns.csv (473 rows)
‚úÖ Saved cleaned_order_items.csv (4453 rows)
‚úÖ Saved cleaned_orders.csv (2253 rows)
‚úÖ Saved cleaned_payments.csv (3305 rows)
‚úÖ Saved cleaned_products.csv (936 rows)
‚úÖ Saved cleaned_reviews.csv (2738 rows)
‚úÖ Saved cleaned_shopping_cart.csv (1780 rows)
‚úÖ Saved cleaned_suppliers.csv (975 rows)
‚úÖ Saved cleaned_wishlist.csv (1894 rows)


In [40]:
dataframes["inventory"].show(10)

+------------+----------+-----------+--------------+-----------------+-------------------+-------------------+------------+
|inventory_id|product_id|supplier_id|stock_quantity|reserved_quantity|minimum_stock_level|last_restocked_date|storage_cost|
+------------+----------+-----------+--------------+-----------------+-------------------+-------------------+------------+
|       10847|      1984|      10147|            82|                8|                 26|         2025-10-08|        2.64|
|       10421|      1818|      10399|            61|                6|                 68|         2025-10-24|        4.71|
|       10772|      1482|      10222|             0|                0|                 43|         2025-10-24|        4.32|
|       11377|      1986|      10012|           617|              185|                 30|         2025-11-07|         4.9|
|       10238|      2387|      10100|             0|                0|                  0|         2025-03-13|        3.85|
|       