In [33]:
number_of_customers = 1000
starting_customer_index = 1000
number_of_addresses = 1500
starting_address_index = 5000
number_of_products = 1000
starting_product_index = 1000
number_of_orders = 2500
starting_order_index = 100
number_of_order_items = 5000
starting_order_item_index = 100
number_of_reviews = 3000
starting_review_index = 100
number_of_categories = 200
starting_category_index = 500
number_of_wishlists = 2000
starting_wishlist_index = 10
number_of_payments = 3500
starting_payment_index = 200
number_of_campaigns = 500
starting_campaign_index = 1
number_of_sessions = 5000
starting_session_index = 100
number_of_suppliers = 1000
starting_supplier_index = 10000
number_of_inventories = 1500
starting_inventory_index = 10000
number_of_carts = 3000
starting_cart_index = 1000

### Customer Table Generator

In [34]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, timezone
import random
import string
import re

fake = Faker(["en_US", "en_GB", "en_CA", "en_AU"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

GENDER_MAP = {
    "M": "Male", "m": "Male", "male": "Male", "man": "Male",
    "F": "Female", "f": "Female", "female": "Female", "woman": "Female",
    "O": "Other", "o": "Other", "non-binary": "Other", "nb": "Other",
    "1": "Male", "2": "Female",
}
GENDER_CANONICAL = ["Male", "Female", "Other", "Prefer not to say"]
STATUS_CANONICAL = ["Active", "Inactive", "Blocked", "Pending", "Closed"]
DUMMY_EMAIL_DOMAINS = ["example.com", "test.com", "mailinator.com", "tempmail.com"]

def generate_user_id(index):
    """Generate numeric integer user_id (> 0)."""
    return index

def to_iso8601(dt):
    """Convert to ISO-8601 with UTC timezone."""
    if dt is None:
        return None
    if isinstance(dt, str):
        try:
            parsed = pd.to_datetime(dt, errors="coerce")
            if pd.isna(parsed):
                return None
            return parsed.strftime("%Y-%m-%dT%H:%M:%SZ")
        except:
            return None
    if isinstance(dt, (datetime, pd.Timestamp)):
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    return None

def to_iso8601_date(dt):
    """Convert to YYYY-MM-DD format."""
    if dt is None:
        return None
    if isinstance(dt, str):
        try:
            parsed = pd.to_datetime(dt, errors="coerce")
            if pd.isna(parsed):
                return None
            return parsed.strftime("%Y-%m-%d")
        except:
            return None
    if isinstance(dt, (datetime, pd.Timestamp)):
        return dt.strftime("%Y-%m-%d")
    return None

def normalize_gender(gender):
    """Map to canonical gender value."""
    if not gender or (isinstance(gender, str) and gender.lower() in ["n/a", "unknown", "null", ""]):
        return "Prefer not to say"
    if isinstance(gender, str):
        if len(gender) > 30:
            return "Prefer not to say"
        mapped = GENDER_MAP.get(gender.lower()) or GENDER_MAP.get(gender)
        return mapped if mapped else "Prefer not to say"
    return "Prefer not to say"

def normalize_status(status):
    """Map to canonical status with title case."""
    if not status or (isinstance(status, str) and status.lower() in ["n/a", "null", ""]):
        return "Active"
    if isinstance(status, str):
        status = status.strip().title()
        status = re.sub(r'\s+', ' ', status)
        return status if status in STATUS_CANONICAL else "Active"
    return "Active"

def is_valid_dob(dob):
    """Check if DOB is valid Gregorian date with age 13-115."""
    if not dob:
        return False
    try:
        parsed = pd.to_datetime(dob, errors="coerce")
        if pd.isna(parsed):
            return False
        if parsed > datetime.now():
            return False
        age = (datetime.now() - parsed).days / 365.2425
        return 13 <= age <= 115
    except:
        return False

def is_valid_dob_at_signup(dob, created_date):
    """Check if user was 13+ at account creation."""
    if not dob or not created_date:
        return True
    try:
        dob_parsed = pd.to_datetime(dob)
        created_parsed = pd.to_datetime(created_date) if isinstance(created_date, str) else created_date
        min_dob = created_parsed - timedelta(days=13*365.25)
        return dob_parsed <= min_dob
    except:
        return True

def status_to_is_active(status):
    """Determine is_active boolean from status."""
    return status == "Active"

def normalize_country_code(country):
    """Normalize to ISO 3166-1 alpha-2 only."""
    iso_map = {"USA": "US", "UK": "GB", "Canada": "CA", "Australia": "AU"}
    if not country:
        return "US"
    if isinstance(country, str):
        country = country.strip().upper()
        if country in ["UNKNOWN", "N/A", "EUROPE", "APAC"]:
            return "US"
        mapped = iso_map.get(country)
        if mapped:
            return mapped
        if len(country) == 2 and country.isalpha():
            return country
    return "US"

def is_valid_postal_code(postal, country):
    """Validate postal code format for country and reject repeated digits."""
    if not postal or isinstance(postal, float):
        return True
    postal_str = str(postal).strip().upper()
    
    # Reject repeated digits: 00000, 11111, 22222, etc.
    if len(postal_str) >= 5 and len(set(postal_str.replace(" ", "-"))) == 1:
        return False
    
    # Reject common placeholders
    if postal_str in ["00000", "99999", "N/A", "NULL"]:
        return False
    
    return True

def postal_code_for_country(country):
    """Generate valid country-specific postal code."""
    if country == "US":
        code = f"{random.randint(10000, 99999)}"
        return code if code not in ["00000", "99999"] else f"{random.randint(10001, 99998)}"
    elif country == "CA":
        return f"{random.choice(string.ascii_uppercase)}{random.randint(0,9)}{random.choice(string.ascii_uppercase)} {random.randint(0,9)}{random.choice(string.ascii_uppercase)}{random.randint(0,9)}"
    elif country == "GB":
        return f"{random.choice(string.ascii_uppercase)}{random.randint(1,9)} {random.randint(0,9)}{random.choice(string.ascii_uppercase)}{random.choice(string.ascii_uppercase)}"
    elif country == "AU":
        code = f"{random.randint(1000, 9999)}"
        return code if code not in ["0000", "9999"] else f"{random.randint(1001, 9998)}"
    return f"{random.randint(10000, 99999)}"

def normalize_city(city):
    """Validate and normalize city (2-60 chars, title case, allowed chars only)."""
    if not city or (isinstance(city, str) and city.lower().strip() in ["n/a", "null", "unknown", "", "   "]):
        return None
    if isinstance(city, str):
        city = city.strip().title()
        if len(city) < 2 or len(city) > 60:
            return None
        if not re.match(r"^[a-zA-Z\s\-'\.]+$", city):
            return None
        return city
    return None

def normalize_email(email):
    """Lowercase, trim, validate domain, check for dummy domains."""
    if not email or (isinstance(email, str) and email.lower().strip() in ["n/a", "na", "not provided", "", "null"]):
        return None
    if isinstance(email, str):
        email = email.strip().lower()
        if not re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", email):
            return None
        if any(dummy in email for dummy in DUMMY_EMAIL_DOMAINS):
            return None
        if ".." in email or email.startswith(".") or email.endswith("."):
            return None
        if " " in email:
            return None
        return email
    return None

def normalize_phone(phone):
    """Normalize to E.164 format when possible."""
    if not phone or (isinstance(phone, str) and phone.upper() in ["N/A", "UNKNOWN", "0000000000"]):
        return None
    if isinstance(phone, str):
        phone = phone.strip()
        digits = re.sub(r'\D', '', phone)
        if len(digits) < 7 or len(digits) > 15:
            return None
        if digits in ["1234567890", "0000000000"] or len(set(digits)) == 1:
            return None
        if not digits.startswith("1") and len(digits) == 10:
            digits = "1" + digits
        if len(digits) == 11 and digits[0] == "1":
            return f"+{digits}"
        if len(digits) >= 7:
            return f"+{digits}" if not digits.startswith("+") else digits
    return None


def generate_messy_customer_data(num_rows=1000):
    """Generate customer data with all validation rules integrated."""
    data = []
    used_emails = set()
    used_user_ids = set()

    for i in range(num_rows):
        record = {}
        idx = starting_customer_index + i

        # 1. user_id: Pure integer, must be > 0, unique (NO DUPLICATES)
        record["user_id"] = generate_user_id(idx)
        used_user_ids.add(record["user_id"])

        # 2. gender: Canonical values only
        if i % 20 == 0:
            gender = None
        elif i % 25 == 0:
            gender = random.choice(["M", "F", "O", "male", "female", "1", "2"])
        else:
            gender = random.choice(["Male", "Female", "Other", "Prefer not to say"])
        record["gender"] = normalize_gender(gender)

        # 3. date_of_birth: YYYY-MM-DD format, age 13-115, no future dates
        if i % 15 == 0:
            dob = None
        elif i % 20 == 0:
            dob_date = fake.date_of_birth(minimum_age=18, maximum_age=80)
            formats = ["%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y"]
            dob = dob_date.strftime(random.choice(formats))
        elif i % 30 == 0:
            # Invalid dates
            dob = random.choice(["0000-00-00", "1900-01-01", "9999-99-99"])
        elif i % 40 == 0:
            # Future dates
            dob = fake.date_between(start_date="today", end_date="+10y")
        elif i % 80 == 0:
            # Too old
            dob = fake.date_between(start_date="-150y", end_date="-120y")
        else:
            dob = fake.date_of_birth(minimum_age=18, maximum_age=80)

        # Validate and convert to YYYY-MM-DD
        if dob:
            dob_converted = to_iso8601_date(dob)
            if dob_converted and is_valid_dob(dob_converted):
                record["date_of_birth"] = dob_converted
            else:
                record["date_of_birth"] = to_iso8601_date(fake.date_of_birth(minimum_age=18, maximum_age=80))
        else:
            record["date_of_birth"] = None

        # 4. account_status: Canonical values with logical constraints
        if i % 30 == 0:
            status = None
        elif i % 40 == 0:
            status = random.choice(["active", "inactive", "blocked", "pending"])
        else:
            status = random.choice(["Active", "Inactive", "Blocked", "Closed"])
        record["account_status"] = normalize_status(status)

        # 10. account_created_at: ISO-8601 with timezone, <= now()
        if i % 25 == 0:
            reg_date = None
        else:
            reg_date = fake.date_time_between(start_date="-5y", end_date="now")
        record["account_created_at"] = to_iso8601(reg_date)

        # Validate DOB lifecycle: user must be 13+ at signup
        if record["date_of_birth"] and record["account_created_at"]:
            if not is_valid_dob_at_signup(record["date_of_birth"], record["account_created_at"]):
                record["date_of_birth"] = to_iso8601_date(
                    pd.to_datetime(record["account_created_at"]) - timedelta(days=20*365.25)
                )

        # 5. address_id: Format ADDR_[1-9][0-9]*, NULL rules based on status
        if record["account_status"] in ["Active", "Inactive", "Blocked", "Closed"]:
            # For these statuses, address should usually be present
            if i % 40 == 0 and random.random() > 0.7:
                address_id = None
            else:
                address_id = f"ADDR_{starting_address_index + i}"
        else:
            # Pending can have NULL address
            if i % 25 == 0:
                address_id = None
            else:
                address_id = f"ADDR_{starting_address_index + i}"
        
        # Validate address_id format
        if address_id and isinstance(address_id, str):
            if re.match(r"^ADDR_[1-9][0-9]*$", address_id):
                record["address_id"] = address_id
            else:
                record["address_id"] = None
        else:
            record["address_id"] = None

        # 6. city: 2-60 chars, title case, no digits/special chars (except -, ', .)
        if i % 50 == 0:
            city = None
        elif i % 30 == 0:
            city = random.choice(["N/A", "NULL", "Unknown"])
        else:
            city = fake.city()
        record["city"] = normalize_city(city)

        # 9. country: ISO 3166-1 Alpha-2 uppercase only
        if i % 30 == 0:
            country = None
        elif i % 40 == 0:
            country = random.choice(["Unknown", "EUROPE", "APAC"])
        else:
            country = fake.country_code()
        record["country"] = normalize_country_code(country)

        # 7. state_province: Valid subdivision, require for US/CA/AU if address exists
        if i % 20 == 0:
            state = None
        elif i % 30 == 0:
            state = random.choice(["N/A", "XX", "Unknown"])
        else:
            state = random.choice([fake.state(), fake.state_abbr()])
        
        # Enforce non-null for country-required states
        if record["country"] in ["US", "CA", "AU"] and record["address_id"]:
            if state and state.lower() in ["n/a", "xx", "unknown", ""]:
                state = fake.state_abbr()
        
        record["state_province"] = state.strip() if (state and isinstance(state, str) and state.lower() not in ["n/a", "xx", "unknown"]) else None

        # 8. postal_code: Country-specific format, trim, no all-zeros/repeated digits
        if i % 25 == 0:
            postal = None
        elif i % 35 == 0:
            postal = random.choice(["00000", "99999", "N/A"])
        else:
            postal = postal_code_for_country(record["country"])
        
        # Validate postal code
        if postal and is_valid_postal_code(postal, record["country"]):
            record["postal_code"] = postal if str(postal).lower() not in ["n/a", "null"] else None
        else:
            record["postal_code"] = None

        # 11. last_login_date: ISO-8601, >= account_created_at, <= now()
        if i % 15 == 0:
            last_login = None
        elif i % 25 == 0:
            last_login = fake.date_time_between(start_date="-1y", end_date="now")
        elif i % 45 == 0:
            # Violation: before account creation
            last_login = fake.date_time_between(start_date="-10y", end_date="-6y")
        else:
            try:
                if reg_date:
                    last_login = fake.date_time_between(start_date=reg_date, end_date="now")
                else:
                    last_login = fake.date_time_between(start_date="-1y", end_date="now")
            except:
                last_login = fake.date_time_between(start_date="-1y", end_date="now")
        
        record["last_login_date"] = to_iso8601(last_login)

        # 12. is_active: Boolean only, consistent with status
        record["is_active"] = status_to_is_active(record["account_status"]) if record["account_status"] else None

        # 13. email_address: Lowercase, trim, validate, unique, enforce at least one contact
        if i % 15 == 0:
            email = None
        elif i % 25 == 0:
            email = random.choice(["N/A", "not provided", ""])
        else:
            email = fake.email()
        
        normalized_email = normalize_email(email)
        if normalized_email and normalized_email not in used_emails:
            record["email_address"] = normalized_email
            used_emails.add(normalized_email)
        else:
            record["email_address"] = None

        # 14. phone_number: E.164 format or normalized
        if i % 20 == 0:
            phone = None
        elif i % 30 == 0:
            phone = random.choice(["N/A", "0000000000"])
        else:
            phone = fake.phone_number()
        record["phone_number"] = normalize_phone(phone)

        # Cross-field: Enforce at least one contact method (email OR phone)
        if record["email_address"] is None and record["phone_number"] is None:
            if random.random() > 0.5:
                record["email_address"] = normalize_email(fake.email())
            else:
                record["phone_number"] = normalize_phone(fake.phone_number())

        # 15. age: Computed from DOB, 13-115, consistent with DOB
        if record["date_of_birth"]:
            try:
                dob_parsed = pd.to_datetime(record["date_of_birth"])
                age = int((datetime.now() - dob_parsed).days / 365.2425)
                if 13 <= age <= 115:
                    record["age"] = age
                else:
                    record["age"] = None
            except:
                record["age"] = None
        else:
            record["age"] = None

        # 16. total_purchases: Integer >= 0
        total_purchases = None
        if random.random() > 0.4:
            if i % 30 == 0:
                total_purchases = None
            elif i % 40 == 0:
                total_purchases = random.choice([-10, -1])
            else:
                total_purchases = random.randint(0, 100)
            
            if isinstance(total_purchases, int) and total_purchases >= 0:
                record["total_purchases"] = total_purchases
            else:
                record["total_purchases"] = None
        else:
            record["total_purchases"] = None

        # 14. lifetime_value: >= 0, cross-field with total_purchases
        ltv = None
        if random.random() > 0.3:
            if i % 20 == 0:
                ltv = None
            elif i % 30 == 0:
                ltv = random.choice(["-1", "0", "N/A"])
            elif i % 40 == 0:
                ltv = random.choice([-99999999, 99999999])
            else:
                ltv = round(random.uniform(0.01, 50000), 2)
            
            # Cross-field constraint: If total_purchases > 0, LTV must be > 0
            if record.get("total_purchases") and record["total_purchases"] > 0:
                if isinstance(ltv, (int, float)) and ltv <= 0:
                    ltv = round(random.uniform(10, 50000), 2)
            
            # If total_purchases == 0, LTV should be 0
            if record.get("total_purchases") == 0:
                if random.random() > 0.3:
                    ltv = 0
            
            if isinstance(ltv, (int, float)) and ltv >= 0:
                record["lifetime_value"] = round(ltv, 2)
            else:
                record["lifetime_value"] = None
        else:
            record["lifetime_value"] = None

        # 15. last_purchase_date: >= account_created_at, cross-field with total_purchases
        if random.random() > 0.2:
            if i % 25 == 0:
                last_purchase = None
            elif i % 35 == 0:
                last_purchase = "1900-01-01"
            else:
                last_purchase = fake.date_between(start_date="-1y", end_date="today")
            
            # Cross-field: if total_purchases > 0, last_purchase must be non-null
            if record.get("total_purchases") and record["total_purchases"] > 0:
                if last_purchase is None or last_purchase == "1900-01-01":
                    last_purchase = fake.date_between(start_date="-1y", end_date="today")
            # if total_purchases == 0, last_purchase should be null
            elif record.get("total_purchases") == 0:
                if random.random() > 0.3:
                    last_purchase = None
            
            record["last_purchase_date"] = to_iso8601_date(last_purchase)
        else:
            record["last_purchase_date"] = None

        # 19. loyalty_points: Integer >= 0, cross-field with total_purchases
        if random.random() > 0.5:
            if i % 35 == 0:
                points = None
            elif i % 45 == 0:
                points = random.choice([-9999, -1])
            else:
                points = random.randint(0, 5000)
            
            # If total_purchases == 0, points should be low (signup bonus max)
            if record.get("total_purchases") == 0:
                if random.random() > 0.3:
                    points = random.randint(0, 500)
            
            if isinstance(points, int) and points >= 0:
                record["loyalty_points"] = points
            else:
                record["loyalty_points"] = None
        else:
            record["loyalty_points"] = None

        # Cross-field realism: If is_active = TRUE, should have recent activity
        if record["is_active"] and record["account_status"] == "Active":
            if record["last_login_date"] is None and record["last_purchase_date"] is None:
                # Should have at least one recent activity
                if random.random() > 0.5:
                    record["last_login_date"] = to_iso8601(fake.date_time_between(start_date="-2y", end_date="now"))
                else:
                    record["last_purchase_date"] = to_iso8601_date(fake.date_between(start_date="-2y", end_date="today"))

        data.append(record)

    df = pd.DataFrame(data)

    # Add EMPTY rows and NULL-value rows (NOT duplicate user_ids)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series([random.choice(null_values) for _ in range(len(df.columns))], index=df.columns)
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    """Add additional data quality issues."""
    string_cols = df.select_dtypes(include=["object"]).columns
    
    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (str(x).upper() if pd.notna(x) and random.random() > 0.5 else 
                      str(x).lower() if pd.notna(x) else x)
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    for col in string_cols[:2]:
        mask = np.random.random(len(df)) < 0.01
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (str(x).replace("a", "Ã¡").replace("e", "Ã©") if pd.notna(x) and random.random() > 0.5 else x)
        )

    return df

df = generate_messy_customer_data(number_of_customers)
df = add_more_messiness(df)

output_file = "customers.xlsx"
df.to_excel(output_file, index=False)

  parsed = pd.to_datetime(dt, errors="coerce")
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Address Table Generator

In [35]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

fake = Faker(["en_US", "en_GB", "en_CA", "en_AU"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
ADDRESS_TYPE_CANONICAL = ["Shipping", "Billing", "Home", "Office", "Warehouse"]
COUNTRY_CODES = [
    "US",
    "GB",
    "DE",
    "FR",
    "CA",
    "AU",
    "JP",
    "CN",
    "IN",
    "BR",
    "MX",
    "IT",
    "ES",
    "NL",
]

# US states for validation
US_STATES = [
    "Alabama",
    "Alaska",
    "Arizona",
    "Arkansas",
    "California",
    "Colorado",
    "Connecticut",
    "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Idaho",
    "Illinois",
    "Indiana",
    "Iowa",
    "Kansas",
    "Kentucky",
    "Louisiana",
    "Maine",
    "Maryland",
    "Massachusetts",
    "Michigan",
    "Minnesota",
    "Mississippi",
    "Missouri",
    "Montana",
    "Nebraska",
    "Nevada",
    "New Hampshire",
    "New Jersey",
    "New Mexico",
    "New York",
    "North Carolina",
    "North Dakota",
    "Ohio",
    "Oklahoma",
    "Oregon",
    "Pennsylvania",
    "Rhode Island",
    "South Carolina",
    "South Dakota",
    "Tennessee",
    "Texas",
    "Utah",
    "Vermont",
    "Virginia",
    "Washington",
    "West Virginia",
    "Wisconsin",
    "Wyoming",
]

US_STATE_ABBR = [
    "AL",
    "AK",
    "AZ",
    "AR",
    "CA",
    "CO",
    "CT",
    "DE",
    "FL",
    "GA",
    "HI",
    "ID",
    "IL",
    "IN",
    "IA",
    "KS",
    "KY",
    "LA",
    "ME",
    "MD",
    "MA",
    "MI",
    "MN",
    "MS",
    "MO",
    "MT",
    "NE",
    "NV",
    "NH",
    "NJ",
    "NM",
    "NY",
    "NC",
    "ND",
    "OH",
    "OK",
    "OR",
    "PA",
    "RI",
    "SC",
    "SD",
    "TN",
    "TX",
    "UT",
    "VT",
    "VA",
    "WA",
    "WV",
    "WI",
    "WY",
]

# Generate customer and supplier IDs
customer_ids = [
    f"CUST_{i + starting_customer_index}" for i in range(number_of_customers)
]
supplier_ids = [
    f"SUP_{i + starting_supplier_index}" for i in range(number_of_suppliers)
]


def generate_messy_address_data(num_rows=2000):
    data = []
    used_address_ids = []
    default_addresses = {}  # Track defaults per (owner_id, address_type)

    for i in range(num_rows):
        record = {}

        # address_id: Primary key, format ^ADDR_[1-9][0-9]*$, unique
        if i % 53 == 0 and used_address_ids:
            addr_id = random.choice(used_address_ids)  # Duplicate violation
        elif i % 97 == 0:
            addr_id = None  # NULL violation
        elif i % 61 == 0:
            # Format violations
            formats = [
                f"ADDRESS-{i + starting_address_index}",
                f"ADDR-{i + starting_address_index}",
                f"ADDR_ {i + starting_address_index}",  # Embedded space
                str(i + starting_address_index),
            ]
            addr_id = random.choice(formats)
        elif i % 71 == 0:
            # Case inconsistencies
            addr_id = random.choice(
                [
                    f"addr_{i + starting_address_index}",
                    f"Addr_{i + starting_address_index}",
                ]
            )
        elif i % 81 == 0:
            # Placeholder values
            addr_id = random.choice(["ADDR_0", "ADDR_999999", "ADDR_TEST"])
        else:
            # Valid: canonical format
            addr_id = f"ADDR_{i + starting_address_index}"

        used_address_ids.append(addr_id)
        record["address_id"] = addr_id

        # Decide owner type (80% customer, 20% supplier)
        is_supplier_owned = random.random() < 0.20

        # user_id/owner_id: Mandatory, FK to customers or suppliers
        if i % 79 == 0:
            owner_id = None  # NULL violation (mandatory)
        elif i % 59 == 0:
            # FK violation
            if is_supplier_owned:
                owner_id = f"SUP_{99999}"
            else:
                owner_id = f"CUST_{99999}"
        elif i % 41 == 0:
            # Invalid format
            owner_id = random.choice(["INVALID", "NULL", "N/A", "", "GUEST"])
        else:
            # Valid FK
            if is_supplier_owned:
                owner_id = random.choice(supplier_ids)
            else:
                owner_id = random.choice(customer_ids)
        record["owner_id"] = owner_id

        # address_line1: Mandatory, 5-100 chars, must contain street number + name
        if i % 73 == 0:
            street1 = None  # NULL violation
        elif i % 31 == 0:
            # Placeholder values
            street1 = random.choice(["N/A", "NA", "TBD", "UNKNOWN", "-", "X"])
        elif i % 43 == 0:
            # Too short (< 5 chars)
            street1 = random.choice(["123", "St", "Ave", ""])
        elif i % 55 == 0:
            # All-numeric or all-symbol (invalid)
            street1 = random.choice(["12345", "######", "@@@@@"])
        elif i % 65 == 0:
            # No street number
            street1 = random.choice(["Maple Street", "Oak Avenue", "Main Road"])
        elif i % 75 == 0:
            # Too long (> 100 chars)
            street1 = (
                fake.street_address()
                + " "
                + fake.secondary_address()
                + " Building "
                + fake.building_number() * 5
            )
        elif i % 85 == 0:
            # Special characters
            street1 = fake.street_address() + random.choice(["#", "@", "!", "™", "©"])
        else:
            # Valid: contains street number + name
            street1 = fake.street_address()
        record["address_line1"] = street1

        # address_line2: Optional, 0-100 chars, must not equal address_line1
        if random.random() > 0.6:
            if i % 37 == 0:
                street2 = None
            elif i % 47 == 0:
                # Placeholder values (should be NULL instead)
                street2 = random.choice(["N/A", "NA", "-", ""])
            elif i % 67 == 0:
                # Equals address_line1 violation
                street2 = street1
            elif i % 77 == 0:
                # Too long
                street2 = "Suite " + "A" * 150
            else:
                # Valid: unit/suite/apartment info
                street2 = fake.secondary_address()
            record["address_line2"] = street2

        # city: Mandatory, 2-60 chars, no digits-only
        if i % 63 == 0:
            city = None  # NULL violation
        elif i % 27 == 0:
            # Placeholder values
            city = random.choice(["N/A", "NULL", "Unknown", "City", ""])
        elif i % 37 == 0:
            # Case variations
            city = fake.city().upper() if random.random() > 0.5 else fake.city().lower()
        elif i % 47 == 0:
            # Special characters
            city = fake.city() + random.choice(["!", "?", "#"])
        elif i % 57 == 0:
            # Digits only (invalid)
            city = str(random.randint(10000, 99999))
        elif i % 69 == 0:
            # Too short (< 2 chars)
            city = random.choice(["A", "X", ""])
        elif i % 83 == 0:
            # Unicode characters
            city = random.choice(["北京", "São Paulo", "Montréal", "München", "Москва"])
        elif i % 93 == 0:
            # City doesn't match state/country (geographic mismatch)
            city = random.choice(["Paris", "London", "Tokyo", "Sydney"])
        else:
            # Valid city
            city = fake.city()
        record["city"] = city

        # state_province: Mandatory for US/CA/AU, valid subdivisions
        if i % 23 == 0:
            state = None
        elif i % 33 == 0:
            # Mix of formats (full name vs abbreviation)
            state = (
                random.choice(US_STATES)
                if random.random() > 0.5
                else random.choice(US_STATE_ABBR)
            )
        elif i % 44 == 0:
            # Invalid values
            state = random.choice(["XX", "N/A", "Unknown", "99", ""])
        elif i % 54 == 0:
            # Wrong country states (geographic mismatch)
            state = random.choice(
                ["Ontario", "Quebec", "Bavaria", "Tokyo", "New South Wales"]
            )
        elif i % 64 == 0:
            # Typos
            state = random.choice(["Californai", "Texus", "Florda", "New Yrok"])
        elif i % 74 == 0:
            # Case variations
            state = random.choice(
                [fake.state().upper(), random.choice(US_STATE_ABBR).lower()]
            )
        else:
            # Valid state
            state = fake.state()
        record["state_province"] = state

        # postal_code: Country-specific format, mandatory
        if i % 26 == 0:
            postal = None
        elif i % 32 == 0:
            # Placeholder values
            postal = random.choice(
                ["00000", "99999", "XXXXX", "N/A", "NULL", "", "11111"]
            )
        elif i % 42 == 0:
            # Wrong format for country
            postal = random.choice(["SW1A 1AA", "M5H 2N2", "75001", "100-0001"])
        elif i % 52 == 0:
            # Extreme values
            postal = random.choice(["00000-0000", "99999-9999", "123", "1234567890"])
        elif i % 62 == 0:
            # Special characters
            postal = fake.postcode() + random.choice(["!", "#", "@"])
        elif i % 72 == 0:
            # Incomplete postal codes
            postal_base = fake.postcode()
            postal = postal_base[:3] if len(postal_base) > 3 else postal_base
        else:
            # Valid US ZIP
            postal = fake.postcode()
        record["postal_code"] = postal

        # country: Mandatory, ISO 3166-1 alpha-2 or official names
        if i % 68 == 0:
            country = None  # NULL violation
        elif i % 28 == 0:
            # ISO alpha-2 codes (valid)
            country = random.choice(COUNTRY_CODES)
        elif i % 38 == 0:
            # Mix of formats (should be normalized)
            country = random.choice(
                [
                    "United States",
                    "USA",
                    "US",
                    "U.S.A.",
                    "United States of America",
                    "UK",
                    "GB",
                ]
            )
        elif i % 48 == 0:
            # Invalid values
            country = random.choice(["N/A", "NULL", "Unknown", "1", "", "EU", "Europe"])
        elif i % 58 == 0:
            # Typos
            country = random.choice(["Untied States", "Canadia", "Australa", "Germny"])
        elif i % 78 == 0:
            # Case variations
            country = random.choice([fake.country().upper(), fake.country().lower()])
        elif i % 88 == 0:
            # UK vs GB (should be GB for strict ISO)
            country = "UK"
        else:
            # Valid: full country name
            country = fake.country()
        record["country"] = country

        # address_type: Canonical set
        if i % 24 == 0:
            addr_type = None
        elif i % 34 == 0:
            # Case variations
            addr_type = random.choice(
                ["billing", "BILLING", "shipping", "SHIPPING", "home", "HOME"]
            )
        elif i % 46 == 0:
            # Synonyms (should be mapped)
            addr_type = random.choice(["Work", "Primary", "Secondary", "Main"])
        elif i % 56 == 0:
            # Invalid values
            addr_type = random.choice(["1", "2", "B", "S", "H"])
        elif i % 66 == 0:
            # Owner compatibility violation
            if is_supplier_owned:
                addr_type = "Home"  # Supplier shouldn't have Home
            else:
                addr_type = "Warehouse"  # Customer shouldn't have Warehouse
        else:
            # Valid: based on owner type
            if is_supplier_owned:
                addr_type = random.choice(["Warehouse", "Office"])
            else:
                addr_type = random.choice(["Shipping", "Billing", "Home", "Office"])
        record["address_type"] = addr_type

        # is_default: Boolean, uniqueness per (owner_id, address_type)
        if i % 29 == 0:
            is_default = None  # NULL violation (should default to FALSE)
        elif i % 39 == 0:
            # Various boolean representations
            is_default = random.choice(
                ["Y", "N", "Yes", "No", "1", "0", "true", "false"]
            )
        elif i % 49 == 0:
            # Invalid values
            is_default = random.choice(["Pending", "Unknown", "N/A", ""])
        else:
            # Valid: enforce uniqueness constraint
            key = (owner_id, addr_type)
            if key in default_addresses:
                is_default = False  # Already have a default
            else:
                is_default = random.choices([True, False], weights=[30, 70], k=1)[0]
                if is_default:
                    default_addresses[key] = True
        record["is_default"] = is_default

        # phone_number: E.164 format, 7-15 digits
        if random.random() > 0.4:
            if i % 21 == 0:
                phone = None
            elif i % 35 == 0:
                # Invalid formats
                invalid_phones = [
                    "0000000000",
                    "9999999999",
                    "123",
                    "N/A",
                    "",
                    "+10000000000",
                    "1111111111",
                    "1234567890",
                ]
                phone = random.choice(invalid_phones)
            elif i % 45 == 0:
                # Too short (< 7 digits)
                phone = str(random.randint(100, 999999))
            elif i % 55 == 0:
                # Too long (> 15 digits)
                phone = str(random.randint(10**16, 10**18))
            else:
                # Valid: various formats
                phone = fake.phone_number()
            record["phone_number"] = phone

        # coordinates: lat -90 to 90, lon -180 to 180
        if random.random() > 0.3:
            if i % 31 == 0:
                lat, lon = None, None
            elif i % 43 == 0:
                # Invalid string values
                lat = random.choice(["N/A", "NULL", "999", "-999"])
                lon = random.choice(["N/A", "NULL", "999", "-999"])
            elif i % 53 == 0:
                # Out of range
                lat = random.choice([91, -91, 180, -180, 999])
                lon = random.choice([181, -181, 360, -360, 999])
            elif i % 63 == 0:
                # Placeholder (0,0)
                lat, lon = 0, 0
            else:
                # Valid coordinates
                lat = float(fake.latitude())
                lon = float(fake.longitude())
            record["latitude"] = lat
            record["longitude"] = lon

        # resident_name: Optional, minimum 2 tokens (first + last)
        if random.random() > 0.5:
            if i % 25 == 0:
                name = None
            elif i % 40 == 0:
                # Single token (violation)
                name = fake.first_name()
            elif i % 50 == 0:
                # Email/phone embedded (violation)
                name = f"{fake.name()} ({fake.email()})"
            elif i % 60 == 0:
                # Placeholder
                name = random.choice(["N/A", "Unknown", "Resident", ""])
            elif i % 70 == 0:
                # For supplier warehouse, department name
                if is_supplier_owned:
                    name = random.choice(
                        ["Receiving Department", "Warehouse Manager", "Shipping Dept"]
                    )
                else:
                    name = fake.name()
            else:
                # Valid: full name
                name = fake.name()
            record["resident_name"] = name

        # created_at: <= now()
        if random.random() > 0.5:
            if i % 32 == 0:
                created = None
            elif i % 42 == 0:
                # String format
                created = fake.date_time_between(
                    start_date="-2y", end_date="now"
                ).strftime("%Y-%m-%d %H:%M:%S")
            elif i % 62 == 0:
                # Future date violation
                created = fake.date_time_between(start_date="+1d", end_date="+1y")
            else:
                # Valid
                created = fake.date_time_between(start_date="-2y", end_date="now")
            record["created_at"] = created

        # updated_at: >= created_at, <= now()
        if random.random() > 0.6:
            if i % 36 == 0:
                updated = None
            elif i % 52 == 0:
                # Updated before created violation
                if isinstance(record.get("created_at"), datetime):
                    updated = record["created_at"] - timedelta(
                        days=random.randint(1, 30)
                    )
                else:
                    updated = fake.date_time_between(start_date="-3y", end_date="-2y")
            elif i % 72 == 0:
                # Future date violation
                updated = fake.date_time_between(start_date="+1d", end_date="+1y")
            else:
                # Valid: after created_at
                if isinstance(record.get("created_at"), datetime):
                    updated = fake.date_time_between(
                        start_date=record["created_at"], end_date="now"
                    )
                else:
                    updated = fake.date_time_between(start_date="-1y", end_date="now")
            record["updated_at"] = updated

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_address_data(number_of_addresses)
df = add_more_messiness(df)

output_file = "addresses.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Products Table Generator

In [36]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
AVAILABILITY_STATUS_CANONICAL = [
    "Active",
    "Discontinued",
    "Out of Stock",
    "Pre-order",
    "Archive",
]
COLORS_CANONICAL = [
    "Black",
    "White",
    "Blue",
    "Red",
    "Gray",
    "Silver",
    "Gold",
    "Green",
    "Navy",
    "Pink",
    "Purple",
    "Orange",
    "Yellow",
    "Brown",
    "Beige",
    "Space Gray",
]
CLOTHING_SIZES = ["XS", "S", "M", "L", "XL", "XXL", "XXXL"]
SHOE_SIZES_US = [
    "6",
    "6.5",
    "7",
    "7.5",
    "8",
    "8.5",
    "9",
    "9.5",
    "10",
    "10.5",
    "11",
    "11.5",
    "12",
    "13",
]
ELECTRONIC_SIZES = [
    '13"',
    '14"',
    '15.6"',
    '17"',
    '21"',
    '24"',
    '27"',
    '32"',
    "128GB",
    "256GB",
    "512GB",
    "1TB",
]
MATERIALS_FOOTWEAR = [
    "Leather",
    "Synthetic",
    "Canvas",
    "Mesh",
    "Rubber",
    "Suede",
    "Nylon",
]
MATERIALS_CLOTHING = [
    "Cotton",
    "Polyester",
    "Wool",
    "Silk",
    "Linen",
    "Denim",
    "Nylon",
    "Spandex",
]
MATERIALS_ELECTRONICS = [
    "Aluminum",
    "Plastic",
    "Glass",
    "Carbon Fiber",
    "Steel",
    "Magnesium Alloy",
]

BRANDS = [
    "Nike",
    "Adidas",
    "Apple",
    "Samsung",
    "Sony",
    "Dell",
    "HP",
    "Lenovo",
    "Microsoft",
    "Google",
    "Amazon",
    "LG",
    "Panasonic",
    "Canon",
    "Nikon",
    "Bose",
    "JBL",
    "Reebok",
    "Puma",
    "Under Armour",
]

PRODUCT_TYPES = [
    "Laptop",
    "Smartphone",
    "Tablet",
    "Headphones",
    "Smartwatch",
    "Camera",
    "Running Shoes",
    "Basketball Shoes",
    "Training Shoes",
    "T-Shirt",
    "Jacket",
    "Backpack",
    "Monitor",
    "Keyboard",
]

CATEGORY_NAMES = [
    "Electronics",
    "Footwear",
    "Apparel",
    "Accessories",
    "Sports",
    "Home & Office",
    "Audio",
    "Gaming",
    "Photography",
    "Fitness",
    "Outdoor",
    "Travel",
    "Technology",
    "Fashion",
]

SUB_CATEGORIES = {
    "Electronics": ["Computers", "Mobile", "Tablets", "Wearables", "Accessories"],
    "Footwear": ["Running", "Basketball", "Casual", "Formal", "Outdoor"],
    "Apparel": ["Shirts", "Pants", "Jackets", "Sportswear", "Formal"],
    "Accessories": ["Bags", "Belts", "Watches", "Jewelry", "Hats"],
    "Sports": ["Equipment", "Clothing", "Shoes", "Accessories", "Nutrition"],
    "Audio": ["Headphones", "Speakers", "Earbuds", "Microphones", "Amplifiers"],
    "Gaming": ["Consoles", "Controllers", "Headsets", "Keyboards", "Mice"],
}


def generate_messy_product_data(num_rows=1000):
    data = []
    used_product_ids = []
    used_skus = []

    # Generate category and supplier pools
    categories = [
        f"CAT_{i + starting_category_index}" for i in range(number_of_categories)
    ]
    suppliers = [
        f"SUPP_{i + starting_supplier_index}" for i in range(number_of_suppliers)
    ]

    for i in range(num_rows):
        record = {}

        # prod_id: Primary key, positive integer, unique
        if i % 53 == 0 and used_product_ids:
            prod_id = random.choice(used_product_ids)  # Duplicate violation
        else:
            prod_id = starting_product_index + i
            used_product_ids.append(prod_id)
        record["prod_id"] = prod_id if i % 97 != 0 else None  # 1% null

        # Determine product category for related attributes
        product_category = random.choice(
            ["electronics", "footwear", "clothing", "general"]
        )
        is_digital = (
            product_category == "electronics" and random.random() < 0.1
        )  # 10% of electronics are digital

        # product_name: Mandatory, 5-150 chars, descriptive
        if i % 79 == 0:
            name = None
        elif i % 31 == 0:
            name = random.choice(
                ["N/A", "NULL", "Product", "Test", ""]
            )  # Placeholder violation
        elif i % 41 == 0:
            # Very long name violation
            brand = random.choice(BRANDS)
            product = random.choice(PRODUCT_TYPES)
            name = f"{brand} {product} Premium Edition with Extra Features and Extended Warranty Limited Time Offer Special Bundle Pack"
        elif i % 51 == 0:
            # Short name violation (< 5 chars)
            name = random.choice(["ABC", "XYZ", "Pro", "Air"])
        elif i % 61 == 0:
            # Name with special characters
            brand = random.choice(BRANDS)
            product = random.choice(PRODUCT_TYPES)
            name = f"{brand} {product}™ #{i}"
        else:
            brand = random.choice(BRANDS)
            product = random.choice(PRODUCT_TYPES)
            model = (
                random.choice(["Pro", "Air", "Ultra", "Max", "Plus", "Elite"])
                if random.random() > 0.3
                else ""
            )
            name = f"{brand} {product} {model}".strip()
        record["product_name"] = name

        # stock_code (SKU): Mandatory, unique, canonical pattern ^[A-Z0-9]{2,5}-[A-Z0-9]{2,10}-[0-9]{3,6}$
        if i % 27 == 0:
            sku = None
        elif i % 37 == 0 and used_skus:
            sku = random.choice(used_skus)  # Duplicate violation
        elif i % 47 == 0:
            sku = random.choice(["N/A", "NULL", "SKU", "", "000000"])  # Invalid
        elif i % 57 == 0:
            sku = f"SKU-{i}!@#"  # Special chars violation
        elif i % 67 == 0:
            # Wrong format - lowercase
            brand_code = (
                name[:2]
                if name and name not in ["N/A", "NULL", "Product", "Test", ""]
                else "XX"
            ).upper()
            sku = f"{brand_code.lower()}-el{str(i).zfill(4)}-{random.choice(COLORS_CANONICAL)[:3].lower()}"
        else:
            # Valid SKU format: ^[A-Z0-9]{2,5}-[A-Z0-9]{2,10}-[0-9]{3,6}$
            brand_code = (
                name[:2]
                if name and name not in ["N/A", "NULL", "Product", "Test", ""]
                else "XX"
            ).upper()
            category_code = random.choice(["EL", "CL", "SP", "AC", "HM"])
            sku = f"{brand_code}-{category_code}{str(i).zfill(4)}-{str(random.randint(100, 999999)).zfill(6)}"
            used_skus.append(sku)
        record["stock_code"] = sku

        # category_ref: Mandatory, format ^CAT_[1-9][0-9]*$, FK
        if i % 29 == 0:
            category_id = None
        elif i % 39 == 0:
            category_id = f"CAT_{9999}"  # FK violation
        elif i % 49 == 0:
            category_id = random.choice(
                ["INVALID", "N/A", "", "CAT_0", "CAT_-1"]
            )  # Invalid format
        else:
            category_id = random.choice(categories)
        record["category_ref"] = category_id

        # cat_name: Must match category_ref
        if i % 26 == 0:
            cat_name = None
        elif i % 36 == 0:
            cat_name = random.choice(CATEGORY_NAMES)  # May not match category_id
        elif i % 46 == 0:
            cat_name = random.choice(
                ["N/A", "Unknown", "General", "Other", ""]
            )  # Invalid
        elif i % 56 == 0:
            cat_name = random.choice(["Electronix", "Footware", "Aparrel"])  # Typos
        else:
            # Valid: match with product category
            if product_category == "footwear":
                cat_name = "Footwear"
            elif product_category == "clothing":
                cat_name = "Apparel"
            elif product_category == "electronics":
                cat_name = "Electronics"
            else:
                cat_name = random.choice(CATEGORY_NAMES)
        record["cat_name"] = cat_name

        # sub_cat: Must be valid for cat_name
        if i % 23 == 0:
            sub_cat = None
        elif i % 33 == 0:
            sub_cat = random.choice(["N/A", "NULL", "None", "General", ""])  # Invalid
        elif i % 43 == 0:
            # Inconsistent with main category
            sub_cat = random.choice(["Computers", "Running", "Shirts"])
        elif i % 63 == 0:
            sub_cat = random.choice(["Runing", "Casul", "Moble"])  # Typos
        else:
            # Valid: appropriate subcategory based on category
            if cat_name in SUB_CATEGORIES:
                sub_cat = random.choice(SUB_CATEGORIES[cat_name])
            elif product_category == "footwear":
                sub_cat = random.choice(["Running", "Basketball", "Casual"])
            elif product_category == "clothing":
                sub_cat = random.choice(["Shirts", "Pants", "Jackets"])
            elif product_category == "electronics":
                sub_cat = random.choice(["Computers", "Mobile", "Tablets"])
            else:
                sub_cat = random.choice(["General", "Miscellaneous"])
        record["sub_cat"] = sub_cat

        # brand: Mandatory, standardized
        if i % 24 == 0:
            brand_val = None
        elif i % 34 == 0:
            brand_val = random.choice(
                ["nike", "NIKE", "Nike Inc.", "Nike®"]
            )  # Inconsistent
        elif i % 44 == 0:
            brand_val = random.choice(["Addidas", "Appl", "Samung"])  # Typos
        elif i % 54 == 0:
            brand_val = random.choice(["N/A", "Unknown", "Generic", ""])  # Invalid
        else:
            brand_val = random.choice(BRANDS)
        record["brand"] = brand_val

        # supp_id: Mandatory, format ^SUPP_[1-9][0-9]*$, FK
        if i % 21 == 0:
            supplier = None
        elif i % 32 == 0:
            supplier = f"SUPP_{999}"  # FK violation
        elif i % 42 == 0:
            supplier = random.choice(["INVALID", "N/A", "NULL", "SUPP_0"])  # Invalid
        else:
            supplier = random.choice(suppliers)
        record["supp_id"] = supplier

        # unit_cost: Decimal >= 0, typically <= retail_price
        if i % 22 == 0:
            cost = None
        elif i % 35 == 0:
            cost = random.choice(["N/A", "NULL", "Free", ""])  # String violation
        elif i % 45 == 0:
            cost = round(random.uniform(-100, -1), 2)  # Negative violation
        elif i % 55 == 0:
            cost = random.choice([0, 999999.99, 0.001, -9999])  # Extreme
        else:
            # Valid cost based on product category
            if product_category == "electronics":
                cost = round(random.uniform(50, 800), 2)
            elif product_category == "footwear":
                cost = round(random.uniform(20, 150), 2)
            elif product_category == "clothing":
                cost = round(random.uniform(10, 80), 2)
            else:
                cost = round(random.uniform(10, 300), 2)
        record["unit_cost"] = cost

        # retail_price: Decimal >= 0, typically >= unit_cost, markup 20-300%
        if i % 25 == 0:
            price = None
        elif i % 38 == 0:
            price = random.choice(
                ["N/A", "Contact for price", "TBD"]
            )  # String violation
        elif i % 48 == 0:
            # Price < cost violation
            if isinstance(cost, (int, float)) and cost > 0:
                price = round(cost * 0.5, 2)
            else:
                price = round(random.uniform(1, 10), 2)
        elif i % 58 == 0:
            price = random.choice([0, 999999.99, 0.01, -100])  # Extreme/negative
        elif i % 68 == 0:
            # Unrealistic markup (> 300%)
            if isinstance(cost, (int, float)) and cost > 0:
                price = round(cost * 5, 2)
            else:
                price = round(random.uniform(5000, 10000), 2)
        else:
            # Valid: retail_price >= unit_cost, markup 20-300%
            if isinstance(cost, (int, float)) and cost > 0:
                markup = random.uniform(1.2, 3.0)
                price = round(cost * markup, 2)
            else:
                price = round(random.uniform(20, 1000), 2)
        record["retail_price"] = price

        # release_date: YYYY-MM-DD, <= today unless Pre-order
        if i % 28 == 0:
            launch = None
        elif i % 40 == 0:
            # String format variations
            launch_date = fake.date_between(start_date="-5y", end_date="today")
            formats = ["%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y", "%Y%m%d"]
            launch = launch_date.strftime(random.choice(formats))
        elif i % 50 == 0:
            launch = random.choice(["0000-00-00", "9999-99-99", "Invalid"])  # Invalid
        elif i % 60 == 0:
            launch = fake.date_between(
                start_date="today", end_date="+1y"
            )  # Future (for pre-order)
        elif i % 70 == 0:
            launch = fake.date_between(
                start_date="-50y", end_date="-30y"
            )  # Very old violation
        else:
            launch = fake.date_between(start_date="-5y", end_date="today")
        record["release_date"] = launch

        # digital_product: Boolean
        if i % 30 == 0:
            digital_val = None
        elif i % 52 == 0:
            digital_val = random.choice(
                ["Y", "N", "Yes", "No", "1", "0"]
            )  # String boolean
        elif i % 62 == 0:
            digital_val = random.choice(["Maybe", "Unknown", "Physical"])  # Invalid
        else:
            digital_val = is_digital
        record["digital_product"] = digital_val

        # weight: Decimal > 0 for physical, 0/NULL for digital
        if i % 20 == 0:
            weight = None
        elif i % 33 == 0:
            weight = random.choice(
                ["N/A", "Unknown", "Light", "Heavy"]
            )  # String violation
        elif i % 43 == 0:
            weight = round(random.uniform(-10, -0.001), 3)  # Negative violation
        elif i % 53 == 0:
            weight = random.choice([0, 99999.999, 0.0001])  # Extreme
        elif i % 73 == 0:
            # Wrong weight for category
            if product_category == "electronics":
                weight = round(random.uniform(50, 100), 3)  # Too heavy for electronics
            else:
                weight = round(random.uniform(0.001, 0.01), 3)  # Too light
        elif is_digital:
            weight = 0  # Digital products have 0 weight
        else:
            # Valid weight based on category
            if product_category == "electronics":
                weight = round(random.uniform(0.1, 5), 3)
            elif product_category == "footwear":
                weight = round(random.uniform(0.2, 1.5), 3)
            elif product_category == "clothing":
                weight = round(random.uniform(0.1, 2), 3)
            else:
                weight = round(random.uniform(0.1, 10), 3)
        record["weight"] = weight

        # dimensions: LxWxH format, NULL for digital
        if i % 25 == 0:
            dimensions = None
        elif i % 37 == 0:
            dimensions = random.choice(
                ["Large", "Small", "N/A", "Compact"]
            )  # Invalid format
        elif i % 47 == 0:
            # Wrong separators
            l, w, h = (
                random.randint(5, 100),
                random.randint(5, 100),
                random.randint(5, 100),
            )
            dimensions = random.choice([f"{l}-{w}-{h}", f"{l}/{w}/{h}", f"{l},{w},{h}"])
        elif i % 57 == 0:
            dimensions = random.choice(["0x0x0", "-10x-10x-10"])  # Invalid values
        elif is_digital:
            dimensions = None  # Digital products have no dimensions
        else:
            # Valid dimensions based on category
            if product_category == "electronics":
                l, w, h = (
                    random.randint(10, 40),
                    random.randint(10, 30),
                    random.randint(1, 10),
                )
            elif product_category == "footwear":
                l, w, h = (
                    random.randint(25, 35),
                    random.randint(15, 20),
                    random.randint(10, 15),
                )
            else:
                l, w, h = (
                    random.randint(5, 50),
                    random.randint(5, 50),
                    random.randint(5, 50),
                )
            dimensions = f"{l}x{w}x{h}"
        record["dimensions"] = dimensions

        # color: Standardized lookup values
        if i % 19 == 0:
            color = None
        elif i % 31 == 0:
            color = random.choice(["N/A", "Unknown", "Various", "", "null"])  # Invalid
        elif i % 41 == 0:
            color_choice = random.choice(COLORS_CANONICAL)
            color = random.choice(
                [color_choice.upper(), color_choice.lower()]
            )  # Case issues
        elif i % 51 == 0:
            color = f"{random.choice(COLORS_CANONICAL)}/{random.choice(COLORS_CANONICAL)}"  # Multiple
        elif i % 71 == 0:
            color = random.choice(["Balck", "Whtie", "Grey", "Blu"])  # Typos
        else:
            if product_category == "electronics":
                color = random.choice(
                    ["Black", "Silver", "White", "Space Gray", "Gold"]
                )
            else:
                color = random.choice(COLORS_CANONICAL)
        record["color"] = color

        # size: Category-appropriate format
        if i % 26 == 0:
            size = None
        elif i % 36 == 0:
            size = random.choice(
                ["N/A", "One Size Fits All", "Standard", ""]
            )  # Invalid
        elif i % 46 == 0:
            # Wrong format for category
            if product_category == "footwear":
                size = random.choice(CLOTHING_SIZES)  # Wrong
            elif product_category == "clothing":
                size = random.choice(SHOE_SIZES_US)  # Wrong
            else:
                size = str(random.randint(1, 100))
        elif i % 66 == 0:
            size = random.choice(["Smal", "Mediun", "Larg"])  # Typos
        else:
            # Valid size for category
            if product_category == "footwear":
                size = random.choice(SHOE_SIZES_US)
            elif product_category == "clothing":
                size = random.choice(CLOTHING_SIZES)
            elif product_category == "electronics":
                size = random.choice(ELECTRONIC_SIZES)
            else:
                size = random.choice(["Small", "Medium", "Large"])
        record["size"] = size

        # material: Category-appropriate, standardized
        if i % 21 == 0:
            material = None
        elif i % 34 == 0:
            material = random.choice(["N/A", "Unknown", "Various", "Mixed"])  # Invalid
        elif i % 44 == 0:
            material = random.choice(["Lether", "Cotten", "Pollyester"])  # Typos
        elif i % 64 == 0:
            # Wrong material for category
            if product_category == "footwear":
                material = random.choice(MATERIALS_ELECTRONICS)
            elif product_category == "electronics":
                material = random.choice(MATERIALS_CLOTHING)
            else:
                material = random.choice(MATERIALS_FOOTWEAR)
        else:
            # Valid material for category
            if product_category == "footwear":
                material = random.choice(MATERIALS_FOOTWEAR)
            elif product_category == "clothing":
                material = random.choice(MATERIALS_CLOTHING)
            elif product_category == "electronics":
                material = random.choice(MATERIALS_ELECTRONICS)
            else:
                material = random.choice(MATERIALS_FOOTWEAR + MATERIALS_CLOTHING)
        record["material"] = material

        # availability_status: Canonical set, consistent with inventory/release_date
        if i % 27 == 0:
            status = None
        elif i % 38 == 0:
            status = random.choice(["active", "ACTIVE", "1", "A"])  # Case issues
        elif i % 48 == 0:
            status = random.choice(["Available", "Unavailable", "Sold Out"])  # Invalid
        elif i % 58 == 0:
            status = random.choice(["Activ", "Discontined", "Out of Stok"])  # Typos
        else:
            status = random.choices(
                AVAILABILITY_STATUS_CANONICAL, weights=[50, 10, 15, 10, 15], k=1
            )[0]
        record["availability_status"] = status

        # inventory_qty: Integer >= 0, consistent with availability_status
        if i % 30 == 0:
            stock = None
        elif i % 40 == 0:
            stock = random.choice(
                ["In Stock", "Out of Stock", "Limited"]
            )  # String violation
        elif i % 50 == 0:
            stock = random.randint(-100, -1)  # Negative violation
        elif i % 60 == 0:
            stock = random.choice([999999, 0.5, -9999])  # Extreme
        elif i % 75 == 0:
            # Inconsistent: Out of Stock but has inventory
            if status == "Out of Stock":
                stock = random.randint(50, 200)  # Violation
            else:
                stock = 0
        else:
            # Valid: consistent with availability_status
            if status == "Out of Stock":
                stock = 0
            elif status == "Pre-order":
                stock = 0  # Pre-order typically has 0 inventory
            elif status == "Active":
                stock = random.randint(10, 500)
            else:
                stock = random.randint(0, 100)
        record["inventory_qty"] = stock

        # avg_customer_rating: 0.0-5.0, consistent with review_count
        if i % 22 == 0:
            rating = None
        elif i % 32 == 0:
            rating = random.choice(["N/A", "No ratings", ""])  # String violation
        elif i % 42 == 0:
            rating = random.choice([-1, 6, 10, 999])  # Out of range
        elif i % 52 == 0:
            rating = round(random.uniform(0, 5), 5)  # Too many decimals
        else:
            rating = round(random.uniform(1, 5), 2)
        record["avg_customer_rating"] = rating

        # review_count: Integer >= 0, consistent with rating
        if i % 23 == 0:
            reviews = None
        elif i % 35 == 0:
            reviews = random.choice(["N/A", "None", ""])  # String violation
        elif i % 45 == 0:
            reviews = random.randint(-100, -1)  # Negative violation
        elif i % 65 == 0:
            # Inconsistent: rating exists but 0 reviews
            if isinstance(rating, (int, float)) and 1 <= rating <= 5:
                reviews = 0  # Violation
            else:
                reviews = random.randint(1, 100)
        else:
            # Valid: consistent with rating
            if rating is None or (isinstance(rating, (int, float)) and rating == 0):
                reviews = 0
            elif isinstance(rating, (int, float)) and rating >= 4:
                reviews = random.randint(10, 1000)
            else:
                reviews = random.randint(1, 100)
        record["review_count"] = reviews

        # discount_pct: 0-100, financial consistency
        if i % 33 == 0:
            discount = None
        elif i % 43 == 0:
            discount = random.choice(["Sale", "Clearance", "N/A"])  # String violation
        elif i % 53 == 0:
            discount = random.choice([-50, 150, 999])  # Out of range
        elif i % 69 == 0:
            # Discount makes price negative violation
            discount = 110
        elif i % 78 == 0:
            # Discount on pre-order violation
            if status == "Pre-order":
                discount = random.randint(30, 50)  # Unusual for pre-order
            else:
                discount = 0
        else:
            # Valid: 0-100, typically 0-50 for active products
            if status == "Pre-order":
                discount = 0  # Pre-order typically no discount
            elif status == "Discontinued":
                discount = random.randint(20, 70)  # Clearance
            else:
                discount = random.choices(
                    [0, 5, 10, 15, 20, 25, 30], weights=[40, 15, 15, 10, 10, 5, 5], k=1
                )[0]
        record["discount_pct"] = discount

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_product_data(number_of_products)
df = add_more_messiness(df)

output_file = "products.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Categories Table Generator

In [37]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, timezone
import random
import string
import re

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# System launch date for lower bound validation
SYSTEM_LAUNCH_DATE = datetime(2010, 1, 1, tzinfo=timezone.utc)
RESERVED_WORDS = {"admin", "cart", "checkout", "api", "search", "login", "account", "profile"}

# Controlled vocabulary for sub_cat
VALID_SUBCATEGORIES = {
    "Premium", "Standard", "Budget", "Professional", "Consumer", "Industrial",
    "Retail", "Wholesale", "Limited Edition", "Regular", "Special Offer",
    "Clearance", "New Arrival", "Best Seller", "Featured", "Sale", "On Discount"
}

def generate_cat_id(index):
    """Generate unique integer cat_id (> 0)."""
    return index

def to_iso8601(dt):
    """Convert to ISO-8601 format with UTC timezone."""
    if dt is None:
        return None
    if isinstance(dt, str):
        try:
            parsed = pd.to_datetime(dt, errors="coerce")
            if pd.isna(parsed):
                return None
            return parsed.strftime("%Y-%m-%dT%H:%M:%SZ")
        except:
            return None
    if isinstance(dt, (datetime, pd.Timestamp)):
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    return None

def is_valid_cat_name(name):
    """Validate category name: 3-60 chars, no placeholders, meaningful."""
    if not name or (isinstance(name, str) and name.lower().strip() in ["n/a", "null", "test", "category", "misc", "uncategorized", "other", "general", ""]):
        return False
    if isinstance(name, str):
        name = name.strip()
        if len(name) < 3 or len(name) > 60:
            return False
        # Check for HTML tags or control characters
        if "<" in name or ">" in name or chr(0) in name:
            return False
        # Reject repeated punctuation
        if "!!!!" in name or "???" in name or "---" in name:
            return False
        return True
    return False

def normalize_cat_name(name):
    """Normalize category name: title case, trim spaces."""
    if not name:
        return None
    if isinstance(name, str):
        name = name.strip()
        if not is_valid_cat_name(name):
            return None
        # Title case (preserve brand casing like iPhone would need manual override)
        return name.title()
    return None

def normalize_subcategory(subcat):
    """Normalize sub_cat to controlled vocabulary."""
    if not subcat or (isinstance(subcat, str) and subcat.lower().strip() in ["n/a", "null", "none", "general", "", "all", "unknown"]):
        return None
    if isinstance(subcat, str):
        subcat = subcat.strip().title()
        # Map to valid vocabulary
        for valid in VALID_SUBCATEGORIES:
            if valid.lower() == subcat.lower():
                return valid
        # If not in vocabulary, reject
        return None
    return None

def is_valid_url_slug(slug):
    """Validate URL slug: lowercase a-z0-9 with single hyphens, 2-80 chars, no reserved words."""
    if not slug:
        return False
    if isinstance(slug, str):
        slug = slug.strip().lower()
        if len(slug) < 2 or len(slug) > 80:
            return False
        # Check format: ^[a-z0-9]+(?:-[a-z0-9]+)*$
        if not re.match(r"^[a-z0-9]+(?:-[a-z0-9]+)*$", slug):
            return False
        # Check for reserved words
        if slug in RESERVED_WORDS or slug.split('-')[0] in RESERVED_WORDS:
            return False
        # Check for file-like endings
        if slug.endswith(('.php', '.html', '.asp', '.jsp')):
            return False
        return True
    return False

def generate_url_slug(cat_name):
    """Generate URL slug from category name."""
    if not cat_name:
        return None
    slug = cat_name.lower().replace(" ", "-").replace("&", "and")
    slug = re.sub(r'[^a-z0-9-]', '', slug)
    slug = re.sub(r'-+', '-', slug)
    slug = slug.strip('-')
    if is_valid_url_slug(slug):
        return slug
    return None

def normalize_bool(value):
    """Normalize to boolean: Y/N, yes/no, true/false, 1/0, T/F."""
    if value is None:
        return None
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        value_lower = value.lower().strip()
        if value_lower in ['y', 'yes', 'true', 't', '1']:
            return True
        elif value_lower in ['n', 'no', 'false', 'f', '0']:
            return False
    if isinstance(value, (int, float)):
        return bool(value)
    return None

def is_valid_hierarchy_level(level):
    """Validate hierarchy_level: 0-3 typically."""
    if level is None:
        return True
    try:
        level_int = int(level)
        return 0 <= level_int <= 10  # Allow deeper trees but flag extreme values
    except:
        return False

def breadcrumb_from_hierarchy(cat_name, parent_name=None, level=None):
    """Generate breadcrumb path from hierarchy info."""
    if not cat_name:
        return None
    if level == 0 or parent_name is None:
        return cat_name
    if parent_name:
        return f"{parent_name} > {cat_name}"
    return cat_name


def generate_messy_category_data(num_rows=200):
    """Generate category data with all 12 validation rules integrated."""
    data = []
    used_cat_ids = set()
    used_url_slugs = set()
    used_names_by_parent = {}  # Track names per parent for uniqueness rule
    category_hierarchy = {}  # Track hierarchy for validation

    for i in range(num_rows):
        record = {}
        idx = starting_category_index + i

        # 1. cat_id: Pure integer, must be > 0, unique (NO DUPLICATES)
        cat_id = generate_cat_id(idx)
        record["cat_id"] = cat_id
        used_cat_ids.add(cat_id)

        # 9. hierarchy_level: Integer >= 0, determines parent/child rules
        if i % 20 == 0:
            level = None
        elif i % 30 == 0:
            level = random.choice(["Root", "Child", "Leaf", "-1", "99"])
        else:
            # Generate level: ~60% root, ~30% level 1, ~10% level 2+
            rand = random.random()
            if rand < 0.6:
                level = 0
            elif rand < 0.9:
                level = 1
            else:
                level = random.choice([2, 3])

        # Validate and convert level
        if isinstance(level, str):
            try:
                level = int(level)
            except:
                level = None
        
        if level is not None and not is_valid_hierarchy_level(level):
            level = random.choice([0, 1, 2])
        
        record["hierarchy_level"] = level

        # 12. parent_cat_id: Must be NULL iff level = 0, referential integrity
        parent_cat_id = None
        if level and level > 0:
            # Child nodes must have parent
            if len([c for c in category_hierarchy.values() if c['level'] == level - 1]) > 0:
                possible_parents = [c['id'] for c in category_hierarchy.values() if c['level'] == level - 1]
                parent_cat_id = random.choice(possible_parents)
            elif used_cat_ids and len(used_cat_ids) > 1:
                parent_cat_id = random.choice(list(used_cat_ids)[:-1])
        elif level == 0:
            # Root nodes must NOT have parent
            parent_cat_id = None
        else:
            # No level info - randomly decide
            if random.random() > 0.7 and used_cat_ids and len(used_cat_ids) > 1:
                parent_cat_id = random.choice(list(used_cat_ids)[:-1])

        # Validation: no self-reference
        if parent_cat_id == cat_id:
            parent_cat_id = None

        # Reject orphan placeholders like 9999, 0, -1
        if parent_cat_id in [0, -1, 9999, "0", "-1", "9999"]:
            parent_cat_id = None

        record["parent_cat_id"] = parent_cat_id

        # 2. cat_name: NOT NULL, 3-60 chars, unique within parent, no placeholders
        if i % 100 == 0:
            name = None  # Violation
        elif i % 40 == 0:
            name = random.choice(["N/A", "NULL", "Category", "Test", "Misc", "Uncategorized"])
        elif i % 35 == 0:
            name = "Test" * 20  # Exceeds 60 chars
        elif i % 50 == 0:
            name = random.choice(["electronics", "ELECTRONICS", "ElEcTrOnIcS"])
        else:
            name = random.choice([
                "Electronics", "Clothing", "Sports & Outdoors", "Home & Garden",
                "Books & Media", "Computers", "Mobile Devices", "Audio",
                "Cameras", "Gaming", "Accessories", "Men's Clothing",
                "Women's Clothing", "Kids' Clothing", "Shoes", "Fitness",
                "Outdoor Recreation", "Team Sports", "Water Sports",
                "Winter Sports", "Furniture", "Kitchen", "Bedroom",
                "Bathroom", "Garden", "Fiction", "Non-Fiction"
            ])

        # Normalize name
        name = normalize_cat_name(name)
        
        # Enforce NOT NULL
        if name is None:
            if i % 100 == 0:
                name = f"Category {i}"  # Generate valid default
            else:
                name = random.choice(["Electronics", "Clothing", "Sports"])

        # Check uniqueness within parent
        parent_key = str(parent_cat_id) if parent_cat_id else "root"
        if parent_key not in used_names_by_parent:
            used_names_by_parent[parent_key] = set()

        # If already used, regenerate
        if name in used_names_by_parent[parent_key]:
            if i % 3 == 0:  # Keep some duplicates for testing
                pass
            else:
                name = f"{name} {random.randint(1, 999)}"

        used_names_by_parent[parent_key].add(name)
        record["cat_name"] = name

        # 3. sub_cat: Optional but if present must be from controlled vocabulary
        if i % 20 == 0:
            subcat = None
        elif i % 30 == 0:
            subcat = random.choice(["N/A", "NULL", "None", "General", "", "All", "Unknown"])
        elif i % 40 == 0:
            subcat = "Invalid Category Type"
        else:
            subcat = random.choice(list(VALID_SUBCATEGORIES))

        # Normalize to controlled vocabulary
        subcat = normalize_subcategory(subcat)
        
        # Hierarchy logic: Root should have NULL subcat
        if level == 0 and subcat is not None and random.random() > 0.7:
            subcat = None

        record["sub_cat"] = subcat

        # 5. url_slug: Required for public, unique, format validation, reserved words check
        if i % 35 == 0:
            slug = None
        elif i % 45 == 0:
            slug = random.choice(["category name", "N/A", "slug with spaces", "UPPERCASE", "slug_with_underscore"])
        elif i % 55 == 0:
            slug = "admin"  # Reserved word
        elif i % 65 == 0:
            slug = "category.php"  # File-like ending
        else:
            slug = generate_url_slug(name) if name else None

        # Validate format
        if slug and not is_valid_url_slug(slug):
            slug = None

        # Enforce uniqueness
        if slug and slug in used_url_slugs:
            if random.random() > 0.8:
                pass  # Keep some duplicates for testing
            else:
                slug = f"{slug}-{random.randint(1, 999)}"

        if slug:
            used_url_slugs.add(slug)

        record["url_slug"] = slug

        # 6. active_flag: Boolean only, hierarchy dependency
        if i % 25 == 0:
            active = None
        elif i % 35 == 0:
            active = random.choice(["Y", "N", "Yes", "No", "1", "0", "Maybe", "Unknown"])
        else:
            active = random.choice([True, True, True, True, False])  # 80% active

        # Normalize to boolean
        active = normalize_bool(active)

        # Hierarchy dependency: If parent is inactive, child should be inactive
        if parent_cat_id and parent_cat_id in category_hierarchy:
            if not category_hierarchy[parent_cat_id]['active']:
                if random.random() > 0.3:  # 70% enforce, 30% allow violation
                    active = False

        record["active_flag"] = active

        # 8. category_desc: Optional, 20-500 chars, no placeholders
        if random.random() > 0.4:
            if i % 35 == 0:
                desc = None
            elif i % 45 == 0:
                desc = random.choice(["N/A", "No description", "", "Lorem ipsum"])
            elif i % 55 == 0:
                desc = "Test"  # Too short (< 20 chars)
            else:
                desc = fake.sentence(nb_words=random.randint(4, 20))

            # Validate: if present, should be 20-500 chars and meaningful
            if isinstance(desc, str):
                desc = desc.strip()
                if len(desc) < 20 and desc.lower() not in ["n/a", "no description", "test", "", "none"]:
                    # Pad with more text
                    desc = desc + ". " + fake.sentence(nb_words=5)
                if len(desc) > 500:
                    desc = desc[:500]

            record["category_desc"] = desc
        else:
            record["category_desc"] = None

        # 4. display_sequence: Integer >= 0, sibling uniqueness
        if i % 30 == 0:
            sequence = None
        elif i % 40 == 0:
            sequence = random.choice(["First", "Last", "-1", "99999"])
        else:
            sequence = random.randint(0, 1000)

        # Validate: >= 0, integer
        if sequence is not None:
            try:
                sequence = int(sequence)
                if sequence < 0:
                    sequence = random.randint(0, 100)
            except:
                sequence = None

        record["display_sequence"] = sequence

        # 10. product_count: Integer >= 0, active realism
        if random.random() > 0.5:
            if i % 40 == 0:
                count = None
            elif i % 50 == 0:
                count = random.choice(["Many", "-1", "99999"])
            else:
                count = random.randint(0, 500)

            # Validate: >= 0, integer
            if count is not None:
                try:
                    count = int(count)
                    if count < 0:
                        count = random.randint(0, 100)
                except:
                    count = None

            # Active realism: If active, product_count should usually be > 0
            if active and count == 0:
                if random.random() > 0.7:  # 70% enforce
                    count = random.randint(1, 100)

            record["product_count"] = count
        else:
            record["product_count"] = None

        # 11. breadcrumb_path: Required for non-root, correct structure
        if level == 0:
            breadcrumb = name  # Root just has its name
        else:
            parent_name = None
            if parent_cat_id in category_hierarchy:
                parent_name = category_hierarchy[parent_cat_id].get('name')
            
            if parent_name:
                breadcrumb = f"{parent_name} > {name}"
            else:
                breadcrumb = name

        record["breadcrumb_path"] = breadcrumb

        # 7. date_created: ISO-8601, <= now(), >= system launch date
        if i % 30 == 0:
            created = None
        elif i % 40 == 0:
            created = fake.date_time_between(start_date="+1y", end_date="+2y")  # Future date
        elif i % 50 == 0:
            created = fake.date_time_between(start_date="-100y", end_date="-50y")  # Too old
        else:
            created = fake.date_time_between(start_date=SYSTEM_LAUNCH_DATE, end_date="now")

        # Convert to ISO-8601 and validate
        created_iso = to_iso8601(created)
        if created_iso:
            created_dt = pd.to_datetime(created_iso)
            if created_dt > datetime.now(timezone.utc):
                created_iso = to_iso8601(datetime.now(timezone.utc))
            if created_dt < SYSTEM_LAUNCH_DATE:
                created_iso = to_iso8601(SYSTEM_LAUNCH_DATE)

        record["date_created"] = created_iso

        # Store category info for hierarchy validation
        category_hierarchy[cat_id] = {
            'id': cat_id,
            'name': name,
            'level': level,
            'active': active,
            'parent': parent_cat_id
        }

        data.append(record)

    df = pd.DataFrame(data)

    # Add empty rows and NULL-value rows (NOT duplicates of cat_id)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series([random.choice(null_values) for _ in range(len(df.columns))], index=df.columns)
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    """Add additional data quality issues."""
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (str(x).upper() if pd.notna(x) and random.random() > 0.5 else 
                      str(x).lower() if pd.notna(x) else x)
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


if __name__ == "__main__":
    df = generate_messy_category_data(number_of_categories)
    df = add_more_messiness(df)

    output_file = "categories.xlsx"
    df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Wishlist Table Generator

In [38]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
PRIORITY_CANONICAL = ["Low", "Medium", "High"]
LIST_NAME_CANONICAL = ["Wishlist", "Gift Ideas", "Future Purchases", "Dream Items"]
ADD_SOURCE_CANONICAL = [
    "Web",
    "Mobile App",
    "Android",
    "iOS",
    "Social Media",
    "QR Code",
    "Email Link",
]

# Generate product prices for consistency
product_prices = {
    f"PROD_{i + starting_product_index}": round(random.uniform(10, 500), 2)
    for i in range(number_of_products)
}


def generate_messy_wishlist_data(
    num_rows=2000, customer_id_format="CUST", product_id_format="PROD"
):
    data = []
    used_wishlist_ids = []

    customer_ids = [
        f"CUST_{i + starting_customer_index}" for i in range(number_of_customers)
    ]
    product_ids = [
        f"PROD_{i + starting_product_index}" for i in range(number_of_products)
    ]

    customer_product_pairs = {}

    for i in range(num_rows):
        record = {}

        # wish_id: Primary key, positive integer, unique
        if i % 53 == 0 and used_wishlist_ids:
            wish_id = random.choice(used_wishlist_ids)  # Duplicate violation
        else:
            wish_id = starting_wishlist_index + i
            used_wishlist_ids.append(wish_id)
        record["wish_id"] = wish_id if i % 97 != 0 else None  # 1% null

        # user_id: Mandatory, FK to customers
        if i % 79 == 0:
            cust_id = None
        elif i % 61 == 0:
            cust_id = f"CUST_{99999}"  # FK violation
        elif i % 43 == 0:
            cust_id = random.choice(["INVALID", "NULL", "N/A", "", "UNKNOWN"])
        else:
            cust_id = random.choice(customer_ids)
        record["user_id"] = cust_id

        # item_id: Mandatory, FK to products
        if i % 73 == 0:
            prod_id = None
        elif i % 57 == 0:
            prod_id = f"PROD_{9999}"  # FK violation
        elif i % 47 == 0:
            prod_id = random.choice(["INVALID", "N/A", "NULL", "", "DISCONTINUED"])
        else:
            # Check for duplicate customer-product pairs
            if i % 31 == 0 and cust_id in customer_product_pairs:
                if customer_product_pairs[cust_id]:
                    prod_id = random.choice(customer_product_pairs[cust_id])
                else:
                    prod_id = random.choice(product_ids)
            else:
                prod_id = random.choice(product_ids)

        if cust_id and prod_id:
            if cust_id not in customer_product_pairs:
                customer_product_pairs[cust_id] = []
            customer_product_pairs[cust_id].append(prod_id)
        record["item_id"] = prod_id

        # Get product price for this item
        base_price = product_prices.get(prod_id, round(random.uniform(10, 500), 2))

        # date_added: <= now(), earliest event timestamp
        if i % 27 == 0:
            added = None
        elif i % 37 == 0:
            added_date = fake.date_time_between(start_date="-2y", end_date="now")
            formats = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M", "%d-%m-%Y"]
            added = added_date.strftime(random.choice(formats))
        elif i % 47 == 0:
            added = int(
                fake.date_time_between(start_date="-2y", end_date="now").timestamp()
            )
        elif i % 57 == 0:
            added = fake.date_time_between(
                start_date="+1m", end_date="+1y"
            )  # Future violation
        elif i % 67 == 0:
            added = fake.date_time_between(
                start_date="-10y", end_date="-5y"
            )  # Very old
        elif i % 77 == 0:
            added = random.choice(
                ["1970-01-01", "1900-01-01", "0000-00-00"]
            )  # Placeholder violation
        else:
            added = fake.date_time_between(start_date="-2y", end_date="now")
        record["date_added"] = added

        # Determine outcome: purchased, removed, or active
        outcome = random.choices(
            ["active", "purchased", "removed"], weights=[75, 15, 10], k=1
        )[0]

        # purchase_date: Optional, >= date_added, mutual exclusion with removal_date
        purchased = None
        if outcome == "purchased":
            if i % 42 == 0:
                # Purchased before added violation
                if isinstance(added, datetime):
                    purchased = added - timedelta(days=random.randint(1, 30))
                else:
                    purchased = fake.date_time_between(start_date="-3y", end_date="-2y")
            elif i % 52 == 0:
                purchased = fake.date_time_between(
                    start_date="+1m", end_date="+6m"
                )  # Future
            elif i % 62 == 0:
                purchased = random.choice(
                    ["0000-00-00", "N/A", "PURCHASED", ""]
                )  # Invalid format
            elif i % 72 == 0:
                # Purchased within 2 seconds of add (instrumentation artifact)
                if isinstance(added, datetime):
                    purchased = added + timedelta(seconds=random.randint(1, 2))
                else:
                    purchased = fake.date_time_between(start_date="-1y", end_date="now")
            else:
                # Valid: purchase_date >= date_added
                if isinstance(added, datetime):
                    purchased = fake.date_time_between(start_date=added, end_date="now")
                else:
                    purchased = fake.date_time_between(start_date="-1y", end_date="now")
        record["purchase_date"] = purchased

        # removal_date: Optional, >= date_added, mutual exclusion with purchase_date
        removed = None
        if outcome == "removed":
            if i % 44 == 0:
                # Removed before added violation
                if isinstance(added, datetime):
                    removed = added - timedelta(days=random.randint(1, 30))
                else:
                    removed = fake.date_time_between(start_date="-3y", end_date="-2y")
            elif i % 54 == 0:
                removed = fake.date_time_between(
                    start_date="+1m", end_date="+6m"
                )  # Future
            elif i % 74 == 0:
                # Removed within 1 second of add (bot/noise)
                if isinstance(added, datetime):
                    removed = added + timedelta(seconds=random.uniform(0.1, 1))
                else:
                    removed = fake.date_time_between(start_date="-1y", end_date="now")
            else:
                # Valid: removal_date >= date_added
                if isinstance(added, datetime):
                    removed = fake.date_time_between(start_date=added, end_date="now")
                else:
                    removed = fake.date_time_between(start_date="-1y", end_date="now")
        elif i % 81 == 0:
            # Both purchased and removed violation
            removed = fake.date_time_between(start_date="-6m", end_date="now")
            purchased = fake.date_time_between(start_date="-1y", end_date="-6m")
            record["purchase_date"] = purchased
        record["removal_date"] = removed

        # price_at_addition: Decimal >= 0, historical price
        if i % 29 == 0:
            price_added = None
        elif i % 39 == 0:
            price_added = random.choice(["N/A", "Unknown", "Free"])
        elif i % 49 == 0:
            price_added = round(random.uniform(-100, -1), 2)  # Negative violation
        elif i % 59 == 0:
            price_added = random.choice([0, 999999.99, 0.001])  # Extreme
        else:
            # Valid: based on product price with small variation
            price_added = round(base_price * random.uniform(0.9, 1.1), 2)
        record["price_at_addition"] = price_added

        # current_price: Decimal >= 0, should match products.retail_price
        if i % 33 == 0:
            current_price = None
        elif i % 43 == 0:
            current_price = random.choice(["Out of Stock", "Discontinued", "TBD"])
        elif i % 63 == 0:
            # Price swing > 300% violation
            if isinstance(price_added, (int, float)) and price_added > 0:
                current_price = round(price_added * random.uniform(4, 6), 2)
            else:
                current_price = round(random.uniform(1000, 2000), 2)
        elif i % 73 == 0:
            # Massive drop < 25% violation
            if isinstance(price_added, (int, float)) and price_added > 0:
                current_price = round(price_added * 0.1, 2)
            else:
                current_price = round(random.uniform(1, 10), 2)
        elif i % 83 == 0:
            current_price = round(random.uniform(-50, -1), 2)  # Negative
        else:
            # Valid: close to product retail price
            current_price = round(base_price * random.uniform(0.8, 1.2), 2)
        record["current_price"] = current_price

        # priority: Canonical set [Low, Medium, High], default Medium
        if random.random() > 0.5:
            if i % 22 == 0:
                priority = None  # Will default to Medium
            elif i % 32 == 0:
                priority = random.choice(
                    [
                        "high",
                        "HIGH",
                        "H",
                        "1",
                        "medium",
                        "MEDIUM",
                        "M",
                        "2",
                        "low",
                        "LOW",
                        "L",
                        "3",
                    ]
                )
            elif i % 42 == 0:
                priority = random.choice(
                    ["Urgent", "Important", "Normal", "Critical", "ASAP", ""]
                )
            elif i % 62 == 0:
                priority = random.choice(["Hihg", "Mediun", "Loww"])  # Typos
            else:
                priority = random.choice(PRIORITY_CANONICAL)
            record["priority"] = priority

        # user_notes: Optional, 0-255 chars
        if random.random() > 0.6:
            if i % 40 == 0:
                notes = None
            elif i % 50 == 0:
                notes = random.choice(["", "N/A", "None"])
            elif i % 60 == 0:
                # Very long notes violation (> 255 chars)
                notes = fake.text(max_nb_chars=500)
            elif i % 70 == 0:
                notes = random.choice(
                    ["test", "asdf", "lorem ipsum"]
                )  # Placeholder violation
            elif i % 80 == 0:
                notes = "<script>alert('xss')</script>"  # Script injection
            else:
                notes = random.choice(
                    [
                        "Birthday gift idea",
                        "Wait for sale",
                        "Check reviews first",
                        "Alternative to consider",
                        "Must have!",
                        "Compare with other options",
                        "Gift for mom",
                        fake.sentence(nb_words=6),
                    ]
                )
            record["user_notes"] = notes

        # list_name: Optional, canonical set
        if random.random() > 0.7:
            if i % 40 == 0:
                list_name = None  # Will default to "Wishlist"
            elif i % 50 == 0:
                list_name = random.choice(["", "Default", "Main"])
            elif i % 60 == 0:
                list_name = random.choice(
                    ["wish list", "WISHLIST", "WishList"]
                )  # Case variations
            elif i % 70 == 0:
                # Too short (< 3 chars)
                list_name = random.choice(["AB", "X", ""])
            else:
                list_name = random.choice(LIST_NAME_CANONICAL)
            record["list_name"] = list_name

        # price_alert_enabled: Boolean, default FALSE
        if i % 34 == 0:
            alert_enabled = None
        elif i % 44 == 0:
            alert_enabled = random.choice(["Y", "N", "Active", "Inactive"])
        else:
            alert_enabled = random.choice([True, False])
        record["price_alert_enabled"] = alert_enabled

        # notification_sent: Boolean, depends on price_alert_enabled
        if random.random() > 0.4:
            if i % 35 == 0:
                notified = None
            elif i % 45 == 0:
                notified = random.choice(
                    ["Y", "N", "Yes", "No", "1", "0", "true", "false"]
                )
            elif i % 55 == 0:
                notified = random.choice(["Pending", "Sent", "Failed"])
            elif i % 75 == 0:
                # Notification sent but alert not enabled violation
                if alert_enabled == False:
                    notified = True
                else:
                    notified = False
            else:
                # Valid: notification_sent => price_alert_enabled
                if alert_enabled == True:
                    notified = random.choice([True, False])
                else:
                    notified = False
            record["notification_sent"] = notified

        # add_source: Canonical set
        if random.random() > 0.6:
            if i % 35 == 0:
                source = None
            elif i % 45 == 0:
                source = random.choice(["", "N/A", "Unknown"])
            elif i % 55 == 0:
                source = random.choice(["android", "ios", "mobile"])  # Lowercase
            elif i % 65 == 0:
                # Both Mobile App and Android/iOS violation
                source = random.choice(["Mobile App, Android", "Mobile App/iOS"])
            else:
                source = random.choice(ADD_SOURCE_CANONICAL)
            record["add_source"] = source

        # desired_quantity: Integer >= 1, flag > 20
        if random.random() > 0.7:
            if i % 45 == 0:
                qty = None
            elif i % 55 == 0:
                qty = random.choice(["Multiple", "A few", "N/A"])
            elif i % 65 == 0:
                qty = random.choice([-1, 0, -10])  # <= 0 violation
            elif i % 75 == 0:
                qty = random.choice([999, 0.5, 10000])  # Extreme
            elif i % 85 == 0:
                qty = random.randint(25, 100)  # > 20 (reseller flag)
            else:
                # Valid: 1-5 typical
                qty = random.randint(1, 5)
            record["desired_quantity"] = qty

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_wishlist_data(number_of_wishlists)
df = add_more_messiness(df)

output_file = "wishlists.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Shopping Cart Table Generator

In [39]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import uuid

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
STATUS_CANONICAL = ["Active", "Abandoned", "Converted (Ordered)", "Deleted"]
DEVICE_TYPE_CANONICAL = [
    "Desktop Web",
    "Mobile Web",
    "iOS App",
    "Android App",
    "Tablet Web",
]
PROMO_CODES = [
    "SAVE10",
    "WELCOME20",
    "FREESHIP",
    "SUMMER2024",
    "VIP15",
    "FLASH50",
    "CLEARANCE",
    "LOYALTY",
]

# Generate product prices for consistency
product_prices = {
    f"PROD_{i + starting_product_index}": round(random.uniform(5, 500), 2)
    for i in range(number_of_products)
}


def generate_messy_shopping_cart_data(
    num_rows=3000, customer_id_format="CUST", product_id_format="PROD"
):
    data = []
    used_cart_ids = []
    used_session_ids = []

    customer_ids = [
        f"CUST_{i + starting_customer_index}" for i in range(number_of_customers)
    ]
    product_ids = [
        f"PROD_{i + starting_product_index}" for i in range(number_of_products)
    ]

    cart_sessions = {}

    for i in range(num_rows):
        record = {}

        # cart_item_id: Primary key, positive integer, unique
        if i % 53 == 0 and used_cart_ids:
            cart_id = random.choice(used_cart_ids)  # Duplicate violation
        else:
            cart_id = starting_cart_index + i
            used_cart_ids.append(cart_id)
        record["cart_item_id"] = cart_id if i % 97 != 0 else None  # 1% null

        # Decide guest vs registered
        is_guest = random.random() < 0.35  # 35% guest carts

        # customer_ref: Can be NULL for guest carts, FK to customers
        if is_guest:
            cust_id = None
        elif i % 59 == 0:
            cust_id = f"CUST_{99999}"  # FK violation
        elif i % 41 == 0:
            cust_id = random.choice(["INVALID", "NULL", "N/A", "", "GUEST"])
        elif i % 79 == 0:
            # Both customer_ref and session_identifier NULL violation
            cust_id = None
            is_guest = False  # Will force session to be NULL too
        else:
            cust_id = random.choice(customer_ids)
        record["customer_ref"] = cust_id

        # session_identifier: UUID format, required if customer_ref is NULL
        if is_guest and cust_id is None:
            # Guest cart must have session ID
            if i % 71 == 0:
                session_id = random.choice(
                    ["INVALID", "NULL", "", "000000"]
                )  # Invalid format
            elif i % 51 == 0 and used_session_ids:
                session_id = random.choice(used_session_ids[-10:])  # Reuse session
            else:
                session_id = str(uuid.uuid4())  # Valid UUID format
                used_session_ids.append(session_id)
        elif not is_guest and cust_id:
            # Registered user might still have session ID
            if random.random() < 0.7:
                if i % 61 == 0:
                    session_id = random.choice(
                        ["USER_SESSION", "N/A"]
                    )  # Invalid format
                else:
                    session_id = str(uuid.uuid4())
                    used_session_ids.append(session_id)
            else:
                session_id = None
        elif i % 79 == 0:
            session_id = None  # Both NULL violation
        else:
            session_id = None
        record["session_identifier"] = session_id

        # product_ref: Mandatory, FK to products
        if i % 73 == 0:
            prod_id = None
        elif i % 57 == 0:
            prod_id = f"PROD_{9999}"  # FK violation
        elif i % 47 == 0:
            prod_id = random.choice(["INVALID", "N/A", "NULL", "", "OUT_OF_STOCK"])
        else:
            # Group products by session
            if session_id and session_id in cart_sessions:
                if random.random() < 0.3 and cart_sessions[session_id]:
                    prod_id = random.choice(cart_sessions[session_id])
                else:
                    prod_id = random.choice(product_ids)
                    cart_sessions[session_id].append(prod_id)
            else:
                prod_id = random.choice(product_ids)
                if session_id:
                    cart_sessions[session_id] = [prod_id]
        record["product_ref"] = prod_id

        # Get product price
        base_price = product_prices.get(prod_id, round(random.uniform(5, 500), 2))

        # item_quantity: Integer >= 1, flag > 50 as suspicious
        if i % 78 == 0:
            quantity = None
        elif i % 31 == 0:
            quantity = random.choice(["One", "Two", "Many", "N/A"])  # String violation
        elif i % 42 == 0:
            quantity = random.choice([0, -1, -10])  # <= 0 violation
        elif i % 52 == 0:
            quantity = random.choice([999, 10000, -999])  # Extreme
        elif i % 62 == 0:
            quantity = random.choice([1.5, 2.3, 3.7])  # Decimal violation
        elif i % 72 == 0:
            quantity = random.randint(60, 200)  # > 50 suspicious
        else:
            # Valid: typical quantities
            quantity = random.choices(
                [1, 2, 3, 4, 5, 10, 20], weights=[40, 25, 15, 10, 5, 3, 2], k=1
            )[0]
        record["item_quantity"] = quantity

        # price_per_unit: Decimal >= 0, typically > 0
        if i % 83 == 0:
            price = None
        elif i % 33 == 0:
            price = random.choice(
                ["Free", "N/A", "Contact for price", ""]
            )  # String violation
        elif i % 43 == 0:
            price = round(random.uniform(-100, -1), 2)  # Negative violation
        elif i % 55 == 0:
            price = 0  # Zero price (needs promo indicator)
        elif i % 65 == 0:
            price = random.choice([999999.99, 0.001, -9999])  # Extreme
        elif i % 75 == 0:
            price = round(random.uniform(10, 500), 5)  # Too many decimals
        else:
            # Valid: matches product price
            price = base_price
        record["price_per_unit"] = price

        # date_added_to_cart: <= now(), no future dates
        if i % 26 == 0:
            added = None
        elif i % 36 == 0:
            added_date = fake.date_time_between(start_date="-30d", end_date="now")
            formats = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M", "%d-%m-%Y"]
            added = added_date.strftime(random.choice(formats))
        elif i % 46 == 0:
            added = int(
                fake.date_time_between(start_date="-30d", end_date="now").timestamp()
            )
        elif i % 56 == 0:
            added = fake.date_time_between(
                start_date="+1d", end_date="+7d"
            )  # Future violation
        elif i % 66 == 0:
            added = fake.date_time_between(start_date="-1y", end_date="-6m")  # Very old
        elif i % 76 == 0:
            added = random.choice(
                ["1970-01-01", "1900-01-01", "0000-00-00"]
            )  # Placeholder
        else:
            # Valid: recent carts (within 30 days)
            added = fake.date_time_between(start_date="-30d", end_date="now")
        record["date_added_to_cart"] = added

        # status: Canonical set
        if i % 22 == 0:
            status = None
        elif i % 32 == 0:
            status = random.choice(
                ["active", "ACTIVE", "A", "1", "abandoned", "ABANDONED"]
            )
        elif i % 44 == 0:
            status = random.choice(
                ["Pending", "In Progress", "Expired", "Ordered"]
            )  # Non-canonical
        elif i % 54 == 0:
            status = random.choice(["Activ", "Abandond", "Convertd"])  # Typos
        else:
            # Valid: realistic distribution based on age
            if isinstance(added, datetime):
                days_old = (datetime.now() - added).days
                if days_old > 7:
                    status = random.choices(
                        ["Abandoned", "Converted (Ordered)", "Deleted"],
                        weights=[60, 35, 5],
                        k=1,
                    )[0]
                else:
                    status = random.choices(
                        STATUS_CANONICAL, weights=[30, 45, 20, 5], k=1
                    )[0]
            else:
                status = random.choice(STATUS_CANONICAL)
        record["status"] = status

        # discount_amount: >= 0, <= price_per_unit * item_quantity
        if random.random() > 0.6:
            if i % 35 == 0:
                discount = None
            elif i % 45 == 0:
                discount = random.choice(["10%", "SALE", "N/A"])  # String violation
            elif i % 58 == 0:
                discount = round(random.uniform(-50, -1), 2)  # Negative violation
            elif i % 68 == 0:
                # Discount > gross amount violation
                if (
                    isinstance(price, (int, float))
                    and price > 0
                    and isinstance(quantity, (int, float))
                    and quantity > 0
                ):
                    discount = price * quantity * 1.5
                else:
                    discount = 1000
            elif i % 88 == 0:
                # Discount > 80% (suspicious)
                if (
                    isinstance(price, (int, float))
                    and price > 0
                    and isinstance(quantity, (int, float))
                    and quantity > 0
                ):
                    discount = round(price * quantity * 0.9, 2)
                else:
                    discount = 500
            else:
                # Valid: 0-30% discount
                if (
                    isinstance(price, (int, float))
                    and price > 0
                    and isinstance(quantity, (int, float))
                    and quantity > 0
                ):
                    discount = round(price * quantity * random.uniform(0, 0.3), 2)
                else:
                    discount = 0
            record["discount_amount"] = discount

        # promo_code: Uppercase, format ^[A-Z0-9_-]{3,30}$
        if random.random() > 0.7:
            if i % 40 == 0:
                coupon = None
            elif i % 50 == 0:
                coupon = random.choice(["", "INVALID", "EXPIRED"])
            elif i % 60 == 0:
                coupon = fake.text(max_nb_chars=100)  # Too long violation
            elif i % 70 == 0:
                coupon = random.choice(
                    ["save10", "Welcome20", "free ship"]
                )  # Wrong case/format
            elif i % 80 == 0:
                # promo_code but no discount (violation)
                record["discount_amount"] = 0
                coupon = random.choice(PROMO_CODES)
            else:
                coupon = random.choice(PROMO_CODES)
            record["promo_code"] = coupon

        # tax_amount: Decimal >= 0, rate typically 0-25%
        if random.random() > 0.5:
            if i % 39 == 0:
                tax = None
            elif i % 49 == 0:
                tax = random.choice(["Included", "Exempt", "N/A"])  # String violation
            elif i % 64 == 0:
                tax = round(random.uniform(-10, -1), 2)  # Negative violation
            elif i % 74 == 0:
                tax = random.choice([999, 0.001])  # Extreme
            elif i % 84 == 0:
                # Tax rate > 25% violation
                if (
                    isinstance(price, (int, float))
                    and price > 0
                    and isinstance(quantity, (int, float))
                    and quantity > 0
                ):
                    tax = round(price * quantity * 0.35, 2)
                else:
                    tax = 100
            else:
                # Valid: 5-15% tax
                if (
                    isinstance(price, (int, float))
                    and price > 0
                    and isinstance(quantity, (int, float))
                    and quantity > 0
                ):
                    discount_val = record.get("discount_amount", 0)
                    if not isinstance(discount_val, (int, float)):
                        discount_val = 0
                    taxable = max(0, price * quantity - discount_val)
                    tax = round(taxable * random.uniform(0.05, 0.15), 2)
                else:
                    tax = 0
            record["tax_amount"] = tax

        # last_updated: >= date_added_to_cart, <= now()
        if random.random() > 0.7:
            if i % 45 == 0:
                updated = None
            elif i % 58 == 0:
                # Updated before added violation
                if isinstance(added, datetime):
                    updated = added - timedelta(hours=random.randint(1, 24))
                else:
                    updated = fake.date_time_between(start_date="-35d", end_date="-31d")
            elif i % 69 == 0:
                # Updated in future violation
                updated = fake.date_time_between(start_date="+1d", end_date="+7d")
            else:
                # Valid: after date_added
                if isinstance(added, datetime):
                    updated = fake.date_time_between(start_date=added, end_date="now")
                else:
                    updated = fake.date_time_between(start_date="-29d", end_date="now")
            record["last_updated"] = updated

        # device_type: Canonical set
        if random.random() > 0.4:
            if i % 50 == 0:
                device = None
            elif i % 63 == 0:
                device = random.choice(["", "N/A", "Unknown"])
            elif i % 73 == 0:
                device = random.choice(
                    ["ios", "Android", "Mobile", "Chrome", "Safari"]
                )  # Non-canonical
            else:
                device = random.choice(DEVICE_TYPE_CANONICAL)
            record["device_type"] = device

        # ip_address: Valid IPv4/IPv6, reject 0.0.0.0, 127.0.0.1
        if random.random() > 0.6:
            if i % 46 == 0:
                ip = None
            elif i % 56 == 0:
                ip = random.choice(
                    [
                        "N/A",
                        "0.0.0.0",
                        "999.999.999.999",
                        "localhost",
                        "127.0.0.1",
                        "::1",
                    ]
                )
            else:
                ip = fake.ipv4()
            record["ip_address"] = ip

        # saved_for_later: Boolean, if TRUE and customer_ref is NULL, flag as inconsistent
        if random.random() > 0.8:
            if i % 57 == 0:
                saved = None
            elif i % 67 == 0:
                saved = random.choice(
                    ["Y", "N", "Yes", "No", "1", "0"]
                )  # String boolean
            elif i % 77 == 0:
                # saved_for_later=TRUE but status=Converted violation
                if status in ["Converted (Ordered)", "Converted"]:
                    saved = True
                else:
                    saved = False
            elif i % 87 == 0:
                # saved_for_later=TRUE but customer_ref is NULL violation
                if cust_id is None:
                    saved = True
                else:
                    saved = False
            else:
                saved = random.choice([True, False])
            record["saved_for_later"] = saved

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_shopping_cart_data(number_of_carts)
df = add_more_messiness(df)

output_file = "shopping_carts.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame

### Orders Table Generator

In [40]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

fake = Faker(["en_US", "en_GB", "fr_FR", "de_DE"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
STATUS_CANONICAL = [
    "Pending",
    "Processing",
    "Shipped",
    "Delivered",
    "Cancelled",
    "Returned",
]
CURRENCY_CANONICAL = ["USD", "EUR", "GBP", "CAD", "AUD"]
DEVICE_CANONICAL = ["Desktop", "Mobile", "Tablet"]
SHIPPING_METHOD_CANONICAL = [
    "Standard",
    "Priority",
    "Express",
    "2-Day",
    "White Glove",
    "Cash on Delivery",
]
PAYMENT_METHOD_CANONICAL = [
    "Credit Card",
    "PayPal",
    "Apple Pay",
    "Bitcoin",
    "Gift Card",
    "Cash on Delivery",
]
MARKETING_CHANNEL_CANONICAL = [
    "Organic Search",
    "Paid Search",
    "Social Media",
    "Direct",
    "Email",
    "Referral",
    "Display Ads",
]


def generate_messy_orders_data(num_rows=2500, customer_id_format="CUST"):
    data = []
    used_order_ids = []

    customer_ids = [
        f"CUST_{i + starting_customer_index}" for i in range(number_of_customers)
    ]

    for i in range(num_rows):
        record = {}

        # order_ref: Primary key, positive integer, unique
        if i % 53 == 0 and used_order_ids:
            order_id = random.choice(used_order_ids)  # Duplicate violation
        else:
            order_id = starting_order_index + i
            used_order_ids.append(order_id)
        record["order_ref"] = order_id if i % 97 != 0 else None  # 1% null

        # customer_ref: Mandatory, format ^CUST_[1-9][0-9]*$
        if i % 61 == 0:
            cust_id = f"CUST_{99999}"  # FK violation
        elif i % 43 == 0:
            cust_id = random.choice(["INVALID", "NULL", "N/A", "", "GUEST"])
        else:
            cust_id = random.choice(customer_ids)
        record["customer_ref"] = cust_id

        # order_status: Canonical set with date dependencies
        if i % 27 == 0:
            status = None
        elif i % 37 == 0:
            status = random.choice(
                ["pending", "PENDING", "P", "1", "shipped", "SHIPPED"]
            )
        elif i % 47 == 0:
            status = random.choice(["In Transit", "Complete", "Failed", "On Hold"])
        elif i % 57 == 0:
            status = random.choice(
                ["Pendng", "Proccessing", "Shiped", "Deliverd", "Cancled"]
            )
        else:
            status = random.choices(
                STATUS_CANONICAL, weights=[10, 15, 20, 40, 10, 5], k=1
            )[0]
        record["order_status"] = status

        # purchase_date: <= now()
        if i % 89 == 0:
            order_date = None
        elif i % 31 == 0:
            order_date_dt = fake.date_time_between(start_date="-2y", end_date="now")
            formats = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M", "%d-%m-%Y", "%Y%m%d"]
            order_date = order_date_dt.strftime(random.choice(formats))
        elif i % 41 == 0:
            order_date = int(
                fake.date_time_between(start_date="-2y", end_date="now").timestamp()
            )
        elif i % 51 == 0:
            order_date = fake.date_time_between(
                start_date="+1d", end_date="+30d"
            )  # Future violation
        elif i % 67 == 0:
            order_date = fake.date_time_between(
                start_date="-10y", end_date="-5y"
            )  # Very old
        else:
            order_date = fake.date_time_between(start_date="-2y", end_date="now")
        record["purchase_date"] = order_date

        # shipment_date: Required if Shipped/Delivered/Returned, >= purchase_date
        shipped_date = None
        requires_shipment = status in ["Shipped", "Delivered", "Returned"]

        if requires_shipment:
            if i % 42 == 0:  # Violation: shipped before ordered
                if isinstance(order_date, datetime):
                    shipped_date = order_date - timedelta(days=random.randint(1, 5))
                else:
                    shipped_date = fake.date_time_between(
                        start_date="-3y", end_date="-2y"
                    )
            elif i % 52 == 0:
                if isinstance(order_date, datetime):
                    shipped_date = (
                        order_date + timedelta(days=random.randint(1, 3))
                    ).strftime("%Y-%m-%d")
                else:
                    shipped_date = fake.date_between(start_date="-1y", end_date="today")
            elif i % 62 == 0:
                shipped_date = fake.date_time_between(
                    start_date="+1d", end_date="+30d"
                )  # Future
            else:
                # Valid: shipment_date >= purchase_date
                if isinstance(order_date, datetime):
                    max_ship = min(order_date + timedelta(days=7), datetime.now())
                    shipped_date = fake.date_time_between(
                        start_date=order_date, end_date=max_ship
                    )
                else:
                    shipped_date = fake.date_time_between(
                        start_date="-1y", end_date="now"
                    )
        elif i % 71 == 0:  # Violation: not shipped but has date
            shipped_date = fake.date_time_between(start_date="-1y", end_date="now")
        record["shipment_date"] = shipped_date

        # delivery_date: Required if Delivered/Returned, >= shipment_date
        delivered_date = None
        requires_delivery = status in ["Delivered", "Returned"]

        if requires_delivery:
            if i % 46 == 0:  # Violation: delivered before shipped
                if isinstance(shipped_date, datetime):
                    delivered_date = shipped_date - timedelta(days=random.randint(1, 3))
                elif isinstance(order_date, datetime):
                    delivered_date = order_date - timedelta(days=random.randint(1, 5))
                else:
                    delivered_date = fake.date_time_between(
                        start_date="-3y", end_date="-2y"
                    )
            elif i % 56 == 0:
                delivered_date = shipped_date  # Same day
            elif i % 66 == 0:
                delivered_date = fake.date_time_between(
                    start_date="+1d", end_date="+30d"
                )  # Future
            else:
                # Valid: delivery_date >= shipment_date
                if isinstance(shipped_date, datetime):
                    max_deliver = min(shipped_date + timedelta(days=10), datetime.now())
                    delivered_date = fake.date_time_between(
                        start_date=shipped_date, end_date=max_deliver
                    )
                elif isinstance(order_date, datetime):
                    max_deliver = min(order_date + timedelta(days=14), datetime.now())
                    delivered_date = fake.date_time_between(
                        start_date=order_date + timedelta(days=3), end_date=max_deliver
                    )
                else:
                    delivered_date = fake.date_time_between(
                        start_date="-6m", end_date="now"
                    )
        elif i % 76 == 0:  # Violation: not delivered but has date
            delivered_date = fake.date_time_between(start_date="-1y", end_date="now")
        record["delivery_date"] = delivered_date

        # order_subtotal: Decimal >= 0
        if i % 23 == 0:
            subtotal = None
        elif i % 33 == 0:
            subtotal = random.choice(["N/A", "FREE", "TBD", ""])
        elif i % 44 == 0:
            subtotal = round(random.uniform(-500, -10), 2)  # Negative violation
        elif i % 54 == 0:
            subtotal = 0  # Zero subtotal
        elif i % 64 == 0:
            subtotal = random.choice([999999.99, 0.001, -9999])
        else:
            subtotal = round(random.uniform(10, 2000), 2)
        record["order_subtotal"] = subtotal

        # tax_total: Decimal >= 0, typically 0-25% of subtotal
        if i % 26 == 0:
            tax = None
        elif i % 36 == 0:
            tax = random.choice(["Included", "Exempt", "N/A"])
        elif i % 48 == 0:
            tax = round(random.uniform(-50, -1), 2)  # Negative violation
        elif i % 58 == 0:  # Tax > subtotal violation
            if isinstance(subtotal, (int, float)) and subtotal > 0:
                tax = subtotal * 1.5
            else:
                tax = 999
        else:
            # Valid: tax 5-15% of subtotal
            if isinstance(subtotal, (int, float)) and subtotal > 0:
                tax = round(subtotal * random.uniform(0.05, 0.15), 2)
            else:
                tax = 0
        record["tax_total"] = tax

        # shipping_fee: Decimal >= 0, depends on shipping_method
        if i % 29 == 0:
            shipping = None
        elif i % 39 == 0:
            shipping = random.choice(["Free", "FREE SHIPPING", "N/A"])
        elif i % 49 == 0:
            shipping = round(random.uniform(-20, -1), 2)  # Negative violation
        elif i % 59 == 0:
            shipping = random.choice([999, 0.001, -99])
        else:
            # Valid: free for large orders, otherwise 5-50
            if isinstance(subtotal, (int, float)) and subtotal > 100:
                shipping = (
                    0 if random.random() < 0.3 else round(random.uniform(5, 25), 2)
                )
            else:
                shipping = round(random.uniform(5, 50), 2)
        record["shipping_fee"] = shipping

        # discount_total: Decimal >= 0, <= subtotal + shipping + tax
        if i % 34 == 0:
            discount = None
        elif i % 45 == 0:
            discount = random.choice(["10%", "SALE", "N/A"])
        elif i % 55 == 0:
            discount = round(random.uniform(-100, -10), 2)  # Negative violation
        elif i % 65 == 0:  # Discount > subtotal violation
            if isinstance(subtotal, (int, float)) and subtotal > 0:
                discount = subtotal * 1.2
            else:
                discount = 1000
        else:
            # Valid: discount <= subtotal
            if isinstance(subtotal, (int, float)) and subtotal > 0:
                discount = (
                    round(subtotal * random.uniform(0, 0.3), 2)
                    if random.random() < 0.4
                    else 0
                )
            else:
                discount = 0
        record["discount_total"] = discount

        # grand_total: = subtotal + tax + shipping - discount, >= 0
        if i % 28 == 0:
            total = None
        elif i % 38 == 0:
            total = random.choice(["PAID", "PENDING", "N/A"])
        elif i % 46 == 0:  # Wrong calculation violation
            if all(
                isinstance(x, (int, float)) for x in [subtotal, tax, shipping, discount]
            ):
                correct_total = subtotal + tax + shipping - discount
                total = correct_total * random.uniform(0.5, 1.5)
            else:
                total = random.uniform(10, 1000)
        elif i % 56 == 0:
            total = round(random.uniform(-500, -1), 2)  # Negative violation
        elif i % 66 == 0:
            total = 0
        else:
            # Valid: grand_total = subtotal + tax + shipping - discount
            if all(
                isinstance(x, (int, float)) for x in [subtotal, tax, shipping, discount]
            ):
                total = round(subtotal + tax + shipping - discount, 2)
                if total < 0:
                    total = 0  # Domain >= 0
            elif isinstance(subtotal, (int, float)):
                total = round(subtotal * 1.1, 2)
            else:
                total = round(random.uniform(10, 2000), 2)
        record["grand_total"] = total

        # currency_code: ISO 4217 3-letter uppercase
        if i % 32 == 0:
            currency = None
        elif i % 42 == 0:
            currency = random.choice(["US", "EURO", "Dollar", "$", "€", "N/A"])
        elif i % 52 == 0:
            currency = random.choice(["JPY", "CNY", "INR", "BTC", "DOGE"])
        else:
            currency = random.choices(
                CURRENCY_CANONICAL, weights=[60, 20, 10, 5, 5], k=1
            )[0]
        record["currency_code"] = currency

        # shipping_method: Canonical set
        if random.random() > 0.4:
            if i % 40 == 0:
                ship_method = None
            elif i % 50 == 0:
                ship_method = random.choice(["", "N/A", "TBD"])
            else:
                ship_method = random.choice(SHIPPING_METHOD_CANONICAL)
                # Cross-field: Free Shipping should have shipping_fee = 0
                if (
                    ship_method == "Standard"
                    and isinstance(shipping, (int, float))
                    and shipping == 0
                ):
                    ship_method = "Free Shipping"  # Map correctly
            record["shipping_method"] = ship_method

        # device_category: Canonical set
        if random.random() > 0.4:
            if i % 30 == 0:
                device = None
            elif i % 40 == 0:
                device = random.choice(["mobile", "MOBILE", "desk", "DESKTOP"])
            elif i % 50 == 0:
                device = random.choice(["Unknown", "Bot", "API", ""])
            else:
                device = random.choices(DEVICE_CANONICAL, weights=[40, 50, 10], k=1)[0]
            record["device_category"] = device

        # payment_method: Canonical set
        if random.random() > 0.3:
            if i % 35 == 0:
                payment = None
            elif i % 45 == 0:
                payment = random.choice(["", "N/A", "Unknown"])
            else:
                payment = random.choice(PAYMENT_METHOD_CANONICAL)
            record["payment_method"] = payment

        # marketing_channel: Canonical set
        if random.random() > 0.3:
            if i % 25 == 0:
                channel = None
            elif i % 35 == 0:
                channel = random.choice(
                    ["google", "GOOGLE", "fb", "FB", "email", "EMAIL"]
                )
            elif i % 45 == 0:
                channel = random.choice(["Unknown", "N/A", "?", ""])
            else:
                channel = random.choice(MARKETING_CHANNEL_CANONICAL)
            record["marketing_channel"] = channel

        # coupon_code: If present, discount_total > 0
        if random.random() > 0.6:
            if i % 50 == 0:
                coupon = None
            elif i % 60 == 0:
                coupon = random.choice(["", "INVALID", "EXPIRED"])
            else:
                coupon = random.choice(
                    [
                        "SAVE10",
                        "WELCOME20",
                        "FREESHIP",
                        "SUMMER2024",
                        "VIP15",
                        "FLASH50",
                        "BLACKFRIDAY",
                        "LOYALTY",
                    ]
                )
                # Cross-field: if coupon present, discount should be > 0 (unless violation)
            record["coupon_code"] = coupon

        # item_count: Integer > 0
        if random.random() > 0.5:
            if i % 45 == 0:
                items = None
            elif i % 55 == 0:
                items = random.choice(["Multiple", "Few", "Many"])
            elif i % 65 == 0:
                items = random.choice([0, -1, -5])  # <= 0 violation
            elif i % 75 == 0:
                items = random.choice([999, 0.5, 10000])
            else:
                items = random.choices(
                    [1, 2, 3, 4, 5, 10, 20], weights=[30, 25, 20, 10, 10, 3, 2], k=1
                )[0]
            record["item_count"] = items

        # customer_ip: Valid IPv4/IPv6
        if random.random() > 0.7:
            if i % 55 == 0:
                ip = None
            elif i % 65 == 0:
                ip = random.choice(["N/A", "0.0.0.0", "999.999.999.999"])
            else:
                ip = fake.ipv4()
            record["customer_ip"] = ip

        # refund_amount: >= 0, <= grand_total, only if Cancelled/Returned
        is_refundable = status in ["Returned", "Cancelled"]
        if is_refundable:
            if i % 60 == 0:
                refund = None
            elif i % 70 == 0:
                refund = random.choice(["Full", "Partial", "Pending"])
            elif i % 80 == 0:  # Refund > total violation
                if isinstance(total, (int, float)) and total > 0:
                    refund = total * 1.5
                else:
                    refund = 1000
            else:
                # Valid: refund <= grand_total
                if isinstance(total, (int, float)) and total > 0:
                    refund = round(total * random.uniform(0.5, 1.0), 2)
                else:
                    refund = round(random.uniform(10, 500), 2)
            record["refund_amount"] = refund

        # record_created: <= now(), close to purchase_date
        if random.random() > 0.3:
            if i % 30 == 0:
                created = None
            elif i % 40 == 0:
                created = fake.date_time_between(
                    start_date="-2y", end_date="now"
                ).strftime("%Y-%m-%d %H:%M:%S")
            elif i % 50 == 0:  # Violation: created after order
                if isinstance(order_date, datetime):
                    created = order_date + timedelta(days=random.randint(1, 30))
                else:
                    created = fake.date_time_between(start_date="+1d", end_date="+30d")
            else:
                # Valid: close to purchase_date
                if isinstance(order_date, datetime):
                    created = order_date + timedelta(minutes=random.randint(0, 60))
                else:
                    created = fake.date_time_between(start_date="-2y", end_date="now")
            record["record_created"] = created

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_orders_data(number_of_orders)
df = add_more_messiness(df)

output_file = "orders.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Orders Items Table Generator

In [41]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
RETURN_STATUS_CANONICAL = [
    "None",
    "Return Requested",
    "Return Pending",
    "Returned",
    "Return Rejected",
    "Exchanged",
]
VALID_WAREHOUSES = [
    "WH-EAST-01",
    "WH-WEST-01",
    "WH-CENTRAL-01",
    "WH-NORTH-01",
    "WH-SOUTH-01",
    "DC-01",
    "DC-02",
    "STORE-001",
    "STORE-002",
    "DROPSHIP",
]


def generate_messy_order_items_data(
    num_rows=5000, order_id_format="ORD", product_id_format="PROD"
):
    data = []
    used_order_item_ids = []

    order_ids = [f"ORD_{i + starting_order_index}" for i in range(number_of_orders)]
    product_ids = [
        f"PROD_{i + starting_product_index}" for i in range(number_of_products)
    ]

    order_items_map = {}

    # Generate base prices/costs for products (consistency within same product)
    product_base_prices = {pid: round(random.uniform(5, 500), 2) for pid in product_ids}
    product_base_costs = {
        pid: round(product_base_prices[pid] * random.uniform(0.4, 0.7), 2)
        for pid in product_ids
    }
    product_unit_weights = {
        pid: round(random.uniform(0.1, 10), 2) for pid in product_ids
    }

    for i in range(num_rows):
        record = {}

        # line_item_id: Primary key, positive integer, unique
        if i % 53 == 0 and used_order_item_ids:
            item_id = random.choice(used_order_item_ids)  # Duplicate violation
        else:
            item_id = starting_order_item_index + i
            used_order_item_ids.append(item_id)
        record["line_item_id"] = item_id if i % 97 != 0 else None  # 1% null

        # order_ref: Mandatory, format ^ORD_[0-9]+$
        if i % 83 == 0:
            order_id = None
        elif i % 61 == 0:
            order_id = f"ORD_{999999}"  # FK violation
        elif i % 43 == 0:
            order_id = random.choice(["INVALID", "NULL", "N/A", "", "MISSING"])
        else:
            # Group items by order (realistic basket behavior)
            if random.random() < 0.6 and order_items_map:
                recent_orders = [
                    o for o, items in order_items_map.items() if len(items) < 10
                ]
                if recent_orders:
                    order_id = random.choice(recent_orders[-20:])
                else:
                    order_id = random.choice(order_ids)
            else:
                order_id = random.choice(order_ids)
        record["order_ref"] = order_id

        # product_ref: Mandatory, format ^PROD_[0-9]+$
        if i % 79 == 0:
            prod_id = None
        elif i % 57 == 0:
            prod_id = f"PROD_{9999}"  # FK violation
        elif i % 47 == 0:
            prod_id = random.choice(["INVALID", "N/A", "NULL", "", "DISCONTINUED"])
        else:
            if order_id in order_items_map and random.random() < 0.1:
                if order_items_map[order_id]:
                    prod_id = random.choice(order_items_map[order_id])
                else:
                    prod_id = random.choice(product_ids)
            else:
                popular_products = product_ids[:20]
                prod_id = (
                    random.choice(popular_products)
                    if random.random() < 0.3
                    else random.choice(product_ids)
                )

        if order_id and prod_id:
            if order_id not in order_items_map:
                order_items_map[order_id] = []
            order_items_map[order_id].append(prod_id)
        record["product_ref"] = prod_id

        # Get base values for this product
        base_price = product_base_prices.get(prod_id, round(random.uniform(5, 500), 2))
        base_cost = product_base_costs.get(
            prod_id, round(base_price * random.uniform(0.4, 0.7), 2)
        )
        unit_weight = product_unit_weights.get(
            prod_id, round(random.uniform(0.1, 10), 2)
        )

        # qty_ordered: Integer, strictly > 0
        if i % 89 == 0:
            quantity = None
        elif i % 31 == 0:
            quantity = random.choice(["One", "Two", "Many", "N/A", "A few"])
        elif i % 41 == 0:
            quantity = random.choice([0, -1, -10, -100])  # <= 0 violation
        elif i % 51 == 0:
            quantity = random.choice([999, 10000, 0.5, -999])  # Extreme/decimal
        elif i % 67 == 0:
            quantity = random.choice([1.5, 2.3, 3.7, 10.25])  # Decimal violation
        else:
            quantity = random.choices(
                [1, 2, 3, 4, 5, 10, 20, 50, 100],
                weights=[50, 20, 10, 5, 5, 5, 3, 1, 1],
                k=1,
            )[0]
        record["qty_ordered"] = quantity

        # is_gift: Boolean - determine early as it affects pricing
        is_gift = False
        if random.random() > 0.9:
            if i % 71 == 0:
                gift_val = None
            elif i % 81 == 0:
                gift_val = random.choice(["Y", "N", "Yes", "No", "1", "0"])
            else:
                gift_val = random.choice([True, False])
                is_gift = gift_val == True
            record["is_gift"] = gift_val

        # unit_cost: Decimal >= 0
        if i % 29 == 0:
            product_cost = None
        elif i % 37 == 0:
            product_cost = random.choice(["N/A", "Unknown", "TBD"])
        elif i % 49 == 0:
            product_cost = round(random.uniform(-50, -1), 2)  # Negative violation
        elif i % 59 == 0:
            product_cost = 0  # Zero cost
        elif i % 73 == 0:
            product_cost = random.choice([9999.99, 0.001, -999])  # Extreme
        else:
            product_cost = round(base_cost * random.uniform(0.95, 1.05), 2)
        record["unit_cost"] = product_cost

        # unit_selling_price: Decimal >= 0, typically >= unit_cost
        if i % 91 == 0:
            unit_price = None
        elif i % 33 == 0:
            unit_price = random.choice(["Free", "N/A", "Contact for price", "", "TBD"])
        elif i % 44 == 0:
            unit_price = round(random.uniform(-100, -1), 2)  # Negative violation
        elif i % 55 == 0:
            unit_price = 0  # Zero price
        elif i % 66 == 0:
            unit_price = random.choice([999999.99, 0.001, -9999])  # Extreme
        elif i % 77 == 0:
            # Price < cost (negative margin) - intentional violation
            if isinstance(product_cost, (int, float)) and product_cost > 0:
                unit_price = round(product_cost * random.uniform(0.5, 0.9), 2)
            else:
                unit_price = round(base_price * 0.3, 2)
        elif is_gift:
            unit_price = 0  # Gift items have zero price
        else:
            unit_price = round(base_price * random.uniform(0.95, 1.05), 2)
        record["unit_selling_price"] = unit_price

        # item_discount: Decimal >= 0, must be <= unit_selling_price * qty_ordered
        if i % 34 == 0:
            discount = None
        elif i % 46 == 0:
            discount = random.choice(["10%", "SALE", "N/A", "Free shipping"])
        elif i % 56 == 0:
            discount = round(random.uniform(-50, -1), 2)  # Negative violation
        elif i % 69 == 0:
            # Discount > gross amount violation
            if (
                all(isinstance(x, (int, float)) for x in [quantity, unit_price])
                and quantity > 0
                and unit_price > 0
            ):
                discount = round((unit_price * quantity) * random.uniform(1.1, 1.5), 2)
            else:
                discount = random.choice([9999, 0.001, -999])
        elif (
            is_gift
            and isinstance(quantity, (int, float))
            and isinstance(unit_price, (int, float))
        ):
            discount = round(unit_price * quantity, 2)  # 100% discount for gifts
        else:
            # Valid discount: <= gross amount
            if (
                all(isinstance(x, (int, float)) for x in [quantity, unit_price])
                and quantity > 0
                and unit_price > 0
            ):
                gross = unit_price * quantity
                if quantity >= 10:
                    discount = round(gross * 0.1, 2)  # 10% bulk discount
                elif random.random() < 0.3:
                    discount = round(gross * random.uniform(0.05, 0.25), 2)
                else:
                    discount = 0
            else:
                discount = (
                    0 if random.random() > 0.2 else round(random.uniform(0, 50), 2)
                )
        record["item_discount"] = discount

        # line_total: Must equal (unit_selling_price * qty_ordered) - item_discount
        if i % 101 == 0:
            total_price = None
        elif i % 36 == 0:
            total_price = random.choice(["PAID", "PENDING", "N/A", "Calculate"])
        elif i % 48 == 0:
            # Wrong calculation - intentional violation
            if all(
                isinstance(x, (int, float)) for x in [quantity, unit_price, discount]
            ):
                correct_total = (quantity * unit_price) - discount
                total_price = round(correct_total * random.uniform(0.5, 1.5), 2)
            else:
                total_price = random.uniform(10, 1000)
        elif i % 58 == 0:
            total_price = round(random.uniform(-500, -1), 2)  # Negative violation
        elif i % 68 == 0:
            total_price = 0  # Zero but has quantity/price
        else:
            # Correct calculation: (unit_selling_price * qty_ordered) - item_discount
            if all(isinstance(x, (int, float)) for x in [quantity, unit_price]):
                discount_val = discount if isinstance(discount, (int, float)) else 0
                total_price = round((quantity * unit_price) - discount_val, 2)
                if total_price < 0:
                    total_price = 0  # Domain >= 0
            else:
                total_price = round(random.uniform(10, 1000), 2)
        record["line_total"] = total_price

        # profit_margin: Must equal line_total - (unit_cost * qty_ordered)
        if random.random() > 0.6:
            if i % 85 == 0:
                margin = f"LOSS: ${random.randint(10, 500)}"  # String violation
            elif all(
                isinstance(x, (int, float))
                for x in [total_price, product_cost, quantity]
            ):
                margin = round(total_price - (product_cost * quantity), 2)
            else:
                margin = None
            record["profit_margin"] = margin

        # tax_amount: Decimal >= 0, based on line_total
        if random.random() > 0.5:
            if i % 42 == 0:
                tax = None
            elif i % 52 == 0:
                tax = random.choice(["Included", "Exempt", "N/A"])
            elif i % 62 == 0:
                tax = round(random.uniform(-10, -1), 2)  # Negative violation
            else:
                # Valid: tax based on line_total (5-15% typical)
                if isinstance(total_price, (int, float)) and total_price > 0:
                    tax = round(total_price * random.uniform(0.05, 0.15), 2)
                else:
                    tax = 0  # Zero tax if line_total is 0
            record["tax_amount"] = tax

        # product_sku: Format ^SKU-[A-Z0-9]+$, must belong to product_ref
        if random.random() > 0.6:
            if i % 45 == 0:
                sku = None
            elif i % 54 == 0:
                sku = random.choice(["N/A", "NULL", "SKU"])  # Invalid
            else:
                # Valid: SKU belongs to product_ref
                if prod_id and prod_id.startswith("PROD_"):
                    prod_num = prod_id.split("_")[1]
                    sku = (
                        f"SKU-{prod_num}-{random.choice(['BLK', 'WHT', 'RED', 'BLU'])}"
                    )
                else:
                    sku = f"SKU-{random.randint(1000, 9999)}-BLK"
            record["product_sku"] = sku

        # total_weight_kg: unit_weight * qty_ordered
        if random.random() > 0.7:
            if i % 63 == 0:
                weight = None
            elif i % 72 == 0:
                weight = random.choice(["Light", "Heavy", "N/A"])
            elif i % 82 == 0:
                weight = round(random.uniform(-5, -0.1), 2)  # Negative violation
            else:
                # Valid: weight = unit_weight * qty_ordered
                if isinstance(quantity, (int, float)) and quantity > 0:
                    weight = round(unit_weight * quantity, 2)
                else:
                    weight = round(random.uniform(0.1, 50), 2)
            record["total_weight_kg"] = weight

        # fulfillment_location: Mandatory for physical, valid format
        if random.random() > 0.7:
            if i % 50 == 0:
                warehouse = None
            elif i % 60 == 0:
                warehouse = random.choice(["", "N/A", "TBD"])  # Invalid
            else:
                warehouse = random.choice(VALID_WAREHOUSES)
            record["fulfillment_location"] = warehouse

        # return_status: Canonical set
        if random.random() > 0.85:
            if i % 55 == 0:
                return_status = None
            elif i % 65 == 0:
                return_status = random.choice(["", "N/A", "Refunded"])  # Invalid
            else:
                return_status = random.choice(RETURN_STATUS_CANONICAL)
            record["return_status"] = return_status

        # created_timestamp: ISO 8601, <= now()
        if random.random() > 0.3:
            if i % 32 == 0:
                created = None
            elif i % 38 == 0:
                created = fake.date_time_between(
                    start_date="-2y", end_date="now"
                ).strftime("%Y-%m-%d %H:%M:%S")
            elif i % 64 == 0:
                created = fake.date_time_between(
                    start_date="+1d", end_date="+30d"
                )  # Future violation
            else:
                created = fake.date_time_between(start_date="-2y", end_date="now")
            record["created_timestamp"] = created

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_order_items_data(number_of_order_items)
df = add_more_messiness(df)

output_file = "order_items.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Payments Table Generator

In [42]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import uuid

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
PAYMENT_TYPE_CANONICAL = [
    "Credit Card",
    "Debit Card",
    "PayPal",
    "Apple Pay",
    "Google Pay",
    "Bank Transfer",
    "Gift Card",
    "Cash on Delivery",
    "Buy Now Pay Later",
]
GATEWAY_PROVIDER_CANONICAL = [
    "Stripe",
    "Square",
    "PayPal",
    "Authorize.net",
    "Adyen",
    "Braintree",
    "Plaid",
    "COD",
]
PAYMENT_STATUS_CANONICAL = [
    "Pending",
    "Completed",
    "Failed",
    "Refunded",
    "Partially Refunded",
    "Cancelled",
]
CARD_BRAND_CANONICAL = ["Visa", "MasterCard", "American Express", "Discover", "JCB"]
CURRENCY_CANONICAL = ["USD", "EUR", "GBP", "CAD", "AUD"]

# Payment type to provider mapping
PAYMENT_PROVIDER_MAP = {
    "Credit Card": ["Stripe", "Square", "Authorize.net", "Braintree", "Adyen"],
    "Debit Card": ["Stripe", "Square", "Authorize.net", "Adyen"],
    "PayPal": ["PayPal"],
    "Apple Pay": ["Stripe", "Square", "Adyen", "Braintree"],
    "Google Pay": ["Stripe", "Square", "Adyen", "Braintree"],
    "Bank Transfer": ["Plaid"],
    "Gift Card": ["Stripe", "Square"],  # Internal handling
    "Cash on Delivery": ["COD"],
    "Buy Now Pay Later": ["Stripe", "Adyen"],  # Klarna/Afterpay through these
}


def generate_gateway_transaction_id(provider, payment_status):
    """Generate provider-specific transaction ID format"""
    if payment_status in ["Failed", "Cancelled"] and random.random() < 0.3:
        return None  # Some failed transactions don't get IDs

    if provider == "Stripe":
        return f"pi_{uuid.uuid4().hex[:24]}"
    elif provider == "PayPal":
        return f"PP-{uuid.uuid4().hex[:20].upper()}"
    elif provider == "Square":
        return f"sq_{uuid.uuid4().hex[:22]}"
    elif provider == "Adyen":
        return f"ADY-{uuid.uuid4().hex[:16].upper()}"
    elif provider == "Braintree":
        return f"bt_{uuid.uuid4().hex[:20]}"
    elif provider == "Authorize.net":
        return f"AUTH-{random.randint(100000000, 999999999)}"
    elif provider == "Plaid":
        return f"plaid_{uuid.uuid4().hex[:18]}"
    elif provider == "COD":
        return None  # COD typically has no gateway transaction ID
    else:
        return f"TXN-{uuid.uuid4().hex[:16].upper()}"


def generate_messy_payments_data(num_rows=3500, order_id_format="ORD"):
    data = []
    used_payment_ids = []
    used_transaction_ids = []

    order_ids = [f"ORD_{i + starting_order_index}" for i in range(number_of_orders)]

    # Generate order amounts for consistency
    order_amounts = {oid: round(random.uniform(10, 2000), 2) for oid in order_ids}

    for i in range(num_rows):
        record = {}

        # payment_ref: Primary key, positive integer, unique
        if i % 53 == 0 and used_payment_ids:
            payment_id = random.choice(used_payment_ids)  # Duplicate violation
        else:
            payment_id = starting_payment_index + i
            used_payment_ids.append(payment_id)
        record["payment_ref"] = payment_id if i % 97 != 0 else None  # 1% null

        # order_ref: Mandatory, FK to orders
        if i % 89 == 0:
            order_id = None
        elif i % 61 == 0:
            order_id = f"ORD_{999999}"  # FK violation
        elif i % 43 == 0:
            order_id = random.choice(["INVALID", "NULL", "N/A", "", "MISSING"])
        else:
            order_id = random.choice(order_ids)
        record["order_ref"] = order_id

        # Get expected amount for this order
        expected_amount = order_amounts.get(
            order_id, round(random.uniform(10, 2000), 2)
        )

        # payment_type: Canonical set
        if i % 27 == 0:
            method = None
        elif i % 37 == 0:
            method = random.choice(["CC", "credit card", "CREDIT_CARD", "Card", "Visa"])
        elif i % 47 == 0:
            method = random.choice(["Unknown", "N/A", "Cash", "Check", ""])
        elif i % 57 == 0:
            method = random.choice(
                ["Credt Card", "PayPall", "Banck Transfer", "Appel Pay"]
            )
        else:
            method = random.choice(PAYMENT_TYPE_CANONICAL)
        record["payment_type"] = method

        # gateway_provider: Must match payment_type
        if i % 29 == 0:
            provider = None
        elif i % 39 == 0:
            # Mismatched provider violation
            if method == "PayPal":
                provider = "Stripe"
            elif method == "Credit Card":
                provider = "PayPal"
            else:
                provider = "Unknown"
        elif i % 49 == 0:
            provider = random.choice(["N/A", "NULL", "Internal", ""])
        elif i % 59 == 0:
            provider = random.choice(["Strpe", "Sqaure", "PayPl", "Klarrna"])
        else:
            # Valid: provider matches payment_type
            if method in PAYMENT_PROVIDER_MAP:
                provider = random.choice(PAYMENT_PROVIDER_MAP[method])
            else:
                provider = random.choice(GATEWAY_PROVIDER_CANONICAL[:5])
        record["gateway_provider"] = provider

        # payment_status: Canonical set
        if i % 23 == 0:
            status = None
        elif i % 33 == 0:
            status = random.choice(
                ["completed", "COMPLETED", "Complete", "1", "Success"]
            )
        elif i % 44 == 0:
            status = random.choice(
                ["In Progress", "Processing", "Approved", "Declined"]
            )
        elif i % 54 == 0:
            status = random.choice(["Complted", "Pendng", "Faild", "Refnded"])
        else:
            status = random.choices(
                PAYMENT_STATUS_CANONICAL, weights=[10, 60, 10, 10, 5, 5], k=1
            )[0]
        record["payment_status"] = status

        # transaction_date: <= now()
        if i % 26 == 0:
            payment_date = None
        elif i % 36 == 0:
            payment_date_dt = fake.date_time_between(start_date="-1y", end_date="now")
            formats = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y %H:%M", "%d-%m-%Y"]
            payment_date = payment_date_dt.strftime(random.choice(formats))
        elif i % 46 == 0:
            payment_date = int(
                fake.date_time_between(start_date="-1y", end_date="now").timestamp()
            )
        elif i % 56 == 0:
            payment_date = fake.date_time_between(
                start_date="+1d", end_date="+30d"
            )  # Future violation
        else:
            payment_date = fake.date_time_between(start_date="-1y", end_date="now")
        record["transaction_date"] = payment_date

        # gateway_transaction_id: Required for Completed, provider-specific format
        if i % 31 == 0:
            trans_id = None
        elif i % 41 == 0:
            trans_id = random.choice(["N/A", "NULL", "PENDING", ""])
        elif i % 51 == 0 and used_transaction_ids:
            trans_id = random.choice(used_transaction_ids)  # Duplicate violation
        elif i % 67 == 0:
            # Wrong format for provider
            trans_id = f"WRONG-{random.randint(1000, 9999)}"
        else:
            # Valid: provider-specific format
            trans_id = generate_gateway_transaction_id(provider, status)
            if trans_id:
                used_transaction_ids.append(trans_id)
        record["gateway_transaction_id"] = trans_id

        # payment_amount: > 0 for Completed
        if i % 93 == 0:
            amount = None
        elif i % 34 == 0:
            amount = random.choice(["Free", "N/A", "Pending", ""])
        elif i % 45 == 0:
            amount = round(random.uniform(-500, -10), 2)  # Negative violation
        elif i % 55 == 0:
            amount = 0  # Zero violation
        elif i % 65 == 0:
            amount = round(expected_amount * random.uniform(0.5, 1.5), 2)  # Mismatch
        elif i % 75 == 0:
            amount = random.choice([999999.99, 0.01, -9999])  # Extreme
        else:
            # Valid: matches order amount
            if status in ["Completed", "Refunded", "Partially Refunded"]:
                amount = expected_amount
            elif status == "Failed":
                amount = expected_amount  # Failed attempts record attempted amount
            else:
                amount = expected_amount
        record["payment_amount"] = amount

        # transaction_fee: >= 0, < payment_amount, typically 1-5%
        if i % 28 == 0:
            fee = None
        elif i % 38 == 0:
            fee = random.choice(["Included", "N/A", "Waived"])
        elif i % 48 == 0:
            fee = round(random.uniform(-10, -1), 2)  # Negative violation
        elif i % 58 == 0:
            fee = random.choice([999, 0.001, -99])  # Extreme
        elif i % 68 == 0:
            # Fee > 10% violation
            if isinstance(amount, (int, float)) and amount > 0:
                fee = round(amount * 0.15, 2)
            else:
                fee = 50
        else:
            # Valid: fee based on payment type
            if isinstance(amount, (int, float)) and amount > 0:
                if method in ["Cash on Delivery"]:
                    fee = 0  # COD has no transaction fee
                elif method in ["Gift Card"]:
                    fee = 0  # Gift card typically no fee
                elif method in ["Credit Card", "Debit Card"]:
                    fee = round(amount * 0.029 + 0.30, 2)  # 2.9% + $0.30
                elif method == "PayPal":
                    fee = round(amount * 0.0349 + 0.49, 2)  # 3.49% + $0.49
                elif method == "Bank Transfer":
                    fee = round(amount * 0.008, 2)  # 0.8%
                else:
                    fee = round(amount * 0.025, 2)  # 2.5%
            else:
                fee = round(random.uniform(0.30, 50), 2)
        record["transaction_fee"] = fee

        # refund_total: >= 0, <= payment_amount, consistent with status
        if i % 86 == 0:
            # Not refunded but has refund amount violation
            if status not in ["Refunded", "Partially Refunded"]:
                refund_amount = round(random.uniform(10, 500), 2)
            else:
                refund_amount = 0
        elif i % 70 == 0:
            refund_amount = random.choice(["Full", "Partial", "Pending"])
        elif i % 80 == 0:
            # Refund > payment violation
            if isinstance(amount, (int, float)) and amount > 0:
                refund_amount = amount * 1.5
            else:
                refund_amount = 1000
        elif i % 90 == 0:
            refund_amount = round(random.uniform(-100, -10), 2)  # Negative violation
        else:
            # Valid: consistent with status
            if status == "Refunded":
                refund_amount = amount if isinstance(amount, (int, float)) else 0
            elif status == "Partially Refunded":
                if isinstance(amount, (int, float)) and amount > 0:
                    refund_amount = round(amount * random.uniform(0.1, 0.9), 2)
                else:
                    refund_amount = round(random.uniform(10, 200), 2)
            else:
                refund_amount = 0  # No refund for other statuses
        record["refund_total"] = refund_amount

        # refund_processed_date: Required if Refunded/Partially Refunded, >= transaction_date
        refund_date = None
        has_refund = isinstance(refund_amount, (int, float)) and refund_amount > 0

        if has_refund:
            if i % 52 == 0:
                # Refund before payment violation
                if isinstance(payment_date, datetime):
                    refund_date = payment_date - timedelta(days=random.randint(1, 10))
                else:
                    refund_date = fake.date_time_between(
                        start_date="-2y", end_date="-1y"
                    )
            elif i % 62 == 0:
                refund_date = fake.date_between(start_date="-6m", end_date="today")
            elif i % 72 == 0:
                refund_date = fake.date_time_between(
                    start_date="+1d", end_date="+30d"
                )  # Future
            else:
                # Valid: refund_date >= transaction_date
                if isinstance(payment_date, datetime):
                    max_refund = min(payment_date + timedelta(days=90), datetime.now())
                    refund_date = fake.date_time_between(
                        start_date=payment_date, end_date=max_refund
                    )
                else:
                    refund_date = fake.date_time_between(
                        start_date="-6m", end_date="now"
                    )
        elif status in ["Refunded", "Partially Refunded"] and i % 82 == 0:
            refund_date = None  # Missing refund date violation
        record["refund_processed_date"] = refund_date

        # currency_code: ISO 4217
        if i % 32 == 0:
            currency = None
        elif i % 42 == 0:
            currency = random.choice(["US", "EURO", "Dollar", "$"])
        else:
            currency = random.choices(
                CURRENCY_CANONICAL, weights=[70, 15, 5, 5, 5], k=1
            )[0]
        record["currency_code"] = currency

        # card_last_four: Only for card payments, exactly 4 digits
        is_card_payment = method in ["Credit Card", "Debit Card", "CC", "credit card"]
        if is_card_payment:
            if i % 50 == 0:
                card_last4 = None
            elif i % 60 == 0:
                card_last4 = random.choice(["XXXX", "N/A", "****", "0000"])  # Invalid
            elif i % 70 == 0:
                card_last4 = random.choice(["123", "12345", "1"])  # Wrong length
            else:
                card_last4 = str(random.randint(1000, 9999))
            record["card_last_four"] = card_last4

        # card_brand: Only for card payments, canonical set
        if is_card_payment:
            if i % 55 == 0:
                brand = None
            elif i % 65 == 0:
                brand = random.choice(["Unknown", "N/A", "Card"])
            else:
                brand = random.choice(CARD_BRAND_CANONICAL)
            record["card_brand"] = brand
        elif method in ["Bank Transfer", "PayPal", "Gift Card", "Cash on Delivery"]:
            # Non-card payments should have NULL card fields
            if i % 85 == 0:
                record["card_last_four"] = str(random.randint(1000, 9999))  # Violation
                record["card_brand"] = random.choice(CARD_BRAND_CANONICAL)

        # authorization_code: Required for Completed card payments
        if status in ["Completed", "completed", "COMPLETED"] and is_card_payment:
            if i % 60 == 0:
                auth_code = None
            elif i % 73 == 0:
                auth_code = random.choice(["N/A", "PENDING", ""])
            else:
                auth_code = f"AUTH-{uuid.uuid4().hex[:12].upper()}"
            record["authorization_code"] = auth_code
        elif status in ["Failed", "Cancelled"] and i % 83 == 0:
            # Auth code for failed payment violation
            record["authorization_code"] = f"AUTH-{uuid.uuid4().hex[:12].upper()}"

        # risk_score: 0-100
        if random.random() > 0.7:
            if i % 65 == 0:
                risk = None
            elif i % 75 == 0:
                risk = random.choice(["Low", "Medium", "High", "N/A"])
            elif i % 85 == 0:
                risk = random.choice([-10, 150, 999])  # Out of range
            elif i % 91 == 0:
                # High risk but Completed violation
                if status == "Completed":
                    risk = random.randint(80, 99)
                else:
                    risk = random.randint(1, 30)
            else:
                # Valid: higher risk for failed payments
                if status in ["Failed", "failed", "Faild"]:
                    risk = random.randint(50, 99)
                elif status in ["Completed", "completed"]:
                    risk = random.randint(1, 40)
                else:
                    risk = random.randint(1, 60)
            record["risk_score"] = risk

        # customer_ip: Valid IPv4/IPv6
        if random.random() > 0.6:
            if i % 70 == 0:
                ip = None
            elif i % 80 == 0:
                ip = random.choice(["N/A", "0.0.0.0", "999.999.999.999", "127.0.0.1"])
            else:
                ip = fake.ipv4()
            record["customer_ip"] = ip

        # billing_country: ISO 3166-1 alpha-2
        if random.random() > 0.5:
            if i % 75 == 0:
                country = None
            elif i % 85 == 0:
                country = random.choice(
                    ["USA", "United States", "N/A", "UK"]
                )  # Invalid
            else:
                country = fake.country_code()
            record["billing_country"] = country

        # retry_attempt: Integer >= 0
        if random.random() > 0.7:
            if i % 78 == 0:
                retry = random.randint(-5, -1)  # Negative violation
            elif i % 88 == 0:
                retry = random.randint(10, 20)  # Unusually high
            else:
                # Valid: 0 for first attempt
                if status in ["Failed", "Cancelled"]:
                    retry = random.randint(0, 3)
                else:
                    retry = 0
            record["retry_attempt"] = retry

        # record_created: >= transaction_date
        if random.random() > 0.3:
            if i % 30 == 0:
                created = None
            elif i % 40 == 0:
                created = fake.date_time_between(
                    start_date="-1y", end_date="now"
                ).strftime("%Y-%m-%d %H:%M:%S")
            elif i % 50 == 0:
                # Created before payment violation
                if isinstance(payment_date, datetime):
                    created = payment_date - timedelta(days=random.randint(1, 30))
                else:
                    created = fake.date_time_between(start_date="-2y", end_date="-1y")
            else:
                # Valid: close to transaction_date
                if isinstance(payment_date, datetime):
                    created = payment_date + timedelta(minutes=random.randint(0, 60))
                else:
                    created = fake.date_time_between(start_date="-1y", end_date="now")
            record["record_created"] = created

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_payments_data(number_of_payments)
df = add_more_messiness(df)

output_file = "payments.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Inventory Table Generator

In [43]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random
import string

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

VALID_WAREHOUSES = [
    "WH-EAST-01",
    "WH-WEST-01",
    "WH-CENTRAL-01",
    "WH-NORTH-01",
    "WH-SOUTH-01",
    "DC-01",
    "DC-02",
    "STORE-001",
    "STORE-002",
    "DROPSHIP",
]
STOCK_STATUS_CANONICAL = ["In Stock", "Low Stock", "Out of Stock", "Discontinued"]


def derive_stock_status(available_qty, min_stock_level, is_discontinued=False):
    """Derive stock_status from available_qty and min_stock_level per rules."""
    if is_discontinued:
        return "Discontinued"
    if available_qty is None or not isinstance(available_qty, (int, float)):
        return None
    if available_qty <= 0:
        return "Out of Stock"
    if isinstance(min_stock_level, (int, float)) and available_qty < min_stock_level:
        return "Low Stock"
    return "In Stock"


def generate_messy_inventory_data(
    num_rows=1500, product_id_format="PROD", supplier_id_format="SUPP"
):
    data = []
    used_inventory_ids = []

    product_ids = [f"PROD_{i + starting_product_index}" for i in range(num_rows)]
    supplier_ids = [
        f"SUPP_{i + starting_supplier_index}" for i in range(number_of_suppliers)
    ]

    product_inventory_map = {}
    product_categories = {
        "high_turnover": product_ids[: int(len(product_ids) * 0.2)],
        "medium_turnover": product_ids[
            int(len(product_ids) * 0.2) : int(len(product_ids) * 0.6)
        ],
        "low_turnover": product_ids[
            int(len(product_ids) * 0.6) : int(len(product_ids) * 0.9)
        ],
        "obsolete": product_ids[int(len(product_ids) * 0.9) :],
    }

    for i in range(num_rows):
        record = {}

        # inv_id: Primary key, positive integer, unique (with intentional duplicates at 2%)
        if i % 50 == 0 and used_inventory_ids:
            inv_id = random.choice(used_inventory_ids)
        else:
            inv_id = starting_inventory_index + i
            used_inventory_ids.append(inv_id)
        record["inv_id"] = inv_id if i % 100 != 0 else None  # 1% null

        # product_ref: Mandatory, ^PROD_[0-9]+$ format
        if i % 85 == 0:
            prod_id = None
        elif i % 60 == 0:
            prod_id = f"PROD_{9999}"  # FK violation
        elif i % 40 == 0:
            prod_id = random.choice(["INVALID", "NULL", "N/A", "", "DISCONTINUED"])
        elif i % 70 == 0 and product_inventory_map:
            prod_id = random.choice(list(product_inventory_map.keys())[:10])
        else:
            available = [p for p in product_ids if p not in product_inventory_map]
            prod_id = (
                random.choice(available) if available else random.choice(product_ids)
            )

        product_inventory_map[prod_id] = True
        record["product_ref"] = prod_id

        product_category = "medium_turnover"
        for cat, prods in product_categories.items():
            if prod_id in prods:
                product_category = cat
                break

        is_discontinued = product_category == "obsolete"

        # vendor_id: Mandatory, ^SUPP_[1-9][0-9]*$ format
        if i % 25 == 0:
            supp_id = None
        elif i % 35 == 0:
            supp_id = f"SUPP_{999}"  # FK violation
        elif i % 45 == 0:
            supp_id = random.choice(["INVALID", "N/A", "NULL", "", "UNKNOWN"])
        else:
            supp_id = random.choice(supplier_ids)
        record["vendor_id"] = supp_id

        # current_stock: Integer >= 0 (with intentional violations)
        if i % 90 == 0:
            stock_qty = None
        elif i % 30 == 0:
            stock_qty = random.choice(["Out of Stock", "Many", "Few", "N/A", "Unknown"])
        elif i % 40 == 0:
            stock_qty = random.randint(-100, -1)  # Business logic violation
        elif i % 50 == 0:
            stock_qty = random.choice([99999, 1000000, -9999])
        elif i % 60 == 0:
            stock_qty = random.choice([10.5, 25.3, 100.75])
        else:
            if product_category == "high_turnover":
                stock_qty = random.choices(
                    [
                        0,
                        random.randint(1, 10),
                        random.randint(11, 50),
                        random.randint(51, 200),
                        random.randint(201, 1000),
                    ],
                    weights=[5, 10, 30, 40, 15],
                    k=1,
                )[0]
            elif product_category == "medium_turnover":
                stock_qty = random.choices(
                    [
                        0,
                        random.randint(1, 20),
                        random.randint(21, 100),
                        random.randint(101, 500),
                    ],
                    weights=[3, 20, 50, 27],
                    k=1,
                )[0]
            elif product_category == "low_turnover":
                stock_qty = random.choices(
                    [
                        0,
                        random.randint(1, 50),
                        random.randint(51, 200),
                        random.randint(201, 1000),
                    ],
                    weights=[2, 15, 40, 43],
                    k=1,
                )[0]
            else:
                stock_qty = random.choices(
                    [0, random.randint(1, 10), random.randint(11, 100)],
                    weights=[60, 30, 10],
                    k=1,
                )[0]
        record["current_stock"] = stock_qty

        # reserved_stock: Integer >= 0, must be <= current_stock (with violations)
        if i % 25 == 0:
            reserved_qty = None
        elif i % 35 == 0:
            reserved_qty = random.choice(["Some", "None", "All", "N/A"])
        elif i % 45 == 0:
            reserved_qty = random.randint(-50, -1)  # Violation
        elif i % 55 == 0:  # reserved > stock violation
            if isinstance(stock_qty, (int, float)) and stock_qty > 0:
                reserved_qty = int(stock_qty * random.uniform(1.1, 2.0))
            else:
                reserved_qty = random.randint(100, 500)
        elif i % 65 == 0:
            reserved_qty = random.choice([5.5, 10.25, 15.75])
        else:
            # Valid: reserved_stock <= current_stock
            if isinstance(stock_qty, (int, float)) and stock_qty > 0:
                reserved_qty = random.choices(
                    [
                        0,
                        int(stock_qty * 0.1),
                        int(stock_qty * 0.3),
                        int(stock_qty * 0.5),
                        int(stock_qty),
                    ],
                    weights=[30, 25, 25, 15, 5],
                    k=1,
                )[0]
            else:
                reserved_qty = 0
        record["reserved_stock"] = reserved_qty

        # min_stock_level: Integer >= 0
        if i % 30 == 0:
            reorder_level = None
        elif i % 40 == 0:
            reorder_level = random.choice(["Low", "Medium", "High", "Auto"])
        elif i % 50 == 0:
            reorder_level = random.randint(-50, -1)
        elif i % 60 == 0:
            reorder_level = random.choice([99999, -999])
        elif i % 70 == 0:
            if isinstance(stock_qty, (int, float)):
                reorder_level = int(abs(stock_qty) * 2 + 100)
            else:
                reorder_level = 1000
        else:
            if product_category == "high_turnover":
                reorder_level = random.randint(50, 200)
            elif product_category == "medium_turnover":
                reorder_level = random.randint(20, 100)
            elif product_category == "low_turnover":
                reorder_level = random.randint(5, 30)
            else:
                reorder_level = 0
        record["min_stock_level"] = reorder_level

        # created_date: Must be <= today, serves as baseline for chronology
        if random.random() > 0.3:
            if i % 30 == 0:
                created = None
            elif i % 40 == 0:
                created = fake.date_time_between(
                    start_date="-2y", end_date="now"
                ).strftime("%Y-%m-%d %H:%M:%S")
            elif i % 50 == 0:
                created = fake.date_time_between(
                    start_date="+1d", end_date="+30d"
                )  # Violation
            else:
                created = fake.date_time_between(start_date="-2y", end_date="now")
            record["created_date"] = created
        else:
            created = None

        # last_restock_date: Must be >= created_date and <= today
        if i % 25 == 0:
            restocked = None
        elif i % 35 == 0:
            restocked_dt = fake.date_time_between(start_date="-6m", end_date="now")
            formats = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y", "%d-%m-%Y", "%Y%m%d"]
            restocked = restocked_dt.strftime(random.choice(formats))
        elif i % 45 == 0:
            restocked = fake.date_time_between(
                start_date="+1d", end_date="+30d"
            )  # Violation
        elif i % 55 == 0:
            restocked = fake.date_time_between(start_date="-5y", end_date="-2y")
        else:
            # Valid: >= created_date, <= today
            if isinstance(created, datetime):
                restocked = fake.date_time_between(start_date=created, end_date="now")
            elif product_category == "high_turnover":
                restocked = fake.date_time_between(start_date="-7d", end_date="now")
            elif product_category == "medium_turnover":
                restocked = fake.date_time_between(start_date="-30d", end_date="now")
            elif product_category == "low_turnover":
                restocked = fake.date_time_between(start_date="-90d", end_date="now")
            else:
                restocked = fake.date_time_between(start_date="-2y", end_date="-6m")
        record["last_restock_date"] = restocked

        # last_sale_date: Must be >= created_date and <= today
        if random.random() > 0.3:
            if i % 30 == 0:
                last_sold = None
            elif i % 50 == 0:
                last_sold = fake.date_time_between(
                    start_date="+1d", end_date="+30d"
                )  # Violation
            elif i % 60 == 0 and isinstance(stock_qty, (int, float)) and stock_qty < 10:
                last_sold = None  # Low stock never sold - business issue
            else:
                # Valid: >= created_date, <= today
                if isinstance(created, datetime):
                    last_sold = fake.date_time_between(
                        start_date=created, end_date="now"
                    )
                elif product_category == "high_turnover":
                    last_sold = fake.date_time_between(start_date="-1d", end_date="now")
                elif product_category == "medium_turnover":
                    last_sold = fake.date_time_between(start_date="-7d", end_date="now")
                elif product_category == "low_turnover":
                    last_sold = fake.date_time_between(
                        start_date="-30d", end_date="now"
                    )
                else:
                    last_sold = fake.date_time_between(start_date="-1y", end_date="-3m")
            record["last_sale_date"] = last_sold

        # monthly_storage_cost: Decimal >= 0
        if i % 25 == 0:
            storage_cost = None
        elif i % 35 == 0:
            storage_cost = random.choice(["Free", "Included", "N/A", "Variable"])
        elif i % 45 == 0:
            storage_cost = round(random.uniform(-5, -0.1), 2)  # Violation
        elif i % 55 == 0:
            storage_cost = random.choice([999.99, 0.001, -99])
        elif i % 65 == 0:
            storage_cost = 0
        else:
            storage_cost = round(random.uniform(0.10, 5.00), 2)
        record["monthly_storage_cost"] = storage_cost

        # warehouse_location: Mandatory, valid format
        if random.random() > 0.4:
            if i % 45 == 0:
                warehouse = None
            elif i % 55 == 0:
                warehouse = random.choice(["", "N/A", "Multiple"])  # Invalid
            else:
                warehouse = random.choice(VALID_WAREHOUSES)
            record["warehouse_location"] = warehouse

        # available_qty: Must equal current_stock - reserved_stock exactly
        if random.random() > 0.5:
            if all(isinstance(x, (int, float)) for x in [stock_qty, reserved_qty]):
                available = stock_qty - reserved_qty
                if i % 75 == 0:
                    available = -abs(available)  # Violation
            elif i % 65 == 0:
                available = random.choice(["Calculate", "N/A", "Check system"])
            else:
                available = None
            record["available_qty"] = available
        else:
            # Derive correctly when not intentionally violating
            if all(isinstance(x, (int, float)) for x in [stock_qty, reserved_qty]):
                available = stock_qty - reserved_qty
                record["available_qty"] = available

        # stock_status: Derived from available_qty and min_stock_level
        if random.random() > 0.6:
            available = record.get("available_qty")
            if i % 90 == 0:
                status = random.choice(["Available", "Unavailable", "Check"])  # Invalid
            elif isinstance(available, (int, float)) and isinstance(
                reorder_level, (int, float)
            ):
                status = derive_stock_status(available, reorder_level, is_discontinued)
            else:
                status = None
            record["stock_status"] = status

        # total_stock_value: current_stock * unit_cost
        if random.random() > 0.6:
            if isinstance(stock_qty, (int, float)) and stock_qty > 0:
                unit_cost = round(random.uniform(5, 200), 2)
                stock_value = round(stock_qty * unit_cost, 2)
                if i % 80 == 0:
                    stock_value = stock_value * random.uniform(0.5, 1.5)  # Wrong calc
            elif isinstance(stock_qty, (int, float)) and stock_qty == 0:
                stock_value = 0.00  # Rule: if current_stock = 0, value = 0
            else:
                stock_value = None
            record["total_stock_value"] = (
                round(stock_value, 2)
                if isinstance(stock_value, (int, float))
                else stock_value
            )

        # days_since_last_sale: Must equal (today - last_sale_date)
        if random.random() > 0.7 and "last_sale_date" in record:
            last_sold = record.get("last_sale_date")
            if isinstance(last_sold, datetime):
                days_since = (datetime.now() - last_sold).days
                if i % 85 == 0:
                    days_since = -random.randint(1, 30)  # Violation
            elif i % 75 == 0:
                days_since = random.choice(["Never", "N/A", "Unknown"])
            else:
                days_since = None
            record["days_since_last_sale"] = days_since

        # restock_lead_time_days: Positive integer >= 1
        if random.random() > 0.7:
            if i % 65 == 0:
                lead_time = None
            elif i % 75 == 0:
                lead_time = random.choice(["Immediate", "Variable", "TBD"])
            elif i % 85 == 0:
                lead_time = random.randint(-10, -1)  # Violation
            elif i % 95 == 0:
                lead_time = random.choice([0, 999, 365])  # 0 is violation per rule
            else:
                lead_time = random.choices(
                    [1, 3, 7, 14, 30, 60, 90], weights=[10, 20, 30, 20, 10, 5, 5], k=1
                )[0]
            record["restock_lead_time_days"] = lead_time

        # expiry_date: For perishables, >= created_date, >= last_restock_date
        if random.random() > 0.85:
            if i % 70 == 0:
                expiry = None
            elif i % 80 == 0:
                expiry = fake.date_between(
                    start_date="-30d", end_date="-1d"
                )  # Already expired
            elif i % 90 == 0:
                expiry = random.choice(["N/A", "Non-perishable", "Check package"])
            else:
                # Valid: >= created_date, >= last_restock_date
                base_date = (
                    restocked if isinstance(restocked, datetime) else datetime.now()
                )
                expiry = fake.date_between(
                    start_date=(
                        base_date.date() if isinstance(base_date, datetime) else "+1d"
                    ),
                    end_date="+2y",
                )
            record["expiry_date"] = expiry

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_inventory_data(number_of_inventories)
df = add_more_messiness(df)

output_file = "inventories.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)
  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Reviews Table Generator

In [44]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
MODERATION_STATUS_CANONICAL = ["Approved", "Pending", "Rejected", "Flagged"]
SUBMISSION_DEVICE_CANONICAL = [
    "Desktop",
    "Tablet",
    "iOS App",
    "Android App",
    "Mobile Web",
]
REVIEW_LANGUAGE_CANONICAL = ["en", "es", "fr", "de", "zh", "ja", "pt", "ru"]

# Rating step policy: integers only (1,2,3,4,5) with some half-stars as violations
VALID_RATINGS = [1, 2, 3, 4, 5]

# Spam/quality detection patterns
SPAM_TITLES = [
    "AMAZING!!!",
    "BEST EVER",
    "DO NOT BUY",
    "SCAM!!!",
    "Five Stars",
    "Good",
    "OK",
    "Nice",
    "👍",
    "⭐⭐⭐⭐⭐",
    "!!!!!!!!",
]
GENERIC_REVIEWS = [
    "Good product",
    "As expected",
    "Nice quality",
    "Fast shipping",
    "Would buy again",
    "Recommended",
    "Not bad",
    "Pretty good",
]
PLACEHOLDER_TEXT = ["undefined", "lorem ipsum", "test", "asdf", "N/A"]


def generate_messy_reviews_data(
    num_rows=3000, product_id_format="PROD", customer_id_format="CUST"
):
    data = []
    used_review_ids = []

    product_ids = [
        f"PROD_{i + starting_product_index}" for i in range(number_of_products)
    ]
    customer_ids = [
        f"CUST_{i + starting_customer_index}" for i in range(number_of_customers)
    ]

    customer_product_pairs = {}
    popular_products = product_ids[:20]

    for i in range(num_rows):
        record = {}

        # review_ref: Primary key, positive integer, unique
        if i % 53 == 0 and used_review_ids:
            review_id = random.choice(used_review_ids)  # Duplicate violation
        else:
            review_id = starting_review_index + i
            used_review_ids.append(review_id)
        record["review_ref"] = review_id if i % 97 != 0 else None  # 1% null

        # product_ref: Mandatory, FK to products
        if i % 83 == 0:
            prod_id = None
        elif i % 59 == 0:
            prod_id = f"PROD_{9999}"  # FK violation
        elif i % 41 == 0:
            prod_id = random.choice(["INVALID", "NULL", "N/A", "", "REMOVED"])
        else:
            if random.random() < 0.4:
                prod_id = random.choice(popular_products)
            else:
                prod_id = random.choice(product_ids)
        record["product_ref"] = prod_id

        # customer_ref: Mandatory (allow NULL for guest reviews), FK to customers
        is_guest = i % 17 == 0  # ~6% guest reviews
        if is_guest:
            cust_id = None
        elif i % 57 == 0:
            cust_id = f"CUST_{99999}"  # FK violation
        elif i % 47 == 0:
            cust_id = random.choice(["INVALID", "GUEST", "ANONYMOUS", ""])
        else:
            cust_id = random.choice(customer_ids)

        # Track duplicate reviews (same customer-product)
        if cust_id and prod_id:
            pair_key = f"{cust_id}_{prod_id}"
            if i % 31 == 0 and pair_key in customer_product_pairs:
                pass  # Duplicate review violation
            customer_product_pairs[pair_key] = True
        record["customer_ref"] = cust_id

        # star_rating: 1-5 inclusive, integers only (half-stars as violations)
        if i % 27 == 0:
            rating = None
        elif i % 37 == 0:
            rating = random.choice(
                ["Five stars", "Good", "Bad", "N/A", "****"]
            )  # String violation
        elif i % 48 == 0:
            rating = random.choice([0, 6, 10, -1, 100])  # Out of range violation
        elif i % 58 == 0:
            rating = random.choice([3.5, 4.5, 2.7, 1.8])  # Half-star violation
        else:
            # J-shaped distribution (more 5s and 1s)
            rating = random.choices([1, 2, 3, 4, 5], weights=[15, 5, 10, 25, 45], k=1)[
                0
            ]
        record["star_rating"] = rating

        # submitted_date: <= now(), no future dates
        if i % 26 == 0:
            review_date = None
        elif i % 36 == 0:
            review_date_dt = fake.date_time_between(start_date="-2y", end_date="now")
            formats = ["%Y-%m-%d %H:%M:%S", "%m/%d/%Y", "%d-%m-%Y"]
            review_date = review_date_dt.strftime(random.choice(formats))
        elif i % 46 == 0:
            review_date = fake.date_time_between(
                start_date="+1d", end_date="+30d"
            )  # Future violation
        elif i % 56 == 0:
            review_date = fake.date_time_between(
                start_date="-10y", end_date="-5y"
            )  # Very old
        elif i % 66 == 0:
            review_date = random.choice(
                ["1970-01-01", "1900-01-01", "0000-00-00"]
            )  # Placeholder
        else:
            days_ago = random.choices(
                [
                    random.randint(1, 7),
                    random.randint(8, 30),
                    random.randint(31, 180),
                    random.randint(181, 730),
                ],
                weights=[40, 30, 20, 10],
                k=1,
            )[0]
            review_date = datetime.now() - timedelta(days=days_ago)
        record["submitted_date"] = review_date

        # review_date_only: YYYY-MM-DD, must match submitted_date
        if isinstance(review_date, datetime):
            if i % 76 == 0:
                # Date mismatch violation
                record["review_date_only"] = (
                    review_date - timedelta(days=random.randint(1, 5))
                ).strftime("%Y-%m-%d")
            else:
                record["review_date_only"] = review_date.strftime("%Y-%m-%d")
        elif i % 86 == 0:
            record["review_date_only"] = None
        else:
            record["review_date_only"] = fake.date_between(
                start_date="-2y", end_date="today"
            ).strftime("%Y-%m-%d")

        # review_headline: Optional, 2-100 chars
        if i % 29 == 0:
            title = None
        elif i % 39 == 0:
            title = random.choice(["", " ", ".", "?", "N/A"])  # Too short violation
        elif i % 49 == 0:
            title = random.choice(SPAM_TITLES)  # Spam violation
        elif i % 69 == 0:
            title = fake.text(max_nb_chars=150)[:150]  # Too long violation (> 100)
        elif i % 79 == 0:
            title = "!!!!!!!!"  # Repeated punctuation violation
        elif i % 89 == 0:
            title = random.choice(
                ["undefined", "lorem ipsum", "error: stack trace"]
            )  # Template error
        else:
            # Generate realistic title based on rating
            if isinstance(rating, int):
                if rating >= 4:
                    title = random.choice(
                        [
                            "Great product!",
                            "Excellent quality",
                            "Highly recommend",
                            "Love it!",
                            "Perfect!",
                            "Exceeded expectations",
                            "Amazing value",
                            "Very satisfied",
                            fake.sentence(nb_words=4)[:80],
                        ]
                    )
                elif rating == 3:
                    title = random.choice(
                        [
                            "Decent product",
                            "It's okay",
                            "Average quality",
                            "Not bad",
                            "Could be better",
                            "Mixed feelings",
                            fake.sentence(nb_words=3)[:60],
                        ]
                    )
                else:
                    title = random.choice(
                        [
                            "Disappointed",
                            "Not worth it",
                            "Poor quality",
                            "Waste of money",
                            "Do not recommend",
                            "Terrible experience",
                            fake.sentence(nb_words=3)[:60],
                        ]
                    )
            else:
                title = fake.sentence(nb_words=4)[:80]
        record["review_headline"] = title

        # review_content: Minimum 10-20 chars, quality checks
        if i % 22 == 0:
            text = None
        elif i % 32 == 0:
            text = random.choice(
                ["", " ", "Good", "Bad", "OK", "."]
            )  # Too short violation
        elif i % 42 == 0:
            text = random.choice(GENERIC_REVIEWS)  # Generic/low quality
        elif i % 52 == 0:
            # Spam with URL/email
            text = f"Check out {fake.url()} for deals! Contact {fake.email()} for info."
        elif i % 62 == 0:
            # Repetitive text violation
            word = random.choice(["GREAT", "BAD", "LOVE", "HATE"])
            text = f"{word} " * random.randint(10, 50)
        elif i % 72 == 0:
            text = fake.text(max_nb_chars=5000)  # Very long
        elif i % 82 == 0:
            # Non-English (language mismatch potential)
            text = random.choice(
                [
                    "很好的产品！强烈推荐。",
                    "Très bon produit, je recommande!",
                    "отличный продукт",
                    "素晴らしい製品です",
                ]
            )
        elif i % 92 == 0:
            # Content doesn't match rating
            if isinstance(rating, int) and rating >= 4:
                text = "Terrible product. Very disappointed. Would not buy again. Waste of money."
            elif isinstance(rating, int) and rating <= 2:
                text = "This is the best product I've ever purchased! Absolutely love it! Perfect!"
            else:
                text = fake.paragraph(nb_sentences=3)
        else:
            # Valid: realistic review based on rating
            if isinstance(rating, int):
                if rating >= 4:
                    text = fake.paragraph(nb_sentences=random.randint(2, 5))
                    text += random.choice(
                        [" Highly recommend!", " Would buy again.", " Great value.", ""]
                    )
                elif rating == 3:
                    text = fake.paragraph(nb_sentences=random.randint(2, 4))
                    text += random.choice(
                        [" It's okay for the price.", " Has pros and cons.", ""]
                    )
                else:
                    text = fake.paragraph(nb_sentences=random.randint(1, 3))
                    text += random.choice(
                        [" Very disappointed.", " Not worth the money.", ""]
                    )
            else:
                text = fake.paragraph(nb_sentences=random.randint(2, 4))
        record["review_content"] = text

        # verified_purchase: Boolean, hard rule: TRUE only if customer_ref non-null
        if i % 28 == 0:
            verified = None
        elif i % 38 == 0:
            verified = random.choice(["Y", "N", "Yes", "No", "1", "0", "true", "false"])
        elif i % 50 == 0:
            verified = random.choice(["Maybe", "Unknown", "Pending", ""])  # Invalid
        elif i % 60 == 0:
            # Anonymous but verified violation
            if is_guest:
                verified = True
            else:
                verified = False
        else:
            # Valid: verified only if customer exists
            if cust_id and cust_id not in ["INVALID", "GUEST", "ANONYMOUS", ""]:
                verified = random.choices([True, False], weights=[70, 30], k=1)[0]
            else:
                verified = False
        record["verified_purchase"] = verified

        # helpful_count: Integer >= 0
        if i % 44 == 0:
            helpful = None
        elif i % 54 == 0:
            helpful = random.choice(["Many", "Few", "None", "N/A"])  # String violation
        elif i % 64 == 0:
            helpful = random.randint(-10, -1)  # Negative violation
        elif i % 74 == 0:
            # Suspicious: too many votes for new review
            if (
                isinstance(review_date, datetime)
                and (datetime.now() - review_date).days < 7
            ):
                helpful = random.randint(100, 1000)
            else:
                helpful = random.randint(0, 50)
        else:
            # Valid: older reviews have more votes
            if isinstance(review_date, datetime):
                days_old = (datetime.now() - review_date).days
                max_votes = min(days_old // 10, 100)
                helpful = random.randint(0, max(max_votes, 1))
            else:
                helpful = random.randint(0, 20)
        record["helpful_count"] = helpful

        # total_votes: Integer >= 0, >= helpful_count
        if isinstance(helpful, int) and helpful >= 0:
            if i % 84 == 0:
                # total < helpful violation
                total = max(0, helpful - random.randint(1, 5))
            else:
                # Valid: total >= helpful
                total = helpful + random.randint(0, 20)
        else:
            total = random.randint(0, 30)
        record["total_votes"] = total

        # image_count: Integer >= 0, typically 0-10
        if i % 61 == 0:
            images = None
        elif i % 71 == 0:
            images = random.choice(["Yes", "Multiple", "None"])  # String violation
        elif i % 81 == 0:
            images = random.randint(-5, -1)  # Negative violation
        elif i % 91 == 0:
            images = random.randint(15, 50)  # > 10 suspicious
        else:
            # Valid: 0-10
            images = random.choices(
                [0, 1, 2, 3, 4, 5], weights=[40, 30, 15, 10, 3, 2], k=1
            )[0]
        record["image_count"] = images

        # reviewer_name: 1-80 chars, defaulting rules
        if i % 55 == 0:
            name = None
        elif i % 65 == 0:
            name = random.choice(
                ["", "123456", "@#$%"]
            )  # Invalid (only symbols/digits)
        elif i % 75 == 0:
            name = fake.email()  # Email pattern violation
        elif i % 85 == 0:
            name = "A" * 100  # Too long violation
        else:
            # Valid with defaulting
            if verified == True and name is None:
                name = "Verified Reviewer"
            elif is_guest:
                name = random.choice(
                    ["A Customer", "Anonymous Shopper", fake.first_name()]
                )
            else:
                name = fake.name()
        record["reviewer_name"] = name

        # moderation_status: Canonical set
        if i % 51 == 0:
            status = None
        elif i % 63 == 0:
            status = random.choice(["", "N/A", "Unknown"])
        elif i % 73 == 0:
            status = random.choice(["Hidden", "Needs Review", "Live"])  # Non-canonical
        else:
            # Valid: mostly Approved
            status = random.choices(
                MODERATION_STATUS_CANONICAL, weights=[80, 10, 5, 5], k=1
            )[0]
        record["moderation_status"] = status

        # seller_response: Optional, 5-2000 chars
        if random.random() > 0.9:  # 10% have response
            if i % 70 == 0:
                response = None
            elif i % 80 == 0:
                response = random.choice(["", "Thanks", "N/A"])  # Too short/placeholder
            elif i % 90 == 0:
                # Response on non-approved review violation
                if status in ["Pending", "Rejected"]:
                    response = "Thank you for your feedback!"
                else:
                    response = None
            else:
                # Valid response
                if isinstance(rating, int) and rating <= 2:
                    response = f"We're sorry about your experience. Please contact support@{fake.domain_name()} for help."
                else:
                    response = "Thank you for your review! We appreciate your feedback."
            record["seller_response"] = response

        # review_language: ISO 639-1 two-letter
        if i % 68 == 0:
            lang = None
        elif i % 78 == 0:
            lang = random.choice(["", "N/A", "Unknown", "English"])  # Invalid format
        elif i % 88 == 0:
            # Language mismatch with content
            if text and ("很好" in str(text) or "Très" in str(text)):
                lang = "en"  # Mismatch violation
            else:
                lang = random.choice(["zh", "fr", "ja"])
        else:
            # Valid: mostly English
            lang = random.choices(
                REVIEW_LANGUAGE_CANONICAL, weights=[70, 10, 5, 5, 3, 3, 2, 2], k=1
            )[0]
        record["review_language"] = lang

        # submission_device: Canonical set
        if i % 67 == 0:
            device = None
        elif i % 77 == 0:
            device = random.choice(["", "N/A", "Unknown"])
        elif i % 87 == 0:
            device = random.choice(["iPhone", "Android", "Mobile"])  # Non-canonical
        else:
            device = random.choice(SUBMISSION_DEVICE_CANONICAL)
        record["submission_device"] = device

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_reviews_data(number_of_reviews)
df = add_more_messiness(df)

output_file = "reviews.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Marketing Campaigns Generator

In [45]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
CHANNEL_TYPES_CANONICAL = [
    "Email",
    "Social Media",
    "Search",
    "Display",
    "Video",
    "SMS",
    "Radio",
    "TV",
    "Direct Mail",
]
DIGITAL_CHANNELS = ["Email", "Social Media", "Search", "Display", "Video", "SMS"]
OFFLINE_CHANNELS = ["Radio", "TV", "Direct Mail"]
STATUS_CANONICAL = ["Active", "Paused", "Completed", "Planned", "Cancelled"]
PLATFORM_CANONICAL = ["Google Ads", "Meta Ads", "TikTok", "LinkedIn", "Instagram"]
VARIANT_CANONICAL = ["Control", "Variant A", "Variant B", "Variant C"]

CAMPAIGN_THEMES = [
    "Summer Sale",
    "Black Friday",
    "Christmas Special",
    "New Year Deal",
    "Spring Collection",
    "Back to School",
    "Flash Sale",
    "Clearance",
    "Product Launch",
    "Brand Awareness",
    "Customer Retention",
    "Lead Generation",
    "Holiday Special",
    "Anniversary Sale",
]

AUDIENCE_SEGMENTS = [
    "High_LTV",
    "Cart_Abandoners",
    "New_Customers_30d",
    "Women_25_40_USA",
    "Men_18_35_Urban",
    "Parents_with_children",
    "High_income_households",
    "College_students",
    "Senior_65_plus",
    "Millennials_Tech_savvy",
    "Gen_Z_Social",
    "B2B_Decision_makers",
    "Small_business_owners",
    "Fitness_enthusiasts",
    "Premium_customers",
    "First_time_buyers",
    "Loyal_customers",
]


def derive_status_from_dates(launch_date, completion_date):
    """Derive campaign status from dates per rules."""
    today = date.today()
    if not isinstance(launch_date, date) or not isinstance(completion_date, date):
        return random.choice(["Active", "Paused", "Completed"])
    if launch_date > today:
        return "Planned"
    if completion_date < today:
        return "Completed"
    return random.choice(["Active", "Paused"])


def generate_messy_marketing_campaigns_data(num_rows=500):
    data = []
    used_campaign_ids = []

    for i in range(num_rows):
        record = {}

        # campaign_ref: Primary key, positive integer, unique
        if i % 50 == 0 and used_campaign_ids:
            campaign_id = random.choice(used_campaign_ids)  # Duplicate violation
        else:
            campaign_id = starting_campaign_index + i
            used_campaign_ids.append(campaign_id)
        record["campaign_ref"] = campaign_id if i % 100 != 0 else None  # 1% null

        # campaign_title: 5-100 chars, non-null, non-placeholder
        if i % 25 == 0:
            name = None
        elif i % 35 == 0:
            name = random.choice(
                ["", " ", "Test", "Campaign", "N/A"]
            )  # Placeholder violation
        elif i % 45 == 0:
            name = fake.text(max_nb_chars=150)[:120]  # Exceeds 100
        elif i % 55 == 0:
            name = random.choice(["Campaign #1", "Sale!!!", "50% OFF", "MEGA SALE"])
        elif i % 65 == 0:
            name = "Summer Sale 2025"  # Duplicate name
        else:
            theme = random.choice(CAMPAIGN_THEMES)
            year = random.choice(["2024", "2025"])
            suffix = random.choice(["", " - Phase 1", " - Final"])
            name = f"{theme} {year}{suffix}"
            # Ensure 5-100 chars
            if len(name) < 5:
                name = name + " Campaign"
        record["campaign_title"] = name

        # channel_type: Canonical set only
        if i % 20 == 0:
            camp_type = None
        elif i % 30 == 0:
            camp_type = random.choice(
                ["email", "EMAIL", "Email Marketing", "E-mail"]
            )  # Case violation
        elif i % 40 == 0:
            camp_type = random.choice(
                ["Unknown", "Mixed", "Other", "N/A", ""]
            )  # Invalid
        elif i % 50 == 0:
            camp_type = random.choice(
                ["Emal", "Socail Media", "PPG", "Displya"]
            )  # Typos
        else:
            camp_type = random.choice(CHANNEL_TYPES_CANONICAL)
        record["channel_type"] = camp_type

        is_digital = camp_type in DIGITAL_CHANNELS
        is_offline = camp_type in OFFLINE_CHANNELS

        # launch_date: YYYY-MM-DD format
        if i % 30 == 0:
            start_date = None
        elif i % 40 == 0:
            start_dt = fake.date_between(start_date="-1y", end_date="+6m")
            formats = ["%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y", "%Y%m%d"]
            start_date = start_dt.strftime(random.choice(formats))
        elif i % 50 == 0:
            start_date = random.choice(
                ["0000-00-00", "2025-13-45", "TBD", "ASAP"]
            )  # Invalid
        elif i % 60 == 0:
            start_date = fake.date_between(
                start_date="-10y", end_date="-5y"
            )  # Very old
        elif i % 70 == 0:
            start_date = fake.date_between(
                start_date="+2y", end_date="+5y"
            )  # Far future
        else:
            start_date = fake.date_between(start_date="-6m", end_date="+3m")
        record["launch_date"] = start_date

        # completion_date: Must be > launch_date (with violations)
        if i % 25 == 0:
            end_date = None
        elif i % 35 == 0:  # Violation: end before start
            if isinstance(start_date, date):
                end_date = start_date - timedelta(days=random.randint(1, 30))
            else:
                end_date = fake.date_between(start_date="-2y", end_date="-1y")
        elif i % 45 == 0:
            end_date = start_date  # Same day - violation of strictly greater
        elif i % 55 == 0:
            end_date = fake.date_between(
                start_date="+10y", end_date="+20y"
            )  # >365 days
        elif i % 65 == 0:
            end_dt = fake.date_between(start_date="-3m", end_date="+6m")
            end_date = end_dt.strftime("%m/%d/%Y")  # String format
        else:
            # Valid: completion_date > launch_date, duration 1-365 days
            if isinstance(start_date, date):
                duration = random.choices(
                    [
                        random.randint(1, 7),
                        random.randint(8, 30),
                        random.randint(31, 90),
                        random.randint(91, 365),
                    ],
                    weights=[20, 40, 30, 10],
                    k=1,
                )[0]
                end_date = start_date + timedelta(days=duration)
            else:
                end_date = fake.date_between(start_date="-2m", end_date="+6m")
        record["completion_date"] = end_date

        # campaign_status: Canonical set, date-consistent
        if i % 20 == 0:
            status = None
        elif i % 30 == 0:
            status = random.choice(
                ["active", "ACTIVE", "Running", "Live", "1"]
            )  # Case violation
        elif i % 40 == 0:
            status = random.choice(
                ["Pending", "Draft", "Archived", "Deleted"]
            )  # Invalid
        elif i % 50 == 0:
            status = random.choice(["Activ", "Pasued", "Complted"])  # Typos
        else:
            # Valid: derive from dates
            status = derive_status_from_dates(start_date, end_date)
        record["campaign_status"] = status

        # allocated_budget: Decimal >= 0
        if i % 23 == 0:
            budget = None
        elif i % 31 == 0:
            budget = random.choice(
                ["Unlimited", "TBD", "Variable", "N/A", ""]
            )  # String
        elif i % 37 == 0:
            budget = round(random.uniform(-10000, -100), 2)  # Negative violation
        elif i % 47 == 0:
            budget = 0  # Zero budget
        elif i % 53 == 0:
            budget = random.choice([999999999.99, 0.01, -99999])  # Extreme
        else:
            # Valid: realistic budget based on channel
            if camp_type in ["TV", "Radio"]:
                budget = round(random.uniform(50000, 500000), 2)
            elif camp_type in ["Search", "Display"]:
                budget = round(random.uniform(1000, 50000), 2)
            elif camp_type in ["Email", "SMS"]:
                budget = round(random.uniform(100, 10000), 2)
            else:
                budget = round(random.uniform(500, 25000), 2)
        record["allocated_budget"] = budget

        # current_spend: Decimal >= 0, typically <= budget * 1.10
        if i % 27 == 0:
            spent = None
        elif i % 33 == 0:
            spent = random.choice(["In Progress", "Calculating", "N/A"])  # String
        elif i % 41 == 0:
            spent = round(random.uniform(-5000, -10), 2)  # Negative violation
        elif i % 51 == 0:  # Overspend > 110%
            if isinstance(budget, (int, float)) and budget > 0:
                spent = round(budget * random.uniform(1.15, 2.0), 2)
            else:
                spent = round(random.uniform(10000, 50000), 2)
        elif i % 61 == 0:
            spent = 0  # Zero spent
        else:
            # Valid: spend based on status and budget
            if isinstance(budget, (int, float)) and budget > 0:
                if status == "Planned":
                    spent = 0  # Not started
                elif status == "Completed":
                    spent = round(budget * random.uniform(0.7, 1.0), 2)
                elif status == "Cancelled":
                    spent = round(budget * random.uniform(0, 0.1), 2)  # Minimal
                else:  # Active/Paused
                    spent = round(budget * random.uniform(0.3, 0.9), 2)
            else:
                spent = round(random.uniform(100, 10000), 2)
        record["current_spend"] = spent

        # total_impressions: Integer >= 0 (digital channels should have this)
        if i % 29 == 0:
            impressions = None
        elif i % 39 == 0:
            impressions = random.choice(["Many", "High", "Low", "N/A"])  # String
        elif i % 49 == 0:
            impressions = random.randint(-10000, -1)  # Negative violation
        elif i % 59 == 0:
            impressions = 0  # Zero but will have clicks - violation setup
        elif i % 67 == 0:
            impressions = random.choice([999999999, 0.5, -99999])  # Extreme/decimal
        else:
            # Valid: impressions based on spend and channel
            if is_offline:
                impressions = (
                    None if random.random() > 0.3 else random.randint(10000, 1000000)
                )
            elif isinstance(spent, (int, float)) and spent > 0:
                if camp_type in ["Email", "SMS"]:
                    impressions = int(spent * random.uniform(10, 50))
                elif camp_type in ["Display", "Social Media"]:
                    impressions = int(spent * random.uniform(100, 500))
                else:
                    impressions = int(spent * random.uniform(50, 200))
            else:
                impressions = random.randint(1000, 100000)
        record["total_impressions"] = impressions

        # total_clicks: Integer >= 0, must be <= impressions
        if i % 25 == 0:
            clicks = None
        elif i % 35 == 0:
            clicks = random.choice(["Good CTR", "Low", "N/A"])  # String
        elif i % 45 == 0:
            clicks = random.randint(-1000, -1)  # Negative violation
        elif i % 55 == 0:  # clicks > impressions violation
            if isinstance(impressions, int) and impressions > 0:
                clicks = int(impressions * random.uniform(1.1, 2.0))
            else:
                clicks = random.randint(10000, 50000)
        elif i % 65 == 0:
            clicks = random.choice([100.5, 250.75, 1000.25])  # Decimal
        else:
            # Valid: clicks <= impressions, realistic CTR 0.5%-5%
            if isinstance(impressions, int) and impressions > 0:
                ctr = random.uniform(0.005, 0.05)
                clicks = int(impressions * ctr)
            else:
                clicks = random.randint(10, 5000)
        record["total_clicks"] = clicks

        # conversion_count: Integer >= 0, must be <= clicks
        if i % 30 == 0:
            conversions = None
        elif i % 40 == 0:
            conversions = random.choice(["Good", "Poor", "TBD", "N/A"])  # String
        elif i % 50 == 0:
            conversions = random.randint(-100, -1)  # Negative violation
        elif i % 60 == 0:  # conversions > clicks violation
            if isinstance(clicks, (int, float)) and clicks > 0:
                conversions = int(clicks * random.uniform(1.1, 2.0))
            else:
                conversions = random.randint(1000, 5000)
        elif i % 70 == 0:
            conversions = clicks  # 100% conversion - suspicious
        else:
            # Valid: conversions <= clicks, realistic 1%-10%
            if isinstance(clicks, (int, float)) and clicks > 0:
                conv_rate = random.uniform(0.01, 0.10)
                conversions = int(clicks * conv_rate)
            else:
                conversions = random.randint(0, 500)
        record["conversion_count"] = conversions

        # ctr_rate: (clicks / impressions) * 100, range 0-100
        if random.random() > 0.6:
            if i % 80 == 0:
                ctr = random.uniform(101, 200)  # Invalid > 100%
            elif i % 70 == 0:
                ctr = random.choice(["High", "Low", "N/A"])  # String
            elif (
                all(isinstance(x, (int, float)) for x in [clicks, impressions])
                and impressions > 0
            ):
                ctr = round((clicks / impressions) * 100, 2)
            elif isinstance(impressions, (int, float)) and impressions == 0:
                ctr = 0  # Division by zero rule
            else:
                ctr = None
            record["ctr_rate"] = ctr

        # conversion_rate: (conversions / clicks) * 100, range 0-100
        if random.random() > 0.6:
            if i % 85 == 0:
                conv_rate = random.uniform(101, 150)  # Invalid > 100%
            elif i % 75 == 0:
                conv_rate = random.choice(["Good", "Poor", "Average"])  # String
            elif (
                all(isinstance(x, (int, float)) for x in [conversions, clicks])
                and clicks > 0
            ):
                conv_rate = round((conversions / clicks) * 100, 2)
            elif isinstance(clicks, (int, float)) and clicks == 0:
                conv_rate = 0  # Division by zero rule
            else:
                conv_rate = None
            record["conversion_rate"] = conv_rate

        # avg_cpc: current_spend / total_clicks
        if random.random() > 0.7:
            if i % 90 == 0:
                cpc = random.choice([0, 1000, -10])  # Extreme/negative
            elif (
                all(isinstance(x, (int, float)) for x in [spent, clicks]) and clicks > 0
            ):
                cpc = round(spent / clicks, 2)
            elif isinstance(clicks, (int, float)) and clicks == 0:
                if isinstance(spent, (int, float)) and spent == 0:
                    cpc = 0  # Both zero
                else:
                    cpc = None  # Spend but no clicks - flag
            else:
                cpc = None
            record["avg_cpc"] = cpc

        # roi_percentage: ((revenue - spend) / spend) * 100, requires spend > 0
        if random.random() > 0.5:
            if i % 75 == 0:
                roi = random.choice([-100, 10000, 99999])  # Extreme
            elif i % 85 == 0:
                roi = random.choice(["Positive", "Negative", "Break-even"])  # String
            elif status in ["Planned"]:
                roi = None  # Not applicable for planned
            elif (
                all(isinstance(x, (int, float)) for x in [spent, conversions])
                and spent > 0
            ):
                avg_order_value = random.uniform(50, 200)
                revenue = conversions * avg_order_value
                roi = round(((revenue - spent) / spent) * 100, 2)
            elif isinstance(spent, (int, float)) and spent == 0:
                roi = None  # Division undefined
            else:
                roi = None
            record["roi_percentage"] = roi

        # target_segment: Non-empty for Active campaigns
        if i % 25 == 0:
            audience = None
        elif i % 35 == 0:
            audience = random.choice(["", "Everyone", "All", "N/A"])  # Invalid
        elif i % 45 == 0:
            audience = fake.text(max_nb_chars=1000)  # Very long
        elif i % 55 == 0:
            audience = random.choice(
                ["TBD", "See brief", "Multiple segments", "???"]
            )  # Placeholder
        else:
            # Valid: standardized segment codes
            segments = random.sample(AUDIENCE_SEGMENTS, random.randint(1, 3))
            audience = ", ".join(segments)
        record["target_segment"] = audience

        # campaign_manager: Non-null for Active/Completed
        if random.random() > 0.6:
            if i % 65 == 0:
                owner = None
            elif i % 75 == 0:
                owner = random.choice(
                    ["Team", "Agency", "N/A", "TBD", "Unknown"]
                )  # Placeholder
            else:
                owner = fake.name()
            record["campaign_manager"] = owner

        # ad_platform: Canonical set, must align with channel_type
        if is_digital and camp_type in ["Social Media", "Search", "Display", "Video"]:
            if i % 75 == 0:
                platform = None
            elif i % 85 == 0:
                platform = random.choice(["All", "Multiple", "N/A"])  # Invalid
            else:
                # Valid: platform aligned with channel
                if camp_type == "Search":
                    platform = "Google Ads"
                elif camp_type == "Social Media":
                    platform = random.choice(["Meta Ads", "LinkedIn", "TikTok"])
                elif camp_type == "Video":
                    platform = random.choice(["Google Ads", "Meta Ads", "TikTok"])
                else:
                    platform = random.choice(PLATFORM_CANONICAL)
            record["ad_platform"] = platform
        elif is_offline:
            # Offline channels should have NULL platform
            record["ad_platform"] = None if random.random() > 0.1 else "N/A"

        # test_variant: Consistent naming
        if random.random() > 0.7:
            if i % 70 == 0:
                variant = None
            elif i % 80 == 0:
                variant = random.choice(["Test", "Winner", "Original"])  # Non-canonical
            else:
                variant = random.choice(VARIANT_CANONICAL)
            record["test_variant"] = variant

        # created_timestamp: <= now(), <= launch_date
        if random.random() > 0.3:
            if i % 30 == 0:
                created = None
            elif i % 40 == 0:
                created = fake.date_time_between(
                    start_date="-1y", end_date="now"
                ).strftime("%Y-%m-%d %H:%M:%S")
            elif i % 50 == 0:  # Violation: created after launch
                if isinstance(start_date, date):
                    created = fake.date_time_between(
                        start_date=start_date + timedelta(days=1),
                        end_date=start_date + timedelta(days=30),
                    )
                else:
                    created = fake.date_time_between(start_date="+1d", end_date="+30d")
            else:
                # Valid: created before launch_date
                if isinstance(start_date, date):
                    created = fake.date_time_between(
                        start_date=start_date - timedelta(days=60),
                        end_date=start_date - timedelta(days=1),
                    )
                else:
                    created = fake.date_time_between(start_date="-1y", end_date="now")
            record["created_timestamp"] = created

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_marketing_campaigns_data(number_of_campaigns)
df = add_more_messiness(df)

output_file = "marketing_campaigns.xlsx"
df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Customer Sessions Table Generator

In [46]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, timezone
import random
import string
import uuid
import re

fake = Faker(["en_US", "en_GB"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# System launch date for validation
SYSTEM_LAUNCH_DATE = datetime(2020, 1, 1, tzinfo=timezone.utc)

# Controlled vocabularies
CANONICAL_DEVICES = {"Mobile", "Desktop", "Tablet", "Smart TV", "Wearable"}
CANONICAL_TRAFFIC_SOURCES = {
    "Organic Search",
    "Paid Search",
    "Social Media",
    "Direct",
    "Email",
    "Referral",
    "Display Ads",
}
CANONICAL_BROWSERS = {"Chrome", "Safari", "Edge", "Firefox", "Opera", "Samsung Browser"}
CANONICAL_COUNTRIES = {
    "US",
    "GB",
    "CA",
    "AU",
    "DE",
    "FR",
    "JP",
    "IN",
    "BR",
    "MX",
    "IT",
    "ES",
    "NL",
    "SE",
    "CH",
    "NZ",
    "SG",
    "HK",
    "KR",
    "CN",
}


def generate_session_ref():
    """Generate unique UUID v4 format session reference."""
    return str(uuid.uuid4()).lower()


def to_iso8601(dt):
    """Convert to ISO-8601 format with UTC timezone."""
    if dt is None:
        return None
    if isinstance(dt, str):
        try:
            parsed = pd.to_datetime(dt, errors="coerce")
            if pd.isna(parsed):
                return None
            return parsed.strftime("%Y-%m-%dT%H:%M:%SZ")
        except:
            return None
    if isinstance(dt, (datetime, pd.Timestamp)):
        return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
    return None


def normalize_user_id(user_id, customer_id_format="CUST"):
    """Normalize and validate user_id format."""
    if user_id is None:
        return None
    if isinstance(user_id, str):
        user_id = user_id.strip()
        if user_id.lower() in [
            "invalid",
            "guest",
            "anonymous",
            "n/a",
            "",
            "null",
            "none",
        ]:
            return None
        # If has prefix, normalize
        if customer_id_format == "CUST":
            if user_id.startswith("CUST_") or user_id.startswith("cust_"):
                try:
                    num = int(user_id.split("_")[1])
                    if num > 0:
                        return f"CUST_{num}"
                except:
                    return None
        elif customer_id_format == "CUSTOMER":
            if user_id.startswith("CUSTOMER-") or user_id.startswith("customer-"):
                try:
                    num = int(user_id.split("-")[1])
                    if num > 0:
                        return f"CUSTOMER-{num}"
                except:
                    return None
        else:  # NUMBER
            try:
                num = int(user_id)
                if num > 0:
                    return num
            except:
                pass
    return None


def normalize_bool(value):
    """Normalize to boolean: Y/N, yes/no, true/false, 1/0."""
    if value is None:
        return None
    if isinstance(value, bool):
        return value
    if isinstance(value, str):
        value_lower = value.lower().strip()
        if value_lower in ["y", "yes", "true", "t", "1"]:
            return True
        elif value_lower in ["n", "no", "false", "f", "0"]:
            return False
    if isinstance(value, (int, float)):
        return bool(value)
    return None


def normalize_device_category(device):
    """Normalize to canonical device category."""
    if not device or (
        isinstance(device, str)
        and device.lower() in ["unknown", "bot", "crawler", "n/a", "", "none"]
    ):
        return None
    if isinstance(device, str):
        device_lower = device.lower().strip()
        device_title = device.title()

        # Try exact match first
        for canonical in CANONICAL_DEVICES:
            if device_lower == canonical.lower():
                return canonical

        # Try pattern matching
        if device_lower in ["mobile", "phone", "smartphone", "mobile phone", "cell"]:
            return "Mobile"
        elif device_lower in ["desktop", "computer", "pc", "windows", "macos", "linux"]:
            return "Desktop"
        elif device_lower in ["ipad", "tablet"]:
            return "Tablet"
        elif device_lower in ["tv", "smart tv", "smarttv"]:
            return "Smart TV"
        elif device_lower in ["watch", "wearable"]:
            return "Wearable"

    return None


def normalize_traffic_source(source):
    """Normalize to canonical traffic source."""
    if not source or (
        isinstance(source, str)
        and source.lower()
        in ["unknown", "n/a", "(not set)", "(none)", "", "null", "none"]
    ):
        return None
    if isinstance(source, str):
        source_lower = source.lower().strip()

        # Try exact match first
        for canonical in CANONICAL_TRAFFIC_SOURCES:
            if source_lower == canonical.lower():
                return canonical

        # Try pattern matching
        if "google" in source_lower and "ads" in source_lower:
            return "Paid Search"
        elif "bing" in source_lower and "ads" in source_lower:
            return "Paid Search"
        elif (
            "facebook" in source_lower
            or "instagram" in source_lower
            or "tiktok" in source_lower
        ):
            return "Social Media"
        elif "search" in source_lower and "organic" in source_lower:
            return "Organic Search"
        elif "direct" in source_lower:
            return "Direct"
        elif (
            "email" in source_lower
            or "newsletter" in source_lower
            or "mailchimp" in source_lower
        ):
            return "Email"
        elif "referral" in source_lower or "affiliate" in source_lower:
            return "Referral"
        elif "display" in source_lower or "banner" in source_lower:
            return "Display Ads"

    return None


def normalize_browser(browser):
    """Normalize to canonical browser name."""
    if not browser or (
        isinstance(browser, str)
        and browser.lower() in ["unknown", "n/a", "", "none", "null"]
    ):
        return None
    if isinstance(browser, str):
        browser_lower = browser.lower().strip()

        # Try exact match
        for canonical in CANONICAL_BROWSERS:
            if browser_lower == canonical.lower():
                return canonical

        # Try pattern matching
        if "chrome" in browser_lower:
            return "Chrome"
        elif "safari" in browser_lower:
            return "Safari"
        elif "edge" in browser_lower:
            return "Edge"
        elif "firefox" in browser_lower:
            return "Firefox"
        elif "opera" in browser_lower:
            return "Opera"
        elif "samsung" in browser_lower:
            return "Samsung Browser"

    return None


def normalize_country(country):
    """Normalize to ISO 3166-1 alpha-2 uppercase."""
    if not country or (
        isinstance(country, str)
        and country.lower() in ["unknown", "n/a", "", "none", "null"]
    ):
        return None
    if isinstance(country, str):
        country = country.strip().upper()
        if country in CANONICAL_COUNTRIES:
            return country
        # Handle UK → GB
        if country == "UK":
            return "GB"
    return None


def is_valid_page_path(path):
    """Validate that path starts with / and has valid characters."""
    if not path or not isinstance(path, str):
        return False
    path = path.strip()
    if len(path) > 2048:
        return False
    if not path.startswith("/"):
        return False
    # Check for invalid patterns
    if "<script" in path.lower() or "http://" in path or "https://" in path:
        return False
    if any(char in path for char in [chr(0), chr(1), chr(2)]):  # Control characters
        return False
    # Allow URL-safe characters
    return re.match(r"^/[a-zA-Z0-9\-._~:/?#\[\]@!$&\'()*+,;=%]*$", path) is not None


def normalize_page_path(path):
    """Normalize and validate page path."""
    if not path or not isinstance(path, str):
        return None
    path = path.strip()
    if path.lower() in ["n/a", "null", "none", "", "404", "timeout"]:
        return None
    if not is_valid_page_path(path):
        return None
    return path


def calculate_session_duration(start_ts, end_ts):
    """Calculate session duration in seconds."""
    if start_ts is None or end_ts is None:
        return None
    try:
        start = pd.to_datetime(start_ts)
        end = pd.to_datetime(end_ts)
        if end < start:
            return None
        return int((end - start).total_seconds())
    except:
        return None


def generate_messy_customer_sessions_data(num_rows=5000, customer_id_format="CUST"):
    """Generate customer sessions with all 14 validation rules integrated."""
    data = []
    used_session_refs = set()
    valid_customer_ids = set()
    customer_sessions = {}  # Track sessions per customer for visitor_type

    # Generate pool of valid customer IDs
    for i in range(number_of_customers):
        if customer_id_format == "CUST":
            cust_id = f"CUST_{starting_customer_index + i}"
        elif customer_id_format == "CUSTOMER":
            cust_id = f"CUSTOMER-{starting_customer_index + i}"
        else:
            cust_id = str(starting_customer_index + i)
        valid_customer_ids.add(cust_id)

    # Traffic source mapping
    traffic_source_mapping = {
        "google ads": "Paid Search",
        "bing ads": "Paid Search",
        "facebook ads": "Social Media",
        "instagram": "Social Media",
        "tiktok": "Social Media",
        "organic search": "Organic Search",
        "direct": "Direct",
        "email": "Email",
        "affiliate": "Referral",
        "display ads": "Display Ads",
    }

    for i in range(num_rows):
        record = {}
        idx = starting_session_index + i

        # 1. session_ref: UUID v4, unique, no duplicates
        session_ref = generate_session_ref()
        while session_ref in used_session_refs:
            session_ref = generate_session_ref()
        record["session_ref"] = session_ref
        used_session_refs.add(session_ref)

        # 2. user_id: NULL for guests, referential integrity for logged-in
        is_guest = random.random() < 0.35  # 35% guest sessions

        if is_guest:
            user_id = None
        elif i % 60 == 0:
            # Invalid/non-existent customer (violation for testing)
            if customer_id_format == "CUST":
                user_id = f"CUST_99999"
            elif customer_id_format == "CUSTOMER":
                user_id = f"CUSTOMER-99999"
            else:
                user_id = "99999"
        elif i % 40 == 0:
            # Invalid format
            user_id = random.choice(["INVALID", "GUEST", "ANONYMOUS", "", "N/A"])
        else:
            user_id = random.choice(list(valid_customer_ids))

        # Normalize and validate user_id
        user_id = normalize_user_id(user_id, customer_id_format)

        # Track sessions per customer
        if user_id:
            if user_id not in customer_sessions:
                customer_sessions[user_id] = []
            customer_sessions[user_id].append(i)

        record["user_id"] = user_id

        # 3. start_timestamp: ISO-8601, non-null, >= system launch
        if i % 25 == 0:
            start_ts = None
        elif i % 35 == 0:
            # Future date (violation for testing)
            start_ts = fake.date_time_between(start_date="+1d", end_date="+7d")
        elif i % 45 == 0:
            # Too old (violation)
            start_ts = fake.date_time_between(start_date="-3y", end_date="-2y")
        else:
            start_ts = fake.date_time_between(
                start_date=SYSTEM_LAUNCH_DATE, end_date="now"
            )

        # Enforce NOT NULL and convert to ISO-8601
        if start_ts is None:
            start_ts = fake.date_time_between(start_date="-30d", end_date="now")

        start_iso = to_iso8601(start_ts)
        if start_iso and pd.to_datetime(start_iso) > datetime.now(timezone.utc):
            start_iso = to_iso8601(datetime.now(timezone.utc))
        if start_iso and pd.to_datetime(start_iso) < SYSTEM_LAUNCH_DATE:
            start_iso = to_iso8601(SYSTEM_LAUNCH_DATE)

        record["start_timestamp"] = start_iso

        # 3. end_timestamp: ISO-8601, >= start_timestamp, <= 4 hours
        if i % 30 == 0:
            end_ts = None
        elif i % 40 == 0:
            # End before start (violation)
            if isinstance(start_ts, datetime):
                end_ts = start_ts - timedelta(minutes=random.randint(1, 60))
            else:
                end_ts = None
        elif i % 50 == 0:
            # Very long session > 4 hours (violation)
            if isinstance(start_ts, datetime):
                end_ts = start_ts + timedelta(hours=random.randint(5, 24))
            else:
                end_ts = None
        else:
            if isinstance(start_ts, datetime):
                # Realistic duration: 1-240 minutes (4 hours)
                duration_minutes = random.choices(
                    [
                        random.randint(1, 5),  # Bounce
                        random.randint(6, 15),  # Quick
                        random.randint(16, 30),  # Normal
                        random.randint(31, 60),  # Engaged
                        random.randint(61, 240),  # Very engaged
                    ],
                    weights=[35, 25, 20, 15, 5],
                    k=1,
                )[0]
                end_ts = start_ts + timedelta(minutes=duration_minutes)
            else:
                end_ts = None

        # Convert to ISO-8601
        end_iso = to_iso8601(end_ts) if end_ts else None

        # Validate chronology
        if start_iso and end_iso:
            start_dt = pd.to_datetime(start_iso)
            end_dt = pd.to_datetime(end_iso)
            if end_dt < start_dt:
                # Fix: use same timestamp
                end_iso = start_iso
            # Check 4-hour limit
            if (end_dt - start_dt).total_seconds() > 14400:  # 4 hours
                end_iso = (start_dt + timedelta(hours=4)).strftime("%Y-%m-%dT%H:%M:%SZ")

        record["end_timestamp"] = end_iso

        # 11. session_duration_sec: Must equal (end - start) in seconds
        duration_sec = calculate_session_duration(start_iso, end_iso)
        record["session_duration_sec"] = duration_sec

        # 4. device_category: Canonical values
        if i % 25 == 0:
            device = None
        elif i % 35 == 0:
            device = random.choice(["mobile", "MOBILE", "deskop", "bot", "unknown"])
        else:
            device = random.choices(
                ["Mobile", "Desktop", "Tablet", "Smart TV", "Wearable"],
                weights=[45, 40, 10, 3, 2],
                k=1,
            )[0]

        device = normalize_device_category(device)
        record["device_category"] = device

        # 5. traffic_source: Canonical values
        if i % 20 == 0:
            source = None
        elif i % 30 == 0:
            source = random.choice(["unknown", "n/a", "(not set)", "test.localhost"])
        else:
            source = random.choice(list(CANONICAL_TRAFFIC_SOURCES))

        source = normalize_traffic_source(source)
        record["traffic_source"] = source

        # 6. page_views: Integer >= 1
        if i % 25 == 0:
            page_views = None
        elif i % 35 == 0:
            page_views = random.choice(["many", "-5", "0"])  # Violations
        elif i % 45 == 0:
            page_views = 0  # Must be >= 1
        else:
            if duration_sec and duration_sec > 0:
                # Realistic: 1-50 pages based on duration
                if duration_sec < 300:  # < 5 min
                    page_views = random.randint(1, 2)
                elif duration_sec < 900:  # < 15 min
                    page_views = random.randint(2, 5)
                elif duration_sec < 1800:  # < 30 min
                    page_views = random.randint(4, 10)
                else:
                    page_views = random.randint(8, 30)
            else:
                page_views = random.randint(1, 15)

        # Ensure >= 1
        if isinstance(page_views, int):
            if page_views < 1:
                page_views = 1
        else:
            page_views = None

        record["page_views"] = page_views

        # 6. products_browsed: Integer >= 0, <= page_views
        if i % 30 == 0:
            products = None
        elif i % 40 == 0:
            products = random.choice(["multiple", "-3"])  # Violations
        else:
            if isinstance(page_views, int) and page_views > 0:
                # products <= pages
                max_products = max(1, int(page_views * 0.6))
                products = random.randint(0, max_products)
            else:
                products = random.randint(0, 5)

        # Validate
        if isinstance(products, int) and isinstance(page_views, int):
            if products > page_views:
                products = page_views
            if products < 0:
                products = 0
        else:
            products = None

        record["products_browsed"] = products

        # 7. purchase_made: Boolean
        if i % 35 == 0:
            purchase = None
        elif i % 45 == 0:
            purchase = random.choice(["maybe", "pending"])  # Violations
        else:
            # Realistic: more likely if products viewed
            if isinstance(products, int) and products >= 5:
                purchase = random.choices([True, False], weights=[30, 70], k=1)[0]
            elif isinstance(products, int) and products >= 2:
                purchase = random.choices([True, False], weights=[15, 85], k=1)[0]
            else:
                purchase = random.choices([True, False], weights=[2, 98], k=1)[0]

        purchase = normalize_bool(purchase)
        record["purchase_made"] = purchase

        # 7. cart_abandoned: Boolean, mutual exclusivity with purchase
        if i % 40 == 0:
            abandoned = None
        else:
            # Can't abandon if converted
            if purchase is True:
                abandoned = False
            else:
                # Higher chance if products viewed
                if isinstance(products, int) and products > 0:
                    abandoned = random.choices([True, False], weights=[40, 60], k=1)[0]
                else:
                    abandoned = False

        record["cart_abandoned"] = abandoned

        # 7. bounce_session: Boolean, page_views = 1
        if random.random() > 0.6:
            if isinstance(page_views, int):
                bounce = page_views == 1
                # Consistency: if bounce, page_views must = 1
                if bounce and page_views != 1:
                    bounce = False
            else:
                bounce = None

            bounce = normalize_bool(bounce)
            record["bounce_session"] = bounce

        # 8. geo_country: ISO Alpha-2
        if i % 25 == 0:
            country = None
        elif i % 35 == 0:
            country = random.choice(["unknown", "n/a", ""])
        else:
            country = random.choice(list(CANONICAL_COUNTRIES))

        country = normalize_country(country)
        record["geo_country"] = country

        # 9. entry_page: Valid path
        if i % 30 == 0:
            entry = None
        elif i % 40 == 0:
            entry = random.choice(["N/A", "404", ""])
        else:
            entry = random.choice(
                [
                    "/home",
                    "/products",
                    "/sale",
                    "/search",
                    "/category/electronics",
                    "/product/item-123",
                    "/cart",
                    "/checkout",
                ]
            )

        entry = normalize_page_path(entry)
        record["entry_page"] = entry

        # 9. exit_page: Valid path, consistency checks
        if i % 35 == 0:
            exit_page = None
        elif i % 45 == 0:
            exit_page = random.choice(["N/A", "timeout", ""])
        else:
            # Consistency: if page_views = 1, entry_page = exit_page
            if isinstance(page_views, int) and page_views == 1 and entry:
                exit_page = entry
            elif purchase is True:
                exit_page = "/order-confirmation"
            elif abandoned is True:
                exit_page = random.choice(["/cart", "/checkout"])
            else:
                exit_page = random.choice(
                    ["/home", "/products", "/product/item-456", "/about", "/contact"]
                )

        exit_page = normalize_page_path(exit_page)
        record["exit_page"] = exit_page

        # 10. visitor_type: New or Returning
        if user_id:
            # Check if returning
            if user_id in customer_sessions and len(customer_sessions[user_id]) > 1:
                visitor = "Returning"
            else:
                visitor = "New"
        else:
            # Guest sessions can be New or Returning
            visitor = random.choice(["New", "Returning"])

        if i % 90 == 0:
            visitor = random.choice(["unknown", "n/a"])  # Violation

        # Normalize
        if isinstance(visitor, str) and visitor.title() in ["New", "Returning"]:
            visitor = visitor.title()
        else:
            visitor = None

        record["visitor_type"] = visitor

        # 12. cart_items_count: Integer >= 0
        if random.random() > 0.6:
            if purchase is True or abandoned is True:
                if i % 80 == 0:
                    cart_count = None
                else:
                    cart_count = random.randint(1, 10)
            else:
                cart_count = 0

            # Validate
            if isinstance(cart_count, int) and cart_count < 0:
                cart_count = 0

            record["cart_items_count"] = cart_count

        # 13. browser_name: Canonical values
        if i % 30 == 0:
            browser = None
        elif i % 40 == 0:
            browser = random.choice(["chrome mobile", "unknown"])
        else:
            browser = random.choice(list(CANONICAL_BROWSERS))

        browser = normalize_browser(browser)
        record["browser_name"] = browser

        # 14. session_revenue: Decimal 2 decimals, >= 0, dependency on purchase_made
        if purchase is True:
            if i % 85 == 0:
                revenue = None
            elif i % 95 == 0:
                revenue = round(random.uniform(-100, 0), 2)  # Violation
            else:
                revenue = round(random.uniform(20, 1000), 2)
        else:
            revenue = 0.00

        # Ensure 2 decimal places
        if isinstance(revenue, (int, float)):
            revenue = round(revenue, 2)
            if revenue < 0:
                revenue = 0.00

        record["session_revenue"] = revenue

        data.append(record)

    df = pd.DataFrame(data)

    # Add empty rows and NULL-value rows (NOT session_ref duplicates)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " "]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_more_messiness(df):
    """Add additional data quality issues."""
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


if __name__ == "__main__":
    df = generate_messy_customer_sessions_data(number_of_sessions, "CUST")
    df = add_more_messiness(df)

    output_file = "customer_sessions.xlsx"
    df.to_excel(output_file, index=False)

  df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)


### Supplier Table Generator

In [47]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta, date
import random

fake = Faker(["en_US", "en_GB", "de_DE", "fr_FR"])
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Canonical values
STATUS_CANONICAL = ["Active", "Inactive", "Suspended", "Pending", "Under Review"]
COUNTRY_CODES = [
    "US",
    "GB",
    "DE",
    "FR",
    "CA",
    "AU",
    "JP",
    "CN",
    "IN",
    "BR",
    "MX",
    "IT",
    "ES",
    "NL",
]

# Rating text to numeric mapping
RATING_TEXT_MAP = {
    "Excellent": 5.00,
    "Good": 4.00,
    "Average": 3.00,
    "Fair": 3.00,
    "Poor": 2.00,
    "Bad": 1.00,
}


def generate_messy_supplier_data(num_rows=1000):
    data = []
    used_ids = []
    used_tax_ids = []
    duplicate_names = []

    for i in range(num_rows):
        record = {}

        # supplier_id: Primary key, format ^SUP_[1-9][0-9]*$, positive, unique
        if i % 53 == 0 and used_ids:
            supplier_id = random.choice(used_ids)  # Duplicate violation
        elif i % 97 == 0:
            supplier_id = None  # NULL violation
        elif i % 61 == 0:
            # Format violations
            formats = [
                f"S{i + starting_supplier_index}",
                f"VND-{i + starting_supplier_index}",
                f"PARTNER_{i + starting_supplier_index}",
                str(i + starting_supplier_index),
            ]
            supplier_id = random.choice(formats)
        elif i % 73 == 0:
            # Case inconsistencies
            supplier_id = random.choice(
                [
                    f"sup_{i + starting_supplier_index}",
                    f"Sup_{i + starting_supplier_index}",
                ]
            )
        elif i % 83 == 0:
            # Placeholder values
            supplier_id = random.choice(["SUP_0", "SUP_999999", "SUP_TEST"])
        else:
            # Valid: canonical format
            supplier_id = f"SUP_{i + starting_supplier_index}"

        used_ids.append(supplier_id)
        record["supplier_id"] = supplier_id

        # supplier_rating: Decimal 0.00-5.00, 2 decimals
        if i % 27 == 0:
            rating = None
        elif i % 37 == 0:
            # String values (should be mapped)
            rating = random.choice(
                ["Excellent", "Good", "Fair", "Poor", "N/A", "5 stars"]
            )
        elif i % 47 == 0:
            # Negative (out of range)
            rating = round(random.uniform(-1.0, -0.1), 2)
        elif i % 57 == 0:
            # Too high (> 5.00)
            rating = round(random.uniform(5.1, 10.0), 2)
        elif i % 67 == 0:
            # Too many decimals
            rating = round(random.uniform(0, 5), 5)
        elif i % 77 == 0:
            # Integer format
            rating = random.randint(1, 5)
        else:
            # Valid: realistic rating distribution (skewed toward higher)
            rating = round(
                random.choices(
                    [
                        random.uniform(0, 2),
                        random.uniform(2, 3.5),
                        random.uniform(3.5, 5),
                    ],
                    weights=[10, 30, 60],
                    k=1,
                )[0],
                2,
            )
        record["supplier_rating"] = rating

        # supplier_status: Canonical set
        if i % 22 == 0:
            status = None
        elif i % 32 == 0:
            # Case inconsistencies
            status = random.choice(
                ["active", "ACTIVE", "inactive", "INACTIVE", "pending", "PENDING"]
            )
        elif i % 42 == 0:
            # Typos
            status = random.choice(["actve", "Inactiv", "Pendng", "Suspnded"])
        elif i % 52 == 0:
            # Invalid values
            status = random.choice(["YES", "NO", "TRUE", "FALSE", "N/A", "TBD", ""])
        elif i % 62 == 0:
            # Synonyms that should be mapped
            status = random.choice(["On Hold", "Disabled", "Blocked", "Approved"])
        else:
            # Valid: canonical values
            status = random.choices(STATUS_CANONICAL, weights=[50, 20, 10, 15, 5], k=1)[
                0
            ]
        record["supplier_status"] = status

        # is_preferred: Boolean, typically TRUE implies is_verified=TRUE
        if i % 26 == 0:
            is_preferred = None
        elif i % 36 == 0:
            # Various boolean representations
            is_preferred = random.choice(
                ["Y", "N", "Yes", "No", "1", "0", "true", "false"]
            )
        elif i % 46 == 0:
            # Invalid values
            is_preferred = random.choice(["Maybe", "Unknown", "N/A", "TBD", ""])
        else:
            # Valid: 20% are preferred
            is_preferred = random.choices([True, False], weights=[20, 80], k=1)[0]
        record["is_preferred"] = is_preferred

        # is_verified: Boolean
        if i % 29 == 0:
            is_verified = None
        elif i % 39 == 0:
            # Various boolean representations
            is_verified = random.choice(
                ["Y", "N", "Yes", "No", "1", "0", "true", "false"]
            )
        elif i % 49 == 0:
            # Invalid values
            is_verified = random.choice(["Pending", "In Progress", "N/A", ""])
        elif i % 59 == 0:
            # is_preferred=TRUE but is_verified=FALSE violation
            if is_preferred == True:
                is_verified = False
            else:
                is_verified = True
        elif i % 69 == 0:
            # Status=Suspended/Inactive but is_preferred=TRUE violation
            if status in ["Suspended", "Inactive"]:
                record["is_preferred"] = True
                is_verified = True
            else:
                is_verified = False
        else:
            # Valid: 70% are verified
            is_verified = random.choices([True, False], weights=[70, 30], k=1)[0]
        record["is_verified"] = is_verified

        # contract_start_date: YYYY-MM-DD
        if i % 23 == 0:
            contract_start = None
        elif i % 33 == 0:
            # String format variations
            start_dt = fake.date_between(start_date="-5y", end_date="+6m")
            formats = ["%Y-%m-%d", "%m/%d/%Y", "%d-%m-%Y", "%Y%m%d"]
            contract_start = start_dt.strftime(random.choice(formats))
        elif i % 43 == 0:
            # Invalid/placeholder dates
            contract_start = random.choice(
                ["0000-00-00", "1900-01-01", "2025-13-45", "TBD", "N/A"]
            )
        elif i % 54 == 0:
            # Very old contracts (> 10 years)
            contract_start = fake.date_between(start_date="-15y", end_date="-11y")
        elif i % 64 == 0:
            # Far future contracts
            contract_start = fake.date_between(start_date="+2y", end_date="+5y")
        else:
            # Valid: within reasonable range
            contract_start = fake.date_between(start_date="-3y", end_date="+3m")
        record["contract_start_date"] = contract_start

        # contract_end_date: > contract_start_date, duration 1-5 years typical
        if i % 28 == 0:
            contract_end = None
        elif i % 38 == 0:
            # End before start violation
            if isinstance(contract_start, date):
                contract_end = contract_start - timedelta(days=random.randint(1, 365))
            else:
                contract_end = fake.date_between(start_date="-6y", end_date="-5y")
        elif i % 48 == 0:
            # Same as start date (not strictly greater)
            contract_end = contract_start
        elif i % 58 == 0:
            # Very short contract (< 30 days)
            if isinstance(contract_start, date):
                contract_end = contract_start + timedelta(days=random.randint(1, 29))
            else:
                contract_end = fake.date_between(start_date="-1y", end_date="now")
        elif i % 68 == 0:
            # Very long contract (> 10 years)
            if isinstance(contract_start, date):
                contract_end = contract_start + timedelta(
                    days=random.randint(3700, 5500)
                )
            else:
                contract_end = fake.date_between(start_date="+10y", end_date="+15y")
        elif i % 78 == 0:
            # String format
            end_dt = fake.date_between(start_date="-1y", end_date="+5y")
            contract_end = end_dt.strftime("%m/%d/%Y")
        else:
            # Valid: 6 months to 5 years duration
            if isinstance(contract_start, date):
                duration_days = random.randint(180, 1825)
                contract_end = contract_start + timedelta(days=duration_days)
            else:
                contract_end = fake.date_between(start_date="-2y", end_date="+3y")
        record["contract_end_date"] = contract_end

        # supplier_name: Mandatory, 2-150 chars
        if i % 199 == 0:
            name = None
        elif i % 31 == 0:
            # Duplicates with variations
            if duplicate_names:
                base_name = random.choice(duplicate_names)
                variations = [
                    base_name,
                    base_name.upper(),
                    base_name.lower(),
                    base_name + " Inc",
                    base_name + " LLC",
                    base_name + " Co.",
                ]
                name = random.choice(variations)
            else:
                name = fake.company()
                duplicate_names.append(name)
        elif i % 41 == 0:
            # Placeholders
            name = random.choice(
                ["NULL", "N/A", "", "Supplier", "Vendor", "Company", "TEST"]
            )
        elif i % 51 == 0:
            # Too short (< 2 chars)
            name = random.choice(["A", "X", ""])
        elif i % 71 == 0:
            # Only numbers/symbols
            name = random.choice(["12345", "###", "@@@", "---"])
        elif i % 81 == 0:
            # Names with special characters
            name = fake.company() + random.choice(
                [" & Co.", " @ Supply", " #1", " *Premium*"]
            )
        else:
            # Valid company name
            name = fake.company()
        record["supplier_name"] = name

        # contact_email: Valid email format
        if i % 17 == 0:
            email = None
        elif i % 34 == 0:
            # Invalid formats
            invalid_emails = [
                "not-an-email",
                "@company.com",
                fake.user_name(),
                fake.user_name() + "@",
                "N/A",
                "NA",
                "TBD",
                "",
            ]
            email = random.choice(invalid_emails)
        elif i % 44 == 0:
            # Dummy domains
            email = f"{fake.user_name()}@example.com"
        elif i % 54 == 0:
            # Case issues
            email = fake.company_email().upper()
        elif i % 74 == 0:
            # Status=Active but email=NULL violation
            if status == "Active":
                email = None
            else:
                email = fake.company_email()
        else:
            # Valid email
            email = fake.company_email()
        record["contact_email"] = email

        # phone_number: E.164 format, 7-15 digits
        if i % 21 == 0:
            phone = None
        elif i % 35 == 0:
            # Invalid formats
            invalid_phones = [
                "0000000000",
                "9999999999",
                "123",
                "N/A",
                "",
                "555-555-5555",
                "+10000000000",
                "1111111111",
            ]
            phone = random.choice(invalid_phones)
        elif i % 45 == 0:
            # Too short (< 7 digits)
            phone = str(random.randint(100, 999999))
        elif i % 55 == 0:
            # Too long (> 15 digits)
            phone = str(random.randint(10**16, 10**18))
        else:
            # Valid: various formats (will need normalization)
            phone = fake.phone_number()
        record["phone_number"] = phone

        # tax_id: Country-specific format, unique
        if i % 24 == 0:
            tax_id = None
        elif i % 34 == 0:
            # Placeholder values
            tax_id = random.choice(
                ["N/A", "NA", "PENDING", "NOT PROVIDED", "000000000"]
            )
        elif i % 56 == 0 and used_tax_ids:
            # Duplicate tax_id violation
            tax_id = random.choice(used_tax_ids)
        elif i % 66 == 0:
            # Status=Active, is_verified=TRUE but tax_id=NULL violation
            if status == "Active" and is_verified == True:
                tax_id = None
            else:
                tax_id = f"{random.randint(10, 99)}-{random.randint(1000000, 9999999)}"
        else:
            # Valid: country-specific formats
            tax_formats = [
                f"{random.randint(10, 99)}-{random.randint(1000000, 9999999)}",  # US EIN
                f"GB{random.randint(100000000, 999999999)}",  # UK VAT
                f"DE{random.randint(100000000, 999999999)}",  # DE VAT
                f"FR{random.randint(10000000000, 99999999999)}",  # FR VAT
            ]
            tax_id = random.choice(tax_formats)
            used_tax_ids.append(tax_id)
        record["tax_id"] = tax_id

        # city: 2-60 chars, no digits-only
        if i % 19 == 0:
            city = None
        elif i % 30 == 0:
            # Placeholder values
            city = random.choice(["", " ", "N/A", "NA", "TBD", "Unknown", "NULL"])
        elif i % 40 == 0:
            # Case inconsistencies
            city = random.choice([fake.city().upper(), fake.city().lower()])
        elif i % 50 == 0:
            # Special characters
            city = fake.city() + random.choice([" (Main)", " - HQ", " *", " #1"])
        elif i % 70 == 0:
            # Digits only (invalid)
            city = str(random.randint(10000, 99999))
        else:
            # Valid city
            city = fake.city()
        record["city"] = city

        # state: Valid subdivision for country
        if i % 25 == 0:
            state = None
        elif i % 35 == 0:
            # Mixed formats (abbrev vs full)
            state = random.choice([fake.state(), fake.state_abbr()])
        elif i % 45 == 0:
            # Placeholder values
            state = random.choice(["", "N/A", "NA", "TBD", "NULL"])
        elif i % 55 == 0:
            # Case issues
            state = random.choice([fake.state().upper(), fake.state().lower()])
        else:
            # Valid state
            state = fake.state()
        record["state"] = state

        # zip_code: Country-specific format
        if i % 20 == 0:
            zip_code = None
        elif i % 30 == 0:
            # Placeholder values
            zip_code = random.choice(
                ["00000", "99999", "XXXXX", "N/A", "TBD", "", "11111"]
            )
        elif i % 40 == 0:
            # International formats
            intl_formats = [
                f"{fake.country_code()}-{random.randint(1000, 9999)}",
                f"{random.choice(['SW', 'NW', 'SE', 'NE'])}{random.randint(1, 9)} {random.randint(1, 9)}{random.choice(['AA', 'BB'])}",
            ]
            zip_code = random.choice(intl_formats)
        elif i % 60 == 0:
            # Too short/long
            zip_code = (
                str(random.randint(100, 999))
                if random.random() > 0.5
                else str(random.randint(100000, 9999999))
            )
        else:
            # Valid US ZIP
            zip_code = fake.zipcode()
        record["zip_code"] = zip_code

        # country: ISO 3166-1 alpha-2
        if i % 26 == 0:
            country = None
        elif i % 36 == 0:
            # Country codes (valid ISO)
            country = random.choice(COUNTRY_CODES)
        elif i % 46 == 0:
            # ISO3 codes (should be alpha-2)
            iso3_codes = ["USA", "GBR", "DEU", "FRA", "CHN", "JPN"]
            country = random.choice(iso3_codes)
        elif i % 56 == 0:
            # Variations (should be normalized)
            country = random.choice(
                [
                    "United States",
                    "United States of America",
                    "USA",
                    "US",
                    "U.S.",
                    "U.S.A.",
                    "America",
                ]
            )
        elif i % 66 == 0:
            # Invalid/old country names
            old_countries = [
                "USSR",
                "Yugoslavia",
                "Czechoslovakia",
                "East Germany",
                "Burma",
            ]
            country = random.choice(old_countries)
        elif i % 76 == 0:
            # Placeholder values
            country = random.choice(
                ["N/A", "NA", "Unknown", "TBD", "NULL", "", "International"]
            )
        elif i % 86 == 0:
            # UK vs GB
            country = "UK"  # Should be GB for strict ISO
        else:
            # Valid: full country name
            country = fake.country()
        record["country"] = country

        # created_at: <= now(), <= contract_start_date
        if i % 31 == 0:
            created = None
        elif i % 41 == 0:
            # String timestamp
            created = fake.date_time_between(start_date="-5y", end_date="now").strftime(
                "%Y-%m-%d %H:%M:%S"
            )
        elif i % 63 == 0:
            # Future date violation
            created = fake.date_time_between(start_date="+1d", end_date="+1y")
        elif i % 73 == 0:
            # created_at > contract_start_date violation
            if isinstance(contract_start, date):
                created = datetime.combine(
                    contract_start + timedelta(days=random.randint(30, 365)),
                    datetime.min.time(),
                )
            else:
                created = fake.date_time_between(start_date="now", end_date="+1y")
        else:
            # Valid: before contract start
            if isinstance(contract_start, date):
                max_date = datetime.combine(contract_start, datetime.min.time())
                created = fake.date_time_between(start_date="-5y", end_date=max_date)
            else:
                created = fake.date_time_between(start_date="-5y", end_date="now")
        record["created_at"] = created

        data.append(record)

    df = pd.DataFrame(data)

    # Add exact duplicates (2%)
    for _ in range(int(num_rows * 0.02)):
        if len(df) > 0:
            df = pd.concat([df, df.sample(1)], ignore_index=True)

    # Add empty rows (0.5%)
    for _ in range(int(num_rows * 0.005)):
        empty_row = pd.Series([None] * len(df.columns), index=df.columns)
        df = pd.concat([df, pd.DataFrame([empty_row])], ignore_index=True)

    # Add NULL string rows (1%)
    for _ in range(int(num_rows * 0.01)):
        null_values = ["NULL", "N/A", "null", "NA", "", " ", "None", "nil"]
        null_row = pd.Series(
            [random.choice(null_values) for _ in range(len(df.columns))],
            index=df.columns,
        )
        df = pd.concat([df, pd.DataFrame([null_row])], ignore_index=True)

    df = df.sample(frac=1).reset_index(drop=True)
    return df


def add_supplier_messiness(df):
    string_cols = df.select_dtypes(include=["object"]).columns

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.05
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: "  " + str(x) + "  " if pd.notna(x) else x
        )

    for col in string_cols:
        mask = np.random.random(len(df)) < 0.03
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: (
                str(x).upper()
                if pd.notna(x) and random.random() > 0.5
                else str(x).lower() if pd.notna(x) else x
            )
        )

    for col in string_cols[:3]:
        mask = np.random.random(len(df)) < 0.02
        special_chars = ["@", "#", "!", "*", "&", "%"]
        df.loc[mask, col] = df.loc[mask, col].apply(
            lambda x: str(x) + random.choice(special_chars) if pd.notna(x) else x
        )

    return df


df = generate_messy_supplier_data(number_of_suppliers)
df = add_supplier_messiness(df)

output_file = "suppliers.xlsx"
df.to_excel(output_file, index=False)