In [None]:
import pandas as pd
import numpy as np
import random

# --- Settings ---
# Number of base order rows to generate (duplicates are appended later).
n_rows = 3000

# Seed BOTH random sources. The script draws from numpy's global generator
# and from the stdlib ``random`` module; seeding only one of them leaves the
# customer names, regions, products, etc. different on every run.
np.random.seed(42)
random.seed(42)

# Region -> cities that belong to it.
regions = {
    "Khartoum": ["Khartoum", "Omdurman"],
    "Red Sea": ["Port Sudan"],
    "Kassala": ["Kassala"]
}
segments = ["Individual", "Small Business", "Enterprise"]
payment_types = ["Cash", "Bank"]
sales_reps = ["Fatima Khalid", "Omer Abdelrahman", "Sara Ali", "Yasir Hassan", "Mona Abdalla"]

# Product catalogue as (product id, product name, category) tuples.
products = [
    ("PRD-101", "HP LaserJet P1102", "Electronics"),
    ("PRD-102", "Dell Latitude 5400", "Electronics"),
    ("PRD-103", "Office Desk - Wood", "Furniture"),
    ("PRD-104", "Ergonomic Chair", "Furniture"),
    ("PRD-105", "A4 Paper 500 Sheets", "Office Supplies"),
    ("PRD-106", "Stapler Set", "Office Supplies"),
    ("PRD-107", "Canon Printer Ink", "Electronics"),
    ("PRD-108", "Meeting Table", "Furniture"),
    ("PRD-109", "Pen Box (50 pcs)", "Office Supplies"),
    ("PRD-110", "Desk Lamp", "Electronics")
]

# Pool of 300 customer names built from random first/last name pairs
# (the same full name may appear more than once in the pool).
first = ["Ahmed", "Fatima", "Mohamed", "Sara", "Huda", "Omer", "Rania", "Nour", "Maha", "Ali", "Khalid", "Eiman"]
last = ["Musa", "Ali", "Abdalla", "Osman", "Ahmed", "Hassan", "Yousif", "Mohamed", "Khalid", "Abdelsalam"]
customers = [f"{random.choice(first)} {random.choice(last)}" for _ in range(300)]

# Column order of the final frame; each row appended below follows it.
order_columns = [
    "Order_ID", "Order_Date", "Customer_Name", "Segment", "Region", "City",
    "Product_ID", "Product_Name", "Category", "Quantity", "Unit_Price",
    "Total_Sales", "Cost", "Profit", "Payment_Type", "Sales_Rep", "Discount",
]
region_names = list(regions)
discount_levels = [0, 5, 10, 15]
discount_weights = [0.7, 0.15, 0.1, 0.05]

# Build one synthetic order per iteration. The draws from numpy and from
# ``random`` are kept in a fixed sequence so a seeded run stays repeatable.
data = []
for i in range(n_rows):
    # Order identity: year in 2023..2025 plus a day offset from 1 January.
    year = np.random.randint(2023, 2026)
    day_offset = np.random.randint(0, 365)
    order_id = f"SUD-{year}-{1000 + i}"
    order_date = pd.Timestamp(f"{year}-01-01") + pd.to_timedelta(day_offset, unit="D")

    # Who bought, and where.
    customer = random.choice(customers)
    segment = random.choice(segments)
    region = random.choice(region_names)
    city = random.choice(regions[region])

    # What was bought.
    product = random.choice(products)
    pid, pname, category = product

    # Amounts: discount is a percentage; cost is 70-90% of the net sale.
    qty = np.random.randint(1, 11)
    unit_price = np.random.uniform(2000, 100000)
    discount = np.random.choice(discount_levels, p=discount_weights)
    total_sales = round(qty * unit_price * (1 - discount / 100), 2)
    cost_ratio = np.random.uniform(0.7, 0.9)
    cost = round(total_sales * cost_ratio, 2)
    profit = round(total_sales - cost, 2)

    payment = random.choice(payment_types)
    sales_rep = random.choice(sales_reps)

    data.append([
        order_id, order_date, customer, segment, region, city,
        pid, pname, category, qty, unit_price,
        total_sales, cost, profit, payment, sales_rep, discount,
    ])

df = pd.DataFrame(data, columns=order_columns)

# Returns: flip the sign of quantity and money columns on a random 5% of rows.
return_columns = ["Quantity", "Total_Sales", "Cost", "Profit"]
neg_idx = np.random.choice(df.index, int(0.05 * n_rows), replace=False)
df.loc[neg_idx, return_columns] = df.loc[neg_idx, return_columns] * -1

# --- MESSY DATA INJECTION ---


def _strftime_with(pattern):
    """Return a callable that renders a timestamp with the given pattern."""
    return lambda stamp: stamp.strftime(pattern)


# Each order date is rendered with one of several inconsistent formats,
# picked at random per row.
date_formats = [
    _strftime_with(pattern)
    for pattern in ("%Y/%m/%d", "%d-%m-%Y", "%b %d, %Y", "%d/%m/%y")
]
messy_dates = []
for stamp in df["Order_Date"]:
    render = random.choice(date_formats)
    messy_dates.append(render(stamp))
df["Order_Date"] = messy_dates

# Currency symbols and commas
def messy_number(x):
    """Return ``x`` either rounded or rendered as a messy string.

    A first random draw below 0.15 yields a comma-grouped whole number with
    an " SDG" suffix. Otherwise a second, independent draw below 0.1 yields
    the value in thousands with a "k" suffix. In all remaining cases the
    number is returned rounded to two decimals.
    """
    with_currency = random.random() < 0.15
    if with_currency:
        return f"{x:,.0f} SDG"

    # The second draw only happens when the currency form was not chosen.
    in_thousands = random.random() < 0.1
    if in_thousands:
        return f"{x/1000:.1f}k"

    return round(x, 2)

# Turn a share of the monetary values into strings ("1,234 SDG", "1.2k").
for col in ["Unit_Price", "Total_Sales", "Cost", "Profit"]:
    df[col] = df[col].apply(messy_number)

# City and Region typos
# Upper-case 100 random cities; the right-hand Series is aligned on index,
# so only the selected rows are overwritten.
df.loc[np.random.choice(df.index, 100, replace=False), "City"] = df["City"].str.upper()
city_typos = [" portsudan", "Kasala ", "Khartom", " omdurman"]
df.loc[np.random.choice(df.index, 80, replace=False), "City"] = np.random.choice(city_typos, 80)
region_typos = [" khartoum", "red sea", " kassala", "RedSea"]
df.loc[np.random.choice(df.index, 50, replace=False), "Region"] = np.random.choice(region_typos, 50)

# Payment typos
pay_typos = ["cash ", "bankk", "Bank ", "CASH"]
df.loc[np.random.choice(df.index, 40, replace=False), "Payment_Type"] = np.random.choice(pay_typos, 40)

# Missing values
# Discount holds whole-number percentages at this point. Cast it to float
# explicitly before writing NaN, rather than relying on pandas silently
# upcasting an integer column on assignment (deprecated behaviour in recent
# pandas). The column ends up float either way, so the CSV is unchanged.
df["Discount"] = df["Discount"].astype("float64")
for col in ["Customer_Name", "Discount", "Payment_Type"]:
    df.loc[np.random.choice(df.index, 70, replace=False), col] = np.nan

# Duplicates
# Append 60 already-present rows so the file contains exact duplicates.
df = pd.concat([df, df.sample(60, random_state=42)], ignore_index=True)

# Save messy dataset
df.to_csv("Sales Records.csv", index=False)
print("✅ Sales Records.csv created successfully")


✅ Sales Records.csv created successfully
