In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

# ============================================================================
# 1. REGIONS.CSV - 8 rows
# ============================================================================
# One tuple per regions.csv row, in column order. Rows R06-R08 intentionally
# repeat earlier regions with inconsistent casing/naming so the table needs
# cleaning downstream.
_region_columns = ('region_id', 'country', 'region_name', 'manager')
_region_rows = [
    ('R01', 'Belgium', 'Benelux', 'Emma De Smet'),
    ('R02', 'Netherlands', 'benelux', 'Emma De Smet'),
    ('R03', 'France', 'Western Europe', 'Pierre Laurent'),
    ('R04', 'Germany', 'Central Europe', 'Markus Vogel'),
    ('R05', 'Luxembourg', 'Benelux', 'Emma De Smet'),
    ('R06', 'belgium', 'BENELUX', 'emma de smet'),
    ('R07', 'france', 'Western EU', 'Pierre Laurent'),
    ('R08', 'Germany', 'Central Europe', 'Markus Vogel'),
]

# Transpose the rows into the column -> values mapping the DataFrame is built from.
regions_data = {
    column: [row[position] for row in _region_rows]
    for position, column in enumerate(_region_columns)
}
df_regions = pd.DataFrame(regions_data)

# ============================================================================
# 2. PRODUCTS.CSV - 60 rows
# ============================================================================
# Catalogue of product display names; one products.csv row is generated per name.
product_names = [
    'Logitech MX Master 3 Mouse',
    'IKEA Markus Chair',
    'HP Envy 15 Laptop',
    'Leitz Paper Organizer',
    'Philips LED Desk Lamp',
    'Samsung 27" Monitor',
    'Dell XPS 13 Laptop',
    'Microsoft Surface Keyboard',
    'Logitech C920 Webcam',
    'IKEA Bekant Desk',
    'HP LaserJet Printer',
    'Canon PIXMA Printer',
    'Apple Magic Mouse',
    'Fellowes Paper Shredder',
    'Staedtler Pen Set',
    'Moleskine Notebook',
    'Post-it Note Pack',
    'Scotch Tape Dispenser',
    'Bose QuietComfort Headphones',
    'Sony WH-1000XM4 Headphones',
    'Kingston USB Flash Drive 64GB',
    'SanDisk SD Card 128GB',
    'Seagate External HDD 2TB',
    'Western Digital SSD 1TB',
    'TP-Link WiFi Router',
    'Netgear Network Switch',
    'APC Battery Backup UPS',
    'Belkin Surge Protector',
    'Cable Management Box',
    'Monitor Stand Adjustable',
    'Ergonomic Wrist Rest',
    'Laptop Cooling Pad',
    'USB-C Hub Multiport',
    'HDMI Cable 2m',
    'DisplayPort Cable',
    'Ethernet Cable Cat6 5m',
    'Wireless Mouse Pad',
    'Mechanical Keyboard RGB',
    'Gaming Chair Racing Style',
    'Standing Desk Converter',
    'Desk Organizer Set',
    'File Cabinet 3-Drawer',
    'Bookshelf 5-Tier',
    'Whiteboard Magnetic 90x60cm',
    'Cork Board with Pins',
    'Desk Calendar 2024',
    'Wall Clock Modern',
    'Trash Bin Office',
    'Coffee Maker Nespresso',
    'Water Cooler Dispenser',
    'Air Purifier HEPA',
    'Space Heater Portable',
    'Table Fan Oscillating',
    'LED Strip Lights 5m',
    'Smart Bulb Philips Hue',
    'Extension Cord 3m',
    'Label Maker Brother',
    'Calculator Scientific',
    'Stapler Heavy Duty',
    'Hole Punch 3-Hole',
]

# Category -> allowed subcategories. The category list is derived from the
# mapping's keys, which keeps the two in the same (insertion) order.
subcategories = {
    'Technology': ['Computers', 'Accessories', 'Peripherals', 'Networking'],
    'Office Supplies': ['Stationery', 'Organization', 'Paper Products'],
    'Home Office': ['Furniture', 'Lighting', 'Comfort'],
    'Furniture': ['Desks', 'Chairs', 'Storage'],
    'Accessories': ['Cables', 'Adapters', 'Misc'],
}
categories = list(subcategories)

suppliers = [
    'Logitech', 'IKEA', 'HP', 'Leitz', 'Philips', 'Samsung', 'Dell',
    'Microsoft', 'Canon', 'Apple', 'Fellowes', 'Staedtler', 'Moleskine',
    'Bose', 'Sony', 'Kingston', 'SanDisk', 'Seagate', 'WD', 'TP-Link',
]

# Deliberately inconsistent casing so stock_status needs standardising.
stock_statuses = ['In Stock', 'Low Stock', 'Out of Stock', 'In stock', 'LOW STOCK']

# Build one record per product name. Category, price, supplier and stock status
# are drawn at random and are independent of the product name. The order of
# the random draws below determines the seeded output, so it must not change.
products_list = []
for product_number, product_name in enumerate(product_names[:60], start=101):
    category = random.choice(categories)
    subcategory = random.choice(subcategories[category])
    unit_price = round(random.uniform(9.99, 1500.00), 2)

    # Roughly one product in ten is left without a cost price.
    if random.random() > 0.1:
        cost_price = round(unit_price * random.uniform(0.55, 0.80), 2)
    else:
        cost_price = None

    supplier = random.choice(suppliers)
    stock_status = random.choice(stock_statuses)

    record = {
        'product_id': f'P{product_number:03d}',
        'product_name': product_name,
        'category': category,
        'subcategory': subcategory,
        'unit_price': unit_price,
        'cost_price': cost_price,
        'supplier': supplier,
        'stock_status': stock_status,
    }
    products_list.append(record)

df_products = pd.DataFrame(products_list)

# ============================================================================
# 3. CUSTOMERS.CSV - 1,220 rows (1,200 generated + 20 duplicates)
# ============================================================================
# Name pools per country. Accented names are stored as proper Unicode text
# (previously mis-decoded, e.g. 'Chloé' and 'Müller' appeared garbled).
first_names_be = ['Sofie', 'Emma', 'Lucas', 'Noah', 'Marie', 'Thomas', 'Louise', 'Lars']
last_names_be = ['Janssens', 'Peeters', 'Maes', 'Jacobs', 'Willems', 'De Smet', 'Mertens']
first_names_nl = ['Daan', 'Emma', 'Liam', 'Sophie', 'Noah', 'Julia', 'Lucas', 'Anna']
last_names_nl = ['de Jong', 'Jansen', 'de Vries', 'van den Berg', 'Bakker', 'Visser']
first_names_fr = ['Louis', 'Emma', 'Gabriel', 'Louise', 'Arthur', 'Alice', 'Jules', 'Chloé']
last_names_fr = ['Martin', 'Bernard', 'Dubois', 'Laurent', 'Simon', 'Michel', 'Lefebvre']
first_names_de = ['Ben', 'Emma', 'Paul', 'Hannah', 'Leon', 'Mia', 'Finn', 'Sophia']
last_names_de = ['Müller', 'Schmidt', 'Schneider', 'Fischer', 'Weber', 'Meyer', 'Wagner']

countries = ['Belgium', 'Netherlands', 'France', 'Germany', 'Luxembourg']
cities = {
    'Belgium': ['Brussels', 'Antwerp', 'Ghent', 'Bruges', 'Leuven', 'Liège'],
    'Netherlands': ['Amsterdam', 'Rotterdam', 'Utrecht', 'The Hague', 'Eindhoven'],
    'France': ['Paris', 'Lyon', 'Marseille', 'Toulouse', 'Nice', 'Lille'],
    'Germany': ['Berlin', 'Munich', 'Hamburg', 'Frankfurt', 'Cologne', 'Stuttgart'],
    'Luxembourg': ['Luxembourg City', 'Esch-sur-Alzette']
}

# Both the string 'None' and a real missing value are included on purpose so
# loyalty_status needs cleaning.
loyalty_statuses = ['Silver', 'Gold', 'Platinum', 'None', None]

# (first names, last names) to draw from for each country. Luxembourg has no
# pool of its own and borrows the combined Belgian and French names.
name_pools = {
    'Belgium': (first_names_be, last_names_be),
    'Netherlands': (first_names_nl, last_names_nl),
    'France': (first_names_fr, last_names_fr),
    'Germany': (first_names_de, last_names_de),
    'Luxembourg': (first_names_be + first_names_fr, last_names_be + last_names_fr),
}
email_domains = ['gmail.com', 'outlook.com', 'yahoo.com', 'hotmail.com', 'icloud.com']
signup_start = datetime(2021, 1, 1)

# NOTE: the order of the random draws below determines the seeded output;
# keep it unchanged when editing.
customers_list = []
for i in range(1200):
    country = random.choice(countries)
    first_pool, last_pool = name_pools[country]
    fname = random.choice(first_pool)
    lname = random.choice(last_pool)
    city = random.choice(cities[country])

    # Generate email with messiness
    email_base = f"{fname.lower()}.{lname.lower().replace(' ', '')}"
    email_domain = random.choice(email_domains)

    # 3% missing entirely, 5% malformed (no '@'), the rest well-formed
    rand = random.random()
    if rand < 0.03:
        email = None
    elif rand < 0.08:
        email = f"{email_base}{email_domain}"  # missing @
    else:
        email = f"{email_base}@{email_domain}"

    # ~15% of emails are upper-cased; the draw happens even when the email
    # is missing so the random sequence does not depend on the branch above.
    if random.random() < 0.15 and email is not None:
        email = email.upper()

    # Signup date - ~5% missing, otherwise 2021-01-01 plus up to 1770 days
    if random.random() < 0.05:
        signup_date = None
    else:
        signup_offset = timedelta(days=random.randint(0, 1770))
        signup_date = (signup_start + signup_offset).strftime('%Y-%m-%d')

    # Age - ~3% missing
    age = random.randint(23, 70) if random.random() > 0.03 else None

    customers_list.append({
        'customer_id': f'CUST_{i+1001:04d}',
        'first_name': fname,
        'last_name': lname,
        'country': country,
        'city': city,
        'email': email,
        'signup_date': signup_date,
        'loyalty_status': random.choice(loyalty_statuses),
        'age': age
    })

# Add some duplicates (same person, different ID). The [:-20] slice always
# ends before the rows appended by this loop, so a duplicate is only ever
# copied from one of the 1,200 generated customers. The new IDs continue
# from the current list length (CUST_2201 onwards) and so never collide.
for _ in range(20):
    dup = random.choice(customers_list[:-20]).copy()
    dup['customer_id'] = f'CUST_{len(customers_list)+1001:04d}'
    customers_list.append(dup)

df_customers = pd.DataFrame(customers_list)

# ============================================================================
# 4. ORDERS.CSV - 5,000 rows
# ============================================================================
# Categorical pools with deliberately inconsistent casing ('express',
# 'STANDARD', 'visa', 'low', 'MEDIUM') so the columns need standardising.
ship_modes = ['Standard', 'Express', 'Overnight', 'express', 'STANDARD', 'Economy']
payment_methods = ['Visa', 'Mastercard', 'PayPal', 'Bancontact', 'iDEAL', 'SEPA', 'visa']
order_priorities = ['High', 'Medium', 'Low', 'Urgent', 'low', 'MEDIUM']

orders_list = []
# 3 years of data: Nov 2022 to Nov 2025 (both end dates inclusive)
start_date = datetime(2022, 11, 1)
end_date = datetime(2025, 11, 5)
date_range_days = (end_date - start_date).days

# Materialise the foreign-key pools once; rebuilding these lists from the
# DataFrames on every iteration is loop-invariant work.
customer_ids = df_customers['customer_id'].tolist()
region_ids = df_regions['region_id'].tolist()

# NOTE: the order of the random draws below determines the seeded output;
# keep it unchanged when editing.
for i in range(5000):
    order_date = start_date + timedelta(days=random.randint(0, date_range_days))
    customer_id = random.choice(customer_ids)
    region_id = random.choice(region_ids)
    ship_mode = random.choice(ship_modes)
    payment = random.choice(payment_methods)

    # Delivery time based on ship mode (case-insensitive, because the pool
    # mixes casings). 'Economy' falls through to the widest range.
    mode_key = ship_mode.lower()
    if 'express' in mode_key:
        delivery = random.randint(1, 3)
    elif 'overnight' in mode_key:
        delivery = 1
    elif 'standard' in mode_key:
        delivery = random.randint(5, 10)
    else:
        delivery = random.randint(4, 14)

    # ~2% of orders have no recorded delivery time
    delivery = delivery if random.random() > 0.02 else None

    shipping = round(random.uniform(0, 15.95), 2)

    orders_list.append({
        # Date prefix plus the zero-padded loop index keeps order_id unique.
        'order_id': f'O{order_date.strftime("%Y%m%d")}{i:04d}',
        'order_date': order_date.strftime('%Y-%m-%d'),
        'customer_id': customer_id,
        'region_id': region_id,
        'ship_mode': ship_mode,
        'payment_method': payment,
        'delivery_time_days': delivery,
        'shipping_cost': shipping,
        'order_priority': random.choice(order_priorities)
    })

df_orders = pd.DataFrame(orders_list)

# ============================================================================
# 5. ORDER_DETAILS.CSV - ~10,500 rows (1-4 line items per order, 2.1 on average)
# ============================================================================
order_details_list = []
detail_counter = 1
tax_rate = 0.21  # VAT, identical for every line item

# NOTE: the order of the random draws below determines the seeded output;
# keep it unchanged when editing.
for order in orders_list:
    # Each order has 1-4 line items, weighted towards 1-2
    num_items = random.choices([1, 2, 3, 4], weights=[0.3, 0.4, 0.2, 0.1])[0]

    for _ in range(num_items):
        product = random.choice(products_list)
        quantity = random.randint(1, 5)

        # Discount of 0-30%; ~15% of line items have it missing.
        discount = round(random.uniform(0, 0.30), 2) if random.random() > 0.15 else None
        # A missing discount is priced as no discount.
        discount_val = discount if discount is not None else 0

        unit_price = product['unit_price']
        subtotal = unit_price * quantity * (1 - discount_val)
        total_price = round(subtotal * (1 + tax_rate), 2)

        # Profit is pre-tax revenue minus cost; unknown when the product has
        # no cost price.
        cost_price = product['cost_price']
        if cost_price is not None:
            profit = round(subtotal - cost_price * quantity, 2)
            # Occasionally force a loss (bad deals). abs() guarantees the
            # result is negative: a plain `profit * -1.5` would turn a line
            # that was already loss-making (deep discount on a high-cost
            # product) into a gain.
            if random.random() < 0.02:
                profit = round(-1.5 * abs(profit), 2)
        else:
            profit = None

        order_details_list.append({
            'order_detail_id': f'OD{detail_counter:07d}',
            'order_id': order['order_id'],
            'product_id': product['product_id'],
            'quantity': quantity,
            'discount': discount,
            'tax_rate': tax_rate,
            'total_price': total_price,
            'profit': profit
        })
        detail_counter += 1

df_order_details = pd.DataFrame(order_details_list)

# ============================================================================
# SAVE ALL FILES
# ============================================================================
# Output tables in the order they are reported and written.
output_tables = [
    ('regions.csv', df_regions),
    ('customers.csv', df_customers),
    ('products.csv', df_products),
    ('orders.csv', df_orders),
    ('order_details.csv', df_order_details),
]

# Row-count summary; the trailing "\n" leaves a blank line before the list.
print("Generating EuroMart Dataset...\n")
for filename, frame in output_tables:
    print(f"✓ {filename}: {len(frame)} rows")

# Write each table to the working directory without the pandas index column.
for filename, frame in output_tables:
    frame.to_csv(filename, index=False)

# ============================================================================
# GENERATE README
# ============================================================================
# README text shipped with the dataset. The row counts and date range quoted
# here mirror the generation parameters used earlier in this script: 1,200
# customers plus 20 duplicates, orders dated 2022-11-01 to 2025-11-05, and
# 2.1 line items per order on average. Update them if those parameters change.
# The script links every order and line item to an existing customer, region,
# order and product, so the README does not advertise missing foreign keys.
readme_content = """# EuroMart Retail Group Dataset

## Background
You're a junior data analyst at EuroMart Retail Group, a regional retailer headquartered in Brussels. 
You operate in Belgium, France, Netherlands, Germany, and Luxembourg, selling office, tech, and 
home-lifestyle products online and through regional distributors.

## Dataset Files

### 1. regions.csv (8 rows)
Regional structure with intentional inconsistencies:
- Duplicate region names with different casing
- Same managers assigned to multiple regions
- Inconsistent country capitalization

### 2. customers.csv (1,220 rows)
Customer data with realistic messiness:
- Missing or malformed emails (~8%)
- Duplicate customers (~20 records)
- Missing signup dates (~5%)
- Missing age values (~3%)
- Inconsistent email casing

### 3. products.csv (60 rows)
Product catalog with data quality issues:
- Missing cost_price (~10%)
- Inconsistent stock_status casing
- Mix of EU brands (IKEA, Logitech, HP, Philips, etc.)
- Prices in euros

### 4. orders.csv (5,000 rows)
Order-level transactions:
- Date range: 2022-11-01 to 2025-11-05
- Missing delivery_time_days (~2%)
- Inconsistent casing in ship_mode and order_priority
- Realistic shipping costs

### 5. order_details.csv (~10,500 rows)
Line-item sales data:
- Missing discount values (~15%)
- Occasional negative profit (~2%) - bad deals
- Missing profit where cost_price unavailable
- VAT rate of 21%

## Relationships (ERD)
```
regions.region_id  ←→  orders.region_id
customers.customer_id  ←→  orders.customer_id
orders.order_id  ←→  order_details.order_id
products.product_id  ←→  order_details.product_id
```

## Suggested Analytics Projects

### Data Cleaning (ETL Practice)
1. Fix malformed emails (missing @)
2. Standardize region names and casing
3. Handle missing cost_price and profit calculations
4. Identify and merge duplicate customers
5. Standardize categorical values (ship_mode, order_priority, etc.)

### Business Analysis
1. **Profitability Analysis**: By region, category, supplier, product
2. **Customer Segmentation**: Loyalty status vs purchase behavior
3. **Shipping Analysis**: Delivery time vs ship_mode, cost optimization
4. **Discount Impact**: How discounts affect profit margins
5. **Regional Performance**: Sales and profit by country/region
6. **Product Performance**: Best/worst sellers, stock status correlation
7. **Temporal Trends**: Monthly sales, seasonality, growth rates

### KPIs to Build
- Total Revenue
- Total Profit & Profit Margin %
- Average Order Value
- Average Delivery Time
- Customer Lifetime Value
- Return Customer Rate
- Revenue by Region/Country
- Top Products by Profit

### Advanced Analytics
- Cohort analysis by signup date
- RFM (Recency, Frequency, Monetary) segmentation
- Delivery performance dashboard
- Inventory optimization recommendations
- Customer churn prediction

## Data Quality Issues (Intentional)
✓ Missing values across most tables
✓ Inconsistent casing and formatting
✓ Duplicate records
✓ Occasional outliers and errors
✓ Malformed email addresses
✓ Some negative profits

## Tools Suggested
- SQL Server for data warehousing
- Power BI for visualization
- Python/Pandas for data cleaning
- Excel for quick exploration

Happy analyzing! 🎯
"""

# Written as UTF-8 explicitly because the text contains non-ASCII symbols.
with open('README_EuroMart.txt', 'w', encoding='utf-8') as f:
    f.write(readme_content)

# Closing banner and list of every file this script produced.
banner_rule = "=" * 60
print("\n✓ README_EuroMart.txt created")
print("\n" + banner_rule)
print("🎉 EuroMart Dataset Generation Complete!")
print(banner_rule)
print("\nAll files saved:")
for saved_name in (
    'regions.csv',
    'customers.csv',
    'products.csv',
    'orders.csv',
    'order_details.csv',
    'README_EuroMart.txt',
):
    print(f"  • {saved_name}")
print("\nReady for ETL, analysis, and visualization!")

Generating EuroMart Dataset...

✓ regions.csv: 8 rows
✓ customers.csv: 1220 rows
✓ products.csv: 60 rows
✓ orders.csv: 5000 rows
✓ order_details.csv: 10349 rows

✓ README_EuroMart.txt created

🎉 EuroMart Dataset Generation Complete!

All files saved:
  • regions.csv
  • customers.csv
  • products.csv
  • orders.csv
  • order_details.csv
  • README_EuroMart.txt

Ready for ETL, analysis, and visualization!
