In [1]:
%pip install faker

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.9 MB ? eta -:--:--
    --------------------------------------- 0.0/1.9 MB 640.0 kB/s eta 0:00:03
   ---------- ----------------------------- 0.5/1.9 MB 5.2 MB/s eta 0:00:01
   ---------------------------------- ----- 1.7/1.9 MB 13.5 MB/s eta 0:00:01
   ---------------------------------------- 1.9/1.9 MB 11.3 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.5.3
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

# Initialize Faker
# NOTE(review): `fake` is not referenced by any other code visible in this notebook;
# confirm whether a later cell relies on it before removing.
fake = Faker()

# Parameters
num_products = 380   # number of distinct SKUs to generate
num_customers = 50   # size of the customer pool

# Seed both random sources used by this notebook (the stdlib `random` module and
# NumPy's global RNG). Without a fixed seed every run produces a different dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Vocabularies that the generator samples from.

# Product categories
ecom_categories = [
    "Electronics",
    "Home & Kitchen",
    "Fashion",
    "Sports & Outdoors",
    "Books",
    "Beauty & Personal Care",
    "Toys & Games",
    "Pet Supplies",
    "Automotive",
    "Office Products",
]

# Supplier / brand names
ecom_suppliers = [
    "AmazonBasics",
    "Sony",
    "Samsung",
    "Nike",
    "Adidas",
    "Apple",
    "Microsoft",
    "Lenovo",
    "Puma",
    "Philips",
    "LG",
    "HP",
]

# Product family names
ecom_products = [
    "Wireless Earbuds",
    "Smartphone",
    "Laptop",
    "Bluetooth Speaker",
    "Gaming Mouse",
    "Air Fryer",
    "Coffee Maker",
    "Vacuum Cleaner",
    "Backpack",
    "Running Shoes",
    "Wrist Watch",
    "Sunglasses",
    "Yoga Mat",
    "Board Game",
    "Pet Food",
    "Car Phone Mount",
    "Desk Lamp",
    "External Hard Drive",
    "Water Bottle",
    "E-reader",
]

# Warehouse locations
warehouses = [
    "Dubai",
    "Abu Dhabi",
    "Sharjah",
    "Riyadh",
    "Jeddah",
    "Doha",
]

# Build the product catalogue: one record per SKU, numbered from SKU-1000 upward.
# Family name, category and supplier are each drawn independently, so the
# combinations are not guaranteed to be coherent; the unit price is uniform in
# [5, 500] USD, rounded to cents.
products = [
    {
        "SKU": f"SKU-{sku_number}",
        "Product_Family_Name": random.choice(ecom_products),
        "Category": random.choice(ecom_categories),
        "Supplier": random.choice(ecom_suppliers),
        "Price_per_Unit (USD)": round(random.uniform(5, 500), 2),
    }
    for sku_number in range(1000, 1000 + num_products)
]

products_df = pd.DataFrame(products)

# Generate customers: IDs of the form "Customer-001", zero-padded to three digits.
customers = [f"Customer-{str(i+1).zfill(3)}" for i in range(num_customers)]

# 30% of the pool (rounded down) is designated high-volume.
high_volume_customers = random.sample(customers, int(num_customers * 0.3))

# Everyone else is low-volume. Filter the ordered list instead of taking a set
# difference: the iteration order of a set of strings changes between interpreter
# sessions (string hashing is salted), so `random.choice` on a list built from it
# would pick different customers even from an identical RNG state.
high_volume_lookup = set(high_volume_customers)
low_volume_customers = [c for c in customers if c not in high_volume_lookup]

# Time series window. `num_days` is the whole-day difference between the two
# dates (629 for these values); at one row per product per day that is roughly
# 239k rows for 380 products.
start_date = datetime(year=2023, month=11, day=15)
end_date = datetime(year=2025, month=8, day=5)
num_days = (end_date - start_date) // timedelta(days=1)

# Daily sales simulation: one row per (day, product).

# Seasonal demand patterns: (peak months, multiplier in peak months, multiplier otherwise).
SEASONAL_PROFILES = [
    ({11, 12, 1}, 1.8, 0.6),  # peak in Nov-Jan, depressed the rest of the year
    ({6, 7}, 1.5, 0.8),       # peak in Jun-Jul
    ({8, 9}, 1.2, 1.0),       # mild uplift in Aug-Sep, neutral otherwise
]

# Assign each SKU to a pattern by its position in `products`, computed once.
# The built-in hash() of a str is salted per interpreter process, so deriving
# the pattern from `hash(sku) % 3` would reshuffle the SKU -> pattern mapping
# on every kernel restart.
sku_seasonal_profile = {
    product["SKU"]: SEASONAL_PROFILES[position % len(SEASONAL_PROFILES)]
    for position, product in enumerate(products)
}

sales_data = []
for i in range(num_days):
    current_date = start_date + timedelta(days=i)

    # Values that depend only on the day are computed once per day.
    date_label = current_date.strftime("%Y-%m-%d")
    current_month = current_date.month
    # Trend factor: linear growth from 1.0 towards (just under) 1.1 across the window.
    trend_factor = 1 + (i / num_days) * 0.1

    for product in products:
        sku = product["SKU"]
        # Stock is drawn around 500 units and floored at 10.
        stock_level = max(10, int(np.random.normal(500, 100)))

        # Seasonal demand pattern for this SKU
        peak_months, peak_factor, off_peak_factor = sku_seasonal_profile[sku]
        seasonal_factor = peak_factor if current_month in peak_months else off_peak_factor

        # Baseline demand is drawn around 50 units and floored at 10, then scaled.
        base_sales = max(10, int(np.random.normal(50, 10)))
        sales_quantity = int(base_sales * seasonal_factor * trend_factor)

        # Customer effect: 70% of rows go to a low-volume customer (quantity halved),
        # the rest to a high-volume customer (quantity x1.5).
        if random.random() < 0.7:
            customer_id = random.choice(low_volume_customers)
            sales_quantity = int(sales_quantity * 0.5)
        else:
            customer_id = random.choice(high_volume_customers)
            sales_quantity = int(sales_quantity * 1.5)

        # Extra features
        discount = random.choice([0, 5, 10, 15, 20])  # %
        return_qty = int(sales_quantity * random.choice([0, 0.01, 0.02, 0.05]))  # up to 5%
        shipping_cost = round(random.uniform(2, 20), 2)  # USD
        warehouse = random.choice(warehouses)

        # Revenue (net of discount) & lead time
        revenue = round(sales_quantity * product["Price_per_Unit (USD)"] * (1 - discount / 100), 2)
        lead_time = random.choice([1, 2, 3, 5, 7, 10])

        sales_data.append({
            "Date": date_label,
            "SKU": sku,
            "Customer ID": customer_id,
            "Stock Level": stock_level,
            "Sales Quantity": sales_quantity,  # target column for demand forecasting
            "Revenue (USD)": revenue,
            "Lead Time (days)": lead_time,
            "Discount Applied (%)": discount,
            "Return Quantity": return_qty,
            "Shipping Cost (USD)": shipping_cost,
            "Warehouse Location": warehouse
        })

# Turn the collected sales rows into a frame and join the catalogue columns
# onto each row via the shared SKU key.
sales_df = pd.DataFrame(data=sales_data)
full_dataset = pd.merge(left=sales_df, right=products_df, on="SKU")

# Persist the combined table as CSV (no index column) and report its dimensions.
full_dataset.to_csv(path_or_buf="ecommerce_supply_chain.csv", index=False)
print(f"Dataset generated: {full_dataset.shape[0]} rows, {full_dataset.shape[1]} columns, saved as 'ecommerce_supply_chain.csv'")


Dataset generated: 239020 rows, 15 columns, saved as 'ecommerce_supply_chain.csv'


In [3]:
import pandas as pd

# Reload the dataset from the relative path the generation cell writes to,
# instead of a machine-specific absolute path that only exists on one computer.
df = pd.read_csv("ecommerce_supply_chain.csv")
print(df.columns.tolist())


['Date', 'SKU', 'Customer ID', 'Stock Level', 'Sales Quantity', 'Revenue (USD)', 'Lead Time (days)', 'Discount Applied (%)', 'Return Quantity', 'Shipping Cost (USD)', 'Warehouse Location', 'Product_Family_Name', 'Category', 'Supplier', 'Price_per_Unit (USD)']
