In [9]:
pip install faker pandas numpy

Collecting faker
  Downloading faker-37.1.0-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.1.0-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 21.9 MB/s eta 0:00:00
Installing collected packages: faker
Successfully installed faker-37.1.0


In [10]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta


# Configuration

In [56]:
# Reproducibility: this notebook draws from TWO independent random sources.
# NumPy drives the numeric/categorical draws, while Faker (names, e-mails,
# countries, dates, colours, UUIDs) uses its own generator that
# np.random.seed() does not touch. Both must be seeded for a re-run to
# regenerate the same dataset.
SEED = 42
np.random.seed(SEED)
Faker.seed(SEED)  # class-level seed shared by all Faker instances
fake = Faker()

# Dataset size
num_customers = 5000
num_products = 1500

# Reference window used by later cells for event timestamps, registration
# dates and the inventory snapshot calendar.
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 1, 31)


## 1. CUSTOMERS TABLE

In [49]:
# Build the customers table. Each column is drawn up front, in the same order
# the random generators are consumed, and the frame is assembled once at the end.

# Registrations may predate the reference window by up to 730 days.
registration_window_start = start_date - timedelta(days=730)
registration_window_end = end_date.date()

customer_ids = [f"CUST-{n:06d}" for n in range(1, num_customers + 1)]
customer_names = [fake.name() for _ in range(num_customers)]

# Ages: normal(35, 10) truncated to whole years; clamped to 18-80 below.
raw_ages = np.random.normal(loc=35, scale=10, size=num_customers).astype(int)

genders = np.random.choice(
    ["Male", "Female", "Other"],
    num_customers,
    p=[0.45, 0.45, 0.1],
)
emails = [fake.email() for _ in range(num_customers)]
locations = [fake.country() for _ in range(num_customers)]
registration_dates = [
    fake.date_between(
        start_date=registration_window_start,
        end_date=registration_window_end,
    )
    for _ in range(num_customers)
]
preferred_styles = np.random.choice(
    ["Casual", "Formal", "Sport", "Bohemian", "Luxury"],
    num_customers,
    p=[0.3, 0.2, 0.2, 0.2, 0.1],
)

customers = pd.DataFrame({
    "customer_id": customer_ids,
    "name": customer_names,
    # Age validation: keep every customer within 18-80.
    "age": np.clip(raw_ages, 18, 80),
    "gender": genders,
    "email": emails,
    "location": locations,
    "registration_date": registration_dates,
    "preferred_style": preferred_styles,
})

## 2. PRODUCTS TABLE

In [26]:
# Brand catalogue, grouped by pricing tier.
brands = {
    "Fast Fashion": ["Zara", "H&M", "Uniqlo"],
    "Premium": ["Levi's", "Calvin Klein", "Tommy Hilfiger"],
    "Luxury": ["Gucci", "Prada", "Louis Vuitton", "Balenciaga", "Versace"]
}

# Product categories and the styles available within each.
categories = {
    "Shirts": ["Oxford", "Linen", "Silk"],
    "Dresses": ["Cocktail", "Evening Gown", "Maxi"],
    "Pants": ["Trousers", "Chinos", "Designer Jeans"],
    "Jackets": ["Bomber", "Trench", "Leather"],
    "Suits": ["Business", "Tuxedo", "Three-piece"]
}

# (low, high) bounds of the uniform price draw for each brand tier.
price_ranges = {
    "Luxury": (500, 5000),
    "Premium": (150, 800),
    "Fast Fashion": (20, 300),
}

products_data = []
for pid in range(1, num_products+1):
    # Select brand tier, then a brand within that tier
    brand_tier = np.random.choice(["Fast Fashion", "Premium", "Luxury"], p=[0.6, 0.3, 0.1])
    brand = np.random.choice(brands[brand_tier])

    # Set price based on brand tier.
    # NOTE(review): 0.95 is added *before* rounding, so prices come out as whole
    # numbers. If ".95" price endings were intended, this should be
    # np.floor(...) + 0.95 instead - confirm with the author before changing.
    price_low, price_high = price_ranges[brand_tier]
    price = np.round(np.random.uniform(price_low, price_high) + 0.95)

    category = np.random.choice(list(categories.keys()))
    style = np.random.choice(categories[category])

    # Draw the colour ONCE and use it for both the product name and the
    # "color" column, so the two can never disagree.
    color = fake.color_name()

    size = np.random.choice(["XS", "S", "M", "L", "XL"], p=[0.1, 0.2, 0.4, 0.2, 0.1])

    # Fast-fashion items draw stock from 0-99 units; other tiers from 0-19.
    stock = np.random.randint(0, 100) if brand_tier == "Fast Fashion" else np.random.randint(0, 20)

    products_data.append({
        "product_id": f"PROD-{pid:06d}",
        "product_name": f"{brand} {style} {color}",
        "category": category,
        "brand": brand,
        "brand_tier": brand_tier,
        "size": size,
        "color": color,
        "price": price,
        "stock": stock
    })

products = pd.DataFrame(products_data)

## 3. INTERACTIONS TABLE

In [28]:
num_interactions = 100_000

# Draw every column separately (NumPy draws first, then Faker draws, matching
# the order in which the generators are consumed), then assemble the frame.
interaction_ids = [f"INTER-{n:07d}" for n in range(1, num_interactions + 1)]
interaction_customers = np.random.choice(customers["customer_id"], num_interactions)
interaction_products = np.random.choice(products["product_id"], num_interactions)
event_types = np.random.choice(
    ["view", "click", "add_to_cart"],
    num_interactions,
    p=[0.6, 0.35, 0.05],
)
event_timestamps = [
    fake.date_time_between(start_date, end_date)
    for _ in range(num_interactions)
]
# Every interaction gets its own freshly generated UUID as session id.
session_ids = [fake.uuid4() for _ in range(num_interactions)]

interactions_unsorted = pd.DataFrame({
    "interaction_id": interaction_ids,
    "customer_id": interaction_customers,
    "product_id": interaction_products,
    "event_type": event_types,
    "event_timestamp": event_timestamps,
    "session_id": session_ids,
})

# Present the log chronologically with a clean 0..n-1 index.
interactions = (
    interactions_unsorted
    .sort_values("event_timestamp")
    .reset_index(drop=True)
)

## 4. TRANSACTIONS TABLE

In [32]:
num_transactions = 15_000  # author's note: 3x original ratio

# Only products that currently have stock can be purchased.
available_products = products.loc[products["stock"] > 0, "product_id"].tolist()

transactions = pd.DataFrame({
    "transaction_id": [f"TRX-{i:07d}" for i in range(1, num_transactions+1)],
    "customer_id": np.random.choice(customers["customer_id"], num_transactions),
    "product_id": np.random.choice(available_products, num_transactions),
    # randint's upper bound is exclusive: every transaction buys 1 or 2 units,
    # regardless of brand tier.
    "quantity": np.random.randint(1, 3, num_transactions),
    "purchase_date": [fake.date_time_between(start_date, end_date) for _ in range(num_transactions)],
    "payment_method": np.random.choice(
        ["Credit Card", "Debit Card", "PayPal", "Crypto"],
        num_transactions,
        p=[0.5, 0.3, 0.15, 0.05]
    )
})

# Attach the unit price of each purchased product.
# how="left" keeps exactly one output row per transaction (in the original
# order), and validate= fails loudly if product_id were ever duplicated in
# `products`; both protect the length assumption of the return_status
# assignment below.
transactions = transactions.merge(
    products[["product_id", "price"]],
    on="product_id",
    how="left",
    validate="many_to_one",
)
transactions["total_amount"] = transactions["price"] * transactions["quantity"]
transactions["return_status"] = np.random.choice([True, False], num_transactions, p=[0.05, 0.95])

# Update stock levels: subtract the total units sold per product.
# One groupby pass replaces the previous row-wise apply, which rescanned the
# whole transactions table once per product. Products with no sales get 0.
units_sold = (
    transactions.groupby("product_id")["quantity"]
    .sum()
    .reindex(products["product_id"], fill_value=0)
    .to_numpy()
)
# Sales are not capped by available stock, so clamp at zero.
# NOTE: this decrements products["stock"] in place; re-running this cell
# without regenerating `products` subtracts the sales a second time.
products["stock"] = (products["stock"] - units_sold).clip(lower=0)

## 5. INVENTORY_HISTORY TABLE

In [36]:
# Weekly snapshot calendar over the reference window; identical for every
# product, so it is computed once.
snapshot_dates = [
    stamp.date()
    for stamp in pd.date_range(start=start_date, end=end_date, freq="W")
]

# One record per (product, snapshot date). Each product gets a base stock of
# 50-99 units, and every snapshot sits 0-4 units below that base.
inventory_rows = []
for product_id in products["product_id"]:
    base_stock = np.random.randint(50, 100)
    inventory_rows.extend(
        {
            "product_id": product_id,
            "date": snapshot_date,
            "stock_level": max(0, base_stock - np.random.randint(0, 5)),
        }
        for snapshot_date in snapshot_dates
    )

inventory_history = pd.DataFrame(inventory_rows)

## 6. CUSTOMER_SEGMENTS TABLE

In [39]:
# Assign every customer a randomly drawn segment label, stamped with the end
# of the reference window as its update date.
segment_labels = ["High Spender", "Frequent Buyer", "Occasional", "Inactive"]
segment_weights = [0.1, 0.2, 0.4, 0.3]

customer_segments = customers[["customer_id"]].assign(
    segment=np.random.choice(segment_labels, num_customers, p=segment_weights),
    update_date=end_date.date(),
)


## Final Validation

In [57]:
# 0. Primary keys are unique
assert customers["customer_id"].is_unique, "duplicate customer_id in customers"
assert products["product_id"].is_unique, "duplicate product_id in products"

# 1. Foreign keys: every child table must reference existing customers/products
known_customers = customers["customer_id"]
known_products = products["product_id"]

assert transactions["product_id"].isin(known_products).all(), \
    "transactions reference an unknown product_id"
assert transactions["customer_id"].isin(known_customers).all(), \
    "transactions reference an unknown customer_id"
assert interactions["customer_id"].isin(known_customers).all(), \
    "interactions reference an unknown customer_id"
assert interactions["product_id"].isin(known_products).all(), \
    "interactions reference an unknown product_id"
assert inventory_history["product_id"].isin(known_products).all(), \
    "inventory_history references an unknown product_id"
assert customer_segments["customer_id"].isin(known_customers).all(), \
    "customer_segments references an unknown customer_id"

# 2. Non-negative stock
assert (products["stock"] >= 0).all(), "negative stock level in products"

# 3. Date validation
# Convert to date objects for proper comparison
start_date_date = start_date.date()
end_date_date = end_date.date()

# Check transactions are within range
assert (transactions["purchase_date"].dt.date >= start_date_date).all(), \
    "purchase_date before start_date"
assert (transactions["purchase_date"].dt.date <= end_date_date).all(), \
    "purchase_date after end_date"

# Check interactions are within range
assert (interactions["event_timestamp"].dt.date >= start_date_date).all(), \
    "event_timestamp before start_date"
assert (interactions["event_timestamp"].dt.date <= end_date_date).all(), \
    "event_timestamp after end_date"

# Check registration dates (allowed to start up to 730 days before start_date)
assert (customers["registration_date"] >= (start_date - timedelta(days=730)).date()).all(), \
    "registration_date earlier than the allowed window"
assert (customers["registration_date"] <= end_date_date).all(), \
    "registration_date after end_date"

print("All validations passed!")

All validations passed!


## Save to CSV

In [55]:

# Save every generated table to its own CSV file (row index omitted).
csv_outputs = {
    "customers.csv": customers,
    "products.csv": products,
    "interactions.csv": interactions,
    "transactions.csv": transactions,
    "inventory_history.csv": inventory_history,
    "customer_segments.csv": customer_segments,
}
for csv_path, table in csv_outputs.items():
    table.to_csv(csv_path, index=False)


Data generated successfully!
