In [13]:
pip install faker pandas numpy

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random


# Configuration

In [2]:
np.random.seed(42)
fake = Faker()
num_customers = 5000
num_products = 1500
num_interactions = 100000
num_transactions = 15000
start_date = datetime(2023, 1, 1)
end_date = datetime(2024, 1, 31)


## 1. CUSTOMERS TABLE

In [3]:
customers = pd.DataFrame({
    "customer_id": list(range(1, num_customers + 1)),
    "name": [fake.name() for _ in range(num_customers)],
    "age": np.random.normal(loc=35, scale=10, size=num_customers).astype(int),
    "gender": np.random.choice(["Male", "Female", "Other"], num_customers, p=[0.45, 0.45, 0.1]),
    "email": [fake.email() for _ in range(num_customers)],
    "location": [fake.country() for _ in range(num_customers)],
    "registration_date": [
        fake.date_between(
            start_date=start_date - timedelta(days=730),
            end_date=end_date.date()
        )
        for _ in range(num_customers)
    ],
    "preferred_style": np.random.choice(["Casual", "Formal", "Sport", "Bohemian", "Luxury"], num_customers, p=[0.3, 0.2, 0.2, 0.2, 0.1])
})

# Age validation
customers["age"] = customers["age"].clip(18, 80)

## 2. PRODUCTS TABLE

In [4]:

# =============================================
# 1. Master Data and Configurations
# =============================================
materials = [
    "Cotton", "Polyester", "Denim", "Silk", "Wool",
    "Leather", "Linen", "Spandex", "Cashmere", "Velvet",
    "Nylon", "Rayon", "Satin", "Chiffon", "Tweed"
]

formality_levels = ["Casual", "Office", "Party", "Formal", "Beach", "Athletic"]
seasons = ["Spring", "Summer", "Fall", "Winter", "All-Season"]

categories = {
    "Shirts": {
        "subcategories": ["T-Shirt", "Dress Shirt", "Polo", "Blouse", "Oxford"],
        "common_materials": ["Cotton", "Polyester", "Linen"]
    },
    "Pants": {
        "subcategories": ["Jeans", "Chinos", "Slacks", "Leggings", "Cargo"],
        "common_materials": ["Denim", "Cotton", "Wool"]
    },
    "Dresses": {
        "subcategories": ["Cocktail", "Maxi", "Mini", "Wedding", "Midi"],
        "common_materials": ["Silk", "Satin", "Chiffon"]
    },
    "Outerwear": {
        "subcategories": ["Jacket", "Coat", "Blazer", "Parka", "Windbreaker"],
        "common_materials": ["Wool", "Leather", "Polyester"]
    }
}

brands = {
    "Fast Fashion": ["Zara", "H&M", "Uniqlo", "Gap", "Mango"],
    "Premium": ["Levi's", "Calvin Klein", "Tommy Hilfiger", "Boss", "Diesel"],
    "Luxury": ["Gucci", "Prada", "Louis Vuitton", "Balenciaga", "Versace"]
}

# =============================================
# 2. Helper Functions
# =============================================
def generate_material_combinations(category):
    """Generate realistic material combinations based on category"""
    category_materials = categories[category]["common_materials"]
    num_materials = random.choices([1, 2, 3], weights=[0.4, 0.4, 0.2])[0]

    # 70% probability to use category-specific materials
    if random.random() < 0.7:
        base = random.sample(category_materials, min(num_materials, len(category_materials)))
    else:
        base = []

    # Add complementary materials
    complements = random.sample([m for m in materials if m not in category_materials],
                             num_materials - len(base))

    return ", ".join(base + complements)

def generate_price(brand_tier):
    """Generate realistic prices based on brand tier"""
    if brand_tier == "Luxury":
        return round(random.uniform(300, 3000) + 0.95, 2)
    elif brand_tier == "Premium":
        return round(random.uniform(80, 500) + 0.95, 2)
    else:
        return round(random.uniform(15, 120) + 0.95, 2)

# =============================================
# 3. Product Data Generation
# =============================================
products = []

for product_id in range(1, 1501):
    # Category selection
    category = random.choice(list(categories.keys()))
    subcategory = random.choice(categories[category]["subcategories"])

    # Brand tier selection
    brand_tier = random.choices(
        list(brands.keys()),
        weights=[0.6, 0.3, 0.1],
        k=1
    )[0]

    # Product construction
    product = {
        "product_id": product_id,
        "product_name": f"{random.choice(brands[brand_tier])} {subcategory}",
        "category": category,
        "subcategory": subcategory,
        "formality": random.choice(formality_levels),
        "season": random.choices(
            seasons,
            weights=[0.2, 0.3, 0.2, 0.2, 0.1],  # Higher summer probability
            k=1
        )[0],
        "materials": generate_material_combinations(category),
        "size": random.choices(
            ["XS", "S", "M", "L", "XL"],
            weights=[0.1, 0.3, 0.3, 0.2, 0.1],
            k=1
        )[0],
        "color": fake.safe_color_name().title(),
        "brand_tier": brand_tier,
        "price": generate_price(brand_tier),
        "stock": random.randint(0, 100) if brand_tier == "Fast Fashion" else random.randint(0, 20),
        "release_date": fake.date_between(
            start_date=datetime(2022, 1, 1),
            end_date=datetime(2024, 12, 31)
        ).strftime("%Y-%m-%d")
    }

    products.append(product)

# Create DataFrame
products = pd.DataFrame(products)


In [5]:
print(products.sample(3))

      product_id   product_name   category subcategory formality  season  \
905          906  Levi's Oxford     Shirts      Oxford     Party  Winter   
1489        1490    Mango Parka  Outerwear       Parka    Office  Winter   
535          536   Mango Blazer  Outerwear      Blazer     Party  Summer   

           materials size    color    brand_tier   price  stock release_date  
905   Rayon, Chiffon    L   Silver       Premium  377.26     13   2024-02-18  
1489       Polyester    M   Maroon  Fast Fashion   60.38     16   2023-02-21  
535        Polyester   XL  Fuchsia  Fast Fashion  102.60     81   2023-09-09  


## 3. INTERACTIONS TABLE

In [6]:
num_interactions = 100_000

interactions = pd.DataFrame({
    "interaction_id": list(range(1, num_interactions + 1)),
    "customer_id": np.random.choice(customers["customer_id"], num_interactions),
    "product_id": np.random.choice(products["product_id"], num_interactions),
    "event_type": np.random.choice(["view", "click", "add_to_cart"], num_interactions, p=[0.6, 0.35, 0.05]),
    "event_timestamp": [fake.date_time_between(start_date, end_date) for _ in range(num_interactions)],
    "session_id": [fake.uuid4() for _ in range(num_interactions)]
}).sort_values("event_timestamp").reset_index(drop=True)

## 4. TRANSACTIONS TABLE

In [7]:
num_transactions = 15_000  # 3x original ratio

# Select only products with stock > 0
available_products = products[products["stock"] > 0]["product_id"].tolist()

transactions = pd.DataFrame({
    "transaction_id": list(range(1, num_transactions + 1)),
    "customer_id": np.random.choice(customers["customer_id"], num_transactions),
    "product_id": np.random.choice(available_products, num_transactions),
    "quantity": np.random.randint(1, 3, num_transactions),  # Fewer items for luxury
    "purchase_date": [fake.date_time_between(start_date, end_date) for _ in range(num_transactions)],
    "payment_method": np.random.choice(
        ["Credit Card", "Debit Card", "PayPal", "Crypto"],
        num_transactions,
        p=[0.5, 0.3, 0.15, 0.05]
    )
})

# Merge with product prices
transactions = transactions.merge(products[["product_id", "price"]], on="product_id")
transactions["total_amount"] = transactions["price"] * transactions["quantity"]
transactions["return_status"] = np.random.choice([True, False], num_transactions, p=[0.05, 0.95])

# Update stock levels (vectorized operation)
products["stock"] = products.apply(
    lambda row: row["stock"] - transactions[transactions["product_id"] == row["product_id"]]["quantity"].sum(),
    axis=1
)
products["stock"] = products["stock"].clip(lower=0)

## 5. Tabla INVENTORY_HISTORY

In [8]:
# ... (previous code to generate 'products' DataFrame) ...

inventory_history = []
for product in products["product_id"]: # The 'products' DataFrame should be available here
    base_stock = np.random.randint(50, 100)
    for day in pd.date_range(start=start_date, end=end_date, freq="W"):
        inventory_history.append({
            "product_id": product,  # Use single product ID instead of repeating
            "date": day.date(),
            "stock_level": max(0, base_stock - np.random.randint(0, 5))
        })

inventory_history = pd.DataFrame(inventory_history)

In [9]:
inventory_history.head()

Unnamed: 0,product_id,date,stock_level
0,1,2023-01-01,89
1,1,2023-01-08,89
2,1,2023-01-15,88
3,1,2023-01-22,89
4,1,2023-01-29,88


## 6. Tabla CUSTOMER_SEGMENTS

In [10]:
customer_segments = pd.DataFrame({
    "customer_id": customers["customer_id"],
    "segment": np.random.choice(
        ["High Spender", "Frequent Buyer", "Occasional", "Inactive"],
        num_customers,
        p=[0.1, 0.2, 0.4, 0.3]
    ),
    "update_date": end_date.date()
})


## Final Validation

In [11]:
# 1. Foreign keys
assert transactions["product_id"].isin(products["product_id"]).all()
assert interactions["customer_id"].isin(customers["customer_id"]).all()

# 2. Non-negative stock
assert (products["stock"] >= 0).all()

# 3. Date validation
# Convert to date objects for proper comparison
start_date_date = start_date.date()
end_date_date = end_date.date()

# Check transactions are within range
assert (transactions["purchase_date"].dt.date >= start_date_date).all()
assert (transactions["purchase_date"].dt.date <= end_date_date).all()

# Check registration dates
assert (customers["registration_date"] >= (start_date - timedelta(days=730)).date()).all()
assert (customers["registration_date"] <= end_date_date).all()

print("All validations passed!")

All validations passed!


## CSV Saved

In [12]:

# Guardar en CSV
customers.to_csv("customers.csv", index=False)
products.to_csv("products.csv", index=False)
interactions.to_csv("interactions.csv", index=False)
transactions.to_csv("transactions.csv", index=False)
inventory_history.to_csv("inventory_history.csv", index=False)
customer_segments.to_csv("customer_segments.csv", index=False)
