In [1]:
pip install pandas numpy faker

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import pandas as pd
import numpy as np
from faker import Faker
import random
import os
from datetime import timedelta

fake = Faker()
np.random.seed(42)
random.seed(42)

# Create output folder
OUTPUT_DIR = "synthetic_olist_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------------------
# CONFIG
# ---------------------------
NUM_CUSTOMERS = 1000
NUM_ORDERS = 1000
NUM_PRODUCTS = 80

# ---------------------------
# CUSTOMERS
# ---------------------------
customers = []

for _ in range(NUM_CUSTOMERS):
    customers.append({
        "customer_id": fake.uuid4(),
        "customer_unique_id": fake.uuid4(),
        "customer_zip_code_prefix": random.randint(10000, 99999),
        "customer_city": fake.city(),
        "customer_state": fake.state_abbr()
    })

customers_df = pd.DataFrame(customers)

# ---------------------------
# PRODUCTS
# ---------------------------
categories = ["electronics", "furniture", "home_appliances", "fashion", "office"]

products = []

for _ in range(NUM_PRODUCTS):
    products.append({
        "product_id": fake.uuid4(),
        "product_category_name": random.choice(categories),
        "product_name_lenght": random.randint(20, 60),
        "product_description_lenght": random.randint(100, 800),
        "product_photos_qty": random.randint(1, 5),
        "product_weight_g": random.randint(300, 5000),
        "product_length_cm": random.randint(10, 100),
        "product_height_cm": random.randint(5, 50),
        "product_width_cm": random.randint(10, 100)
    })

products_df = pd.DataFrame(products)

# ---------------------------
# ORDERS
# ---------------------------
orders = []

order_statuses = [
    "delivered", "shipped", "canceled",
    "invoiced", "processing"
]

for _ in range(NUM_ORDERS):
    purchase_date = fake.date_between(start_date="-6M", end_date="today")
    approved_date = purchase_date + timedelta(days=random.randint(0, 2))
    delivered_date = approved_date + timedelta(days=random.randint(2, 10))

    orders.append({
        "order_id": fake.uuid4(),
        "customer_id": random.choice(customers_df["customer_id"]),
        "order_status": random.choice(order_statuses),
        "order_purchase_timestamp": purchase_date,
        "order_approved_at": approved_date,
        "order_delivered_carrier_date": approved_date + timedelta(days=1),
        "order_delivered_customer_date": delivered_date,
        "order_estimated_delivery_date": delivered_date + timedelta(days=2)
    })

orders_df = pd.DataFrame(orders)

# ---------------------------
# ORDER ITEMS (3–4 per order)
# ---------------------------
order_items = []

for _, order in orders_df.iterrows():
    num_items = random.randint(3, 4)
    selected_products = products_df.sample(num_items)

    for i, (_, product) in enumerate(selected_products.iterrows(), start=1):
        order_items.append({
            "order_id": order["order_id"],
            "order_item_id": i,
            "product_id": product["product_id"],
            "seller_id": fake.uuid4(),
            "shipping_limit_date": order["order_purchase_timestamp"] + timedelta(days=3),
            "price": round(random.uniform(50, 5000), 2),
            "freight_value": round(random.uniform(10, 300), 2)
        })

order_items_df = pd.DataFrame(order_items)

# ---------------------------
# PAYMENTS (1–2 per order)
# ---------------------------
payments = []

payment_types = ["credit_card", "upi", "debit_card", "voucher"]

for _, order in orders_df.iterrows():
    num_payments = random.choice([1, 1, 2])
    total_value = order_items_df[
        order_items_df["order_id"] == order["order_id"]
    ]["price"].sum()

    split_values = np.random.dirichlet(np.ones(num_payments)) * total_value

    for seq, value in enumerate(split_values, start=1):
        payments.append({
            "order_id": order["order_id"],
            "payment_sequential": seq,
            "payment_type": random.choice(payment_types),
            "payment_installments": random.randint(1, 6),
            "payment_value": round(value, 2)
        })

payments_df = pd.DataFrame(payments)

# ---------------------------
# SAVE CSV FILES
# ---------------------------
customers_df.to_csv(f"{OUTPUT_DIR}/olist_customers_dataset.csv", index=False)
products_df.to_csv(f"{OUTPUT_DIR}/olist_products_dataset.csv", index=False)
orders_df.to_csv(f"{OUTPUT_DIR}/olist_orders_dataset.csv", index=False)
order_items_df.to_csv(f"{OUTPUT_DIR}/olist_order_items_dataset.csv", index=False)
payments_df.to_csv(f"{OUTPUT_DIR}/olist_order_payments_dataset.csv", index=False)

print("✅ Synthetic Olist-style datasets generated successfully!")


✅ Synthetic Olist-style datasets generated successfully!


In [6]:
customers_d.head()

NameError: name 'customers_d' is not defined