# Mockup Dataset Generator

### Imports & Config

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
import random
from pathlib import Path

In [2]:
# One seed drives both generators used in this notebook (the stdlib `random`
# module and NumPy's global legacy RNG) so a fresh Restart & Run All
# reproduces the same mock dataset.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [12]:
# Every generated timestamp is timezone-aware in this zone.
TZ = ZoneInfo("Asia/Bangkok")

# Global simulation window; promotion windows and transaction timestamps
# are all sampled inside it.
START_DT = datetime(year=2025, month=6, day=25, hour=0, minute=0, tzinfo=TZ)
END_DT = datetime(year=2025, month=9, day=23, hour=23, minute=59, tzinfo=TZ)

# Destination folder for the exported CSV files (created if missing).
OUT_DIR = Path("./Datasets/mockup_ver2")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def fmt_ts(dt: datetime) -> str:
    """Render ``dt`` as ``DD/MM/YYYY HH:MM``.

    Seconds and timezone information are not part of the output string.
    """
    return format(dt, "%d/%m/%Y %H:%M")

### Static Master Data: Users, Products, Stores

#### Users

In [5]:
N_USERS = 1000

# Sequential, zero-padded identifiers: U0001 ... U1000.
user_ids = [f"U{n:04d}" for n in range(1, N_USERS + 1)]
users = pd.DataFrame({"user_id": user_ids})

users.head(), users.shape


(  user_id
 0   U0001
 1   U0002
 2   U0003
 3   U0004
 4   U0005,
 (1000, 1))

#### Products

In [22]:
N_PRODUCTS = 1000
N_BRANDS = 50

CATEGORIES = [
    "Beverages",
    "Snacks",
    "InstantFoods",
    "DairyBakery",
    "FrozenFoods",
    "HealthBeauty",
    "Household",
    "PersonalCare",
    "ReadyToEat",
    "Others",
]
BRANDS = [f"Brand_{n:03d}" for n in range(1, N_BRANDS + 1)]

# (low, high) base-price bounds per category, in THB. Tunable.
cat_price_ranges = {
    "Beverages": (20, 80),
    "Snacks": (15, 60),
    "InstantFoods": (25, 120),
    "DairyBakery": (25, 120),
    "FrozenFoods": (40, 180),
    "HealthBeauty": (40, 200),
    "Household": (30, 200),
    "PersonalCare": (30, 200),
    "ReadyToEat": (40, 150),
    "Others": (20, 300),
}

# Draw order matters for seeded reproducibility:
# categories first, then brands, then one price per product.
prod_cats = np.random.choice(CATEGORIES, size=N_PRODUCTS, replace=True)
prod_brands = np.random.choice(BRANDS, size=N_PRODUCTS, replace=True)

# Each product gets an integer price drawn uniformly from its category's range.
base_prices = [
    int(round(np.random.uniform(*cat_price_ranges[cat])))
    for cat in prod_cats
]

product_ids = [f"P{n:04d}" for n in range(1, N_PRODUCTS + 1)]
products = pd.DataFrame({
    "product_id": product_ids,
    "category": prod_cats,
    "brand": prod_brands,
    "base_price": base_prices,
})

products.head(), products.shape


(  product_id      category      brand  base_price
 0      P0001  PersonalCare  Brand_002         191
 1      P0002  HealthBeauty  Brand_030         191
 2      P0003        Snacks  Brand_005          41
 3      P0004     Beverages  Brand_032          58
 4      P0005   DairyBakery  Brand_034          54,
 (1000, 4))

#### Stores

In [23]:
N_STORES = 50

ZONES = list(range(1, 13))  # zone numbers 1..12
PROVINCES = [
    "กรุงเทพฯ",
    "เชียงใหม่",
    "ขอนแก่น",
    "นครราชสีมา",
    "ชลบุรี",
    "ภูเก็ต",
    "สุราษฎร์ธานี",
    "อุบลราชธานี",
    "สงขลา",
    "นนทบุรี",
]
PROFILES = [
    "Residential/Market",
    "Educational Institution",
    "Office",
    "Gas Station",
    "Factory",
    "Hospital",
    "Tourist Attraction",
    "Entertainment Venue",
    "Transport Hub",
]

# Attributes are drawn independently per store; the order of the three draws
# (zone, province, profile) is kept fixed for seeded reproducibility.
store_ids = [f"S{n:03d}" for n in range(1, N_STORES + 1)]
store_zones = np.random.choice(ZONES, size=N_STORES, replace=True)
store_provinces = np.random.choice(PROVINCES, size=N_STORES, replace=True)
store_profiles = np.random.choice(PROFILES, size=N_STORES, replace=True)

stores = pd.DataFrame({
    "store_id": store_ids,
    "zone": store_zones,
    "province": store_provinces,
    "profile": store_profiles,
})

stores.head(), stores.shape


(  store_id  zone      province              profile
 0     S001     3  สุราษฎร์ธานี   Residential/Market
 1     S002     5  สุราษฎร์ธานี             Hospital
 2     S003    10      กรุงเทพฯ          Gas Station
 3     S004     8  สุราษฎร์ธานี  Entertainment Venue
 4     S005    10        ภูเก็ต   Tourist Attraction,
 (50, 4))

#### Promotions

In [24]:
N_PROMOS = 100
PROMO_TYPES = [
    "Flash Sale",
    "Buy 1 get 1",
    "Brandday",
    "Mega Sale",
    "Product_Coupon",
]

# One promotion maps to exactly one product, and sampling without replacement
# guarantees that no product is attached to more than one promotion.
promo_products = np.random.choice(
    products["product_id"],
    size=N_PROMOS,
    replace=False,
)

# discount rules:
# - For non-B1G1 types: choose int 10..50
# - For "Buy 1 get 1": store 100 (interpreted as effective 50% when qty>=2, else 0%)
def sample_discount(ptype: str) -> int:
    """Return the discount value stored for a promotion of type ``ptype``.

    "Buy 1 get 1" promotions always store 100; every other type gets an
    integer percentage drawn uniformly from 10..50 (both ends included).
    """
    if ptype != "Buy 1 get 1":
        return int(np.random.choice(range(10, 51)))  # 10..50
    return 100

def rand_dt_between(a: datetime, b: datetime) -> datetime:
    """Return a datetime drawn uniformly from the interval starting at ``a`` and ending at ``b``.

    Parameters
    ----------
    a, b : datetime
        Interval bounds. Both must be naive or both timezone-aware, otherwise
        the subtraction ``b - a`` raises ``TypeError``.

    Returns
    -------
    datetime
        ``a`` plus a uniformly drawn number of seconds in ``[0, b - a)``.
        When ``b <= a`` the interval is empty and ``a`` itself is returned.
    """
    # Clamp the span at zero. The previous floor of one second let the result
    # overshoot ``b`` by up to a second whenever the interval was shorter
    # than that (including a == b and b < a).
    span_seconds = max((b - a).total_seconds(), 0.0)
    # One RNG draw is always consumed, even for an empty interval, so the
    # seeded random stream stays aligned with earlier runs.
    offset_seconds = np.random.uniform(0.0, span_seconds)
    return a + timedelta(seconds=offset_seconds)

# Build one row per promotion. The order of the random draws inside the loop
# (type -> discount -> short/long coin flip -> length -> start) determines the
# seeded output, so it must not be rearranged.
promo_rows = []
for i in range(N_PROMOS):
    promo_id = f"PR{i+1:04d}"
    ptype = np.random.choice(PROMO_TYPES)
    disc  = sample_discount(ptype)
    
    # 50/50 split between a short and a long promotion window
    if np.random.rand() < 0.5:
        # short: 7-14 days (randint upper bound is exclusive)
        length_days = np.random.randint(7, 15)
    else:
        # long: 30-60 days (randint upper bound is exclusive)
        length_days = np.random.randint(30, 61)
    
    # Draw the start no later than END_DT - length so the whole window fits
    # inside the global [START_DT, END_DT] range.
    latest_start = END_DT - timedelta(days=length_days)
    start_dt = rand_dt_between(START_DT, latest_start)
    end_dt   = start_dt + timedelta(days=length_days)
    # Defensive clamp: start_dt is drawn up to latest_start, so this is not
    # expected to trigger in practice.
    if end_dt > END_DT:
        end_dt = END_DT

    promo_rows.append({
        "promo_id": promo_id,
        "promo_type": ptype,
        "product_id": promo_products[i],
        "discount": disc,
        "start_date": fmt_ts(start_dt),
        "end_date": fmt_ts(end_dt),
        # Raw datetimes are kept for sampling transaction timestamps later;
        # they are dropped before display/export.
        "_start_raw": start_dt,
        "_end_raw": end_dt,
    })

promotions = pd.DataFrame(promo_rows)

# promo_id -> {promo_type, product_id, discount, _start_raw, _end_raw};
# used for fast lookups in the transaction generator.
promo_dict = promotions.set_index("promo_id")[["promo_type","product_id","discount","_start_raw","_end_raw"]].to_dict("index")

# Public view of the promotions table without the raw datetime helper columns.
promotions_display = promotions.drop(columns=["_start_raw","_end_raw"]).copy()
promotions_display.head(), promotions_display.shape


(  promo_id      promo_type product_id  discount        start_date  \
 0   PR0001  Product_Coupon      P0790        19  23/08/2025 21:09   
 1   PR0002       Mega Sale      P0277        31  13/07/2025 21:07   
 2   PR0003  Product_Coupon      P0441        25  04/08/2025 01:41   
 3   PR0004       Mega Sale      P0640        18  22/08/2025 15:31   
 4   PR0005     Buy 1 get 1      P0291       100  13/07/2025 09:56   
 
            end_date  
 0  31/08/2025 21:09  
 1  19/08/2025 21:07  
 2  11/09/2025 01:41  
 3  04/09/2025 15:31  
 4  23/08/2025 09:56  ,
 (100, 6))

### Helpers


In [25]:
def sample_day_uniform(start_dt: datetime, end_dt: datetime) -> datetime:
    """Pick a day uniformly, then pick a time within that day uniformly.

    The calendar day is drawn uniformly from ``start_dt.date()`` through
    ``end_dt.date()`` (both included), then an hour (0-23) and a minute (0-59)
    are drawn independently. The result is clamped into
    ``[start_dt, end_dt]``, so a draw on the first or last day that falls
    outside the bounds collapses onto the bound itself.

    Parameters
    ----------
    start_dt, end_dt : datetime
        Window bounds with ``start_dt <= end_dt``. Both must be naive or both
        timezone-aware.

    Returns
    -------
    datetime
        A minute-resolution datetime carrying ``start_dt``'s tzinfo, or one of
        the bounds when the draw had to be clamped.
    """
    first_day = start_dt.date()
    n_days = (end_dt.date() - first_day).days + 1

    # Draw order (day, hour, minute) is kept fixed for seeded reproducibility.
    day_offset = int(np.random.randint(0, n_days))
    hh = int(np.random.randint(0, 24))
    mm = int(np.random.randint(0, 60))
    day = first_day + timedelta(days=day_offset)

    # Build the candidate in the bounds' own timezone instead of a module-level
    # constant, so that comparing it against naive bounds or bounds in another
    # zone does not raise TypeError.
    dt = datetime(day.year, day.month, day.day, hh, mm, tzinfo=start_dt.tzinfo)

    # Clamp into the requested window.
    if dt < start_dt:
        dt = start_dt
    if dt > end_dt:
        dt = end_dt
    return dt

def sample_ts_global() -> datetime:
    """Draw one timestamp from the global window ``START_DT`` .. ``END_DT``."""
    window = (START_DT, END_DT)
    return sample_day_uniform(*window)

def sample_ts_in_window(a: datetime, b: datetime) -> datetime:
    """Draw a timestamp (uniform day, then uniform time) within ``[a..b]``.

    The bounds may be given in either order; they are sorted before sampling.
    """
    lo, hi = (b, a) if b < a else (a, b)
    return sample_day_uniform(lo, hi)

def nonpromo_price(base_price: float, qty: int) -> int:
    """Return the bill total for a purchase made without a promotion.

    The list total ``base_price * qty`` is scaled by a random factor drawn
    uniformly from [0.95, 1.10) and rounded to an integer.
    """
    list_total = base_price * qty
    jitter = np.random.uniform(0.95, 1.10)
    return int(round(list_total * jitter))

def promo_price(ptype: str, discount_pct: int, base_price: float, qty: int) -> int:
    """Return the net bill total for a purchase made under a promotion.

    For "Buy 1 get 1" the stored ``discount_pct`` is ignored: the effective
    discount is 50% when ``qty >= 2`` and 0% otherwise. Every other promotion
    type applies ``discount_pct`` directly. The discounted total is then
    scaled by a random factor drawn uniformly from [0.97, 1.03) and rounded
    to an integer.
    """
    if ptype != "Buy 1 get 1":
        eff_pct = discount_pct
    elif qty >= 2:
        eff_pct = 50
    else:
        eff_pct = 0

    list_total = base_price * qty
    discounted = list_total * (1 - eff_pct/100.0)
    jitter = np.random.uniform(0.97, 1.03)
    return int(round(discounted * jitter))


### Generate Transactions

In [26]:
# Generate two transaction logs, one user at a time:
#   * TX_records   - regular (non-promo) bills
#   * PMTX_records - bills bought under a promotion
# Rows are accumulated as dicts and turned into DataFrames in a later cell.
# NOTE: the order of the np.random calls below determines the seeded output;
# reordering statements changes the generated dataset.
TX_records = []
PMTX_records = []

# Running counters used to build the zero-padded transaction ids
tx_counter = 1
pmtx_counter = 1

# product_id -> base_price, to avoid a DataFrame lookup per transaction
prod2base = products.set_index("product_id")["base_price"].to_dict()

# Promotions are sampled uniformly over promo_id
all_promo_ids = promotions["promo_id"].tolist()

for uid in users["user_id"]:
    total_bills = np.random.randint(10, 31)  # 10..30 bills per user
    # Per-user promo share: normal(0.30, 0.08) clipped into [0.10, 0.50]
    p_promo = np.clip(np.random.normal(loc=0.30, scale=0.08), 0.10, 0.50)
    n_promo = int(round(total_bills * p_promo))
    n_nonpromo = total_bills - n_promo

    # --- Non-promo bills ---
    # Product and store are drawn uniformly; the timestamp comes from the
    # global window; qty is 1..5 (randint upper bound is exclusive).
    for _ in range(n_nonpromo):
        product_id = np.random.choice(products["product_id"])
        qty = int(np.random.randint(1, 6))
        base_price = prod2base[product_id]
        price = nonpromo_price(base_price, qty)
        ts = sample_ts_global()
        store_id = np.random.choice(stores["store_id"])
        TX_records.append({
            "transaction_id": f"TX{tx_counter:07d}",
            "user_id": uid,
            "product_id": product_id,
            "qty": qty,
            "price": price,
            "timestamp": fmt_ts(ts),
            "store_id": store_id
        })
        tx_counter += 1

    # --- Promo bills ---
    for _ in range(n_promo):
        # pick a promo uniformly and unpack its attributes
        promo_id = np.random.choice(all_promo_ids)
        pinfo = promo_dict[promo_id]
        ptype = pinfo["promo_type"]
        pprod = pinfo["product_id"]
        pdisc = pinfo["discount"]
        pstart = pinfo["_start_raw"]
        pend   = pinfo["_end_raw"]

        # the purchased product is always the promotion's own product; qty is 1..5
        product_id = pprod
        qty = int(np.random.randint(1, 6))
        base_price = prod2base[product_id]

        # timestamp within the promo window intersected with the global window
        ts = sample_ts_in_window(max(START_DT, pstart), min(END_DT, pend))

        # channel: online purchases carry no store_id
        is_online = bool(np.random.rand() < 0.15)  # 15% online
        store_id = None if is_online else np.random.choice(stores["store_id"])

        price = promo_price(ptype, pdisc, base_price, qty)

        PMTX_records.append({
            "transaction_id": f"PMTX{pmtx_counter:07d}",
            "user_id": uid,
            "product_id": product_id,  # kept so the table can be used without joining promotions
            "qty": qty,
            "price": price,            # net price after discount
            "timestamp": fmt_ts(ts),
            "is_online": is_online,
            "store_id": store_id,
            "promo_id": promo_id
        })
        pmtx_counter += 1

# Cell output: (number of non-promo bills, number of promo bills)
len(TX_records), len(PMTX_records)

(14102, 5974)

### Build DataFrames

In [27]:
# Explicit column lists pin the column order of the exported tables.
TX_COLUMNS = [
    "transaction_id",
    "user_id",
    "product_id",
    "qty",
    "price",
    "timestamp",
    "store_id",
]
PMTX_COLUMNS = [
    "transaction_id",
    "user_id",
    "product_id",
    "qty",
    "price",
    "timestamp",
    "is_online",
    "store_id",
    "promo_id",
]

transactions = pd.DataFrame(TX_records, columns=TX_COLUMNS)
promotion_transactions = pd.DataFrame(PMTX_records, columns=PMTX_COLUMNS)

# Exported promotions table: the display view, which already excludes the
# raw datetime helper columns.
promotions_final = promotions_display.copy()

transactions.head(), promotion_transactions.head(), promotions_final.head()


(  transaction_id user_id product_id  qty  price         timestamp store_id
 0      TX0000001   U0001      P0189    4    371  15/08/2025 10:33     S020
 1      TX0000002   U0001      P0110    5    311  01/08/2025 09:09     S018
 2      TX0000003   U0001      P0056    5    718  25/08/2025 13:00     S047
 3      TX0000004   U0001      P0426    5    383  02/08/2025 07:49     S027
 4      TX0000005   U0001      P0957    3    914  30/08/2025 01:21     S036,
   transaction_id user_id product_id  qty  price         timestamp  is_online  \
 0    PMTX0000001   U0001      P0650    5    266  28/07/2025 09:14      False   
 1    PMTX0000002   U0001      P0269    4     87  18/09/2025 02:11      False   
 2    PMTX0000003   U0001      P0259    3    172  19/08/2025 23:34      False   
 3    PMTX0000004   U0001      P0812    5    389  11/09/2025 14:48      False   
 4    PMTX0000005   U0001      P0079    3    248  22/08/2025 21:42      False   
 
   store_id promo_id  
 0     S028   PR0030  
 1     S0

In [28]:
# File name -> table, written in this order. "utf-8-sig" prepends a BOM to
# each CSV file.
csv_exports = {
    "users.csv": users,
    "products.csv": products,
    "stores.csv": stores,
    "promotions.csv": promotions_final,
    "transactions.csv": transactions,
    "promotion_transactions.csv": promotion_transactions,
}
for filename, frame in csv_exports.items():
    frame.to_csv(OUT_DIR / filename, index=False, encoding="utf-8-sig")

print("Saved to:", OUT_DIR.resolve())

Saved to: E:\Double-CAI\Notebooks\Datasets\mockup_ver2
