In [146]:
import pandas as pd
import numpy as np
import math

In [147]:
import os

# Show the kernel's working directory; the dataset paths used further down
# are relative, so they resolve against this location.
print(f"Current working dir: {os.getcwd()}")


Current working dir: e:\Double-CAI\Notebooks


In [148]:
# Folder holding the mock-up CSV files (relative to the working directory).
path = "Datasets/mockup_ver2/"

# Read every source table in one pass; each variable is loaded from
# "<path><table>.csv", in the same order as the names on the left.
products, promotions, transactions, promotion_transactions, stores, users = (
    pd.read_csv(f"{path}{table}.csv")
    for table in (
        "products",
        "promotions",
        "transactions",
        "promotion_transactions",
        "stores",
        "users",
    )
)


#### Data info

In [149]:
# Print dtypes and non-null counts of `products` to sanity-check the load.
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   product_id  1000 non-null   object
 1   category    1000 non-null   object
 2   brand       1000 non-null   object
 3   base_price  1000 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 31.4+ KB


In [150]:
# Print dtypes and non-null counts of `promotions`.
# NOTE(review): start_date / end_date show up as object (unparsed strings) in
# the recorded output; parse them before any date-based logic.
promotions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   promo_id    100 non-null    object
 1   promo_type  100 non-null    object
 2   product_id  100 non-null    object
 3   discount    100 non-null    int64 
 4   start_date  100 non-null    object
 5   end_date    100 non-null    object
dtypes: int64(1), object(5)
memory usage: 4.8+ KB


In [151]:
# Print dtypes and non-null counts of `transactions`.
# In the recorded output this table has no `is_online` column, which is why
# the merge step adds one with a default of False.
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14102 entries, 0 to 14101
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   transaction_id  14102 non-null  object
 1   user_id         14102 non-null  object
 2   product_id      14102 non-null  object
 3   qty             14102 non-null  int64 
 4   price           14102 non-null  int64 
 5   timestamp       14102 non-null  object
 6   store_id        14102 non-null  object
dtypes: int64(2), object(5)
memory usage: 771.3+ KB


In [152]:
# Print dtypes and non-null counts of `promotion_transactions`.
# In the recorded output `store_id` is the only column with missing values
# (presumably the online orders -- TODO confirm against `is_online`).
promotion_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5974 entries, 0 to 5973
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   transaction_id  5974 non-null   object
 1   user_id         5974 non-null   object
 2   product_id      5974 non-null   object
 3   qty             5974 non-null   int64 
 4   price           5974 non-null   int64 
 5   timestamp       5974 non-null   object
 6   is_online       5974 non-null   bool  
 7   store_id        5076 non-null   object
 8   promo_id        5974 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 379.3+ KB


In [153]:
# Print dtypes and non-null counts of `stores`.
# Note that `zone` is numeric (int64) in the recorded output.
stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   store_id  50 non-null     object
 1   zone      50 non-null     int64 
 2   province  50 non-null     object
 3   profile   50 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.7+ KB


## Merge


In [154]:
# Enrich the plain (non-promotion) transactions with product and store
# attributes. `validate` makes pandas raise if product_id / store_id is not
# unique on the lookup side. These rows used no promotion, so
# `promo_id_used` is left missing.
tx_base = (
    transactions
    .merge(products, how="left", on="product_id", validate="many_to_one")
    .merge(stores, how="left", on="store_id", validate="many_to_one")
    .assign(promo_id_used=pd.NA)
)

# If the transactions table has no is_online flag, default it to False
# (or substitute your own business rule).
if "is_online" not in tx_base.columns:
    tx_base = tx_base.assign(is_online=False)

In [155]:
# Enrich the promotion transactions the same way as the plain ones; the
# promotion actually redeemed is kept under the name `promo_id_used`.
pm_base = (
    promotion_transactions
    .merge(products, how="left", on="product_id", validate="many_to_one")
    .merge(stores, how="left", on="store_id", validate="many_to_one")
    .rename(columns={"promo_id": "promo_id_used"})
)

if "is_online" not in pm_base.columns:
    pm_base = pm_base.assign(is_online=False)

# ---- Prepare promotions for a product-level join (dates are not used) ----
# start_date / end_date are carried along when present so they can be
# inspected later, but they are never used as a filter.
keep_promo_cols = ["promo_id", "promo_type", "product_id_y", "discount"]
keep_promo_cols += [
    date_col
    for date_col in ("start_date", "end_date")
    if date_col in promotions.columns
]
promos_prod = promotions.rename(columns={"product_id": "product_id_y"})[keep_promo_cols]

In [156]:
# ---- Candidate rows for plain transactions ----
# Pair every transaction with each promotion defined on the same product.
# No promotion was used on these rows, so every candidate is a negative.
cand_tx = pd.merge(
    tx_base,
    promos_prod,
    how="left",
    left_on="product_id",
    right_on="product_id_y",
    validate="many_to_many",
).assign(target=0)

# ---- Candidate rows for promotion transactions: exact product_id match,
# ---- no date filtering. The candidate equal to the promotion that was
# ---- actually used is the positive (target = 1); all others are 0.
cand_pm = pd.merge(
    pm_base,
    promos_prod,
    how="left",
    left_on="product_id",
    right_on="product_id_y",
    validate="many_to_many",
).assign(target=lambda pairs: pairs["promo_id"].eq(pairs["promo_id_used"]).astype(int))

In [157]:
def finalize_cols(df):
    """Return a copy of a candidate frame with the final column layout.

    Steps:
      * copy the transaction-side ``product_id`` into ``product_id_x``;
      * for rows flagged ``is_online == True``, replace missing ``zone`` /
        ``province`` / ``profile`` values with the string ``"Online"``;
      * select the output columns in a fixed order (``start_date`` and
        ``end_date`` are included only when present).

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate frame (transaction columns joined with promotion columns).
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame restricted to the final columns.

    Raises
    ------
    KeyError
        If ``product_id`` or any mandatory output column is missing.
    """
    out = df.copy()
    out["product_id_x"] = out["product_id"]  # transaction-side product id

    # Build a real boolean Series as the row mask. A scalar fallback (what
    # ``df.get("is_online", False) == True`` collapses to when the column is
    # absent) is not a valid row selector for ``.loc``.
    if "is_online" in out.columns:
        online_mask = out["is_online"] == True  # noqa: E712 - element-wise compare
    else:
        online_mask = pd.Series(False, index=out.index)

    # zone / province / profile are empty for online rows: label them "Online".
    for col in ("zone", "province", "profile"):
        if col not in out.columns:
            continue
        needs_fill = online_mask & out[col].isna()
        if needs_fill.any():
            # ``zone`` is numeric; writing a string into a numeric column is an
            # incompatible-dtype assignment (FutureWarning today, an error in
            # newer pandas), so widen to object explicitly first.
            out[col] = out[col].astype(object)
            out.loc[needs_fill, col] = "Online"

    cols = [
        "transaction_id", "user_id", "product_id_x", "qty", "price", "timestamp", "store_id",
        "is_online",
        "promo_id", "category", "brand", "base_price", "zone", "province", "profile",
        "promo_type", "product_id_y", "discount",
    ]
    # start/end dates are kept for inspection only when available; they are
    # not used for filtering.
    cols += [date_col for date_col in ("start_date", "end_date") if date_col in out.columns]
    cols += ["promo_id_used", "target"]
    return out[cols]


# Bring both candidate frames to the shared final column layout.
# NOTE(review): this rebinds cand_tx / cand_pm, so the cell cannot be re-run
# on its own -- the finalized frames no longer carry a `product_id` column.
cand_tx, cand_pm = (finalize_cols(frame) for frame in (cand_tx, cand_pm))

 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online' 'Online'
 'Online' 'Online' 'Online' 'Online' 'Online' 'Onli

In [158]:
# Stack the negative-only candidates (cand_tx) on top of the promotion
# candidates (cand_pm) into one table.
binary_per_candidate = pd.concat([cand_tx, cand_pm], ignore_index=True)

# Keep one row per (transaction, candidate promotion).
# NOTE(review): with keep="first" the cand_tx row wins if the same key exists
# in both frames -- confirm transaction ids never overlap between the two
# sources, otherwise a target=1 row could be dropped here.
binary_per_candidate = binary_per_candidate.drop_duplicates(
    subset=["transaction_id", "promo_id"], keep="first"
)

# Deterministic ordering and a clean 0..n-1 index.
binary_per_candidate = binary_per_candidate.sort_values(
    by=["transaction_id", "promo_id"]
).reset_index(drop=True)

In [159]:
# Print dtypes and non-null counts of the combined candidate table.
# In the recorded output promo_id / promo_type / product_id_y / discount are
# mostly missing: these are transactions whose product matched no promotion.
binary_per_candidate.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20076 entries, 0 to 20075
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  20076 non-null  object 
 1   user_id         20076 non-null  object 
 2   product_id_x    20076 non-null  object 
 3   qty             20076 non-null  int64  
 4   price           20076 non-null  int64  
 5   timestamp       20076 non-null  object 
 6   store_id        19178 non-null  object 
 7   is_online       20076 non-null  bool   
 8   promo_id        7400 non-null   object 
 9   category        20076 non-null  object 
 10  brand           20076 non-null  object 
 11  base_price      20076 non-null  int64  
 12  zone            20076 non-null  object 
 13  province        20076 non-null  object 
 14  profile         20076 non-null  object 
 15  promo_type      7400 non-null   object 
 16  product_id_y    7400 non-null   object 
 17  discount        7400 non-null  

In [161]:
# Persist the candidate table next to the source CSVs (row index not written).
binary_per_candidate.to_csv("Datasets/mockup_ver2/binary_per_candidate.csv", index=False)

In [162]:
# Inspection copy: transaction_id ascending, and within each transaction the
# positive candidate (target = 1) listed first.
view = (
    binary_per_candidate
    .sort_values(["transaction_id", "target"], ascending=[True, False])
    .reset_index(drop=True)
)

view.to_csv("Datasets/mockup_ver2/binary_per_candidate_sorted.csv", index=False)