In [106]:
from pathlib import Path
import numpy as np
import pandas as pd

# ML
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb

from pathlib import Path

In [107]:
BASE = Path("Datasets/mockup_ver2/")

tx_merge = pd.read_csv(BASE/"tx_merge3.csv") 
promotions = pd.read_csv(BASE/"promotions.csv", parse_dates=["start_date","end_date"])

promos_df = promotions.copy()
df = tx_merge.copy()
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

  tx_merge = pd.read_csv(BASE/"tx_merge3.csv")
  promotions = pd.read_csv(BASE/"promotions.csv", parse_dates=["start_date","end_date"])
  promotions = pd.read_csv(BASE/"promotions.csv", parse_dates=["start_date","end_date"])


In [108]:
import lightgbm as lgb
HAS_LGB = True

SEED = 42
NEED_K = 8
PCA_K  = 30
TOPK_TYPES = 2
REL_TH = 0.30
MAX_CANDS = 40

# ---- Columns (อิงจาก tx_merge2.csv ของคุณ) ----
COL_TX   = "transaction_id"
COL_USER = "user_id"
COL_PROD = "product_id"
COL_QTY  = "qty"
COL_PRICE= "price"

COL_CAT   = "products.category"
COL_BRAND = "products.brand"
COL_TS    = "timestamp"
COL_STORE = "store_id"
COL_ONLINE= "is_online"

COL_ORDER_H = "order_hour"
COL_DOW     = "dayofweek"
COL_MONTH   = "month"
COL_DAY     = "day"
COL_WOY     = "weekofyear"
COL_QUARTER = "quarter"
COL_IS_WKD  = "is_weekend"
COL_THAI_SEAS = "thai_season"
COL_IN_FEST   = "InFestival"

COL_WKD_BOOST = "weekday_boost"
COL_WKE_BOOST = "weekend_boost"
COL_FES_BOOST = "festival_boost"
COL_PEAKS     = "peaks_encoded"
COL_HOUR_W    = "hour_weight"
COL_LOYALTY   = "loyalty_score"
COL_EXPECT    = "expected_basket_items"
COL_ELAS      = "price_elasticity"
COL_SEGMENT   = "segment"

# ถ้าใช้ label จาก tx_merge โดยตรง:
LABEL_COL_IN_TX = "promotions.promo_type"

In [109]:
rename_map = {}
if "promotions.promo_type" in promos_df.columns:
    rename_map["promotions.promo_type"] = "promo_type"
if "promotion_category" in promos_df.columns and "promo_type" not in promos_df.columns:
    rename_map["promotion_category"] = "promo_type"
if "promotion_type" in promos_df.columns and "promo_type" not in promos_df.columns:
    rename_map["promotion_type"] = "promo_type"
if "scope" in promos_df.columns and "product_scope" not in promos_df.columns:
    rename_map["scope"] = "product_scope"

promos_df = promos_df.rename(columns=rename_map)

# เติมคอลัมน์ที่ขาดด้วยค่า default ปลอดภัย
defaults = {
    "promo_id": "__UNK__",
    "promo_type": "Unknown",
    "product_scope": "",
    "is_online": 1,
    "start_date": pd.Timestamp("2000-01-01"),
    "end_date":   pd.Timestamp("2100-01-01"),
    "est_margin": 0.0
}
for c, d in defaults.items():
    if c not in promos_df.columns:
        promos_df[c] = d

# final check
need_cols = ["promo_id","promo_type","product_scope","is_online","start_date","end_date","est_margin"]
missing = [c for c in need_cols if c not in promos_df.columns]
assert not missing, f"promos_df ขาดคอลัมน์: {missing}"

# แปลงวันที่ (กัน type ผิด)
promos_df["start_date"] = pd.to_datetime(promos_df["start_date"], errors="coerce")
promos_df["end_date"]   = pd.to_datetime(promos_df["end_date"], errors="coerce")

In [110]:
agg = {}
if COL_PROD in df.columns: agg[COL_PROD] = "nunique"
if COL_QTY  in df.columns: agg[COL_QTY]  = "sum"
if COL_PRICE in df.columns and COL_QTY in df.columns:
    df["_revenue"] = df[COL_PRICE].fillna(0) * df[COL_QTY].fillna(0)
    agg["_revenue"] = "sum"
elif COL_PRICE in df.columns:
    agg[COL_PRICE] = "sum"

basket = (
    df.groupby(COL_TX).agg(agg)
      .rename(columns={COL_PROD: "basket_unique_items"})
      .reset_index()
)

evt = df.groupby(COL_TX)[COL_TS].min().rename("event_time").reset_index()
basket = basket.merge(evt, on=COL_TX, how="left")

# context ที่มีอยู่แล้วในไฟล์
context_cols = [
    COL_STORE, COL_ONLINE,
    COL_ORDER_H, COL_DOW, COL_MONTH, COL_DAY, COL_WOY, COL_QUARTER,
    COL_IS_WKD, COL_THAI_SEAS, COL_IN_FEST,
    COL_WKD_BOOST, COL_WKE_BOOST, COL_FES_BOOST, COL_PEAKS, COL_HOUR_W,
    COL_LOYALTY, COL_EXPECT, COL_ELAS, COL_SEGMENT
]
for c in context_cols:
    if c in df.columns:
        first = df.groupby(COL_TX)[c].first().reset_index()
        basket = basket.merge(first, on=COL_TX, how="left")

# multi-hot: k=category/brand proportions
def crosstab_prop(frame, key, val, prefix):
    if val not in frame.columns:
        return pd.DataFrame({key: frame[key].unique()})
    ct = pd.crosstab(frame[key], frame[val])
    if ct.empty:
        return pd.DataFrame({key: frame[key].unique()})
    prop = ct.div(ct.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)
    prop.columns = [f"{prefix}={c}" for c in prop.columns]
    return prop.reset_index()

cat_prop   = crosstab_prop(df, COL_TX, COL_CAT,   "cat")
brand_prop = crosstab_prop(df, COL_TX, COL_BRAND, "brand")
basket = basket.merge(cat_prop, on=COL_TX, how="left").merge(brand_prop, on=COL_TX, how="left")

if COL_ONLINE in basket.columns:
    basket[COL_ONLINE] = basket[COL_ONLINE].astype(int)

comp_cols = [c for c in basket.columns if c.startswith("cat=") or c.startswith("brand=")]
num_cols = [
    "basket_unique_items", COL_QTY, "_revenue", COL_PRICE,
    COL_ORDER_H, COL_DOW, COL_MONTH, COL_DAY, COL_WOY, COL_QUARTER,
    COL_IS_WKD, COL_THAI_SEAS, COL_IN_FEST, COL_WKD_BOOST, COL_WKE_BOOST, COL_FES_BOOST,
    COL_PEAKS, COL_HOUR_W, COL_LOYALTY, COL_EXPECT, COL_ELAS
]
num_cols = [c for c in num_cols if c in basket.columns]

FEATURE_COLS = num_cols + ([COL_ONLINE] if COL_ONLINE in basket.columns else []) + comp_cols
basket_feat = basket.copy()

# sanity print
print("basket_feat shape:", basket_feat.shape)
print("num FEATURES:", len(FEATURE_COLS))

basket_feat shape: (19178, 85)
num FEATURES: 81


In [111]:
def get_top_types(probs, classes, k=2, ensure_non_nopromo=2, nopromo_label="NoPromo"):
    """
    เลือกประเภทโปรฯ สำหรับ recall: บังคับให้มีอย่างน้อย ensure_non_nopromo ประเภทที่ไม่ใช่ NoPromo
    แล้วค่อยเติม NoPromo ในลิสต์ (ถ้าจำเป็น)
    """
    order = np.argsort(probs)[::-1]
    cls_order = [classes[i] for i in order]

    non_np = [c for c in cls_order if c != nopromo_label]
    top_non_np = non_np[:max(ensure_non_nopromo, 1)]

    merged, seen = [], set()
    for c in top_non_np + cls_order:
        if c not in seen:
            merged.append(c); seen.add(c)
        if len(merged) >= k + 1:  # เผื่อ 1 ช่องให้ NoPromo
            break

    if nopromo_label not in merged:
        merged.append(nopromo_label)

    return merged[:k+1]


In [112]:
# %% Need-state discovery (fixed: auto-encode non-numeric) 
from sklearn.metrics import silhouette_score

# ทำ one-hot ให้ทุกคอลัมน์ที่เป็น object/category (กัน error 'Rainy')
X_df = basket_feat[FEATURE_COLS].copy()

# bool -> int
bool_cols = X_df.select_dtypes(include=["bool"]).columns
if len(bool_cols):
    X_df[bool_cols] = X_df[bool_cols].astype(int)

obj_cols = X_df.select_dtypes(include=["object", "category"]).columns
if len(obj_cols):
    X_df = pd.get_dummies(X_df, columns=obj_cols, dummy_na=True)

X = X_df.fillna(0.0).astype(float).values

# Scale + PCA
sc = StandardScaler()
Xs = sc.fit_transform(X)

pca = PCA(n_components=min(PCA_K, Xs.shape[1]), random_state=SEED)
Xp  = pca.fit_transform(Xs)

# KMeans
mbk = MiniBatchKMeans(n_clusters=NEED_K, random_state=SEED, batch_size=4096, n_init=10)
labels = mbk.fit_predict(Xp)
basket_feat["need_state_cluster"] = labels

# silhouette (sample)
try:
    idx = np.random.RandomState(SEED).choice(len(Xp), size=min(5000, len(Xp)), replace=False)
    sil = silhouette_score(Xp[idx], labels[idx])
except Exception:
    sil = np.nan
print(f"Silhouette(sample): {sil:.3f}")

# profiling
prof_cols = [
    "basket_unique_items", COL_QTY, COL_PRICE, "_revenue",
    COL_ORDER_H, COL_DOW, COL_IS_WKD, COL_THAI_SEAS, COL_IN_FEST,
    COL_WKD_BOOST, COL_WKE_BOOST, COL_FES_BOOST, COL_HOUR_W,
    COL_LOYALTY, COL_EXPECT, COL_ELAS
]
prof_cols = [c for c in prof_cols if c in basket_feat.columns]

def top_components(df_in, key, cols, n=8):
    rows = []
    for k, grp in df_in.groupby(key):
        sums = grp[cols].sum().sort_values(ascending=False)
        rows.append({key: k, "top_components": "; ".join([f"{c}:{sums[c]:.1f}" for c in sums.index[:n]])})
    return pd.DataFrame(rows)

comp_cols = [c for c in basket_feat.columns if c.startswith("cat=") or c.startswith("brand=")]
prof = (
    basket_feat.groupby("need_state_cluster")[prof_cols]
    .mean(numeric_only=True).round(3).reset_index()
)
topc = top_components(basket_feat, "need_state_cluster", comp_cols, n=8) if comp_cols else pd.DataFrame(columns=["need_state_cluster","top_components"])

need_profile = prof.merge(topc, on="need_state_cluster", how="left")
need_profile.insert(1, "count", basket_feat.groupby("need_state_cluster")[COL_TX].nunique().values)
need_profile.insert(2, "share_pct", (need_profile["count"]/need_profile["count"].sum()*100).round(2))

need_profile.head(10)



Silhouette(sample): 0.081


Unnamed: 0,need_state_cluster,count,share_pct,basket_unique_items,qty,_revenue,order_hour,dayofweek,is_weekend,InFestival,weekday_boost,weekend_boost,festival_boost,hour_weight,loyalty_score,expected_basket_items,price_elasticity,top_components
0,0,5697,29.71,1.0,2.906,870.099,11.645,3.034,0.286,0.079,1.0,0.892,0.98,1.0,0.923,2.989,0.01,cat=Snacks:718.0; cat=HealthBeauty:694.0; cat=...
1,1,2286,11.92,1.0,2.961,895.44,11.461,3.029,0.294,0.087,1.0,1.0,1.0,0.995,0.924,2.989,0.007,cat=ReadyToEat:294.0; cat=Snacks:277.0; cat=He...
2,2,3179,16.58,1.0,2.928,823.406,11.306,3.014,0.282,0.074,1.0,1.014,1.015,0.992,0.924,2.989,-0.008,cat=ReadyToEat:434.0; cat=Snacks:397.0; cat=He...
3,3,3344,17.44,1.0,2.936,869.68,11.576,3.045,0.295,0.081,1.0,1.05,1.05,1.004,0.923,2.989,0.007,cat=ReadyToEat:454.0; cat=Snacks:442.0; cat=In...
4,4,1348,7.03,1.0,3.649,2602.58,11.456,2.832,0.26,0.083,1.0,0.986,1.015,0.994,0.924,2.99,0.028,cat=Others:1181.0; brand=Brand_028:172.0; bran...
5,5,632,3.3,1.0,3.111,1130.049,11.601,2.968,0.291,0.076,1.0,0.989,1.021,0.988,0.922,2.99,0.004,brand=Brand_013:632.0; cat=Household:146.0; ca...
6,6,521,2.72,1.0,2.94,842.697,11.388,2.917,0.271,0.075,1.0,0.991,1.02,0.992,0.924,2.989,0.013,brand=Brand_040:521.0; cat=Household:172.0; ca...
7,7,2171,11.32,1.0,2.961,930.578,11.246,2.953,0.275,0.068,1.0,1.1,1.1,0.997,0.923,2.989,0.008,cat=HealthBeauty:259.0; cat=ReadyToEat:249.0; ...


In [113]:
# เตรียม label ต่อธุรกรรมจาก tx_merge โดยตรง (ถ้าไม่มี ใช้วิธี join ผ่าน promo_id แทน)
if LABEL_COL_IN_TX not in tx_merge.columns:
    raise ValueError(f"ไม่พบ {LABEL_COL_IN_TX} ใน tx_merge")

label_df = (
    tx_merge.groupby(COL_TX)[LABEL_COL_IN_TX].first().reset_index()
    .rename(columns={LABEL_COL_IN_TX:"used_type"})
)
label_df["used_type"] = label_df["used_type"].fillna("NoPromo")

data_ptype = basket_feat.merge(label_df, on=COL_TX, how="left")
data_ptype["used_type"] = data_ptype["used_type"].fillna("NoPromo")

# one-hot ฟีเจอร์สำหรับทั้งชุด → คอลัมน์จะตรงกันแน่นอน
X_all = data_ptype[FEATURE_COLS].copy()

bool_cols = X_all.select_dtypes(include=["bool"]).columns
if len(bool_cols):
    X_all[bool_cols] = X_all[bool_cols].astype(int)

obj_cols = X_all.select_dtypes(include=["object","category"]).columns
if len(obj_cols):
    X_all = pd.get_dummies(X_all, columns=obj_cols, dummy_na=True)

X_all = X_all.fillna(0.0).astype(float)

# split ตามเวลา
if "event_time" in data_ptype.columns and data_ptype["event_time"].notna().any():
    data_ptype = data_ptype.sort_values("event_time")
    X_all = X_all.loc[data_ptype.index]
    cut = int(len(data_ptype)*0.8)
    tr_idx = data_ptype.index[:cut]
    va_idx = data_ptype.index[cut:]
else:
    tr_idx, va_idx = train_test_split(
        data_ptype.index, test_size=0.2, random_state=SEED, stratify=data_ptype["used_type"]
    )

Xtr = X_all.loc[tr_idx].values
Xva = X_all.loc[va_idx].values
ytr = data_ptype.loc[tr_idx, "used_type"].values
yva = data_ptype.loc[va_idx, "used_type"].values

classes = np.unique(data_ptype["used_type"].values)
class_to_idx = {c:i for i,c in enumerate(classes)}
ytr_idx = np.array([class_to_idx[c] for c in ytr])
yva_idx = np.array([class_to_idx[c] for c in yva])

# base model + calibration (รองรับหลายเวอร์ชัน sklearn)
if HAS_LGB:
    base = lgb.LGBMClassifier(
        objective="multiclass",
        num_class=len(classes),
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=SEED
    )
else:
    base = GradientBoostingClassifier(random_state=SEED)

try:
    ptype_model = CalibratedClassifierCV(estimator=base, method="sigmoid", cv=3)
except TypeError:
    ptype_model = CalibratedClassifierCV(base_estimator=base, method="sigmoid", cv=3)

ptype_model.fit(Xtr, ytr_idx)
pred = ptype_model.predict(Xva)
print("Validation report (P(type|X))")
print(classification_report(yva_idx, pred, target_names=list(classes)))

ptype_classes  = list(classes)
ptype_featcols = list(X_all.columns)  # สำคัญ: ใช้ตอน inference ต้อง align คอลัมน์ชุดนี้


Validation report (P(type|X))
                precision    recall  f1-score   support

      Brandday       0.00      0.00      0.00         0
   Buy 1 get 1       0.00      0.00      0.00         0
    Flash Sale       0.00      0.00      0.00         0
     Mega Sale       0.00      0.00      0.00         0
       NoPromo       1.00      0.95      0.97      3836
Product_Coupon       0.00      0.00      0.00         0

      accuracy                           0.95      3836
     macro avg       0.17      0.16      0.16      3836
  weighted avg       1.00      0.95      0.97      3836



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [114]:
def encode_features_for_ptype(row_series, raw_feature_cols, feat_cols_all):
    row_df = pd.DataFrame([row_series[raw_feature_cols]])
    # bool -> int
    bool_cols = row_df.select_dtypes(include=["bool"]).columns
    if len(bool_cols):
        row_df[bool_cols] = row_df[bool_cols].astype(int)
    # one-hot สำหรับ object/category
    obj_cols = row_df.select_dtypes(include=["object","category"]).columns
    if len(obj_cols):
        row_df = pd.get_dummies(row_df, columns=obj_cols, dummy_na=True)
    # align columns
    for c in feat_cols_all:
        if c not in row_df.columns:
            row_df[c] = 0.0
    row_df = row_df[feat_cols_all].fillna(0.0).astype(float)
    return row_df.values  # shape (1, d)

def eligibility_filter(promos_df, context_row, now):
    out = promos_df.copy()
    if "start_date" in out.columns:
        out["start_date"] = pd.to_datetime(out["start_date"], errors="coerce")
    if "end_date" in out.columns:
        out["end_date"] = pd.to_datetime(out["end_date"], errors="coerce")
    if "is_online" in out.columns and COL_ONLINE in context_row.index:
        out = out[out["is_online"] == int(context_row[COL_ONLINE])]
    if "start_date" in out.columns and "end_date" in out.columns and pd.notna(now):
        out = out[(out["start_date"] <= now) & (now <= out["end_date"])]
    return out

# แทนที่ฟังก์ชันเดิมทั้งก้อน
def simple_scope_relevance(basket_row, promo_row):
    """
    คำนวณความเกี่ยวข้องระหว่างโปรกับตะกร้า
    - ถ้า product_scope มี category/code: วัด Jaccard กับ cat=... ในบิล
    - ถ้า scope ว่าง: ลดน้ำหนักลง ตามความนิยมของหมวดในบิล (ไม่ใช่ 0.5 ตายตัว)
    """
    scope_raw = str(promo_row.get("product_scope", "") or "").strip().lower()
    # ดึงหมวดในบิล (จากฟีเจอร์ cat=... ที่เป็นสัดส่วน)
    basket_cats = {col.split("cat=")[1].lower() for col in basket_row.index
                   if isinstance(col, str) and col.startswith("cat=") and float(basket_row[col]) > 0}

    if not basket_cats:
        return 0.15  # ไม่มีสัดส่วนหมวด → ให้ต่ำหน่อย

    # เคสมี scope → tokenize เป็นชุดคำ (รองรับ comma, ;, space)
    if scope_raw:
        sep = [",",";","|","/"]
        for s in sep: scope_raw = scope_raw.replace(s, " ")
        scope_set = {tok for tok in scope_raw.split() if tok}
        if not scope_set:
            return 0.2
        inter = len(basket_cats & scope_set)
        union = len(basket_cats | scope_set)
        j = inter/union if union else 0.0
        # เพิ่ม boost ถ้า inter>0
        bonus = 0.2 if inter > 0 else 0.0
        return min(1.0, 0.3 + 0.7*j + bonus)

    # เคส scope ว่าง → ให้คะแนนตามความ “กระจุกตัว” ของหมวดในบิล
    # ยิ่งบิลมี 1-2 หมวดหลักชัดเจน → relevance สูงขึ้น (โปรจับหมวดกว้างก็ยังพอเวิร์ก)
    cat_share = [float(basket_row[c]) for c in basket_row.index
                 if isinstance(c, str) and c.startswith("cat=")]
    if not cat_share:
        return 0.2
    top_share = sorted(cat_share, reverse=True)[:2]
    focus = sum(top_share)  # ~ 0.6–1.0 ถ้าบิลโฟกัสหมวดชัด
    return max(0.2, min(0.7, 0.3 + 0.4*focus))


def recall_candidates_for_event_relaxed(
    basket_row,
    promos_df,
    probs, classes,
    topk_types=2,
    relevance_thresh=0.30,
    nopromo_label="NoPromo"
):
    # 2.1 เลือกประเภท robust
    top_types = get_top_types(probs, classes, k=topk_types, ensure_non_nopromo=2, nopromo_label=nopromo_label)
    now = basket_row.get("event_time", pd.NaT)

    def _elig(df, strict_online=True):
        out = df.copy()
        if "start_date" in out.columns and "end_date" in out.columns and pd.notna(now):
            out = out[(out["start_date"] <= now) & (now <= out["end_date"])]
        if strict_online and "is_online" in out.columns and "is_online" in basket_row.index:
            out = out[out["is_online"] == int(basket_row["is_online"])]
        return out

    def _score_scope(df_):
        df_ = df_.copy()
        df_["scope_relevance"] = df_.apply(lambda r: simple_scope_relevance(basket_row, r), axis=1)
        return df_

    # Stage 1: เข้มที่สุด — date+channel + type filter
    cand = _elig(promos_df, strict_online=True)
    if "promo_type" in cand.columns:
        cand = cand[cand["promo_type"].isin(top_types)]
    cand = _score_scope(cand)
    out = cand[cand["scope_relevance"] >= relevance_thresh]

    # Stage 2: ผ่อน channel (online/offline)
    if out.empty:
        cand2 = _elig(promos_df, strict_online=False)
        if "promo_type" in cand2.columns:
            cand2 = cand2[cand2["promo_type"].isin(top_types)]
        cand2 = _score_scope(cand2)
        out = cand2[cand2["scope_relevance"] >= max(0.2, relevance_thresh*0.75)]

    # Stage 3: ผ่อน type filter (เลือกตาม scope สูงสุดแทน)
    if out.empty:
        cand3 = _elig(promos_df, strict_online=False)
        cand3 = _score_scope(cand3)
        out = cand3.nlargest(20, "scope_relevance")  # ดึงมาบางส่วนให้มีตัวเลือก

    # เติม NoPromo ไว้เป็น baseline เสมอ
    nopromo = pd.DataFrame([{
        "promo_id": "__NOPROMO__", "promo_type": nopromo_label,
        "product_scope": "", "est_margin": 0.0, "scope_relevance": 0.0
    }])
    return pd.concat([out, nopromo], ignore_index=True).drop_duplicates(subset=["promo_id"], keep="first")



In [115]:
def build_ranking_frame(basket_feats, ptype_model, ptype_classes, ptype_featcols,
                        promos_df, label_df, topk=TOPK_TYPES, max_cands=MAX_CANDS):
    class_to_idx = {c:i for i,c in enumerate(ptype_classes)}
    data = basket_feats.merge(label_df, on=COL_TX, how="left")
    data["used_type"] = data["used_type"].fillna("NoPromo")

    rows = []
    for _, row in data.iterrows():
        # encode ให้คอลัมน์ one-hot ตรงกับตอนเทรน
        X = encode_features_for_ptype(row, FEATURE_COLS, ptype_featcols)
        probs = ptype_model.predict_proba(X)[0]

        cands = recall_candidates_for_event_relaxed(
            basket_row=row,
            promos_df=promos_df,
            probs=probs,
            classes=ptype_classes,
            topk_types=TOPK_TYPES,
            relevance_thresh=REL_TH,
            nopromo_label="NoPromo"
        )

        if len(cands) > max_cands:
            cands = pd.concat([
                cands.nlargest(max_cands//2, "scope_relevance"),
                cands.sample(n=max_cands-(max_cands//2), random_state=SEED, replace=False)
            ])

        used_type = row["used_type"]
        for _, pr in cands.iterrows():
            label = 1 if (pr["promo_type"] == used_type or (used_type=="NoPromo" and pr["promo_id"]=="__NOPROMO__")) else 0
            rows.append({
                "event_id": row[COL_TX],
                "promo_id": pr["promo_id"],
                "promo_type": pr["promo_type"],
                "ptype_prob": float(probs[class_to_idx.get(pr["promo_type"], class_to_idx.get("NoPromo", 0))]),
                "scope_relevance": pr.get("scope_relevance", 0.0),
                "est_margin": pr.get("est_margin", 0.0),
                "is_online": row.get(COL_ONLINE, 0),
                "order_hour": row.get(COL_ORDER_H, 0),
                "dayofweek": row.get(COL_DOW, 0),
                "need_state_cluster": row.get("need_state_cluster", 0),
                "label": label
            })
    rank_df = pd.DataFrame(rows)

    # cap negatives per event
    out = []
    for eid, grp in rank_df.groupby("event_id"):
        pos = grp[grp["label"]==1]
        neg = grp[grp["label"]==0]
        keep_neg = neg if len(neg) <= (max_cands - len(pos)) else neg.sample(n=max_cands - len(pos), random_state=SEED)
        out.append(pd.concat([pos, keep_neg], ignore_index=True))
    return pd.concat(out, ignore_index=True)

rank_df = build_ranking_frame(
    basket_feats=basket_feat,
    ptype_model=ptype_model,
    ptype_classes=ptype_classes,
    ptype_featcols=ptype_featcols,
    promos_df=promos_df,
    label_df=label_df,
    topk=TOPK_TYPES,
    max_cands=MAX_CANDS
)
rank_df.head()


Unnamed: 0,event_id,promo_id,promo_type,ptype_prob,scope_relevance,est_margin,is_online,order_hour,dayofweek,need_state_cluster,label
0,PMTX0000001,PR0005,Buy 1 get 1,0.832334,0.7,0.0,0,9,0,0,1
1,PMTX0000001,PR0009,Buy 1 get 1,0.832334,0.7,0.0,0,9,0,0,1
2,PMTX0000001,PR0021,Buy 1 get 1,0.832334,0.7,0.0,0,9,0,0,1
3,PMTX0000001,PR0030,Buy 1 get 1,0.832334,0.7,0.0,0,9,0,0,1
4,PMTX0000001,PR0034,Buy 1 get 1,0.832334,0.7,0.0,0,9,0,0,1


In [117]:
# หลังสร้าง rank_df = pd.DataFrame(rows)
# bring event_time
rank_df = rank_df.merge(
    basket_feat[[COL_TX, "event_time"]].drop_duplicates(),
    left_on="event_id", right_on=COL_TX, how="left"
).drop(columns=[COL_TX])

# parse dates
for c in ["start_date","end_date"]:
    if c in rank_df.columns:
        rank_df[c] = pd.to_datetime(rank_df[c], errors="coerce")

# new features (เหมือน patch ด้านบน)
rank_df["discount_norm"] = (rank_df["discount"].astype(float).fillna(0) / 100.0) if "discount" in rank_df.columns else 0.0

rank_df["is_active_now"] = (
    (rank_df["start_date"] <= rank_df["event_time"]) &
    (rank_df["event_time"] <= rank_df["end_date"])
).astype(int) if {"start_date","end_date","event_time"}.issubset(rank_df.columns) else 1

rank_df["days_to_end"] = (
    (rank_df["end_date"] - rank_df["event_time"]).dt.days.fillna(0).clip(lower=-365, upper=365)
) if {"end_date","event_time"}.issubset(rank_df.columns) else 0

rank_df["type_dup_penalty"] = (
    rank_df.groupby(["event_id","promo_type"])["promo_id"].transform("count") - 1
).clip(lower=0).fillna(0)

rank_df["dup_product_penalty"] = (
    rank_df.groupby(["event_id","product_id"])["promo_id"].transform("count") - 1
).clip(lower=0).fillna(0) if "product_id" in rank_df.columns else 0


In [118]:
def ndcg_at_k(rels, k=5):
    rels = np.asfarray(rels)[:k]
    if rels.size == 0: return 0.0
    dcg = np.sum((2**rels - 1) / np.log2(np.arange(2, rels.size + 2)))
    ideal = np.sort(rels)[::-1]
    idcg = np.sum((2**ideal - 1) / np.log2(np.arange(2, ideal.size + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def train_ranker(rank_df, k_list=(3,5)):
    F = ["ptype_prob","scope_relevance","est_margin",
     "discount_norm","is_active_now","days_to_end",
     "type_dup_penalty","dup_product_penalty",
     "is_online","order_hour","dayofweek","need_state_cluster"]

    ev = rank_df["event_id"].unique()
    tr_e, va_e = train_test_split(ev, test_size=0.2, random_state=SEED)
    tr = rank_df[rank_df["event_id"].isin(tr_e)]
    va = rank_df[rank_df["event_id"].isin(va_e)]

    def to_group(df_):
        grp_sizes = df_.groupby("event_id").size().values
        X = df_[F].fillna(0).values
        y = df_["label"].values
        return X, y, grp_sizes

    if HAS_LGB:
        Xtr, ytr, gtr = to_group(tr)
        Xva, yva, gva = to_group(va)

        # ----- core API with callbacks (รองรับหลายเวอร์ชัน) -----
        try:
            dtr = lgb.Dataset(Xtr, label=ytr, group=gtr)
            dva = lgb.Dataset(Xva, label=yva, group=gva, reference=dtr)
            params = dict(
                objective="lambdarank",
                metric="ndcg",          # <--- สำคัญ: ใช้ 'ndcg' + eval_at แทน 'ndcg@k'
                eval_at=[3, 5],        # <--- ระบุ k ที่ต้องการประเมิน
                learning_rate=0.05,
                num_leaves=63,
                min_data_in_leaf=100,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                bagging_freq=1,
                verbosity=-1,
                seed=SEED
            )
            cbs = []
            # ใส่ early_stopping ผ่าน callback (บางเวอร์ชันเท่านั้น)
            try:
                cbs.append(lgb.early_stopping(stopping_rounds=100))
            except Exception:
                pass
            # ใส่ log interval ถ้ามี
            try:
                cbs.append(lgb.log_evaluation(100))
            except Exception:
                pass

            try:
                model = lgb.train(
                    params,
                    dtr,
                    num_boost_round=800,
                    valid_sets=[dtr, dva],
                    valid_names=["train","valid"],
                    callbacks=cbs
                )
            except ValueError:
                # ถ้ายัง complain เรื่อง metric/early stopping ให้รันแบบไม่มี early stopping
                model = lgb.train(
                    params,
                    dtr,
                    num_boost_round=800,
                    valid_sets=[dtr, dva],
                    valid_names=["train","valid"]
                )
            use_core_api = True

        except Exception:
            # ----- fallback เป็น sklearn API LGBMRanker -----
            ranker = lgb.LGBMRanker(
                objective="lambdarank",
                n_estimators=800,
                learning_rate=0.05,
                num_leaves=63,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=SEED
            )
            try:
                # บางเวอร์ชันรองรับ eval_at ผ่าน set_params
                ranker.set_params(metric="ndcg", eval_at=[3,5])
            except Exception:
                pass
            try:
                ranker.fit(
                    Xtr, ytr,
                    group=gtr.tolist(),
                    eval_set=[(Xva, yva)],
                    eval_group=[gva.tolist()]
                )
            except TypeError:
                ranker.fit(Xtr, ytr, group=gtr.tolist())
            model = ranker
            use_core_api = False

        # ----- ประเมิน NDCG -----
        ndcgs = {f"ndcg@{k}":[] for k in k_list}
        for eid, grp in va.groupby("event_id"):
            if use_core_api:
                s = model.predict(grp[F].fillna(0).values,
                                  num_iteration=getattr(model, "best_iteration", None))
            else:
                s = model.predict(grp[F].fillna(0).values)
            grp = grp.assign(_s=s).sort_values("_s", ascending=False)
            for k in k_list:
                ndcgs[f"ndcg@{k}"].append(ndcg_at_k(grp["label"].values, k))
        return {"model": model, "feature_cols": F, "report": {m: float(np.mean(v)) for m,v in ndcgs.items()}}

    else:
        # Fallback: pointwise classifier
        clf = GradientBoostingClassifier(random_state=SEED)
        Xtr, ytr, _ = to_group(tr)
        Xva, yva, _ = to_group(va)
        clf.fit(Xtr, ytr)
        ndcgs = {f"ndcg@{k}":[] for k in k_list}
        for eid, grp in va.groupby("event_id"):
            s = clf.predict_proba(grp[F].fillna(0).values)[:,1]
            grp = grp.assign(_s=s).sort_values("_s", ascending=False)
            for k in k_list:
                ndcgs[f"ndcg@{k}"].append(ndcg_at_k(grp["label"].values, k))
        return {"model": clf, "feature_cols": F, "report": {m: float(np.mean(v)) for m,v in ndcgs.items()}, "fallback_pointwise": True}



rank_art = train_ranker(rank_df)
rank_art["report"]


Training until validation scores don't improve for 100 rounds
[100]	train's ndcg@3: 1	train's ndcg@5: 1	valid's ndcg@3: 0.999218	valid's ndcg@5: 0.999218
Early stopping, best iteration is:
[19]	train's ndcg@3: 0.99987	train's ndcg@5: 0.99987	valid's ndcg@3: 0.999479	valid's ndcg@5: 0.999479


{'ndcg@3': 0.48096976016684045, 'ndcg@5': 0.48096976016684045}

In [120]:
def score_event(event_tx_id, basket_feats, ptype_model, ptype_classes, ptype_featcols,
                promos_df, rank_art, topk=TOPK_TYPES, rel_th=REL_TH):
    # 0) ดึงแถวบริบท
    row = basket_feats[basket_feats[COL_TX]==event_tx_id]
    if row.empty:
        raise ValueError("transaction_id ไม่พบใน basket_feats")
    row = row.iloc[0]

    # 1) prior P(type|X)
    X = encode_features_for_ptype(row, FEATURE_COLS, ptype_featcols)
    probs = ptype_model.predict_proba(X)[0]
    class_to_idx = {c:i for i,c in enumerate(ptype_classes)}

    # 2) recall (แบบ relaxed)
    cands = recall_candidates_for_event_relaxed(
        basket_row=row,
        promos_df=promos_df,
        probs=probs,
        classes=ptype_classes,
        topk_types=TOPK_TYPES,
        relevance_thresh=rel_th,
        nopromo_label="NoPromo"
    )

    # 3) เตรียมฟีเจอร์ให้ครบสำหรับ ranker (เติม "ก่อน" ใช้ F)
    tmp = cands.copy()

    # prior prob ต่อโปรชนิดนั้น
    tmp["ptype_prob"] = tmp["promo_type"].apply(
        lambda t: probs[class_to_idx.get(t, class_to_idx.get("NoPromo", 0))]
    )

    # บริบทเหตุการณ์
    tmp["is_online"] = int(row.get(COL_ONLINE, 0))
    tmp["order_hour"] = int(row.get(COL_ORDER_H, 0))
    tmp["dayofweek"] = int(row.get(COL_DOW, 0))
    tmp["need_state_cluster"] = int(row.get("need_state_cluster", 0))

    # วันที่/ช่วงโปร
    now = row.get("event_time", pd.NaT)
    if "start_date" in tmp.columns and "end_date" in tmp.columns and pd.notna(now):
        tmp["is_active_now"] = ((tmp["start_date"] <= now) & (now <= tmp["end_date"])).astype(int)
        tmp["days_to_end"] = (tmp["end_date"] - now).dt.days.clip(lower=-365, upper=365)
    else:
        tmp["is_active_now"] = 1
        tmp["days_to_end"] = 0

    # ส่วนลด normalize
    if "discount" in tmp.columns:
        tmp["discount_norm"] = pd.to_numeric(tmp["discount"], errors="coerce").fillna(0) / 100.0
    else:
        tmp["discount_norm"] = 0.0

    # penalties ในกลุ่มเดียวกัน
    tmp["type_dup_penalty"] = (
        tmp.groupby("promo_type")["promo_id"].transform("count") - 1
    ).clip(lower=0).fillna(0)

    if "product_id" in tmp.columns:
        tmp["dup_product_penalty"] = (
            tmp.groupby("product_id")["promo_id"].transform("count") - 1
        ).clip(lower=0).fillna(0)
    else:
        tmp["dup_product_penalty"] = 0.0

    # กัน missing ที่ ranker ต้องใช้
    needed = ["ptype_prob","scope_relevance","est_margin",
              "discount_norm","is_active_now","days_to_end",
              "type_dup_penalty","dup_product_penalty",
              "is_online","order_hour","dayofweek","need_state_cluster"]
    for c in needed:
        if c not in tmp.columns:
            tmp[c] = 0.0
    tmp[needed] = tmp[needed].fillna(0)

    # 4) จัดอันดับด้วย ranker
    F = rank_art["feature_cols"]  # ต้องตรงกับตอนเทรน
    mdl = rank_art["model"]
    Xr = tmp[F].fillna(0).values

    if HAS_LGB and "fallback_pointwise" not in rank_art:
        s = mdl.predict(Xr, num_iteration=getattr(mdl, "best_iteration", None))
    else:
        s = mdl.predict_proba(Xr)[:, 1]

    # normalize และ tie-breaker
    s_ptp = float(np.ptp(s))
    tmp["ranker_score"] = (s - float(np.min(s))) / s_ptp if s_ptp > 1e-9 else s
    if tmp["ranker_score"].nunique() == 1:
        tb = (tmp["promo_id"].astype(str).apply(lambda x: (hash(x) % 997) / 997.0)) * 0.01
        tmp["ranker_score"] = tmp["ranker_score"] + tb

    # 5) blend คะแนนสุดท้าย (หลังมีทุกฟีเจอร์แล้ว)
    w = {
        "ptype_prob": 0.30,
        "ranker_score": 0.35,
        "scope_relevance": 0.15,
        "est_margin": 0.05,
        "discount_norm": 0.10,
        "is_active_now": 0.05
    }
    pen = {"type_dup_penalty": 0.05, "dup_product_penalty": 0.08}

    tmp["final_score"] = (
        w["ptype_prob"]*tmp["ptype_prob"] +
        w["ranker_score"]*tmp["ranker_score"] +
        w["scope_relevance"]*tmp["scope_relevance"] +
        w["est_margin"]*tmp["est_margin"] +
        w["discount_norm"]*tmp["discount_norm"] +
        w["is_active_now"]*tmp["is_active_now"]
        - pen["type_dup_penalty"]*tmp["type_dup_penalty"]
        - pen["dup_product_penalty"]*tmp["dup_product_penalty"]
    )

    return tmp.sort_values("final_score", ascending=False).reset_index(drop=True)


sample_tx_id = basket_feat[COL_TX].iloc[9000]
score_event(sample_tx_id, basket_feat, ptype_model, ptype_classes, ptype_featcols, promos_df, rank_art).head(10)


Unnamed: 0,promo_id,promo_type,product_id,discount,start_date,end_date,product_scope,is_online,est_margin,scope_relevance,...,order_hour,dayofweek,need_state_cluster,is_active_now,days_to_end,discount_norm,type_dup_penalty,dup_product_penalty,ranker_score,final_score
0,__NOPROMO__,NoPromo,,,NaT,NaT,,0,0.0,0.0,...,17,1,1,0,0.0,0.0,0,0.0,1.0,0.619114
1,PR0073,Buy 1 get 1,P0297,100.0,2025-07-18 14:12:00,2025-09-14 14:12:00,,0,0.0,0.7,...,17,1,1,1,4.0,1.0,4,0.0,0.04971,0.078644
2,PR0078,Buy 1 get 1,P0980,100.0,2025-08-08 09:16:00,2025-09-18 09:16:00,,0,0.0,0.7,...,17,1,1,1,8.0,1.0,4,0.0,0.04971,0.078644
3,PR0083,Buy 1 get 1,P0587,100.0,2025-09-06 17:03:00,2025-09-19 17:03:00,,0,0.0,0.7,...,17,1,1,1,9.0,1.0,4,0.0,0.04971,0.078644
4,PR0084,Buy 1 get 1,P0656,100.0,2025-08-01 09:18:00,2025-09-10 09:18:00,,0,0.0,0.7,...,17,1,1,1,0.0,1.0,4,0.0,0.04971,0.078644
5,PR0095,Buy 1 get 1,P0964,100.0,2025-07-25 20:15:00,2025-09-09 20:15:00,,0,0.0,0.7,...,17,1,1,1,0.0,1.0,4,0.0,0.04971,0.078644
6,PR0088,Product_Coupon,P0623,48.0,2025-09-04 09:56:00,2025-09-13 09:56:00,,0,0.0,0.7,...,17,1,1,1,3.0,0.48,8,0.0,0.0,-0.189538
7,PR0069,Product_Coupon,P0925,43.0,2025-09-07 08:34:00,2025-09-20 08:34:00,,0,0.0,0.7,...,17,1,1,1,10.0,0.43,8,0.0,0.0,-0.194538
8,PR0067,Product_Coupon,P0182,29.0,2025-08-11 02:29:00,2025-09-21 02:29:00,,0,0.0,0.7,...,17,1,1,1,11.0,0.29,8,0.0,0.0,-0.208538
9,PR0003,Product_Coupon,P0441,25.0,2025-08-04 01:41:00,2025-09-11 01:41:00,,0,0.0,0.7,...,17,1,1,1,1.0,0.25,8,0.0,0.0,-0.212538


In [143]:
sample_tx_id = basket_feat["transaction_id"].iloc[5]
rec = score_event(sample_tx_id, basket_feat, ptype_model, ptype_classes, ptype_featcols,
                  promos_df, rank_art, topk=2, rel_th=0.30)
rec[['promo_id','promo_type','discount','product_scope','ranker_score','final_score']].head(5)  


Unnamed: 0,promo_id,promo_type,discount,product_scope,ranker_score,final_score
0,PR0073,Buy 1 get 1,100.0,,1.0,0.57864
1,PR0078,Buy 1 get 1,100.0,,1.0,0.57864
2,PR0083,Buy 1 get 1,100.0,,1.0,0.57864
3,PR0084,Buy 1 get 1,100.0,,1.0,0.57864
4,PR0095,Buy 1 get 1,100.0,,1.0,0.57864
