In [77]:
from pathlib import Path
import numpy as np
import pandas as pd

# ML
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as lgb

from pathlib import Path

In [78]:
BASE = Path("Datasets/mockup_ver2/")

tx_merge = pd.read_csv(BASE/"tx_merge3.csv") 
promotions = pd.read_csv(BASE/"promotions.csv", parse_dates=["start_date","end_date"])

promos_df = promotions.copy()
df = tx_merge.copy()
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")

  promotions = pd.read_csv(BASE/"promotions.csv", parse_dates=["start_date","end_date"])
  promotions = pd.read_csv(BASE/"promotions.csv", parse_dates=["start_date","end_date"])


In [79]:
import lightgbm as lgb
HAS_LGB = True

SEED = 42
NEED_K = 8
PCA_K  = 30
TOPK_TYPES = 2
REL_TH = 0.30
MAX_CANDS = 40

# ---- Columns (อิงจาก tx_merge2.csv ของคุณ) ----
COL_TX   = "transaction_id"
COL_USER = "user_id"
COL_PROD = "product_id"
COL_QTY  = "qty"
COL_PRICE= "price"

COL_CAT   = "products.category"
COL_BRAND = "products.brand"
COL_TS    = "timestamp"
COL_STORE = "store_id"
COL_ONLINE= "is_online"

COL_ORDER_H = "order_hour"
COL_DOW     = "dayofweek"
COL_MONTH   = "month"
COL_DAY     = "day"
COL_WOY     = "weekofyear"
COL_QUARTER = "quarter"
COL_IS_WKD  = "is_weekend"
COL_THAI_SEAS = "thai_season"
COL_IN_FEST   = "InFestival"

COL_WKD_BOOST = "weekday_boost"
COL_WKE_BOOST = "weekend_boost"
COL_FES_BOOST = "festival_boost"
COL_PEAKS     = "peaks_encoded"
COL_HOUR_W    = "hour_weight"
COL_LOYALTY   = "loyalty_score"
COL_EXPECT    = "expected_basket_items"
COL_ELAS      = "price_elasticity"
COL_SEGMENT   = "segment"

# ถ้าใช้ label จาก tx_merge โดยตรง:
LABEL_COL_IN_TX = "promotion_type"

In [80]:
rename_map = {}
if "promotions.promo_type" in promos_df.columns:
    rename_map["promotions.promo_type"] = "promo_type"
if "promotion_category" in promos_df.columns and "promo_type" not in promos_df.columns:
    rename_map["promotion_category"] = "promo_type"
if "promotion_type" in promos_df.columns and "promo_type" not in promos_df.columns:
    rename_map["promotion_type"] = "promo_type"
if "scope" in promos_df.columns and "product_scope" not in promos_df.columns:
    rename_map["scope"] = "product_scope"

promos_df = promos_df.rename(columns=rename_map)

# เติมคอลัมน์ที่ขาดด้วยค่า default ปลอดภัย
defaults = {
    "promo_id": "__UNK__",
    "promo_type": "Unknown",
    "product_scope": "",
    "is_online": 1,
    "start_date": pd.Timestamp("2000-01-01"),
    "end_date":   pd.Timestamp("2100-01-01"),
    "est_margin": 0.0
}
for c, d in defaults.items():
    if c not in promos_df.columns:
        promos_df[c] = d

# final check
need_cols = ["promo_id","promo_type","product_scope","is_online","start_date","end_date","est_margin"]
missing = [c for c in need_cols if c not in promos_df.columns]
assert not missing, f"promos_df ขาดคอลัมน์: {missing}"

# แปลงวันที่ (กัน type ผิด)
promos_df["start_date"] = pd.to_datetime(promos_df["start_date"], errors="coerce")
promos_df["end_date"]   = pd.to_datetime(promos_df["end_date"], errors="coerce")

In [81]:
agg = {}
if COL_PROD in df.columns: agg[COL_PROD] = "nunique"
if COL_QTY  in df.columns: agg[COL_QTY]  = "sum"
if COL_PRICE in df.columns and COL_QTY in df.columns:
    df["_revenue"] = df[COL_PRICE].fillna(0) * df[COL_QTY].fillna(0)
    agg["_revenue"] = "sum"
elif COL_PRICE in df.columns:
    agg[COL_PRICE] = "sum"

basket = (
    df.groupby(COL_TX).agg(agg)
      .rename(columns={COL_PROD: "basket_unique_items"})
      .reset_index()
)

evt = df.groupby(COL_TX)[COL_TS].min().rename("event_time").reset_index()
basket = basket.merge(evt, on=COL_TX, how="left")

# context ที่มีอยู่แล้วในไฟล์
context_cols = [
    COL_STORE, COL_ONLINE,
    COL_ORDER_H, COL_DOW, COL_MONTH, COL_DAY, COL_WOY, COL_QUARTER,
    COL_IS_WKD, COL_THAI_SEAS, COL_IN_FEST,
    COL_WKD_BOOST, COL_WKE_BOOST, COL_FES_BOOST, COL_PEAKS, COL_HOUR_W,
    COL_LOYALTY, COL_EXPECT, COL_ELAS, COL_SEGMENT
]

for c in context_cols:
    if c in df.columns:
        first = df.groupby(COL_TX)[c].first().reset_index()
        basket = basket.merge(first, on=COL_TX, how="left")

# multi-hot: k=category/brand proportions
def crosstab_prop(frame, key, val, prefix):
    if val not in frame.columns:
        return pd.DataFrame({key: frame[key].unique()})
    ct = pd.crosstab(frame[key], frame[val])
    if ct.empty:
        return pd.DataFrame({key: frame[key].unique()})
    prop = ct.div(ct.sum(axis=1).replace(0, np.nan), axis=0).fillna(0)
    prop.columns = [f"{prefix}={c}" for c in prop.columns]
    return prop.reset_index()

cat_prop   = crosstab_prop(df, COL_TX, COL_CAT,   "cat")
brand_prop = crosstab_prop(df, COL_TX, COL_BRAND, "brand")
basket = basket.merge(cat_prop, on=COL_TX, how="left").merge(brand_prop, on=COL_TX, how="left")

if COL_ONLINE in basket.columns:
    basket[COL_ONLINE] = basket[COL_ONLINE].astype(int)

comp_cols = [c for c in basket.columns if c.startswith("cat=") or c.startswith("brand=")]
num_cols = [
    "basket_unique_items", COL_QTY, "_revenue", COL_PRICE,
    COL_ORDER_H, COL_DOW, COL_MONTH, COL_DAY, COL_WOY, COL_QUARTER,
    COL_IS_WKD, COL_THAI_SEAS, COL_IN_FEST, COL_WKD_BOOST, COL_WKE_BOOST, COL_FES_BOOST,
    COL_PEAKS, COL_HOUR_W, COL_LOYALTY, COL_EXPECT, COL_ELAS
]
num_cols = [c for c in num_cols if c in basket.columns]

FEATURE_COLS = num_cols + ([COL_ONLINE] if COL_ONLINE in basket.columns else []) + comp_cols
basket_feat = basket.copy()

# sanity print
print("basket_feat shape:", basket_feat.shape)
print("num FEATURES:", len(FEATURE_COLS))

basket_feat shape: (19178, 85)
num FEATURES: 81


In [82]:
def get_top_types(probs, classes, k=2, ensure_non_nopromo=2, nopromo_label="NoPromo"):
    """
    เลือกประเภทโปรฯ สำหรับ recall: บังคับให้มีอย่างน้อย ensure_non_nopromo ประเภทที่ไม่ใช่ NoPromo
    แล้วค่อยเติม NoPromo ในลิสต์ (ถ้าจำเป็น)
    """
    order = np.argsort(probs)[::-1]
    cls_order = [classes[i] for i in order]

    non_np = [c for c in cls_order if c != nopromo_label]
    top_non_np = non_np[:max(ensure_non_nopromo, 1)]

    merged, seen = [], set()
    for c in top_non_np + cls_order:
        if c not in seen:
            merged.append(c); seen.add(c)
        if len(merged) >= k + 1:  # เผื่อ 1 ช่องให้ NoPromo
            break

    if nopromo_label not in merged:
        merged.append(nopromo_label)

    return merged[:k+1]


In [83]:
# %% Need-state discovery (fixed: auto-encode non-numeric) 
from sklearn.metrics import silhouette_score

# ทำ one-hot ให้ทุกคอลัมน์ที่เป็น object/category (กัน error 'Rainy')
X_df = basket_feat[FEATURE_COLS].copy()

# bool -> int
bool_cols = X_df.select_dtypes(include=["bool"]).columns
if len(bool_cols):
    X_df[bool_cols] = X_df[bool_cols].astype(int)

obj_cols = X_df.select_dtypes(include=["object", "category"]).columns
if len(obj_cols):
    X_df = pd.get_dummies(X_df, columns=obj_cols, dummy_na=True)

X = X_df.fillna(0.0).astype(float).values

# Scale + PCA
sc = StandardScaler()
Xs = sc.fit_transform(X)

pca = PCA(n_components=min(PCA_K, Xs.shape[1]), random_state=SEED)
Xp  = pca.fit_transform(Xs)

# KMeans
mbk = MiniBatchKMeans(n_clusters=NEED_K, random_state=SEED, batch_size=4096, n_init=10)
labels = mbk.fit_predict(Xp)
basket_feat["need_state_cluster"] = labels

# silhouette (sample)
try:
    idx = np.random.RandomState(SEED).choice(len(Xp), size=min(5000, len(Xp)), replace=False)
    sil = silhouette_score(Xp[idx], labels[idx])
except Exception:
    sil = np.nan
print(f"Silhouette(sample): {sil:.3f}")

# profiling
prof_cols = [
    "basket_unique_items", COL_QTY, COL_PRICE, "_revenue",
    COL_ORDER_H, COL_DOW, COL_IS_WKD, COL_THAI_SEAS, COL_IN_FEST,
    COL_WKD_BOOST, COL_WKE_BOOST, COL_FES_BOOST, COL_HOUR_W,
    COL_LOYALTY, COL_EXPECT, COL_ELAS
]
prof_cols = [c for c in prof_cols if c in basket_feat.columns]

def top_components(df_in, key, cols, n=8):
    rows = []
    for k, grp in df_in.groupby(key):
        sums = grp[cols].sum().sort_values(ascending=False)
        rows.append({key: k, "top_components": "; ".join([f"{c}:{sums[c]:.1f}" for c in sums.index[:n]])})
    return pd.DataFrame(rows)

comp_cols = [c for c in basket_feat.columns if c.startswith("cat=") or c.startswith("brand=")]
prof = (
    basket_feat.groupby("need_state_cluster")[prof_cols]
    .mean(numeric_only=True).round(3).reset_index()
)
topc = top_components(basket_feat, "need_state_cluster", comp_cols, n=8) if comp_cols else pd.DataFrame(columns=["need_state_cluster","top_components"])

need_profile = prof.merge(topc, on="need_state_cluster", how="left")
need_profile.insert(1, "count", basket_feat.groupby("need_state_cluster")[COL_TX].nunique().values)
need_profile.insert(2, "share_pct", (need_profile["count"]/need_profile["count"].sum()*100).round(2))

need_profile.head(10)



Silhouette(sample): 0.085


Unnamed: 0,need_state_cluster,count,share_pct,basket_unique_items,qty,_revenue,order_hour,dayofweek,is_weekend,InFestival,weekday_boost,weekend_boost,festival_boost,hour_weight,loyalty_score,expected_basket_items,price_elasticity,top_components
0,0,2530,13.19,1.0,2.927,968.25,11.516,3.012,0.277,0.083,1.1,0.879,0.95,0.999,0.922,2.989,0.014,cat=Snacks:320.0; cat=Household:293.0; cat=Rea...
1,1,3509,18.3,1.0,3.019,1065.008,11.574,3.061,0.304,0.08,1.001,1.049,1.05,1.004,0.924,2.989,0.007,cat=ReadyToEat:504.0; cat=Others:459.0; cat=Sn...
2,2,465,2.42,1.0,3.133,1217.232,11.761,3.065,0.286,0.08,1.018,0.991,1.021,0.994,0.923,2.989,0.017,brand=Brand_023:462.0; cat=DairyBakery:113.0; ...
3,3,3151,16.43,1.0,2.979,1001.444,11.51,2.919,0.273,0.085,1.014,1.0,1.014,0.996,0.923,2.989,0.004,cat=ReadyToEat:422.0; cat=Snacks:420.0; cat=Ho...
4,4,1195,6.23,1.0,2.906,663.389,11.552,2.862,0.261,0.074,1.016,1.003,1.024,0.992,0.925,2.99,0.007,cat=InstantFoods:1111.0; brand=Brand_036:176.0...
5,5,2472,12.89,1.0,3.028,1045.177,11.214,3.055,0.288,0.071,1.001,1.02,1.001,0.993,0.924,2.989,-0.002,cat=ReadyToEat:336.0; cat=HealthBeauty:290.0; ...
6,6,2251,11.74,1.0,2.987,984.446,11.323,2.958,0.275,0.071,0.95,1.1,1.1,0.997,0.923,2.989,0.006,cat=ReadyToEat:265.0; cat=HealthBeauty:256.0; ...
7,7,3605,18.8,1.0,2.987,1029.343,11.6,3.047,0.292,0.079,1.05,0.9,1.0,0.995,0.924,2.989,0.01,cat=Household:479.0; cat=ReadyToEat:447.0; cat...


In [85]:
# เตรียม label ต่อธุรกรรมจาก tx_merge โดยตรง (ถ้าไม่มี ใช้วิธี join ผ่าน promo_id แทน)
if LABEL_COL_IN_TX not in tx_merge.columns:
    raise ValueError(f"ไม่พบ {LABEL_COL_IN_TX} ใน tx_merge")

label_df = (
    tx_merge.groupby(COL_TX)[LABEL_COL_IN_TX].first().reset_index()
    .rename(columns={LABEL_COL_IN_TX:"used_type"})
)
label_df["used_type"] = label_df["used_type"].fillna("NoPromo")

data_ptype = basket_feat.merge(label_df, on=COL_TX, how="left")
data_ptype["used_type"] = data_ptype["used_type"].fillna("NoPromo")

# one-hot ฟีเจอร์สำหรับทั้งชุด → คอลัมน์จะตรงกันแน่นอน
X_all = data_ptype[FEATURE_COLS].copy()

bool_cols = X_all.select_dtypes(include=["bool"]).columns
if len(bool_cols):
    X_all[bool_cols] = X_all[bool_cols].astype(int)

obj_cols = X_all.select_dtypes(include=["object","category"]).columns
if len(obj_cols):
    X_all = pd.get_dummies(X_all, columns=obj_cols, dummy_na=True)

X_all = X_all.fillna(0.0).astype(float)

# split ตามเวลา
if "event_time" in data_ptype.columns and data_ptype["event_time"].notna().any():
    data_ptype = data_ptype.sort_values("event_time")
    X_all = X_all.loc[data_ptype.index]
    cut = int(len(data_ptype)*0.8)
    tr_idx = data_ptype.index[:cut]
    va_idx = data_ptype.index[cut:]
else:
    tr_idx, va_idx = train_test_split(
        data_ptype.index, test_size=0.2, random_state=SEED, stratify=data_ptype["used_type"]
    )

Xtr = X_all.loc[tr_idx].values
Xva = X_all.loc[va_idx].values
ytr = data_ptype.loc[tr_idx, "used_type"].values
yva = data_ptype.loc[va_idx, "used_type"].values

classes = np.unique(data_ptype["used_type"].values)
class_to_idx = {c:i for i,c in enumerate(classes)}
ytr_idx = np.array([class_to_idx[c] for c in ytr])
yva_idx = np.array([class_to_idx[c] for c in yva])

# base model + calibration (รองรับหลายเวอร์ชัน sklearn)
if HAS_LGB:
    base = lgb.LGBMClassifier(
        objective="multiclass",
        num_class=len(classes),
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=63,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=SEED
    )
else:
    base = GradientBoostingClassifier(random_state=SEED)

try:
    ptype_model = CalibratedClassifierCV(estimator=base, method="sigmoid", cv=3)
except TypeError:
    ptype_model = CalibratedClassifierCV(base_estimator=base, method="sigmoid", cv=3)

ptype_model.fit(Xtr, ytr_idx)
pred = ptype_model.predict(Xva)
print("Validation report (P(type|X))")
print(classification_report(yva_idx, pred, target_names=list(classes)))

ptype_classes  = list(classes)
ptype_featcols = list(X_all.columns)  # สำคัญ: ใช้ตอน inference ต้อง align คอลัมน์ชุดนี้


Validation report (P(type|X))
                precision    recall  f1-score   support

      Brandday       0.54      0.13      0.20       111
   Buy 1 get 1       0.85      0.16      0.27       144
    Flash Sale       0.00      0.00      0.00       303
     Mega Sale       0.74      0.20      0.32       123
       NoPromo       0.77      0.99      0.87      2904
Product_Coupon       0.00      0.00      0.00       251

      accuracy                           0.77      3836
     macro avg       0.48      0.25      0.28      3836
  weighted avg       0.65      0.77      0.68      3836



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [86]:
def encode_features_for_ptype(row_series, raw_feature_cols, feat_cols_all):
    row_df = pd.DataFrame([row_series[raw_feature_cols]])
    # bool -> int
    bool_cols = row_df.select_dtypes(include=["bool"]).columns
    if len(bool_cols):
        row_df[bool_cols] = row_df[bool_cols].astype(int)
    # one-hot สำหรับ object/category
    obj_cols = row_df.select_dtypes(include=["object","category"]).columns
    if len(obj_cols):
        row_df = pd.get_dummies(row_df, columns=obj_cols, dummy_na=True)
    # align columns
    for c in feat_cols_all:
        if c not in row_df.columns:
            row_df[c] = 0.0
    row_df = row_df[feat_cols_all].fillna(0.0).astype(float)
    return row_df.values  # shape (1, d)

def eligibility_filter(promos_df, context_row, now):
    out = promos_df.copy()
    if "start_date" in out.columns:
        out["start_date"] = pd.to_datetime(out["start_date"], errors="coerce")
    if "end_date" in out.columns:
        out["end_date"] = pd.to_datetime(out["end_date"], errors="coerce")
    if "is_online" in out.columns and COL_ONLINE in context_row.index:
        out = out[out["is_online"] == int(context_row[COL_ONLINE])]
    if "start_date" in out.columns and "end_date" in out.columns and pd.notna(now):
        out = out[(out["start_date"] <= now) & (now <= out["end_date"])]
    return out

# แทนที่ฟังก์ชันเดิมทั้งก้อน
def simple_scope_relevance(basket_row, promo_row):
    """
    คำนวณความเกี่ยวข้องระหว่างโปรกับตะกร้า
    - ถ้า product_scope มี category/code: วัด Jaccard กับ cat=... ในบิล
    - ถ้า scope ว่าง: ลดน้ำหนักลง ตามความนิยมของหมวดในบิล (ไม่ใช่ 0.5 ตายตัว)
    """
    scope_raw = str(promo_row.get("product_scope", "") or "").strip().lower()
    # ดึงหมวดในบิล (จากฟีเจอร์ cat=... ที่เป็นสัดส่วน)
    basket_cats = {col.split("cat=")[1].lower() for col in basket_row.index
                   if isinstance(col, str) and col.startswith("cat=") and float(basket_row[col]) > 0}

    if not basket_cats:
        return 0.15  # ไม่มีสัดส่วนหมวด → ให้ต่ำหน่อย

    # เคสมี scope → tokenize เป็นชุดคำ (รองรับ comma, ;, space)
    if scope_raw:
        sep = [",",";","|","/"]
        for s in sep: scope_raw = scope_raw.replace(s, " ")
        scope_set = {tok for tok in scope_raw.split() if tok}
        if not scope_set:
            return 0.2
        inter = len(basket_cats & scope_set)
        union = len(basket_cats | scope_set)
        j = inter/union if union else 0.0
        # เพิ่ม boost ถ้า inter>0
        bonus = 0.2 if inter > 0 else 0.0
        return min(1.0, 0.3 + 0.7*j + bonus)

    # เคส scope ว่าง → ให้คะแนนตามความ “กระจุกตัว” ของหมวดในบิล
    # ยิ่งบิลมี 1-2 หมวดหลักชัดเจน → relevance สูงขึ้น (โปรจับหมวดกว้างก็ยังพอเวิร์ก)
    cat_share = [float(basket_row[c]) for c in basket_row.index
                 if isinstance(c, str) and c.startswith("cat=")]
    if not cat_share:
        return 0.2
    top_share = sorted(cat_share, reverse=True)[:2]
    focus = sum(top_share)  # ~ 0.6–1.0 ถ้าบิลโฟกัสหมวดชัด
    return max(0.2, min(0.7, 0.3 + 0.4*focus))


def recall_candidates_for_event_relaxed(
    basket_row,
    promos_df,
    probs, classes,
    topk_types=2,
    relevance_thresh=0.30,
    nopromo_label="NoPromo"
):
    # 2.1 เลือกประเภท robust
    top_types = get_top_types(probs, classes, k=topk_types, ensure_non_nopromo=2, nopromo_label=nopromo_label)
    now = basket_row.get("event_time", pd.NaT)

    def _elig(df, strict_online=True):
        out = df.copy()
        if "start_date" in out.columns and "end_date" in out.columns and pd.notna(now):
            out = out[(out["start_date"] <= now) & (now <= out["end_date"])]
        if strict_online and "is_online" in out.columns and "is_online" in basket_row.index:
            out = out[out["is_online"] == int(basket_row["is_online"])]
        return out

    def _score_scope(df_):
        df_ = df_.copy()
        df_["scope_relevance"] = df_.apply(lambda r: simple_scope_relevance(basket_row, r), axis=1)
        return df_

    # Stage 1: เข้มที่สุด — date+channel + type filter
    cand = _elig(promos_df, strict_online=True)
    if "promo_type" in cand.columns:
        cand = cand[cand["promo_type"].isin(top_types)]
    cand = _score_scope(cand)
    out = cand[cand["scope_relevance"] >= relevance_thresh]

    # Stage 2: ผ่อน channel (online/offline)
    if out.empty:
        cand2 = _elig(promos_df, strict_online=False)
        if "promo_type" in cand2.columns:
            cand2 = cand2[cand2["promo_type"].isin(top_types)]
        cand2 = _score_scope(cand2)
        out = cand2[cand2["scope_relevance"] >= max(0.2, relevance_thresh*0.75)]

    # Stage 3: ผ่อน type filter (เลือกตาม scope สูงสุดแทน)
    if out.empty:
        cand3 = _elig(promos_df, strict_online=False)
        cand3 = _score_scope(cand3)
        out = cand3.nlargest(20, "scope_relevance")  # ดึงมาบางส่วนให้มีตัวเลือก

    # เติม NoPromo ไว้เป็น baseline เสมอ
    nopromo = pd.DataFrame([{
        "promo_id": "__NOPROMO__", "promo_type": nopromo_label,
        "product_scope": "", "est_margin": 0.0, "scope_relevance": 0.0
    }])
    return pd.concat([out, nopromo], ignore_index=True).drop_duplicates(subset=["promo_id"], keep="first")



In [87]:
def build_ranking_frame(basket_feats, ptype_model, ptype_classes, ptype_featcols,
                        promos_df, label_df, topk=TOPK_TYPES, max_cands=MAX_CANDS):
    class_to_idx = {c:i for i,c in enumerate(ptype_classes)}
    data = basket_feats.merge(label_df, on=COL_TX, how="left")
    data["used_type"] = data["used_type"].fillna("NoPromo")

    rows = []
    for _, row in data.iterrows():
        # encode ให้คอลัมน์ one-hot ตรงกับตอนเทรน
        X = encode_features_for_ptype(row, FEATURE_COLS, ptype_featcols)
        probs = ptype_model.predict_proba(X)[0]

        cands = recall_candidates_for_event_relaxed(
            basket_row=row,
            promos_df=promos_df,
            probs=probs,
            classes=ptype_classes,
            topk_types=TOPK_TYPES,
            relevance_thresh=REL_TH,
            nopromo_label="NoPromo"
        )

        if len(cands) > max_cands:
            cands = pd.concat([
                cands.nlargest(max_cands//2, "scope_relevance"),
                cands.sample(n=max_cands-(max_cands//2), random_state=SEED, replace=False)
            ])

        used_type = row["used_type"]
        for _, pr in cands.iterrows():
            label = 1 if (pr["promo_type"] == used_type or (used_type=="NoPromo" and pr["promo_id"]=="__NOPROMO__")) else 0
            rows.append({
                "event_id": row[COL_TX],
                "promo_id": pr["promo_id"],
                "promo_type": pr["promo_type"],
                "ptype_prob": float(probs[class_to_idx.get(pr["promo_type"], class_to_idx.get("NoPromo", 0))]),
                "scope_relevance": pr.get("scope_relevance", 0.0),
                "est_margin": pr.get("est_margin", 0.0),
                "is_online": row.get(COL_ONLINE, 0),
                "order_hour": row.get(COL_ORDER_H, 0),
                "dayofweek": row.get(COL_DOW, 0),
                "need_state_cluster": row.get("need_state_cluster", 0),
                "label": label
            })
    rank_df = pd.DataFrame(rows)

    # cap negatives per event
    out = []
    for eid, grp in rank_df.groupby("event_id"):
        pos = grp[grp["label"]==1]
        neg = grp[grp["label"]==0]
        keep_neg = neg if len(neg) <= (max_cands - len(pos)) else neg.sample(n=max_cands - len(pos), random_state=SEED)
        out.append(pd.concat([pos, keep_neg], ignore_index=True))
    return pd.concat(out, ignore_index=True)

rank_df = build_ranking_frame(
    basket_feats=basket_feat,
    ptype_model=ptype_model,
    ptype_classes=ptype_classes,
    ptype_featcols=ptype_featcols,
    promos_df=promos_df,
    label_df=label_df,
    topk=TOPK_TYPES,
    max_cands=MAX_CANDS
)
rank_df.head()


Unnamed: 0,event_id,promo_id,promo_type,ptype_prob,scope_relevance,est_margin,is_online,order_hour,dayofweek,need_state_cluster,label
0,PMTX0000001,PR0005,Buy 1 get 1,0.519983,0.7,0.0,0,9,0,0,1
1,PMTX0000001,PR0021,Buy 1 get 1,0.519983,0.7,0.0,0,9,0,0,1
2,PMTX0000001,PR0030,Buy 1 get 1,0.519983,0.7,0.0,0,9,0,0,1
3,PMTX0000001,PR0034,Buy 1 get 1,0.519983,0.7,0.0,0,9,0,0,1
4,PMTX0000001,PR0048,Buy 1 get 1,0.519983,0.7,0.0,0,9,0,0,1


In [88]:
# หลังสร้าง rank_df = pd.DataFrame(rows)
# bring event_time
rank_df = rank_df.merge(
    basket_feat[[COL_TX, "event_time"]].drop_duplicates(),
    left_on="event_id", right_on=COL_TX, how="left"
).drop(columns=[COL_TX])

# parse dates
for c in ["start_date","end_date"]:
    if c in rank_df.columns:
        rank_df[c] = pd.to_datetime(rank_df[c], errors="coerce")

# new features (เหมือน patch ด้านบน)
rank_df["discount_norm"] = (rank_df["discount"].astype(float).fillna(0) / 100.0) if "discount" in rank_df.columns else 0.0

rank_df["is_active_now"] = (
    (rank_df["start_date"] <= rank_df["event_time"]) &
    (rank_df["event_time"] <= rank_df["end_date"])
).astype(int) if {"start_date","end_date","event_time"}.issubset(rank_df.columns) else 1

rank_df["days_to_end"] = (
    (rank_df["end_date"] - rank_df["event_time"]).dt.days.fillna(0).clip(lower=-365, upper=365)
) if {"end_date","event_time"}.issubset(rank_df.columns) else 0

rank_df["type_dup_penalty"] = (
    rank_df.groupby(["event_id","promo_type"])["promo_id"].transform("count") - 1
).clip(lower=0).fillna(0)

rank_df["dup_product_penalty"] = (
    rank_df.groupby(["event_id","product_id"])["promo_id"].transform("count") - 1
).clip(lower=0).fillna(0) if "product_id" in rank_df.columns else 0


In [89]:
def ndcg_at_k(rels, k=5):
    rels = np.asfarray(rels)[:k]
    if rels.size == 0: return 0.0
    dcg = np.sum((2**rels - 1) / np.log2(np.arange(2, rels.size + 2)))
    ideal = np.sort(rels)[::-1]
    idcg = np.sum((2**ideal - 1) / np.log2(np.arange(2, ideal.size + 2)))
    return dcg / idcg if idcg > 0 else 0.0

def train_ranker(rank_df, k_list=(3,5)):
    F = ["ptype_prob","scope_relevance","est_margin",
     "discount_norm","is_active_now","days_to_end",
     "type_dup_penalty","dup_product_penalty",
     "is_online","order_hour","dayofweek","need_state_cluster"]

    ev = rank_df["event_id"].unique()
    tr_e, va_e = train_test_split(ev, test_size=0.2, random_state=SEED)
    tr = rank_df[rank_df["event_id"].isin(tr_e)]
    va = rank_df[rank_df["event_id"].isin(va_e)]

    def to_group(df_):
        grp_sizes = df_.groupby("event_id").size().values
        X = df_[F].fillna(0).values
        y = df_["label"].values
        return X, y, grp_sizes

    if HAS_LGB:
        Xtr, ytr, gtr = to_group(tr)
        Xva, yva, gva = to_group(va)

        # ----- core API with callbacks (รองรับหลายเวอร์ชัน) -----
        try:
            dtr = lgb.Dataset(Xtr, label=ytr, group=gtr)
            dva = lgb.Dataset(Xva, label=yva, group=gva, reference=dtr)
            params = dict(
                objective="lambdarank",
                metric="ndcg",          # <--- สำคัญ: ใช้ 'ndcg' + eval_at แทน 'ndcg@k'
                eval_at=[3, 5],        # <--- ระบุ k ที่ต้องการประเมิน
                learning_rate=0.05,
                num_leaves=63,
                min_data_in_leaf=100,
                feature_fraction=0.8,
                bagging_fraction=0.8,
                bagging_freq=1,
                verbosity=-1,
                seed=SEED
            )
            cbs = []
            # ใส่ early_stopping ผ่าน callback (บางเวอร์ชันเท่านั้น)
            try:
                cbs.append(lgb.early_stopping(stopping_rounds=100))
            except Exception:
                pass
            # ใส่ log interval ถ้ามี
            try:
                cbs.append(lgb.log_evaluation(100))
            except Exception:
                pass

            try:
                model = lgb.train(
                    params,
                    dtr,
                    num_boost_round=800,
                    valid_sets=[dtr, dva],
                    valid_names=["train","valid"],
                    callbacks=cbs
                )
            except ValueError:
                # ถ้ายัง complain เรื่อง metric/early stopping ให้รันแบบไม่มี early stopping
                model = lgb.train(
                    params,
                    dtr,
                    num_boost_round=800,
                    valid_sets=[dtr, dva],
                    valid_names=["train","valid"]
                )
            use_core_api = True

        except Exception:
            # ----- fallback เป็น sklearn API LGBMRanker -----
            ranker = lgb.LGBMRanker(
                objective="lambdarank",
                n_estimators=800,
                learning_rate=0.05,
                num_leaves=63,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=SEED
            )
            try:
                # บางเวอร์ชันรองรับ eval_at ผ่าน set_params
                ranker.set_params(metric="ndcg", eval_at=[3,5])
            except Exception:
                pass
            try:
                ranker.fit(
                    Xtr, ytr,
                    group=gtr.tolist(),
                    eval_set=[(Xva, yva)],
                    eval_group=[gva.tolist()]
                )
            except TypeError:
                ranker.fit(Xtr, ytr, group=gtr.tolist())
            model = ranker
            use_core_api = False

        # ----- ประเมิน NDCG -----
        ndcgs = {f"ndcg@{k}":[] for k in k_list}
        for eid, grp in va.groupby("event_id"):
            if use_core_api:
                s = model.predict(grp[F].fillna(0).values,
                                  num_iteration=getattr(model, "best_iteration", None))
            else:
                s = model.predict(grp[F].fillna(0).values)
            grp = grp.assign(_s=s).sort_values("_s", ascending=False)
            for k in k_list:
                ndcgs[f"ndcg@{k}"].append(ndcg_at_k(grp["label"].values, k))
        return {"model": model, "feature_cols": F, "report": {m: float(np.mean(v)) for m,v in ndcgs.items()}}

    else:
        # Fallback: pointwise classifier
        clf = GradientBoostingClassifier(random_state=SEED)
        Xtr, ytr, _ = to_group(tr)
        Xva, yva, _ = to_group(va)
        clf.fit(Xtr, ytr)
        ndcgs = {f"ndcg@{k}":[] for k in k_list}
        for eid, grp in va.groupby("event_id"):
            s = clf.predict_proba(grp[F].fillna(0).values)[:,1]
            grp = grp.assign(_s=s).sort_values("_s", ascending=False)
            for k in k_list:
                ndcgs[f"ndcg@{k}"].append(ndcg_at_k(grp["label"].values, k))
        return {"model": clf, "feature_cols": F, "report": {m: float(np.mean(v)) for m,v in ndcgs.items()}, "fallback_pointwise": True}



rank_art = train_ranker(rank_df)
rank_art["report"]


Training until validation scores don't improve for 100 rounds
[100]	train's ndcg@3: 0.992347	train's ndcg@5: 0.994588	valid's ndcg@3: 0.98875	valid's ndcg@5: 0.990832
Early stopping, best iteration is:
[70]	train's ndcg@3: 0.990703	train's ndcg@5: 0.993287	valid's ndcg@3: 0.989239	valid's ndcg@5: 0.99116


{'ndcg@3': 0.9716150422727791, 'ndcg@5': 0.9728065864606266}

In [90]:
def score_event(event_tx_id, basket_feats, ptype_model, ptype_classes, ptype_featcols,
                promos_df, rank_art, topk=TOPK_TYPES, rel_th=REL_TH):
    # 0) ดึงแถวบริบท
    row = basket_feats[basket_feats[COL_TX]==event_tx_id]
    if row.empty:
        raise ValueError("transaction_id ไม่พบใน basket_feats")
    row = row.iloc[0]

    # 1) prior P(type|X)
    X = encode_features_for_ptype(row, FEATURE_COLS, ptype_featcols)
    probs = ptype_model.predict_proba(X)[0]
    class_to_idx = {c:i for i,c in enumerate(ptype_classes)}

    # 2) recall (แบบ relaxed)
    cands = recall_candidates_for_event_relaxed(
        basket_row=row,
        promos_df=promos_df,
        probs=probs,
        classes=ptype_classes,
        topk_types=TOPK_TYPES,
        relevance_thresh=rel_th,
        nopromo_label="NoPromo"
    )

    # 3) เตรียมฟีเจอร์ให้ครบสำหรับ ranker (เติม "ก่อน" ใช้ F)
    tmp = cands.copy()

    # prior prob ต่อโปรชนิดนั้น
    tmp["ptype_prob"] = tmp["promo_type"].apply(
        lambda t: probs[class_to_idx.get(t, class_to_idx.get("NoPromo", 0))]
    )

    # บริบทเหตุการณ์
    tmp["is_online"] = int(row.get(COL_ONLINE, 0))
    tmp["order_hour"] = int(row.get(COL_ORDER_H, 0))
    tmp["dayofweek"] = int(row.get(COL_DOW, 0))
    tmp["need_state_cluster"] = int(row.get("need_state_cluster", 0))

    # วันที่/ช่วงโปร
    now = row.get("event_time", pd.NaT)
    if "start_date" in tmp.columns and "end_date" in tmp.columns and pd.notna(now):
        tmp["is_active_now"] = ((tmp["start_date"] <= now) & (now <= tmp["end_date"])).astype(int)
        tmp["days_to_end"] = (tmp["end_date"] - now).dt.days.clip(lower=-365, upper=365)
    else:
        tmp["is_active_now"] = 1
        tmp["days_to_end"] = 0

    # ส่วนลด normalize
    if "discount" in tmp.columns:
        tmp["discount_norm"] = pd.to_numeric(tmp["discount"], errors="coerce").fillna(0) / 100.0
    else:
        tmp["discount_norm"] = 0.0

    # penalties ในกลุ่มเดียวกัน
    tmp["type_dup_penalty"] = (
        tmp.groupby("promo_type")["promo_id"].transform("count") - 1
    ).clip(lower=0).fillna(0)

    if "product_id" in tmp.columns:
        tmp["dup_product_penalty"] = (
            tmp.groupby("product_id")["promo_id"].transform("count") - 1
        ).clip(lower=0).fillna(0)
    else:
        tmp["dup_product_penalty"] = 0.0

    # กัน missing ที่ ranker ต้องใช้
    needed = ["ptype_prob","scope_relevance","est_margin",
              "discount_norm","is_active_now","days_to_end",
              "type_dup_penalty","dup_product_penalty",
              "is_online","order_hour","dayofweek","need_state_cluster"]
    for c in needed:
        if c not in tmp.columns:
            tmp[c] = 0.0
    tmp[needed] = tmp[needed].fillna(0)

    # 4) จัดอันดับด้วย ranker
    F = rank_art["feature_cols"]  # ต้องตรงกับตอนเทรน
    mdl = rank_art["model"]
    Xr = tmp[F].fillna(0).values

    if HAS_LGB and "fallback_pointwise" not in rank_art:
        s = mdl.predict(Xr, num_iteration=getattr(mdl, "best_iteration", None))
    else:
        s = mdl.predict_proba(Xr)[:, 1]

    # normalize และ tie-breaker
    s_ptp = float(np.ptp(s))
    tmp["ranker_score"] = (s - float(np.min(s))) / s_ptp if s_ptp > 1e-9 else s
    if tmp["ranker_score"].nunique() == 1:
        tb = (tmp["promo_id"].astype(str).apply(lambda x: (hash(x) % 997) / 997.0)) * 0.01
        tmp["ranker_score"] = tmp["ranker_score"] + tb

    # 5) blend คะแนนสุดท้าย (หลังมีทุกฟีเจอร์แล้ว)
    w = {
        "ptype_prob": 0.28,
        "ranker_score": 0.38,
        "scope_relevance": 0.15,
        "est_margin": 0.06,
        "discount_norm": 0.08,
        "is_active_now": 0.05
    }
    pen = {"type_dup_penalty": 0.05, "dup_product_penalty": 0.08}

    # tie-break helper: combine monotonic positives to reduce equal scores
    tie = (
        0.50*tmp["est_margin"].fillna(0).rank(pct=True) +
        0.30*tmp["discount_norm"].fillna(0).rank(pct=True) +
        0.20*tmp["scope_relevance"].fillna(0).rank(pct=True)
    )
    tie = (tie - tie.min()) / (tie.max() - tie.min() + 1e-9)

    # soft penalty for NoPromo to avoid topping unless clearly better
    is_np = ((tmp.get("promo_type").astype(str) == "NoPromo") | (tmp.get("promo_id").astype(str) == "__NOPROMO__")).astype(float)
    nopromo_penalty = 0.03 * is_np

    tmp["final_score"] = (
        w["ptype_prob"]*tmp["ptype_prob"] +
        w["ranker_score"]*tmp["ranker_score"] +
        w["scope_relevance"]*tmp["scope_relevance"] +
        w["est_margin"]*tmp["est_margin"] +
        w["discount_norm"]*tmp["discount_norm"] +
        w["is_active_now"]*tmp["is_active_now"]
        - pen["type_dup_penalty"]*tmp["type_dup_penalty"]
        - pen["dup_product_penalty"]*tmp["dup_product_penalty"]
        - nopromo_penalty
        + 0.01 * tie
    )

    # small deterministic jitter to break any remaining ties
    if tmp["final_score"].nunique() == 1:
        j = (tmp["promo_id"].astype(str).apply(lambda x: (hash(x) % 1009)/1009.0)) * 1e-4
        tmp["final_score"] = tmp["final_score"] + j

    return tmp.sort_values("final_score", ascending=False).reset_index(drop=True)


sample_tx_id = basket_feat[COL_TX].iloc[9000]
score_event(sample_tx_id, basket_feat, ptype_model, ptype_classes, ptype_featcols, promos_df, rank_art).head(10)


Unnamed: 0,promo_id,promo_type,product_id,discount,start_date,end_date,product_scope,is_online,est_margin,scope_relevance,...,order_hour,dayofweek,need_state_cluster,is_active_now,days_to_end,discount_norm,type_dup_penalty,dup_product_penalty,ranker_score,final_score
0,__NOPROMO__,NoPromo,,,NaT,NaT,,0,0.0,0.0,...,17,1,3,0,0.0,0.0,0,0.0,1.0,0.578439
1,PR0026,Mega Sale,P0527,29.0,2025-08-14 01:45:00,2025-09-18 01:45:00,,0,0.0,0.7,...,17,1,3,1,8.0,0.29,3,0.0,0.0,0.049096
2,PR0077,Mega Sale,P0404,29.0,2025-07-26 01:24:00,2025-09-15 01:24:00,,0,0.0,0.7,...,17,1,3,1,5.0,0.29,3,0.0,0.0,0.049096
3,PR0006,Mega Sale,P0107,22.0,2025-08-01 16:08:00,2025-09-21 16:08:00,,0,0.0,0.7,...,17,1,3,1,11.0,0.22,3,0.0,0.0,0.040666
4,PR0090,Mega Sale,P0763,14.0,2025-07-23 03:25:00,2025-09-20 03:25:00,,0,0.0,0.7,...,17,1,3,1,10.0,0.14,3,0.0,0.0,0.032567
5,PR0088,Product_Coupon,P0623,48.0,2025-09-04 09:56:00,2025-09-13 09:56:00,,0,0.0,0.7,...,17,1,3,1,3.0,0.48,8,0.0,0.030086,-0.173355
6,PR0069,Product_Coupon,P0925,43.0,2025-09-07 08:34:00,2025-09-20 08:34:00,,0,0.0,0.7,...,17,1,3,1,10.0,0.43,8,0.0,0.030086,-0.177921
7,PR0067,Product_Coupon,P0182,29.0,2025-08-11 02:29:00,2025-09-21 02:29:00,,0,0.0,0.7,...,17,1,3,1,11.0,0.29,8,0.0,0.030086,-0.190253
8,PR0003,Product_Coupon,P0441,25.0,2025-08-04 01:41:00,2025-09-11 01:41:00,,0,0.0,0.7,...,17,1,3,1,1.0,0.25,8,0.0,0.030086,-0.195151
9,PR0068,Product_Coupon,P0183,25.0,2025-07-24 13:49:00,2025-09-20 13:49:00,,0,0.0,0.7,...,17,1,3,1,10.0,0.25,8,0.0,0.030086,-0.195151


In [91]:
sample_tx_id = basket_feat["transaction_id"].iloc[5]
rec = score_event(sample_tx_id, basket_feat, ptype_model, ptype_classes, ptype_featcols,
                  promos_df, rank_art, topk=2, rel_th=0.30)
rec[['promo_id','promo_type','discount','product_scope','ranker_score','final_score']].head(5)  


Unnamed: 0,promo_id,promo_type,discount,product_scope,ranker_score,final_score
0,PR0005,Buy 1 get 1,100.0,,1.0,0.249258
1,PR0078,Buy 1 get 1,100.0,,1.0,0.249258
2,PR0030,Buy 1 get 1,100.0,,1.0,0.249258
3,PR0034,Buy 1 get 1,100.0,,1.0,0.249258
4,PR0095,Buy 1 get 1,100.0,,1.0,0.249258


In [92]:
from typing import List, Tuple

def apply_guardrails(
    ranked_promos: pd.DataFrame,
    k: int = 5,
    gap_rule_min_gap: float = 0.05,
    min_real_promos: int = 2,
    diversity_by: List[str] = ["promo_type", "product_scope"],
    max_per_type: int = 2,
    cap_nopromo: int = 1,
    nopromo_label: str = "NoPromo",
) -> pd.DataFrame:
    """
    Enforce guardrails over a single event candidate list already scored with `final_score`.
    Assumes columns: promo_id, promo_type, product_scope, final_score.
    Returns top-k after rules.
    """
    df = ranked_promos.copy()
    if df.empty:
        return df

    # 1) sort by final score
    df = df.sort_values("final_score", ascending=False).reset_index(drop=True)

    # 2) cap NoPromo count
    if cap_nopromo is not None and cap_nopromo >= 0:
        is_np = (df["promo_type"] == nopromo_label) | (df["promo_id"] == "__NOPROMO__")
        keep_np = df[is_np].head(cap_nopromo)
        keep_non = df[~is_np]
        df = pd.concat([keep_non, keep_np], ignore_index=True)
        df = df.sort_values("final_score", ascending=False).reset_index(drop=True)

    # 3) max per type
    if max_per_type is not None and max_per_type > 0 and "promo_type" in df.columns:
        df["_type_rank"] = df.groupby("promo_type").cumcount()
        df = df[df["_type_rank"] < max_per_type].drop(columns=["_type_rank"])  

    # 4) diversity constraints: ensure no exact duplicate scopes back-to-back
    if diversity_by:
        seen_keys = set()
        rows = []
        for _, r in df.iterrows():
            key = tuple(r.get(col, "") for col in diversity_by)
            if key not in seen_keys:
                rows.append(r)
                seen_keys.add(key)
            if len(rows) >= k * 3:  # keep buffer before gap rule
                break
        df = pd.DataFrame(rows)
        if not df.empty:
            df = df.sort_values("final_score", ascending=False).reset_index(drop=True)

    # 5) gap rule: keep items until score drops too much from best
    if not df.empty:
        best = float(df["final_score"].iloc[0])
        df = df[df["final_score"] >= best - gap_rule_min_gap]
        df = df.head(max(k, min_real_promos))

    # 6) ensure minimum real promos
    is_np = (df["promo_type"] == nopromo_label) | (df["promo_id"] == "__NOPROMO__")
    num_real = int((~is_np).sum())
    if num_real < min_real_promos:
        # pull more real promos from the original list
        src = ranked_promos.sort_values("final_score", ascending=False)
        extra = src[(~((src["promo_type"] == nopromo_label) | (src["promo_id"] == "__NOPROMO__"))) & (~src["promo_id"].isin(df["promo_id"]))]
        need = min_real_promos - num_real
        if need > 0 and not extra.empty:
            df = pd.concat([df, extra.head(need)], ignore_index=True)
            df = df.sort_values("final_score", ascending=False).head(max(k, min_real_promos))

    # final trim to k
    df = df.sort_values("final_score", ascending=False).head(k)
    return df



In [93]:
# Batch scoring + guardrails + validation

import random

def batch_score_with_guardrails(
    event_ids: List,
    basket_feats: pd.DataFrame,
    ptype_model,
    ptype_classes: List[str],
    ptype_featcols: List[str],
    promos_df: pd.DataFrame,
    rank_art: dict,
    k: int = 5,
    gap: float = 0.05,
    min_real: int = 2,
    diversity_by: List[str] = ["promo_type","product_scope"],
    max_per_type: int = 2,
    cap_nopromo: int = 1,
    nopromo_label: str = "NoPromo",
) -> Tuple[pd.DataFrame, dict]:
    rec_rows = []
    metrics = {"ndcg@3": [], "ndcg@5": [], "coverage": 0.0}

    # ground truth for validation
    # label_df from earlier cell
    truth = label_df.set_index(COL_TX)["used_type"].to_dict()

    for eid in event_ids:
        ranked = score_event(
            eid, basket_feats, ptype_model, ptype_classes, ptype_featcols, promos_df, rank_art
        )
        final = apply_guardrails(
            ranked, k=k, gap_rule_min_gap=gap, min_real_promos=min_real,
            diversity_by=diversity_by, max_per_type=max_per_type,
            cap_nopromo=cap_nopromo, nopromo_label=nopromo_label
        )

        # collect results
        final = final.assign(event_id=eid)
        rec_rows.append(final)

        # NDCG vs truth: relevance=1 if promo_type equals used_type
        used = truth.get(eid, "NoPromo")
        rels = (final["promo_type"].values == used).astype(int)
        metrics["ndcg@3"].append(ndcg_at_k(rels, 3))
        metrics["ndcg@5"].append(ndcg_at_k(rels, 5))

    recs = pd.concat(rec_rows, ignore_index=True) if rec_rows else pd.DataFrame()

    # coverage: share of events with at least one non-NoPromo recommended
    if not recs.empty:
        non_np_per_event = recs.groupby("event_id").apply(
            lambda g: (g["promo_type"] != nopromo_label).any()
        ).mean()
        metrics["coverage"] = float(non_np_per_event)
    else:
        metrics["coverage"] = 0.0

    metrics["ndcg@3"] = float(np.mean(metrics["ndcg@3"])) if metrics["ndcg@3"] else 0.0
    metrics["ndcg@5"] = float(np.mean(metrics["ndcg@5"])) if metrics["ndcg@5"] else 0.0
    return recs, metrics

# Run on a random sample of events
sample_events = basket_feat[COL_TX].drop_duplicates().sample(n=min(500, len(basket_feat)), random_state=SEED).tolist()
recs, m = batch_score_with_guardrails(
    sample_events,
    basket_feat,
    ptype_model,
    ptype_classes,
    ptype_featcols,
    promos_df,
    rank_art,
    k=5,
    gap=0.05,
    min_real=2,
    diversity_by=["promo_type","product_scope"],
    max_per_type=2,
    cap_nopromo=1,
)

m, recs.head(10)[["event_id","promo_id","promo_type","final_score"]]


  non_np_per_event = recs.groupby("event_id").apply(


({'ndcg@3': 0.9568674437241791,
  'ndcg@5': 0.9568674437241791,
  'coverage': 0.992},
     event_id     promo_id      promo_type  final_score
 0  TX0013649  __NOPROMO__         NoPromo     0.577496
 1  TX0013649       PR0011  Product_Coupon     0.072343
 2  TX0013649       PR0065  Product_Coupon     0.069708
 3  TX0002059  __NOPROMO__         NoPromo     0.568125
 4  TX0002059       PR0011  Product_Coupon    -0.235423
 5  TX0002059       PR0065  Product_Coupon    -0.237590
 6  TX0010175  __NOPROMO__         NoPromo     0.578439
 7  TX0010175       PR0011  Product_Coupon    -0.073720
 8  TX0010175       PR0065  Product_Coupon    -0.075949
 9  TX0004561  __NOPROMO__         NoPromo     0.567094)

In [94]:
# Extra ranking metrics

def precision_recall_at_k(pred_types, true_type, k=5):
    topk = list(pred_types[:k])
    hits = sum(t == true_type for t in topk)
    prec = hits / max(k, 1)
    rec = 1.0 if true_type in topk else 0.0  # single-label recall
    return float(prec), float(rec)


def reciprocal_rank(pred_types, true_type):
    for i, t in enumerate(pred_types, start=1):
        if t == true_type:
            return float(1.0 / i)
    return 0.0


def average_precision(pred_types, true_type):
    ap, hits = 0.0, 0
    for i, t in enumerate(pred_types, start=1):
        if t == true_type:
            hits += 1
            ap += hits / i
    return float(ap / max(hits, 1)) if hits else 0.0


In [95]:
# Train/Test split by event_time from tx_merge3.csv and full evaluation

# 1) Build event list with timestamps
_events = basket_feat[[COL_TX, "event_time"]].drop_duplicates().dropna()
_events = _events.sort_values("event_time")
cut = int(len(_events) * 0.8)
train_events = set(_events.iloc[:cut][COL_TX].tolist())
test_events  = set(_events.iloc[cut:][COL_TX].tolist())

# 2) Rebuild rank_df restricted to train events and train a fresh ranker
rank_df_train = rank_df[rank_df["event_id"].isin(train_events)].copy()
rank_art_tt   = train_ranker(rank_df_train)

# 3) Evaluate on test events with guardrails
truth = label_df.set_index(COL_TX)["used_type"].to_dict()

def eval_on_events(event_ids, k_list=(3,5), k_guard=5):
    ndcgs = {f"ndcg@{k}": [] for k in k_list}
    cover, precs, recs, mrrs, maps = [], [], [], [], []

    for eid in event_ids:
        raw = score_event(eid, basket_feat, ptype_model, ptype_classes, ptype_featcols, promos_df, rank_art_tt)
        fin = apply_guardrails(raw, k=k_guard, gap_rule_min_gap=0.05, min_real_promos=2,
                               diversity_by=["promo_type","product_scope"], max_per_type=2, cap_nopromo=1)
        # relevance by promo_type match (single-label)
        y_true = truth.get(eid, "NoPromo")
        rels = (fin["promo_type"].values == y_true).astype(int)
        for k in k_list:
            ndcgs[f"ndcg@{k}"].append(ndcg_at_k(rels, k))
        cover.append((fin["promo_type"] != "NoPromo").any())
        p, r = precision_recall_at_k(fin["promo_type"].values, y_true, k=k_guard)
        precs.append(p); recs.append(r)
        mrrs.append(reciprocal_rank(fin["promo_type"].values, y_true))
        maps.append(average_precision(fin["promo_type"].values, y_true))

    out = {m: float(np.mean(v)) if v else 0.0 for m, v in ndcgs.items()}
    out.update({
        "coverage": float(np.mean(cover)) if cover else 0.0,
        f"precision@{k_guard}": float(np.mean(precs)) if precs else 0.0,
        f"recall@{k_guard}": float(np.mean(recs)) if recs else 0.0,
        "mrr": float(np.mean(mrrs)) if mrrs else 0.0,
        "map": float(np.mean(maps)) if maps else 0.0,
    })
    return out

metrics_test = eval_on_events(sorted(test_events), k_list=(3,5), k_guard=5)
metrics_test


Training until validation scores don't improve for 100 rounds
[100]	train's ndcg@3: 0.999962	train's ndcg@5: 0.999972	valid's ndcg@3: 0.996579	valid's ndcg@5: 0.997372
Early stopping, best iteration is:
[51]	train's ndcg@3: 0.998484	train's ndcg@5: 0.998946	valid's ndcg@3: 0.99719	valid's ndcg@5: 0.997908


{'ndcg@3': 0.821997287112932,
 'ndcg@5': 0.821997287112932,
 'coverage': 0.9671532846715328,
 'precision@5': 0.18847758081334723,
 'recall@5': 0.8331595411887383,
 'mrr': 0.8159106708376782,
 'map': 0.8180396246089676}

In [104]:
tx_id = basket_feat["transaction_id"].iloc[6]
rec = score_event(tx_id, basket_feat, ptype_model, ptype_classes, ptype_featcols, promos_df, rank_art)
rec[["promo_id","promo_type","final_score","ranker_score","ptype_prob","scope_relevance","est_margin","discount_norm"]].head(20)

Unnamed: 0,promo_id,promo_type,final_score,ranker_score,ptype_prob,scope_relevance,est_margin,discount_norm
0,PR0005,Buy 1 get 1,0.790203,1.0,0.520811,1.0,0.0,1.0
1,__NOPROMO__,NoPromo,0.059535,0.0,0.358139,0.0,0.0,0.0


In [97]:
# === Step 1: Build promotion-product mapping and export CSV ===
from collections import defaultdict

prod_path = BASE/"products.csv"
prom_path = BASE/"promotions.csv"
prom_tx_path = BASE/"promotion_transactions.csv"

products_df = pd.read_csv(prod_path)
promotions_df_full = pd.read_csv(prom_path, parse_dates=["start_date","end_date"], dayfirst=False)
try:
    promo_tx = pd.read_csv(prom_tx_path)
except FileNotFoundError:
    promo_tx = pd.DataFrame(columns=["transaction_id","promo_id","product_id","min_qty","discount_applied"])  # safe empty

# Normalize
_products = products_df.rename(columns={"category": "category", "brand": "brand"})
_proms = promotions_df_full.copy()

# Build product_ids list per promo from historical mapping
promo_to_products = (
    promo_tx.groupby("promo_id")["product_id"].apply(lambda s: sorted(set(s.dropna().astype(str)))).to_dict()
)

# Lookup for product -> (category, brand)
prod_lookup = _products.set_index("product_id")[ ["category","brand"] ]

# Infer category/brand scopes heuristically
category_scope = {}
brand_scope = {}
min_qty_map = {}
if not promo_tx.empty:
    if "min_qty" in promo_tx.columns:
        min_qty_map = promo_tx.groupby("promo_id")["min_qty"].min().fillna(1).astype(int).to_dict()
    for pid, plist in promo_to_products.items():
        idx = [p for p in plist if p in prod_lookup.index]
        if not idx:
            continue
        dfp = prod_lookup.loc[idx]
        cat_counts = dfp["category"].value_counts()
        br_counts  = dfp["brand"].value_counts()
        if len(dfp):
            if not cat_counts.empty and (cat_counts.iloc[0] / len(dfp) >= 0.6):
                category_scope[pid] = [str(cat_counts.index[0])]
            if not br_counts.empty and (br_counts.iloc[0] / len(dfp) >= 0.6):
                brand_scope[pid] = [str(br_counts.index[0])]

# Defaults
DEFAULT_MIN_QTY = 1
DEFAULT_MAX_DISCOUNT_PER_USER = 1000.0

rows = []
for _, pr in _proms.iterrows():
    pid = pr.get("promo_id")
    rows.append({
        "promo_id": pid,
        "product_id": ",".join(promo_to_products.get(pid, [])),
        "category_scope": ",".join(category_scope.get(pid, [])),
        "brand_scope": ",".join(brand_scope.get(pid, [])),
        "min_qty": int(min_qty_map.get(pid, DEFAULT_MIN_QTY)),
        "max_discount_per_user": DEFAULT_MAX_DISCOUNT_PER_USER
    })

promotion_products = pd.DataFrame(rows)

# Persist
out_path = BASE/"promotion_products.csv"
promotion_products.to_csv(out_path, index=False)
print(f"promotion_products.csv written to {out_path} with shape {promotion_products.shape}")

# Helper: build fast lookup dicts
_promoprod_lookup = {
    r["promo_id"]: {
        "product_ids": [p for p in str(r["product_id"]).split(",") if p and p != 'nan'],
        "categories": [c for c in str(r["category_scope"]).split(",") if c and c != 'nan'],
        "brands": [b for b in str(r["brand_scope"]).split(",") if b and b != 'nan'],
        "min_qty": r["min_qty"],
        "max_discount_per_user": r["max_discount_per_user"],
    }
    for _, r in promotion_products.iterrows()
}


promotion_products.csv written to Datasets\mockup_ver2\promotion_products.csv with shape (100, 6)


  promotions_df_full = pd.read_csv(prom_path, parse_dates=["start_date","end_date"], dayfirst=False)
  promotions_df_full = pd.read_csv(prom_path, parse_dates=["start_date","end_date"], dayfirst=False)


In [98]:
# === Step 2: Enhanced features + precise scope relevance ===

# Safe utilities using available data
try:
    tx_df_full = tx_merge.copy()
except NameError:
    tx_df_full = pd.read_csv(BASE/"transactions.csv")

# Join with promo usage if available
try:
    promo_tx_full = pd.read_csv(BASE/"promotion_transactions.csv")
except FileNotFoundError:
    promo_tx_full = pd.DataFrame(columns=["transaction_id","promo_id","product_id"])  

# Build quick helper indices
_tx_by_id = tx_df_full.groupby("transaction_id")
_promotx_by_promo = promo_tx_full.groupby("promo_id") if not promo_tx_full.empty else {}


def _get_basket_details(df_rows: pd.DataFrame) -> dict:
    return {
        'product_ids': df_rows['product_id'].astype(str).tolist() if 'product_id' in df_rows.columns else [],
        'categories': df_rows.get('products.category', pd.Series([], dtype=str)).astype(str).tolist() if 'products.category' in df_rows.columns else [],
        'brands': df_rows.get('products.brand', pd.Series([], dtype=str)).astype(str).tolist() if 'products.brand' in df_rows.columns else [],
        'quantities': df_rows.get('qty', pd.Series([], dtype=float)).astype(float).tolist() if 'qty' in df_rows.columns else [],
        'values': df_rows.get('_revenue', pd.Series([], dtype=float)).astype(float).tolist() if '_revenue' in df_rows.columns else (df_rows.get('price', pd.Series([], dtype=float)).astype(float).tolist() if 'price' in df_rows.columns else []),
    }


def get_historical_conversion(promo_id: str) -> float:
    if isinstance(_promotx_by_promo, dict) or promo_tx_full.empty:
        return 0.0
    grp = _promotx_by_promo.get_group(promo_id) if promo_id in _promotx_by_promo.groups else None
    if grp is None or grp.empty:
        return 0.0
    # crude estimate: unique transactions using this promo / total transactions during active days
    used_tx = grp['transaction_id'].nunique()
    prom_row = promotions_df_full[promotions_df_full['promo_id']==promo_id]
    if prom_row.empty:
        return min(1.0, used_tx / max(len(tx_df_full), 1))
    s, e = prom_row.iloc[0]['start_date'], prom_row.iloc[0]['end_date']
    if 'timestamp' in tx_df_full.columns and pd.notna(s) and pd.notna(e):
        mask = (pd.to_datetime(tx_df_full['timestamp'], errors='coerce')>=s) & (pd.to_datetime(tx_df_full['timestamp'], errors='coerce')<=e)
        denom = int(tx_df_full.loc[mask, 'transaction_id'].nunique()) or 1
    else:
        denom = int(tx_df_full['transaction_id'].nunique()) or 1
    return float(used_tx/denom)


def get_avg_basket_lift(promo_id: str) -> float:
    # estimate: avg qty of eligible items with promo vs without (very rough)
    if promo_tx_full.empty:
        return 0.0
    elig = promo_tx_full[promo_tx_full['promo_id']==promo_id]
    if elig.empty:
        return 0.0
    tx_ids = elig['transaction_id'].unique().tolist()
    q_with = tx_df_full[tx_df_full['transaction_id'].isin(tx_ids)].get('qty', pd.Series([], dtype=float)).astype(float)
    q_all = tx_df_full.get('qty', pd.Series([], dtype=float)).astype(float)
    if q_all.empty:
        return 0.0
    return float(q_with.mean() - q_all.mean())


def get_user_promo_history(user_id, promo_id: str) -> float:
    if user_id is None or promo_tx_full.empty:
        return 0.0
    if 'user_id' not in tx_df_full.columns:
        return 0.0
    tx_ids = tx_df_full[tx_df_full['user_id']==user_id]['transaction_id'].unique().tolist()
    if not tx_ids:
        return 0.0
    used = promo_tx_full[(promo_tx_full['transaction_id'].isin(tx_ids)) & (promo_tx_full['promo_id']==promo_id)]
    return float(min(1.0, used['transaction_id'].nunique() / max(len(tx_ids),1)))


def calculate_eligible_revenue(basket_df: pd.DataFrame, eligible_products: list[str]) -> float:
    if not eligible_products:
        return 0.0
    elig = basket_df[basket_df['product_id'].astype(str).isin(set(eligible_products))]
    if '_revenue' in elig.columns:
        return float(elig['_revenue'].sum())
    if {'price','qty'}.issubset(elig.columns):
        return float((elig['price'].fillna(0)*elig['qty'].fillna(0)).sum())
    if 'price' in elig.columns:
        return float(elig['price'].fillna(0).sum())
    return 0.0


def calculate_enhanced_features(basket_df: pd.DataFrame, basket_row: pd.Series, promotion: pd.Series, promoprod_lookup: dict) -> dict:
    uid = basket_row.get('user_id', None)
    pid = promotion.get('promo_id')
    scope = promoprod_lookup.get(pid, {"product_ids":[],"categories":[],"brands":[],"min_qty":1,"max_discount_per_user":1000.0})
    basket_details = _get_basket_details(basket_df)

    basket_products = basket_details['product_ids']
    eligible_products = scope['product_ids']

    inter = len(set(basket_products) & set(eligible_products))
    product_overlap_ratio = float(inter / max(len(basket_products), 1))

    eligible_revenue = calculate_eligible_revenue(basket_df, eligible_products)
    actual_discount_value = float(eligible_revenue * float(promotion.get('discount', 0) or 0) / 100.0)

    conv = get_historical_conversion(pid)
    lift = get_avg_basket_lift(pid)
    affinity = get_user_promo_history(uid, pid)

    now = pd.to_datetime(basket_row.get('event_time', pd.NaT), errors='coerce')
    s = pd.to_datetime(promotion.get('start_date', pd.NaT), errors='coerce')
    days_since_start = int((now - s).days) if (pd.notna(now) and pd.notna(s)) else 0
    promotion_freshness = float(1.0 / (1 + max(days_since_start, 0)))

    # simple competition proxy: count active promos of same type at this moment
    same_type_active = 0
    if 'promo_type' in promotions_df_full.columns:
        t = promotion.get('promo_type')
        if pd.notna(now):
            active = promotions_df_full[(promotions_df_full['promo_type']==t) & (promotions_df_full['start_date']<=now) & (now<=promotions_df_full['end_date'])]
            same_type_active = int(len(active))
    promo_uniqueness_score = float(1.0 / (1 + same_type_active))

    return {
        'product_overlap_ratio': product_overlap_ratio,
        'eligible_revenue': eligible_revenue,
        'actual_discount_value': actual_discount_value,
        'promo_conversion_rate': conv,
        'promo_avg_basket_lift': lift,
        'user_promo_affinity': affinity,
        'days_since_start': days_since_start,
        'promotion_freshness': promotion_freshness,
        'similar_promos_active': same_type_active,
        'promo_uniqueness_score': promo_uniqueness_score,
    }


def calculate_precise_scope_relevance(basket_df: pd.DataFrame, promotion: pd.Series, promoprod_lookup: dict) -> float:
    # Detailed basket
    bd = _get_basket_details(basket_df)
    scope = promoprod_lookup.get(promotion.get('promo_id'), {"product_ids":[],"categories":[],"brands":[]})

    # base relevance
    product_match = len(set(bd['product_ids']) & set(scope.get('product_ids', [])))
    category_match = len(set(bd['categories']) & set(scope.get('categories', [])))
    brand_match = len(set(bd['brands']) & set(scope.get('brands', [])))

    denom_p = max(len(bd['product_ids']), 1)
    denom_c = max(len(bd['categories']), 1)
    denom_b = max(len(bd['brands']), 1)

    relevance = (
        0.5 * (product_match / denom_p) +
        0.3 * (category_match / denom_c) +
        0.2 * (brand_match / denom_b)
    )

    # value weight boost
    try:
        values = bd['values']
        prods = bd['product_ids']
        val_total = float(sum(values)) or 1.0
        value_weight = float(sum(v for i, v in enumerate(values) if prods[i] in set(scope.get('product_ids', []))))
        value_ratio = float(value_weight / val_total)
    except Exception:
        value_ratio = 0.0

    final_relevance = 0.7 * relevance + 0.3 * value_ratio
    return float(max(0.0, min(1.0, final_relevance)))


In [100]:
# === Step 3: Redefine recall, training features, tie-breaking, and scoring v2 ===

# Smart tie-breaking per requirement

def apply_tiebreaking(candidates: pd.DataFrame) -> pd.DataFrame:
    if candidates.empty:
        return candidates
    score_threshold = 0.001
    df = candidates.copy()
    df['score_bucket'] = (df['final_score'] / score_threshold).astype(int)
    # ensure columns exist with safe defaults
    for c in ['promo_conversion_rate','promotion_freshness','promo_uniqueness_score','est_margin']:
        if c not in df.columns:
            df[c] = 0.0
    df['tiebreak_score'] = (
        df['promo_conversion_rate'] * 0.4 +
        df['promotion_freshness'] * 0.3 +
        df['promo_uniqueness_score'] * 0.2 +
        df['est_margin'] * 0.1
    )
    df['final_score_adjusted'] = (
        df['final_score'] +
        df['tiebreak_score'] * 0.01 +
        df['promo_id'].astype(str).apply(lambda x: (hash(x) % 1000) / 1_000_000)
    )
    return df.sort_values('final_score_adjusted', ascending=False).drop(columns=['score_bucket'], errors='ignore')


# Override recall to use precise scope relevance and keep type gating

def recall_candidates_for_event_relaxed(basket_row: pd.Series,
                                        promos_df: pd.DataFrame,
                                        probs: np.ndarray,
                                        classes: list,
                                        topk_types: int = 2,
                                        relevance_thresh: float = 0.30,
                                        nopromo_label: str = "NoPromo") -> pd.DataFrame:
    top_types = get_top_types(probs, classes, k=topk_types, ensure_non_nopromo=2, nopromo_label=nopromo_label)
    now = basket_row.get("event_time", pd.NaT)

    # Select candidate promos by date/channel/type
    def _elig(df, strict_online=True):
        out = df.copy()
        if 'start_date' in out.columns and 'end_date' in out.columns and pd.notna(now):
            out = out[(out['start_date'] <= now) & (now <= out['end_date'])]
        if strict_online and 'is_online' in out.columns and 'is_online' in basket_row.index:
            out = out[out['is_online'] == int(basket_row['is_online'])]
        return out

    cand = _elig(promos_df, strict_online=True)
    if 'promo_type' in cand.columns:
        cand = cand[cand['promo_type'].isin(top_types)]

    # Build basket rows for the transaction to compute relevance/features
    tx_id = basket_row.get('transaction_id')
    basket_tx_rows = tx_merge[tx_merge['transaction_id']==tx_id] if 'transaction_id' in tx_merge.columns else pd.DataFrame()

    def _score_add(df_):
        df_ = df_.copy()
        df_['scope_relevance'] = df_.apply(lambda r: calculate_precise_scope_relevance(basket_tx_rows, r, _promoprod_lookup), axis=1)
        # add enhanced per-promo features
        enh = df_.apply(lambda r: pd.Series(calculate_enhanced_features(basket_tx_rows, basket_row, r, _promoprod_lookup)), axis=1)
        for col in enh.columns:
            df_[col] = enh[col]
        return df_

    cand = _score_add(cand)
    out = cand[cand['scope_relevance'] >= relevance_thresh]

    if out.empty:
        cand2 = _elig(promos_df, strict_online=False)
        if 'promo_type' in cand2.columns:
            cand2 = cand2[cand2['promo_type'].isin(top_types)]
        out = _score_add(cand2)
        out = out[out['scope_relevance'] >= max(0.2, relevance_thresh*0.75)]

    if out.empty:
        cand3 = _elig(promos_df, strict_online=False)
        out = _score_add(cand3)
        out = out.nlargest(50, 'scope_relevance')

    nopromo = pd.DataFrame([{
        'promo_id': '__NOPROMO__', 'promo_type': nopromo_label,
        'product_scope': '', 'est_margin': 0.0, 'scope_relevance': 0.0,
        'product_overlap_ratio': 0.0, 'eligible_revenue': 0.0, 'actual_discount_value': 0.0,
        'promo_conversion_rate': 0.0, 'promo_avg_basket_lift': 0.0,
        'user_promo_affinity': 0.0, 'days_since_start': 0, 'promotion_freshness': 0.0,
        'similar_promos_active': 0, 'promo_uniqueness_score': 0.0
    }])
    return pd.concat([out, nopromo], ignore_index=True).drop_duplicates(subset=['promo_id'], keep='first')


# Upgrade training feature set and params

def train_ranker(rank_df: pd.DataFrame, k_list=(3,5)):
    base_F = [
        'ptype_prob','scope_relevance','est_margin','discount_norm','is_active_now','days_to_end',
        'type_dup_penalty','dup_product_penalty','is_online','order_hour','dayofweek','need_state_cluster'
    ]
    extra_F = [
        'product_overlap_ratio','eligible_revenue','actual_discount_value',
        'promo_conversion_rate','promo_avg_basket_lift','user_promo_affinity',
        'promotion_freshness','promo_uniqueness_score'
    ]
    F = [f for f in base_F + extra_F if f in rank_df.columns]

    ev = rank_df['event_id'].unique()
    tr_e, va_e = train_test_split(ev, test_size=0.2, random_state=SEED)
    tr = rank_df[rank_df['event_id'].isin(tr_e)]
    va = rank_df[rank_df['event_id'].isin(va_e)]

    def to_group(df_):
        grp_sizes = df_.groupby('event_id').size().values
        X = df_[F].fillna(0).values
        y = df_['label'].values
        return X, y, grp_sizes

    if HAS_LGB:
        Xtr, ytr, gtr = to_group(tr)
        Xva, yva, gva = to_group(va)
        try:
            dtr = lgb.Dataset(Xtr, label=ytr, group=gtr)
            dva = lgb.Dataset(Xva, label=yva, group=gva, reference=dtr)
            params = dict(
                objective='lambdarank',
                metric='ndcg',
                eval_at=[1,3,5],
                label_gain=[0,1,3,7,15],
                max_position=10,
                learning_rate=0.05,
                num_leaves=63,
                min_data_in_leaf=50,
                min_sum_hessian_in_leaf=5.0,
                lambda_l1=0.1,
                lambda_l2=0.1,
                feature_fraction=0.85,
                bagging_fraction=0.85,
                bagging_freq=1,
                verbosity=-1,
                seed=SEED,
            )
            cbs = []
            try: cbs.append(lgb.early_stopping(stopping_rounds=100))
            except Exception: pass
            try: cbs.append(lgb.log_evaluation(100))
            except Exception: pass
            try:
                model = lgb.train(params, dtr, num_boost_round=800, valid_sets=[dtr, dva], valid_names=['train','valid'], callbacks=cbs)
            except ValueError:
                model = lgb.train(params, dtr, num_boost_round=800, valid_sets=[dtr, dva], valid_names=['train','valid'])
            use_core_api = True
        except Exception:
            ranker = lgb.LGBMRanker(objective='lambdarank', n_estimators=800, learning_rate=0.05,
                                    num_leaves=63, subsample=0.85, colsample_bytree=0.85, random_state=SEED)
            try: ranker.set_params(metric='ndcg', eval_at=[1,3,5], label_gain=[0,1,3,7,15])
            except Exception: pass
            try:
                ranker.fit(Xtr, ytr, group=gtr.tolist(), eval_set=[(Xva, yva)], eval_group=[gva.tolist()])
            except TypeError:
                ranker.fit(Xtr, ytr, group=gtr.tolist())
            model = ranker
            use_core_api = False

        # Evaluate
        def _predict(grp_df):
            if use_core_api:
                return model.predict(grp_df[F].fillna(0).values, num_iteration=getattr(model, 'best_iteration', None))
            return model.predict(grp_df[F].fillna(0).values)

        ndcgs = {f'ndcg@{k}': [] for k in k_list}
        for eid, grp in va.groupby('event_id'):
            s = _predict(grp)
            grp = grp.assign(_s=s).sort_values('_s', ascending=False)
            for k in k_list:
                ndcgs[f'ndcg@{k}'].append(ndcg_at_k(grp['label'].values, k))
        return {'model': model, 'feature_cols': F, 'report': {m: float(np.mean(v)) for m, v in ndcgs.items()}}

    # fallback classifier
    clf = GradientBoostingClassifier(random_state=SEED)
    Xtr, ytr, _ = to_group(tr)
    Xva, yva, _ = to_group(va)
    clf.fit(Xtr, ytr)
    ndcgs = {f'ndcg@{k}': [] for k in k_list}
    for eid, grp in va.groupby('event_id'):
        s = clf.predict_proba(grp[F].fillna(0).values)[:,1]
        grp = grp.assign(_s=s).sort_values('_s', ascending=False)
        for k in k_list:
            ndcgs[f'ndcg@{k}'].append(ndcg_at_k(grp['label'].values, k))
    return {'model': clf, 'feature_cols': F, 'report': {m: float(np.mean(v)) for m, v in ndcgs.items()}, 'fallback_pointwise': True}


# Scoring v2 using the new features + tie-breaking

def score_event_v2(event_tx_id,
                   basket_feats: pd.DataFrame,
                   ptype_model,
                   ptype_classes,
                   ptype_featcols,
                   promos_df: pd.DataFrame,
                   rank_art: dict,
                   topk: int = TOPK_TYPES,
                   rel_th: float = REL_TH):
    row = basket_feats[basket_feats[COL_TX]==event_tx_id]
    if row.empty:
        raise ValueError('transaction_id ไม่พบใน basket_feats')
    row = row.iloc[0]

    X = encode_features_for_ptype(row, FEATURE_COLS, ptype_featcols)
    probs = ptype_model.predict_proba(X)[0]
    class_to_idx = {c:i for i,c in enumerate(ptype_classes)}

    cands = recall_candidates_for_event_relaxed(
        basket_row=row,
        promos_df=promos_df,
        probs=probs,
        classes=ptype_classes,
        topk_types=TOPK_TYPES,
        relevance_thresh=rel_th,
        nopromo_label='NoPromo'
    )

    tmp = cands.copy()
    tmp['ptype_prob'] = tmp['promo_type'].apply(lambda t: probs[class_to_idx.get(t, class_to_idx.get('NoPromo', 0))])
    tmp['is_online'] = int(row.get(COL_ONLINE, 0))
    tmp['order_hour'] = int(row.get(COL_ORDER_H, 0))
    tmp['dayofweek'] = int(row.get(COL_DOW, 0))
    tmp['need_state_cluster'] = int(row.get('need_state_cluster', 0))

    now = row.get('event_time', pd.NaT)
    if {'start_date','end_date'}.issubset(tmp.columns) and pd.notna(now):
        tmp['is_active_now'] = ((tmp['start_date'] <= now) & (now <= tmp['end_date'])).astype(int)
        tmp['days_to_end'] = (tmp['end_date'] - now).dt.days.clip(lower=-365, upper=365)
    else:
        tmp['is_active_now'] = 1
        tmp['days_to_end'] = 0

    if 'discount' in tmp.columns:
        tmp['discount_norm'] = pd.to_numeric(tmp['discount'], errors='coerce').fillna(0) / 100.0
    else:
        tmp['discount_norm'] = 0.0

    tmp['type_dup_penalty'] = (tmp.groupby('promo_type')['promo_id'].transform('count') - 1).clip(lower=0).fillna(0)
    if 'product_id' in tmp.columns:
        tmp['dup_product_penalty'] = (tmp.groupby('product_id')['promo_id'].transform('count') - 1).clip(lower=0).fillna(0)
    else:
        tmp['dup_product_penalty'] = 0.0

    needed = [
        'ptype_prob','scope_relevance','est_margin','discount_norm','is_active_now','days_to_end',
        'type_dup_penalty','dup_product_penalty','is_online','order_hour','dayofweek','need_state_cluster',
        'product_overlap_ratio','eligible_revenue','actual_discount_value','promo_conversion_rate',
        'promo_avg_basket_lift','user_promo_affinity','promotion_freshness','promo_uniqueness_score']
    for c in needed:
        if c not in tmp.columns:
            tmp[c] = 0
    tmp[needed] = tmp[needed].fillna(0)

    F = rank_art['feature_cols']
    mdl = rank_art['model']
    Xr = tmp[F].fillna(0).values
    if HAS_LGB and 'fallback_pointwise' not in rank_art:
        s = mdl.predict(Xr, num_iteration=getattr(mdl, 'best_iteration', None))
    else:
        s = mdl.predict_proba(Xr)[:,1]

    ptp = float(np.ptp(s))
    tmp['ranker_score'] = (s - float(np.min(s))) / ptp if ptp > 1e-9 else s
    if tmp['ranker_score'].nunique() == 1:
        tb = (tmp['promo_id'].astype(str).apply(lambda x: (hash(x) % 997) / 997.0)) * 0.01
        tmp['ranker_score'] = tmp['ranker_score'] + tb

    w = {
        'ptype_prob': 0.25,
        'ranker_score': 0.40,
        'scope_relevance': 0.15,
        'est_margin': 0.05,
        'discount_norm': 0.05,
        'is_active_now': 0.05
    }
    pen = {'type_dup_penalty': 0.05, 'dup_product_penalty': 0.08}

    tie = (
        0.40*tmp['promo_conversion_rate'].rank(pct=True) +
        0.30*tmp['promotion_freshness'].rank(pct=True) +
        0.20*tmp['promo_uniqueness_score'].rank(pct=True) +
        0.10*tmp['est_margin'].rank(pct=True)
    )
    tie = (tie - tie.min()) / (tie.max() - tie.min() + 1e-9)

    is_np = ((tmp.get('promo_type').astype(str) == 'NoPromo') | (tmp.get('promo_id').astype(str) == '__NOPROMO__')).astype(float)
    nopromo_penalty = 0.03 * is_np

    tmp['final_score'] = (
        w['ptype_prob']*tmp['ptype_prob'] +
        w['ranker_score']*tmp['ranker_score'] +
        w['scope_relevance']*tmp['scope_relevance'] +
        w['est_margin']*tmp['est_margin'] +
        w['discount_norm']*tmp['discount_norm'] +
        w['is_active_now']*tmp['is_active_now'] -
        pen['type_dup_penalty']*tmp['type_dup_penalty'] -
        pen['dup_product_penalty']*tmp['dup_product_penalty'] -
        nopromo_penalty + 0.01 * tie
    )

    ranked = apply_tiebreaking(tmp)
    return ranked.sort_values('final_score_adjusted', ascending=False).reset_index(drop=True)

# convenience alias
score_event = score_event_v2


In [115]:
# === PATHS (ตามโครงสร้างของคุณ) ===========================================
from pathlib import Path
import os, json, pickle, sys, platform, sklearn
from datetime import datetime

ROOT = Path(".")                               # โฟลเดอร์โปรเจกต์
DATA = ROOT / "Datasets" / "mockup_ver2"       # ที่เก็บ CSV ตามรูป
ARTI = ROOT / "Notebooks" / "artifacts"        # โฟลเดอร์เซฟอาร์ติแฟกต์

(ARTI/"models").mkdir(parents=True, exist_ok=True)
(ARTI/"preprocessors").mkdir(parents=True, exist_ok=True)
(ARTI/"data").mkdir(parents=True, exist_ok=True)
(ARTI/"configs").mkdir(parents=True, exist_ok=True)

def pkl_save(obj, path):
    with open(path, "wb") as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

# ---------------------------------------------------------------------------
# วัตถุเหล่านี้ควรมีอยู่แล้วในโน้ตบุ๊กเทรนของคุณ
# ptype_model, ptype_classes, ptype_featcols
# pca_need, kmeans_need
# ranker_model, ranker_featcols
# FEATURE_COLS
# (optional) scaler_ptype
# (optional) promotion_products  -> ถ้าคุณ build แล้ว
# ---------------------------------------------------------------------------

# === 1) เซฟโมเดล / พรีโปรฯ =================================================
pkl_save(ptype_model,   ARTI/"models/ptype_model.pkl")
pkl_save(pca_need,      ARTI/"preprocessors/pca_need.pkl")
pkl_save(kmeans_need,   ARTI/"preprocessors/kmeans_need.pkl")
pkl_save(ranker_model,  ARTI/"models/ranker_model.pkl")

try:
    if scaler_ptype is not None:
        pkl_save(scaler_ptype, ARTI/"preprocessors/scaler_ptype.pkl")
except NameError:
    pass

# (ถ้าใช้ LightGBM และอยากมี native ไฟล์สำรอง)
try:
    import lightgbm as lgb
    if hasattr(ranker_model, "booster_"):
        ranker_model.booster_.save_model(str(ARTI/"models/ranker_model.txt"))
except Exception as e:
    print("Skip saving LightGBM native:", e)

# === 2) เซฟ CONFIGS (คอลัมน์ที่ต้องใช้ตอน infer + การ์ดเรล) ===============
feature_config = {
    "ptype_classes": list(ptype_classes),
    "ptype_featcols": list(ptype_featcols),
    "FEATURE_COLS": list(FEATURE_COLS),
    "ranker_featcols": list(ranker_featcols),
}
with open(ARTI/"configs/feature_config.json","w",encoding="utf-8") as f:
    json.dump(feature_config, f, ensure_ascii=False, indent=2)

guardrails = {
    "gap_rule_min_gap": 0.05,
    "min_real_promos": 2,
    "diversity_by": ["promo_type","product_scope"],
    "max_per_type": 2,
    "cap_nopromo": 1,
    "nopromo_label": "NoPromo",
    "relevance_thresh": 0.30,
    "topk_types": 3,
    "min_non_nopromo": 2,
    "K_final": 5
}
with open(ARTI/"configs/guardrails_config.json","w",encoding="utf-8") as f:
    json.dump(guardrails, f, ensure_ascii=False, indent=2)

# === 3) เซฟไฟล์ mapping โปรสินค้า (ถ้ามี) ==================================
try:
    promotion_products.to_csv(ARTI/"data/promotion_products.csv", index=False)
except NameError:
    # ถ้าไม่ได้สร้าง promotion_products ในโน้ตบุ๊กเทรน ก็ข้ามได้
    pass

# === 4) บันทึกเวอร์ชันไลบรารี ===============================================
versions = {
    "timestamp": datetime.utcnow().isoformat()+"Z",
    "python": sys.version,
    "platform": platform.platform(),
    "sklearn": sklearn.__version__,
}
with open(ARTI/"configs/versions.json","w",encoding="utf-8") as f:
    json.dump(versions, f, ensure_ascii=False, indent=2)

print("✅ Saved artifacts to:", ARTI.resolve())


NameError: name 'pca_need' is not defined