In [3]:
# ==== INSTALLS (รันครั้งแรกครั้งเดียว) ====
# %pip install lightgbm scikit-learn pandas numpy joblib --quiet

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from lightgbm import LGBMClassifier
import joblib
import json
import os


# Location of the training CSV. The DH_CSV_PATH environment variable, when set,
# overrides the original author's local path below, so the notebook can run on
# another machine without editing this cell.
CSV = Path(
    os.environ.get(
        "DH_CSV_PATH",
        r"C:\Users\apiwi\OneDrive\Documents\GitHub\Double_CAI\EQ\dh_lightgbm_8feat.csv",
    )
)

# Raise explicitly instead of using `assert`, which is removed when Python runs
# with -O and would let a missing file surface later as a less clear error.
if not CSV.is_file():
    raise FileNotFoundError(f"ไม่พบไฟล์: {CSV}")

df = pd.read_csv(CSV)
print(f"✅ โหลดไฟล์สำเร็จ: {CSV}")

✅ โหลดไฟล์สำเร็จ: C:\Users\apiwi\OneDrive\Documents\GitHub\Double_CAI\EQ\dh_lightgbm_8feat.csv


In [4]:
# Model inputs: three categorical and five numeric features, plus the label column.
cat_cols = ["department", "commodity_desc", "sub_commodity_desc"]
num_cols = ["qty", "sales_value", "price_per_unit", "hour", "week_no"]
target = "used_coupon"  # binary 0/1

# Schema check: stop at the first required column that df does not contain.
first_missing = next(
    (name for name in cat_cols + num_cols + [target] if name not in df.columns),
    None,
)
if first_missing is not None:
    raise ValueError(f"คอลัมน์ขาด: {first_missing}")

# Feature frame (a copy, so later edits cannot touch df) and integer label vector.
X_raw = df.loc[:, cat_cols + num_cols].copy()
y = df[target].astype(int).to_numpy()

In [5]:
# Encode the categorical columns as ordinal integers for LightGBM. Categories the
# encoder did not see while fitting are mapped to -1.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)

# Choose the train/test rows BEFORE fitting the encoder, so the category
# vocabulary is learned from training rows only and nothing from the held-out
# rows leaks into preprocessing. Row positions are split with the same
# test_size / random_state / stratify settings the feature matrix was split with.
idx_tr, idx_te = train_test_split(
    np.arange(len(y)), test_size=0.2, random_state=42, stratify=y
)

X_cat_str = X_raw[cat_cols].astype(str)
enc.fit(X_cat_str.iloc[idx_tr])   # learn categories from the training rows only
X_cat = enc.transform(X_cat_str)  # categories present only in test rows become -1
X_num = X_raw[num_cols].to_numpy(dtype=float)

# Full feature matrix: encoded categoricals first, then the numeric columns.
X = np.concatenate([X_cat, X_num], axis=1)

# Feature names after encoding, in the column order of X (kept for reference)
feature_names = [f"enc__{c}" for c in cat_cols] + num_cols

# Split
X_tr, X_te = X[idx_tr], X[idx_te]
y_tr, y_te = y[idx_tr], y[idx_te]
X_tr.shape, X_te.shape


((2076585, 8), (519147, 8))

In [8]:
# Class-imbalance summary: number of negatives per positive in the training labels.
# NOTE: this ratio is only printed for reference. The classifier below balances
# the classes through class_weight="balanced"; scale_pos_weight is NOT passed to
# it, so the two weighting mechanisms are not stacked.
# np.sum replaces the builtin sum(), which walks the label array element by
# element in Python (and was evaluated twice).
n_pos = int(np.sum(y_tr))
n_neg = len(y_tr) - n_pos
if n_pos == 0:
    raise ValueError("y_tr contains no positive samples; cannot compute scale_pos_weight")
scale_pos_weight = n_neg / n_pos
print(f"scale_pos_weight: {scale_pos_weight:.2f}")

clf = LGBMClassifier(
    objective="binary",
    class_weight="balanced",
    n_estimators=600,
    learning_rate=0.05,
    num_leaves=63,
    random_state=42,
)

# NOTE(review): the eval_set is the held-out test split. No early-stopping
# callback is configured here, so it is only used to evaluate AUC during training.
clf.fit(
    X_tr, y_tr,
    eval_set=[(X_te, y_te)],
    eval_metric="auc"
)


scale_pos_weight: 70.27
[LightGBM] [Info] Number of positive: 29138, number of negative: 2047447
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.076077 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1602
[LightGBM] [Info] Number of data points in the train set: 2076585, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000


0,1,2
,boosting_type,'gbdt'
,num_leaves,63
,max_depth,-1
,learning_rate,0.05
,n_estimators,600
,subsample_for_bin,200000
,objective,'binary'
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [13]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report, roc_auc_score

# y_proba: second column of predict_proba for every test row, i.e. the model's
# probability for class 1 (assumes clf.classes_ is [0, 1] — TODO confirm)
y_proba = clf.predict_proba(X_te)[:, 1]

def find_best_threshold(y_true, y_score, grid=None, metric="f1"):
    """Scan candidate decision thresholds and return the one maximizing `metric`.

    For every threshold ``th`` in `grid`, a sample is predicted positive when
    ``y_score >= th``. Precision, recall and F1 of the positive class (label 1)
    are derived from a single pass over the confusion counts instead of three
    separate metric calls. A metric whose denominator is zero is reported as 0.0.

    Parameters
    ----------
    y_true : array-like
        True binary labels; entries equal to 1 are treated as positives.
    y_score : array-like
        Scores or probabilities, same shape as `y_true`.
    grid : iterable of float, optional
        Thresholds to evaluate. Defaults to 0.05, 0.10, ..., 0.95.
    metric : {"f1", "precision", "recall"}
        Criterion used to choose the best threshold.

    Returns
    -------
    best_th : float
        First threshold in `grid` order that attains the highest `metric`
        (0.5 if `grid` is empty).
    history : list of tuple
        One ``(threshold, precision, recall, f1)`` tuple per grid point.

    Raises
    ------
    ValueError
        If `metric` is not a supported name, or the inputs differ in shape.
    """
    # Position of each metric inside a history tuple; validated up front so a
    # typo fails immediately instead of inside the loop.
    metric_pos = {"precision": 1, "recall": 2, "f1": 3}
    if metric not in metric_pos:
        raise ValueError(
            f"metric must be one of {sorted(metric_pos)}, got {metric!r}"
        )

    # Default grid: 19 evenly spaced thresholds.
    if grid is None:
        grid = np.linspace(0.05, 0.95, 19)  # 0.05, 0.10, ..., 0.95

    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    if y_true.shape != y_score.shape:
        raise ValueError(
            f"y_true and y_score must have the same shape, "
            f"got {y_true.shape} and {y_score.shape}"
        )

    # Threshold-independent quantities are computed once, outside the loop.
    is_pos = y_true == 1
    n_pos = int(np.count_nonzero(is_pos))

    best_th, best = 0.5, -1.0
    history = []
    for th in grid:
        pred_pos = y_score >= th
        n_pred = int(np.count_nonzero(pred_pos))
        tp = int(np.count_nonzero(pred_pos & is_pos))

        p = tp / n_pred if n_pred else 0.0
        r = tp / n_pos if n_pos else 0.0
        # F1 = 2*TP / (2*TP + FP + FN); the denominator equals n_pred + n_pos.
        f1 = 2 * tp / (n_pred + n_pos) if (n_pred + n_pos) else 0.0

        row = (th, p, r, f1)
        history.append(row)
        # Strict '>' keeps the earliest threshold when several tie.
        if row[metric_pos[metric]] > best:
            best, best_th = row[metric_pos[metric]], th
    return best_th, history

# Pick the F1-optimal cut-off from the default threshold grid.
# NOTE(review): the threshold is selected on y_te and then evaluated on the same
# y_te below, so the reported metrics are optimistic; a separate validation
# split would give an unbiased estimate.
best_th, hist = find_best_threshold(y_true=y_te, y_score=y_proba, metric="f1")
print(f"✅ Best threshold (by F1): {best_th:.3f}")

# Metrics at that cut-off: binarize the probabilities, then print the heading
# and the per-class report, followed by the threshold-independent ROC-AUC.
y_pred_best = np.where(y_proba >= best_th, 1, 0)
print(
    "\n=== Report @ best threshold ===",
    classification_report(y_te, y_pred_best, digits=4),
    sep="\n",
)
print("ROC-AUC:", roc_auc_score(y_te, y_proba))




✅ Best threshold (by F1): 0.850

=== Report @ best threshold ===
              precision    recall  f1-score   support

           0     0.9914    0.9916    0.9915    511863
           1     0.4030    0.3972    0.4001      7284

    accuracy                         0.9833    519147
   macro avg     0.6972    0.6944    0.6958    519147
weighted avg     0.9832    0.9833    0.9832    519147

ROC-AUC: 0.9245228890985417


In [14]:
# Show the five grid points with the highest F1.
# Each history row is (threshold, precision, recall, f1); sorting on the negated
# F1 orders rows from best to worst while keeping grid order among ties.
ranked = sorted(hist, key=lambda row: -row[3])
top = ranked[:5]
print("\nTop-5 thresholds by F1 (th, precision, recall, f1)")
for th, p, r, f1 in top:
    print(f"{th:>5.2f}  P={p:0.3f}  R={r:0.3f}  F1={f1:0.3f}")


Top-5 thresholds by F1 (th, precision, recall, f1)
 0.85  P=0.403  R=0.397  F1=0.400
 0.90  P=0.545  R=0.290  F1=0.378
 0.80  P=0.303  R=0.484  F1=0.373
 0.75  P=0.225  R=0.559  F1=0.321
 0.95  P=0.672  R=0.198  F1=0.306


In [15]:
import json, os
from pathlib import Path

# Folder that receives the inference artifacts (created on demand, parents included).
ARTI = Path("./artifacts_new")
ARTI.mkdir(parents=True, exist_ok=True)

# Persist the selected decision threshold together with the criterion used to pick it.
cfg_path = ARTI / "inference_config.json"
cfg = {
    "best_threshold_for_used_coupon": float(best_th),
    "objective": "maximize_F1_class1",
}
cfg_text = json.dumps(cfg, ensure_ascii=False, indent=2)
cfg_path.write_text(cfg_text, encoding="utf-8")

print("Saved threshold to:", cfg_path.resolve())

Saved threshold to: C:\Users\apiwi\OneDrive\Documents\GitHub\Double_CAI\EQ\artifacts_new\inference_config.json


In [17]:
# Read the saved decision threshold back so it can be applied at prediction time.
infer_cfg = json.loads(
    Path("./artifacts_new/inference_config.json").read_text(encoding="utf-8")
)

TH = infer_cfg["best_threshold_for_used_coupon"]

# Suppose X_new holds the features of a new customer/basket:
# y_proba_new = clf.predict_proba(X_new)[:, 1]
# y_pred_new  = (y_proba_new >= TH).astype(int)


In [18]:
from sklearn.metrics import classification_report, roc_auc_score

# Binarize the test-set probabilities at the threshold found earlier.
y_pred_best = np.where(y_proba >= best_th, 1, 0)

# Heading and per-class report, then the threshold-independent ROC-AUC.
print(
    f"=== Final Evaluation @ Threshold = {best_th:.3f} ===",
    classification_report(y_te, y_pred_best, digits=4),
    sep="\n",
)
print("ROC-AUC:", roc_auc_score(y_te, y_proba))


=== Final Evaluation @ Threshold = 0.850 ===
              precision    recall  f1-score   support

           0     0.9914    0.9916    0.9915    511863
           1     0.4030    0.3972    0.4001      7284

    accuracy                         0.9833    519147
   macro avg     0.6972    0.6944    0.6958    519147
weighted avg     0.9832    0.9833    0.9832    519147

ROC-AUC: 0.9245228890985417


In [19]:
import numpy as np

def precision_recall_at_k(y_true, y_score, k=10):
    """Precision and recall among the `k` highest-scored samples.

    Parameters
    ----------
    y_true : array-like
        True binary labels (0/1).
    y_score : array-like
        Model scores or probabilities, same shape as `y_true`.
    k : int
        Number of top-ranked samples to consider; must be at least 1.

    Returns
    -------
    (precision, recall) : tuple of float
        precision = positives in the top-k / k. The divisor stays `k` even when
        fewer than `k` samples exist.
        recall = positives in the top-k / all positives in `y_true`, or 0.0
        when `y_true` contains no positives.

    Raises
    ------
    ValueError
        If `k` is smaller than 1 or the inputs differ in shape.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    labels = np.asarray(y_true)
    scores = np.asarray(y_score, dtype=float)
    if labels.shape != scores.shape:
        raise ValueError(
            f"y_true and y_score must have the same shape, "
            f"got {labels.shape} and {scores.shape}"
        )

    # Rank from highest to lowest score. Sorting the negated scores with a
    # stable sort keeps ties in their original order (deterministic result) and
    # leaves NaN scores at the bottom of the ranking rather than the top.
    order = np.argsort(-scores, kind="stable")

    # Number of true positives among the k best-ranked samples.
    hits = float(np.sum(labels[order[:k]]))

    # precision = correct picks / number of picks requested
    precision = hits / k

    # recall = correct picks / all positives in the dataset (0.0 if there are none)
    total_pos = float(np.sum(labels))
    recall = hits / total_pos if total_pos > 0 else 0.0

    return precision, recall

# Example: ranking quality for the single best-scored test row and for the ten
# best-scored test rows.
(p1, r1), (p10, r10) = (
    precision_recall_at_k(y_te, y_proba, k=top_k) for top_k in (1, 10)
)

print(
    f"Precision@1:  {p1:.4f}",
    f"Recall@1:     {r1:.4f}",
    f"Precision@10: {p10:.4f}",
    f"Recall@10:    {r10:.4f}",
    sep="\n",
)


Precision@1:  1.0000
Recall@1:     0.0001
Precision@10: 0.6000
Recall@10:    0.0008
