In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, LinearRegression, PoissonRegressor
from sklearn.metrics import pairwise_distances_argmin_min
from pathlib import Path

: 

In [None]:
# Root folder holding the input CSVs; point this at your own data folder if needed.
BASE = Path("Datasets/mockup_ver2/")
# Input files, both resolved relative to BASE.
TX_CANDIDATES = BASE.joinpath("transactions.csv")
USERS_PATH = BASE.joinpath("users.csv")

In [None]:
# One seed for the whole notebook: it seeds NumPy's legacy global RNG here and is
# passed as random_state to the scikit-learn estimators further down.
SEED = 42
np.random.seed(seed=SEED)

In [None]:
# Load the line-level transactions; timestamps are parsed as day/month/year hour:minute.
tx = pd.read_csv(TX_CANDIDATES)
tx["timestamp"] = pd.to_datetime(tx["timestamp"], format="%d/%m/%Y %H:%M")

# Derive calendar features and a per-unit price in one pass. `assign` evaluates the
# keyword arguments in order, so `is_weekend` can read the freshly added `dayofweek`.
tx = tx.assign(
    order_date=lambda d: d["timestamp"].dt.date,
    order_hour=lambda d: d["timestamp"].dt.hour,
    dayofweek=lambda d: d["timestamp"].dt.dayofweek,  # Mon=0..Sun=6
    is_weekend=lambda d: d["dayofweek"].isin([5, 6]).astype(int),
    month=lambda d: d["timestamp"].dt.month,
    # price / qty, left as NaN when qty is not positive.
    # NOTE(review): presumably `price` is the line total, making this a unit price - confirm.
    unit_price=lambda d: np.where(d["qty"] > 0, d["price"] / d["qty"], np.nan),
)

In [None]:
# Collapse line items into one row per (user, transaction), ordered by time within
# each user so that later per-user shifts see consecutive orders.
txn = (
    tx.groupby(["user_id", "transaction_id"], as_index=False)
      .agg(
          timestamp=("timestamp", "min"),
          qty=("qty", "sum"),
          price=("price", "sum"),
          unit_price=("unit_price", "mean"),
          order_hour=("order_hour", "min"),
          dayofweek=("dayofweek", "min"),
          is_weekend=("is_weekend", "max"),
          month=("month", "min"),
      )
      .sort_values(["user_id", "timestamp"])
)

In [None]:
# Label: did the same user order again within 30 days of this order?
# NOTE(review): a user's last order has no next timestamp, so it is always labelled 0,
# even when the data simply ends before 30 days have passed.
txn["next_ts"] = txn.groupby("user_id")["timestamp"].shift(-1)
txn["days_to_next"] = (txn["next_ts"] - txn["timestamp"]).dt.days
# A missing gap compares as False, so rows without a next order get 0.
txn["repeat_30d"] = txn["days_to_next"].le(30).astype(int)

# Design matrix: weekend flag, order value, and one-hot month / hour (first level dropped).
rep_inputs = txn[["is_weekend", "month", "order_hour", "price"]]
X_rep = pd.get_dummies(rep_inputs, columns=["month", "order_hour"], drop_first=True).fillna(0)
y_rep = txn["repeat_30d"]

if y_rep.nunique() != 1:
    # Both classes present: fit a logistic model and score every transaction in-sample.
    logit = LogisticRegression(max_iter=1000, random_state=SEED)
    logit.fit(X_rep, y_rep)
    txn["pred_repeat_prob"] = logit.predict_proba(X_rep)[:, 1]
else:
    # Guard for tiny data: with a single class there is nothing to fit,
    # so the observed mean is used as the score for every row.
    p_avg = float(y_rep.mean())
    txn["pred_repeat_prob"] = p_avg

# Loyalty score = mean predicted repeat probability over a user's transactions.
user_loyalty = (
    txn.groupby("user_id")["pred_repeat_prob"]
       .mean()
       .rename("loyalty_score")
)

In [None]:
# Basket-size model: predict items per transaction from weekend flag and one-hot month / hour.
bsk_inputs = txn[["is_weekend", "month", "order_hour"]]
X_bsk = pd.get_dummies(bsk_inputs, columns=["month", "order_hour"], drop_first=True).fillna(0)
y_bsk = txn["qty"].clip(lower=0)  # a Poisson target must be non-negative

if len(txn) >= 30 and y_bsk.nunique() > 1:
    poisson = PoissonRegressor(alpha=0.1, max_iter=500)
    poisson.fit(X_bsk, y_bsk)
    pred_basket = np.clip(poisson.predict(X_bsk), 0, None)
else:
    # Too little data or no variation: fall back to the overall mean basket size.
    pred_basket = np.full(len(txn), y_bsk.mean())

txn["pred_basket_qty"] = pred_basket
# Expected basket = mean predicted quantity over a user's transactions.
user_basket = (
    txn.groupby("user_id")["pred_basket_qty"]
       .mean()
       .rename("expected_basket_items")
)

In [None]:
def elasticity_per_user(g, min_prices=3, min_rows=5):
    """Estimate one user's price elasticity as the slope of log(qty) on log(unit_price).

    Parameters
    ----------
    g : pandas.DataFrame
        Rows for a single user; must contain ``qty`` and ``unit_price`` columns.
    min_prices : int, default 3
        Minimum number of distinct positive unit prices required to fit.
        Values below 2 make the slope meaningless, since a single price has no spread.
    min_rows : int, default 5
        Minimum number of usable rows required to fit.

    Returns
    -------
    float
        The fitted log-log slope, or ``np.nan`` when there is too little usable data.
    """
    # Logs need strictly positive, non-missing values on both axes.
    usable = g.dropna(subset=["qty", "unit_price"])
    usable = usable[(usable["qty"] > 0) & (usable["unit_price"] > 0)]

    if usable["unit_price"].nunique() < min_prices or len(usable) < min_rows:
        return np.nan

    log_price = np.log(usable[["unit_price"]].to_numpy())  # 2-D: single feature column
    log_qty = np.log(usable["qty"].to_numpy())
    model = LinearRegression().fit(log_price, log_qty)
    return float(model.coef_[0])

# Per-user elasticity. Only the two columns the estimator reads are passed in, which
# keeps the grouping column out of `apply` (passing it is deprecated in pandas 2.2+).
user_elast = (
    txn.groupby("user_id")[["qty", "unit_price"]]
       .apply(elasticity_per_user)
       .rename("price_elasticity")
)

def global_elast_fn(df, fallback=-1.8, min_prices=3, min_rows=20):
    """Estimate a pooled price elasticity as the slope of log(qty) on log(unit_price).

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with ``qty`` and ``unit_price`` columns.
    fallback : float, default -1.8
        Value returned when there is too little usable data to fit.
    min_prices : int, default 3
        Minimum number of distinct positive unit prices required to fit.
    min_rows : int, default 20
        Minimum number of usable rows required to fit.

    Returns
    -------
    float
        The fitted log-log slope, or ``fallback``.
    """
    # Logs need strictly positive, non-missing values on both axes.
    usable = df.dropna(subset=["qty", "unit_price"])
    usable = usable[(usable["qty"] > 0) & (usable["unit_price"] > 0)]

    if usable["unit_price"].nunique() < min_prices or len(usable) < min_rows:
        return fallback

    log_price = np.log(usable[["unit_price"]].to_numpy())  # 2-D: single feature column
    log_qty = np.log(usable["qty"].to_numpy())
    model = LinearRegression().fit(log_price, log_qty)
    return float(model.coef_[0])
# Users without enough data for their own fit inherit the pooled estimate.
global_elast = global_elast_fn(txn)
user_elast = user_elast.where(user_elast.notna(), global_elast)

In [None]:
# One row per user: join the three user-level metrics, then clamp each to its range.
users_features = (
    pd.concat([user_loyalty, user_basket, user_elast], axis=1)
      .reset_index()
      .assign(
          loyalty_score=lambda d: d["loyalty_score"].clip(0, 1),
          expected_basket_items=lambda d: d["expected_basket_items"].clip(0, 50),
          price_elasticity=lambda d: d["price_elasticity"].clip(-5, 1),
      )
)

In [None]:
# Clustering inputs: median-impute any gaps, then standardise each feature.
cluster_cols = ["loyalty_score", "expected_basket_items", "price_elasticity"]
Z = users_features[cluster_cols].pipe(lambda f: f.fillna(f.median()))
scaler = StandardScaler()
Zs = scaler.fit(Z).transform(Z)



In [None]:
# ==== Choose K on users_features (user-level features), not on the raw tx lines ====
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

FEATURE_COLS = ["loyalty_score", "expected_basket_items", "price_elasticity"]
K_RANGE = range(2, 11)
SEED = 42

# 1) users_features normally comes from the cells above.
#    To start from a saved file instead, load it here:
# users_features = pd.read_csv(BASE/"users_features_model.csv")

# 2) Take the features from users_features (not from tx).
X = users_features[FEATURE_COLS].copy()

# 3) Neutralise inf/NaN, then scale. The median is taken after inf -> NaN so that
#    infinities cannot leak into the imputed value.
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(X.median(numeric_only=True))

scaler_k = StandardScaler()
Xs = scaler_k.fit_transform(X)

# 4) Sweep K and collect cluster-validity metrics.
#    The three scores are only defined for 2 <= n_labels <= n_samples - 1, and KMeans
#    needs n_clusters <= n_samples, so infeasible K are skipped rather than crashing.
max_k = len(Xs) - 1
rows = []
for k in K_RANGE:
    if k < 2 or k > max_k:
        continue

    km = KMeans(n_clusters=k, n_init=20, random_state=SEED)
    labels = km.fit_predict(Xs)

    # Duplicate points can collapse clusters; the scores need at least two labels.
    if len(np.unique(labels)) < 2:
        continue

    rows.append({
        "K": k,
        "SSE": float(km.inertia_),
        "Silhouette": silhouette_score(Xs, labels),
        "CalinskiHarabasz": calinski_harabasz_score(Xs, labels),
        "DaviesBouldin": davies_bouldin_score(Xs, labels),
    })

if not rows:
    raise ValueError(
        f"Cannot evaluate any K in {list(K_RANGE)} with {len(Xs)} users: "
        "need at least 3 users and K <= n_users - 1."
    )

metrics = pd.DataFrame(rows)

# 5) Recommend K: the silhouette peak, and a weighted blend of the three ranks
#    (higher is better for Silhouette and Calinski-Harabasz, lower for Davies-Bouldin).
best_sil_k = int(metrics.loc[metrics["Silhouette"].idxmax(), "K"])
metrics["rank_sil"] = metrics["Silhouette"].rank(ascending=False, method="min")
metrics["rank_ch"]  = metrics["CalinskiHarabasz"].rank(ascending=False, method="min")
metrics["rank_db"]  = metrics["DaviesBouldin"].rank(ascending=True,  method="min")
metrics["rank_total"] = (metrics["rank_sil"]*0.5 + metrics["rank_ch"]*0.3 + metrics["rank_db"]*0.2)
# idxmin returns the first minimum, so ties resolve to the earliest K tried.
best_combo_k = int(metrics.loc[metrics["rank_total"].idxmin(), "K"])

print("=== Metrics by K ===")
print(metrics[["K","SSE","Silhouette","CalinskiHarabasz","DaviesBouldin","rank_total"]])
print("\nSuggestion:")
print(f"- By Silhouette peak => K = {best_sil_k}")
print(f"- By combined ranks  => K = {best_combo_k}")


In [None]:
# Final segmentation on the standardised matrix Zs.
k = 5  # adjustable
kmeans = KMeans(n_clusters=k, n_init=20, random_state=SEED)
users_features["segment"] = kmeans.fit_predict(Zs)

# Map the centroids back to original units and build one readable label per segment.
centers = pd.DataFrame(scaler.inverse_transform(kmeans.cluster_centers_), columns=cluster_cols)
desc = [
    f"S{seg}: loyalty~{c['loyalty_score']:.2f}, basket~{c['expected_basket_items']:.2f}, elast~{c['price_elasticity']:.2f}"
    for seg, c in enumerate(centers.to_dict("records"))
]
seg_map = dict(enumerate(desc))
users_features["segment_desc"] = users_features["segment"].map(seg_map)

In [None]:
# === Build & Save ONE FILE: user_id + metrics + segment ===
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

BASE = Path("Datasets/mockup_ver2/")
BASE.mkdir(parents=True, exist_ok=True)

# 1) Get users_features: use the in-memory variable when it exists,
#    otherwise load the first previously saved file that is found.
candidates = [
    BASE / "users_features_model_k5.csv",
    BASE / "users_features_model.csv",
    BASE / "users_features_with_segments.csv",  # in case it was already built
]
if 'users_features' in globals():
    df = users_features.copy()
else:
    df = None
    for p in candidates:
        if p.exists():
            df = pd.read_csv(p)
            break
    if df is None:
        raise FileNotFoundError(
            "ไม่พบตัวแปร users_features และไม่มีไฟล์ users_features_model*.csv ในโฟลเดอร์ "
            f"{BASE}. โปรดรันขั้นตอนคำนวณ users_features ก่อนครับ."
        )

# 2) Check that every core metric column is present.
REQUIRED_METRICS = {"user_id","loyalty_score","expected_basket_items","price_elasticity"}
missing = REQUIRED_METRICS - set(df.columns)
if missing:
    raise ValueError(f"ขาดคอลัมน์จำเป็นใน users_features: {missing}")

# 3) No segment column yet -> cluster here (k=5).
if "segment" not in df.columns:
    cluster_cols = ["loyalty_score","expected_basket_items","price_elasticity"]
    # Two steps on purpose: the median must be computed AFTER inf -> NaN. Computing it
    # on the raw frame could fill the gaps with an infinite median.
    Z = df[cluster_cols].replace([np.inf, -np.inf], np.nan)
    Z = Z.fillna(Z.median(numeric_only=True))

    scaler = StandardScaler().fit(Z)
    Zs = scaler.transform(Z)

    SEED = 42
    k = 5
    kmeans = KMeans(n_clusters=k, n_init=20, random_state=SEED).fit(Zs)
    df["segment"] = kmeans.labels_

    # Describe each cluster (segment_desc) from its centroid in original units.
    centers = pd.DataFrame(
        scaler.inverse_transform(kmeans.cluster_centers_),
        columns=cluster_cols
    ).round(3)
    centers.index.name = "segment"
    centers = centers.reset_index()

    desc = []
    for _, r in centers.iterrows():
        desc.append(
            f"S{int(r['segment'])}: loyalty~{r['loyalty_score']:.2f}, "
            f"basket~{r['expected_basket_items']:.2f}, "
            f"elast~{r['price_elasticity']:.2f}"
        )
    # centers is ordered by segment id, so its rows and desc line up one-to-one.
    seg_map = {int(seg): d for seg, d in zip(centers["segment"], desc)}
    df["segment_desc"] = df["segment"].map(seg_map)

# 4) Select the columns that make up the single output file.
cols = ["user_id","loyalty_score","expected_basket_items","price_elasticity","segment"]
if "segment_desc" in df.columns:
    cols.append("segment_desc")

out_df = df[cols].copy()
out_df["loyalty_score"] = out_df["loyalty_score"].round(3)
out_df["expected_basket_items"] = out_df["expected_basket_items"].round(2)
out_df["price_elasticity"] = out_df["price_elasticity"].round(2)

# 5) Save the single file.
OUT_PATH = BASE / "users_features_with_segments.csv"
out_df.to_csv(OUT_PATH, index=False)
print(f"Saved -> {OUT_PATH}")

# (Optional) preview the first 5 rows.
display(out_df.head())
