# Retail Market Basket (Apriori / maxFP-growth) + RFM — Submission Notebook

**Đáp ứng yêu cầu đề:**

- Chọn dataset Kaggle, tự đặt `min_support`, `min_confidence`
- ID **lẻ** dùng **Apriori**; ID **chẵn** dùng **maxFP-growth** (sinh tập phổ biến **tối đại** → tạo luật từ tập tối đại)
- Sinh **association rules**, xuất **CSV**, ghi **params.json**
- EDA và RFM (tuỳ chọn, điểm cộng)
- Nhắc đính kèm **ảnh chứng chỉ Coursera** trong Drive nộp bài

### Dataset (Nguồn & giấy phép)

- Kaggle: **Retail Analysis on Large Dataset**
- Link: https://www.kaggle.com/datasets/sahilprajapati143/retail-analysis-large-dataset
- Trường sử dụng: `Transaction_ID`, `Customer_ID`, `Date`, `Time`, `Product_Type` (hoặc `products`), `Amount`, `Total_Purchases`, `Total_Amount`, `Order_Status` (nếu có), ...


## 1) Cấu hình chung (chọn thuật toán theo ID)


In [None]:
# !pip install pandas numpy mlxtend scikit-learn matplotlib

import os
import json
import warnings
from itertools import combinations
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

# Set STUDENT_ID to your own ID to pick the algorithm automatically
# (odd ID → apriori, even ID → fpmax), e.g. 202312345.
# When left as None, CONFIG["ALGO"] below is used unchanged.
STUDENT_ID = None

CONFIG = {
    "input_csv": r"E:\MSE\HomeWork\Ky 2\DAM501.8\Test & Project\Project\Project\new_retail_data.csv",
    "output_dir": "retail_outputs",

    # Column used as the basket item: "Product_Type" (coarse groups, rules
    # appear more easily) or "products" (fine-grained).
    "PRODUCT_COLUMN": "Product_Type",

    # Algorithm (used when STUDENT_ID is None): 'apriori' | 'fpmax' | 'fpgrowth'
    "ALGO": "fpmax",    # switch to "apriori" if your ID is odd

    # Thresholds (a later cell suggests values)
    "min_support": 0.01,
    "min_confidence": 0.5,

    # Maximum itemset length used by the Apriori code paths
    "max_len": 3,

    # RFM
    "rfm_k": 4,
    "rfm_scaler": "robust",
}


def algo_for_student_id(student_id, default):
    """Return the algorithm name implied by ``student_id``.

    Args:
        student_id: ``None``, or anything ``int()`` accepts.
        default: Algorithm name returned when no usable ID is given.

    Returns:
        ``"apriori"`` for an odd ID, ``"fpmax"`` for an even ID, and
        ``default`` when ``student_id`` is ``None`` or not convertible to int.
    """
    if student_id is None:
        return default
    try:
        is_odd = int(student_id) % 2 == 1
    except (TypeError, ValueError, OverflowError):
        # Keep the configured algorithm, but say so instead of failing silently.
        print(f"[WARN] STUDENT_ID={student_id!r} is not an integer "
              f"→ keeping ALGO={default!r}")
        return default
    return "apriori" if is_odd else "fpmax"


# Pick the algorithm from STUDENT_ID when one is provided
CONFIG["ALGO"] = algo_for_student_id(STUDENT_ID, CONFIG["ALGO"])

os.makedirs(CONFIG["output_dir"], exist_ok=True)
print("ALGO:", CONFIG["ALGO"], "| PRODUCT_COLUMN:", CONFIG["PRODUCT_COLUMN"])


ALGO: fpmax | PRODUCT_COLUMN: Product_Type


## 2) Import thư viện thuật toán


In [None]:
# mlxtend provides Apriori, FP-Growth and FPMax. When the import fails,
# MLXTEND_AVAILABLE is set to False and a warning is printed.
try:
    from mlxtend.frequent_patterns import apriori, fpgrowth, fpmax, association_rules
    MLXTEND_AVAILABLE = True
except Exception as e:
    MLXTEND_AVAILABLE = False
    print("[WARN] mlxtend not available → dùng fallback Apriori nhỏ:", e)

# scikit-learn is used for the (optional) RFM clustering. When the import
# fails, SKLEARN_AVAILABLE is set to False and a warning is printed.
try:
    from sklearn.preprocessing import RobustScaler, StandardScaler
    from sklearn.cluster import KMeans
    SKLEARN_AVAILABLE = True
except Exception as e:
    SKLEARN_AVAILABLE = False
    print("[WARN] scikit-learn not available → bỏ qua RFM clustering:", e)

# NOTE(review): unlike the two imports above, this one is not guarded.
import matplotlib.pyplot as plt


[WARN] mlxtend not available → dùng fallback Apriori nhỏ: No module named 'mlxtend'


## 3) Load & làm sạch (map đúng schema + Order_Status filter)


In [None]:
def load_and_basic_clean(cfg):
    """Load the retail CSV and normalise it to the schema used downstream.

    The result has the columns ``invoice_id``, ``product_name``,
    ``invoice_date``, ``quantity``, ``unit_price``, ``revenue`` and
    ``year_month`` (plus whatever other columns the file contains).

    Args:
        cfg: Mapping with ``"input_csv"`` (path of the CSV file) and
            ``"PRODUCT_COLUMN"`` (name of the column holding the items).

    Returns:
        ``(df, meta)``: the cleaned DataFrame and a dict of summary figures
        (row/invoice/customer/product counts, date range, product column).

    Raises:
        FileNotFoundError: if ``cfg["input_csv"]`` is not an existing file.
        ValueError: if ``Transaction_ID``, ``Date`` or the product column is
            missing from the file.
    """
    path = cfg["input_csv"]
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Không tìm thấy file: {path}")
    df = pd.read_csv(path, low_memory=False)

    required = ["Transaction_ID", cfg["PRODUCT_COLUMN"], "Date"]
    for r in required:
        if r not in df.columns:
            raise ValueError(
                f"Thiếu cột bắt buộc: {r}. Cột hiện có: {list(df.columns)}")

    # rename() ignores keys that are absent, so optional columns need no guard.
    df = df.rename(columns={
        "Transaction_ID": "invoice_id",
        cfg["PRODUCT_COLUMN"]: "product_name",
        "Date": "invoice_date",
        "Customer_ID": "customer_id",
        "Country": "country",
    })

    # Merge Date + Time when a Time column exists. The merge is attempted only
    # where BOTH parts are present: a bare time string must never be parsed on
    # its own (it could resolve to today's date), and a missing or unparsable
    # time must not discard an otherwise valid date.
    date_only = pd.to_datetime(df["invoice_date"], errors="coerce")
    df["invoice_date"] = date_only
    if "Time" in df.columns:
        has_both = date_only.notna() & df["Time"].notna()
        stamp = (date_only.dt.strftime("%Y-%m-%d") + " "
                 + df["Time"].astype(str).str.strip())
        combined = pd.to_datetime(stamp.where(has_both), errors="coerce")
        df["invoice_date"] = combined.fillna(date_only)

    # Normalise invoice_id to text ("123.0" → "123") while keeping missing
    # values missing, so the dropna() below can remove them.
    raw_ids = df["invoice_id"]
    df["invoice_id"] = (raw_ids.astype(str)
                        .str.replace(r"\.0$", "", regex=True)
                        .where(raw_ids.notna()))

    # Fallbacks for quantity and unit_price
    if "quantity" not in df.columns:
        df["quantity"] = 1
    if "unit_price" not in df.columns:
        if "Amount" in df.columns:
            df["unit_price"] = pd.to_numeric(df["Amount"], errors="coerce")
        elif {"Total_Amount", "Total_Purchases"}.issubset(df.columns):
            ta = pd.to_numeric(df["Total_Amount"], errors="coerce")
            tp = pd.to_numeric(df["Total_Purchases"],
                               errors="coerce").replace(0, np.nan)
            df["unit_price"] = ta / tp
        else:
            df["unit_price"] = 1.0
    df["quantity"] = pd.to_numeric(df["quantity"], errors="coerce")
    df["unit_price"] = pd.to_numeric(df["unit_price"], errors="coerce")

    # Clean product_name. Missing names are dropped here, before any string
    # conversion could turn them into a literal "nan" product.
    raw_names = df["product_name"]
    df["product_name"] = (raw_names.astype(str).str.strip()
                          .where(raw_names.notna()))
    df = df[df["product_name"].notna()].copy()
    if df["product_name"].str.contains(r"[;,|]", regex=True).any():
        # One row per item when a cell lists several items.
        df["product_name"] = df["product_name"].str.split(r"[;,|]")
        df = df.explode("product_name")
        df["product_name"] = df["product_name"].astype(str).str.strip()
    df = df[df["product_name"] != ""]

    # Drop rows missing a core field or with non-positive quantity/price
    df = df.dropna(subset=["invoice_id", "product_name", "invoice_date"])
    df = df[(df["quantity"].notna()) & (df["unit_price"].notna())]
    df = df[(df["quantity"] > 0) & (df["unit_price"] > 0)]

    # Keep only the accepted order statuses (when the column exists)
    if "Order_Status" in df.columns:
        before = len(df)
        keep = {"shipped", "delivered", "processing",
                "completed", "paid", "success"}
        df = df[df["Order_Status"].astype(str).str.lower().isin(keep)]
        print(f"Filter Order_Status: {before} → {len(df)}")

    # Work on an independent frame so the column assignments below do not
    # write into a slice of an earlier one.
    df = df.copy()

    # Revenue + year_month
    if "Amount" in df.columns:
        df["revenue"] = pd.to_numeric(df["Amount"], errors="coerce")
        if df["revenue"].isna().any() or df["revenue"].eq(0).all():
            df["revenue"] = df["quantity"] * df["unit_price"]
    else:
        df["revenue"] = df["quantity"] * df["unit_price"]
    df["year_month"] = df["invoice_date"].dt.to_period("M").astype(str)

    meta = {
        "rows": len(df),
        "invoices": df["invoice_id"].nunique(),
        "customers": df["customer_id"].nunique() if "customer_id" in df.columns else None,
        "products": df["product_name"].nunique(),
        "date_min": str(df["invoice_date"].min()),
        "date_max": str(df["invoice_date"].max()),
        "product_col": cfg["PRODUCT_COLUMN"]
    }
    return df, meta


# Run the loader on the configured CSV; `meta` holds the summary figures
# returned alongside the cleaned frame.
df, meta = load_and_basic_clean(CONFIG)
print(meta)
# Notebook preview of the first three cleaned rows.
df.head(3)


Filter Order_Status: 300946 → 251776
{'rows': 251776, 'invoices': 246349, 'customers': 84388, 'products': 33, 'date_min': '2023-03-01 00:02:39', 'date_max': '2024-02-29 23:58:43', 'product_col': 'Product_Type'}


Unnamed: 0,invoice_id,customer_id,Name,Email,Phone,Address,City,State,Zipcode,country,...,Feedback,Shipping_Method,Payment_Method,Order_Status,Ratings,products,quantity,unit_price,revenue,year_month
0,8691788,37249.0,Michelle Harrington,Ebony39@gmail.com,1414787000.0,3959 Amanda Burgs,Dortmund,Berlin,77985.0,Germany,...,Excellent,Same-Day,Debit Card,Shipped,5.0,Cycling shorts,1,108.028757,108.028757,2023-09
1,2174773,69749.0,Kelsey Hill,Mark36@gmail.com,6852900000.0,82072 Dawn Centers,Nottingham,England,99071.0,UK,...,Excellent,Standard,Credit Card,Processing,4.0,Lenovo Tab,1,403.353907,403.353907,2023-12
2,6679610,30192.0,Scott Jensen,Shane85@gmail.com,8362160000.0,4133 Young Canyon,Geelong,New South Wales,75929.0,Australia,...,Average,Same-Day,Credit Card,Processing,2.0,Sports equipment,1,354.4776,354.4776,2023-04


## 4) EDA nhanh + xuất CSV


In [None]:
# Quick EDA: compute a few headline aggregates, write them to CSV in the
# configured output directory and preview them inline.
out_dir = CONFIG["output_dir"]
os.makedirs(out_dir, exist_ok=True)

# Top 20 products by summed quantity.
top_products = (
    df.groupby("product_name")["quantity"].sum()
      .sort_values(ascending=False)
      .head(20)
      .reset_index()
)

# Summed revenue per year_month.
revenue_by_month = df.groupby("year_month")["revenue"].sum().reset_index()

# Top 20 customers by summed revenue; only when a customer column exists.
if "customer_id" in df.columns:
    top_customers = (
        df.groupby("customer_id")["revenue"].sum()
          .sort_values(ascending=False)
          .head(20)
          .reset_index()
    )
else:
    top_customers = None

# Write every table under its file name, in a fixed order.
eda_exports = [
    ("EDA_sample_rows.csv", df.head(200)),
    ("EDA_top_products.csv", top_products),
    ("EDA_revenue_by_month.csv", revenue_by_month),
]
if top_customers is not None:
    eda_exports.append(("EDA_top_customers.csv", top_customers))
for file_name, frame in eda_exports:
    frame.to_csv(os.path.join(out_dir, file_name), index=False)

print("[OK] EDA CSV saved.")
for preview in (top_products, revenue_by_month):
    display(preview.head())


[OK] EDA CSV saved.


Unnamed: 0,product_name,quantity
0,Water,20042
1,Smartphone,15205
2,Non-Fiction,14920
3,Fiction,14897
4,Juice,10130


Unnamed: 0,year_month,revenue
0,2023-03,5449685.0
1,2023-04,5285169.0
2,2023-05,5363719.0
3,2023-06,5266050.0
4,2023-07,5408173.0


## 5) Market Basket helpers (pivot, support, luật từ maximal)


In [None]:
def to_pivot_bool(df, item_col="product_name"):
    """Return a boolean invoice × item table.

    Rows are the distinct ``invoice_id`` values, columns the distinct values
    of ``item_col``; a cell is True when that invoice has at least one row
    for that item.
    """
    flagged = df.assign(val=1)
    presence = flagged.pivot_table(
        index="invoice_id",
        columns=item_col,
        values="val",
        aggfunc="max",
        fill_value=0,
    )
    return presence.astype(bool)


def support_of_itemset_bool(pivot_bool, items):
    """Return the share of rows of ``pivot_bool`` that contain every item.

    Returns 0.0 for an empty itemset or when any item is not a column of
    ``pivot_bool``.
    """
    wanted = list(items)
    if not wanted:
        return 0.0
    if any(name not in pivot_bool.columns for name in wanted):
        return 0.0
    contains_all = pivot_bool[wanted].all(axis=1)
    return contains_all.mean()


def gen_rules_from_itemsets(itemsets_df, pivot_bool, min_conf=0.5):
    """Build association rules A → B from the given itemsets.

    For every itemset with at least two items, each split into a non-empty
    antecedent A and the remaining consequent B is scored with
    ``confidence = support(itemset) / support(A)`` and
    ``lift = confidence / support(B)``. A rule is kept when its confidence is
    at least ``min_conf``.

    Args:
        itemsets_df: DataFrame with columns ``itemsets`` (iterables of items)
            and ``support``; may be None or empty.
        pivot_bool: Boolean invoice × item table used to look up the support
            of A and B.
        min_conf: Minimum confidence of a kept rule.

    Returns:
        DataFrame with columns antecedents, consequents, support, confidence,
        lift, sorted by confidence, lift, support (descending). The columns
        are present even when no rule is found.
    """
    # Fixed column list so callers never hit a KeyError on an empty result.
    cols = ["antecedents", "consequents", "support", "confidence", "lift"]
    if itemsets_df is None or itemsets_df.empty:
        return pd.DataFrame(columns=cols)

    # Every consequent is the antecedent of the mirrored split, and subsets
    # recur across itemsets, so each subset support is computed only once.
    support_cache = {}

    def cached_support(subset):
        key = frozenset(subset)
        if key not in support_cache:
            support_cache[key] = support_of_itemset_bool(pivot_bool, key)
        return support_cache[key]

    rows = []
    for itemset, sup_ab in zip(itemsets_df["itemsets"], itemsets_df["support"]):
        # Sorted members give the same enumeration order on every run,
        # independent of set iteration order.
        members = sorted(set(itemset))
        if len(members) < 2:
            continue
        whole = frozenset(members)
        for k in range(1, len(members)):
            for antecedent in combinations(members, k):
                sup_a = cached_support(antecedent)
                if sup_a == 0:
                    continue
                conf = sup_ab / sup_a
                if not conf >= min_conf:
                    # Below threshold: no need to look up the consequent.
                    continue
                consequent = whole.difference(antecedent)
                sup_b = cached_support(consequent)
                if sup_b > 0:
                    rows.append({
                        "antecedents": tuple(antecedent),
                        "consequents": tuple(sorted(consequent)),
                        "support": sup_ab,
                        "confidence": conf,
                        "lift": conf / sup_b,
                    })
    rules = pd.DataFrame(rows, columns=cols)
    if not rules.empty:
        rules = rules.sort_values(
            ["confidence", "lift", "support"], ascending=False)
    return rules


## 6) Run Market Basket theo thuật toán (Apriori / FPMax / FPGrowth) + lưu CSV


In [None]:
def _maximal_itemsets(freq):
    """Keep only the itemsets of ``freq`` that have no proper superset in it."""
    kept = []
    # Largest first: any superset of a candidate has already been examined,
    # and every non-maximal set is contained in some kept (maximal) set.
    for candidate in sorted(freq, key=len, reverse=True):
        if not any(candidate < bigger for bigger in kept):
            kept.append(candidate)
    return {itemset: freq[itemset] for itemset in kept}


def _fallback_frequent_itemsets(pivot_bool, min_support, max_len):
    """Level-wise Apriori on the boolean pivot; returns {frozenset: support}."""
    freq = {frozenset([item]): sup
            for item, sup in pivot_bool.mean().items()
            if sup >= min_support}
    level = set(freq)
    size = 2
    while size <= max_len and level:
        # Join step: unions of two sets from the previous level that are
        # exactly one item larger.
        candidates = {a | b for a, b in combinations(level, 2)
                      if len(a | b) == size}
        level_support = {}
        for cand in candidates:
            sup = support_of_itemset_bool(pivot_bool, cand)
            if sup >= min_support:
                level_support[cand] = sup
        freq.update(level_support)
        level = set(level_support)
        size += 1
    return freq


def run_market_basket(df, cfg):
    """Mine frequent itemsets and association rules, and save both as CSV.

    With mlxtend available, ``cfg["ALGO"]`` selects apriori, fpmax or
    fpgrowth. For fpmax the rules are derived from the maximal itemsets with
    ``gen_rules_from_itemsets``; the other two use mlxtend's
    ``association_rules``. Files are tagged with the algorithm name.

    Without mlxtend a built-in Apriori (itemsets up to ``cfg["max_len"]``
    items) is used and files are tagged ``fallback``. When ALGO is ``fpmax``
    the fallback keeps only itemsets with no frequent superset among the
    mined ones, so the output still consists of maximal itemsets (maximal
    relative to the ``max_len`` cap).

    Args:
        df: Cleaned transactions with ``invoice_id`` and ``product_name``.
        cfg: Mapping with ``output_dir``, ``ALGO``, ``min_support``,
            ``min_confidence`` and ``max_len``.

    Returns:
        ``(frequent_itemsets, rules)`` DataFrames.

    Raises:
        ValueError: if ``cfg["ALGO"]`` is not apriori, fpmax or fpgrowth.
    """
    out_dir = cfg["output_dir"]
    os.makedirs(out_dir, exist_ok=True)
    algo = cfg["ALGO"].lower()
    # Validate up front so a typo is reported whether or not mlxtend exists.
    if algo not in ("apriori", "fpmax", "fpgrowth"):
        raise ValueError("ALGO phải là: 'apriori' | 'fpmax' | 'fpgrowth'")
    ms, mc = cfg["min_support"], cfg["min_confidence"]
    pivot_bool = to_pivot_bool(df, "product_name")

    fi_cols = ["itemsets", "support"]
    rule_cols = ["antecedents", "consequents", "support", "confidence", "lift"]

    def _save(fi_df, rules_df, tag):
        fi_df = fi_df.reindex(columns=fi_cols)
        rules_df = rules_df.reindex(columns=rule_cols)
        fi_df.to_csv(os.path.join(
            out_dir, f"frequent_itemsets_{tag}.csv"), index=False)
        rules_df.to_csv(os.path.join(
            out_dir, f"association_rules_{tag}.csv"), index=False)
        print(
            f"[OK] Saved ({tag}) → #itemsets={len(fi_df)} | #rules={len(rules_df)}")

    if MLXTEND_AVAILABLE:
        if algo == "apriori":
            fi = apriori(pivot_bool, min_support=ms,
                         use_colnames=True, max_len=cfg["max_len"])
        elif algo == "fpmax":
            fi = fpmax(pivot_bool, min_support=ms,
                       use_colnames=True)  # maximal itemsets
        else:
            fi = fpgrowth(pivot_bool, min_support=ms, use_colnames=True)

        if fi.empty:
            # association_rules() rejects an empty itemset frame, so an
            # empty rule table is produced directly.
            rules = pd.DataFrame(columns=rule_cols)
        else:
            fi = fi.sort_values("support", ascending=False)
            if algo == "fpmax":
                rules = gen_rules_from_itemsets(fi, pivot_bool, min_conf=mc)
            else:
                rules = association_rules(
                    fi, metric="confidence", min_threshold=mc)
                if rules.empty:
                    rules = pd.DataFrame(columns=rule_cols)
        _save(fi if not fi.empty else pd.DataFrame(columns=fi_cols),
              rules, algo)
        return fi, rules

    # ---- Fallback Apriori (no mlxtend) ----
    print(f"[WARN] mlxtend not available → built-in Apriori fallback used "
          f"for ALGO='{algo}' (itemsets up to max_len={cfg['max_len']})")
    freq = _fallback_frequent_itemsets(pivot_bool, ms, cfg["max_len"])
    if algo == "fpmax":
        freq = _maximal_itemsets(freq)

    fi_df = pd.DataFrame(
        [{"itemsets": tuple(sorted(itemset)), "support": sup}
         for itemset, sup in freq.items()],
        columns=fi_cols)
    if not fi_df.empty:
        fi_df = fi_df.sort_values("support", ascending=False)

    # Same rule construction as the fpmax path, via the shared helper.
    rules_df = gen_rules_from_itemsets(fi_df, pivot_bool, min_conf=mc)

    _save(fi_df, rules_df, "fallback")
    return fi_df, rules_df


# Run the mining step with the notebook configuration (this also writes the
# itemset and rule CSV files), then preview the first ten rules.
fi, rules = run_market_basket(df, CONFIG)
display(rules.head(10))


[OK] Saved (fallback) → #itemsets=32 | #rules=0


Unnamed: 0,antecedents,consequents,support,confidence,lift


## 7) Gợi ý ngưỡng (min_support/min_confidence)


In [None]:
def suggest_thresholds(df, cfg, supports=(0.001, 0.002, 0.005, 0.01), confs=(0.4, 0.5, 0.6, 0.7)):
    """Count the rules obtained for each (min_support, min_confidence) pair.

    Args:
        df: Cleaned transactions with ``invoice_id`` and ``product_name``.
        cfg: Mapping with ``ALGO`` (and ``max_len`` for apriori). Any ALGO
            other than fpmax/apriori is mined with fpgrowth.
        supports: Candidate ``min_support`` values.
        confs: Candidate ``min_confidence`` values.

    Returns:
        DataFrame with columns min_support, min_confidence, n_rules, sorted
        by n_rules descending. An empty DataFrame without columns is returned
        when mlxtend is not available.
    """
    if not MLXTEND_AVAILABLE:
        print("[INFO] Không có mlxtend, bỏ qua suggest.")
        return pd.DataFrame()

    result_cols = ["min_support", "min_confidence", "n_rules"]
    supports = tuple(supports)
    confs = tuple(confs)
    if not supports or not confs:
        return pd.DataFrame(columns=result_cols)

    # Rules are generated once per support at the LOWEST requested
    # confidence, so that every requested threshold can be counted from
    # them (a fixed 0.5 floor would miscount thresholds below 0.5).
    floor_conf = min(confs)

    pivot_bool = to_pivot_bool(df, "product_name")
    algo = cfg["ALGO"].lower()
    out = []
    for ms in supports:
        if algo == "fpmax":
            fi = fpmax(pivot_bool, min_support=ms, use_colnames=True)
        elif algo == "apriori":
            fi = apriori(pivot_bool, min_support=ms,
                         use_colnames=True, max_len=cfg["max_len"])
        else:
            fi = fpgrowth(pivot_bool, min_support=ms, use_colnames=True)

        if fi.empty:
            # association_rules() rejects an empty itemset frame.
            confidences = pd.Series(dtype=float)
        elif algo == "fpmax":
            confidences = gen_rules_from_itemsets(
                fi, pivot_bool, min_conf=floor_conf)["confidence"]
        else:
            confidences = association_rules(
                fi, metric="confidence", min_threshold=floor_conf)["confidence"]

        for mc in confs:
            out.append({"min_support": ms, "min_confidence": mc,
                        "n_rules": int((confidences >= mc).sum())})
    return pd.DataFrame(out, columns=result_cols).sort_values(
        "n_rules", ascending=False)

# Ví dụ: suggest_thresholds(df, CONFIG, supports=(0.001,0.002,0.005), confs=(0.4,0.5,0.6))


## 8) RFM + KMeans (tuỳ chọn) + CSV


In [None]:
# 8) RFM + KMeans (đã FIX dấu ngoặc + lưu CSV)
import os


def compute_rfm(df):
    """Return one row per entity with last purchase, frequency, monetary and recency.

    The entity is ``customer_id`` when that column exists, otherwise
    ``invoice_id``; it is returned as ``entity_id``. Recency is the number of
    whole days between the entity's last invoice date and the day after the
    latest invoice date in ``df``.
    """
    if "customer_id" in df.columns:
        key = "customer_id"
    else:
        key = "invoice_id"

    # Reference point: one day past the most recent invoice.
    snapshot = df["invoice_date"].max() + pd.Timedelta(days=1)

    named_aggs = {
        "last": ("invoice_date", "max"),
        "frequency": ("invoice_id", "nunique"),
        "monetary": ("revenue", "sum"),
    }
    table = df.groupby(key).agg(**named_aggs).reset_index()
    table["recency"] = (snapshot - table["last"]).dt.days
    table = table.rename(columns={key: "entity_id"})
    return table


# RFM clustering: scale recency / frequency / log-monetary, cluster with
# KMeans, name the clusters by rank and write both tables to CSV.
if not SKLEARN_AVAILABLE:
    print("[INFO] Bỏ qua RFM clustering (thiếu scikit-learn).")
else:
    rfm = compute_rfm(df).dropna()
    rfm["monetary_log"] = np.log1p(rfm["monetary"])

    # Scaler is chosen by CONFIG["rfm_scaler"]; anything but "robust"
    # selects StandardScaler.
    if CONFIG["rfm_scaler"] == "robust":
        scaler = RobustScaler()
    else:
        scaler = StandardScaler()
    X = scaler.fit_transform(rfm[["recency", "frequency", "monetary_log"]])

    km = KMeans(n_clusters=CONFIG["rfm_k"], random_state=42, n_init=10)
    rfm["cluster"] = km.fit_predict(X)

    # Per-cluster size and mean R/F/M values.
    prof = rfm.groupby("cluster").agg(
        cnt=("entity_id", "count"),
        recency_mean=("recency", "mean"),
        frequency_mean=("frequency", "mean"),
        monetary_mean=("monetary", "mean")
    ).reset_index()

    # Rank clusters: highest mean monetary first, then highest mean
    # frequency, then lowest mean recency.
    prof_sorted = prof.sort_values(
        by=["monetary_mean", "frequency_mean", "recency_mean"],
        ascending=[False, False, True]
    )
    labels = ["VIP", "Potential", "Regular",
              "Churn-risk", "Occasional", "New", "Low-value"]
    # The i-th ranked cluster gets the i-th label; ranks beyond the label
    # list get a generic "Segment_<rank>" name.
    label_map = {}
    for i, cid in enumerate(prof_sorted["cluster"]):
        if i < len(labels):
            label_map[cid] = labels[i]
        else:
            label_map[cid] = f"Segment_{i}"

    prof_labeled = prof.assign(segment=prof["cluster"].map(label_map))
    rfm_labeled = rfm.assign(segment=rfm["cluster"].map(label_map))

    display(prof_labeled.head())

    out_dir = CONFIG["output_dir"]
    os.makedirs(out_dir, exist_ok=True)
    rfm_labeled.to_csv(os.path.join(out_dir, "rfm_clusters.csv"), index=False)
    prof_labeled.to_csv(os.path.join(
        out_dir, "rfm_cluster_profile.csv"), index=False)


Unnamed: 0,cluster,cnt,recency_mean,frequency_mean,monetary_mean,segment
0,0,33810,67.402751,2.508903,651.660594,Potential
1,1,16777,233.985158,1.846874,503.615019,Regular
2,2,25383,62.500492,4.871843,1284.027406,VIP
3,3,8418,155.666785,1.222975,127.870089,Churn-risk


## 9) Lưu tham số nộp bài (`params.json`)


In [None]:
# Record the dataset and mining parameters of this run as params.json in
# the output directory.
params = dict(
    dataset="Retail Analysis on Large Dataset (Kaggle)",
    product_field=CONFIG["PRODUCT_COLUMN"],
    algo=CONFIG["ALGO"],
    min_support=CONFIG["min_support"],
    min_confidence=CONFIG["min_confidence"],
)
params_path = os.path.join(CONFIG["output_dir"], "params.json")
with open(params_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(params, ensure_ascii=False, indent=2))
print("[OK] saved params.json →", CONFIG["output_dir"])


[OK] saved params.json → retail_outputs
