# In [None]:
import os
import numpy as np
import pandas as pd
import joblib

# -----------------------------
# CONFIG
# -----------------------------
# Cut-off used for the per-customer top-K selection and for every @K metric below.
K = 12
# Inputs: the joblib-serialised model and the three CSV tables read in the LOAD section.
MODEL_PATH = "data/hnm/models/logreg.joblib"   # change if needed
DATA_PATH  = "data/hnm/processed/transactions_sample.csv"
ITEMS_PATH = "data/hnm/processed/articles_features.csv"
USERS_PATH = "data/hnm/processed/customers_features.csv"

# Folder that receives the CSV exports; created up front (no error if it already exists).
OUT_DIR = "data/hnm/tableau"
os.makedirs(OUT_DIR, exist_ok=True)

# -----------------------------
# LOAD
# -----------------------------
# NOTE: joblib files are pickle-based, so only load model files from a trusted source.
model = joblib.load(MODEL_PATH)

# Identifier columns are read as strings so they are never parsed as numbers.
_tx_id_types = {"customer_id": str, "article_id": str}
tx = (
    pd.read_csv(DATA_PATH, dtype=_tx_id_types)
      # keep one row per (customer, article) pair
      .drop_duplicates(subset=["customer_id", "article_id"])
      # positives only: every remaining purchase is labelled 1
      .assign(label=1)
)
items = pd.read_csv(ITEMS_PATH, dtype={"article_id": str})
users = pd.read_csv(USERS_PATH, dtype={"customer_id": str})

# -----------------------------
# BUILD CANDIDATE SET
# (customer × sampled items)
# -----------------------------
# Each customer's pool holds every article they bought (label 1), padded with
# randomly drawn catalogue articles (label 0) up to CANDIDATE_POOL_SIZE.
CANDIDATE_POOL_SIZE = 200

rng = np.random.default_rng(42)
all_articles = items["article_id"].unique()
catalogue = set(all_articles)

rows = []
for cust, grp in tx.groupby("customer_id"):
    bought = set(grp["article_id"])
    candidates = set(bought)

    # The pool can never contain more distinct articles than the catalogue
    # plus this customer's own purchases. Capping the target stops the
    # rejection-sampling loop below from spinning forever when fewer than
    # CANDIDATE_POOL_SIZE distinct articles are reachable.
    reachable = len(catalogue) + len(bought - catalogue)
    target = min(CANDIDATE_POOL_SIZE, reachable)

    # Draw one article at a time; the set silently discards repeats.
    while len(candidates) < target:
        candidates.add(rng.choice(all_articles))

    rows.extend((cust, a, int(a in bought)) for a in candidates)

eval_df = pd.DataFrame(rows, columns=["customer_id", "article_id", "label"])

# join features
# Left joins keep every candidate row even when an article or a customer has
# no matching feature record.
eval_df = (
    eval_df
    .merge(items, how="left", on="article_id")
    .merge(users, how="left", on="customer_id")
)

# Target column on its own; the model input is every remaining column.
y = eval_df["label"]
X = eval_df.drop(columns=["label"])

# -----------------------------
# PREDICT
# -----------------------------
# Column 1 of predict_proba is used as the ranking score
# (presumably the positive class -- confirm against model.classes_).
class_probabilities = model.predict_proba(X)
eval_df["score"] = class_probabilities[:, 1]
del class_probabilities

# -----------------------------
# TOP-K PER CUSTOMER
# -----------------------------
# Order each customer's candidates by descending score, then keep the first
# K rows of every customer group.
score_order = eval_df.sort_values(
    by=["customer_id", "score"],
    ascending=[True, False],
)
topk = score_order.groupby("customer_id").head(K)
del score_order  # drop the full sorted copy once the top rows are taken

# -----------------------------
# METRIC FUNCTIONS
# -----------------------------
def apk(actual, predicted, k):
    """Return the average precision at *k* for one ranked recommendation list.

    Args:
        actual: Collection of relevant items; membership is tested with ``in``.
        predicted: Ranked sequence of recommended items, best first.
        k: Number of leading predictions to score.

    Returns:
        The sum of the precision values at every rank that holds a first-time
        hit, divided by ``min(len(actual), k)``. ``0.0`` when ``actual`` is
        empty or ``k`` is not positive.
    """
    # Without this guard k == 0 divides by zero, and a negative k (through
    # negative slicing and min()) yields a negative score.
    if not actual or k <= 0:
        return 0.0

    top = predicted[:k]
    hits = 0
    score = 0.0
    for i, p in enumerate(top):
        # A repeated recommendation only counts at its first position.
        if p in actual and p not in top[:i]:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k)

def ndcg(actual, predicted, k):
    """Return the binary-relevance NDCG at *k* for one ranked recommendation list.

    Args:
        actual: Collection of relevant items; membership is tested with ``in``.
        predicted: Ranked sequence of recommended items, best first.
        k: Number of leading predictions to score.

    Returns:
        DCG of the first ``k`` predictions divided by the ideal DCG for
        ``min(len(actual), k)`` hits, or ``0.0`` when the ideal DCG is zero
        (empty ``actual`` or non-positive ``k``).
    """
    top = predicted[:k]
    # Each relevant item is credited once, at its first position. Counting
    # repeats would let DCG exceed the ideal DCG (score above 1) and would
    # disagree with apk(), which also ignores repeated predictions.
    dcg = sum(
        1 / np.log2(i + 2)
        for i, p in enumerate(top)
        if p in actual and p not in top[:i]
    )
    # Ideal ranking: every relevant item (at most k of them) placed first.
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(actual), k)))
    return dcg / idcg if idcg > 0 else 0.0

# -----------------------------
# CUSTOMER-LEVEL METRICS
# -----------------------------
# Collect every customer's purchased articles in one grouped pass. Filtering
# eval_df with a boolean mask inside the loop would rescan the whole frame
# once per customer.
positives = eval_df.loc[eval_df["label"] == 1, ["customer_id", "article_id"]]
actual_by_customer = {
    cust: set(articles)
    for cust, articles in positives.groupby("customer_id")["article_id"]
}

records = []

for cust, grp in topk.groupby("customer_id"):
    # Relevant items for this customer; empty set if none are labelled 1.
    actual = actual_by_customer.get(cust, set())
    # topk rows are already ordered by descending score within a customer.
    predicted = grp["article_id"].tolist()

    hits = len(actual.intersection(predicted))
    precision = hits / K
    recall = hits / len(actual) if actual else 0.0
    mapk = apk(actual, predicted, K)
    ndcgk = ndcg(actual, predicted, K)

    records.append([cust, hits, precision, recall, mapk, ndcgk])

# NOTE: the column names hard-code "12"; they only describe the data while K == 12.
cust_metrics = pd.DataFrame(
    records,
    columns=[
        "customer_id",
        "hits_at_12",
        "precision_at_12",
        "recall_at_12",
        "map_at_12",
        "ndcg_at_12",
    ],
)

# -----------------------------
# OVERALL METRICS
# -----------------------------
# One-row summary: how many customers were scored, followed by the unweighted
# mean of each per-customer metric column.
_mean_columns = {
    "mean_precision_at_12": "precision_at_12",
    "mean_recall_at_12": "recall_at_12",
    "MAP@12": "map_at_12",
    "NDCG@12": "ndcg_at_12",
}
overall = pd.DataFrame([{
    "customers_evaluated": len(cust_metrics),
    **{
        summary_name: cust_metrics[metric_column].mean()
        for summary_name, metric_column in _mean_columns.items()
    },
}])

# -----------------------------
# SAVE FOR TABLEAU
# -----------------------------
cust_metrics.to_csv(f"{OUT_DIR}/customer_level_metrics.csv", index=False)
overall.to_csv(f"{OUT_DIR}/overall_metrics.csv", index=False)

# .copy() detaches the export frame from topk, so adding the rank column
# below does not trigger SettingWithCopyWarning and cannot touch topk.
topk_out = topk[["customer_id", "article_id", "score"]].copy()
# method="first" breaks score ties by row order, so every row gets a distinct
# rank within its customer.
topk_out["rank"] = topk_out.groupby("customer_id")["score"].rank(ascending=False, method="first")
topk_out.to_csv(f"{OUT_DIR}/topk_recommendations.csv", index=False)

# Status messages: the emoji were stored as mis-decoded UTF-8 bytes; restored here.
print("✅ Evaluation complete")
print("📊 CSVs written to:", OUT_DIR)
