In [None]:
# Setup
import pandas as pd, numpy as np, re, math, random, warnings
from collections import defaultdict, Counter
warnings.filterwarnings("ignore")

# One seed shared by NumPy's legacy global RNG and the stdlib `random` module.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Input CSV files (relative paths).
PATH_GROC = "Groceries_dataset.csv"
PATH_HSR  = "au_hsr_cleaned.csv"

In [None]:
# Step 1: Load & Basic Clean
import re
import pandas as pd
import numpy as np

# Input files used by this step.
PATH_GROC = "Groceries_dataset.csv"
PATH_HSR  = "au_hsr_cleaned.csv"

# Groceries: read the transactions and trim whitespace around the header names.
g = pd.read_csv(PATH_GROC)
g.columns = [name.strip() for name in g.columns]

# Date: parse day-first; values that cannot be parsed are coerced to NaT and
# the corresponding rows are dropped.
g = (
    g.assign(Date=pd.to_datetime(g["Date"], dayfirst=True, errors="coerce"))
     .dropna(subset=["Date"])
)

def norm_text(s):
    """Normalise free text for matching.

    Lower-cases the input, replaces every character other than ``a-z``,
    ``0-9``, whitespace, ``-``, ``/`` and ``&`` with a space, collapses runs
    of whitespace and trims the ends.

    Missing values (``None``, NaN, ``pd.NA``) give ``""``. Non-string scalars
    (for example a numeric product code) are converted with ``str()`` instead
    of failing on ``.lower()``.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        # pd.isna() is only a plain boolean for scalars, so guard list-likes.
        if pd.api.types.is_scalar(s) and pd.isna(s):
            return ""
        s = str(s)
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s\-/&]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Keep the untouched description next to its normalised form.
g["item_raw"]  = g["itemDescription"]
g["item_norm"] = g["itemDescription"].map(norm_text)

# HSR: read the rating table and trim whitespace around the header names.
h = pd.read_csv(PATH_HSR)
h.columns = [name.strip() for name in h.columns]

# Ratings that are not numeric are coerced to NaN and those rows are removed.
h["hsr_value"] = pd.to_numeric(h["hsr_value"], errors="coerce")
h = h.dropna(subset=["hsr_value"])

# Add a "<column>_norm" companion for each descriptive column that is present.
for text_col in ("brands", "product_name", "category_primary", "categories"):
    if text_col not in h.columns:
        continue
    h[text_col + "_norm"] = h[text_col].map(norm_text)

def build_hsr_key(row):
    """Join the non-empty normalised text fields of ``row`` into one search key.

    Fields are taken in the order brands, product name, primary category,
    categories; a field is skipped when it is absent from ``row``, is not a
    string, or is empty.
    """
    fields = ("brands_norm", "product_name_norm", "category_primary_norm", "categories_norm")
    present = (row[name] for name in fields if name in row)
    return " ".join(v for v in present if isinstance(v, str) and v).strip()

# Build the key per row, then keep only rows whose key is non-empty.
h["hsr_key"] = h.apply(build_hsr_key, axis=1)
h = h.loc[h["hsr_key"].str.len() > 0].reset_index(drop=True)

# Quick sanity output.
print("[Groceries] rows:", len(g))
print("[HSR] rows:", len(h))
print(h[["hsr_value", "hsr_key"]].head(3))


In [None]:
# HSR and Non-food Classification

import os, re, math, gc, random, warnings
import numpy as np
import pandas as pd
import torch

from pathlib import Path
from collections import Counter

from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import precision_recall_curve, classification_report

import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")


# Reproducibility: seed NumPy's global RNG and the stdlib RNG.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Inputs.
PATH_GROC = "Groceries_dataset.csv"
PATH_HSR  = "au_hsr_cleaned.csv"

# Outputs.
SAVE_WITH_ML = "groceries_with_food_ml.csv"
SAVE_FINAL   = "groceries_with_hsr_ml.csv"

# Tunables.
TOPK_SBERT   = 5      # k passed to torch.topk when picking nearest HSR rows
SBERT_THR    = 0.45   # minimum similarity for a neighbour to be used
TFIDF_MIN_DF = 3      # min_df of the TfidfVectorizer
SVD_DIM      = 128    # requested TruncatedSVD dimensionality
RECALL_TGT   = 0.97   # recall floor used when choosing the food threshold

def norm_text(s: str) -> str:
    """Normalise free text for matching.

    Lower-cases the input, replaces every character other than ``a-z``,
    ``0-9``, whitespace, ``-``, ``&`` and ``/`` with a space, collapses runs
    of whitespace and trims the ends.

    Missing values (``None``, NaN, ``pd.NA``) give ``""``. Non-string scalars
    are converted with ``str()`` rather than raising on ``.lower()``.
    """
    if s is None:
        return ""
    if not isinstance(s, str):
        # pd.isna() is only a plain boolean for scalars, so guard list-likes.
        if pd.api.types.is_scalar(s) and pd.isna(s):
            return ""
        s = str(s)
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s\-\&/]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def first_like(cols, kw):
    """Return the first entry of ``cols`` whose lower-cased name contains ``kw``, else None."""
    return next((name for name in cols if kw in name.lower()), None)

def build_hsr_key(row: pd.Series, all_cols) -> str:
    """Concatenate the normalised descriptive fields of ``row`` into a search key.

    Columns are chosen from ``all_cols``: a column qualifies when its name ends
    with ``_norm`` and contains one of the keywords below. Values are emitted
    in keyword order, then column order, skipping non-string and empty values.

    Each column contributes at most once. Previously a column whose name
    matched several keywords (e.g. ``product_name_norm`` matches both
    "product" and "name") was appended once per matching keyword, repeating
    its text in the key.

    Parameters
    ----------
    row : mapping-like with ``.get`` (a DataFrame row or a dict).
    all_cols : iterable of column names to consider.
    """
    keywords = ("brand", "product", "subcategory", "category", "name", "description", "title")
    parts = []
    used = set()
    for key in keywords:
        for c in all_cols:
            if c in used or not isinstance(c, str):
                continue
            if key in c and c.endswith("_norm"):
                used.add(c)  # claimed by the first keyword that matches
                val = row.get(c, "")
                if isinstance(val, str) and val:
                    parts.append(val)
    return " ".join(parts).strip()



# Read and clean
g = pd.read_csv(PATH_GROC)
h = pd.read_csv(PATH_HSR)

g["item_norm"] = g["itemDescription"].map(norm_text)

# Locate the rating column: the first column whose name contains "hsr" and
# that holds at least one numeric value. The conversion is tried on a
# temporary Series and written back only for the chosen column, so other
# "hsr"-named columns (e.g. a textual one) are not overwritten with NaN.
hsr_col = None
for c in h.columns:
    if "hsr" not in c.lower():
        continue
    try:
        numeric = pd.to_numeric(h[c], errors="coerce")
    except (TypeError, ValueError):
        # Column cannot be interpreted numerically at all; try the next one.
        continue
    if numeric.notna().any():
        h[c] = numeric
        hsr_col = c
        break
assert hsr_col is not None, "HSR not found"

# Normalised companion ("<col>_norm") for every text column.
for c in h.select_dtypes("object").columns:
    h[c + "_norm"] = h[c].map(norm_text)

# Search key per row; rows with an empty key or a missing rating are dropped.
h["hsr_key"] = h.apply(lambda r: build_hsr_key(r, h.columns), axis=1)
h = h[h["hsr_key"].str.len() > 0].copy()
h = h.dropna(subset=[hsr_col])
h.reset_index(drop=True, inplace=True)



# SBERT
# Encode grocery items and HSR keys with a sentence-transformer, retrieve the
# TOPK_SBERT most similar HSR rows per item and assign a similarity-weighted
# HSR where at least one neighbour reaches SBERT_THR.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[info] using device: {device}")

sbert = SentenceTransformer("paraphrase-MiniLM-L6-v2", device=device)

# Embeddings are requested normalised and as torch tensors.
emb_g = sbert.encode(
    g["item_norm"].tolist(),
    batch_size=256,
    normalize_embeddings=True,
    convert_to_tensor=True,
    show_progress_bar=True
)
emb_h = sbert.encode(
    h["hsr_key"].tolist(),
    batch_size=256,
    normalize_embeddings=True,
    convert_to_tensor=True,
    show_progress_bar=True
)

# Process grocery rows in chunks of 2048 so the similarity matrix against all
# HSR rows is only materialised one chunk at a time.
# NOTE(review): torch.topk presumably needs at least TOPK_SBERT HSR rows - confirm for small inputs.
idxs, sims = [], []
with torch.inference_mode():
    for i in range(0, emb_g.size(0), 2048):
        q = emb_g[i:i+2048]
        cos = util.cos_sim(q, emb_h)
        svals, sidx = torch.topk(cos, k=TOPK_SBERT, dim=1)
        idxs.append(sidx.cpu().numpy())
        sims.append(svals.cpu().numpy())

# One row per grocery row: neighbour indices into `h` and their similarities.
idxs = np.vstack(idxs)
sims = np.vstack(sims)

HSR = h[hsr_col].to_numpy()

# Per grocery row: similarity-weighted mean HSR over the neighbours that pass
# the threshold; NaN (tagged "sbert<thr") when none does.
hsr_sbert, src_sbert = [], []
for r in range(idxs.shape[0]):
    top_idx, top_sim = idxs[r], sims[r]
    mask = top_sim >= SBERT_THR
    if mask.sum() == 0:
        hsr_sbert.append(np.nan)
        src_sbert.append("sbert<thr")
    else:
        w, v = top_sim[mask], HSR[top_idx[mask]]
        hsr_sbert.append(float(np.average(v, weights=w)))
        src_sbert.append("sbert")

g["hsr_sbert"] = hsr_sbert
g["assign_src"] = src_sbert



In [None]:
# TF-IDF Ridge
# Regress the HSR value on word uni/bi-grams of the HSR keys; used as a
# fallback for grocery rows that received no SBERT-based HSR.
tfidf_reg = make_pipeline(
    TfidfVectorizer(min_df=TFIDF_MIN_DF, ngram_range=(1, 2)),
    Ridge(alpha=1.0, random_state=RANDOM_SEED)
)
tfidf_reg.fit(h["hsr_key"], h[hsr_col])

# Rows without an SBERT value take the Ridge prediction and are re-tagged
# "tfidf"; all other rows keep the SBERT value. The result is clipped to [0, 5].
mask = g["hsr_sbert"].isna()
if mask.any():
    g.loc[mask, "hsr_tfidf"] = tfidf_reg.predict(g.loc[mask, "item_norm"])
    g.loc[mask, "assigned_hsr_ml"] = g.loc[mask, "hsr_tfidf"]
    g.loc[mask, "assign_src"] = "tfidf"
g.loc[~mask, "assigned_hsr_ml"] = g.loc[~mask, "hsr_sbert"]
g["assigned_hsr_ml"] = g["assigned_hsr_ml"].clip(0, 5)

print("Coverage:", float(g["assigned_hsr_ml"].notna().mean()))
print(g["assign_src"].value_counts(dropna=False))


# Discretised variant: rounded to the nearest 0.5 and clipped to [0, 5].
g["assigned_hsr_ml_disc"] = (np.round(g["assigned_hsr_ml"] * 2) / 2).clip(0, 5)


# item_norm -> continuous HSR (for duplicate item_norm values the last row wins).
item_hsr_cont = dict(zip(g["item_norm"], g["assigned_hsr_ml"]))

# item_norm -> discretised HSR.
item_hsr_disp = dict(zip(g["item_norm"], g["assigned_hsr_ml_disc"]))


# Best SBERT neighbour per row, plus that neighbour's category columns when a
# column whose name contains "category" / "subcategory" exists in `h`.
g["hsr_top1_idx"] = idxs[:, 0]
g["hsr_top1_sim"] = sims[:, 0]
cat_col  = first_like(h.columns, "category")
sub_col  = first_like(h.columns, "subcategory")
if cat_col:
    g["matched_category"] = h[cat_col].iloc[g["hsr_top1_idx"]].values
if sub_col:
    g["matched_subcategory"] = h[sub_col].iloc[g["hsr_top1_idx"]].values

# Reuse the vectoriser fitted inside the Ridge pipeline.
tfidf_vec = tfidf_reg.named_steps["tfidfvectorizer"]

# Dense SBERT block (row-normalised).
emb_sbert_all = emb_g.detach().cpu().numpy()
emb_sbert_all = normalize(emb_sbert_all)

# TF-IDF block reduced with TruncatedSVD. The component count keeps the
# original floor of 32 but is now also capped at n_features - 1, so a small
# vocabulary no longer requests more components than there are features.
tfidf_X   = tfidf_vec.transform(g["item_norm"])
n_terms   = tfidf_X.shape[1]
svd_k     = min(max(32, min(SVD_DIM, n_terms - 1)), n_terms - 1) if n_terms > 1 else 1
svd       = TruncatedSVD(n_components=svd_k, random_state=RANDOM_SEED)
emb_tfidf = svd.fit_transform(tfidf_X)
emb_tfidf = normalize(emb_tfidf)

# Classifier input: SBERT embedding followed by the reduced TF-IDF embedding.
X_all = np.hstack([emb_sbert_all, emb_tfidf]).astype(np.float32)

# Highest TF-IDF cosine similarity between each grocery row and any HSR key.
# The grocery matrix computed above is reused instead of transforming again.
Xg_n = normalize(tfidf_X)
Xh_n = normalize(tfidf_vec.transform(h["hsr_key"]))
S = Xg_n @ Xh_n.T
tfidf_max_sim = S.max(axis=1).toarray().ravel() if hasattr(S, "toarray") else np.asarray(S).max(axis=1).ravel()
g["tfidf_max_sim"] = tfidf_max_sim

# Which assignment path each row took.
sbert_mask = (g["assign_src"] == "sbert")
tfidf_mask = (g["assign_src"] == "tfidf")

def q(col: pd.Series, mask: pd.Series, p: float, default: float) -> float:
    """Quantile ``p`` of the finite values of ``col[mask]``; ``default`` if there are none."""
    selected = col[mask]
    finite = selected[np.isfinite(selected)]
    if finite.size > 0:
        return float(np.nanquantile(finite, p))
    return default

# Similarity cut-offs for picking seed rows: upper quartile / lowest decile of
# the similarity within each assignment path (literal defaults apply when a
# path has no finite values).
sbert_hi = q(g["hsr_top1_sim"], sbert_mask, 0.75, 0.60)
sbert_lo = q(g["hsr_top1_sim"], sbert_mask, 0.10, 0.30)
tfidf_hi = q(g["tfidf_max_sim"], tfidf_mask, 0.75, 0.20)
tfidf_lo = q(g["tfidf_max_sim"], tfidf_mask, 0.10, 0.08)

# Positive seeds: rows at or above the high cut-off; negative seeds: rows at or
# below the low cut-off (each evaluated within its own assignment path).
pos_seed = (sbert_mask & (g["hsr_top1_sim"] >= sbert_hi)) | (tfidf_mask & (g["tfidf_max_sim"] >= tfidf_hi))
neg_seed = (sbert_mask & (g["hsr_top1_sim"] <= sbert_lo)) | (tfidf_mask & (g["tfidf_max_sim"] <= tfidf_lo))

seed_mask = (pos_seed | neg_seed)
X_seed    = X_all[seed_mask.values]
y_seed    = pos_seed[seed_mask].astype(int).values

# If the seeds still contain only one class, relax the (negative) thresholds once.
if y_seed.min() == y_seed.max():
    sbert_lo2 = q(g["hsr_top1_sim"], sbert_mask, 0.20, 0.35)
    tfidf_lo2 = q(g["tfidf_max_sim"], tfidf_mask, 0.20, 0.10)
    neg_seed  = (sbert_mask & (g["hsr_top1_sim"] <= sbert_lo2)) | (tfidf_mask & (g["tfidf_max_sim"] <= tfidf_lo2))
    seed_mask = (pos_seed | neg_seed)
    X_seed    = X_all[seed_mask.values]
    y_seed    = pos_seed[seed_mask].astype(int).values

if y_seed.min() == y_seed.max():
    raise ValueError(
        "Seed generation still produced one class. "
    )

# Fit the food / non-food classifier on the seed rows only.
clf = LogisticRegression(max_iter=2000, class_weight="balanced")
clf.fit(X_seed, y_seed)

# NOTE(review): this report is computed on the same rows the model was fitted on.
print("\n[Food-ML] seed classification report")
print(classification_report(y_seed, clf.predict(X_seed), digits=4))

proba_seed = clf.predict_proba(X_seed)[:, 1]
prec, rec, thr = precision_recall_curve(y_seed, proba_seed)

# Drop the last precision/recall entry so both arrays line up with `thr`.
prec_aligned = prec[:-1]
rec_aligned  = rec[:-1]
thr_aligned  = thr

# Among thresholds whose recall is at least RECALL_TGT take the one with the
# best precision; otherwise fall back to the median seed probability.
mask = (rec_aligned >= RECALL_TGT)
if mask.any():
    best_idx = int(np.argmax(prec_aligned[mask]))
    th_best  = float(thr_aligned[mask][best_idx])
else:
    th_best = float(np.quantile(proba_seed, 0.5))

# Score every grocery row and binarise at the chosen threshold.
proba_all = clf.predict_proba(X_all)[:, 1]
g["is_food_score"] = proba_all
g["is_food_ml"]    = (proba_all >= th_best).astype(int)

print(f"[Food-ML] seeds: +{int(y_seed.sum())}/{len(y_seed)}  "
      f"thr={th_best:.3f}  "
      f"pred_food_ratio={g['is_food_ml'].mean():.3f}")



# Persist the frame; the second file is written after (re)parsing "Date".
g.to_csv(SAVE_WITH_ML, index=False)

if "Date" in g.columns:
    g["Date"] = pd.to_datetime(g["Date"], dayfirst=True, errors="coerce")
g.to_csv(SAVE_FINAL, index=False)
print(f"Saved -> {SAVE_FINAL} (含 is_food_ml & assigned_hsr_ml)")


In [None]:

def plot_food_score_hist(df: pd.DataFrame, thr: float, bins: int = 50):
    """Histogram of ``df["is_food_score"]`` with a dashed vertical line at ``thr``."""
    plt.figure(figsize=(8,4))
    plt.hist(df["is_food_score"], bins=bins, alpha=0.75)
    plt.axvline(thr, ls="--", label=f"thr={thr:.3f}")
    plt.legend()
    plt.title("Food probability distribution")
    plt.xlabel("is_food_score")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.show()

def show_top_examples(df: pd.DataFrame, label: str = "food", topn: int = 12):
    """Print the ``topn`` rows with the highest (``label == "food"``) or lowest food score.

    Only the descriptive columns that exist in ``df`` are shown and the
    rendered table is truncated to 2000 characters.
    """
    lowest_first = (label != "food")
    sub = df.sort_values("is_food_score", ascending=lowest_first).head(topn)
    wanted = ["itemDescription","item_norm","is_food_score","assign_src","assigned_hsr_ml"]
    cols = [name for name in wanted if name in sub.columns]
    print(f"\n[Top {topn}] {label} examples:")
    print(sub[cols].to_string(index=False)[:2000])

def seed_confusion(df: pd.DataFrame):
    """Print a confusion matrix and report of ``is_food_ml`` against ``assign_src``.

    A row counts as "true food" when ``assign_src`` is "sbert" or "tfidf".

    NOTE(review): the selection below keeps every row (a row is either in or
    not in that set), so this compares against the assignment source rather
    than the seed rows used to fit the classifier - confirm that is intended.
    """
    assigned = df["assign_src"].isin(["sbert","tfidf"])
    both = df[assigned | ~assigned].copy()
    if both.empty:
        print("No seeds found.")
        return
    from sklearn.metrics import confusion_matrix
    y_true = assigned.loc[both.index].astype(int)
    y_pred = both["is_food_ml"].astype(int)
    print("\n[Seed-based confusion]")
    print(confusion_matrix(y_true, y_pred))
    print(classification_report(y_true, y_pred, digits=4))

# Diagnostics for the food classifier: score histogram, extreme examples, confusion.
plot_food_score_hist(g, th_best, bins=60)
for which in ("food", "non"):
    show_top_examples(g, which, topn=10)
seed_confusion(g)

In [None]:
use = pd.read_csv("groceries_with_hsr_ml.csv", low_memory=False)

# Keep rows flagged as food that also carry an HSR value.
# The file written upstream stores the flag as `is_food_ml` (0/1). The previous
# expression looked up `is_food`, which that file does not contain, so the
# default `True` was compared with `True` and no row was ever filtered out.
# `is_food` is still honoured when present; with neither column all rows pass.
if "is_food_ml" in use.columns:
    food_mask = use["is_food_ml"] == 1
elif "is_food" in use.columns:
    food_mask = use["is_food"] == True
else:
    food_mask = pd.Series(True, index=use.index)
use = use[food_mask & use["assigned_hsr_ml"].notna()].copy()

if "Date" not in use.columns:
    raise ValueError("Date not found")

use["Date"] = pd.to_datetime(use["Date"], dayfirst=True, errors="coerce")
use = use.dropna(subset=["Date"])

use = use.sort_values(["Member_number", "Date"]).reset_index(drop=True)

# item_norm -> HSR, taken from the first row of each item.
item_hsr = (
    use.drop_duplicates("item_norm")
       .set_index("item_norm")["assigned_hsr_ml"]
       .to_dict()
)


In [None]:
# Temporal Split + Item2Vec (train-only)
from gensim.models import Word2Vec

use = use.sort_values(["Member_number","Date"])

# Temporal split at the 60% and 80% date quantiles: train <= t1 < val <= t2 < test.
t1, t2 = use["Date"].quantile(0.6), use["Date"].quantile(0.8)
train_df = use[use["Date"] <= t1].copy()
val_df   = use[(use["Date"] > t1) & (use["Date"] <= t2)].copy()
test_df  = use[use["Date"] > t2].copy()

# Item2Vec
# One "sentence" per (member, date) basket from the training period only.
baskets = (train_df.groupby(["Member_number","Date"])["item_norm"].apply(list).tolist())
freq = Counter([x for s in baskets for x in s])
# Drop items seen fewer than 5 times, then baskets left with fewer than 2 items.
sentences = [[w for w in s if freq[w] >= 5] for s in baskets]
sentences = [s for s in sentences if len(s) >= 2]

# workers=1: with several worker threads the training order depends on thread
# scheduling, so `seed` alone does not make the vectors repeatable.
w2v = Word2Vec(sentences=sentences, vector_size=64, window=5, sg=1,
               negative=10, min_count=5, seed=RANDOM_SEED, workers=1, epochs=10)

def item_vec(it):
    """Word2Vec vector of ``it``; a zero vector of the model's size if it is not in the vocabulary."""
    # Size is read from the model instead of repeating the literal 64.
    return w2v.wv[it] if it in w2v.wv else np.zeros(w2v.vector_size, dtype=np.float32)

# Training-period purchase history per member (items in row order, with repeats).
user_hist_train = train_df.groupby("Member_number")["item_norm"].apply(list).to_dict()
def user_vec(u):
    """Mean Word2Vec vector of the in-vocabulary items ``u`` bought in training; zeros if none."""
    items = user_hist_train.get(u, [])
    vecs = [item_vec(i) for i in items if i in w2v.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(w2v.vector_size, dtype=np.float32)

# Vector for every member in `use` (members without training history get zeros).
user_vecs = {u: user_vec(u) for u in use["Member_number"].unique()}


def build_true_pos_map(df):
    """Map each ``Member_number`` in ``df`` to the set of ``item_norm`` values it contains."""
    per_member = df.groupby("Member_number")["item_norm"]
    return {member: set(items.tolist()) for member, items in per_member}

# Ground-truth item sets per member for the validation and test periods.
true_pos_map_val, true_pos_map_test = (
    build_true_pos_map(phase_df) for phase_df in (val_df, test_df)
)


In [None]:
# Precompute reduced item vectors (PCA-16), 
# popularity percentile, 
# category-level HSR percentile, 
# and user–category recency
from sklearn.decomposition import PCA

# 16-dimensional PCA projection of the item embeddings; items missing from the
# Word2Vec vocabulary enter the PCA as zero vectors.
all_items = sorted(use["item_norm"].unique())
emb_dim = w2v.vector_size
E = np.vstack([w2v.wv[i] if i in w2v.wv else np.zeros(emb_dim) for i in all_items])
pca = PCA(n_components=16, random_state=RANDOM_SEED)
E16 = pca.fit_transform(E)
item_vec16 = {i: E16[idx].astype(np.float32) for idx, i in enumerate(all_items)}

# Popularity percentile of each item, from training purchase counts.
pop_counts = train_df["item_norm"].value_counts()
ranks = pop_counts.rank(method="average", pct=True)
pop_pct = ranks.to_dict()


# item -> category (first occurrence in `use`); everything is "misc" when the
# matched_category column is not available.
cat_col_name = "matched_category" if "matched_category" in use.columns else None
item_cat = (use.drop_duplicates("item_norm")
              .set_index("item_norm")[cat_col_name]
              .fillna("misc").astype(str).to_dict()) if cat_col_name else {i:"misc" for i in all_items}

# Position of each item's mean training HSR within its category, spread evenly
# over [0, 1] in ascending HSR order.
# NOTE(review): np.linspace(0, 1, 1) yields 0.0, so the only item of a
# single-item category gets 0.0 - confirm that is the desired value.
tmp = (train_df.groupby(["item_norm"])["assigned_hsr_ml"].mean().rename("hsr_mean").reset_index())
tmp["cat"] = tmp["item_norm"].map(item_cat)
cat_hsr_pct = {}
for c, dfc in tmp.groupby("cat"):
    dfc = dfc.sort_values("hsr_mean")
    dfc["pct"] = np.linspace(0, 1, len(dfc))
    for r in dfc.itertuples():
        cat_hsr_pct[r.item_norm] = float(r.pct)



# member -> {category -> most recent training purchase date in that category}.
last_date_uc = {}
for u, g_u in train_df.groupby("Member_number"):
    d = {}
    for c, g_c in g_u.assign(cat=g_u["item_norm"].map(item_cat)).groupby("cat"):
        d[c] = g_c["Date"].max()
    last_date_uc[u] = d

# Reference date for recency: the last date in the training period. Computed
# once here instead of scanning train_df["Date"] on every call.
_TRAIN_LAST_DATE = train_df["Date"].max()

def recency_score(u, i, tau_days: float = 30.0):
    """Exponential recency of user ``u``'s last training purchase in item ``i``'s category.

    Returns ``exp(-days / tau_days)``, where ``days`` is the gap between the
    end of the training period and the user's most recent purchase in the
    item's category. Returns 0.0 when the user has no purchase recorded for
    that category.

    Parameters
    ----------
    u : member identifier (key of ``last_date_uc``).
    i : item name (key of ``item_cat``; unknown items fall under "misc").
    tau_days : decay constant in days (default 30.0, the previous fixed value).
    """
    c = item_cat.get(i, "misc")
    t_last = last_date_uc.get(u, {}).get(c, None)
    if t_last is None:
        return 0.0
    delta = (_TRAIN_LAST_DATE - t_last).days
    return float(np.exp(-delta / tau_days))


In [None]:
# Negative sampling + feature augmentation + in-group label check
import numpy as np, lightgbm as lgb, random, math
from sklearn.model_selection import train_test_split

def cos_sim(a, b):
    """Cosine similarity of ``a`` and ``b``; 0.0 when either vector has zero norm."""
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a > 0 and norm_b > 0:
        return float(np.dot(a, b) / (norm_a * norm_b))
    return 0.0

# Sorted category vocabulary and its index, used for the one-hot feature block.
cats = sorted(pd.Series(list(item_cat.values())).unique())
cat2id = dict(zip(cats, range(len(cats))))

# category -> items belonging to it (in item_cat order).
cat_items = defaultdict(list)
for item_name, cat_name in item_cat.items():
    cat_items[cat_name].append(item_name)

def item_vec16_or_zero(i):
    """PCA-16 vector of item ``i``; a float32 zero vector when the item is unknown."""
    vec = item_vec16.get(i)
    return vec if vec is not None else np.zeros(16, dtype=np.float32)

# item -> mean training HSR scaled to [0, 1]; filled on first use so that
# build_features does not scan train_df for every (user, item) pair.
_ITEM_HSR01_CACHE = {}

def _train_item_hsr01(i):
    """Mean training HSR of item ``i`` divided by 5; 0.0 when the item is absent from train_df."""
    if not _ITEM_HSR01_CACHE:
        means = train_df.groupby("item_norm")["assigned_hsr_ml"].mean()
        _ITEM_HSR01_CACHE.update({k: float(v) / 5.0 for k, v in means.items()})
    return _ITEM_HSR01_CACHE.get(i, 0.0)

def build_features(u, i):
    """Feature vector for the (user ``u``, item ``i``) pair.

    Layout (float32):
      [0] cosine similarity between the user vector and the item vector
      [1] mean training HSR of the item / 5 (0.0 if the item is not in training)
      [2] popularity percentile of the item (0.0 if unknown)
      [3] HSR percentile of the item within its category (0.5 if unknown)
      [4] recency score of the user for the item's category
      [5:21] PCA-16 item vector
      [21:] one-hot of the item's category over ``cats``

    Values match the previous per-call computation (up to floating-point
    rounding of the mean); only the repeated DataFrame scans were removed.
    """
    v_i = item_vec(i)
    uv  = user_vecs.get(u, np.zeros_like(v_i))
    pref = cos_sim(uv, v_i)

    hsr = _train_item_hsr01(i)

    popn = float(pop_pct.get(i, 0.0))
    catpct = float(cat_hsr_pct.get(i, 0.5))
    rec = recency_score(u, i)

    vec16 = item_vec16_or_zero(i)
    onehot = np.zeros(len(cats), dtype=np.float32)
    c = item_cat.get(i, "misc")
    if c in cat2id:
        onehot[cat2id[c]] = 1.0

    base = np.array([pref, hsr, popn, catpct, rec], dtype=np.float32)
    return np.concatenate([base, vec16, onehot]).astype(np.float32)

# Negative Sampling
def sample_negatives(u, k_in=80, k_cross=20):
    """Sample unseen items for user ``u`` as negatives.

    In-category negatives: for every category the user bought from in
    training, up to ``k_in // n_seen_categories`` randomly chosen items of that
    category the user has not bought. Cross-category negatives: from the three
    largest categories that the user has not bought from, up to
    ``k_cross // max(1, 3 - n_seen_categories)`` unseen items each.

    Categories are visited in sorted order. Iterating the set directly made
    the sequence of ``random.shuffle`` calls depend on string hashing, so the
    sample could differ between interpreter runs even with a fixed seed.

    Returns a de-duplicated list (first occurrence kept).
    """
    seen = set(user_hist_train.get(u, []))
    seen_cats = sorted({item_cat.get(x, "misc") for x in seen}, key=str) or ["misc"]
    seen_cat_set = set(seen_cats)

    per_cat_in = k_in // max(1, len(seen_cats))
    cands = []
    for c in seen_cats:
        pool = [it for it in cat_items.get(c, []) if it not in seen]
        random.shuffle(pool)
        cands += pool[:per_cat_in]

    # NOTE(review): with three or more seen categories the divisor below is 1,
    # i.e. each cross category contributes up to k_cross items - confirm intent.
    per_cat_cross = k_cross // max(1, (3 - len(seen_cats)))
    top_cats = sorted(cats, key=lambda c: len(cat_items.get(c, [])), reverse=True)[:3]
    for c in top_cats:
        if c in seen_cat_set:
            continue
        pool = [it for it in cat_items.get(c, []) if it not in seen]
        random.shuffle(pool)
        cands += pool[:per_cat_cross]

    return list(dict.fromkeys(cands))


# Split members (not rows) into ranker-training and ranker-validation groups.
all_users = sorted(user_hist_train.keys())
tr_u, va_u = train_test_split(all_users, test_size=0.2, random_state=RANDOM_SEED, shuffle=True)

def build_user_samples(u):
    """Build one ranking group for user ``u``.

    Positives are the distinct items the user bought in training; negatives
    come from ``sample_negatives``. Returns ``(X, y, n)`` with one feature
    vector and one 0/1 label per candidate and the group size ``n``, or
    ``None`` when the user has no positives, no negatives, or the labels would
    all be equal.

    Positives are sorted so the row order inside a group does not depend on
    set iteration order, and membership is tested against a set rather than a
    list.
    """
    pos_set = set(user_hist_train.get(u, []))
    if not pos_set:
        return None
    pos = sorted(pos_set, key=str)
    neg = [i for i in sample_negatives(u) if i not in pos_set]
    if not neg:
        return None
    cand = list(dict.fromkeys(pos + neg))
    y = [1 if i in pos_set else 0 for i in cand]
    if (sum(y) == 0) or (sum(y) == len(y)):
        return None
    X = [build_features(u, i) for i in cand]
    return X, y, len(cand)

def _collect_groups(users):
    """Stack the per-user samples of ``users``: (features, labels, group sizes, skipped users)."""
    feats, labels, groups, skipped = [], [], [], 0
    for u in users:
        out = build_user_samples(u)
        if out is None:
            skipped += 1
            continue
        X_u, y_u, n_u = out
        feats.extend(X_u)
        labels.extend(y_u)
        groups.append(n_u)
    return feats, labels, groups, skipped

# Training users first, then validation users (same order as before, so the
# random negatives are drawn in the same sequence).
X_tr_list, y_tr_list, group_tr, drop_tr = _collect_groups(tr_u)
X_va_list, y_va_list, group_va, drop_va = _collect_groups(va_u)

print(f"[info] valid train groups: {len(group_tr)}, dropped: {drop_tr}")
print(f"[info] valid valid  groups: {len(group_va)}, dropped: {drop_va}")

X_tr = np.array(X_tr_list, dtype=np.float32)
y_tr = np.array(y_tr_list, dtype=np.int32)
X_va = np.array(X_va_list, dtype=np.float32)
y_va = np.array(y_va_list, dtype=np.int32)

# LightGBM datasets; `group` carries the number of candidate rows per user.
train_set = lgb.Dataset(X_tr, label=y_tr, group=group_tr)
valid_set = lgb.Dataset(X_va, label=y_va, group=group_va, reference=train_set)

# Class balance of the training labels.
# NOTE(review): `scale` is not used in the visible code below; the imbalance
# is handled through is_unbalance=True instead.
pos = (y_tr == 1).sum()
neg = (y_tr == 0).sum()
scale = float(neg) / max(1, pos)

# Pointwise binary objective evaluated with AUC and log-loss.
params = dict(
    objective="binary",
    metric=["auc", "binary_logloss"],
    learning_rate=0.05,
    num_leaves=127,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=1e-3,
    feature_fraction=0.9,
    lambda_l2=1.0,
    is_unbalance=True,
    max_bin=255,
    random_state=RANDOM_SEED,
)



# Up to 1200 rounds; stop when the validation metric has not improved for 80 rounds.
ranker = lgb.train(
    params,
    train_set,
    valid_sets=[valid_set],
    num_boost_round=1200,
    callbacks=[lgb.early_stopping(stopping_rounds=80), lgb.log_evaluation(period=50)]
)



# Safety net: retrain with a smaller min_data_in_leaf if the first model
# stopped after at most one iteration.
# NOTE(review): the warning text mentions lambdarank although the objective
# configured above is already "binary".
if ranker.current_iteration() <= 1:
    print("[warn] lambdarank degenerate; fallback to pointwise 'binary'")
    params_pw = dict(objective="binary", metric=["auc", "binary_logloss"],
                     learning_rate=0.05, num_leaves=127, min_data_in_leaf=20,
                     min_sum_hessian_in_leaf=1e-3, feature_fraction=0.9,
                     lambda_l2=1.0, random_state=RANDOM_SEED)
    ranker = lgb.train(
        params_pw, train_set, valid_sets=[valid_set],
        num_boost_round=1200,
        callbacks=[lgb.early_stopping(stopping_rounds=80), lgb.log_evaluation(period=50)]
    )


In [None]:
# Eval Candidates + Ranking Metrics
from sklearn.metrics import ndcg_score

# Items each member actually bought in the validation / test period.
user_val_pos  = val_df.groupby("Member_number")["item_norm"].apply(set).to_dict()
user_test_pos = test_df.groupby("Member_number")["item_norm"].apply(set).to_dict()

def candidate_with_positives(u, phase="val", k=50):
    """Candidate list for evaluating user ``u``: held-out positives plus sampled negatives.

    ``phase`` selects the validation ("val") or test (any other value) truth
    set; ``k`` is forwarded to ``sample_negatives`` as ``k_in``. Returns
    ``(candidates, positives)`` with duplicates removed from the candidates.

    Positives are sorted so the candidate order, and with it the tie-breaking
    of later rankings, does not depend on set iteration order.
    """
    truth = user_val_pos if phase == "val" else user_test_pos
    pos = sorted(truth.get(u, set()), key=str)
    neg = sample_negatives(u, k_in=k, k_cross=10)

    return list(dict.fromkeys(pos + neg)), pos


def build_eval_packs(df_phase, phase="val", k=50):
    """Score the evaluation candidates of every member appearing in ``df_phase``.

    Returns a list of ``(user, candidates, y_true, scores)`` tuples, where
    ``y_true`` marks the held-out positives and ``scores`` are the ranker's
    predictions at its best iteration. Members without candidates are skipped.
    """
    packs = []
    for member in df_phase["Member_number"].unique():
        cand, pos = candidate_with_positives(member, phase, k)
        if not cand:
            continue
        feature_rows = [build_features(member, item) for item in cand]
        scores = ranker.predict(np.vstack(feature_rows), num_iteration=ranker.best_iteration)
        ytrue = np.array([int(item in pos) for item in cand])
        packs.append((member, cand, ytrue, scores))
    return packs

val_packs  = build_eval_packs(val_df,  "val",  50)
test_packs = build_eval_packs(test_df, "test", 50)



def blend_scores(base_scores: np.ndarray, item_hsr_values: np.ndarray, alpha: float) -> np.ndarray:
    """Linear blend ``(1 - alpha) * base_scores + alpha * (item_hsr_values / 5)``.

    Scores are used as given (no rescaling) and NaNs propagate. A later cell
    defines ``blend_scores`` again with min-max scaling and NaN filling; once
    that cell has run, this definition is shadowed.
    """
    relevance = np.asarray(base_scores, dtype=float)
    health01 = np.asarray(item_hsr_values, dtype=float) / 5.0
    return (1.0 - alpha) * relevance + alpha * health01


def round_hsr_for_display(h: np.ndarray) -> np.ndarray:
    """Round HSR values to the nearest half star."""
    doubled = h * 2
    return np.round(doubled) * 0.5


def ranking_panel(packs, k=10):
    """Mean Precision@k and NDCG@k over the packs that have at least two candidates."""
    precisions, ndcgs = [], []
    for _, items, ytrue, scores in packs:
        if len(items) >= 2:
            top = np.argsort(-scores)[:k]
            precisions.append(ytrue[top].mean())
            ndcgs.append(ndcg_score([ytrue], [scores], k=k))
    return {
        "Precision_k": float(np.mean(precisions)) if precisions else 0.0,
        "NDCG_k": float(np.mean(ndcgs)) if ndcgs else 0.0,
    }

print("RANKING-VAL@10 :", ranking_panel(val_packs, 10))
print("RANKING-TEST@10:", ranking_panel(test_packs, 10))


In [None]:
from collections import defaultdict
import numpy as np

def _get_item_category(it: str, use_df: pd.DataFrame) -> str | None:
    sel = use_df.loc[use_df["item_norm"] == it, ["matched_category", "refined_category"]] \
                if "refined_category" in use_df.columns \
                else use_df.loc[use_df["item_norm"] == it, ["matched_category"]]
    if sel.empty:
        return None
    cat = sel["matched_category"].iloc[0] if "matched_category" in sel.columns else None
    if (cat is None) or (pd.isna(cat)) or (str(cat).strip() == ""):
        if "refined_category" in sel.columns:
            cat = sel["refined_category"].iloc[0]
    if (cat is None) or (pd.isna(cat)) or (str(cat).strip() == ""):
        return None
    return str(cat)


def _build_true_by_cat_for_user(u, true_pos_map: dict, use_df: pd.DataFrame, item_hsr_dict: dict):
    """Group the HSR values of user ``u``'s true items by category.

    Returns ``(by_cat, true_set)``: ``by_cat`` maps a category to the list of
    HSR values of the user's true items in it (items without a category or
    with a NaN HSR are left out); ``true_set`` is the user's entry in
    ``true_pos_map`` (an empty set if absent).
    """
    true_set = true_pos_map.get(u, set())
    by_cat = defaultdict(list)
    for item in true_set:
        category = _get_item_category(item, use_df)
        hsr = item_hsr_dict.get(item, np.nan)
        if category is None:
            continue
        if hsr != hsr:  # NaN
            continue
        by_cat[category].append(float(hsr))
    return by_cat, true_set


In [None]:
# Health: HealthGain@k + LowHSR@k + Coverage

def health_panel(packs, true_pos_map: dict, item_hsr_dict: dict, use_df: pd.DataFrame, k=10):
    """Health metrics of the top-``k`` recommendations in ``packs``.

    For each recommended item the baseline is the mean HSR of the user's true
    items in the same category, else the mean over all of the user's true
    items, else the global mean of ``item_hsr_dict``.

    Returns a dict with
      HealthGain_k            mean over users of mean((item HSR - baseline) / 5)
      LowHSR_k                mean over users of the share of top-k items with HSR <= 2.5
                              (users without true items contribute 0.0)
      HealthEvaluableCoverage percentage of packs for which a gain could be computed
    """
    known_hsr = [v for v in item_hsr_dict.values() if v == v]
    global_mean = float(np.mean(known_hsr)) if known_hsr else np.nan

    gain_per_user, low_per_user = [], []
    n_covered = 0

    for u, items, ytrue, scores in packs:
        top_items = [items[t] for t in np.argsort(-scores)[:k]]

        true_by_cat, true_set = _build_true_by_cat_for_user(u, true_pos_map, use_df, item_hsr_dict)
        if len(true_set) == 0:
            low_per_user.append(0.0)
            continue

        # Fallback baseline over all of this user's true items.
        user_vals = [v for vals in true_by_cat.values() for v in vals]
        user_base = np.nanmean(user_vals) if len(user_vals) > 0 else np.nan

        gains, lows = [], []
        for it in top_items:
            rec_hsr = item_hsr_dict.get(it, np.nan)
            c = _get_item_category(it, use_df)

            same_cat = true_by_cat.get(c)
            base = np.nanmean(same_cat) if same_cat else user_base
            if base != base:  # NaN -> global fallback
                base = global_mean

            rec_known = (rec_hsr == rec_hsr)
            if rec_known and (base == base):
                gains.append((rec_hsr - base) / 5.0)
            lows.append(1 if (rec_known and rec_hsr <= 2.5) else 0)

        if gains:
            gain_per_user.append(float(np.mean(gains)))
            n_covered += 1
        low_per_user.append(float(np.mean(lows)) if lows else 0.0)

    total = len(packs)
    return {
        "HealthGain_k": float(np.mean(gain_per_user)) if len(gain_per_user) > 0 else 0.0,
        "LowHSR_k": float(np.mean(low_per_user)) if len(low_per_user) > 0 else 0.0,
        "HealthEvaluableCoverage": float(100.0 * n_covered / total) if total > 0 else 0.0,
    }


In [None]:
def blend_scores(base_scores, item_hsr_values, alpha):
    """Blend min-max scaled relevance scores with HSR scaled to [0, 1].

    ``base_scores`` are rescaled to [0, 1] (all zeros when they are constant).
    NaN HSR values are replaced by the mean of the available ones, or by 0.0
    when none is available. Result: ``(1 - alpha) * scores01 + alpha * hsr / 5``.
    """
    import numpy as np
    z = np.asarray(base_scores, dtype=np.float32)
    lo, hi = float(np.min(z)), float(np.max(z))
    if hi > lo:
        z = (z - lo) / (hi - lo)
    else:
        z = np.zeros_like(z)

    h = np.asarray(item_hsr_values, dtype=np.float32)
    missing = np.isnan(h)
    if missing.any():
        fill = float(np.nanmean(h))
        if fill != fill:  # every value was NaN
            fill = 0.0
        h = np.where(missing, fill, h)

    return (1.0 - alpha) * z + alpha * (h / 5.0)



def alpha_rerank_panels(
    packs,
    true_pos_map: dict,
    item_hsr_dict: dict,
    use_df: pd.DataFrame,
    alpha: float = 0.6,
    k: int = 10,
    item_hsr_display_dict: dict | None = None
):
    """Re-rank each pack with an HSR blend and report ranking and health metrics.

    Parameters
    ----------
    packs : iterable of ``(user, items, ytrue, scores)`` tuples.
    true_pos_map : user -> set of true items.
    item_hsr_dict : item -> HSR used for blending.
    use_df : frame passed to the category lookups.
    alpha : weight handed to ``blend_scores`` for the HSR part.
    k : cut-off for all metrics.
    item_hsr_display_dict : optional item -> HSR used for the health metrics
        and baselines; defaults to ``item_hsr_dict``.

    Returns
    -------
    dict with "RANKING" (Precision_k, NDCG_k over packs with at least two
    items) and "HEALTH" (HealthGain_k, LowHSR_k, HealthEvaluableCoverage).
    """
    import numpy as np
    from sklearn.metrics import ndcg_score

    P, N = [], []
    Gain, Low, covered = [], [], 0
    total = len(packs)

    # Global mean HSR; used below only to fill missing blending values.
    all_vals = [v for v in item_hsr_dict.values() if v == v]
    global_mean = float(np.mean(all_vals)) if all_vals else np.nan

    disp_dict = item_hsr_display_dict if item_hsr_display_dict is not None else item_hsr_dict

    for u, items, ytrue, scores in packs:
        true_by_cat, true_set = _build_true_by_cat_for_user(u, true_pos_map, use_df, disp_dict)

        # HSR per candidate for blending: missing values take the mean of the
        # candidates, then the global mean, then 0.0.
        h_raw = np.array([item_hsr_dict.get(it, np.nan) for it in items], dtype=np.float32)
        if np.isnan(h_raw).any():
            cm = float(np.nanmean(h_raw))
            h_raw = np.where(np.isnan(h_raw), cm if cm == cm else global_mean, h_raw)
        h_raw = np.nan_to_num(h_raw, nan=(global_mean if global_mean == global_mean else 0.0))
        hybrid = blend_scores(scores, h_raw, alpha)

        # Ranking metrics on the blended order.
        if len(items) >= 2:
            idx = np.argsort(-hybrid)[:k]
            P.append(float(np.mean(np.take(ytrue, idx))))
            N.append(float(ndcg_score([ytrue], [hybrid], k=k)))

        # Health metrics need at least one true item for the user.
        if len(true_set) > 0:
            idx = np.argsort(-hybrid)[:k]
            top_items = [items[t] for t in idx]
            gains, lows = [], []
            for it in top_items:
                rec = disp_dict.get(it, np.nan)
                c = _get_item_category(it, use_df)

                # Baseline: same-category true items, else all of the user's
                # true items (no global-mean fallback here).
                if (c in true_by_cat) and (len(true_by_cat[c]) > 0):
                    base = float(np.nanmean(true_by_cat[c]))
                else:
                    all_u = [v for vv in true_by_cat.values() for v in vv]
                    base = float(np.nanmean(all_u)) if len(all_u) > 0 else np.nan

                # `x == x` is False only for NaN.
                if (rec == rec) and (base == base):
                    gains.append((rec - base) / 5.0)
                lows.append(1 if (rec == rec and rec <= 2.5) else 0)

            if gains:
                Gain.append(float(np.mean(gains))); covered += 1
            Low.append(float(np.mean(lows)) if lows else 0.0)
        else:
            # Users without true items count as 0.0 in LowHSR_k.
            Low.append(0.0)

    return dict(
        RANKING = {
            "Precision_k": float(np.mean(P)) if P else 0.0,
            "NDCG_k"     : float(np.mean(N)) if N else 0.0
        },
        HEALTH  = {
            "HealthGain_k"           : float(np.mean(Gain)) if Gain else 0.0,
            "LowHSR_k"               : float(np.mean(Low))  if Low  else 0.0,
            "HealthEvaluableCoverage": float(100.0 * covered / total) if total > 0 else 0.0
        }
    )


In [None]:
# Health metrics of the un-blended ranking on validation and test.
for tag, phase_packs, phase_truth in (
    ("HEALTH-VAL@10 :", val_packs, user_val_pos),
    ("HEALTH-TEST@10:", test_packs, user_test_pos),
):
    print(tag, health_panel(phase_packs, phase_truth, item_hsr, use, k=10))



In [None]:
# Coarse alpha sweep: all validation results first, then all test results.
for phase_label, phase_packs, phase_truth in (
    ("VAL :", val_packs, user_val_pos),
    ("TEST:", test_packs, user_test_pos),
):
    for a in [0.2, 0.4, 0.6, 0.8]:
        print(f"alpha={a}  {phase_label}",
              alpha_rerank_panels(phase_packs, phase_truth, item_hsr, use, alpha=a, k=10))

In [None]:
# α grid search + Bootstrap CI 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ndcg@k
def _ndcg_at_k(y_true, scores, k=10):
    if len(y_true) == 0: 
        return 0.0
    idx = np.argsort(-np.asarray(scores))[:k]
    gains = (2**np.asarray(y_true)[idx] - 1)
    discounts = 1.0 / np.log2(np.arange(2, len(idx) + 2))
    dcg = np.sum(gains * discounts)

    # ideal dcg
    ideal_idx = np.argsort(-np.asarray(y_true))[:k]
    ideal_gains = (2**np.asarray(y_true)[ideal_idx] - 1)
    ideal_dcg = np.sum(ideal_gains * discounts)
    return float(dcg / ideal_dcg) if ideal_dcg > 0 else 0.0


def _eval_on_packs(packs, item_hsr, alpha=0.4, k=10):
    """Fast evaluation of an alpha blend over ``packs``.

    For each pack the base scores are blended, without rescaling, with the
    candidates' HSR / 5 (missing HSR values take the candidates' mean). Packs
    with no candidates or with no known HSR are skipped.

    Returns a dict with the mean Precision_k, NDCG_k, HealthGain_k (mean HSR of
    the top-k minus the mean HSR of the candidates, in HSR units), LowHSR_k
    (share of top-k items with HSR <= 2.5) and the number of evaluated packs
    as Users.
    """
    precisions, ndcgs, health_gains, low_shares = [], [], [], []

    for (_, cand, ytrue, base_scores) in packs:
        if len(cand) == 0:
            continue

        hsr = np.array([item_hsr.get(it, np.nan) for it in cand], dtype=float)
        missing = np.isnan(hsr)
        if missing.all():
            continue

        # Fill unknown HSR values with the mean of the known candidates.
        cand_mean = np.nanmean(hsr)
        hsr = np.where(missing, cand_mean, hsr)

        relevance = np.asarray(base_scores, dtype=float)
        hybrid = (1 - alpha) * relevance + alpha * (hsr / 5.0)

        top = np.argsort(-hybrid)[:k]
        precisions.append(float(np.mean(np.take(ytrue, top))))
        ndcgs.append(_ndcg_at_k(ytrue, hybrid, k=k))

        top_hsr = hsr[top]
        health_gains.append(float(np.mean(top_hsr) - cand_mean))
        low_shares.append(float(np.mean(top_hsr <= 2.5)))

    return {
        "Precision_k": float(np.mean(precisions)) if precisions else 0.0,
        "NDCG_k": float(np.mean(ndcgs)) if ndcgs else 0.0,
        "HealthGain_k": float(np.mean(health_gains)) if health_gains else 0.0,
        "LowHSR_k": float(np.mean(low_shares)) if low_shares else 0.0,
        "Users": len(precisions),
    }





In [None]:
# Progress-enabled Bootstrap + Grid Search

import os, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# Output directory for intermediate result files.
os.makedirs("results", exist_ok=True)


def _bootstrap_metrics(
    packs,
    item_hsr_cont,
    alpha,
    k: int = 10,
    B: int = 200,
    rng: np.random.Generator | None = None,
    show_progress: bool = False,
) -> dict:
    """User-level bootstrap of Precision_k and HealthGain_k for one ``alpha``.

    Draws ``B`` resamples of the packs (with replacement, same size as the
    input), evaluates each with ``_eval_on_packs`` and returns the mean and
    sample standard deviation (ddof=1) of both metrics. When ``rng`` is not
    supplied a generator seeded with 42 is used.
    """
    rng = rng if rng else np.random.default_rng(42)
    n_packs = len(packs)
    pack_ids = np.arange(n_packs)

    rounds = range(B)
    if show_progress:
        rounds = tqdm(rounds, leave=False, desc=f"bootstrap B={B}, α={alpha:.2f}")

    prec_samples, gain_samples = [], []
    for _ in rounds:
        drawn = rng.choice(pack_ids, size=n_packs, replace=True)
        metrics = _eval_on_packs([packs[i] for i in drawn], item_hsr_cont, alpha=alpha, k=k)
        prec_samples.append(metrics["Precision_k"])
        gain_samples.append(metrics["HealthGain_k"])

    return {
        "precision_mean": float(np.mean(prec_samples)),
        "precision_std": float(np.std(prec_samples, ddof=1)),
        "health_mean": float(np.mean(gain_samples)),
        "health_std": float(np.std(gain_samples, ddof=1)),
    }

def grid_search_alpha_with_ci(
    val_packs,
    test_packs,
    item_hsr,
    item_hsr_display_dict=None,
    val_true_pos_map=None,
    test_true_pos_map=None,
    use_df=None,
    k: int = 10,
    alphas: np.ndarray | None = None,
    B: int = 200,
    seed: int = 42,
    max_drop: float = 0.05,
    autosave_every: int = 5,
    tag: str = "run",
    show_progress: bool = True,
    select_on: str = "test",
):
    """Sweep the health weight α with bootstrap CIs and pick an operating point α*.

    For every α the VAL and TEST packs are bootstrapped with
    ``_bootstrap_metrics``. α* is the setting with the highest mean HealthGain
    among those whose mean precision is at least ``(1 - max_drop)`` times the
    α=0 baseline precision and whose mean HealthGain is positive. If nothing
    qualifies, the setting with the highest mean precision is used instead.

    Side effects: prints progress, writes ``results/df_val_{tag}.csv``,
    ``results/df_test_{tag}.csv``, ``results/best_alpha_{tag}.json`` and
    ``results/tradeoff_{tag}.png`` (plus periodic ``results/tmp_*`` snapshots),
    and shows the TEST trade-off figure.

    Args:
        val_packs: Validation packs, passed to ``_eval_on_packs``.
        test_packs: Test packs, passed to ``_eval_on_packs``.
        item_hsr: Item HSR lookup forwarded to the evaluators.
        item_hsr_display_dict: Optional; forwarded to ``alpha_rerank_panels``.
        val_true_pos_map: Optional; forwarded to ``alpha_rerank_panels`` (VAL).
        test_true_pos_map: Optional; forwarded to ``alpha_rerank_panels`` (TEST).
        use_df: Optional; forwarded to ``alpha_rerank_panels``.
        k: Top-k cut-off.
        alphas: Grid of α values; defaults to 21 evenly spaced points in [0, 1].
        B: Bootstrap replicates per α and split.
        seed: Seed of the generator shared by all bootstrap runs.
        max_drop: Largest tolerated relative precision loss versus α=0.
        autosave_every: Write partial tables every this many α values; ``0`` or
            ``None`` disables the periodic snapshots.
        tag: Suffix used in all output file names.
        show_progress: Show ``tqdm`` progress bars.
        select_on: Split used to choose α*: ``"test"`` (default, legacy
            behaviour) or ``"val"``. Choosing on TEST makes the reported TEST
            metrics optimistically biased; ``"val"`` avoids that.

    Returns:
        ``(df_val, df_test, best)`` where the frames hold one bootstrap summary
        row per α and ``best`` has ``best_alpha``, ``precision`` and
        ``healthgain`` (the TEST bootstrap means at α*).

    Raises:
        ValueError: If ``alphas`` is empty or ``select_on`` is not
            ``"val"``/``"test"``.
    """
    if alphas is None:
        alphas = np.linspace(0.0, 1.0, 21)
    alphas = np.asarray(alphas, dtype=float).ravel()
    if alphas.size == 0:
        raise ValueError("alphas must contain at least one value")

    split = str(select_on).strip().lower()
    if split not in ("val", "test"):
        raise ValueError(f"select_on must be 'val' or 'test', got {select_on!r}")

    # Do not rely on another cell having created the output directory.
    os.makedirs("results", exist_ok=True)

    rng = np.random.default_rng(seed)
    rows_val, rows_test = [], []

    print("[info] baseline (α=0) on VAL/TEST")
    base_val  = _eval_on_packs(val_packs,  item_hsr, alpha=0.0, k=k)
    base_test = _eval_on_packs(test_packs, item_hsr, alpha=0.0, k=k)
    print("  VAL :", base_val)
    print("  TEST:", base_test)

    iterator = tqdm(alphas, desc="α grid", leave=True) if show_progress else alphas
    for i, a in enumerate(iterator, 1):
        a = float(a)
        if show_progress:
            iterator.set_postfix_str(f"α={a:.2f}")

        ci_val = _bootstrap_metrics(val_packs,  item_hsr, alpha=a, k=k, B=B, rng=rng, show_progress=show_progress)
        ci_tst = _bootstrap_metrics(test_packs, item_hsr, alpha=a, k=k, B=B, rng=rng, show_progress=show_progress)

        rows_val.append(dict(alpha=a, **ci_val))
        rows_test.append(dict(alpha=a, **ci_tst))

        print(f"→ α={a:.2f} | TEST Precision={ci_tst['precision_mean']:.4f}±{1.96*ci_tst['precision_std']:.4f}, "
              f"HealthGain={ci_tst['health_mean']:.4f}±{1.96*ci_tst['health_std']:.4f}")

        # A falsy autosave_every disables snapshots instead of dividing by zero.
        if autosave_every and (i % autosave_every) == 0:
            pd.DataFrame(rows_val).to_csv(f"results/tmp_val_{tag}.csv", index=False)
            pd.DataFrame(rows_test).to_csv(f"results/tmp_test_{tag}.csv", index=False)
            print(f"[auto-save] partial results -> results/tmp_val_{tag}.csv, results/tmp_test_{tag}.csv")

    df_val  = pd.DataFrame(rows_val)
    df_test = pd.DataFrame(rows_test)

    # Choose α* on the requested split. Both frames were filled in the same
    # loop, so a row label identifies the same α in either of them.
    df_sel, base_sel = (df_val, base_val) if split == "val" else (df_test, base_test)
    base_p = base_sel["Precision_k"]
    mask = (df_sel["precision_mean"] >= (1 - max_drop) * base_p) & (df_sel["health_mean"] > 0)
    if mask.any():
        ranked = df_sel[mask].sort_values("health_mean", ascending=False)
    else:
        ranked = df_sel.sort_values("precision_mean", ascending=False)
    best_row = df_test.loc[ranked.index[0]]

    best = dict(
        best_alpha = float(best_row["alpha"]),
        precision  = float(best_row["precision_mean"]),
        healthgain = float(best_row["health_mean"]),
    )

    df_val.to_csv (f"results/df_val_{tag}.csv",  index=False)
    df_test.to_csv(f"results/df_test_{tag}.csv", index=False)
    with open(f"results/best_alpha_{tag}.json", "w") as f:
        json.dump(best, f, indent=2)

    # Plot and save TEST trade-off with 95% CI
    fig, ax1 = plt.subplots(figsize=(8, 5))
    p_low  = df_test["precision_mean"] - 1.96 * df_test["precision_std"]
    p_high = df_test["precision_mean"] + 1.96 * df_test["precision_std"]
    ax1.fill_between(df_test["alpha"], p_low, p_high, alpha=0.2, label="Precision 95% CI")
    p_line, = ax1.plot(df_test["alpha"], df_test["precision_mean"], "-o", label=f"Precision@{k}")
    ax1.set_xlabel("α (health weight)")
    ax1.set_ylabel(f"Precision@{k}", color=p_line.get_color())

    # Right axis: HealthGain
    ax2 = ax1.twinx()
    h_low  = df_test["health_mean"] - 1.96 * df_test["health_std"]
    h_high = df_test["health_mean"] + 1.96 * df_test["health_std"]
    ax2.fill_between(df_test["alpha"], h_low, h_high, alpha=0.2, color="C1", label="HealthGain 95% CI")
    h_line, = ax2.plot(df_test["alpha"], df_test["health_mean"], "-s", color="C1", label=f"HealthGain@{k}")
    ax2.set_ylabel(f"HealthGain@{k}", color="C1")

    # Mark the chosen alpha. Marker, title and legend go on the twinned axis,
    # which is drawn on top, so nothing is hidden behind the HealthGain band.
    ax2.axvline(best["best_alpha"], ls="--", color="k", lw=1)
    ax2.set_title(f"Health–Accuracy Trade-off (Bootstrap CI, K={k}) - TEST")
    lines = [p_line, h_line]
    labels = [l.get_label() for l in lines]
    ax2.legend(lines, labels, loc="best")
    fig.tight_layout()
    fig.savefig(f"results/tradeoff_{tag}.png", dpi=150)
    plt.show()

    print(f"\n[auto] Optimal α on {split.upper()} = {best['best_alpha']:.2f} "
          f"(Precision={best['precision']:.4f}, HealthGain={best['healthgain']:.4f})")
    print(f"[saved] results/df_val_{tag}.csv, results/df_test_{tag}.csv, results/best_alpha_{tag}.json, results/tradeoff_{tag}.png")

    # Optional heavy re-eval at α* (once, for reporting metrics). Every input
    # the heavy evaluator receives must be present, including the TEST map.
    heavy_inputs = (item_hsr_display_dict, val_true_pos_map, test_true_pos_map, use_df)
    if all(x is not None for x in heavy_inputs):
        # Look the evaluator up explicitly: catching NameError around the calls
        # would also hide NameErrors raised inside the evaluator itself.
        rerank = globals().get("alpha_rerank_panels")
        if rerank is None:
            print("[warn] alpha_rerank_panels is not defined; skip heavy re-eval.")
        else:
            a_star = best["best_alpha"]
            print(f"\n[final re-eval @ α*={a_star:.2f}] using heavy metrics...")
            res_val  = rerank(val_packs,  val_true_pos_map,  item_hsr, use_df,
                              alpha=a_star, k=k, item_hsr_display_dict=item_hsr_display_dict)
            res_test = rerank(test_packs, test_true_pos_map, item_hsr, use_df,
                              alpha=a_star, k=k, item_hsr_display_dict=item_hsr_display_dict)
            print("[final @ α*] VAL :",  res_val)
            print("[final @ α*] TEST:",  res_test)

    return df_val, df_test, best


In [None]:
# Sweep α on the full data; the call also writes its tables and figure to results/.
df_val, df_test, best = grid_search_alpha_with_ci(
    val_packs=val_packs, test_packs=test_packs,
    item_hsr=item_hsr_cont,
    item_hsr_display_dict=item_hsr_disp,
    val_true_pos_map=true_pos_map_val,
    test_true_pos_map=true_pos_map_test,
    use_df=use,
    k=10, alphas=np.linspace(0,1,21), B=200, seed=42,
    max_drop=0.05, autosave_every=5,
    tag="full_run", show_progress=True,
)

# Heavy TEST metrics at the chosen α, stored next to the other results.
# The output path is built once and reused for writing and for the log line.
_alpha_star = float(best["best_alpha"])
_heavy_csv = f"results/best_alpha_old_main_with_CI_{best['best_alpha']:.2f}_test_metrics.csv"

heavy_test = alpha_rerank_panels(
    test_packs, true_pos_map_test, item_hsr_cont, use,
    alpha=_alpha_star, k=10, item_hsr_display_dict=item_hsr_disp
)

# Later keys win, so HEALTH entries override RANKING entries of the same name.
_heavy_metrics = dict(heavy_test["RANKING"])
_heavy_metrics.update(heavy_test["HEALTH"])
pd.Series(_heavy_metrics).to_csv(_heavy_csv)
print("[saved]", _heavy_csv)

In [None]:
# Result Visualization and Saving
import os, json, datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory that receives every table and figure written by this cell.
RESULT_DIR = "results"
os.makedirs(RESULT_DIR, exist_ok=True)

# Timestamp suffix so repeated runs never overwrite earlier outputs.
ts = dt.datetime.now().strftime("%Y%m%d_%H%M%S")

# Reuse a notebook-level `k` only when it really is an integer cut-off. The
# name can be rebound by other cells (for example as a loop variable over dict
# keys), and int() on such a value would raise.
_k_global = globals().get("k")
k_plot = (
    int(_k_global)
    if isinstance(_k_global, (int, np.integer)) and not isinstance(_k_global, bool)
    else 10
)


def _check_cols(df, name):
    need = {"alpha","precision_mean","precision_std","health_mean","health_std"}
    miss = [c for c in need if c not in df.columns]
    if miss:
        raise ValueError(f"{name} Missing: {miss}")

# Validate both bootstrap tables before anything is written.
_check_cols(df_val, "df_val")
_check_cols(df_test, "df_test")

# Timestamped output locations for this run.
val_csv = os.path.join(RESULT_DIR, f"df_val_{ts}.csv")
test_csv = os.path.join(RESULT_DIR, f"df_test_{ts}.csv")
best_json = os.path.join(RESULT_DIR, f"best_alpha_{ts}.json")

# Persist the tables and the selected operating point.
df_val.to_csv(val_csv, index=False)
df_test.to_csv(test_csv, index=False)
with open(best_json, "w", encoding="utf-8") as f:
    json.dump(best, f, ensure_ascii=False, indent=2)

print(
    f"[saved] {val_csv}",
    f"[saved] {test_csv}",
    f"[saved] {best_json}",
    sep="\n",
)


def plot_tradeoff_with_ci(df, split_name: str, k_top: int = 10, save_ts: str | None = None):
    """Plot Precision@k and HealthGain@k against α with 95% bands and save it.

    The bands are ``mean ± 1.96 * std`` of the bootstrap columns in ``df``.
    The figure is written to ``RESULT_DIR`` as
    ``alpha_tradeoff_{split}_{timestamp}.png`` and then shown.

    Args:
        df: Frame with ``alpha``, ``precision_mean``, ``precision_std``,
            ``health_mean`` and ``health_std`` columns.
        split_name: Label of the split; used in the title and, lower-cased, in
            the file name.
        k_top: Cut-off shown in the labels.
        save_ts: Timestamp for the file name; falls back to the notebook-level
            ``ts`` when empty or ``None``.

    Returns:
        Path of the saved PNG.
    """
    fig, ax1 = plt.subplots(figsize=(8.5, 5))

    # Precision@k (left axis)
    ax1.fill_between(df["alpha"],
                     df["precision_mean"] - 1.96*df["precision_std"],
                     df["precision_mean"] + 1.96*df["precision_std"],
                     alpha=0.20, label="Precision 95% CI")
    p_line, = ax1.plot(df["alpha"], df["precision_mean"], "-o", label=f"Precision@{k_top}")
    ax1.set_xlabel("α (health weight)")
    ax1.set_ylabel(f"Precision@{k_top}")
    ax1.grid(True, alpha=0.25)

    # HealthGain@k (right axis)
    ax2 = ax1.twinx()
    ax2.fill_between(df["alpha"],
                     df["health_mean"] - 1.96*df["health_std"],
                     df["health_mean"] + 1.96*df["health_std"],
                     alpha=0.20, color="tab:orange", label="HealthGain 95% CI")
    h_line, = ax2.plot(df["alpha"], df["health_mean"], "-s", color="tab:orange",
                       label=f"HealthGain@{k_top}")
    # The evaluator reports HealthGain as a difference of raw HSR means (it is
    # not divided by 5), so the axis label must not claim a /5 scaling.
    ax2.set_ylabel(f"HealthGain@{k_top} (avg HSR diff)")

    ax1.set_title(f"Health–Accuracy Trade-off (Bootstrap CI) — {split_name} (K={k_top})")
    lines = [p_line, h_line]
    labels = [l.get_label() for l in lines]
    ax1.legend(lines, labels, loc="best")

    # Use the figure handle so the right figure is saved whatever pyplot's
    # notion of the "current" figure happens to be.
    fig.tight_layout()
    fname = os.path.join(RESULT_DIR, f"alpha_tradeoff_{split_name.lower()}_{save_ts or ts}.png")
    fig.savefig(fname, dpi=300, bbox_inches="tight")
    plt.show()
    print(f"[saved] {fname}")
    return fname

# One trade-off figure per split; each call returns the path it saved to.
fig_val = plot_tradeoff_with_ci(df_val, "VAL", k_plot, ts)
fig_test = plot_tradeoff_with_ci(df_test, "TEST", k_plot, ts)

# Compact record of the selected operating point and the table sizes.
snap = dict(
    best_alpha=float(best.get("best_alpha", np.nan)),
    best_precision_mean=float(best.get("precision", np.nan)),
    best_healthgain_mean=float(best.get("healthgain", np.nan)),
    k=k_plot,
    val_rows=int(len(df_val)),
    test_rows=int(len(df_test)),
)
snap_csv = os.path.join(RESULT_DIR, f"snapshot_{ts}.csv")
pd.Series(snap).to_csv(snap_csv)
print(f"[saved] {snap_csv}")
print("[snapshot]", snap)

# Food-probability histogram: drawn only when the frame `g`, the threshold
# `th_best` and the `is_food_score` column are all available in this kernel.
if "g" not in globals() or "th_best" not in globals() or "is_food_score" not in g.columns:
    print("[note] Skip")
else:
    plt.figure(figsize=(8,4.2))
    plt.hist(g["is_food_score"], bins=60, alpha=0.8)
    plt.axvline(float(th_best), ls="--", label=f"thr={float(th_best):.3f}")
    plt.title("Food probability distribution (Phase 1)")
    plt.xlabel("is_food_score")
    plt.ylabel("Count")
    plt.legend()
    plt.tight_layout()
    hist_path = os.path.join(RESULT_DIR, f"food_prob_hist_{ts}.png")
    plt.savefig(hist_path, dpi=300, bbox_inches="tight")
    plt.show()
    print(f"[saved] {hist_path}")


In [None]:
# Heavy TEST metrics at the two extremes: pure relevance (α=0) and pure health (α=1).
# NOTE(review): this cell passes `item_hsr`, while the other cells in view pass
# `item_hsr_cont`; confirm `item_hsr` is defined at notebook level.
for _alpha in (0.0, 1.0):
    print(alpha_rerank_panels(
        test_packs, true_pos_map_test,
        item_hsr_dict=item_hsr,
        use_df=use,
        alpha=_alpha, k=10,
        item_hsr_display_dict=item_hsr_disp
    ))


In [None]:
import os

# Collect the final artefacts of the α search in one directory.
output_dir = "alpha_search_results"
os.makedirs(output_dir, exist_ok=True)

# Save the trade-off figure only if a figure with content is still open. Once a
# figure has been shown, pyplot may have no live figure, and savefig would then
# create a fresh empty one and write a blank image.
fig_path = os.path.join(output_dir, "Health_Accuracy_Tradeoff_Test.png")
if plt.get_fignums() and plt.gcf().get_axes():
    plt.savefig(fig_path, dpi=300, bbox_inches='tight')
    print(f"[saved] Figure saved to: {fig_path}")
else:
    print(f"[skip] No open figure with content; {fig_path} not written")

val_path = os.path.join(output_dir, "df_val.csv")
test_path = os.path.join(output_dir, "df_test.csv")
df_val.to_csv(val_path, index=False)
df_test.to_csv(test_path, index=False)
print(f"[saved] Tables saved to: {val_path}, {test_path}")

summary_path = os.path.join(output_dir, "best_alpha_summary.txt")
with open(summary_path, "w", encoding="utf-8") as f:
    f.write("=== Optimal α Summary ===\n")
    # Dedicated loop names: using `k` here would overwrite the notebook-level
    # top-k value that other cells read.
    for key, value in best.items():
        f.write(f"{key}: {value}\n")
print(f"[saved] Summary saved to: {summary_path}")

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt

# Top-k cut-off shown in the title and axis labels.
k = 10

os.makedirs("alpha_search_results", exist_ok=True)
out_path = os.path.join("alpha_search_results", "Health_Accuracy_Tradeoff_Test.png")

fig, ax1 = plt.subplots(figsize=(8,5))
ax1.set_title(f"Health–Accuracy Trade-off (Bootstrap CI, K={k}) - TEST")

# Left axis: Precision@k with a mean ± 1.96·std band.
lower_p = df_test['precision_mean'] - 1.96*df_test['precision_std']
upper_p = df_test['precision_mean'] + 1.96*df_test['precision_std']
ax1.fill_between(df_test['alpha'], lower_p, upper_p, alpha=0.2, label='Precision 95% CI')
p1, = ax1.plot(df_test['alpha'], df_test['precision_mean'], '-o', label=f'Precision@{k}')
ax1.set_xlabel('α (health weight)')
ax1.set_ylabel(f'Precision@{k}')

# Right axis: HealthGain@k. A twinned axis starts its own colour cycle, so
# without an explicit colour this series is drawn in the same colour as the
# precision curve and the two cannot be told apart.
ax2 = ax1.twinx()
lower_h = df_test['health_mean'] - 1.96*df_test['health_std']
upper_h = df_test['health_mean'] + 1.96*df_test['health_std']
ax2.fill_between(df_test['alpha'], lower_h, upper_h, alpha=0.2, color='tab:orange',
                 label='HealthGain 95% CI')
p2, = ax2.plot(df_test['alpha'], df_test['health_mean'], '-s', color='tab:orange',
               label=f'HealthGain@{k}')
ax2.set_ylabel(f'HealthGain@{k} (avg HSR diff)')

# One legend for the two mean curves.
lines = [p1, p2]
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels, loc='best')

fig.tight_layout()
fig.savefig(out_path, dpi=300, bbox_inches='tight')
print(f"[saved] {out_path}")


In [None]:
import matplotlib.pyplot as plt

# Side-by-side histograms: HSR in the reference table vs HSR assigned to groceries.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10,4))
for _ax, _values, _color, _title in zip(
    axes,
    (h[hsr_col], g["assigned_hsr_ml"]),
    ('green', 'orange'),
    ("Original HSR distribution", "Mapped Grocery HSR distribution"),
):
    _ax.hist(_values.dropna(), bins=20, color=_color, alpha=0.6)
    _ax.set_title(_title)
plt.show()

# Headline numbers for the same comparison.
print("Original HSR mean:", h[hsr_col].mean())
print("Mapped HSR mean:", g["assigned_hsr_ml"].mean())
print("SBERT mean similarity:", g['hsr_top1_sim'].mean())


In [None]:
import matplotlib.pyplot as plt
rng = np.random.default_rng(42)

def candidate_pool_hsr(packs, item_hsr_dict, max_users=2000):
    """Collect the HSR of every candidate item of the first ``max_users`` packs.

    Args:
        packs: Sequence of packs whose second element is the candidate list.
        item_hsr_dict: Mapping from item to HSR; unknown items count as missing.
        max_users: Number of leading packs to scan.

    Returns:
        1-D float array of the finite HSR values that lie in [0.5, 5.0], in
        pack order and with repeats kept.
    """
    looked_up = [
        item_hsr_dict.get(item, np.nan)
        for _user, items, *_rest in packs[:max_users]
        for item in items
    ]
    pooled = np.asarray(looked_up, dtype=float)
    # Drop missing/non-finite entries first, then anything outside the HSR scale.
    finite = pooled[np.isfinite(pooled)]
    in_range = (finite >= 0.5) & (finite <= 5.0)
    return finite[in_range]

def simulate_recommend(packs, item_hsr_dict, alpha, k=10, max_users=2000):
    """Return the HSR of the top-``k`` items picked for each user at weight ``alpha``.

    Every candidate is scored as ``(1 - alpha) * base_score + alpha * HSR / 5``.
    Candidates without an HSR get the mean HSR of that user's known candidates;
    users with no known HSR at all are skipped.

    Args:
        packs: Sequence of ``(user, candidates, ytrue, base_scores)`` tuples.
        item_hsr_dict: Mapping from item to HSR.
        alpha: Health weight in the hybrid score.
        k: Number of items taken per user.
        max_users: Number of leading packs to process.

    Returns:
        1-D float array of the selected items' HSR values, restricted to finite
        values in [0.5, 5.0].
    """
    picked = []
    for _user, cand, _ytrue, base_scores in packs[:max_users]:
        h = np.array([item_hsr_dict.get(i, np.nan) for i in cand], dtype=float)
        ok = np.isfinite(h)
        if not ok.any():
            continue
        h_user = np.where(ok, h, h[ok].mean())
        # Coerce the scores to an array: multiplying a plain list by a float
        # raises TypeError.
        base = np.asarray(base_scores, dtype=float)
        z = (1 - alpha) * base + alpha * (h_user / 5.0)
        idx = np.argsort(-z)[:k]
        picked.extend(h_user[idx].tolist())
    xs = np.array(picked, dtype=float)
    xs = xs[np.isfinite(xs)]
    return xs[(xs >= 0.5) & (xs <= 5.0)]

# HSR of all candidates versus HSR of the top-10 picks at α=0 and α=0.65.
pool = candidate_pool_hsr(test_packs, item_hsr_cont, max_users=2000)
rec_a0 = simulate_recommend(test_packs, item_hsr_cont, alpha=0.00, k=10, max_users=2000)
rec_a65 = simulate_recommend(test_packs, item_hsr_cont, alpha=0.65, k=10, max_users=2000)

# Equal-sized samples (capped at 30000) so the three histograms are comparable.
m = min(30000, *(len(arr) for arr in (pool, rec_a0, rec_a65)))
pool_s = rng.choice(pool, size=m, replace=False)
rec0_s = rng.choice(rec_a0, size=m, replace=True)
rec65_s = rng.choice(rec_a65, size=m, replace=True)

bins = np.linspace(0.5, 5.0, 19)

plt.figure(figsize=(9,5))
for _sample, _opacity, _label in (
    (pool_s, 0.35, "All candidate items"),
    (rec0_s, 0.60, "Baseline top-10 (α=0)"),
    (rec65_s, 0.60, "α=0.65 top-10"),
):
    plt.hist(_sample, bins=bins, alpha=_opacity, density=True, label=_label)
plt.xlabel("HSR score (0.5–5)")
plt.ylabel("Density")
plt.title("Distribution of HSR: All vs Recommended (density, subsampled)")
plt.legend()
plt.tight_layout()
plt.savefig("results/hsr_distributions_density.png", dpi=160)
plt.show()

# The means are reported on the full arrays, not on the subsamples.
print(f"[stats] pool mean={pool.mean():.3f} | α=0 mean={rec_a0.mean():.3f} | α=0.65 mean={rec_a65.mean():.3f}")


In [None]:
import numpy as np

def avg_hsr_stats(alpha, k=10, packs=None, hsr_frame=None):
    """Average HSR of all candidates and of the top-``k`` picks across users.

    Every candidate is scored as ``(1 - alpha) * base_score + alpha * HSR / 5``.
    Candidates without an HSR get the mean HSR of that user's known candidates;
    users with no known HSR at all are skipped so they cannot turn the averages
    into NaN.

    Args:
        alpha: Health weight in the hybrid score.
        k: Number of items taken per user.
        packs: Sequence of ``(user, candidates, ytrue, base_scores)`` tuples;
            defaults to the notebook-level ``test_packs``.
        hsr_frame: DataFrame with ``item_norm`` and ``assigned_hsr_ml``
            columns; defaults to the notebook-level ``g``.

    Returns:
        ``(candidate_mean, top_k_mean)`` as floats; both are NaN when no user
        could be evaluated.
    """
    if packs is None:
        packs = test_packs
    if hsr_frame is None:
        hsr_frame = g

    # One pass over the frame instead of one full scan per candidate item.
    item_mean_hsr = hsr_frame.groupby("item_norm")["assigned_hsr_ml"].mean().to_dict()

    top_means = []
    cand_means = []
    for _user, cand, _ytrue, base_scores in packs:
        h = np.array([item_mean_hsr.get(i, np.nan) for i in cand], dtype=float)
        known = np.isfinite(h)
        if not known.any():
            continue
        h = np.where(known, h, h[known].mean())
        hybrid = (1 - alpha) * np.asarray(base_scores, dtype=float) + alpha * (h / 5.0)
        idx = np.argsort(-hybrid)[:k]
        top_means.append(h[idx].mean())
        cand_means.append(h.mean())

    if not cand_means:
        return float("nan"), float("nan")
    return float(np.mean(cand_means)), float(np.mean(top_means))

# Report candidate vs top-10 mean HSR for the baseline and for α=0.65.
for _alpha, _label in ((0.0, "Baseline α=0"), (0.65, "α=0.65")):
    cand_mean, top_mean = avg_hsr_stats(alpha=_alpha)
    print(f"{_label}: candidates mean={cand_mean:.3f}, top@10 mean={top_mean:.3f}")


In [None]:
import matplotlib.pyplot as plt

def user_health_gain(alpha, k=10, packs=None, hsr_frame=None):
    """Per-user HealthGain: mean HSR of the top-``k`` picks minus the candidate mean.

    Every candidate is scored as ``(1 - alpha) * base_score + alpha * HSR / 5``.
    Candidates without an HSR get the mean HSR of that user's known candidates;
    users with no known HSR at all are skipped, so the result never contains
    NaN entries that would break a histogram.

    Args:
        alpha: Health weight in the hybrid score.
        k: Number of items taken per user.
        packs: Sequence of ``(user, candidates, ytrue, base_scores)`` tuples;
            defaults to the notebook-level ``test_packs``.
        hsr_frame: DataFrame with ``item_norm`` and ``assigned_hsr_ml``
            columns; defaults to the notebook-level ``g``.

    Returns:
        1-D float array with one gain per evaluated user.
    """
    if packs is None:
        packs = test_packs
    if hsr_frame is None:
        hsr_frame = g

    # One pass over the frame instead of one full scan per candidate item.
    item_mean_hsr = hsr_frame.groupby("item_norm")["assigned_hsr_ml"].mean().to_dict()

    user_gain = []
    for _user, cand, _ytrue, base_scores in packs:
        h = np.array([item_mean_hsr.get(i, np.nan) for i in cand], dtype=float)
        known = np.isfinite(h)
        if not known.any():
            continue
        h = np.where(known, h, h[known].mean())
        hybrid = (1 - alpha) * np.asarray(base_scores, dtype=float) + alpha * (h / 5.0)
        idx = np.argsort(-hybrid)[:k]
        user_gain.append(h[idx].mean() - h.mean())
    return np.array(user_gain, dtype=float)

# Per-user HealthGain at the baseline and at α=0.65.
gain_base = user_health_gain(alpha=0.0)
gain_alpha = user_health_gain(alpha=0.65)

# Overlaid histograms; the dashed line marks zero gain.
plt.figure(figsize=(8,5))
for _gains, _label in ((gain_base, 'Baseline α=0'), (gain_alpha, 'α=0.65')):
    plt.hist(_gains, bins=40, alpha=0.6, label=_label)
plt.axvline(0, color='black', linestyle='--')
plt.xlabel("User-level HealthGain (ΔHSR)")
plt.ylabel("Count of users")
plt.title("User-level HealthGain distribution")
plt.legend()
plt.show()
