In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.metrics import ndcg_score
from tqdm import tqdm

# Data path: each file contains top-N candidates for multiple users
data_dir = Path("recys-rag-stack/data/processed/rank_labeled")
files = sorted(data_dir.glob("rank_*.parquet"))[:100]  # Limit the number of files

total_ndcg, user_count = 0.0, 0

for f in tqdm(files, desc="Evaluating per file"):
    df = pd.read_parquet(f)

    for uid, group in df.groupby("userid"):
        # Check if the group is valid: at least 10 samples and both positive & negative labels
        if len(group) < 10 or group["label"].nunique() < 2:
            continue

        # Sort by Two-Tower score and take top 10
        group = group.sort_values("score", ascending=False).head(10)

        # Evaluate NDCG@10
        y_true = group["label"].to_numpy().reshape(1, -1)
        y_score = group["score"].to_numpy().reshape(1, -1)
        total_ndcg += ndcg_score(y_true, y_score, k=10)
        user_count += 1

ndcg_final = total_ndcg / user_count if user_count else 0
print(f"Valid users: {user_count}")
print(f"Two-Tower NDCG@10: {ndcg_final:.4f}")


Evaluating per file: 100%|██████████| 100/100 [00:06<00:00, 15.86it/s]

Valid users: 100
Two-Tower NDCG@10: 0.0163





In [None]:
import pandas as pd, numpy as np, lightgbm as lgb
from pathlib import Path
from sklearn.metrics import ndcg_score
from tqdm import tqdm

model_path = "recys-rag-stack/outputs/lambdarank_model.txt"
data_dir = Path("recys-rag-stack/data/processed/rank_labeled")
files = sorted(data_dir.glob("rank_*.parquet"))[:100]

feature_cols = [
    "score",
    "user_total_views", "user_total_buys", "user_buy_rate",
    "item_total_views", "item_total_buys", "item_buy_rate",
]

model = lgb.Booster(model_file=model_path)

total_ndcg, user_count = 0.0, 0

for f in tqdm(files, desc="Evaluating per file"):
    df = pd.read_parquet(f)
    df["gbt_score"] = model.predict(df[feature_cols].astype("float32"),
                                    num_iteration=model.best_iteration)

    for uid, group in df.groupby("userid"):
        # Sort by GBT score and take top 10
        group = group.sort_values("gbt_score", ascending=False).head(10)

        # Only evaluate if group has at least 10 items and both positive & negative labels
        if len(group) < 10 or group["label"].nunique() < 2:
            continue

        y_true = group["label"].to_numpy().reshape(1, -1)
        y_score = group["gbt_score"].to_numpy().reshape(1, -1)
        total_ndcg += ndcg_score(y_true, y_score, k=10)
        user_count += 1

ndcg_final = total_ndcg / user_count if user_count else 0
print(f"Valid users: {user_count}")
print(f"GBT (LambdaRank) NDCG@10: {ndcg_final:.4f}")


Evaluating per file:   0%|          | 0/100 [00:00<?, ?it/s]

Evaluating per file: 100%|██████████| 100/100 [00:28<00:00,  3.50it/s]

Valid users: 54
GBT (LambdaRank) NDCG@10: 0.5789



