In [3]:
# Block 1–2: Setup, load data, strict schema checks (robust filenames)

import os
import numpy as np
import pandas as pd

SEED = 42
np.random.seed(SEED)

DATA_DIR = "/content/initial_data"

# robust filenames (support both "train.csv" and "train_holodilnik.csv")
def pick_path(dir_path: str, candidates: list[str]) -> str:
    for name in candidates:
        p = os.path.join(dir_path, name)
        if os.path.exists(p):
            return p
    raise FileNotFoundError(f"None of these files found in {dir_path}: {candidates}")

DISHES_PATH = pick_path(DATA_DIR, ["dishes.csv"])
USERS_PATH  = pick_path(DATA_DIR, ["users.csv"])
TRAIN_PATH  = pick_path(DATA_DIR, ["train.csv", "train_holodilnik.csv"])
TEST_PATH   = pick_path(DATA_DIR, ["test.csv", "test_holodilnik.csv"])
SUB_PATH    = pick_path(DATA_DIR, ["sample_submission.csv", "sample_submission_holodilnik.csv"])

print("Using paths")
print("dishes:", DISHES_PATH)
print("users :", USERS_PATH)
print("train :", TRAIN_PATH)
print("test  :", TEST_PATH)
print("sub   :", SUB_PATH)

dishes = pd.read_csv(DISHES_PATH)
users  = pd.read_csv(USERS_PATH)
train  = pd.read_csv(TRAIN_PATH)
test   = pd.read_csv(TEST_PATH)
sub    = pd.read_csv(SUB_PATH)

print("\nShapes")
print("dishes:", dishes.shape)
print("users :", users.shape)
print("train :", train.shape)
print("test  :", test.shape)
print("sub   :", sub.shape)

cand_cols = [f"cand_{i:02d}" for i in range(1, 21)]

req_dishes = {"dish_id", "dish_name", "category", "calories", "spicy", "allergen_tags", "tags"}
req_users  = {"user_id", "allergies", "liked_tags", "disliked_tags", "prefers_light_food", "sweet_tooth", "coffee_addict", "microwave_trust"}

req_train_base = {"event_id", "user_id", "day", "meal_slot", "hangover_level", "guests_count", "diet_mode", "fridge_load_pct", "target_dish_id"}
req_test_base  = {"query_id", "user_id", "day", "meal_slot", "hangover_level", "guests_count", "diet_mode", "fridge_load_pct"}

def assert_has_cols(df: pd.DataFrame, needed: set[str], name: str):
    missing = needed - set(df.columns)
    if missing:
        raise ValueError(f"{name} missing columns: {sorted(missing)}")

assert_has_cols(dishes, req_dishes, "dishes")
assert_has_cols(users,  req_users,  "users")
assert_has_cols(train,  req_train_base | set(cand_cols), "train")
assert_has_cols(test,   req_test_base  | set(cand_cols), "test")
assert_has_cols(sub,    {"query_id", "rec_1", "rec_2", "rec_3", "rec_4", "rec_5"}, "sample_submission")

# ID sanity
if dishes["dish_id"].nunique() != len(dishes):
    raise ValueError("dishes: dish_id not unique")
if not dishes["dish_id"].between(1, 200).all():
    raise ValueError("dishes: dish_id out of [1,200]")

if users["user_id"].nunique() != len(users):
    raise ValueError("users: user_id not unique")
if not users["user_id"].between(1, 15000).all():
    raise ValueError("users: user_id out of [1,15000]")

# candidates checks
def check_candidates(df: pd.DataFrame, name: str):
    cand = df[cand_cols]
    if cand.isna().any().any():
        raise ValueError(f"{name}: NaN in candidates, total={int(cand.isna().sum().sum())}")

    cand_vals = cand.to_numpy()
    if not np.issubdtype(cand_vals.dtype, np.number):
        raise ValueError(f"{name}: candidates dtype not numeric: {cand_vals.dtype}")

    if (cand_vals < 1).any() or (cand_vals > 200).any():
        bad_pos = np.argwhere((cand_vals < 1) | (cand_vals > 200))[:10]
        examples = [(int(i), int(j), int(cand_vals[i, j])) for i, j in bad_pos]
        raise ValueError(f"{name}: candidates out of [1,200], examples={examples}")

check_candidates(train, "train")
check_candidates(test, "test")

# train: target must be among candidates (vectorized)
cand_matrix = train[cand_cols].to_numpy()
target = train["target_dish_id"].to_numpy().reshape(-1, 1)
in_cand = (cand_matrix == target).any(axis=1)
rate = float(in_cand.mean())
print("\ntrain: target in candidates rate:", rate)
if rate < 0.999:
    bad = train.loc[~in_cand, ["event_id", "target_dish_id"] + cand_cols].head(3)
    raise ValueError(f"train: target not in candidates for some rows, examples:\n{bad}")

# reference integrity
dish_ids = set(dishes["dish_id"].astype(int).tolist())
for df_name, df in [("train", train), ("test", test)]:
    all_cands = set(np.unique(df[cand_cols].to_numpy()))
    miss = list(all_cands - dish_ids)
    if miss:
        raise ValueError(f"{df_name}: candidates contain dish_ids not in dishes.csv, examples={sorted(miss)[:20]}")

user_ids = set(users["user_id"].astype(int).tolist())
for df_name, df in [("train", train), ("test", test)]:
    miss_u = list(set(df["user_id"].astype(int).unique()) - user_ids)
    if miss_u:
        raise ValueError(f"{df_name}: has user_id not in users.csv, examples={sorted(miss_u)[:20]}")

# query_id sanity
if test["query_id"].nunique() != len(test):
    raise ValueError("test: query_id not unique")

print("\nOK – data loaded and validated")

print("\nSample rows")
display(dishes.head(3))
display(users.head(3))
display(train.head(2))
display(test.head(2))
display(sub.head(2))


Using paths
dishes: /content/initial_data/dishes.csv
users : /content/initial_data/users.csv
train : /content/initial_data/train_holodilnik.csv
test  : /content/initial_data/test_holodilnik.csv
sub   : /content/initial_data/sample_submission_holodilnik.csv

Shapes
dishes: (200, 7)
users : (15000, 8)
train : (300000, 29)
test  : (20000, 28)
sub   : (20000, 6)

train: target in candidates rate: 1.0

OK – data loaded and validated

Sample rows


Unnamed: 0,dish_id,dish_name,category,calories,spicy,allergen_tags,tags
0,1,Оливье,salad,480,0,egg|mayo|peas,fiber|savory|vegetarian
1,2,Винегрет,salad,304,0,,fiber|low_cal|pickled|savory|sour|vegan|vegeta...
2,3,Салат «Мимоза»,salad,267,0,egg|mayo,fiber|low_cal|savory|vegetarian


Unnamed: 0,user_id,allergies,liked_tags,disliked_tags,prefers_light_food,sweet_tooth,coffee_addict,microwave_trust
0,1,chocolate|mayo|peas,bread|chicken|fried|juice|pasta|seafood_tag|tu...,baked|coffee|compote,0,0,0,0
1,2,,coffee|compote|fried|tea|vegan,chicken|fresh|pasta|spicy,1,0,1,0
2,3,mayo,crispy|meat|pickled|savory|seafood_tag|soda|sweet,compote|fish_tag|pork|rice|vegetarian,0,1,1,0


Unnamed: 0,event_id,user_id,day,meal_slot,hangover_level,guests_count,diet_mode,fridge_load_pct,cand_01,cand_02,...,cand_12,cand_13,cand_14,cand_15,cand_16,cand_17,cand_18,cand_19,cand_20,target_dish_id
0,1,4677,9,lunch,1,0,0,46,10,79,...,33,48,38,105,164,199,163,159,38,52
1,2,1179,14,dinner,0,2,1,44,37,163,...,51,133,56,67,84,140,68,72,31,163


Unnamed: 0,query_id,user_id,day,meal_slot,hangover_level,guests_count,diet_mode,fridge_load_pct,cand_01,cand_02,...,cand_11,cand_12,cand_13,cand_14,cand_15,cand_16,cand_17,cand_18,cand_19,cand_20
0,1,113,6,lunch,3,4,0,91,89,41,...,34,155,115,138,166,161,139,53,173,143
1,2,10620,7,breakfast,3,2,1,62,95,147,...,137,75,48,22,133,63,55,102,106,138


Unnamed: 0,query_id,rec_1,rec_2,rec_3,rec_4,rec_5
0,1,36,89,108,56,99
1,2,36,89,108,56,99


In [6]:
# Block 3–4: Local metric (NDCG@5) + strong baseline (global popularity within candidates)

import numpy as np
import pandas as pd

cand_cols = [f"cand_{i:02d}" for i in range(1, 21)]

def ndcg_at_5_single_rank(rank_1based: int | None) -> float:
    if rank_1based is None:
        return 0.0
    if rank_1based < 1 or rank_1based > 5:
        return 0.0
    return float(1.0 / np.log2(rank_1based + 1))

def ndcg_at_5_from_recs(df: pd.DataFrame, target_col: str, rec_cols: list[str]) -> float:
    target = df[target_col].to_numpy()
    recs = df[rec_cols].to_numpy()
    score = 0.0
    for i in range(len(df)):
        t = target[i]
        row = recs[i]
        pos = None
        for j in range(min(5, row.shape[0])):
            if row[j] == t:
                pos = j + 1
                break
        score += ndcg_at_5_single_rank(pos)
    return float(score / len(df))


dish_pop = (
    train["target_dish_id"]
    .value_counts()
    .sort_index()
)
pop_dict = dish_pop.to_dict()

def rank_candidates_by_pop(cand_row: np.ndarray, pop: dict[int, int]) -> list[int]:

    uniq = []
    seen = set()
    for x in cand_row:
        x = int(x)
        if x not in seen:
            seen.add(x)
            uniq.append(x)

    uniq.sort(key=lambda d: (-pop.get(d, 0), d))
    return uniq[:5]


train_cands = train[cand_cols].to_numpy()
test_cands  = test[cand_cols].to_numpy()

train_recs = np.zeros((len(train), 5), dtype=int)
for i in range(len(train)):
    train_recs[i] = rank_candidates_by_pop(train_cands[i], pop_dict)

test_recs = np.zeros((len(test), 5), dtype=int)
for i in range(len(test)):
    test_recs[i] = rank_candidates_by_pop(test_cands[i], pop_dict)


rng = np.random.default_rng(SEED)
idx = np.arange(len(train))
rng.shuffle(idx)
holdout_size = int(0.2 * len(train))
va_idx = idx[:holdout_size]

va_df = train.iloc[va_idx].copy()
for k in range(5):
    va_df[f"rec_{k+1}"] = train_recs[va_idx, k]

baseline_ndcg = ndcg_at_5_from_recs(va_df, "target_dish_id", [f"rec_{i}" for i in range(1, 6)])
print("Baseline holdout NDCG@5:", baseline_ndcg)


submission_baseline = sub.copy()
for k in range(5):
    submission_baseline[f"rec_{k+1}"] = test_recs[:, k]


def validate_submission(subm: pd.DataFrame, test_df: pd.DataFrame):
    if len(subm) != len(test_df):
        raise ValueError("submission rows != test rows")
    if not subm["query_id"].equals(test_df["query_id"]):
        raise ValueError("query_id mismatch / order changed")

    rec_cols = [f"rec_{i}" for i in range(1, 6)]
    recs = subm[rec_cols].to_numpy()
    cands = test_df[cand_cols].to_numpy()


    distinct_ok = np.array([len(set(row.tolist())) == 5 for row in recs]).mean()
    print("Distinct@5 rate:", float(distinct_ok))

    in_cand = []
    for i in range(len(subm)):
        cand_set = set(cands[i].tolist())
        ok = all(int(x) in cand_set for x in recs[i].tolist())
        in_cand.append(ok)
    in_cand_rate = float(np.mean(in_cand))
    print("In-candidates@5 rate:", in_cand_rate)
    if in_cand_rate < 1.0:
        bad_i = int(np.where(np.array(in_cand) == 0)[0][0])
        raise ValueError(f"Found out-of-candidate recs at row {bad_i}, query_id={int(subm.loc[bad_i,'query_id'])}")

validate_submission(submission_baseline, test)

OUT_PATH = "/content/submission_baseline_popularity.csv"
submission_baseline.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
display(submission_baseline.head(5))


Baseline holdout NDCG@5: 0.27828914462875054
Distinct@5 rate: 1.0
In-candidates@5 rate: 1.0
Saved: /content/submission_baseline_popularity.csv


Unnamed: 0,query_id,rec_1,rec_2,rec_3,rec_4,rec_5
0,1,53,34,104,181,155
1,2,48,63,34,55,75
2,3,40,33,122,108,104
3,4,40,39,192,26,32
4,5,47,34,55,176,85


In [None]:
# Block 5 – Optimized: Candidate-level ranking + strong FE + CatBoostRanker + valid top-5 submission

!pip -q install -U catboost

import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

SEED = 42
rng = np.random.default_rng(SEED)

cand_cols = [f"cand_{i:02d}" for i in range(1, 21)]

# ----------------------------
# 0) Memory – downcast base tables
# ----------------------------
def downcast_int(s: pd.Series, minv=None, maxv=None):
    if s.isna().any():
        return s
    if minv is None: minv = int(s.min())
    if maxv is None: maxv = int(s.max())
    if minv >= 0:
        if maxv < 2**8:   return s.astype(np.uint8)
        if maxv < 2**16:  return s.astype(np.uint16)
        if maxv < 2**32:  return s.astype(np.uint32)
        return s.astype(np.uint64)
    else:
        if -(2**7)  <= minv and maxv < 2**7:   return s.astype(np.int8)
        if -(2**15) <= minv and maxv < 2**15:  return s.astype(np.int16)
        if -(2**31) <= minv and maxv < 2**31:  return s.astype(np.int32)
        return s.astype(np.int64)

def safe_str_series(s: pd.Series, na_val: str = "__NA__") -> pd.Series:
    return s.astype("string").fillna(na_val).astype("object")

# base event tables
for col in ["day", "hangover_level", "guests_count", "diet_mode"]:
    train[col] = downcast_int(train[col])
    test[col]  = downcast_int(test[col])

train["fridge_load_pct"] = train["fridge_load_pct"].astype(np.float32)
test["fridge_load_pct"]  = test["fridge_load_pct"].astype(np.float32)

train["user_id"] = downcast_int(train["user_id"])
test["user_id"]  = downcast_int(test["user_id"])

train["event_id"] = downcast_int(train["event_id"])
test["query_id"]  = downcast_int(test["query_id"])

train["target_dish_id"] = downcast_int(train["target_dish_id"])

for c in cand_cols:
    train[c] = downcast_int(train[c])
    test[c]  = downcast_int(test[c])

# dishes/users tables
dishes["dish_id"] = downcast_int(dishes["dish_id"])
users["user_id"]  = downcast_int(users["user_id"])

dishes["calories"] = dishes["calories"].astype(np.float32)
dishes["spicy"] = downcast_int(dishes["spicy"])
users["prefers_light_food"] = downcast_int(users["prefers_light_food"])
users["sweet_tooth"] = downcast_int(users["sweet_tooth"])
users["coffee_addict"] = downcast_int(users["coffee_addict"])
users["microwave_trust"] = downcast_int(users["microwave_trust"])

# normalize categoricals/text
dishes["category"] = safe_str_series(dishes["category"], "__NA__")
dishes["allergen_tags"] = safe_str_series(dishes["allergen_tags"], "__NONE__")
dishes["tags"] = safe_str_series(dishes["tags"], "__NONE__")

users["allergies"] = safe_str_series(users["allergies"], "__NONE__")
users["liked_tags"] = safe_str_series(users["liked_tags"], "__NONE__")
users["disliked_tags"] = safe_str_series(users["disliked_tags"], "__NONE__")

train["meal_slot"] = safe_str_series(train["meal_slot"], "__NA__")
test["meal_slot"]  = safe_str_series(test["meal_slot"], "__NA__")

# ----------------------------
# 1) Fast lookup tables + parsed sets for overlaps
# ----------------------------
def split_pipe_to_set(s: str) -> set:
    if s is None:
        return set()
    s = str(s).strip()
    if not s or s == "__NONE__" or s == "__NA__" or s.lower() == "nan":
        return set()
    return set([t for t in s.split("|") if t])

dish_cat = dict(zip(dishes["dish_id"].astype(int), dishes["category"].astype(str)))
dish_cal = dict(zip(dishes["dish_id"].astype(int), dishes["calories"].astype(float)))
dish_spicy = dict(zip(dishes["dish_id"].astype(int), dishes["spicy"].astype(int)))
dish_allergens_set = {int(i): split_pipe_to_set(a) for i, a in zip(dishes["dish_id"], dishes["allergen_tags"])}
dish_tags_set = {int(i): split_pipe_to_set(t) for i, t in zip(dishes["dish_id"], dishes["tags"])}

user_allergies_set = {int(i): split_pipe_to_set(a) for i, a in zip(users["user_id"], users["allergies"])}
user_liked_set = {int(i): split_pipe_to_set(t) for i, t in zip(users["user_id"], users["liked_tags"])}
user_disliked_set = {int(i): split_pipe_to_set(t) for i, t in zip(users["user_id"], users["disliked_tags"])}

user_flags = users.set_index("user_id")[["prefers_light_food","sweet_tooth","coffee_addict","microwave_trust"]].copy()

# global popularity (for tie-breaks and as feature)
dish_pop = train["target_dish_id"].value_counts()
dish_pop = dish_pop.to_dict()

# ----------------------------
# 2) Candidate-level dataset builder – optimized loop, negative sampling
# ----------------------------
BASE_COLS = ["user_id", "day", "meal_slot", "hangover_level", "guests_count", "diet_mode", "fridge_load_pct"]

def build_candidate_level_train(events: pd.DataFrame, neg_per_pos: int = 9) -> pd.DataFrame:
    gids = events["event_id"].to_numpy()
    uids = events["user_id"].to_numpy()
    day  = events["day"].to_numpy()
    slot = events["meal_slot"].to_numpy()
    hang = events["hangover_level"].to_numpy()
    gcnt = events["guests_count"].to_numpy()
    diet = events["diet_mode"].to_numpy()
    load = events["fridge_load_pct"].to_numpy()
    cands = events[cand_cols].to_numpy()
    target = events["target_dish_id"].to_numpy()

    rows = []
    rows_append = rows.append

    for i in range(len(events)):
        gid = int(gids[i])
        uid = int(uids[i])
        t = int(target[i])

        cand_row = cands[i].astype(int)

        # uniq candidates with first position
        seen = set()
        uniq = []
        pos_map = []
        for j in range(20):
            d = int(cand_row[j])
            if d not in seen:
                seen.add(d)
                uniq.append(d)
                pos_map.append(j + 1)

        # skip corrupted row (should not happen, but safe)
        if t not in seen:
            continue

        # negatives
        neg_ids = [d for d in uniq if d != t]
        if len(neg_ids) > neg_per_pos:
            neg_ids = rng.choice(neg_ids, size=neg_per_pos, replace=False).tolist()

        pick = [t] + neg_ids

        for d in pick:
            # candidate position in original cand_01..cand_20 (1..20)
            p = pos_map[uniq.index(d)]

            # user-dish interactions
            ua = user_allergies_set.get(uid, set())
            ul = user_liked_set.get(uid, set())
            ud = user_disliked_set.get(uid, set())

            da = dish_allergens_set.get(d, set())
            dt = dish_tags_set.get(d, set())

            allergen_conflict = 1 if len(ua & da) > 0 else 0
            liked_overlap = len(ul & dt)
            disliked_overlap = len(ud & dt)

            # light food signal: low calories preferred
            cal = float(dish_cal.get(d, 0.0))
            is_dessert = 1 if dish_cat.get(d, "__NA__") == "dessert" else 0
            is_drink = 1 if dish_cat.get(d, "__NA__") == "drink" else 0

            pop = int(dish_pop.get(d, 0))

            rows_append((
                gid, uid, int(day[i]), str(slot[i]), int(hang[i]), int(gcnt[i]), int(diet[i]), float(load[i]),
                int(d), int(p),
                int(allergen_conflict), int(liked_overlap), int(disliked_overlap),
                int(is_dessert), int(is_drink),
                float(cal), int(dish_spicy.get(d, 0)), int(pop),
                1 if d == t else 0
            ))

    out = pd.DataFrame(
        rows,
        columns=[
            "group_id","user_id","day","meal_slot","hangover_level","guests_count","diet_mode","fridge_load_pct",
            "dish_id","candidate_pos",
            "allergen_conflict","liked_overlap","disliked_overlap",
            "is_dessert","is_drink",
            "calories","spicy","dish_pop",
            "label"
        ]
    )
    return out

def build_candidate_level_test(queries: pd.DataFrame) -> pd.DataFrame:
    gids = queries["query_id"].to_numpy()
    uids = queries["user_id"].to_numpy()
    day  = queries["day"].to_numpy()
    slot = queries["meal_slot"].to_numpy()
    hang = queries["hangover_level"].to_numpy()
    gcnt = queries["guests_count"].to_numpy()
    diet = queries["diet_mode"].to_numpy()
    load = queries["fridge_load_pct"].to_numpy()
    cands = queries[cand_cols].to_numpy()

    rows = []
    rows_append = rows.append

    for i in range(len(queries)):
        gid = int(gids[i])
        uid = int(uids[i])

        cand_row = cands[i].astype(int)

        seen = set()
        uniq = []
        pos_map = []
        for j in range(20):
            d = int(cand_row[j])
            if d not in seen:
                seen.add(d)
                uniq.append(d)
                pos_map.append(j + 1)

        ua = user_allergies_set.get(uid, set())
        ul = user_liked_set.get(uid, set())
        ud = user_disliked_set.get(uid, set())

        for d in uniq:
            p = pos_map[uniq.index(d)]

            da = dish_allergens_set.get(d, set())
            dt = dish_tags_set.get(d, set())

            allergen_conflict = 1 if len(ua & da) > 0 else 0
            liked_overlap = len(ul & dt)
            disliked_overlap = len(ud & dt)

            cal = float(dish_cal.get(d, 0.0))
            is_dessert = 1 if dish_cat.get(d, "__NA__") == "dessert" else 0
            is_drink = 1 if dish_cat.get(d, "__NA__") == "drink" else 0
            pop = int(dish_pop.get(d, 0))

            rows_append((
                gid, uid, int(day[i]), str(slot[i]), int(hang[i]), int(gcnt[i]), int(diet[i]), float(load[i]),
                int(d), int(p),
                int(allergen_conflict), int(liked_overlap), int(disliked_overlap),
                int(is_dessert), int(is_drink),
                float(cal), int(dish_spicy.get(d, 0)), int(pop)
            ))

    out = pd.DataFrame(
        rows,
        columns=[
            "group_id","user_id","day","meal_slot","hangover_level","guests_count","diet_mode","fridge_load_pct",
            "dish_id","candidate_pos",
            "allergen_conflict","liked_overlap","disliked_overlap",
            "is_dessert","is_drink",
            "calories","spicy","dish_pop"
        ]
    )
    return out

NEG_PER_POS = 9

train_cl = build_candidate_level_train(train, neg_per_pos=NEG_PER_POS)
test_cl  = build_candidate_level_test(test)

print("Candidate-level shapes")
print("train_cl:", train_cl.shape, "label mean:", float(train_cl["label"].mean()))
print("test_cl :", test_cl.shape)

# join user flags (fast)
train_cl = train_cl.merge(user_flags, left_on="user_id", right_index=True, how="left")
test_cl  = test_cl.merge(user_flags, left_on="user_id", right_index=True, how="left")

# hard heuristic penalties/boosters as numeric features
# diet_mode: push away high calories a bit
train_cl["diet_cal_penalty"] = (train_cl["diet_mode"].astype(np.int16) * train_cl["calories"].astype(np.float32)) / 500.0
test_cl["diet_cal_penalty"]  = (test_cl["diet_mode"].astype(np.int16) * test_cl["calories"].astype(np.float32)) / 500.0

# prefers_light_food: similar penalty
train_cl["light_cal_penalty"] = (train_cl["prefers_light_food"].astype(np.int16) * train_cl["calories"].astype(np.float32)) / 500.0
test_cl["light_cal_penalty"]  = (test_cl["prefers_light_food"].astype(np.int16) * test_cl["calories"].astype(np.float32)) / 500.0

# sweet_tooth: dessert affinity
train_cl["sweet_dessert_aff"] = train_cl["sweet_tooth"].astype(np.int16) * train_cl["is_dessert"].astype(np.int16)
test_cl["sweet_dessert_aff"]  = test_cl["sweet_tooth"].astype(np.int16) * test_cl["is_dessert"].astype(np.int16)

# coffee_addict: drink affinity (в данных кофе встречается как теги, но category drink уже сильный сигнал)
train_cl["coffee_drink_aff"] = train_cl["coffee_addict"].astype(np.int16) * train_cl["is_drink"].astype(np.int16)
test_cl["coffee_drink_aff"]  = test_cl["coffee_addict"].astype(np.int16) * test_cl["is_drink"].astype(np.int16)

# allergen conflict: enforce strong negative signal
train_cl["allergen_penalty"] = train_cl["allergen_conflict"].astype(np.int16)
test_cl["allergen_penalty"]  = test_cl["allergen_conflict"].astype(np.int16)

# ensure categorical columns are object strings
train_cl["meal_slot"] = safe_str_series(train_cl["meal_slot"], "__NA__")
test_cl["meal_slot"]  = safe_str_series(test_cl["meal_slot"], "__NA__")

# ----------------------------
# 3) Feature set
# ----------------------------
feature_cols = [
    "user_id","day","meal_slot","hangover_level","guests_count","diet_mode","fridge_load_pct",
    "dish_id","candidate_pos",
    "prefers_light_food","sweet_tooth","coffee_addict","microwave_trust",
    "calories","spicy","dish_pop",
    "liked_overlap","disliked_overlap",
    "allergen_penalty",
    "diet_cal_penalty","light_cal_penalty",
    "sweet_dessert_aff","coffee_drink_aff",
]

cat_cols = ["meal_slot"]

# ----------------------------
# 4) Group holdout split + training with early stopping
# ----------------------------
all_groups = train_cl["group_id"].unique()
rng = np.random.default_rng(SEED)
rng.shuffle(all_groups)

n_val_groups = int(0.2 * len(all_groups))
val_groups = set(all_groups[:n_val_groups])

tr_mask = ~train_cl["group_id"].isin(val_groups)
va_mask = ~tr_mask

tr_df = train_cl.loc[tr_mask]
va_df = train_cl.loc[va_mask]

tr_pool = Pool(
    data=tr_df[feature_cols],
    label=tr_df["label"],
    group_id=tr_df["group_id"],
    cat_features=cat_cols,
)
va_pool = Pool(
    data=va_df[feature_cols],
    label=va_df["label"],
    group_id=va_df["group_id"],
    cat_features=cat_cols,
)

params = dict(
    loss_function="YetiRankPairwise",
    eval_metric="NDCG:top=5",
    iterations=1200,
    learning_rate=0.08,
    depth=6,
    l2_leaf_reg=8.0,
    random_seed=SEED,
    verbose=100,
    od_type="Iter",
    od_wait=100,
    # CPU stability
    allow_writing_files=False,
)

ranker = CatBoostRanker(**params)
ranker.fit(tr_pool, eval_set=va_pool, use_best_model=True)

# ----------------------------
# 5) Local NDCG@5 on holdout groups (true dish is label==1 per group)
# ----------------------------
def ndcg_at_5_from_ranked_lists(true_ids: np.ndarray, recs_2d: np.ndarray) -> float:
    score = 0.0
    for i in range(len(true_ids)):
        t = int(true_ids[i])
        row = recs_2d[i]
        pos = 0
        for j in range(5):
            if int(row[j]) == t:
                pos = j + 1
                break
        if pos == 0:
            continue
        score += 1.0 / np.log2(pos + 1)
    return float(score / len(true_ids))

va_scores = ranker.predict(va_pool)
va_tmp = va_df[["group_id","dish_id","label"]].copy()
va_tmp["score"] = va_scores

top5 = (
    va_tmp.sort_values(["group_id","score"], ascending=[True, False])
    .groupby("group_id")
    .head(5)
)

rec_series = top5.groupby("group_id")["dish_id"].apply(list)
# pad to 5 (на случай групп с <5 уникальными кандид.)
rec_series = rec_series.apply(lambda lst: (lst + lst[:5])[:5])

val_group_list = rec_series.index.to_numpy()
recs_2d = np.vstack(rec_series.to_list()).astype(int)

true_map = (
    va_tmp.loc[va_tmp["label"] == 1, ["group_id","dish_id"]]
    .drop_duplicates("group_id")
    .set_index("group_id")["dish_id"]
)
true_ids = true_map.loc[val_group_list].to_numpy().astype(int)

val_ndcg = ndcg_at_5_from_ranked_lists(true_ids, recs_2d)
print("Holdout groups NDCG@5:", val_ndcg)

# ----------------------------
# 6) Train on full train_cl and predict test, then enforce strict validity
# ----------------------------
full_pool = Pool(
    data=train_cl[feature_cols],
    label=train_cl["label"],
    group_id=train_cl["group_id"],
    cat_features=cat_cols,
)
test_pool = Pool(
    data=test_cl[feature_cols],
    group_id=test_cl["group_id"],
    cat_features=cat_cols,
)

ranker_full = CatBoostRanker(**params)
ranker_full.fit(full_pool)

test_scores = ranker_full.predict(test_pool)

test_tmp = test_cl[["group_id","dish_id"]].copy()
test_tmp["score"] = test_scores

# for each group_id pick top-5 by score
test_top = (
    test_tmp.sort_values(["group_id","score"], ascending=[True, False])
    .groupby("group_id")
    .head(20)  # keep more to fix dups if any
)

# candidates for validation and repair
test_cand_2d = test[cand_cols].to_numpy().astype(int)
qid_order = test["query_id"].astype(int).to_numpy()

cand_map = {int(qid_order[i]): test_cand_2d[i].tolist() for i in range(len(test))}

def make_top5_for_group(gid: int, dish_list_sorted: list[int]) -> list[int]:
    # enforce: unique, in candidates; fill missing with best remaining by (pop, dish_id)
    cand = cand_map[gid]
    cand_set = set(cand)

    out = []
    seen = set()
    for d in dish_list_sorted:
        d = int(d)
        if d in cand_set and d not in seen:
            seen.add(d)
            out.append(d)
        if len(out) == 5:
            return out

    # fill if недостаточно
    remaining = [d for d in cand if d not in seen]
    remaining = list(dict.fromkeys(map(int, remaining)))  # keep order, uniq

    remaining.sort(key=lambda x: (-dish_pop.get(int(x), 0), int(x)))
    for d in remaining:
        if d not in seen:
            out.append(int(d))
            seen.add(int(d))
        if len(out) == 5:
            break
    return out[:5]

# produce per group ranked dish list
group_lists = test_top.groupby("group_id")["dish_id"].apply(list).to_dict()

recs = []
for gid in qid_order:
    # sort candidates of this gid by score: we already took head(20) after sorting, so list is ok
    base_list = group_lists.get(int(gid), [])
    recs.append(make_top5_for_group(int(gid), base_list))

recs_2d = np.array(recs, dtype=int)

# strict validation
def validate_recs(recs_2d: np.ndarray, cand_2d: np.ndarray):
    n = recs_2d.shape[0]
    bad_dup = 0
    bad_out = 0
    for i in range(n):
        r = list(map(int, recs_2d[i].tolist()))
        if len(set(r)) != 5:
            bad_dup += 1
        cand_set = set(map(int, cand_2d[i].tolist()))
        if any(x not in cand_set for x in r):
            bad_out += 1
    print("Validate recs – dup rows:", bad_dup, "out-of-cand rows:", bad_out)
    if bad_out != 0 or bad_dup != 0:
        raise ValueError("Invalid recommendations detected")

validate_recs(recs_2d, test_cand_2d)

submission_rank = sub.copy()
for k in range(5):
    submission_rank[f"rec_{k+1}"] = recs_2d[:, k]

OUT_PATH = "/content/submission_ranker_optimized.csv"
submission_rank.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)
display(submission_rank.head(5))


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hCandidate-level shapes
train_cl: (3000000, 19) label mean: 0.1
test_cl : (381472, 18)
Pairwise scoring loss functions on CPU do not support one hot features. OneHotMaxSize set to 1
0:	test: 0.3902914	best: 0.3902914 (0)	total: 7.5s	remaining: 2h 29m 53s
100:	test: 0.4753610	best: 0.4754433 (99)	total: 11m 27s	remaining: 2h 4m 36s
200:	test: 0.4895929	best: 0.4896275 (199)	total: 22m 36s	remaining: 1h 52m 22s
300:	test: 0.4991793	best: 0.4992089 (299)	total: 33m 53s	remaining: 1h 41m 13s
400:	test: 0.5042832	best: 0.5042832 (400)	total: 45m 6s	remaining: 1h 29m 53s
500:	test: 0.5079254	best: 0.5079254 (500)	total: 56m 15s	remaining: 1h 18m 28s
600:	test: 0.5096889	best: 0.5096889 (600)	total: 1h 7m 23s	remaining: 1h 7m 10s
700:	test: 0.5107184	best: 0.5107419 (698)	total: 1h 18m 45s	remaining: 56m 4s
800:	test: 0.5117476	best: 0.5118071 (796)	total: 1h 30m 7s	remai

KeyboardInterrupt: 

In [None]:
# Use the already trained 'ranker' (holdout-trained) to score test
test_pool = Pool(
    data=test_cl[feature_cols],
    group_id=test_cl["group_id"],
    cat_features=cat_cols,
)

test_scores = ranker.predict(test_pool)

test_tmp = test_cl[["group_id", "dish_id"]].copy()
test_tmp["score"] = test_scores

test_top = (
    test_tmp.sort_values(["group_id", "score"], ascending=[True, False])
    .groupby("group_id")
    .head(20)
)

test_cand_2d = test[cand_cols].to_numpy().astype(int)
qid_order = test["query_id"].astype(int).to_numpy()
cand_map = {int(qid_order[i]): test_cand_2d[i].tolist() for i in range(len(test))}

def make_top5_for_group(gid: int, dish_list_sorted: list[int]) -> list[int]:
    cand = cand_map[gid]
    cand_set = set(cand)

    out = []
    seen = set()
    for d in dish_list_sorted:
        d = int(d)
        if d in cand_set and d not in seen:
            seen.add(d)
            out.append(d)
        if len(out) == 5:
            return out

    remaining = list(dict.fromkeys(map(int, cand)))
    remaining = [d for d in remaining if d not in seen]
    remaining.sort(key=lambda x: (-dish_pop.get(int(x), 0), int(x)))

    for d in remaining:
        if d not in seen:
            out.append(int(d))
            seen.add(int(d))
        if len(out) == 5:
            break
    return out[:5]

group_lists = test_top.groupby("group_id")["dish_id"].apply(list).to_dict()

recs = []
for gid in qid_order:
    base_list = group_lists.get(int(gid), [])
    recs.append(make_top5_for_group(int(gid), base_list))

recs_2d = np.array(recs, dtype=int)

def validate_recs(recs_2d: np.ndarray, cand_2d: np.ndarray):
    n = recs_2d.shape[0]
    bad_dup = 0
    bad_out = 0
    for i in range(n):
        r = list(map(int, recs_2d[i].tolist()))
        if len(set(r)) != 5:
            bad_dup += 1
        cand_set = set(map(int, cand_2d[i].tolist()))
        if any(x not in cand_set for x in r):
            bad_out += 1
    print("Validate recs – dup rows:", bad_dup, "out-of-cand rows:", bad_out)
    if bad_out != 0 or bad_dup != 0:
        raise ValueError("Invalid recommendations detected")

validate_recs(recs_2d, test_cand_2d)

submission_rank = sub.copy()
for k in range(5):
    submission_rank[f"rec_{k+1}"] = recs_2d[:, k]

OUT_PATH = "/content/submission_ranker_from_holdout_model.csv"
submission_rank.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


Validate recs – dup rows: 0 out-of-cand rows: 0
Saved: /content/submission_ranker_from_holdout_model.csv


In [8]:
import numpy as np
import pandas as pd

SEED = 42
rng = np.random.default_rng(SEED)
cand_cols = [f"cand_{i:02d}" for i in range(1, 21)]

def split_pipe(s: str):
    if s is None:
        return []
    s = str(s).strip()
    if (not s) or s in ("__NONE__", "__NA__") or s.lower() == "nan":
        return []
    return [t for t in s.split("|") if t]

# 1) Собираем словарь токен -> bit index на основе всех полей
all_tokens = set()
for col in ["allergen_tags", "tags"]:
    for x in dishes[col].astype(str).tolist():
        all_tokens.update(split_pipe(x))
for col in ["allergies", "liked_tags", "disliked_tags"]:
    for x in users[col].astype(str).tolist():
        all_tokens.update(split_pipe(x))

token2bit = {t: i for i, t in enumerate(sorted(all_tokens))}
# если токенов много (например 500–2000) – всё равно ок, Python int тянет
# bit_count работает быстро

def to_mask(s: str) -> int:
    m = 0
    for t in split_pipe(s):
        b = token2bit.get(t)
        if b is not None:
            m |= (1 << b)
    return m

# 2) Быстрые lookup для блюд
dish_cat = dishes.set_index("dish_id")["category"].astype(str).to_dict()
dish_cal = dishes.set_index("dish_id")["calories"].astype(float).to_dict()
dish_spicy = dishes.set_index("dish_id")["spicy"].astype(int).to_dict()

dish_allergen_mask = {int(d): to_mask(a) for d, a in zip(dishes["dish_id"], dishes["allergen_tags"].astype(str))}
dish_tags_mask     = {int(d): to_mask(t) for d, t in zip(dishes["dish_id"], dishes["tags"].astype(str))}

# 3) Быстрые lookup для юзеров
user_allerg_mask  = {int(u): to_mask(a) for u, a in zip(users["user_id"], users["allergies"].astype(str))}
user_liked_mask   = {int(u): to_mask(t) for u, t in zip(users["user_id"], users["liked_tags"].astype(str))}
user_disliked_mask= {int(u): to_mask(t) for u, t in zip(users["user_id"], users["disliked_tags"].astype(str))}

user_flags = users.set_index("user_id")[["prefers_light_food","sweet_tooth","coffee_addict","microwave_trust"]]

dish_pop = train["target_dish_id"].value_counts().to_dict()

def build_candidate_level_train_fast(events: pd.DataFrame, neg_per_pos: int = 9) -> pd.DataFrame:
    gids = events["event_id"].to_numpy(np.int32)
    uids = events["user_id"].to_numpy(np.int32)
    day  = events["day"].to_numpy(np.int16)
    slot = events["meal_slot"].astype(str).to_numpy()
    hang = events["hangover_level"].to_numpy(np.int8)
    gcnt = events["guests_count"].to_numpy(np.int8)
    diet = events["diet_mode"].to_numpy(np.int8)
    load = events["fridge_load_pct"].to_numpy(np.float32)
    cands = events[cand_cols].to_numpy(np.int16)
    target = events["target_dish_id"].to_numpy(np.int16)

    rows = []
    append = rows.append

    for i in range(len(events)):
        gid = int(gids[i])
        uid = int(uids[i])
        t   = int(target[i])

        cand_row = cands[i].tolist()

        # uniq + позиция первого вхождения
        pos_of = {}
        uniq = []
        for j, d in enumerate(cand_row, start=1):
            d = int(d)
            if d not in pos_of:
                pos_of[d] = j
                uniq.append(d)

        if t not in pos_of:
            continue

        neg_ids = [d for d in uniq if d != t]
        if len(neg_ids) > neg_per_pos:
            neg_ids = rng.choice(neg_ids, size=neg_per_pos, replace=False).tolist()

        pick = [t] + neg_ids

        ua = user_allerg_mask.get(uid, 0)
        ul = user_liked_mask.get(uid, 0)
        ud = user_disliked_mask.get(uid, 0)

        for d in pick:
            p = pos_of[d]

            da = dish_allergen_mask.get(d, 0)
            dt = dish_tags_mask.get(d, 0)

            allergen_conflict = 1 if (ua & da) != 0 else 0
            liked_overlap = (ul & dt).bit_count()
            disliked_overlap = (ud & dt).bit_count()

            cat = dish_cat.get(d, "__NA__")
            is_dessert = 1 if cat == "dessert" else 0
            is_drink   = 1 if cat == "drink" else 0

            cal = float(dish_cal.get(d, 0.0))
            spicy = int(dish_spicy.get(d, 0))
            pop = int(dish_pop.get(d, 0))

            append((
                gid, uid, int(day[i]), str(slot[i]), int(hang[i]), int(gcnt[i]), int(diet[i]), float(load[i]),
                int(d), int(p),
                int(allergen_conflict), int(liked_overlap), int(disliked_overlap),
                int(is_dessert), int(is_drink),
                float(cal), int(spicy), int(pop),
                1 if d == t else 0
            ))

    return pd.DataFrame(
        rows,
        columns=[
            "group_id","user_id","day","meal_slot","hangover_level","guests_count","diet_mode","fridge_load_pct",
            "dish_id","candidate_pos",
            "allergen_conflict","liked_overlap","disliked_overlap",
            "is_dessert","is_drink",
            "calories","spicy","dish_pop",
            "label"
        ]
    )

def build_candidate_level_test_fast(queries: pd.DataFrame) -> pd.DataFrame:
    gids = queries["query_id"].to_numpy(np.int32)
    uids = queries["user_id"].to_numpy(np.int32)
    day  = queries["day"].to_numpy(np.int16)
    slot = queries["meal_slot"].astype(str).to_numpy()
    hang = queries["hangover_level"].to_numpy(np.int8)
    gcnt = queries["guests_count"].to_numpy(np.int8)
    diet = queries["diet_mode"].to_numpy(np.int8)
    load = queries["fridge_load_pct"].to_numpy(np.float32)
    cands = queries[cand_cols].to_numpy(np.int16)

    rows = []
    append = rows.append

    for i in range(len(queries)):
        gid = int(gids[i])
        uid = int(uids[i])

        cand_row = cands[i].tolist()
        pos_of = {}
        uniq = []
        for j, d in enumerate(cand_row, start=1):
            d = int(d)
            if d not in pos_of:
                pos_of[d] = j
                uniq.append(d)

        ua = user_allerg_mask.get(uid, 0)
        ul = user_liked_mask.get(uid, 0)
        ud = user_disliked_mask.get(uid, 0)

        for d in uniq:
            p = pos_of[d]

            da = dish_allergen_mask.get(d, 0)
            dt = dish_tags_mask.get(d, 0)

            allergen_conflict = 1 if (ua & da) != 0 else 0
            liked_overlap = (ul & dt).bit_count()
            disliked_overlap = (ud & dt).bit_count()

            cat = dish_cat.get(d, "__NA__")
            is_dessert = 1 if cat == "dessert" else 0
            is_drink   = 1 if cat == "drink" else 0

            cal = float(dish_cal.get(d, 0.0))
            spicy = int(dish_spicy.get(d, 0))
            pop = int(dish_pop.get(d, 0))

            append((
                gid, uid, int(day[i]), str(slot[i]), int(hang[i]), int(gcnt[i]), int(diet[i]), float(load[i]),
                int(d), int(p),
                int(allergen_conflict), int(liked_overlap), int(disliked_overlap),
                int(is_dessert), int(is_drink),
                float(cal), int(spicy), int(pop)
            ))

    return pd.DataFrame(
        rows,
        columns=[
            "group_id","user_id","day","meal_slot","hangover_level","guests_count","diet_mode","fridge_load_pct",
            "dish_id","candidate_pos",
            "allergen_conflict","liked_overlap","disliked_overlap",
            "is_dessert","is_drink",
            "calories","spicy","dish_pop"
        ]
    )


In [9]:
NEG_PER_POS = 9

train_cl = build_candidate_level_train_fast(train, neg_per_pos=NEG_PER_POS)
test_cl  = build_candidate_level_test_fast(test)

print("train_cl:", train_cl.shape, "label mean:", float(train_cl["label"].mean()))
print("test_cl :", test_cl.shape)

# user flags
train_cl = train_cl.merge(user_flags, left_on="user_id", right_index=True, how="left")
test_cl  = test_cl.merge(user_flags, left_on="user_id", right_index=True, how="left")


train_cl: (3000000, 19) label mean: 0.1
test_cl : (381472, 18)


In [10]:
def safe_str_series(s: pd.Series, na_val: str = "__NA__") -> pd.Series:
    return s.astype("string").fillna(na_val).astype("object")

# категорик
train_cl["meal_slot"] = safe_str_series(train_cl["meal_slot"], "__NA__")
test_cl["meal_slot"]  = safe_str_series(test_cl["meal_slot"], "__NA__")

# penalties/affinities (как у тебя было)
train_cl["diet_cal_penalty"]  = (train_cl["diet_mode"].astype(np.int16) * train_cl["calories"].astype(np.float32)) / 500.0
test_cl["diet_cal_penalty"]   = (test_cl["diet_mode"].astype(np.int16) * test_cl["calories"].astype(np.float32)) / 500.0

train_cl["light_cal_penalty"] = (train_cl["prefers_light_food"].astype(np.int16) * train_cl["calories"].astype(np.float32)) / 500.0
test_cl["light_cal_penalty"]  = (test_cl["prefers_light_food"].astype(np.int16) * test_cl["calories"].astype(np.float32)) / 500.0

train_cl["sweet_dessert_aff"] = train_cl["sweet_tooth"].astype(np.int16) * train_cl["is_dessert"].astype(np.int16)
test_cl["sweet_dessert_aff"]  = test_cl["sweet_tooth"].astype(np.int16) * test_cl["is_dessert"].astype(np.int16)

train_cl["coffee_drink_aff"]  = train_cl["coffee_addict"].astype(np.int16) * train_cl["is_drink"].astype(np.int16)
test_cl["coffee_drink_aff"]   = test_cl["coffee_addict"].astype(np.int16) * test_cl["is_drink"].astype(np.int16)

train_cl["allergen_penalty"]  = train_cl["allergen_conflict"].astype(np.int16)
test_cl["allergen_penalty"]   = test_cl["allergen_conflict"].astype(np.int16)


In [12]:
!pip -q install -U catboost


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
from catboost import CatBoostRanker, Pool

feature_cols = [
    "user_id","day","meal_slot","hangover_level","guests_count","diet_mode","fridge_load_pct",
    "dish_id","candidate_pos",
    "prefers_light_food","sweet_tooth","coffee_addict","microwave_trust",
    "calories","spicy","dish_pop",
    "liked_overlap","disliked_overlap",
    "allergen_penalty",
    "diet_cal_penalty","light_cal_penalty",
    "sweet_dessert_aff","coffee_drink_aff",
]
cat_cols = ["meal_slot"]

all_groups = train_cl["group_id"].unique()
rng = np.random.default_rng(SEED)
rng.shuffle(all_groups)

n_val_groups = int(0.2 * len(all_groups))
val_groups = set(all_groups[:n_val_groups])

tr_df = train_cl[~train_cl["group_id"].isin(val_groups)]
va_df = train_cl[train_cl["group_id"].isin(val_groups)]

tr_pool = Pool(tr_df[feature_cols], label=tr_df["label"], group_id=tr_df["group_id"], cat_features=cat_cols)
va_pool = Pool(va_df[feature_cols], label=va_df["label"], group_id=va_df["group_id"], cat_features=cat_cols)

params_fast = dict(
    loss_function="YetiRankPairwise",
    eval_metric="NDCG:top=5",
    iterations=500,
    learning_rate=0.12,
    depth=5,
    l2_leaf_reg=8.0,
    min_data_in_leaf=100,
    subsample=0.7,
    rsm=0.8,
    random_seed=SEED,
    verbose=100,
    od_type="Iter",
    od_wait=60,
    allow_writing_files=False,
)

ranker = CatBoostRanker(**params_fast)
ranker.fit(tr_pool, eval_set=va_pool, use_best_model=True)

# NDCG@5 оценка на holdout
va_scores = ranker.predict(va_pool)
va_tmp = va_df[["group_id","dish_id","label"]].copy()
va_tmp["score"] = va_scores

top5 = (va_tmp.sort_values(["group_id","score"], ascending=[True, False]).groupby("group_id").head(5))
rec_series = top5.groupby("group_id")["dish_id"].apply(list).apply(lambda lst: (lst + lst[:5])[:5])

val_group_list = rec_series.index.to_numpy()
recs_2d = np.vstack(rec_series.to_list()).astype(int)

true_map = (va_tmp[va_tmp["label"] == 1][["group_id","dish_id"]].drop_duplicates("group_id").set_index("group_id")["dish_id"])
true_ids = true_map.loc[val_group_list].to_numpy().astype(int)

def ndcg_at_5_from_ranked_lists(true_ids: np.ndarray, recs_2d: np.ndarray) -> float:
    score = 0.0
    for i in range(len(true_ids)):
        t = int(true_ids[i])
        row = recs_2d[i]
        pos = 0
        for j in range(5):
            if int(row[j]) == t:
                pos = j + 1
                break
        if pos:
            score += 1.0 / np.log2(pos + 1)
    return float(score / len(true_ids))

print("Holdout NDCG@5:", ndcg_at_5_from_ranked_lists(true_ids, recs_2d))


Pairwise scoring loss functions on CPU do not support one hot features. OneHotMaxSize set to 1
0:	test: 0.3454176	best: 0.3454176 (0)	total: 8.05s	remaining: 1h 6m 58s
100:	test: 0.4764783	best: 0.4764783 (100)	total: 13m 20s	remaining: 52m 43s
200:	test: 0.4930106	best: 0.4930106 (200)	total: 26m 14s	remaining: 39m 1s
300:	test: 0.5026882	best: 0.5026882 (300)	total: 39m 23s	remaining: 26m 2s
400:	test: 0.5052482	best: 0.5052488 (398)	total: 52m 25s	remaining: 12m 56s
499:	test: 0.5077666	best: 0.5077989 (498)	total: 1h 5m 27s	remaining: 0us

bestTest = 0.5077988805
bestIteration = 498

Shrink model to first 499 iterations.
Holdout NDCG@5: 0.5078475276590516


In [14]:
# скоринг test_cl
test_pool = Pool(test_cl[feature_cols], group_id=test_cl["group_id"], cat_features=cat_cols)
test_scores = ranker.predict(test_pool)

test_tmp = test_cl[["group_id","dish_id"]].copy()
test_tmp["score"] = test_scores

test_top = (
    test_tmp.sort_values(["group_id","score"], ascending=[True, False])
    .groupby("group_id")
    .head(20)
)

test_cand_2d = test[cand_cols].to_numpy().astype(int)
qid_order = test["query_id"].astype(int).to_numpy()
cand_map = {int(qid_order[i]): test_cand_2d[i].tolist() for i in range(len(test))}

group_lists = test_top.groupby("group_id")["dish_id"].apply(list).to_dict()

def make_top5_for_group(gid: int, dish_list_sorted: list[int]) -> list[int]:
    cand = cand_map[gid]
    cand_set = set(cand)

    out, seen = [], set()
    for d in dish_list_sorted:
        d = int(d)
        if d in cand_set and d not in seen:
            seen.add(d); out.append(d)
        if len(out) == 5:
            return out

    remaining = list(dict.fromkeys(map(int, cand)))
    remaining = [d for d in remaining if d not in seen]
    remaining.sort(key=lambda x: (-dish_pop.get(int(x), 0), int(x)))

    for d in remaining:
        if d not in seen:
            out.append(int(d)); seen.add(int(d))
        if len(out) == 5:
            break
    return out[:5]

recs = []
for gid in qid_order:
    recs.append(make_top5_for_group(int(gid), group_lists.get(int(gid), [])))
recs_2d = np.array(recs, dtype=int)

def validate_recs(recs_2d: np.ndarray, cand_2d: np.ndarray):
    n = recs_2d.shape[0]
    bad_dup = 0
    bad_out = 0
    for i in range(n):
        r = list(map(int, recs_2d[i].tolist()))
        if len(set(r)) != 5:
            bad_dup += 1
        cand_set = set(map(int, cand_2d[i].tolist()))
        if any(x not in cand_set for x in r):
            bad_out += 1
    print("dup rows:", bad_dup, "out-of-cand rows:", bad_out)
    if bad_out or bad_dup:
        raise ValueError("Invalid recommendations detected")

validate_recs(recs_2d, test_cand_2d)

submission_rank = sub.copy()
for k in range(5):
    submission_rank[f"rec_{k+1}"] = recs_2d[:, k]

OUT_PATH = "/content/submission_ranker_bitmask_fast.csv"
submission_rank.to_csv(OUT_PATH, index=False)
print("Saved:", OUT_PATH)


dup rows: 0 out-of-cand rows: 0
Saved: /content/submission_ranker_bitmask_fast.csv
