In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
ratings = pd.read_csv('data/events.csv')
items = pd.read_csv('data/item_features.csv')
users = pd.read_csv('data/user_features.csv')

In [4]:
# --- Co-visitation ---
def build_covisit_pairs(ratings_df, window=5):
    pairs = defaultdict(int)
    for uid, grp in ratings_df.sort_values("timestamp").groupby("user_id"):
        seq = grp["item_id"].tolist()
        for i in range(len(seq)):
            a = seq[i]
            for j in range(i+1, min(i+1+window, len(seq))):
                b = seq[j]
                if a == b:
                    continue
                pairs[(a,b)] += 1
                pairs[(b,a)] += 1
    return pairs

pairs = build_covisit_pairs(ratings, window=5)
covis_neighbors = defaultdict(list)
for (a,b), w in pairs.items():
    covis_neighbors[a].append((b, w))
for a in list(covis_neighbors.keys()):
    covis_neighbors[a] = sorted(covis_neighbors[a], key=lambda x: -x[1])

# --- Content similarity (genres) ---
genre_cols = [c for c in items.columns if c.startswith("genre_")]
item_vecs = items.set_index("item_id")[genre_cols].astype(float).values
item_ids  = items["item_id"].values

if len(item_vecs) > 0:
    sims = cosine_similarity(item_vecs)
else:
    sims = np.zeros((0,0))

content_neighbors = {}
for i, iid in enumerate(item_ids):
    if sims.shape[0] == 0:
        content_neighbors[iid] = []
        continue
    sim_row = sims[i].copy()
    sim_row[i] = -1.0  # exclude self
    top_idx = np.argsort(-sim_row)[:20]
    content_neighbors[iid] = [(int(item_ids[j]), float(sim_row[j])) for j in top_idx if sim_row[j] > 0]

def get_user_recent_items(ratings_df, user_id, n_last=5):
    seq = ratings_df[ratings_df.user_id == user_id].sort_values("timestamp")["item_id"].tolist()
    return seq[-n_last:] if len(seq) > 0 else []

def generate_candidates_for_user(user_id, pool_size=50):
    recent = get_user_recent_items(ratings, user_id, n_last=5)
    scores = defaultdict(float)

    for it in recent:
        for nb, w in covis_neighbors.get(it, [])[:50]:
            scores[nb] = max(scores[nb], w)
    for it in recent:
        for nb, s in content_neighbors.get(it, [])[:50]:
            scores[nb] = max(scores[nb], s)

    # fallback: popularity by mean rating
    pop = ratings.groupby("item_id")["rating"].mean().to_dict()
    for nb, p in pop.items():
        scores.setdefault(nb, p)

    # remove already seen
    seen = set(ratings[ratings.user_id == user_id].item_id.unique().tolist())
    for s in seen:
        scores.pop(s, None)

    cand = sorted(scores.items(), key=lambda x: -x[1])[:pool_size]
    return [k for k, v in cand]

unique_users = ratings["user_id"].unique().tolist()
candidates = {u: generate_candidates_for_user(u, pool_size=50) for u in unique_users}
candidates


{0: [2175,
  398,
  1365,
  793,
  785,
  3005,
  1850,
  3463,
  3036,
  535,
  1304,
  3441,
  3472,
  972,
  2484,
  1099,
  331,
  818,
  138,
  2331,
  2273,
  1191,
  771,
  3002,
  2801,
  1044,
  1160,
  3411,
  394,
  605,
  1052,
  1171,
  1905,
  2291,
  3087,
  2593,
  1787,
  984,
  2397,
  697,
  1491,
  1353,
  2048,
  3071,
  3266,
  1527,
  3300,
  1043,
  1607,
  3667],
 1: [2468,
  463,
  1546,
  208,
  232,
  1216,
  2626,
  3053,
  3046,
  1279,
  1078,
  741,
  2798,
  2541,
  283,
  1858,
  1246,
  2128,
  1491,
  3036,
  1376,
  785,
  119,
  184,
  3000,
  138,
  2510,
  3534,
  1186,
  1315,
  28,
  3269,
  3153,
  2165,
  3656,
  2828,
  132,
  2817,
  772,
  3101,
  652,
  2570,
  2389,
  2557,
  3418,
  1057,
  406,
  452,
  2327,
  1205],
 2: [312,
  3656,
  452,
  3205,
  3600,
  438,
  488,
  2256,
  2402,
  2338,
  702,
  2643,
  728,
  528,
  801,
  3111,
  609,
  107,
  2185,
  487,
  1781,
  1560,
  1371,
  463,
  188,
  3187,
  1818,
  1736,
  570,


In [5]:
ratings['label'] = (ratings['rating'] >= 4).astype(int)

# user aggregates
user_mean_rating = ratings.groupby('user_id')['rating'].mean().rename('user_mean_rating')
user_cnt = ratings.groupby('user_id')['item_id'].count().rename('user_interactions')

# item aggregates
item_mean_rating = ratings.groupby('item_id')['rating'].mean().rename('item_mean_rating')
item_cnt = ratings.groupby('item_id')['user_id'].count().rename('item_interactions')

# users (one-hot gender)
users_prep = users.copy()
users_prep['gender'] = users_prep['gender'].astype(str)
users_prep = pd.get_dummies(users_prep, columns=['gender'], drop_first=False)

genre_cols = [c for c in items.columns if c.startswith('genre_')]

def user_genre_profile(ratings_df, items_df):
    merged = ratings_df.merge(items_df, on='item_id', how='left')
    prof = merged.groupby('user_id')[genre_cols].mean().fillna(0.0)
    prof.columns = [f'user_{c}' for c in prof.columns]
    return prof

user_profile = user_genre_profile(ratings, items)

positives = ratings[ratings['label'] == 1][['user_id','item_id']].drop_duplicates()
pos_set = set(map(tuple, positives.values.tolist()))

def build_feature_rows():
    rows = []
    for u in unique_users:
        cand = candidates.get(u, [])
        if len(cand) == 0:
            continue
        for it in cand:
            row = {'user_id': u, 'item_id': it}
            row.update({'user_mean_rating': user_mean_rating.get(u, 0.0),
                        'user_interactions': user_cnt.get(u, 0)})
            row.update({'item_mean_rating': item_mean_rating.get(it, 0.0),
                        'item_interactions': item_cnt.get(it, 0)})
            it_row = items[items.item_id == it]
            if len(it_row) > 0:
                for g in genre_cols:
                    row[g] = float(it_row.iloc[0][g])
            else:
                for g in genre_cols:
                    row[g] = 0.0

            # genre overlap
            if u in user_profile.index:
                u_vec = user_profile.loc[u].values
            else:
                u_vec = np.zeros(len(genre_cols), dtype=float)
            it_vec = np.array([row[g] for g in genre_cols], dtype=float)
            row['genre_overlap'] = float((u_vec * it_vec).sum())

            urow = users_prep[users_prep.user_id == u]
            if len(urow) > 0:
                row['age'] = int(urow.iloc[0]['age'])
                for col in [c for c in users_prep.columns if c.startswith('gender_')]:
                    row[col] = int(urow.iloc[0][col])
            else:
                row['age'] = 0
                for col in ['gender_F','gender_M']:
                    row[col] = 0

            row['label'] = int((u, it) in pos_set)
            rows.append(row)
    return pd.DataFrame(rows)

train_df = build_feature_rows().fillna(0.0)
print('Train DF shape:', train_df.shape)
train_df.head()


Train DF shape: (302000, 29)


Unnamed: 0,user_id,item_id,user_mean_rating,user_interactions,item_mean_rating,item_interactions,genre_0,genre_1,genre_2,genre_3,...,genre_13,genre_14,genre_15,genre_16,genre_17,genre_overlap,age,gender_F,gender_M,label
0,0,2175,3.979094,287,3.613986,715,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.557491,35,0,1,0
1,0,398,3.979094,287,3.639506,1215,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.494774,35,0,1,0
2,0,1365,3.979094,287,3.431701,776,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.745645,35,0,1,0
3,0,793,3.979094,287,3.548476,722,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.627178,35,0,1,0
4,0,785,3.979094,287,3.81756,877,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.557491,35,0,1,0


In [12]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier

# ---------------------------
# 1) Подготовка исходных наборов
# ---------------------------

# 1.1 Порог релевантности (как раньше)
POS_THRESHOLD = 4

# 1.2 Каталог для негативного сэмплинга: всё, что знаем об item_id
catalog_items = set(items['item_id'].unique().tolist()) | set(ratings['item_id'].unique().tolist())

# 1.3 Позитивы из истории
positives = (
    ratings.loc[ratings['rating'] >= POS_THRESHOLD, ['user_id', 'item_id']]
    .drop_duplicates()
)

# 1.4 Вспомогательная функция сборки строки признаков
# (предполагается, что у тебя есть build_row(u, it, label), см. предыдущие шаги)
def safe_build_row(u, it, label):
    try:
        return build_row(u, it, label)
    except Exception:
        # Фолбэк: если не сработал build_row из-за отсутствия итема в items,
        # создаём "пустые" жанры и базовые аггрегаты.
        row = {
            'user_id': u, 'item_id': it, 'label': int(label),
            'user_mean_rating': float(ratings.loc[ratings.user_id==u, 'rating'].mean() or 0.0),
            'user_interactions': int((ratings.user_id==u).sum()),
            'item_mean_rating': float(ratings.loc[ratings.item_id==it, 'rating'].mean() or 0.0),
            'item_interactions': int((ratings.item_id==it).sum()),
            'genre_overlap': 0.0,
            'age': int(users.loc[users.user_id==u, 'age'].iloc[0]) if (users.user_id==u).any() else 0
        }
        # one-hot по полу, если есть
        for col in [c for c in train_df.columns if c.startswith('gender_')]:
            row[col] = int(users.loc[users.user_id==u, col].iloc[0]) if (users.user_id==u).any() and col in users.columns else 0
        # жанры как нули
        for g in [c for c in train_df.columns if c.startswith('genre_')]:
            row[g] = 0.0
        return row

# ---------------------------
# 2) Негативный сэмплинг per user
# ---------------------------

N_NEG_PER_USER = 50  # подбери под размер данных

rows = []
rng = np.random.default_rng(42)

all_users = ratings['user_id'].unique().tolist()
for u in all_users:
    user_seen = set(ratings.loc[ratings.user_id==u, 'item_id'].unique().tolist())
    user_pos  = set(positives.loc[positives.user_id==u, 'item_id'].unique().tolist())
    # если у пользователя нет позитива — для ранжера он не пригодится (но пригодится для классификатора)
    # всё равно соберём негативы, чтобы был fallback
    neg_pool = list(catalog_items - user_pos)  # не берём явные позитивы как негативы

    # сэмплим негативы
    if len(neg_pool) > 0:
        sample_size = min(N_NEG_PER_USER, len(neg_pool))
        sampled_negs = rng.choice(neg_pool, size=sample_size, replace=False).tolist()
    else:
        sampled_negs = []

    # добавляем строки: явные позитивы (label=1)
    for it in user_pos:
        rows.append(safe_build_row(u, int(it), label=1))

    # добавляем строки: негативы (label=0)
    for it in sampled_negs:
        rows.append(safe_build_row(u, int(it), label=0))

train_rank_df = pd.DataFrame(rows).fillna(0.0)

# Если совсем пусто (например, ни одного позитива на всех), fallback к классификации дальше.
# ---------------------------
# 3) Фильтрация валидных групп для ранжирования
# ---------------------------

def make_ranking_view(df):
    grp = df.groupby('user_id')['label']
    pos_per_user = grp.sum()
    cnt_per_user = grp.count()
    neg_per_user = cnt_per_user - pos_per_user

    # условие: >=1 позитив, >=1 негатив, размер группы >=2
    good_users = pos_per_user[(pos_per_user >= 1) & (neg_per_user >= 1) & (cnt_per_user >= 2)].index
    rank_df = df[df['user_id'].isin(good_users)].copy()
    return rank_df

rank_df = make_ranking_view(train_rank_df)

# ---------------------------
# 4) Либо обучаем ранкер, либо — fallback на классификатор
# ---------------------------

feature_cols = [c for c in train_rank_df.columns if c not in ['label','user_id','item_id']]

def fit_ranker(rank_df, feature_cols):
    # выкинуть пустые/константные фичи
    X_full = rank_df[feature_cols].copy()
    y_full = rank_df['label'].astype(np.float32)
    # константы/NaN
    const_cols = [c for c in X_full.columns if X_full[c].nunique(dropna=False) <= 1]
    nan_all_cols = [c for c in X_full.columns if X_full[c].isna().all()]
    drop_cols = sorted(set(const_cols + nan_all_cols))
    if drop_cols:
        X_full = X_full.drop(columns=drop_cols)
        print(f'[info] dropped features: {drop_cols}')
    X_full = X_full.fillna(0.0).astype(np.float32)

    # группировки
    rank_df_sorted = rank_df.sort_values('user_id').reset_index(drop=True)
    X = rank_df_sorted[X_full.columns]
    y = rank_df_sorted['label'].astype(np.float32)
    group_sizes = rank_df_sorted.groupby('user_id').size().values

    # проверки
    assert X.shape[0] > 0 and X.shape[1] > 0, f'Bad X shape: {X.shape}'
    assert y.shape[0] == X.shape[0], 'y and X length mismatch'
    assert group_sizes.sum() == X.shape[0], 'group_sizes sum mismatch'
    assert (group_sizes >= 2).all(), 'found a group with size < 2'

    dtrain = lgb.Dataset(X, label=y, group=group_sizes, free_raw_data=False)
    params = dict(
        objective='lambdarank',
        metric='ndcg',
        learning_rate=0.05,
        num_leaves=127,
        min_data_in_leaf=20,
        verbose=-1
    )
    booster = lgb.train(params, dtrain, num_boost_round=300)
    return booster, X_full.columns.tolist()

def fit_classifier(df, feature_cols):
    # гарантируем 2 класса
    if df['label'].nunique() < 2:
        fix = df.iloc[:1].copy()
        fix['label'] = 1 - fix['label']
        df = pd.concat([df, fix], ignore_index=True)

    X = df[feature_cols].fillna(0.0).astype(np.float32)
    y = df['label'].astype(int)
    # если пользователей мало — стратифицированный сплит, иначе GroupShuffleSplit
    if df['user_id'].nunique() < 3:
        Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
    else:
        gss = GroupShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
        tr_idx, va_idx = list(gss.split(df, groups=df['user_id']))[0]
        Xtr, Xva, ytr, yva = X.iloc[tr_idx], X.iloc[va_idx], y.iloc[tr_idx], y.iloc[va_idx]
        # если всё равно один класс — дублируем примеры для второго
        if ytr.nunique() < 2:
            fix = Xva.iloc[:1].copy(); fix_y = 1 - yva.iloc[:1].values[0]
            Xtr = pd.concat([Xtr, fix], ignore_index=True)
            ytr = pd.concat([ytr, pd.Series([fix_y])], ignore_index=True)

    clf = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05, max_depth=3, random_state=42)
    clf.fit(Xtr, ytr)
    return clf, X.columns.tolist()

use_ranker = not rank_df.empty
if use_ranker:
    print('[info] training LightGBM Ranker…')
    model, used_feature_cols = fit_ranker(rank_df, feature_cols)
    model_type = 'ranker'
else:
    print('[warn] no valid ranking groups after sampling → fallback to classifier')
    model, used_feature_cols = fit_classifier(train_rank_df, feature_cols)
    model_type = 'classifier'

print(f'[ok] trained {model_type}; features used = {len(used_feature_cols)}')

[info] training LightGBM Ranker…
[info] dropped features: ['gender_F', 'gender_M', 'genre_0', 'genre_1', 'genre_10', 'genre_11', 'genre_12', 'genre_13', 'genre_14', 'genre_15', 'genre_16', 'genre_17', 'genre_2', 'genre_3', 'genre_4', 'genre_5', 'genre_6', 'genre_7', 'genre_8', 'genre_9', 'genre_overlap']
[ok] trained ranker; features used = 5


In [16]:
# --- PATCH: восстановить build_row, users_prep, item_content_cols и build_batch_for_user ---

import numpy as np
import pandas as pd

events = pd.read_csv('data/events.csv')
items = pd.read_csv('data/item_features.csv')
users = pd.read_csv('data/user_features.csv')
# 0) Базовые проверки исходных таблиц
assert 'events' in globals(), "Нет DataFrame 'events' в памяти"
assert 'items' in globals(), "Нет DataFrame 'items' в памяти"
assert 'users' in globals(), "Нет DataFrame 'users' в памяти"



# 1) Унификация колонок (если перезапускали ядро)
events.columns = [c.lower() for c in events.columns]
items.columns  = [c.lower() for c in items.columns]
users.columns  = [c.lower() for c in users.columns]

for col in ['user_id','item_id']:
    if col in events.columns:
        events[col] = pd.to_numeric(events[col], errors='coerce').fillna(-1).astype(int)
    if col in items.columns:
        items[col]  = pd.to_numeric(items[col], errors='coerce').fillna(-1).astype(int)
    if col in users.columns:
        users[col]  = pd.to_numeric(users[col], errors='coerce').fillna(-1).astype(int)

# 2) Агрегаты, которые использует build_row
if 'label' in events.columns:
    events['label_bin'] = events['label'].astype(int)
elif 'rating' in events.columns:
    events['label_bin'] = (events['rating'] >= 4).astype(int)
else:
    events['label_bin'] = 1

user_mean_label = events.groupby('user_id')['label_bin'].mean().rename('user_mean_label')
user_cnt        = events.groupby('user_id')['item_id'].count().rename('user_interactions')
item_cnt        = events.groupby('item_id')['user_id'].count().rename('item_interactions')

if 'rating' in events.columns:
    user_mean_rating = events.groupby('user_id')['rating'].mean().rename('user_mean_rating')
    item_mean_rating = events.groupby('item_id')['rating'].mean().rename('item_mean_rating')
else:
    user_mean_rating = pd.Series(dtype=float)
    item_mean_rating = pd.Series(dtype=float)

# 3) users_prep: one-hot категориальных колонок пользователей (если не было)
if 'users_prep' not in globals():
    users_prep = users.copy()
    for c in list(users_prep.columns):
        if c != 'user_id' and users_prep[c].dtype == 'object':
            users_prep[c] = users_prep[c].fillna('NA')
    obj_cols = [c for c in users_prep.columns if c != 'user_id' and users_prep[c].dtype == 'object']
    users_prep = pd.get_dummies(users_prep, columns=obj_cols, drop_first=False)

# 4) item_content_cols: признаки айтемов (если потерялись)
if 'item_content_cols' not in globals() or not len(item_content_cols):
    item_content_cols = [c for c in items.columns if c != 'item_id']

# 5) build_row (универсальная, соответствует этапу обучения)
def build_row(u, it, label=0):
    row = {'user_id': int(u), 'item_id': int(it), 'label': int(label)}
    row['user_interactions'] = int(user_cnt.get(u, 0))
    row['item_interactions'] = int(item_cnt.get(it, 0))
    row['user_mean_label']   = float(user_mean_label.get(u, 0.0))
    row['user_mean_rating']  = float(user_mean_rating.get(u, 0.0)) if not user_mean_rating.empty else 0.0
    row['item_mean_rating']  = float(item_mean_rating.get(it, 0.0)) if not item_mean_rating.empty else 0.0

    # user features
    urow = users_prep[users_prep.user_id == u]
    if len(urow) > 0:
        for col in [c for c in users_prep.columns if c != 'user_id']:
            row[col] = urow.iloc[0][col]
    else:
        for col in [c for c in users_prep.columns if c != 'user_id']:
            # заполним нулями (для one-hot это ок)
            row[col] = 0

    # item content features
    itrow = items[items.item_id == it]
    if len(itrow) > 0:
        for col in item_content_cols:
            row[col] = itrow.iloc[0][col]
    else:
        for col in item_content_cols:
            row[col] = 0.0

    return row

# 6) used_feature_cols: если потерялся — восстановим разумно
if 'used_feature_cols' not in globals() or not len(used_feature_cols):
    # Попробуем из train_df; если и его нет — построим базовый список из доступных колонок
    if 'train_df' in globals():
        used_feature_cols = [c for c in train_df.columns if c not in ['label','user_id','item_id']]
    else:
        base = ['user_interactions','item_interactions','user_mean_label','user_mean_rating','item_mean_rating']
        # добавим все one-hot из users_prep и контент айтема
        used_feature_cols = base + [c for c in users_prep.columns if c != 'user_id'] + item_content_cols

# 7) build_batch_for_user — собирает батч и приводит к used_feature_cols
def build_batch_for_user(u, cand_items):
    rows = []
    for it in cand_items:
        rows.append(build_row(u, int(it), label=0))  # label не используется на инференсе
    batch = pd.DataFrame(rows).fillna(0.0)

    # ensure all features used in training are present
    for c in used_feature_cols:
        if c not in batch.columns:
            # для отсутствующих колонок подставим 0 — безопасно для one-hot/числовых
            batch[c] = 0.0

    # and keep only used features (сохраняем порядок)
    X = batch[used_feature_cols].astype(np.float32)
    return batch, X

print("[ok] build_row, users_prep, item_content_cols и build_batch_for_user восстановлены.")

[ok] build_row, users_prep, item_content_cols и build_batch_for_user восстановлены.


In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict

# ===== Настройки сабмита =====
TOP_K = 10                           # сколько рекомендаций на пользователя
SUBMIT_FORMAT = "triples"            # "triples" -> (user_id,item_id,rank) | "labels" -> (user_id,"i1 i2 ... iK")
SUBMISSION_PATH = "submission.csv"

# ===== Вспомогательные функции =====

def ensure_candidate_list_for_user(u, pool_size=200):
    """Если для пользователя нет кандидатов — сгенерировать.
       Нужна твоя функция generate_candidates_for_user(). Если её нет, дадим популярные айтемы."""
    cand = candidates.get(u, None)
    if cand and len(cand) > 0:
        return cand

    # Популярность как fallback (средний рейтинг/число взаимодействий)
    pop = (
        ratings.groupby("item_id")["rating"]
        .agg(["count","mean"])
        .reset_index()
        .assign(score=lambda df: df["mean"].fillna(0)*0.5 + np.log1p(df["count"]))
        .sort_values("score", ascending=False)["item_id"]
        .tolist()
    )
    return pop[:pool_size]

def build_batch_for_user(u, cand_items):
    """Собираем фичи для инференса по списку кандидатов пользователя."""
    rows = []
    for it in cand_items:
        # label=0 на инференсе (нам нужен только скор)
        rows.append(build_row(u, int(it), label=0))
    batch = pd.DataFrame(rows).fillna(0.0)
    # Убедимся, что все нужные фичи есть; если каких-то нет — добавим с нулями
    for c in used_feature_cols:
        if c not in batch.columns:
            batch[c] = 0.0
    # И наоборот: отбираем только используемые моделью фичи
    X = batch[used_feature_cols].astype(np.float32)
    return batch, X

def predict_scores(X):
    """Скораем батч для текущей модели (ранкер или классификатор)."""
    if model_type == "ranker":
        # LightGBM ranker (Booster)
        # predict возвращает скор релевантности
        return model.predict(X)
    else:
        # Классификатор (например, GradientBoostingClassifier)
        return model.predict_proba(X)[:, 1]

# ===== Инференс по пользователям =====

# Если есть явный список тестовых пользователей — подставь сюда
# test_users = [...]
# Если нет, возьмём всех пользователей, для кого есть кандидаты (или из рейтингов/сабмита)
if 'test_users' not in globals() or test_users is None:
    # сначала из candidates, иначе из ratings
    if isinstance(candidates, dict) and len(candidates) > 0:
        test_users = list(candidates.keys())
    else:
        test_users = ratings['user_id'].unique().tolist()

submission_rows = []

for u in test_users:
    cand = ensure_candidate_list_for_user(u, pool_size=5*TOP_K)  # небольшой запас
    if not cand:
        continue

    batch, X = build_batch_for_user(u, cand)
    if len(batch) == 0:
        continue

    scores = predict_scores(X)
    order = np.argsort(-scores)[:TOP_K]
    top_items = batch.iloc[order]["item_id"].astype(int).tolist()

    if SUBMIT_FORMAT == "triples":
        for rank, it in enumerate(top_items, start=1):
            submission_rows.append({"user_id": u, "item_id": it, "rank": rank})
    elif SUBMIT_FORMAT == "labels":
        submission_rows.append({"user_id": u, "labels": " ".join(map(str, top_items))})
    else:
        raise ValueError("Unknown SUBMIT_FORMAT")

# ===== Сохранение =====
submission = pd.DataFrame(submission_rows)

if SUBMIT_FORMAT == "triples":
    # user_id,item_id,rank
    submission = submission[["user_id","item_id","rank"]]
else:
    # user_id,labels
    submission = submission[["user_id","labels"]]

submission.to_csv(SUBMISSION_PATH, index=False)
print(f"[ok] Saved {SUBMISSION_PATH}, shape={submission.shape}")
display(submission.head(10))

[ok] Saved submission.csv, shape=(60400, 3)


Unnamed: 0,user_id,item_id,rank
0,0,331,1
1,0,138,2
2,0,398,3
3,0,1304,4
4,0,2331,5
5,0,1044,6
6,0,785,7
7,0,2397,8
8,0,3036,9
9,0,1491,10


In [20]:
# --- восстановим submission_sample, если потерялся ---
import pandas as pd
from pathlib import Path


if 'submission_sample' not in globals():
    submission_sample = pd.read_csv('submission.csv')

# определяем список пользователей для сабмита
if 'user_id' in submission_sample.columns:
    target_users = submission_sample['user_id'].unique().tolist()
else:
    target_users = sorted(events['user_id'].unique().tolist())

print(f"[ok] Loaded submission_sample ({len(target_users)} target users)")

[ok] Loaded submission_sample (6040 target users)


In [None]:
TOP_K = 10

# --- Генерация рекомендаций ---
rows = []
target_users = submission_sample['user_id'].unique().tolist() if 'user_id' in submission_sample.columns else sorted(events['user_id'].unique())

for u in target_users:
    cand = candidates.get(u, [])
    if not cand:
        # fallback — топ популярных
        pop = (
            events.groupby('item_id')['label_bin'].count()
            .sort_values(ascending=False)
            .index.tolist()
        )
        cand = pop[:5*TOP_K]

    batch, X = build_batch_for_user(u, cand)

    if model_type == 'ranker':
        scores = model.predict(X)
    else:
        scores = model.predict_proba(X)[:, 1]

    order = np.argsort(-scores)[:TOP_K]
    top_items = batch.iloc[order]['item_id'].astype(int).tolist()

    # Добавляем строку в формате user_id, item_id (ID через пробел)
    rows.append({
        'user_id': u,
        'item_id': ' '.join(map(str, top_items))
    })

# --- Формирование и сохранение сабмита ---
submission = pd.DataFrame(rows, columns=['user_id', 'item_id'])
out_path = 'submission_new.csv'
submission.to_csv(out_path, index=False)

print(f'[ok] Saved submission in format user_id,item_id -> {out_path}, shape={submission.shape}')
display(submission.head(10))