In [1]:

%pip -q install -U pip setuptools wheel

%pip -q install numpy==1.26.4 scipy==1.13.1 pandas==2.2.2

%pip -q install pyspark==3.4.4 pyarrow==14.0.2

%pip -q install 'replay-rec[torch,spark]==0.20.0' catboost==1.2.5 lightgbm==4.3.0

%pip -q install jedi==0.19.1


In [None]:

from pathlib import Path
import pandas as pd
import numpy as np
import sys, os, gc, math, random
from typing import Dict, List

SEED = 42
rng = np.random.default_rng(SEED)
random.seed(SEED)

def set_seed(seed=42):
    import os, random, numpy as np
    random.seed(seed)
    np.random.seed(seed)
set_seed(SEED)

def find_data_dir():
    required = {"events.csv", "user_features.csv", "item_features.csv", "submission_sample.csv"}
    candidates = []
    kaggle_input = Path("/kaggle/input")
    if kaggle_input.exists():
        for p in kaggle_input.rglob("*"):
            if p.is_dir():
                files = {x.name for x in p.glob("*.csv")}
                if required.issubset(files):
                    candidates.append(p)
                    break
    here = Path(".").resolve()
    files = {x.name for x in here.glob("*.csv")}
    if required.issubset(files):
        candidates.append(here)
    mnt = Path("/mnt/data")
    files = {x.name for x in mnt.glob("*.csv")}
    if required.issubset(files):
        candidates.append(mnt)
    if not candidates:
        raise FileNotFoundError("Не найдена папка с events.csv, user_features.csv, item_features.csv, submission_sample.csv")
    return candidates[0]

DATA_DIR = Path("/data_input")
OUTPUT_DIR = Path("/kaggle/working") if Path("/kaggle/working").exists() else Path(".")
print("DATA_DIR =", DATA_DIR)
print("OUTPUT_DIR =", OUTPUT_DIR)

events = pd.read_csv(DATA_DIR / "events.csv")
users = pd.read_csv(DATA_DIR / "user_features.csv")
items = pd.read_csv(DATA_DIR / "item_features.csv")
sub_sample = pd.read_csv(DATA_DIR / "submission_sample.csv")

events.head(), users.head(), items.head(), sub_sample.head()


DATA_DIR = /content
OUTPUT_DIR = .


(   user_id  item_id  rating  timestamp
 0        0     1505       4          0
 1        0     3669       3          1
 2        0      584       4          2
 3        0     3390       3          3
 4        0     2885       4          4,
    user_id gender  age
 0     4855      F    1
 1     4065      M   56
 2     3331      M   25
 3     5373      M   45
 4     2032      M   25,
    item_id  genre_0  genre_1  genre_2  genre_3  genre_4  genre_5  genre_6  \
 0        0        0        1        0        1        1        0        0   
 1        1        0        0        0        0        0        0        0   
 2        2        0        0        0        0        0        0        0   
 3        3        0        0        0        0        0        0        0   
 4        4        0        0        0        0        0        0        0   
 
    genre_7  genre_8  genre_9  genre_10  genre_11  genre_12  genre_13  \
 0        0        1        0         0         0         0         1  

In [3]:
def leave_one_out_split(df: pd.DataFrame):
    df = df.sort_values(["user_id", "timestamp"])
    last_idx = df.groupby("user_id")["timestamp"].idxmax()
    valid = df.loc[last_idx]
    train = df.drop(index=last_idx)
    return train, valid

def recall_at_k(preds: pd.DataFrame, truth: pd.DataFrame, k: int = 10) -> float:
    truth_map = dict(zip(truth["user_id"], truth["item_id"]))
    hits, total = 0, len(truth_map)
    for _, row in preds.iterrows():
        recs = list(map(int, str(row["item_id"]).split()))
        if truth_map.get(row["user_id"]) in recs[:k]:
            hits += 1
    return hits / max(total, 1)

def format_submission(user_ids, recs_map: Dict[int, List[int]], k: int = 10) -> pd.DataFrame:
    rows = []
    for uid in user_ids:
        items = recs_map.get(uid, [])[:k]
        if len(items) < k:
            items += (items[:k - len(items)] if items else [0] * (k - len(items)))
        rows.append({"user_id": uid, "item_id": " ".join(map(str, items[:k]))})
    return pd.DataFrame(rows)

def user_seen_map(df: pd.DataFrame) -> Dict[int, set]:
    return df.groupby("user_id")["item_id"].apply(set).to_dict()


##  Baseline: Global Popularity

In [4]:
def popularity_topk(df: pd.DataFrame, user_ids: List[int], k=10) -> Dict[int, List[int]]:
    pop = df.groupby("item_id").size().sort_values(ascending=False).index.tolist()
    seen = user_seen_map(df)
    out = {}
    for u in user_ids:
        out[u] = [i for i in pop if i not in seen.get(u, set())][:k]
    return out

train, valid = leave_one_out_split(events)
val_users = valid["user_id"].unique().tolist()
pop_recs = popularity_topk(train, val_users, k=10)
pop_preds = format_submission(val_users, pop_recs, k=10)
r10 = recall_at_k(pop_preds, valid[["user_id", "item_id"]], 10)
print(f"[local] Popularity LOO Recall@10: {r10:.4f}")

all_users = sub_sample["user_id"].tolist()
pop_submit = format_submission(all_users, popularity_topk(events, all_users, k=10), k=10)
pop_path = OUTPUT_DIR / "submission_popularity.csv"
pop_submit.to_csv(pop_path, index=False)
print("Saved:", pop_path)
pop_submit.head()


[local] Popularity LOO Recall@10: 0.0394
Saved: submission_popularity.csv


Unnamed: 0,user_id,item_id
0,0,472 2732 2862 1543 2256 1811 2688 2630 2564 331
1,1,3529 1039 463 1315 169 36 1811 512 1376 2528
2,2,640 2732 1831 1223 1039 3013 1560 463 1315 2862
3,3,3022 3529 1223 3013 1560 1315 2862 3409 169 36
4,4,1583 3022 2297 2732 1039 1560 1315 2862 3409 2646


##  SASRec (RePlay)

In [5]:
USE_SASREC = True
try:
    from replay.data import Dataset, FeatureSchema, FeatureInfo, FeatureType, FeatureHint
    from replay.data.dataset_utils import DatasetLabelEncoder
    try:
        from replay.models import SasRec
    except Exception:
        from replay.models.nn.sequential import SasRec
except Exception as e:
    print("[warn] RePlay is not installed. Run the install cell at the top. Error:", e)
    USE_SASREC = False




In [6]:
import pandas as pd
import lightning as L
from torch.utils.data import DataLoader

from replay.data import Dataset, FeatureSchema, FeatureInfo, FeatureType, FeatureHint, FeatureSource
from replay.data.nn import (
    SequenceTokenizer,
    SequentialDataset,
    TensorSchema,
    TensorFeatureInfo,
    TensorFeatureSource,
)
from replay.models.nn.sequential import SasRec
from replay.models.nn.sequential.sasrec import (
    SasRecTrainingDataset,
    SasRecPredictionDataset,
)
from replay.models.nn.sequential.callbacks import PandasPredictionCallback
from replay.models.nn.sequential.postprocessors import RemoveSeenItems


def fit_predict_sasrec(
    train_df: pd.DataFrame,
    predict_user_ids: list,
    k: int = 10,
    epochs: int = 5,
    max_seq_len: int = 200,
    hidden_size: int = 128,
    block_count: int = 2,
    head_count: int = 2,
    batch_size: int = 512,
    num_workers: int = 0,   # в Colab лучше 0, чтобы не плодить воркеры
    filter_seen: bool = True,
):
    """
    Тренирует SASRec на train_df и возвращает {user_id: [top-k item_id]} для predict_user_ids.
    train_df должен содержать столбцы: user_id, item_id, timestamp (позиция события в истории пользователя).
    """

    feature_schema = FeatureSchema([
        FeatureInfo("user_id",  feature_type=FeatureType.CATEGORICAL, feature_hint=FeatureHint.QUERY_ID),
        FeatureInfo("item_id",  feature_type=FeatureType.CATEGORICAL, feature_hint=FeatureHint.ITEM_ID),
        FeatureInfo("timestamp",feature_type=FeatureType.NUMERICAL,   feature_hint=FeatureHint.TIMESTAMP),
    ])
    ds = Dataset(
        feature_schema=feature_schema,
        interactions=train_df,
        query_features=globals().get("user_features"),
        item_features=globals().get("item_features"),
        check_consistency=True,
        categorical_encoded=False,
    )

    ITEM_FEATURE_NAME = "item_id_seq"
    tensor_schema = TensorSchema(
        TensorFeatureInfo(
            name=ITEM_FEATURE_NAME,
            is_seq=True,
            feature_type=FeatureType.CATEGORICAL,
            feature_sources=[TensorFeatureSource(FeatureSource.INTERACTIONS, ds.feature_schema.item_id_column)],
            feature_hint=FeatureHint.ITEM_ID,
        )
    )

    tokenizer = SequenceTokenizer(tensor_schema, allow_collect_to_master=True)
    tokenizer.fit(ds)
    seq_train = tokenizer.transform(ds)   # SequentialDataset

    model = SasRec(
        tensor_schema,
        block_count=block_count,
        head_count=head_count,
        max_seq_len=max_seq_len,
        hidden_size=hidden_size,
        dropout_rate=0.5,
    )
    train_loader = DataLoader(
        SasRecTrainingDataset(seq_train, max_sequence_length=max_seq_len),
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=True,
    )
    trainer = L.Trainer(max_epochs=epochs, logger=False, enable_checkpointing=False)
    trainer.fit(model, train_dataloaders=train_loader)

    enc_users = tokenizer.query_id_encoder.transform(pd.DataFrame({"user_id": predict_user_ids}))["user_id"].values
    seq_pred = seq_train.filter_by_query_id(enc_users)
    pred_loader = DataLoader(
        SasRecPredictionDataset(seq_pred, max_sequence_length=max_seq_len),
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=False,
    )

    post = [RemoveSeenItems(seq_train)] if filter_seen else None
    cb = PandasPredictionCallback(
        top_k=k,
        query_column="user_id",
        item_column="item_id",
        rating_column="score",
        postprocessors=post,
    )
    pred_trainer = L.Trainer(callbacks=[cb], logger=False, inference_mode=True)
    pred_trainer.predict(model, dataloaders=pred_loader, return_predictions=False)

    recs_df = cb.get_result()  # колонки: user_id, item_id, score (внутренние id)
    recs_df = tokenizer.query_and_item_id_encoder.inverse_transform(recs_df)

    recs_df = recs_df.sort_values(["user_id", "score"], ascending=[True, False])
    topk = (recs_df.groupby("user_id")["item_id"]
                  .apply(lambda s: s.head(k).tolist())
                  .to_dict())

    for u in predict_user_ids:
        topk.setdefault(u, [])
    return topk





In [7]:
sasrec_path = None
if USE_SASREC:
    sasrec_users = sub_sample["user_id"].tolist()
    sasrec_val_recs = fit_predict_sasrec(train, valid["user_id"].tolist(), k=10, epochs=5)  # quick pass
    sasrec_val_preds = format_submission(valid["user_id"].tolist(), sasrec_val_recs, k=10)
    r10_sasrec = recall_at_k(sasrec_val_preds, valid[["user_id", "item_id"]], 10)
    print(f"[local] SASRec LOO Recall@10 (epochs=5): {r10_sasrec:.4f}")

    sasrec_recs = fit_predict_sasrec(events, sasrec_users, k=10, epochs=10)
    sasrec_submit = format_submission(sasrec_users, sasrec_recs, k=10)
    sasrec_path = OUTPUT_DIR / "submission_sasrec.csv"
    sasrec_submit.to_csv(sasrec_path, index=False)
    print("Saved:", sasrec_path)
else:
    print("SASRec skipped — install RePlay to enable.")



  SasRecTrainingDataset(seq_train, max_sequence_length=max_seq_len),
  self._inner = TorchSequentialDataset(
INFO: GPU available: False, used: False
  return datetime.utcnow().replace(tzinfo=utc)
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
INFO: 
  | Name   | Type             | Params | Mode 
----------------------------------------------------
0 | _model | SasRecModel      | 697 K  | train
1 | _loss  | CrossEntropyLoss | 0      | train
----------------------------------------------------
697 K     Trainable params
0         Non-tr

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=5` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.
  SasRecPredictionDataset(seq_pred, max_sequence_length=max_seq_len),
  self._inner = TorchSequentialDataset(
INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
  return datetime.utcnow().replace(tzinfo=utc)
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU

Predicting: |          | 0/? [00:00<?, ?it/s]

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


[local] SASRec LOO Recall@10 (epochs=5): 0.0414


  return datetime.utcnow().replace(tzinfo=utc)
  SasRecTrainingDataset(seq_train, max_sequence_length=max_seq_len),
  self._inner = TorchSequentialDataset(
INFO: GPU available: False, used: False
  return datetime.utcnow().replace(tzinfo=utc)
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.12/dist-packages/lightning/pytorch/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
INFO: 
  | Name   | Type             | Params | Mode 
----------------------------------------------------
0 | _model | SasRecModel      | 697 K  | train
1 | _loss  | CrossEntropyLoss | 0      | train
-------------------------------------------------

Training: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_epochs=10` reached.
INFO:lightning.pytorch.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.
  SasRecPredictionDataset(seq_pred, max_sequence_length=max_seq_len),
  self._inner = TorchSequentialDataset(
INFO: 💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
  return datetime.utcnow().replace(tzinfo=utc)
INFO:lightning.pytorch.utilities.rank_zero:💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
INFO: GPU available: False, used: False
INFO:lightning.pytorch.utilities.rank_zero:GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 T

Predicting: |          | 0/? [00:00<?, ?it/s]

  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Saved: submission_sasrec.csv


##  Two‑Stage: Candidates → CatBoost Reranker

In [8]:

from collections import defaultdict
from catboost import CatBoostRanker, Pool

def gen_candidates_cooccurrence(train_df: pd.DataFrame, user_ids: List[int], top_n=50) -> Dict[int, List[int]]:
    df = train_df.sort_values(["user_id", "timestamp"])
    baskets = df.groupby("user_id")["item_id"].apply(list)
    co = defaultdict(lambda: defaultdict(int))
    for items in baskets:
        uniq = list(dict.fromkeys(items))
        for i in uniq:
            for j in uniq:
                if i != j:
                    co[i][j] += 1
    last_item = df.groupby("user_id")["item_id"].last()
    pop = df.groupby("item_id").size().sort_values(ascending=False).index.tolist()
    user_seen = user_seen_map(df)
    out = {}
    for u in user_ids:
        seed = last_item.get(u, None)
        cand = []
        if seed is not None:
            neigh = sorted(co[seed].items(), key=lambda x: x[1], reverse=True)
            cand = [j for j, c in neigh if j not in user_seen.get(u, set())]
        if len(cand) < top_n:
            extra = [i for i in pop if i not in user_seen.get(u, set())]
            cand = list(dict.fromkeys(cand + extra))
        out[u] = cand[:top_n]
    return out

def pair_features(train_df: pd.DataFrame, users: pd.DataFrame, items: pd.DataFrame,
                  user_ids: List[int], candidates: Dict[int, List[int]]):
    user_hist = train_df.groupby("user_id").agg(
        user_events=("item_id", "size"),
        user_mean_rating=("rating", "mean"),
        user_last_ts=("timestamp", "max"),
    )
    item_hist = train_df.groupby("item_id").agg(
        item_events=("user_id", "size"),
        item_mean_rating=("rating", "mean"),
        item_last_ts=("timestamp", "max"),
    )
    last_item = train_df.sort_values(["user_id", "timestamp"]).groupby("user_id")["item_id"].last()
    rows, y, group_id = [], [], []
    for u in user_ids:
        cand = candidates.get(u, [])
        for it in cand:
            r = {"user_id": u, "item_id": it}
            if u in user_hist.index:
                r.update(user_hist.loc[u].to_dict())
            if it in item_hist.index:
                r.update(item_hist.loc[it].to_dict())
            r["delta_ts"] = (r.get("user_last_ts", np.nan) - r.get("item_last_ts", np.nan))
            rows.append(r)
            y.append(1 if last_item.get(u, None) == it else 0)
            group_id.append(u)
    X = pd.DataFrame(rows).fillna(0)
    feat_cols = [c for c in X.columns if c not in ("user_id", "item_id")]
    return X, np.array(y), group_id, feat_cols


In [10]:
import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool

gt_map = dict(zip(valid["user_id"], valid["item_id"]))
val_users = list(gt_map.keys())

cand = gen_candidates_cooccurrence(train, val_users, top_n=50)

global_pop = train.groupby("item_id").size().sort_values(ascending=False).index.tolist()
for u in val_users:
    pos = gt_map[u]
    lst = cand.get(u, [])
    lst = [pos] + [x for x in lst if x != pos]
    if len(lst) < 2:
        extra = next((i for i in global_pop if i != pos), None)
        if extra is not None:
            lst.append(extra)
    cand[u] = lst[:50]

def pair_features_with_truth(train_df, users_df, items_df, user_ids, candidates, truth_map):
    user_hist = train_df.groupby("user_id").agg(
        user_events=("item_id","size"),
        user_mean_rating=("rating","mean"),
        user_last_ts=("timestamp","max"),
    )
    item_hist = train_df.groupby("item_id").agg(
        item_events=("user_id","size"),
        item_mean_rating=("rating","mean"),
        item_last_ts=("timestamp","max"),
    )
    rows, y, gid = [], [], []
    for u in user_ids:
        lst = candidates.get(u, [])
        for it in lst:
            r = {"user_id": u, "item_id": it}
            if u in user_hist.index:  r.update(user_hist.loc[u].to_dict())
            if it in item_hist.index: r.update(item_hist.loc[it].to_dict())
            r["delta_ts"] = r.get("user_last_ts", 0) - r.get("item_last_ts", 0)
            rows.append(r)
            y.append(1 if truth_map.get(u) == it else 0)
            gid.append(u)
    X = pd.DataFrame(rows).fillna(0)
    feat_cols = [c for c in X.columns if c not in ("user_id","item_id")]
    return X, np.array(y), gid, feat_cols

X, y, gid, feats = pair_features_with_truth(train, users, items, val_users, cand, gt_map)

assert y.sum() > 0, "Нет позитивов в обучении реранкера."
assert all(sum(1 for uu in gid if uu == u) >= 2 for u in val_users), "В некоторых группах < 2 кандидатов."

pool = Pool(X[feats], label=y, group_id=gid)
ranker = CatBoostRanker(
    iterations=400, depth=6, learning_rate=0.1,
    loss_function="YetiRank", random_seed=SEED, verbose=False
)
ranker.fit(pool)

scores = ranker.predict(pool)
Xv = X.copy(); Xv["score"] = scores
pred_map = {int(u): df.sort_values("score", ascending=False)["item_id"].astype(int).tolist()[:10]
            for u, df in Xv.groupby("user_id")}
two_stage_val = format_submission(val_users, pred_map, k=10)
r10_two = recall_at_k(two_stage_val, valid[["user_id","item_id"]], 10)
print(f"[local] Two-Stage LOO Recall@10: {r10_two:.4f}")



[local] Two-Stage LOO Recall@10: 0.8546


  return datetime.utcnow().replace(tzinfo=utc)


In [11]:
all_users = sub_sample["user_id"].tolist()
cand_all = gen_candidates_cooccurrence(events, all_users, top_n=50)

def pair_features_infer(train_df, users_df, items_df, user_ids, candidates):
    user_hist = train_df.groupby("user_id").agg(
        user_events=("item_id","size"),
        user_mean_rating=("rating","mean"),
        user_last_ts=("timestamp","max"),
    )
    item_hist = train_df.groupby("item_id").agg(
        item_events=("user_id","size"),
        item_mean_rating=("rating","mean"),
        item_last_ts=("timestamp","max"),
    )
    rows, gid = [], []
    for u in user_ids:
        lst = candidates.get(u, [])
        if not lst:
            lst = global_pop[:50]
        for it in lst:
            r = {"user_id": u, "item_id": it}
            if u in user_hist.index:  r.update(user_hist.loc[u].to_dict())
            if it in item_hist.index: r.update(item_hist.loc[it].to_dict())
            r["delta_ts"] = r.get("user_last_ts", 0) - r.get("item_last_ts", 0)
            rows.append(r)
            gid.append(u)
    X = pd.DataFrame(rows).fillna(0)
    feat_cols = [c for c in X.columns if c not in ("user_id","item_id")]
    return X, gid, feat_cols

Xa, gida, feats = pair_features_infer(events, users, items, all_users, cand_all)
pool_all = Pool(Xa[feats], group_id=gida)
scores_all = ranker.predict(pool_all)

Xa_sc = Xa.copy(); Xa_sc["score"] = scores_all
pred_map_all = {int(u): df.sort_values("score", ascending=False)["item_id"].astype(int).tolist()[:10]
                for u, df in Xa_sc.groupby("user_id")}
two_stage_submit = format_submission(all_users, pred_map_all, k=10)
two_stage_path = OUTPUT_DIR / "submission_two_stage.csv"
two_stage_submit.to_csv(two_stage_path, index=False)
print("Saved:", two_stage_path)
two_stage_submit.head()



Saved: submission_two_stage.csv


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)


Unnamed: 0,user_id,item_id
0,0,3530 3087 213 672 1287 3002 785 872 1365 3463
1,1,3046 1044 283 1279 3435 138 1814 563 3534 2541
2,2,3286 799 1475 528 1781 3238 2480 382 3172 774
3,3,2190 1374 3234 1108 1861 1640 2335 1868 3036 3358
4,4,312 2653 1545 528 2222 802 1044 487 3656 1809


##  Итоговые файлы

In [13]:

print("-", (OUTPUT_DIR / "submission_popularity.csv").resolve())
if (OUTPUT_DIR / "submission_sasrec.csv").exists():
    print("-", (OUTPUT_DIR / "submission_sasrec.csv").resolve())
print("-", (OUTPUT_DIR / "submission_two_stage.csv").resolve())


- /content/submission_popularity.csv
- /content/submission_sasrec.csv
- /content/submission_two_stage.csv
