In [None]:
!pip -q install numpy==1.26.4 pandas==2.2.2
!pip -q install lightgbm==4.1.0 tqdm
!pip -q install torch==2.2.2 lightning==2.3.0

!pip -q install "git+https://github.com/sb-ai-lab/RePlay.git@main"

import numpy as np, pandas as pd, sys, importlib
print("Python:", sys.version)
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)


[31mERROR: git+https://github.com/sb-ai-lab/RePlay.git@main does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m✅ Python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
✅ NumPy: 1.26.4
✅ Pandas: 2.2.2


In [None]:
import os, random, warnings, math
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# Silence library warnings so cell outputs stay readable.
warnings.filterwarnings("ignore")

# Experiment-wide settings shared by the cells below.
SEED = 42                    # seed applied to every RNG seeded in this cell
K = 10                       # recommendation list length / metric cut-off
CANDIDATES_PER_USER = 500    # number of first-stage candidates requested per user
MAX_HISTORY_FOR_COVIS = 200  # presumably a history cap for co-visitation features — not used in the visible cells

random.seed(SEED)
np.random.seed(SEED)

# torch seeding is best-effort: the notebook must keep working when torch is
# missing or broken, but the reason is reported instead of silently dropped.
try:
    import torch
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
except Exception as exc:
    print("[warn] torch seeding skipped:", repr(exc))

print("✅ Seeds fixed:", SEED)


✅ Seeds fixed: 42


In [None]:
# Locations of the four input CSV files.
DATA_DIR = Path("/content/data")
EVENTS_PATH = DATA_DIR / "events.csv"
ITEM_FEAT_PATH = DATA_DIR / "item_features.csv"
USER_FEAT_PATH = DATA_DIR / "user_features.csv"
SUB_SAMPLE_PATH = DATA_DIR / "submission_sample.csv"

# Fail fast, naming the first input that has not been uploaded.
missing_inputs = [
    path
    for path in (EVENTS_PATH, ITEM_FEAT_PATH, USER_FEAT_PATH, SUB_SAMPLE_PATH)
    if not path.exists()
]
if missing_inputs:
    raise FileNotFoundError(f"Upload missing file: {missing_inputs[0]}")

events = pd.read_csv(EVENTS_PATH)
item_features = pd.read_csv(ITEM_FEAT_PATH)
user_features = pd.read_csv(USER_FEAT_PATH)
submission_sample = pd.read_csv(SUB_SAMPLE_PATH)

# Events without an explicit rating are treated as unit-weight interactions.
if "rating" not in events:
    events["rating"] = 1.0

# Timestamps: when more than 90% of the values parse as numbers they are read
# as epoch values (milliseconds if the median exceeds 1e12, seconds otherwise);
# anything else goes through the generic datetime parser.
ts_num = pd.to_numeric(events["timestamp"], errors="coerce")
mostly_numeric = ts_num.notna().mean() > 0.9
if not mostly_numeric:
    events["timestamp"] = pd.to_datetime(events["timestamp"], errors="coerce")
else:
    unit = "ms" if ts_num.median() > 1e12 else "s"
    events["timestamp"] = pd.to_datetime(ts_num, unit=unit)

print("✅ Loaded:", events.shape, item_features.shape, user_features.shape)
events.head(3)


✅ Loaded: (894149, 4) (3706, 19) (6040, 3)


Unnamed: 0,user_id,item_id,rating,timestamp
0,0,1505,4,1970-01-01 00:00:00
1,0,3669,3,1970-01-01 00:00:01
2,0,584,4,1970-01-01 00:00:02


In [None]:
# Leave-one-out split: after ordering by (user_id, timestamp), the last row of
# every user becomes the validation target and the rest is training data.
events_sorted = events.sort_values(["user_id", "timestamp"])
last_idx = events_sorted.groupby("user_id").tail(1).index
valid = events_sorted.loc[last_idx].copy()
train = events_sorted.drop(index=last_idx).copy()

valid_targets = valid[["user_id", "item_id"]].rename(columns={"item_id": "target_item_id"})

# Item ids ordered by descending number of training interactions.
train_item_counts = train.groupby("item_id").size()
popular_items = train_item_counts.sort_values(ascending=False).index.tolist()

# Per-user item history taken from ALL events, i.e. it also contains each
# user's held-out validation item.
user_seen_map = {
    user_id: user_events["item_id"].tolist()
    for user_id, user_events in events.groupby("user_id")
}

def ensure_topk(preds_map, popular_items, k=10, seen_map=None):
    """Return ``{user: items}`` where every list is padded towards ``k`` items.

    For each user the predicted items are de-duplicated (first occurrence
    wins) and cut to ``k``. Remaining slots are filled from ``popular_items``
    in order, skipping items already in the list and items in the user's
    ``seen_map`` entry.

    Users that appear only in ``seen_map`` are also returned. They get the
    same seen-filtered popularity fill as everybody else (previously they
    received the raw ``popular_items[:k]``, which could contain items the
    user had already seen).

    Args:
        preds_map: mapping ``user -> iterable of predicted item ids``.
        popular_items: fallback item ids, most preferred first.
        k: maximum length of each returned list.
        seen_map: optional mapping ``user -> iterable of item ids`` that must
            not be used as fallback for that user.

    Returns:
        dict keyed by ``int(user)`` with lists of at most ``k`` items. A list
        is shorter than ``k`` only when the fallback pool is exhausted.
    """

    def _fill(user, predicted):
        # De-duplicate while keeping the model's order, then pad.
        seq = list(dict.fromkeys(predicted))[:k]
        taken = set(seq)  # O(1) membership instead of scanning the list
        seen = set() if seen_map is None else set(seen_map.get(user, []))
        for it in popular_items:
            if len(seq) >= k:
                break
            if it in taken or it in seen:
                continue
            seq.append(int(it))
            taken.add(it)
        return seq

    out = {}
    for u, predicted in preds_map.items():
        out[int(u)] = _fill(u, predicted)

    # Users known only through their history still need a recommendation list.
    if seen_map is not None:
        for u in seen_map:
            if int(u) not in out:
                out[int(u)] = _fill(u, [])
    return out

def format_submission(users, preds_map, k=10):
    """Build the submission frame with one row per entry of ``users``.

    Each row holds ``user_id`` (cast to ``int``) and ``item_id``: the first
    ``k`` items of ``preds_map[int(user)]`` joined by single spaces. Users
    missing from ``preds_map`` get an empty string.
    """
    records = [
        {
            "user_id": int(user),
            "item_id": " ".join(str(item) for item in preds_map.get(int(user), [])[:k]),
        }
        for user in users
    ]
    return pd.DataFrame(records)

def recall_at_k_from_map(preds_map, targets_df, k=10):
    """Share of target users whose first ``k`` predictions contain a target item.

    The denominator is the number of distinct users in ``targets_df``. Users
    with an empty prediction list, or missing from ``preds_map`` altogether,
    count as misses. (Previously such users were dropped from the average,
    which inflated the score.)

    Args:
        preds_map: mapping ``user -> ranked list of item ids``.
        targets_df: frame with ``user_id`` and ``target_item_id`` columns.
        k: rank cut-off.

    Returns:
        Recall@k as a float; ``0.0`` when ``targets_df`` has no users.
    """
    targets_by_user = targets_df.groupby("user_id")["target_item_id"].agg(set)
    if targets_by_user.empty:
        return 0.0

    hits = 0
    for user, targets in targets_by_user.items():
        # Missing values do not occupy a rank, matching the explode/dropna
        # behaviour of the earlier implementation.
        ranked = [item for item in preds_map.get(user, []) if not pd.isna(item)]
        if any(int(item) in targets for item in ranked[:k]):
            hits += 1
    return hits / len(targets_by_user)

# Sanity check of the split: training row count and number of validation users.
split_summary = {
    "train_events": len(train),
    "valid_users": valid_targets["user_id"].nunique(),
}
print(split_summary)


{'train_events': 888109, 'valid_users': 6040}


In [None]:
# Optional dependency: SASRec from RePlay. USE_SASREC records whether the
# imports succeeded, so later cells can fall back instead of crashing.
try:
    from replay.preprocessing import LabelEncoder
    try:
        from replay.models.nn.sasrec import SasRec as SASREC_CLASS
    except Exception:
        # fallback for other API layouts
        from replay.models import SASRec as SASREC_CLASS
except Exception as e:
    print("[warn] SASRec unavailable:", e)
    USE_SASREC = False
else:
    print("SASRec import OK")
    USE_SASREC = True

def sasrec_fit_predict(train_df: pd.DataFrame, user_ids: list[int], k=10, epochs=5):
    """Fit SASRec on ``train_df`` and return top-``k`` items for ``user_ids``.

    Args:
        train_df: interactions with ``user_id``, ``item_id`` and ``timestamp``
            columns.
        user_ids: raw (un-encoded) user ids to recommend for.
        k: number of items requested per user.
        epochs: number of training epochs passed to ``fit``.

    Returns:
        ``{int(user): [item ids]}``. Every list is empty when SASRec could not
        be imported (``USE_SASREC`` is false); a user unknown to the user
        encoder also gets an empty list.

    NOTE(review): the ``LabelEncoder(col=...)``, ``SASREC_CLASS(...)``,
    ``fit`` and ``recommend`` signatures used below are assumed — confirm them
    against the installed RePlay version.
    """
    if not USE_SASREC:
        return {int(u): [] for u in user_ids}

    # Encode user and item ids before handing the frame to the model.
    le_user = LabelEncoder(col="user_id")
    le_item = LabelEncoder(col="item_id")
    enc = le_user.fit_transform(train_df.copy())
    enc = le_item.fit_transform(enc)

    model = SASREC_CLASS(
        user_col="user_id",
        item_col="item_id",
        ts_col="timestamp",
        hidden_units=128,
        num_heads=2,
        num_layers=2,
        dropout_rate=0.2,
        seed=SEED
    )
    model.fit(enc, epochs=epochs)

    preds = {}
    for u in tqdm(user_ids, desc="SASRec predict"):
        u_enc = le_user.labels_mapping.get(u, None)
        if u_enc is None:
            # User was not part of the fitted encoder: nothing to recommend.
            preds[int(u)] = []
            continue
        rec_df = model.recommend([u_enc], k=k, filter_viewed=True)  # user_id (enc), item_id (enc), relevance
        # Only item ids are decoded below, so `user_id` still holds the ENCODED
        # id. Select rows with `u_enc`; comparing against the raw id `u`
        # matches nothing unless the encoding happens to be the identity.
        rec_df = rec_df[rec_df["user_id"] == u_enc]
        rec_df = le_item.inverse_transform(rec_df)
        preds[int(u)] = rec_df["item_id"].astype(int).tolist()[:k]
    return preds

# quick validation
val_users = valid_targets["user_id"].tolist()
sasrec_preds = sasrec_fit_predict(train, val_users, k=K, epochs=5)

# The popularity fallback must be filtered with TRAIN history only.
# `user_seen_map` is built from all events and so contains every user's
# held-out target; filtering with it removes the correct answer from the
# fallback and pins the recall of filled-in lists at 0.
train_seen_map = train.groupby("user_id")["item_id"].agg(list).to_dict()
sasrec_preds = ensure_topk(sasrec_preds, popular_items, K, train_seen_map)
r10_sas = recall_at_k_from_map(sasrec_preds, valid_targets, K)
if not USE_SASREC:
    # Without the model every list comes from the popularity fallback.
    print("[warn] SASRec unavailable: the score below measures the popularity fallback only")
print({"sasrec_recall@10": r10_sas})


[warn] SASRec unavailable: cannot import name 'SASRec' from 'replay.models' (/usr/local/lib/python3.12/dist-packages/replay/models/__init__.py)
{'sasrec_recall@10': 0.0}


In [None]:
print("🚀 Starting Two-stage pipeline...")

# `lgb` is used below but never imported in the visible cells; lightgbm itself
# is installed by the setup cell.
import lightgbm as lgb

# --- Stage 1: ALS candidate generation ---------------------------------------
# NOTE(review): `ALS` is not defined or imported in any visible cell, and
# `LabelEncoder` exists only if the RePlay import succeeded. The constructor,
# `fit` and `recommend` signatures are assumed — confirm against the library
# that actually provides them.
le = LabelEncoder()
train_enc = le.fit_transform(train.copy())
als = ALS(rank=128, reg_param=0.05, iterations=15, implicit=True, seed=SEED)
als.fit(train_enc)

valid_users = valid_targets["user_id"].unique().tolist()
# NOTE(review): the model is fitted on encoded ids but queried with raw user
# ids here — verify which id space `recommend` expects.
recs = als.recommend(valid_users, k=CANDIDATES_PER_USER)
recs = le.inverse_transform(recs)
recs = recs.rename(columns={"relevance": "als_rel"})

# --- Feature engineering -------------------------------------------------------
# Activity counts from the training data for every user and every item.
user_hist = train.groupby("user_id").agg(user_events=("item_id", "size"))
item_hist = train.groupby("item_id").agg(item_events=("user_id", "size"))
feats = (
    recs.merge(user_hist, on="user_id", how="left")
    .merge(item_hist, on="item_id", how="left")
    .fillna(0)
    # LightGBM's `group` sizes describe consecutive runs of rows, so all rows
    # of a user must be contiguous and in the same order as the sizes.
    .sort_values("user_id", kind="stable")
    .reset_index(drop=True)
)

# --- Stage 2: LightGBM ranker (analogous to the SASRec cell) -------------------
# Positive label: the candidate equals the user's last training interaction.
train_sorted = train.sort_values("timestamp")
pos_idx = train_sorted.groupby("user_id").tail(1).index
pos_map = dict(zip(train.loc[pos_idx, "user_id"], train.loc[pos_idx, "item_id"]))
# Vectorised replacement for a row-wise `apply` over every candidate row.
feats["label"] = (feats["user_id"].map(pos_map) == feats["item_id"]).astype(int)

X = feats[["als_rel", "user_events", "item_events"]].astype(float)
y = feats["label"].astype(int)
# sort=False keeps the group order identical to the row order fixed above.
group_tr = feats.groupby("user_id", sort=False).size().tolist()

ranker = lgb.LGBMRanker(objective="lambdarank", n_estimators=500, learning_rate=0.05, random_state=SEED)
ranker.fit(X, y, group=group_tr)

# NOTE(review): the ranker is scored on the same candidate rows it was fitted on.
feats["score"] = ranker.predict(X)
pred_map = {
    int(u): user_feats.sort_values("score", ascending=False)["item_id"].tolist()[:K]
    for u, user_feats in feats.groupby("user_id")
}

# Filter the popularity fallback with TRAIN history only: `user_seen_map` is
# built from all events and therefore contains each user's validation target.
train_seen_map = train.groupby("user_id")["item_id"].agg(list).to_dict()
pred_map = ensure_topk(pred_map, popular_items, K, train_seen_map)

r10_two = recall_at_k_from_map(pred_map, valid_targets, K)
print({"two_stage_recall@10": round(r10_two, 4)})


NameError: name 'pd' is not defined