In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

import scipy.sparse as sp
from tqdm import tqdm

EPS = 1e-5

In [38]:
# Импорты и глобальная конфигурация
import os, glob, math, pickle, time
from collections import defaultdict, Counter

import numpy as np
import pandas as pd

from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.decomposition import TruncatedSVD

import matplotlib.pyplot as plt

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
# Seed torch too: the Poincare embedding code further down takes its weight
# initialisation, edge permutations and negative samples from torch's RNG,
# so seeding NumPy alone leaves the training run irreproducible.
torch.manual_seed(RANDOM_SEED)

# --- Fast dev mode ---
DEV_MODE = False
MAX_USERS = 8000 if DEV_MODE else None  # None means a full run over all users

MIN_BASKETS_PER_USER = 3  # must be >= 3 so that a train/val/test split is possible

# --- Metrics and recommendation list size ---
TOPK_LIST = [5, 10, 20]
TOPN_RECOMMEND = 200  # internal candidate list length (how many items get ranked)


# --- UserKNN ---
USERKNN_TUNE_GRID = [50, 100, 200, 500]  # neighbour counts to try during tuning
USERKNN_DEFAULT_K = 200

# --- TIFU-KNN(simple) ---
TIFU_GROUPS_GRID = [5, 7]              # candidate numbers of history groups
TIFU_ALPHA_GRID = [0.5, 0.7, 0.9]      # candidate PIF/IU mixing weights
TIFU_NEIGHBORS_GRID = [100, 300]       # candidate neighbour counts

# --- ItemKNN (added so that the "KNN baseline" is unambiguously covered) ---
ITEMKNN_TUNE_GRID = [50, 100, 200]   # how many similar items to keep per item (topK)
ITEMKNN_DEFAULT_K = 100


TIFU_WITHIN_DECAY = 0.9  # decay inside a group
TIFU_GROUP_DECAY = 0.7   # decay between groups
TIFU_DEFAULT_GROUPS = 7
TIFU_DEFAULT_ALPHA = 0.7
TIFU_DEFAULT_K = 300

print("DEV_MODE:", DEV_MODE, "MAX_USERS:", MAX_USERS)

def find_csv_candidate():
    """Pick a CSV file from the Kaggle input directory.

    Looks one directory level below ``/kaggle/input`` for ``*.csv`` and
    ``*.CSV`` files. The first path whose lower-cased form contains both
    'ta' and 'feng' (or 'tafeng') is returned; if there is none, the largest
    CSV by file size is returned instead.

    Raises:
        FileNotFoundError: if no CSV file is found at all.
    """
    csv_paths = [
        path
        for pattern in ('/kaggle/input/*/*.csv', '/kaggle/input/*/*.CSV')
        for path in glob.glob(pattern)
    ]
    if len(csv_paths) == 0:
        raise FileNotFoundError('В /kaggle/input не найдены CSV. Проверьте, что датасет добавлен в ноутбук.')

    def looks_like_ta_feng(path):
        lowered = path.lower()
        return ('ta' in lowered and 'feng' in lowered) or ('tafeng' in lowered)

    # Prefer a file whose path resembles Ta-Feng
    by_name = next((path for path in csv_paths if looks_like_ta_feng(path)), None)
    if by_name is not None:
        return by_name

    # Otherwise fall back to the biggest CSV (first one wins on ties)
    return max(csv_paths, key=lambda path: os.path.getsize(path))

def detect_columns(df: pd.DataFrame):
    """Guess which columns of ``df`` hold the user id, item id and date.

    Column labels are matched case-insensitively, ignoring surrounding
    whitespace, against a list of common names for each role; the first
    candidate name that is present wins. Non-string labels are compared via
    their ``str()`` form instead of raising.

    Args:
        df: transaction table whose column labels are inspected.

    Returns:
        Tuple ``(user_col, item_col, date_col)`` of ORIGINAL column labels;
        an entry is ``None`` when no candidate name matched.
    """
    # Normalised name -> original label. Upper-case Ta-Feng headers such as
    # CUSTOMER_ID / PRODUCT_ID / TRANSACTION_DT are covered by the lower-casing,
    # so no separate upper-case pass is needed.
    lookup = {str(label).strip().lower(): label for label in df.columns}

    def pick(candidates):
        for name in candidates:
            if name in lookup:
                return lookup[name]
        return None

    user_col = pick(['customer_id', 'cust_id', 'user_id', 'userid', 'member_id', 'client_id'])
    item_col = pick(['product_id', 'item_id', 'prod_id', 'sku_id', 'article_id'])
    date_col = pick(['transaction_dt', 'trans_date', 'date', 't_dat', 'datetime', 'transaction_date'])

    return user_col, item_col, date_col

# Default location of the merged Ta-Feng CSV. When the notebook runs in an
# environment where this absolute path does not exist, fall back to scanning
# /kaggle/input with find_csv_candidate() instead of failing outright.
DATA_PATH = '/root/.cache/kagglehub/datasets/chiranjivdas09/ta-feng-grocery-dataset/versions/1/ta_feng_all_months_merged.csv'
if not os.path.exists(DATA_PATH):
    DATA_PATH = find_csv_candidate()
    print("Default CSV path not found, using:", DATA_PATH)

df = pd.read_csv(DATA_PATH)
print("Shape:", df.shape)
print("Columns:", list(df.columns)[:30])

# Resolve the user / item / date columns by name
user_col, item_col, date_col = detect_columns(df)
print("Detected columns:", {"user_col": user_col, "item_col": item_col, "date_col": date_col})

# Everything downstream needs all three columns, so stop early if any is missing
if user_col is None or item_col is None or date_col is None:
    raise ValueError(
        "Не удалось автоматически определить необходимые колонки. "
        "Пожалуйста, задайте user_col/item_col/date_col вручную после просмотра df.columns."
    )


DEV_MODE: False MAX_USERS: None
Shape: (817741, 9)
Columns: ['TRANSACTION_DT', 'CUSTOMER_ID', 'AGE_GROUP', 'PIN_CODE', 'PRODUCT_SUBCLASS', 'PRODUCT_ID', 'AMOUNT', 'ASSET', 'SALES_PRICE']
Detected columns: {'user_col': 'CUSTOMER_ID', 'item_col': 'PRODUCT_ID', 'date_col': 'TRANSACTION_DT'}


In [39]:
# Parse the date; values that cannot be parsed become NaT
df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Drop rows without a date/user/item. This has to happen BEFORE the ids are
# cast to str: astype(str) turns a missing value into the literal string
# 'nan', which dropna no longer recognises, so such rows would survive and
# be merged into one bogus 'nan' user/item.
df = df.dropna(subset=[date_col, user_col, item_col]).copy()

# Use string ids so users/items are handled as opaque keys.
# NOTE: leading zeros are only preserved if the column was already read as
# text; values parsed as numbers by read_csv have lost them by this point.
df[user_col] = df[user_col].astype(str)
df[item_col] = df[item_col].astype(str)

# Floor the datetime to the day (drop the time component)
df['__date'] = df[date_col].dt.floor('D')

# Group into baskets: one basket = all items a user bought on one day
basket_df = (
    df.groupby([user_col, '__date'])[item_col]
      .apply(lambda s: list(pd.unique(s)))   # unique items in the basket, in order of appearance
      .reset_index()
      .rename(columns={user_col: 'user_raw', '__date': 'date', item_col: 'items_raw'})
)

basket_df = basket_df.sort_values(['user_raw', 'date']).reset_index(drop=True)
print("Baskets:", basket_df.shape, "Unique users:", basket_df['user_raw'].nunique())
basket_df.head()

Baskets: (119578, 3) Unique users: 32266


Unnamed: 0,user_raw,date,items_raw
0,100021,2000-11-03,"[9310042571491, 4719783004070, 4711049230223, ..."
1,100021,2000-11-05,"[4710018004605, 4719111020109, 4710247005299, ..."
2,100021,2000-11-19,"[4711686002016, 47106710, 4711686002528, 47102..."
3,100021,2000-11-28,"[4711800531385, 4714981010038, 4710339772139, ..."
4,100021,2000-12-02,"[4710088436511, 4710094014741, 4710105045443, ..."


In [40]:
from collections import defaultdict
import numpy as np
import pandas as pd

# Keep only users with enough baskets:
#    at least 3 baskets per user are needed to form train/val/test
counts = basket_df.groupby('user_raw').size()
keep_users = counts[counts >= MIN_BASKETS_PER_USER].index
basket_df = basket_df[basket_df['user_raw'].isin(keep_users)].copy()

# Optional: development (dev) mode.
#    If MAX_USERS is set, keep only the first N users (in order of appearance after sorting).
#    This speeds up experiments and debugging on Kaggle.
if MAX_USERS is not None:
    users = basket_df['user_raw'].unique()[:MAX_USERS]
    basket_df = basket_df[basket_df['user_raw'].isin(users)].copy()

# Re-sort by user and time just in case,
#    so that the later time-based split is correct
basket_df = basket_df.sort_values(['user_raw', 'date']).reset_index(drop=True)
print("After filter/dev: baskets", basket_df.shape, "users", basket_df['user_raw'].nunique())

# Map raw user/item ids to indices 0..n-1
#    This is needed to work efficiently with matrices (scipy.sparse) and models.
user_ids = basket_df['user_raw'].unique()
item_ids = pd.unique(np.concatenate(basket_df['items_raw'].values))

user2idx = {u: i for i, u in enumerate(user_ids)}
item2idx = {it: i for i, it in enumerate(item_ids)}

# Reverse mappings (handy for debugging / printing recommendations)
idx2user = {i: u for u, i in user2idx.items()}
idx2item = {i: it for it, i in item2idx.items()}

# Add the user index and convert each item list into a list of item indices
basket_df['u'] = basket_df['user_raw'].map(user2idx)
basket_df['item_idx_list'] = basket_df['items_raw'].apply(lambda xs: [item2idx[x] for x in xs])

n_users = len(user2idx)
n_items = len(item2idx)
print("n_users:", n_users, "n_items:", n_items)

display(basket_df.head())

After filter/dev: baskets (95072, 3) users 14074
n_users: 14074 n_items: 22817


Unnamed: 0,user_raw,date,items_raw,u,item_idx_list
0,100021,2000-11-03,"[9310042571491, 4719783004070, 4711049230223, ...",0,"[0, 1, 2, 3, 4, 5]"
1,100021,2000-11-05,"[4710018004605, 4719111020109, 4710247005299, ...",0,"[6, 7, 8, 9, 10, 11, 12, 13, 14, 15]"
2,100021,2000-11-19,"[4711686002016, 47106710, 4711686002528, 47102...",0,"[16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]"
3,100021,2000-11-28,"[4711800531385, 4714981010038, 4710339772139, ...",0,"[28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38]"
4,100021,2000-12-02,"[4710088436511, 4710094014741, 4710105045443, ...",0,"[32, 39, 20, 40, 41]"


In [41]:
# Collect each user's baskets as (date, item index list) pairs
user_baskets = defaultdict(list)  # u -> list of (date, [items])
for row in basket_df.itertuples(index=False):
    user_baskets[row.u].append((row.date, row.item_idx_list))

# Guarantee chronological order (in case it was broken somewhere)
for u in user_baskets:
    user_baskets[u] = sorted(user_baskets[u], key=lambda x: x[0])

# Leave-last-two-out split: history -> train, second-to-last -> val, last -> test
train_hist = {}
val_basket = {}
test_basket = {}

for u, seq in user_baskets.items():
    baskets = [b for _, b in seq]
    # Defensive check of the minimum length
    if len(baskets) < 3:
        continue
    train_hist[u] = baskets[:-2]   # all baskets except the last two
    val_basket[u]  = baskets[-2]   # second-to-last basket
    test_basket[u] = baskets[-1]   # last basket

print("Users with train/val/test:", len(train_hist))
assert len(train_hist) > 0, "No users available after filtering/splitting."

Users with train/val/test: 14074


In [42]:
# Build the user-item matrix from train_hist only (no val/test) to avoid leaking the future.
# X_raw[u, it] = number of user u's train baskets that contained item it (presence in basket).
rows, cols, data = [], [], []
for u, baskets in train_hist.items():
    c = Counter()
    for b in baskets:
        for it in set(b):   # presence in basket: count an item once per basket
            c[it] += 1
    for it, v in c.items():
        rows.append(u)
        cols.append(it)
        data.append(float(v))

X_raw = sparse.csr_matrix((data, (rows, cols)), shape=(n_users, n_items), dtype=np.float32)

# L2-normalise each user row for cosine similarity:
# cos(u,v) = dot(X_cos[u], X_cos[v])
X_cos = normalize(X_raw, norm='l2', axis=1)

# Matrix density: fraction of non-zero entries
density = X_raw.nnz / (n_users * n_items)
print("X_raw nnz:", X_raw.nnz, "density:", f"{density:.6f}")

X_raw nnz: 383239 density: 0.001193


In [43]:
def build_tifu_matrix(train_hist_dict, n_users, n_items,
                      groups=TIFU_DEFAULT_GROUPS,
                      alpha=TIFU_DEFAULT_ALPHA,
                      within_decay=TIFU_WITHIN_DECAY,
                      group_decay=TIFU_GROUP_DECAY):
    """Build an L2-normalised, time-decayed user-item matrix.

    For every user the training baskets are flattened into one item sequence,
    which is cut into at most ``groups`` consecutive chunks of equal size
    (the last chunk may be shorter or empty). An occurrence of an item
    contributes ``group_decay ** (groups_from_the_end) *
    within_decay ** (positions_from_the_chunk_end)``, and contributions of the
    same item are summed.

    Args:
        train_hist_dict: mapping user index -> list of baskets (lists of item indices).
        n_users, n_items: shape of the resulting matrix.
        groups: maximum number of chunks the sequence is split into.
        alpha: accepted for interface compatibility; not used by this function.
        within_decay: decay applied per position inside a chunk.
        group_decay: decay applied per chunk.

    Returns:
        ``(n_users, n_items)`` CSR matrix with row-wise L2 normalisation.
    """
    user_idx, item_idx, weights = [], [], []

    for u, baskets in tqdm(train_hist_dict.items(), desc="Building TIFU Matrix"):
        # One chronological item sequence per user
        timeline = []
        for basket in baskets:
            timeline.extend(basket)

        n_events = len(timeline)
        if n_events == 0:
            continue

        n_groups = min(groups, n_events)
        chunk = int(np.ceil(n_events / n_groups))

        decayed = defaultdict(float)
        for g in range(n_groups):
            members = timeline[g * chunk:min((g + 1) * chunk, n_events)]
            between = group_decay ** (n_groups - 1 - g)
            last_pos = len(members) - 1

            for pos, item in enumerate(members):
                within = within_decay ** (last_pos - pos)
                decayed[item] += between * within

        for item, score in decayed.items():
            user_idx.append(u)
            item_idx.append(item)
            weights.append(score)

    matrix = sparse.csr_matrix((weights, (user_idx, item_idx)), shape=(n_users, n_items), dtype=np.float32)
    return normalize(matrix, norm='l2', axis=1)

# Decayed user-item matrix built from the training histories with the default TIFU settings
X_tifu = build_tifu_matrix(train_hist, n_users, n_items)


Building TIFU Matrix: 100%|██████████| 14074/14074 [00:00<00:00, 72334.64it/s]


In [44]:
import numpy as np
from scipy import sparse

def build_x_bin_from_xraw(X_raw_csr):
    """Return a float32 CSR copy of the matrix with every stored value set to 1.

    The input matrix is left unmodified.
    """
    binary = X_raw_csr.copy().tocsr()
    binary.data = np.ones(binary.data.shape, dtype=np.float32)
    return binary

def topk_sorted_csr(mat_csr, k):
    """
    Keep the k largest stored values of every CSR row, ordered by descending value.

    Rows with at most k stored values keep all of them (still sorted by value).
    The result is a float32 CSR matrix of the same shape whose column indices
    inside a row follow the value order, not the column order.
    """
    src = mat_csr.tocsr()
    n_rows = src.shape[0]

    row_ptr = np.zeros(n_rows + 1, dtype=np.int32)
    kept_cols = []
    kept_vals = []

    for r in range(n_rows):
        lo, hi = src.indptr[r], src.indptr[r + 1]
        vals = src.data[lo:hi]

        if vals.size:
            cols = src.indices[lo:hi]
            if vals.size > k:
                # Partial selection of the k best, then sort only those
                pick = np.argpartition(-vals, k)[:k]
                pick = pick[np.argsort(-vals[pick])]
            else:
                pick = np.argsort(-vals)

            kept_cols.extend(cols[pick].tolist())
            kept_vals.extend(vals[pick].astype(np.float32).tolist())

        # Running total of kept entries is the end pointer of this row
        row_ptr[r + 1] = len(kept_cols)

    return sparse.csr_matrix(
        (np.array(kept_vals, dtype=np.float32),
         np.array(kept_cols, dtype=np.int32),
         row_ptr),
        shape=src.shape
    )

def build_item_cosine_sim_topk(X_raw_csr, topk=100, use_binary=True):
    """
    Build a sparse item-item cosine similarity from the train matrix:
    - the user×item matrix is binarised first when use_binary is True
    - C = item×user @ user×item: item-item co-occurrence over users
    - cosine: C_ij / (||i|| * ||j||), with the diagonal removed
    - only the topk most similar neighbours are kept per item
    """
    if use_binary:
        user_item = build_x_bin_from_xraw(X_raw_csr)
    else:
        user_item = X_raw_csr.tocsr()
    item_user = user_item.T.tocsr()  # item×user

    # Per-item L2 norm; the small constant avoids division by zero
    sq_sums = np.asarray(item_user.multiply(item_user).sum(axis=1)).ravel()
    item_norms = np.sqrt(sq_sums) + 1e-12

    cooc = (item_user @ item_user.T).tocsr()
    cooc.setdiag(0.0)        # an item is not its own neighbour
    cooc.eliminate_zeros()

    cooc = cooc.tocoo()
    scale = item_norms[cooc.row] * item_norms[cooc.col]
    cooc.data = (cooc.data / scale).astype(np.float32)

    return topk_sorted_csr(cooc.tocsr(), topk)

def hyperbolic_tifu_recommender_factory(X_tifu_csr, S_hyp_csr, fallback_scores=None):
    """Create a ``recommend(u, topn)`` function based on neighbour-weighted item scores.

    Args:
        X_tifu_csr: user×item CSR matrix of item weights.
        S_hyp_csr: user×user CSR matrix of neighbour similarities.
        fallback_scores: optional per-item scores used for users whose
            neighbour-weighted score vector has no stored entries; defaults
            to the column sums of ``X_tifu_csr``.

    Returns:
        A function ``recommend(u, topn=TOPN_RECOMMEND)`` that returns a list of
        at most ``topn`` item indices ordered by descending score.
    """
    n_items_local = X_tifu_csr.shape[1]

    if fallback_scores is None:
        fallback_scores = np.asarray(X_tifu_csr.sum(axis=0)).ravel().astype(np.float32)
    else:
        # Accept any array-like so that .shape and fancy indexing work below
        fallback_scores = np.asarray(fallback_scores)

    def recommend(u, topn=TOPN_RECOMMEND):
        # Similarity-weighted sum of the neighbours' item vectors
        sim_users = S_hyp_csr[u]
        scores_sparse = sim_users.dot(X_tifu_csr)

        if scores_sparse.nnz == 0:
            scores = fallback_scores
        else:
            scores = np.zeros(n_items_local, dtype=np.float32)
            scores[scores_sparse.indices] = scores_sparse.data.astype(np.float32)

        n_scores = scores.shape[0]
        topn_clipped = max(0, min(topn, n_scores))
        if topn_clipped < n_scores:
            idx = np.argpartition(-scores, topn_clipped)[:topn_clipped]
        else:
            # argpartition requires kth < n, so when every item is requested
            # just rank all of them.
            idx = np.arange(n_scores)
        idx = idx[np.argsort(-scores[idx])]
        return idx.tolist()

    return recommend



In [45]:
def build_user_user_graph(R, tau=2):
    """Return directed user-user edges whose co-occurrence weight is at least ``tau``.

    The weight of a pair is the corresponding entry of ``R @ R.T``. Self-pairs
    are excluded. The result is an ``(n_edges, 2)`` integer array of
    (row, col) pairs.
    """
    co = (R @ R.T).tocoo()
    src, dst, weight = co.row, co.col, co.data

    keep = (src != dst) & (weight >= tau)
    return np.stack([src[keep], dst[keep]], axis=1)


# Connect two users when the (R @ R.T) weight computed from the train counts is at least 2
user_edges = build_user_user_graph(X_raw, tau=2)
print(f"Num user edges: {len(user_edges)}")

Num user edges: 19195608


In [46]:
import torch
import torch.nn as nn
from tqdm import tqdm

EPS = 1e-5  # numerical safety margin used by the projection, distance and loss below


class PoincareEmbedding(nn.Module):
    """Embedding table whose rows are kept inside the open unit ball.

    Rows are compared with the Poincare distance and trained on (i, j) index
    pairs with a softmax-style loss against uniformly sampled negatives.
    """

    def __init__(self, num_items, dim):
        """Create ``num_items`` embeddings of size ``dim``, initialised uniformly in [-1e-3, 1e-3]."""
        super().__init__()
        self.emb = nn.Embedding(num_items, dim)
        nn.init.uniform_(self.emb.weight, -1e-3, 1e-3)

    def forward(self, idx):
        """Look up the embeddings for ``idx`` and project them into the ball."""
        return self.project_to_ball(self.emb(idx))

    def project_to_ball(self, x, eps=EPS):
        """Rescale vectors whose norm exceeds ``1 - eps`` back onto that radius.

        Vectors already inside the ball are returned unchanged, including
        vectors with a very small or zero norm.
        """
        norm = torch.norm(x, dim=-1, keepdim=True)
        max_norm = 1 - eps
        outside = norm > max_norm
        # Use a dummy norm of 1 for rows that need no rescaling so the
        # division below is always well defined.
        safe_norm = torch.where(outside, norm, torch.ones_like(norm))
        return torch.where(outside, x * (max_norm / safe_norm), x)

    def poincare_distance(self, x, y):
        """Poincare distance along the last dimension; inputs broadcast against each other."""
        x2 = (x * x).sum(dim=-1)
        y2 = (y * y).sum(dim=-1)
        diff2 = ((x - y) ** 2).sum(dim=-1)
        denom = (1 - x2) * (1 - y2)
        z = 1 + 2 * diff2 / denom.clamp_min(EPS)
        # acosh is only defined for z >= 1; the margin also keeps its gradient finite
        return torch.acosh(z.clamp_min(1 + EPS))

    def sample_negatives(self, batch_size, num_items, K, device=None):
        """Draw ``K`` uniform random indices in [0, num_items) per batch element.

        Sampling is unconditional, so a negative may coincide with the anchor
        or with its positive. ``device`` selects where the tensor is created
        (default: torch's default device).
        """
        return torch.randint(low=0, high=num_items, size=(batch_size, K), device=device)

    def poincare_loss(self, i, j, negs):
        """Mean of ``-log(exp(-d(i, j)) / sum_k exp(-d(i, neg_k)))`` over the batch.

        NOTE(review): the denominator sums over the sampled negatives only and
        does not include the positive pair; confirm this is intended.
        """
        xi = self.forward(i)
        xj = self.forward(j)
        xk = self.forward(negs)

        d_pos = self.poincare_distance(xi, xj)
        d_neg = self.poincare_distance(xi.unsqueeze(1), xk)

        numerator = torch.exp(-d_pos)
        denominator = torch.exp(-d_neg).sum(dim=1)

        loss = -torch.log(numerator / denominator.clamp_min(EPS))
        return loss.mean()

    def train_embedding(self, edges, num_items, optimizer, epochs=10, batch_size=256, neg_k=10, device="cpu"):
        """Train the embeddings on ``edges`` and return ``self``.

        Args:
            edges: array-like of shape (n_edges, 2) holding (i, j) index pairs.
            num_items: exclusive upper bound for sampled negative indices.
            optimizer: torch optimizer over this module's parameters.
            epochs: number of passes over the shuffled edges.
            batch_size: number of edges per optimisation step.
            neg_k: negatives sampled per edge.
            device: device the module and the edge tensor are moved to.

        Raises:
            ValueError: if ``edges`` is empty.
        """
        if len(edges) == 0:
            raise ValueError("edges is empty: nothing to train on")

        self.to(device)
        edges = torch.tensor(edges, dtype=torch.long, device=device)

        for epoch in range(epochs):
            perm = torch.randperm(len(edges), device=device)
            total_loss = 0.0

            for idx in range(0, len(edges), batch_size):
                batch_idx = perm[idx:idx + batch_size]
                batch = edges[batch_idx]

                i = batch[:, 0]
                j = batch[:, 1]
                # Sample directly on the training device to avoid a host-to-device copy per batch
                negs = self.sample_negatives(len(i), num_items, neg_k, device=device)

                optimizer.zero_grad()
                loss = self.poincare_loss(i, j, negs)
                loss.backward()
                optimizer.step()

                # The optimizer step is Euclidean; pull the weights back into the ball
                with torch.no_grad():
                    self.emb.weight.copy_(self.project_to_ball(self.emb.weight))

                total_loss += loss.item() * len(i)

            print(f"Epoch {epoch+1}: loss = {total_loss / len(edges):.4f}")

        return self

In [47]:
# One embedding per user (row of X_raw)
num_users = X_raw.shape[0]
dim = 30      # embedding dimension
lr = 0.001    # Adam learning rate

user_model = PoincareEmbedding(num_users, dim)
user_optimizer = torch.optim.Adam(user_model.parameters(), lr=lr)

# Train on the user-user edges; negatives are drawn from all user indices
user_model.train_embedding(
    edges=user_edges,
    num_items=num_users,  
    optimizer=user_optimizer,
    epochs=30,             
    batch_size=1024,
    neg_k=15,
    device='cuda' if torch.cuda.is_available() else 'cpu'
)


Epoch 1: loss = 2.1104
Epoch 2: loss = 2.0681
Epoch 3: loss = 2.0520
Epoch 4: loss = 2.0449
Epoch 5: loss = 2.0408
Epoch 6: loss = 2.0393
Epoch 7: loss = 2.0380
Epoch 8: loss = 2.0370
Epoch 9: loss = 2.0359
Epoch 10: loss = 2.0353
Epoch 11: loss = 2.0343
Epoch 12: loss = 2.0333
Epoch 13: loss = 2.0330
Epoch 14: loss = 2.0322
Epoch 15: loss = 2.0309
Epoch 16: loss = 2.0304
Epoch 17: loss = 2.0295
Epoch 18: loss = 2.0289
Epoch 19: loss = 2.0278
Epoch 20: loss = 2.0274
Epoch 21: loss = 2.0267
Epoch 22: loss = 2.0260
Epoch 23: loss = 2.0255
Epoch 24: loss = 2.0256
Epoch 25: loss = 2.0245
Epoch 26: loss = 2.0244
Epoch 27: loss = 2.0237
Epoch 28: loss = 2.0233
Epoch 29: loss = 2.0230
Epoch 30: loss = 2.0225


PoincareEmbedding(
  (emb): Embedding(14074, 30)
)

In [48]:
def poincare_dist_matrix(emb_weight):
    """Return the full pairwise Poincare distance matrix between the rows of ``emb_weight``.

    ``emb_weight`` is a 2-D tensor of shape (n, dim); the result has shape (n, n).
    Denominator and acosh argument are clamped to stay numerically valid.
    """
    sq_norms = (emb_weight ** 2).sum(dim=1, keepdim=True)
    gram = emb_weight @ emb_weight.t()
    # ||x - y||^2 = ||x||^2 + ||y||^2 - 2 <x, y>
    sq_dists = sq_norms + sq_norms.t() - 2 * gram

    conformal = 1 - sq_norms
    denom = (conformal @ conformal.t()).clamp(min=1e-5)

    cosh_arg = (1 + 2 * sq_dists / denom).clamp(min=1.0 + 1e-5)
    return torch.acosh(cosh_arg)

def build_user_hyperbolic_knn(model, topk=300, device='cpu'):
    """Build a sparse user-user similarity matrix from the trained embeddings.

    The similarity is ``exp(-distance)`` with distances from
    ``poincare_dist_matrix``; for every user the ``topk`` most similar OTHER
    users are kept.

    Args:
        model: module exposing the embedding table as ``model.emb.weight``.
        topk: neighbours to keep per user; clipped to ``n_users - 1`` because
            a user cannot have more distinct neighbours than that.
        device: device on which the dense distance matrix is computed.

    Returns:
        ``(n_users, n_users)`` float32 CSR matrix of similarities.
    """
    model.eval()
    with torch.no_grad():
        weights = model.emb.weight.data.to(device)
        n_users = weights.shape[0]

        # torch.topk raises when k exceeds the row length, and k == n_users
        # would store the zeroed self-similarity as a neighbour.
        k = max(0, min(int(topk), n_users - 1))
        if k == 0:
            return sparse.csr_matrix((n_users, n_users), dtype=np.float32)

        dists = poincare_dist_matrix(weights)

        sims = torch.exp(-dists)

        # A user must not be its own neighbour
        sims.fill_diagonal_(0)

        vals, inds = torch.topk(sims, k=k, dim=1)

        vals = vals.cpu().numpy().flatten()
        inds = inds.cpu().numpy().flatten()

        # Row index for every kept neighbour, matching the flattened layout above
        rows = np.repeat(np.arange(n_users), k)

        S_hyp = sparse.csr_matrix((vals, (rows, inds)), shape=(n_users, n_users), dtype=np.float32)
        return S_hyp


In [112]:
# Use the GPU when one is available, otherwise the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Sparse user-user similarity: the 1000 most similar users per user under the trained embedding
S_hyp_user = build_user_hyperbolic_knn(user_model, topk=1000, device=device)

In [113]:
# Recommender that scores items by the similarity-weighted TIFU vectors of a user's neighbours
rec_hyp_tifu = hyperbolic_tifu_recommender_factory(X_tifu, S_hyp_user)

In [114]:
def recall_at_k(pred_items, true_items, k):
    """
    Recall@K = share of the items of the true basket that appear in the top-K recommendations.
    true_items: items of the true basket (val/test)
    pred_items: ranked list of recommendations
    Returns 0.0 when the true basket is empty.
    """
    relevant = set(true_items)
    if not relevant:
        return 0.0
    hits = relevant.intersection(pred_items[:k])
    return len(hits) / len(relevant)

def ndcg_at_k(pred_items, true_items, k):
    """
    NDCG@K is order-aware: hits at higher positions are worth more.
    Relevance is binary: an item is relevant if it is in true_items.
    Returns 0.0 when the ideal DCG is zero (e.g. empty true basket).
    """
    relevant = set(true_items)

    # DCG: the item at position p (1-based) is discounted by log2(p + 1)
    gain = 0.0
    for discount_arg, item in enumerate(pred_items[:k], start=2):
        if item in relevant:
            gain += 1.0 / np.log2(discount_arg)

    # IDCG: best achievable DCG, i.e. all relevant items ranked first
    n_ideal = min(k, len(relevant))
    ideal_gain = sum(1.0 / np.log2(pos + 2) for pos in range(n_ideal))

    if ideal_gain > 0:
        return gain / ideal_gain
    return 0.0

def evaluate_model(recommender_fn, users, true_baskets, topk_list=(5,10,20)):
    """
    Evaluate a recommender over a set of users:
    - recommender_fn(u) must return a ranked list of item indices
    - true_baskets[u] is the true basket (list of item indices)
    Returns a DataFrame with the mean Recall@K and NDCG@K over users, one row per K.
    """
    def per_user_records():
        for raw_u in users:
            uid = int(raw_u)
            ranked = recommender_fn(uid)   # one recommendation call per user, reused for every K
            truth = true_baskets[uid]
            for cutoff in topk_list:
                yield {
                    "u": uid,
                    "k": int(cutoff),
                    "recall": recall_at_k(ranked, truth, cutoff),
                    "ndcg": ndcg_at_k(ranked, truth, cutoff),
                }

    per_user = pd.DataFrame(list(per_user_records()))
    averaged = per_user.groupby("k")[["recall","ndcg"]].mean()
    return averaged.reset_index()

def tag_result(df_res, model_name, split_name):
    """Return a copy of the metrics table with `model` and `split` metadata columns added."""
    tagged = df_res.copy()
    for column, value in (("model", model_name), ("split", split_name)):
        tagged[column] = value
    return tagged

In [120]:

# Validation: score the recommender against each user's second-to-last basket
users_val = list(val_basket.keys())
res_userknn_hyper_val = evaluate_model(
    lambda u: rec_hyp_tifu(u, TOPN_RECOMMEND),
    users_val,
    val_basket,
    TOPK_LIST
)

# Label the metrics with the model name and split so tables can be concatenated later
val_table_user = pd.concat([
    tag_result(res_userknn_hyper_val, f"TIFU-KNN_Hyper(topk={1000})", "val"),
], ignore_index=True)

val_table_user


Unnamed: 0,k,recall,ndcg,model,split
0,5,0.052538,0.062253,TIFU-KNN_Hyper(topk=1000),val
1,10,0.068223,0.063113,TIFU-KNN_Hyper(topk=1000),val
2,20,0.089115,0.069009,TIFU-KNN_Hyper(topk=1000),val


In [121]:
# Test: score the recommender against each user's last basket
users_test = list(test_basket.keys())
res_userknn_hyper_test = evaluate_model(
    lambda u: rec_hyp_tifu(u, TOPN_RECOMMEND),
    users_test,
    test_basket,
    TOPK_LIST
)

# Label the metrics with the model name and split so tables can be concatenated later
test_table_user = pd.concat([
    tag_result(res_userknn_hyper_test, f"TIFU-KNN_Hyper(topk={1000})", "test"),
], ignore_index=True)

test_table_user


Unnamed: 0,k,recall,ndcg,model,split
0,5,0.073137,0.087003,TIFU-KNN_Hyper(topk=1000),test
1,10,0.093088,0.088212,TIFU-KNN_Hyper(topk=1000),test
2,20,0.114282,0.093691,TIFU-KNN_Hyper(topk=1000),test
