<a href="https://colab.research.google.com/github/Aliaksandr-Borsuk/Recommender_Systems_project/blob/main/notebooks/06_ncf_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Подготовка


**Цель:**
- Реализовать и оценить Neural Collaborative Filtering (NCF) на PyTorch в двух режимах обучения:
- - BPR (Bayesian Personalized Ranking) - парное ранжирование,
- - BCE (Binary Cross-Entropy) - бинарная классификация с негативным сэмплированием.


**Данные:**
- используем train, test  из ноутбука 01 (251021_173655).
- все взаимодействия в test_warm участвуют в оценке без фильтрации по рейтингу  в соответствии с предыдущими ноутбуками.



## 01. Клонируем репозиторий. Подключаем GoogleDrive.

In [1]:
!rm -rf /content/Recommender_Systems_project
!git clone https://github.com/Aliaksandr-Borsuk/Recommender_Systems_project

Cloning into 'Recommender_Systems_project'...
remote: Enumerating objects: 147, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (125/125), done.[K
remote: Total 147 (delta 75), reused 52 (delta 14), pack-reused 0 (from 0)[K
Receiving objects: 100% (147/147), 680.87 KiB | 8.11 MiB/s, done.
Resolving deltas: 100% (75/75), done.


In [2]:
# подключаем диск
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 02. Импорты

In [5]:
import sys
sys.path.append("/content/Recommender_Systems_project/src")

import numpy as np
import pandas as pd
import pickle
from scipy.sparse import csr_matrix, load_npz
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
from pathlib import Path
from pprint import pprint

# from rs_datasets import MovieLens
from recommender.data_io import train_test_reader                 # для чтения сохранённых из 001_data_and_eda_1m_proba
from recommender.preprocessing import prepare_ui_matrix           # для получения матрицы взаимодействий
from recommender.metrics import model_evaluation                  # для оценки модели
from recommender.results_logger import save_experiment_results    # для сохранения результатов


RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

DATA = Path("/content/drive/MyDrive/Colab Notebooks/data/")
PROCESSED = DATA / "processed"
RESULTS_DIR = DATA / "results"
TOP_K = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('DEVICE = ', DEVICE)

DEVICE =  cuda


## 03. Грузим train, test, meta_данные.



In [4]:
train_tast_path = '/content/drive/MyDrive/Colab Notebooks/data/processed/251021_173655'

train_warm, test_warm, meta_warm = train_test_reader(train_tast_path)
pprint(meta_warm, width=80, compact=False)
print(f'\ntrain shape : {train_warm.shape}')
print(f'test shape  : {test_warm.shape}')
print( '\n', '*'*50, '\ntrain.head')
display(train_warm.head(3))
print('\n', '*'*50, '\ntest.head')
display(test_warm.head(3))

{'columns': ['user_id', 'item_id', 'rating', 'timestamp', 'title', 'genres'],
 'created_at': '2025-10-21T17:37:00.607645',
 'min_test_interactions': 10,
 'min_train_interactions': 5,
 'n_items': 3662,
 'n_test_users': 836,
 'n_train_users': 5392,
 'test_shape': [94842, 6],
 'time_treshold': '2000-12-02T14:52:18',
 'train_shape': [800142, 6]}

train shape : (800142, 6)
test shape  : (94842, 6)

 ************************************************** 
train.head


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
0,635,1251,4,975768620,8 1/2 (1963),Drama
1,635,3948,4,975768294,Meet the Parents (2000),Comedy
2,635,1270,4,975768106,Back to the Future (1985),Comedy|Sci-Fi



 ************************************************** 
test.head


Unnamed: 0,user_id,item_id,rating,timestamp,title,genres
0,635,3789,5,975768788,"Pawnbroker, The (1965)",Drama
1,635,2987,5,979141847,Who Framed Roger Rabbit? (1988),Adventure|Animation|Film-Noir
2,635,2988,4,975769007,Melvin and Howard (1980),Drama


## 04. Загрузка implicit-матрицы

In [6]:
# Получение метрик на test
# загрузка
input_dir = PROCESSED/"artifacts"

# Загрузка матрицы взаимодействий
train_matrix = load_npz(input_dir / "train_matrix.npz")

# Загрузка словарей
with open(input_dir / "user2index.pkl", "rb") as f:
    user2index = pickle.load(f)

with open(input_dir / "item2index.pkl", "rb") as f:
    item2index = pickle.load(f)

with open(input_dir / "index2user.pkl", "rb") as f:
    index2user = pickle.load(f)

with open(input_dir / "index2item.pkl", "rb") as f:
    index2item = pickle.load(f)

assert isinstance(train_matrix, csr_matrix), "train_matrix должен быть csr_matrix"
train_matrix

# заменяем реальные ID на индексы
test_mapped = test_warm.assign(
    user_id = test_warm["user_id"].map(user2index),
    item_id = test_warm["item_id"].map(item2index)
)
assert test_mapped.isna().sum().sum() == 0, 'Achtung!!! Неизвестные пользователи или айтемы!!!'

# группируем
test_dict = test_mapped.groupby('user_id')['item_id'].apply(set).to_dict()

# all_items
all_items = set(train_warm['item_id'].map(item2index).dropna().astype(int).unique())
n_users, n_items = train_matrix.shape

## 05. DataLoader и Negative Sampling

In [7]:
class ImplicitCFDataset(Dataset):
    def __init__(self, train_matrix, num_negatives=1):
        super().__init__()
        self.users, self.items = train_matrix.nonzero()
        self.num_users, self.num_items = train_matrix.shape
        self.num_negatives = num_negatives

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        user = self.users[idx]
        pos_item = self.items[idx]
        # упрощённый negative sampling, не гарантирует что негативы действительно негативы
        # иногда можем схватить позитив... но уж чё уж....
        neg_items = np.random.randint(0, self.num_items, size=self.num_negatives)
        return user, pos_item, neg_items

def collate_fn(batch):
    '''
    batch — список кортежей: [(u1, i1+, [i1-]), (u2, i2+, [i2-]), ...]
    '''
    users, pos_items, neg_items = zip(*batch)
    users = torch.LongTensor(users)
    pos_items = torch.LongTensor(pos_items)
    # получаем плоский массив из списка списков (tenzor)
    neg_items = torch.LongTensor(np.concatenate(neg_items))
    return users, pos_items, neg_items # тензоры плоские

## 06. Модель NCF  
**NCF** - это нейросетевая альтернатива классической коллаборативной фильтрации (например, Matrix Factorization).
#### **Архитектура: GMF + MLP**  
  
1. **GMF - Generalized Matrix Factorization**
- - Это обобщение классической факторизации матриц.
- - Эмбеддинги пользователя и айтема умножаются поэлементно
- - Если бы дальше был линейный слой, это была бы точно Matrix Factorization.
- - GMF - линейная часть модели
2. **MLP - Multi-Layer Perceptron**
- - Эмбеддинги пользователя и айтема конкатенируются, а не перемножаются
- - Затем проходят через секвенцию полносвязных слоёв с ReLU и Dropout
- - Это нелинейная часть, способная моделировать сложные паттерны.

In [8]:
class NCF(nn.Module):
    def __init__(self, n_users, n_items, emb_dim=16, mlp_layers=[64, 32, 16], dropout=0.2):
        super().__init__()
        self.user_gmf = nn.Embedding(n_users, emb_dim)
        self.item_gmf = nn.Embedding(n_items, emb_dim)
        self.user_mlp = nn.Embedding(n_users, mlp_layers[0] // 2)
        self.item_mlp = nn.Embedding(n_items, mlp_layers[0] // 2)

        self.mlp = nn.Sequential()
        for i, (in_f, out_f) in enumerate(zip(mlp_layers[:-1], mlp_layers[1:])):
            self.mlp.add_module(f'dropout_{i}', nn.Dropout(dropout))
            self.mlp.add_module(f'linear_{i}', nn.Linear(in_f, out_f))
            self.mlp.add_module(f'relu_{i}', nn.ReLU())

        self.final = nn.Linear(emb_dim + mlp_layers[-1], 1)
        self.sigmoid = nn.Sigmoid()

        self._init_weights()

    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Embedding):
                nn.init.normal_(m.weight, std=0.01)
            elif isinstance(m, nn.Linear):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, user_ids, item_ids):
        gmf = self.user_gmf(user_ids) * self.item_gmf(item_ids)

        mlp_user = self.user_mlp(user_ids)
        mlp_item = self.item_mlp(item_ids)
        mlp = torch.cat([mlp_user, mlp_item], dim=1)
        mlp = self.mlp(mlp)

        combined = torch.cat([gmf, mlp], dim=1)
        output = self.final(combined)
        return self.sigmoid(output).squeeze()

    def predict(self, user_ids, item_ids):
        with torch.no_grad():
            return self.forward(user_ids, item_ids)

## 07. Loss-функции

In [9]:
def bpr_loss(pos_scores, neg_scores):
    return -torch.log(torch.sigmoid(pos_scores - neg_scores)).mean()

def bce_loss(scores, labels):
    return nn.BCELoss()(scores, labels)

## 08. Обучение (BPR-режим)

In [10]:
# гиперпараметры
EPOCHS = 20
BATCH_SIZE = 2048
LR = 0.0001
EMB_DIM = 8
MLP_LAYERS = [16, 8]
NUM_NEGATIVES = 1

model = NCF(n_users, n_items, emb_dim=EMB_DIM, mlp_layers=MLP_LAYERS, dropout=0.4).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)

dataset = ImplicitCFDataset(train_matrix, num_negatives=NUM_NEGATIVES)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

model.train()
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for users, pos_items, neg_items in dataloader:
        users, pos_items, neg_items = users.to(DEVICE), pos_items.to(DEVICE), neg_items.to(DEVICE)

        optimizer.zero_grad()
        # Модель учится делать score(u, i+) > score(u, i-)
        pos_scores = model(users, pos_items)
        neg_scores = model(users.repeat_interleave(NUM_NEGATIVES), neg_items)
        loss = bpr_loss(pos_scores, neg_scores)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {epoch_loss:.4f}")

Epoch 5/20, Loss: 235.7643
Epoch 10/20, Loss: 211.5275
Epoch 15/20, Loss: 207.0460
Epoch 20/20, Loss: 205.3132


## 09. Инференс

In [11]:
def recommend_ncf(model, user_ids, train_matrix, k=10):
    model.eval()
    recs = {}
    all_items_tensor = torch.arange(n_items, device=DEVICE)
    for user in user_ids:
        user_tensor = torch.full((n_items,), user, dtype=torch.long, device=DEVICE)
        scores = model(user_tensor, all_items_tensor)
        seen = train_matrix[user].toarray().squeeze().astype(bool)
        scores[seen] = -1e9
        topk = torch.topk(scores, k).indices.cpu().tolist()
        recs[user] = topk
    return recs

## 10. Оценка

In [12]:
recs = recommend_ncf(model, list(test_dict.keys()), train_matrix, k=TOP_K)
result = model_evaluation(recs, test_dict, all_items, k=TOP_K, model_name='NCF_BPR')
display(result)

Unnamed: 0,hit_rate@10,precision@10,recall@10,ndcg@10,map@10,coverage@10
NCF_BPR,0.814593,0.31555,0.037944,0.328339,0.219007,0.036865


## 11. Сохранение

In [14]:
results_data, json_file, csv_file = save_experiment_results(
                                        result=result,
                                        model_name="NCF_BPR",
                                        meta=meta_warm,
                                        results_dir = RESULTS_DIR
                                    )

Результат добавлен в существующий CSV файл
JSON результат сохранен как: NCF_BPR_20251228_162516.json
CSV со всеми экспериментами: all_experiments_results.csv
Все результаты в: /content/drive/MyDrive/Colab Notebooks/data/results

СВОДКА ЭКСПЕРИМЕНТА
Модель: NCF_BPR
Метка времени: 20251228_162516
Дата оценки: 2025-12-28T16:25:16
Размер train: 800,142
Размер test: 94,842
Пользователей в test: 836
Уникальных предметов: 3662
HitRate@10: 81.5%
precision@10: 31.56%
recall@10: 3.79%
ndcg@10: 32.83%
map@10: 21.90%
Coverage@10: 3.69%

Последние эксперименты (8 всего):


Unnamed: 0,model_name,hit_rate@10,precision@10,recall@10,ndcg@10,map@10,coverage@10,timestamp,evaluation_date
3,truncated_svd_n_comp=5_n_iter=42,0.851675,0.348206,0.045528,0.365538,0.24731,0.091207,20251116_190936,"2025-11-16T19:09:36.863512,,,,,,,,,"
4,als_factors=5_iter=21_alpha=0.6_reg=0.02,0.840909,0.340191,0.043517,0.358804,0.24451,0.095576,20251210_110349,"2025-12-10T11:03:49.859474,,,,,,,,,"
5,ease_lambda=108727,0.838517,0.338517,0.042712,0.356757,0.243356,0.045603,20251211_185547,"2025-12-11T18:55:47.668555,,,,,,,,,"
6,slim_alpha_0.47_l1_ratio_0.14,0.825359,0.32201,0.041529,0.342581,0.225772,0.046969,20251214_171141,
7,NCF_BPR,0.814593,0.31555,0.037944,0.328339,0.219007,0.036865,20251228_162516,2025-12-28T16:25:16.373247


## 12. Обучение (BCE-режим)

In [15]:
class BCECFDataset(Dataset):
    def __init__(self, train_matrix):
        super().__init__()
        self.users, self.items = train_matrix.nonzero()
        self.labels = np.ones(len(self.users), dtype=np.float32)  # только позитивы
        self.num_users, self.num_items = train_matrix.shape

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.labels[idx]

In [16]:
# гиперпараметры
EPOCHS = 20
BATCH_SIZE = 2048
LR = 0.0001
EMB_DIM = 8
MLP_LAYERS = [16, 8]
NUM_NEGATIVES = 1

model = NCF(n_users, n_items, emb_dim=EMB_DIM, mlp_layers=MLP_LAYERS, dropout=0.4).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)

# Dataset только по позитивам
bce_dataset = BCECFDataset(train_matrix)
bce_dataloader = DataLoader(bce_dataset, batch_size=BATCH_SIZE, shuffle=True)

model.train()
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for batch in bce_dataloader:
        users, pos_items, _ = batch
        users = users.to(DEVICE)
        pos_items = pos_items.to(DEVICE)
        B = users.size(0)

        # --- Формируем батч: [позитивы + негативы] ---
        # Позитивы
        all_users = users.repeat(NUM_NEGATIVES + 1)  # [u1, u1, ..., u2, u2, ...]
        all_items = pos_items.repeat(NUM_NEGATIVES + 1)
        labels = torch.ones_like(all_users, dtype=torch.float32, device=DEVICE)

        # Негативы
        neg_items = torch.randint(0, n_items, (B * NUM_NEGATIVES,), device=DEVICE)
        all_users[B:] = users.repeat_interleave(NUM_NEGATIVES)
        all_items[B:] = neg_items
        labels[B:] = 0.0  # негативные лейблы

        optimizer.zero_grad()
        scores = model(all_users, all_items)
        loss = bce_loss(scores, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS}, BCE Loss: {epoch_loss:.4f}")

Epoch 5/20, BCE Loss: 218.4326
Epoch 10/20, BCE Loss: 203.9686
Epoch 15/20, BCE Loss: 201.6438
Epoch 20/20, BCE Loss: 200.3700


## 13. Оценка

In [17]:
recs = recommend_ncf(model, list(test_dict.keys()), train_matrix, k=TOP_K)
result = model_evaluation(recs, test_dict, all_items, k=TOP_K, model_name='NCF_BCE')
display(result)

Unnamed: 0,hit_rate@10,precision@10,recall@10,ndcg@10,map@10,coverage@10
NCF_BCE,0.801435,0.311364,0.037039,0.324943,0.217343,0.047117


## 14. Сохранение

In [18]:
results_data, json_file, csv_file = save_experiment_results(
                                        result=result,
                                        model_name="NCF_BCE",
                                        meta=meta_warm,
                                        results_dir = RESULTS_DIR
                                    )

Результат добавлен в существующий CSV файл
JSON результат сохранен как: NCF_BCE_20251228_162658.json
CSV со всеми экспериментами: all_experiments_results.csv
Все результаты в: /content/drive/MyDrive/Colab Notebooks/data/results

СВОДКА ЭКСПЕРИМЕНТА
Модель: NCF_BCE
Метка времени: 20251228_162658
Дата оценки: 2025-12-28T16:26:58
Размер train: 800,142
Размер test: 94,842
Пользователей в test: 836
Уникальных предметов: 3662
HitRate@10: 80.1%
precision@10: 31.14%
recall@10: 3.70%
ndcg@10: 32.49%
map@10: 21.73%
Coverage@10: 4.71%

Последние эксперименты (9 всего):


Unnamed: 0,model_name,hit_rate@10,precision@10,recall@10,ndcg@10,map@10,coverage@10,timestamp,evaluation_date
4,als_factors=5_iter=21_alpha=0.6_reg=0.02,0.840909,0.340191,0.043517,0.358804,0.24451,0.095576,20251210_110349,"2025-12-10T11:03:49.859474,,,,,,,,,"
5,ease_lambda=108727,0.838517,0.338517,0.042712,0.356757,0.243356,0.045603,20251211_185547,"2025-12-11T18:55:47.668555,,,,,,,,,"
6,slim_alpha_0.47_l1_ratio_0.14,0.825359,0.32201,0.041529,0.342581,0.225772,0.046969,20251214_171141,
7,NCF_BPR,0.814593,0.31555,0.037944,0.328339,0.219007,0.036865,20251228_162516,2025-12-28T16:25:16.373247
8,NCF_BCE,0.801435,0.311364,0.037039,0.324943,0.217343,0.047117,20251228_162658,2025-12-28T16:26:58.143923


# Итого:


## **Выводы:**
- **NCF** уступает линейным и простым коллаборативным моделям  

Это согласуется с современными исследованиями: на малых и плотных implicit-датасетах (вроде MovieLens) простые модели часто превосходят нейросети, особенно если последние не тюнингуются глубоко.
  
- Разница между **NCF_BCE** и **NCF_BPR** статистически незначима, оба режима обучения дают схожее качество на этом датасете.

# P.S. Однако.

In [76]:
# Дадим модели больше свободы и времени для обучения BCE обучается быстрее, так что поколышем её.
EPOCHS = 100
BATCH_SIZE = 2048
LR = 0.0001
EMB_DIM = 8
MLP_LAYERS = [32, 16]
NUM_NEGATIVES = 1  # можно 3–5 для BCE

model = NCF(n_users, n_items, emb_dim=EMB_DIM, mlp_layers=MLP_LAYERS, dropout=0.4).to(DEVICE)
optimizer = optim.Adam(model.parameters(), lr=LR)

# Dataset только по позитивам
bce_dataset = BCECFDataset(train_matrix)
bce_dataloader = DataLoader(bce_dataset, batch_size=BATCH_SIZE, shuffle=True)

model.train()
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    for batch in bce_dataloader:
        users, pos_items, _ = batch
        users = users.to(DEVICE)
        pos_items = pos_items.to(DEVICE)
        B = users.size(0)

        # --- Формируем батч: [позитивы + негативы] ---
        # Позитивы
        all_users = users.repeat(NUM_NEGATIVES + 1)  # [u1, u1, ..., u2, u2, ...]
        all_items = pos_items.repeat(NUM_NEGATIVES + 1)
        labels = torch.ones_like(all_users, dtype=torch.float32, device=DEVICE)

        # Негативы
        neg_items = torch.randint(0, n_items, (B * NUM_NEGATIVES,), device=DEVICE)
        all_users[B:] = users.repeat_interleave(NUM_NEGATIVES)
        all_items[B:] = neg_items
        labels[B:] = 0.0  # негативные лейблы

        optimizer.zero_grad()
        scores = model(all_users, all_items)
        loss = bce_loss(scores, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{EPOCHS}, BCE Loss: {epoch_loss:.4f}")

Epoch 10/100, BCE Loss: 199.1283
Epoch 20/100, BCE Loss: 193.5511
Epoch 30/100, BCE Loss: 187.1542
Epoch 40/100, BCE Loss: 180.7807
Epoch 50/100, BCE Loss: 175.7250
Epoch 60/100, BCE Loss: 171.5077
Epoch 70/100, BCE Loss: 168.3871
Epoch 80/100, BCE Loss: 166.5215
Epoch 90/100, BCE Loss: 164.6803
Epoch 100/100, BCE Loss: 163.4653


In [77]:
recs = recommend_ncf(model, list(test_dict.keys()), train_matrix, k=TOP_K)
result = model_evaluation(recs, test_dict, all_items, k=TOP_K, model_name='NCF_BCE_100')
display(result)

Unnamed: 0,hit_rate@10,precision@10,recall@10,ndcg@10,map@10,coverage@10
NCF_BCE_100,0.840909,0.323923,0.04403,0.33868,0.222461,0.296765


## **Выводы:**
- Высокий coverage@10 у NCF_BCE при 100 эпохах - достойно внимания при пвыборе модели для продакшион.
- NCF значительно разнообразнее рекомендует
- даже при сопоставимом HitRate (0.84) после 100 эпох - модель не вырождается в рекомендации популярного