In [3]:
import pandas as pd

ratings = pd.read_csv('ratings.csv')

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [4]:
metadata = pd.read_csv('movies_metadata.csv', low_memory=False)

metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [5]:
links = pd.read_csv('links.csv', dtype={'tmdbId': 'float64'})

links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
metadata = metadata.dropna(subset=['production_countries', 'imdb_id', 'original_language', 'popularity', 'production_companies', 'production_countries', 'revenue', 'spoken_languages', 'title', 'video', 'vote_average', 'release_date', 'overview'])

metadata['status'] = metadata['status'].fillna(metadata['status'].mode()[0])
metadata['runtime'] = metadata['runtime'].fillna(metadata['runtime'].median())

cols_to_drop = ['belongs_to_collection', 'homepage', 'tagline', 'poster_path']
metadata = metadata.drop(columns=cols_to_drop, errors='ignore')

In [79]:
metadata.isna().sum()

adult                   0
budget                  0
genres                  0
id                      0
imdb_id                 0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
title                   0
video                   0
vote_average            0
vote_count              0
dtype: int64

In [7]:
ratings = ratings.drop_duplicates(subset=['userId', 'movieId'], keep='last')
ratings = ratings[['userId', 'movieId', 'rating', 'timestamp']].copy()

links = links.dropna(subset=['movieId'])
links['movieId'] = links['movieId'].astype(int)

ratings = ratings[ratings['movieId'].isin(links['movieId'])].copy()

In [8]:
links = pd.read_csv('links.csv', dtype={'tmdbId': 'float64'})
links = links.dropna(subset=['tmdbId'])
links['tmdbId'] = links['tmdbId'].astype(int)
links = links.rename(columns={'tmdbId': 'movie_tmdb_id'})

In [9]:
metadata['id'] = pd.to_numeric(metadata['id'], errors='coerce')
metadata = metadata.dropna(subset=['id'])
metadata['id'] = metadata['id'].astype(int)

metadata = metadata.sort_values('vote_count', ascending=False)
metadata = metadata.drop_duplicates(subset='id', keep='first')

metadata = metadata.rename(columns={'id': 'tmdbId'})

In [10]:
ratings_with_tmdb = ratings.merge(links[['movieId', 'movie_tmdb_id']],
                                  on='movieId',
                                  how='left')

ratings_with_tmdb = ratings_with_tmdb.dropna(subset=['movie_tmdb_id'])
ratings_with_tmdb['movie_tmdb_id'] = ratings_with_tmdb['movie_tmdb_id'].astype(int)

print(f"Осталось оценок после связи с TMDb: {len(ratings_with_tmdb):,}")

Осталось оценок после связи с TMDb: 26,010,786


In [11]:
ratings_full = ratings_with_tmdb.rename(columns={'movie_tmdb_id': 'tmdbId'})

df = ratings_full.merge(metadata, on='tmdbId', how='left')

print(f"После merge с metadata: {len(df):,}")
print(f"Пропусков в title: {df['title'].isna().sum()}")

После merge с metadata: 26,010,786
Пропусков в title: 34110


In [12]:
df = df.dropna(subset=['title'])
print(f"После дропа title-NaN: {len(df):,}")

После дропа title-NaN: 25,976,676


In [13]:
from sklearn.preprocessing import LabelEncoder

if 'user_idx' not in df.columns or 'item_idx' not in df.columns:
    user_le = LabelEncoder()
    item_le = LabelEncoder()

    df['user_idx'] = user_le.fit_transform(df['userId'])
    df['item_idx'] = item_le.fit_transform(df['tmdbId'])

    user_mapping = pd.DataFrame({
        'userId': user_le.classes_,
        'user_idx': range(len(user_le.classes_))
    })
    item_mapping = pd.DataFrame({
        'tmdbId': item_le.classes_,
        'item_idx': range(len(item_le.classes_))
    })

    user_mapping.to_csv('user_mapping.csv', index=False)
    item_mapping.to_csv('item_mapping.csv', index=False)

num_users = df['user_idx'].nunique()
num_items = df['item_idx'].nunique()

print(f"Пользователей: {num_users:,} | Фильмов: {num_items:,}")
print(df[['user_idx', 'item_idx', 'rating']].head(3))

Пользователей: 270,882 | Фильмов: 43,678
   user_idx  item_idx  rating
0         0       150     1.0
1         0      4142     4.5
2         0       186     5.0


In [14]:
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(
    df[['user_idx', 'item_idx', 'rating']],
    test_size=0.15,
    random_state=42,
    stratify=df['rating'].round()
)

print(f"Train: {len(train_df):,} | Val: {len(val_df):,}")

Train: 22,080,174 | Val: 3,896,502


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import time

class RatingDataset(Dataset):
    def __init__(self, df):
        self.users  = torch.tensor(df['user_idx'].values, dtype=torch.long)
        self.items  = torch.tensor(df['item_idx'].values, dtype=torch.long)
        self.ratings = torch.tensor(df['rating'].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.items[idx], self.ratings[idx]

batch_size = 4096
num_workers = 0

train_ds = RatingDataset(train_df)
val_ds   = RatingDataset(val_df)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=num_workers, pin_memory=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True)

In [16]:
# Модель (NCF)
class SimpleNCF(nn.Module):
    def __init__(self, n_users, n_items, emb_dim=64):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_dim)
        self.item_emb = nn.Embedding(n_items, emb_dim)

        self.mlp = nn.Sequential(
            nn.Linear(emb_dim * 2, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, u, i):
        user_vec = self.user_emb(u)
        item_vec = self.item_emb(i)
        x = torch.cat([user_vec, item_vec], dim=-1)
        return self.mlp(x).squeeze(-1)

device = torch.device("mps")
print("Device:", device)

model = SimpleNCF(num_users, num_items, emb_dim=64).to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-5)
criterion = nn.MSELoss()

Device: mps


In [90]:
from tqdm import tqdm

# Обучение
epochs = 6
best_val_loss = float('inf')

for epoch in tqdm(range(epochs), desc="Эпохи", unit="epoch"):
    start = time.time()

    model.train()
    train_loss = 0.0

    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False, unit="batch")

    for u, i, r in train_pbar:
        u, i, r = u.to(device), i.to(device), r.to(device)
        optimizer.zero_grad()
        pred = model(u, i)
        loss = criterion(pred, r)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * len(r)

        train_pbar.set_postfix(loss=f"{loss.item():.4f}")

    train_loss /= len(train_ds)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for u, i, r in val_loader:
            u, i, r = u.to(device), i.to(device), r.to(device)
            pred = model(u, i)
            val_loss += criterion(pred, r).item() * len(r)
    val_loss /= len(val_ds)

    rmse_train = train_loss ** 0.5
    rmse_val   = val_loss ** 0.5

    tqdm.write(f"[{epoch+1:2d}/{epochs}]  train RMSE: {rmse_train:.4f}  |  val RMSE: {rmse_val:.4f}  |  {time.time()-start:.1f} с")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_ncf_model.pt')
        tqdm.write("   → сохранена лучшая модель")

Эпохи:   0%|          | 0/6 [00:00<?, ?epoch/s]
  super().__init__(loader)

Epoch 1:   0%|          | 0/5391 [00:01<?, ?batch/s, loss=14.0152][A
Epoch 1:   0%|          | 1/5391 [00:01<2:17:45,  1.53s/batch, loss=14.0152][A
Epoch 1:   0%|          | 1/5391 [00:02<2:17:45,  1.53s/batch, loss=12.3985][A
Epoch 1:   0%|          | 2/5391 [00:02<1:28:12,  1.02batch/s, loss=12.3985][A
Epoch 1:   0%|          | 2/5391 [00:02<1:28:12,  1.02batch/s, loss=10.5810][A
Epoch 1:   0%|          | 3/5391 [00:02<53:08,  1.69batch/s, loss=10.5810]  [A
Epoch 1:   0%|          | 3/5391 [00:02<53:08,  1.69batch/s, loss=9.1297] [A
Epoch 1:   0%|          | 3/5391 [00:02<53:08,  1.69batch/s, loss=7.7131][A
Epoch 1:   0%|          | 5/5391 [00:02<27:47,  3.23batch/s, loss=7.7131][A
Epoch 1:   0%|          | 5/5391 [00:02<27:47,  3.23batch/s, loss=5.9252][A
Epoch 1:   0%|          | 5/5391 [00:02<27:47,  3.23batch/s, loss=4.6858][A
Epoch 1:   0%|          | 7/5391 [00:02<18:36,  4.82batch/s, loss=4.

[ 1/6]  train RMSE: 0.9260  |  val RMSE: 0.8785  |  1031.1 с


Эпохи:  17%|█▋        | 1/6 [17:11<1:25:56, 1031.39s/epoch]

   → сохранена лучшая модель



Epoch 2:   0%|          | 0/5391 [00:00<?, ?batch/s][A
Epoch 2:   0%|          | 0/5391 [00:02<?, ?batch/s, loss=0.7283][A
Epoch 2:   0%|          | 1/5391 [00:02<3:03:58,  2.05s/batch, loss=0.7283][A
Epoch 2:   0%|          | 1/5391 [00:02<3:03:58,  2.05s/batch, loss=0.7694][A
Epoch 2:   0%|          | 1/5391 [00:02<3:03:58,  2.05s/batch, loss=0.7630][A
Epoch 2:   0%|          | 3/5391 [00:02<53:30,  1.68batch/s, loss=0.7630]  [A
Epoch 2:   0%|          | 3/5391 [00:02<53:30,  1.68batch/s, loss=0.7660][A
Epoch 2:   0%|          | 3/5391 [00:02<53:30,  1.68batch/s, loss=0.7511][A
Epoch 2:   0%|          | 5/5391 [00:02<29:17,  3.06batch/s, loss=0.7511][A
Epoch 2:   0%|          | 5/5391 [00:02<29:17,  3.06batch/s, loss=0.7311][A
Epoch 2:   0%|          | 5/5391 [00:02<29:17,  3.06batch/s, loss=0.7501][A
Epoch 2:   0%|          | 7/5391 [00:02<19:23,  4.63batch/s, loss=0.7501][A
Epoch 2:   0%|          | 7/5391 [00:02<19:23,  4.63batch/s, loss=0.7808][A
Epoch 2:   0%|     

[ 2/6]  train RMSE: 0.8653  |  val RMSE: 0.8615  |  1047.4 с


Эпохи:  33%|███▎      | 2/6 [34:38<1:09:23, 1040.92s/epoch]

   → сохранена лучшая модель



Epoch 3:   0%|          | 0/5391 [00:00<?, ?batch/s][A
Epoch 3:   0%|          | 0/5391 [00:01<?, ?batch/s, loss=0.6861][A
Epoch 3:   0%|          | 1/5391 [00:01<2:03:57,  1.38s/batch, loss=0.6861][A
Epoch 3:   0%|          | 1/5391 [00:01<2:03:57,  1.38s/batch, loss=0.7436][A
Epoch 3:   0%|          | 1/5391 [00:01<2:03:57,  1.38s/batch, loss=0.7515][A
Epoch 3:   0%|          | 3/5391 [00:01<38:04,  2.36batch/s, loss=0.7515]  [A
Epoch 3:   0%|          | 3/5391 [00:01<38:04,  2.36batch/s, loss=0.7427][A
Epoch 3:   0%|          | 4/5391 [00:01<28:03,  3.20batch/s, loss=0.7427][A
Epoch 3:   0%|          | 4/5391 [00:01<28:03,  3.20batch/s, loss=0.7302][A
Epoch 3:   0%|          | 4/5391 [00:01<28:03,  3.20batch/s, loss=0.6897][A
Epoch 3:   0%|          | 6/5391 [00:01<17:05,  5.25batch/s, loss=0.6897][A
Epoch 3:   0%|          | 6/5391 [00:01<17:05,  5.25batch/s, loss=0.7582][A
Epoch 3:   0%|          | 6/5391 [00:01<17:05,  5.25batch/s, loss=0.6811][A
Epoch 3:   0%|     

[ 3/6]  train RMSE: 0.8471  |  val RMSE: 0.8568  |  2508.6 с
   → сохранена лучшая модель



Epoch 4:   0%|          | 0/5391 [00:00<?, ?batch/s][A
Epoch 4:   0%|          | 0/5391 [00:02<?, ?batch/s, loss=0.6768][A
Epoch 4:   0%|          | 1/5391 [00:02<3:03:55,  2.05s/batch, loss=0.6768][A
Epoch 4:   0%|          | 1/5391 [00:03<3:03:55,  2.05s/batch, loss=0.7226][A
Epoch 4:   0%|          | 2/5391 [00:03<2:41:29,  1.80s/batch, loss=0.7226][A
Epoch 4:   0%|          | 2/5391 [00:03<2:41:29,  1.80s/batch, loss=0.6902][A
Epoch 4:   0%|          | 3/5391 [00:03<1:33:12,  1.04s/batch, loss=0.6902][A
Epoch 4:   0%|          | 3/5391 [00:03<1:33:12,  1.04s/batch, loss=0.6701][A
Epoch 4:   0%|          | 4/5391 [00:03<1:01:03,  1.47batch/s, loss=0.6701][A
Epoch 4:   0%|          | 4/5391 [00:04<1:01:03,  1.47batch/s, loss=0.6772][A
Epoch 4:   0%|          | 5/5391 [00:04<43:21,  2.07batch/s, loss=0.6772]  [A
Epoch 4:   0%|          | 5/5391 [00:04<43:21,  2.07batch/s, loss=0.6994][A
Epoch 4:   0%|          | 6/5391 [00:04<32:16,  2.78batch/s, loss=0.6994][A
Epoch 4: 

[ 4/6]  train RMSE: 0.8344  |  val RMSE: 0.8427  |  949.4 с
   → сохранена лучшая модель



Epoch 5:   0%|          | 0/5391 [00:00<?, ?batch/s][A
Epoch 5:   0%|          | 0/5391 [00:01<?, ?batch/s, loss=0.6567][A
Epoch 5:   0%|          | 1/5391 [00:01<1:33:49,  1.04s/batch, loss=0.6567][A
Epoch 5:   0%|          | 1/5391 [00:01<1:33:49,  1.04s/batch, loss=0.6754][A
Epoch 5:   0%|          | 1/5391 [00:01<1:33:49,  1.04s/batch, loss=0.7095][A
Epoch 5:   0%|          | 3/5391 [00:01<30:07,  2.98batch/s, loss=0.7095]  [A
Epoch 5:   0%|          | 3/5391 [00:01<30:07,  2.98batch/s, loss=0.6350][A
Epoch 5:   0%|          | 3/5391 [00:01<30:07,  2.98batch/s, loss=0.6401][A
Epoch 5:   0%|          | 5/5391 [00:01<17:55,  5.01batch/s, loss=0.6401][A
Epoch 5:   0%|          | 5/5391 [00:01<17:55,  5.01batch/s, loss=0.7010][A
Epoch 5:   0%|          | 5/5391 [00:01<17:55,  5.01batch/s, loss=0.6897][A
Epoch 5:   0%|          | 7/5391 [00:01<12:48,  7.01batch/s, loss=0.6897][A
Epoch 5:   0%|          | 7/5391 [00:01<12:48,  7.01batch/s, loss=0.6862][A
Epoch 5:   0%|     

[ 5/6]  train RMSE: 0.8252  |  val RMSE: 0.8405  |  794.5 с


Эпохи:  83%|████████▎ | 5/6 [1:45:31<19:48, 1188.42s/epoch]

   → сохранена лучшая модель



Epoch 6:   0%|          | 0/5391 [00:00<?, ?batch/s][A
Epoch 6:   0%|          | 0/5391 [00:01<?, ?batch/s, loss=0.6453][A
Epoch 6:   0%|          | 1/5391 [00:01<1:34:29,  1.05s/batch, loss=0.6453][A
Epoch 6:   0%|          | 1/5391 [00:01<1:34:29,  1.05s/batch, loss=0.6798][A
Epoch 6:   0%|          | 1/5391 [00:01<1:34:29,  1.05s/batch, loss=0.6536][A
Epoch 6:   0%|          | 3/5391 [00:01<30:01,  2.99batch/s, loss=0.6536]  [A
Epoch 6:   0%|          | 3/5391 [00:01<30:01,  2.99batch/s, loss=0.6761][A
Epoch 6:   0%|          | 3/5391 [00:01<30:01,  2.99batch/s, loss=0.6831][A
Epoch 6:   0%|          | 5/5391 [00:01<17:49,  5.04batch/s, loss=0.6831][A
Epoch 6:   0%|          | 5/5391 [00:01<17:49,  5.04batch/s, loss=0.6891][A
Epoch 6:   0%|          | 5/5391 [00:01<17:49,  5.04batch/s, loss=0.6494][A
Epoch 6:   0%|          | 7/5391 [00:01<12:47,  7.01batch/s, loss=0.6494][A
Epoch 6:   0%|          | 7/5391 [00:01<12:47,  7.01batch/s, loss=0.6700][A
Epoch 6:   0%|     

[ 6/6]  train RMSE: 0.8173  |  val RMSE: 0.8352  |  863.7 с
   → сохранена лучшая модель





In [17]:
import pandas as pd
user_mapping = pd.read_csv('user_mapping.csv')
item_mapping = pd.read_csv('item_mapping.csv')

from sklearn.model_selection import train_test_split

data = df[['user_idx', 'item_idx', 'rating']].copy()

train_val_df, test_df = train_test_split(
    data,
    test_size=0.15,
    random_state=42,
    stratify=data['rating'].round()
)

train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.15,
    random_state=42,
    stratify=train_val_df['rating'].round()
)

print(f"Размеры: Train {len(train_df):,}, Val {len(val_df):,}, Test {len(test_df):,}")

Размеры: Train 18,768,147, Val 3,312,027, Test 3,896,502


In [18]:
test_ds = RatingDataset(test_df)
test_loader = DataLoader(test_ds, batch_size=4096, shuffle=False, num_workers=0)

In [19]:
model = SimpleNCF(num_users, num_items, emb_dim=64).to(device)

model.load_state_dict(torch.load('best_ncf_model.pt'))
model.eval()

from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

all_preds = []
all_labels = []

with torch.no_grad():
    for u, i, r in test_loader:
        u, i, r = u.to(device), i.to(device), r.to(device)
        pred = model(u, i)
        all_preds.extend(pred.cpu().numpy())
        all_labels.extend(r.cpu().numpy())

rmse = np.sqrt(mean_squared_error(all_labels, all_preds))
mae = mean_absolute_error(all_labels, all_preds)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")

Test RMSE: 0.8352
Test MAE: 0.6417


в среднем модель ошибается на 0.64-0.84 балла по шкале от 0.5 до 5

In [22]:
from sklearn.metrics import ndcg_score, average_precision_score

def calculate_ranking_metrics(user_id, top_k=10):
    user_ratings = df[df['user_idx'] == user_id]

    if len(user_ratings) < 2:
        return None

    user_idx = user_mapping[user_mapping['userId'] == user_id]['user_idx'].values[0]
    user_tensor = torch.tensor([user_idx] * num_items, dtype=torch.long).to(device)
    all_items = torch.tensor(range(num_items), dtype=torch.long).to(device)

    with torch.no_grad():
        predictions = model(user_tensor, all_items).cpu().numpy()

    # Создать бинарные лейблы (рейтинг >= 4 = релевантный)
    real_ratings = user_ratings.set_index('item_idx')['rating']
    all_items_list = list(range(num_items))

    y_true = np.zeros(num_items)
    y_pred = predictions

    for item_idx, rating in real_ratings.items():
        y_true[item_idx] = 1 if rating >= 4.0 else 0

    rated_indices = list(real_ratings.index)
    y_true_subset = y_true[rated_indices]
    y_pred_subset = y_pred[rated_indices]

    if sum(y_true_subset) == 0:
        return None

    metrics = {}

    # NDCG@k
    try:
        metrics['ndcg@k'] = ndcg_score([y_true_subset], [y_pred_subset], k=top_k)
    except:
        metrics['ndcg@k'] = 0

    # Precision@k
    top_k_indices = np.argsort(y_pred_subset)[-top_k:][::-1]
    precision_at_k = np.mean(y_true_subset[top_k_indices])
    metrics['precision@k'] = precision_at_k

    # Recall@k
    relevant_count = np.sum(y_true_subset)
    recall_at_k = np.sum(y_true_subset[top_k_indices]) / relevant_count
    metrics['recall@k'] = recall_at_k

    # F1@k
    if precision_at_k + recall_at_k > 0:
        metrics['f1@k'] = 2 * precision_at_k * recall_at_k / (precision_at_k + recall_at_k)
    else:
        metrics['f1@k'] = 0

    # Average Precision (AP)
    metrics['ap'] = average_precision_score(y_true_subset, y_pred_subset)

    return metrics

def evaluate_ranking_metrics(n_users=100, top_k=10):
    """Оцениваем метрики ранжирования на выборке пользователей"""
    all_metrics = []
    user_ids = df['userId'].unique()[:n_users]

    for user_id in user_ids:
        metrics = calculate_ranking_metrics(user_id, top_k)
        if metrics:
            all_metrics.append(metrics)

    avg_metrics = {}
    for key in all_metrics[0].keys():
        avg_metrics[key] = np.mean([m[key] for m in all_metrics])

    return avg_metrics

ranking_results = evaluate_ranking_metrics(n_users=100000, top_k=10)
print("Метрики ранжирования (средние по 100 000 пользователям):")
for metric, value in ranking_results.items():
    print(f"  {metric}: {value:.4f}")

Метрики ранжирования (средние по 100 000 пользователям):
  ndcg@k: 0.7626
  precision@k: 0.6783
  recall@k: 0.4619
  f1@k: 0.4384
  ap: 0.7197


ndcg@k - Normalized Discounted Cumulative Gain - главная метрика качества ранжирования
67.8% рекомендаций в топ-10 действительно релевантны пользователю

Recall@k - Доля всех релевантных элементов, которые попали в топ-K
Модель находит 46% всех фильмов, которые понравятся пользователю

Precision@k - Доля релевантных элементов в топ-K рекомендациях
67.8% рекомендаций в топ-10 действительно релевантны пользователю

ap - Средняя точность по всем порогам

In [24]:
def calculate_personalization(n_users=50, top_k=10):
    """Рассчитываем персонализацию рекомендаций"""
    recommendations_matrix = []

    for user_id in df['userId'].unique()[:n_users]:
        user_idx = user_mapping[user_mapping['userId'] == user_id]['user_idx'].values[0]

        user_tensor = torch.tensor([user_idx] * num_items, dtype=torch.long).to(device)
        all_items = torch.tensor(range(num_items), dtype=torch.long).to(device)

        with torch.no_grad():
            predictions = model(user_tensor, all_items).cpu().numpy()

        top_indices = np.argsort(predictions)[-top_k:][::-1]
        recommendations_matrix.append(set(top_indices))

    personalization_scores = []
    for i in range(len(recommendations_matrix)):
        for j in range(i+1, len(recommendations_matrix)):
            intersection = len(recommendations_matrix[i] & recommendations_matrix[j])
            union = len(recommendations_matrix[i] | recommendations_matrix[j])
            similarity = intersection / union if union > 0 else 0
            personalization_scores.append(1 - similarity)

    return {
        'personalization': np.mean(personalization_scores) if personalization_scores else 0,
        'average_jaccard_similarity': 1 - np.mean(personalization_scores) if personalization_scores else 0
    }

personalization = calculate_personalization(n_users=50, top_k=10)
print("\nМетрики персонализации:")
for metric, value in personalization.items():
    print(f"  {metric}: {value:.4f}")


Метрики персонализации:
  personalization: 0.6163
  average_jaccard_similarity: 0.3837


Personalization - Мера уникальности рекомендаций для разных пользователей
0 (всем одинаковые) → 1 (полностью уникальные)

Average Jaccard Similarity - Среднее сходство между рекомендациями разных пользователей
рекомендации пересекаются на ~38%