Цель: научиться создавать рекомендательную систему на основе
графовых нейронных сетей (GNN), таких как легковесная графовая нейронная
сеть LightGCN, IRGNN (Item Relationship Graph Neural Network), применяя
методологию анализа графа взаимодействий пользователей и товаров/услуг,
научиться применять её для прогнозирования предпочтений пользователей,
оценить качество полученной модели на реальных данных.

1 часть – общий пример (1 балл)

In [None]:
import pandas as pd
from torch_geometric.data import Data
import torch
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
import os
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import pytorch_lightning as pl
import networkx as nx
import numpy as np
from tqdm.notebook import tqdm
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GraphConv

In [2]:
ratings = pd.read_csv('ml-latest-small\\ratings.csv')
movies = pd.read_csv('ml-latest-small\\movies.csv')

users = ratings['userId'].unique()
items = movies['movieId'].unique()

node_id_map = {uid: i for i, uid in enumerate(users)}
item_id_map = {iid: len(users)+i for i, iid in enumerate(items)}

In [3]:
edges = []
labels = []
for _, row in ratings.iterrows():
    user_idx = node_id_map[row['userId']]
    item_idx = item_id_map[row['movieId']]
    edges.append((user_idx, item_idx))
    labels.append(row['rating'])
    
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
scaler = MinMaxScaler(feature_range=(0, 1))
ratings['rating'] = scaler.fit_transform(ratings[['rating']])
labels = torch.tensor(ratings['rating'].values, dtype=torch.float)

In [4]:
num_nodes = len(users) + len(items)
x = torch.eye(num_nodes)

edge_indices = list(range(edge_index.shape[1]))
train_idx, test_idx = train_test_split(edge_indices, test_size=0.2, random_state=42)
train_edge_index = edge_index[:, train_idx]
train_labels = labels[train_idx]

In [5]:
class RecommenderModel(torch.nn.Module):
    def __init__(self, num_features, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, 1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x.squeeze()

In [26]:
model = RecommenderModel(num_features=num_nodes, hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.MSELoss()

def train():
    model.train()
    optimizer.zero_grad()
    out = model(x, train_edge_index)
    preds = out[train_edge_index[1]]
    loss = criterion(preds, train_labels)
    loss.backward()
    optimizer.step()
    return float(loss)

for epoch in range(1, 101):
    loss = train()
    print(f'Epoch: {epoch}, Loss: {loss:.4f}')

Epoch: 1, Loss: 0.6062
Epoch: 2, Loss: 0.2913
Epoch: 3, Loss: 0.1618
Epoch: 4, Loss: 0.1196
Epoch: 5, Loss: 0.1528
Epoch: 6, Loss: 0.1839
Epoch: 7, Loss: 0.1756
Epoch: 8, Loss: 0.1445
Epoch: 9, Loss: 0.1132
Epoch: 10, Loss: 0.0948
Epoch: 11, Loss: 0.0908
Epoch: 12, Loss: 0.0951
Epoch: 13, Loss: 0.1004
Epoch: 14, Loss: 0.1025
Epoch: 15, Loss: 0.1001
Epoch: 16, Loss: 0.0943
Epoch: 17, Loss: 0.0866
Epoch: 18, Loss: 0.0790
Epoch: 19, Loss: 0.0732
Epoch: 20, Loss: 0.0701
Epoch: 21, Loss: 0.0696
Epoch: 22, Loss: 0.0709
Epoch: 23, Loss: 0.0726
Epoch: 24, Loss: 0.0732
Epoch: 25, Loss: 0.0722
Epoch: 26, Loss: 0.0698
Epoch: 27, Loss: 0.0667
Epoch: 28, Loss: 0.0638
Epoch: 29, Loss: 0.0618
Epoch: 30, Loss: 0.0607
Epoch: 31, Loss: 0.0604
Epoch: 32, Loss: 0.0606
Epoch: 33, Loss: 0.0608
Epoch: 34, Loss: 0.0607
Epoch: 35, Loss: 0.0602
Epoch: 36, Loss: 0.0593
Epoch: 37, Loss: 0.0583
Epoch: 38, Loss: 0.0573
Epoch: 39, Loss: 0.0565
Epoch: 40, Loss: 0.0559
Epoch: 41, Loss: 0.0556
Epoch: 42, Loss: 0.0555
E

In [29]:
test_edge_index = edge_index[:, test_idx]
test_labels = labels[test_idx]

def evaluate(model, x, edge_index, labels):
    model.eval()
    with torch.no_grad():
        pred = model(x, edge_index)
        preds = pred[edge_index[1]]  
        mse_loss = criterion(preds, labels)
    return float(mse_loss)

test_mse = evaluate(model, x, test_edge_index, labels[test_idx])
print(f'Test MSE: {test_mse:.4f}')

Test MSE: 0.0892


7. Самостоятельное задание

Проведите гиперпараметризацию модели, выбрав оптимальное количество
слоёв и размерность скрытых признаков.
Добавьте дополнительные признаки пользователей и объектов (пол,
возраст, жанр фильма и др.) и посмотрите влияние на точность рекомендаций.

In [31]:
hidden_dims = [8, 16, 32, 64]
num_layers_list = [1, 2, 3]

best_mse = float('inf')
best_params = {}

for hidden_dim in hidden_dims:
    for num_layers in num_layers_list:
        class FlexibleGCN(torch.nn.Module):
            def __init__(self, num_features, hidden_dim, num_layers):
                super().__init__()
                self.layers = torch.nn.ModuleList()
                if num_layers == 1:
                    self.layers.append(GCNConv(num_features, 1))
                else:
                    self.layers.append(GCNConv(num_features, hidden_dim))
                    for _ in range(num_layers - 2):
                        self.layers.append(GCNConv(hidden_dim, hidden_dim))
                    self.layers.append(GCNConv(hidden_dim, 1))
            def forward(self, x, edge_index):
                for i, conv in enumerate(self.layers):
                    x = conv(x, edge_index)
                    if i != len(self.layers) - 1:
                        x = F.relu(x)
                return x.squeeze()

        model = FlexibleGCN(num_features=num_nodes, hidden_dim=hidden_dim, num_layers=num_layers)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

        for epoch in range(101): 
            model.train()
            optimizer.zero_grad()
            out = model(x, train_edge_index)
            preds = out[train_edge_index[1]]
            loss = criterion(preds, train_labels)
            loss.backward()
            optimizer.step()
        
        test_mse = evaluate(model, x, test_edge_index, test_labels)
        print(f"Hidden: {hidden_dim}, Layers: {num_layers}, Test MSE: {test_mse:.4f}")

        if test_mse < best_mse:
            best_mse = test_mse
            best_params = {'hidden_dim': hidden_dim, 'num_layers': num_layers}

print(f"\nBest params: {best_params}, Best Test MSE: {best_mse:.4f}")

Hidden: 8, Layers: 1, Test MSE: 0.0863
Hidden: 8, Layers: 2, Test MSE: 0.0909
Hidden: 8, Layers: 3, Test MSE: 0.0765
Hidden: 16, Layers: 1, Test MSE: 0.0867
Hidden: 16, Layers: 2, Test MSE: 0.0905
Hidden: 16, Layers: 3, Test MSE: 0.0802
Hidden: 32, Layers: 1, Test MSE: 0.0871
Hidden: 32, Layers: 2, Test MSE: 0.0959
Hidden: 32, Layers: 3, Test MSE: 0.0859
Hidden: 64, Layers: 1, Test MSE: 0.0863
Hidden: 64, Layers: 2, Test MSE: 0.0971
Hidden: 64, Layers: 3, Test MSE: 0.0864

Best params: {'hidden_dim': 8, 'num_layers': 3}, Best Test MSE: 0.0765


3 часть – Применение графовой нейронной сети IRGNN для реализации
рекомендательной системы (2 балла)

In [11]:
df = pd.read_csv('ml-100k\\u.data', delimiter='\t', header=None, names=["user_id", "item_id",
"rating", "timestamp"])
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
graph = nx.DiGraph()

for index, group in tqdm(df.groupby(['item_id'])):
    graph.add_node(index)
    common_users = set(group['user_id'])
    for other_item_id, other_group in df.groupby(['item_id']):
        if other_item_id != index and not graph.has_edge(index, other_item_id):
            intersection = common_users.intersection(set(other_group['user_id']))
            weight = len(intersection)
            if weight > 0:
                graph.add_edge(index, other_item_id, weight=weight)

  0%|          | 0/1682 [00:00<?, ?it/s]

In [14]:
dgl_graph = dgl.from_networkx(graph, edge_attrs=['weight'])

class IRGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(IRGNN, self).__init__()
        self.conv1 = GraphConv(input_dim, hidden_dim)
        self.conv2 = GraphConv(hidden_dim, output_dim)

    def forward(self, g, features):
        h = F.relu(self.conv1(g, features))
        h = F.relu(self.conv2(g, h))
        return h

input_dim = 16
hidden_dim = 32
output_dim = 8
model = IRGNN(input_dim, hidden_dim, output_dim)
features = torch.randn(dgl_graph.number_of_nodes(), input_dim)
labels = torch.zeros(dgl_graph.number_of_nodes())
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [16]:
labels = labels.long()

for epoch in range(100):
    logits = model(dgl_graph, features)
    loss = F.cross_entropy(logits, labels)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch}: Loss {loss.item():.4f}")

Epoch 0: Loss 2.0958
Epoch 1: Loss 2.0500
Epoch 2: Loss 2.0148
Epoch 3: Loss 1.9821
Epoch 4: Loss 1.9485
Epoch 5: Loss 1.9141
Epoch 6: Loss 1.8780
Epoch 7: Loss 1.8399
Epoch 8: Loss 1.7991
Epoch 9: Loss 1.7555
Epoch 10: Loss 1.7091
Epoch 11: Loss 1.6600
Epoch 12: Loss 1.6083
Epoch 13: Loss 1.5541
Epoch 14: Loss 1.4975
Epoch 15: Loss 1.4386
Epoch 16: Loss 1.3776
Epoch 17: Loss 1.3147
Epoch 18: Loss 1.2502
Epoch 19: Loss 1.1843
Epoch 20: Loss 1.1174
Epoch 21: Loss 1.0498
Epoch 22: Loss 0.9821
Epoch 23: Loss 0.9145
Epoch 24: Loss 0.8477
Epoch 25: Loss 0.7821
Epoch 26: Loss 0.7180
Epoch 27: Loss 0.6558
Epoch 28: Loss 0.5959
Epoch 29: Loss 0.5391
Epoch 30: Loss 0.4858
Epoch 31: Loss 0.4363
Epoch 32: Loss 0.3907
Epoch 33: Loss 0.3491
Epoch 34: Loss 0.3115
Epoch 35: Loss 0.2778
Epoch 36: Loss 0.2478
Epoch 37: Loss 0.2211
Epoch 38: Loss 0.1977
Epoch 39: Loss 0.1770
Epoch 40: Loss 0.1590
Epoch 41: Loss 0.1432
Epoch 42: Loss 0.1294
Epoch 43: Loss 0.1173
Epoch 44: Loss 0.1068
Epoch 45: Loss 0.097

In [None]:
def predict_rating(user_id, item_id):
    device = next(model.parameters()).device
    user_rated_items = train_df[train_df.user_id == user_id]['item_id'].unique()
    if len(user_rated_items) == 0:
        return float('-inf')

    features_dev = features.to(device)
    g_dev = dgl_graph.to(device)

    model.eval()
    with torch.no_grad():
        all_item_emb = model(g_dev, features_dev)

        if item_id not in g_dev.ndata[dgl.NID].tolist() if 'nid' in g_dev.ndata else False:
            return float('-inf')

        target_emb = all_item_emb[item_id]

        rated_embs = []
        for iid in user_rated_items:
            if iid < all_item_emb.size(0):
                rated_embs.append(all_item_emb[iid])
        if len(rated_embs) == 0:
            return float('-inf')

        user_emb = torch.stack(rated_embs, dim=0).mean(dim=0)

        score = torch.dot(user_emb, target_emb).item()

    return score


def recommend_items(user_id, top_k=10):
    items_rated = set(train_df[train_df.user_id == user_id]['item_id'].tolist())

    all_items = set(df['item_id'].unique())
    unrated_items = list(all_items - items_rated)
    if len(items_rated) == 0:
        return []
    predictions = {}

    for iid in tqdm(unrated_items, desc=f"Predicting for user {user_id}"):
        try:
            score = predict_rating(user_id, iid)
        except Exception:
            continue
        predictions[iid] = score

    if not predictions:
        return []

    sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    top_items = [item_id for item_id, _ in sorted_predictions[:top_k]]
    return top_items
    
recommendations = recommend_items(test_df.iloc[0].user_id, top_k=10)
print(recommendations)

Predicting for user 877:   0%|          | 0/1617 [00:00<?, ?it/s]

[287, 301, 285, 49, 257, 293, 180, 312, 268, 299]


Самостоятельное задание\
Примените предложенную реализацию на другом открытом наборе данных
lastfm.zip.
Попробуйте реализовать расширенные версии модели (например, добавив
веса ребрам графа или учитывая временные факторы при формировании связей).

In [31]:
edges_path = os.path.join("lastfm", "lastfm.edges")
colnames = ["user_id", "item_id", "rating_dummy", "timestamp"]
lastfm_edges = pd.read_csv(edges_path, sep=",", names=colnames, header=None)

types_path = os.path.join("lastfm", "lastfm.types")
try:
    lastfm_types = pd.read_csv(types_path, sep=",", names=["item_id", "type"], header=None)
except FileNotFoundError:
    lastfm_types = None

In [32]:
small_lastfm_edges = lastfm_edges.sample(n=5000, random_state=42).reset_index(drop=True)

train_lastfm, test_lastfm = train_test_split(
    small_lastfm_edges, test_size=0.2, random_state=42
)

train_df = train_lastfm.copy()
test_df  = test_lastfm.copy()

In [33]:
grouped = train_df.groupby("item_id")[["user_id", "timestamp"]]

item2df = { item: group_df.reset_index(drop=True) 
            for item, group_df in grouped }

all_items = list(item2df.keys())
graph = nx.DiGraph()

for item in all_items:
    graph.add_node(item)

item2user2time = {}
for item, df_item in item2df.items():
    item2user2time[item] = dict(
        zip(df_item["user_id"].values, df_item["timestamp"].values)
    )


n_items = len(all_items)
for idx_i in tqdm(range(n_items), desc="Building item-item edges"):
    item_i = all_items[idx_i]
    user_times_i = item2user2time[item_i]
    users_i = set(user_times_i.keys())

    for idx_j in range(idx_i + 1, n_items):
        item_j = all_items[idx_j]
        user_times_j = item2user2time[item_j]
        users_j = set(user_times_j.keys())

        common_users = users_i.intersection(users_j)
        if not common_users:
            continue

        deltas = []
        for u in common_users:
            t_i = user_times_i[u]
            t_j = user_times_j[u]
            deltas.append(abs(t_i - t_j) / 86400.0)
        avg_delta_days = np.mean(deltas) if deltas else 0.0

        weight_ij = len(common_users) / (1.0 + avg_delta_days)
        weight_ji = weight_ij 

        graph.add_edge(item_i, item_j, weight=weight_ij)
        graph.add_edge(item_j, item_i, weight=weight_ji)

Building item-item edges:   0%|          | 0/3385 [00:00<?, ?it/s]

In [36]:
dgl_graph = dgl.from_networkx(graph, edge_attrs=['weight'])
dgl_graph = dgl.add_self_loop(dgl_graph)

num_nodes = dgl_graph.number_of_nodes() 
input_dim = features.size(1)           
features = torch.randn(num_nodes, input_dim)

labels = torch.zeros(num_nodes).long()

device = torch.device('cpu')
model.to(device)
features = features.to(device)
dgl_graph = dgl_graph.to(device)

for epoch in range(100):
    model.train()
    optimizer.zero_grad()

    logits = model(dgl_graph, features) 
    loss = F.cross_entropy(logits, labels.to(device))
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1:03d}, Loss: {loss.item():.4f}")

Epoch 010, Loss: 0.2358
Epoch 020, Loss: 0.0475
Epoch 030, Loss: 0.0061
Epoch 040, Loss: 0.0020
Epoch 050, Loss: 0.0012
Epoch 060, Loss: 0.0010
Epoch 070, Loss: 0.0008
Epoch 080, Loss: 0.0008
Epoch 090, Loss: 0.0007
Epoch 100, Loss: 0.0006


In [40]:
all_items_sorted = sorted(all_items) 
item_to_idx = { item: i for i, item in enumerate(all_items_sorted) }
idx_to_item = { i: item for item, i in item_to_idx.items() }

def predict_rating(user_id, item_id):
    device = next(model.parameters()).device
    user_rated_items = train_df[train_df.user_id == user_id]['item_id'].unique()
    if len(user_rated_items) == 0:
        return float('-inf')

    if item_id not in item_to_idx:
        return float('-inf')

    g_dev = dgl_graph.to(device)
    features_dev = features.to(device)

    model.eval()
    with torch.no_grad():
        global all_item_emb
        all_item_emb = model(g_dev, features_dev) 

        node_idx = item_to_idx[item_id]
        target_emb = all_item_emb[node_idx]  

        rated_embs = []
        for iid in user_rated_items:
            if iid not in item_to_idx:
                continue
            idx = item_to_idx[iid]
            rated_embs.append(all_item_emb[idx])
        if len(rated_embs) == 0:
            return float('-inf')

        user_emb = torch.stack(rated_embs, dim=0).mean(dim=0)
        score = torch.dot(user_emb, target_emb).item()

    return score

In [None]:
def recommend_items(user_id, top_k=10):
    items_rated = set(train_df[train_df.user_id == user_id]['item_id'].tolist())
    all_items_set = set(lastfm_edges['item_id'].unique())
    unrated_items = list(all_items_set - items_rated)
    if len(items_rated) == 0:
        return []

    device = next(model.parameters()).device
    g_dev = dgl_graph
    features_dev = features
    model.eval()
    with torch.no_grad():
        all_item_emb = model(g_dev, features_dev) 

    rated_embs = []
    for iid in items_rated:
        if iid not in item_to_idx:
            continue
        idx = item_to_idx[iid]
        rated_embs.append(all_item_emb[idx])
    if len(rated_embs) == 0:
        return []

    user_emb = torch.stack(rated_embs, dim=0).mean(dim=0) 

    predictions = {}
    for iid in tqdm(unrated_items, desc=f"Predicting for user {user_id}"):
        if iid not in item_to_idx:
            continue
        idx = item_to_idx[iid]
        target_emb = all_item_emb[idx]
        score = torch.dot(user_emb, target_emb).item()
        predictions[iid] = score

    if not predictions:
        return []

    sorted_predictions = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
    top_items = [item_id for item_id, _ in sorted_predictions[:top_k]]
    return top_items

sample_user = test_df.iloc[0].user_id
recs = recommend_items(sample_user, top_k=10)
print(f"Рекомендации для LastFM-пользователя {sample_user}: {recs}")


Predicting for user 459:   0%|          | 0/1258695 [00:00<?, ?it/s]

Рекомендации для LastFM-пользователя 459: [48089, 1093107, 796668, 68119, 1094994, 1109971, 1210870, 1085692, 1100689, 1174619]
